xref: /titanic_52/usr/src/uts/sun4v/io/vsw.c (revision c99350c4354495dc6d31650fe88bcbdad2b11c12)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/errno.h>
31 #include <sys/debug.h>
32 #include <sys/time.h>
33 #include <sys/sysmacros.h>
34 #include <sys/systm.h>
35 #include <sys/user.h>
36 #include <sys/stropts.h>
37 #include <sys/stream.h>
38 #include <sys/strlog.h>
39 #include <sys/strsubr.h>
40 #include <sys/cmn_err.h>
41 #include <sys/cpu.h>
42 #include <sys/kmem.h>
43 #include <sys/conf.h>
44 #include <sys/ddi.h>
45 #include <sys/sunddi.h>
46 #include <sys/ksynch.h>
47 #include <sys/stat.h>
48 #include <sys/kstat.h>
49 #include <sys/vtrace.h>
50 #include <sys/strsun.h>
51 #include <sys/dlpi.h>
52 #include <sys/ethernet.h>
53 #include <net/if.h>
54 #include <sys/varargs.h>
55 #include <sys/machsystm.h>
56 #include <sys/modctl.h>
57 #include <sys/modhash.h>
58 #include <sys/mac.h>
59 #include <sys/mac_ether.h>
60 #include <sys/taskq.h>
61 #include <sys/note.h>
62 #include <sys/mach_descrip.h>
63 #include <sys/mac.h>
64 #include <sys/mdeg.h>
65 #include <sys/ldc.h>
66 #include <sys/vsw_fdb.h>
67 #include <sys/vsw.h>
68 #include <sys/vio_mailbox.h>
69 #include <sys/vnet_mailbox.h>
70 #include <sys/vnet_common.h>
71 #include <sys/vio_util.h>
72 #include <sys/sdt.h>
73 
74 /*
75  * Function prototypes.
76  */
77 static	int vsw_attach(dev_info_t *, ddi_attach_cmd_t);
78 static	int vsw_detach(dev_info_t *, ddi_detach_cmd_t);
79 static	int vsw_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
80 static	void vsw_get_md_properties(vsw_t *vswp);
81 static	int vsw_setup_layer2(vsw_t *);
82 static	int vsw_setup_layer3(vsw_t *);
83 
84 /* MAC layer routines */
85 static	int vsw_mac_attach(vsw_t *vswp);
86 static	void vsw_mac_detach(vsw_t *vswp);
87 static void vsw_notify_cb(void *, mac_notify_type_t);
88 static void vsw_rx_cb(void *, mac_resource_handle_t, mblk_t *);
89 static mblk_t *vsw_tx_msg(vsw_t *, mblk_t *);
90 static int vsw_mac_register(vsw_t *);
91 static int vsw_mac_unregister(vsw_t *);
92 static int vsw_m_stat(void *, uint_t, uint64_t *);
93 static void vsw_m_stop(void *arg);
94 static int vsw_m_start(void *arg);
95 static int vsw_m_unicst(void *arg, const uint8_t *);
96 static int vsw_m_multicst(void *arg, boolean_t, const uint8_t *);
97 static int vsw_m_promisc(void *arg, boolean_t);
98 static mblk_t *vsw_m_tx(void *arg, mblk_t *);
99 
100 /* MDEG routines */
101 static	void vsw_mdeg_register(vsw_t *vswp);
102 static	void vsw_mdeg_unregister(vsw_t *vswp);
103 static	int vsw_mdeg_cb(void *cb_argp, mdeg_result_t *);
104 
105 /* Port add/deletion routines */
106 static	int vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node);
107 static	int vsw_port_attach(vsw_t *vswp, int p_instance,
108 	uint64_t *ldcids, int nids, struct ether_addr *macaddr);
109 static	int vsw_detach_ports(vsw_t *vswp);
110 static	int vsw_port_detach(vsw_t *vswp, int p_instance);
111 static	int vsw_port_delete(vsw_port_t *port);
112 static	int vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id);
113 static	int vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id);
114 static	int vsw_init_ldcs(vsw_port_t *port);
115 static	int vsw_uninit_ldcs(vsw_port_t *port);
116 static	int vsw_ldc_init(vsw_ldc_t *ldcp);
117 static	int vsw_ldc_uninit(vsw_ldc_t *ldcp);
118 static	int vsw_drain_ldcs(vsw_port_t *port);
119 static	int vsw_drain_port_taskq(vsw_port_t *port);
120 static	void vsw_marker_task(void *);
121 static	vsw_port_t *vsw_lookup_port(vsw_t *vswp, int p_instance);
122 static	int vsw_plist_del_node(vsw_t *, vsw_port_t *port);
123 
124 /* Interrupt routines */
125 static	uint_t vsw_ldc_cb(uint64_t cb, caddr_t arg);
126 
127 /* Handshake routines */
128 static	void vsw_restart_handshake(vsw_ldc_t *);
129 static	int vsw_check_flag(vsw_ldc_t *, int, uint64_t);
130 static	void vsw_next_milestone(vsw_ldc_t *);
131 static	int vsw_supported_version(vio_ver_msg_t *);
132 
133 /* Data processing routines */
134 static void vsw_process_pkt(void *);
135 static void vsw_dispatch_ctrl_task(vsw_ldc_t *, void *, vio_msg_tag_t);
136 static void vsw_process_ctrl_pkt(void *);
137 static void vsw_process_ctrl_ver_pkt(vsw_ldc_t *, void *);
138 static void vsw_process_ctrl_attr_pkt(vsw_ldc_t *, void *);
139 static void vsw_process_ctrl_mcst_pkt(vsw_ldc_t *, void *);
140 static void vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *, void *);
141 static void vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *, void *);
142 static void vsw_process_ctrl_rdx_pkt(vsw_ldc_t *, void *);
143 static void vsw_process_data_pkt(vsw_ldc_t *, void *, vio_msg_tag_t);
144 static void vsw_process_data_dring_pkt(vsw_ldc_t *, void *);
145 static void vsw_process_data_raw_pkt(vsw_ldc_t *, void *);
146 static void vsw_process_data_ibnd_pkt(vsw_ldc_t *, void *);
147 static void vsw_process_err_pkt(vsw_ldc_t *, void *, vio_msg_tag_t);
148 
149 /* Switching/data transmit routines */
150 static	void vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller,
151 	    vsw_port_t *port, mac_resource_handle_t);
152 static	void vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller,
153 	    vsw_port_t *port, mac_resource_handle_t);
154 static	int vsw_forward_all(vsw_t *vswp, mblk_t *mp, int caller,
155 	    vsw_port_t *port);
156 static	int vsw_forward_grp(vsw_t *vswp, mblk_t *mp, int caller,
157 	    vsw_port_t *port);
158 static	int vsw_portsend(vsw_port_t *, mblk_t *);
159 static	int vsw_dringsend(vsw_ldc_t *, mblk_t *);
160 static	int vsw_descrsend(vsw_ldc_t *, mblk_t *);
161 
162 /* Packet creation routines */
163 static void vsw_send_ver(vsw_ldc_t *);
164 static void vsw_send_attr(vsw_ldc_t *);
165 static vio_dring_reg_msg_t *vsw_create_dring_info_pkt(vsw_ldc_t *);
166 static void vsw_send_dring_info(vsw_ldc_t *);
167 static void vsw_send_rdx(vsw_ldc_t *);
168 
169 static void vsw_send_msg(vsw_ldc_t *, void *, int);
170 
171 /* Forwarding database (FDB) routines */
172 static	int vsw_add_fdb(vsw_t *vswp, vsw_port_t *port);
173 static	int vsw_del_fdb(vsw_t *vswp, vsw_port_t *port);
174 static	vsw_port_t *vsw_lookup_fdb(vsw_t *vswp, struct ether_header *);
175 static	int vsw_add_rem_mcst(vnet_mcast_msg_t *, vsw_port_t *);
176 static	int vsw_add_mcst(vsw_t *, uint8_t, uint64_t, void *);
177 static	int vsw_del_mcst(vsw_t *, uint8_t, uint64_t, void *);
178 static	void vsw_del_addr(uint8_t, void *, uint64_t);
179 static	void vsw_del_mcst_port(vsw_port_t *);
180 static	void vsw_del_mcst_vsw(vsw_t *);
181 
182 /* Dring routines */
183 static dring_info_t *vsw_create_dring(vsw_ldc_t *);
184 static void vsw_create_privring(vsw_ldc_t *);
185 static int vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp);
186 static int vsw_dring_find_free_desc(dring_info_t *, vsw_private_desc_t **,
187     int *);
188 static dring_info_t *vsw_ident2dring(lane_t *, uint64_t);
189 
190 static void vsw_set_lane_attr(vsw_t *, lane_t *);
191 static int vsw_check_attr(vnet_attr_msg_t *, vsw_port_t *);
192 static int vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg);
193 static int vsw_mem_cookie_match(ldc_mem_cookie_t *, ldc_mem_cookie_t *);
194 static int vsw_check_dring_info(vio_dring_reg_msg_t *);
195 
196 /* Misc support routines */
197 static	caddr_t vsw_print_ethaddr(uint8_t *addr, char *ebuf);
198 static void vsw_free_lane_resources(vsw_ldc_t *, uint64_t);
199 static int vsw_free_ring(dring_info_t *);
200 
201 
202 /* Debugging routines */
203 static void dump_flags(uint64_t);
204 static void display_state(void);
205 static void display_lane(lane_t *);
206 static void display_ring(dring_info_t *);
207 
/* Global tunables. */
int	vsw_num_handshakes = 3;	/* number of handshake attempts */
int	vsw_wretries = 100;	/* number of write attempts */
int	vsw_chain_len = 150;	/* maximum number of mblks in a msg chain */
int	vsw_desc_delay = 0;	/* delay, in microseconds */
int	vsw_read_attempts = 5;	/* number of reads of a descriptor */
213 
214 uint32_t	vsw_mblk_size = VSW_MBLK_SIZE;
215 uint32_t	vsw_num_mblks = VSW_NUM_MBLKS;
216 
217 
218 /*
219  * mode specific frame switching function
220  */
221 void		(*vsw_switch_frame)(vsw_t *, mblk_t *, int, vsw_port_t *,
222 			mac_resource_handle_t);
223 
224 static	mac_callbacks_t	vsw_m_callbacks = {
225 	0,
226 	vsw_m_stat,
227 	vsw_m_start,
228 	vsw_m_stop,
229 	vsw_m_promisc,
230 	vsw_m_multicst,
231 	vsw_m_unicst,
232 	vsw_m_tx,
233 	NULL,
234 	NULL,
235 	NULL
236 };
237 
238 static	struct	cb_ops	vsw_cb_ops = {
239 	nulldev,			/* cb_open */
240 	nulldev,			/* cb_close */
241 	nodev,				/* cb_strategy */
242 	nodev,				/* cb_print */
243 	nodev,				/* cb_dump */
244 	nodev,				/* cb_read */
245 	nodev,				/* cb_write */
246 	nodev,				/* cb_ioctl */
247 	nodev,				/* cb_devmap */
248 	nodev,				/* cb_mmap */
249 	nodev,				/* cb_segmap */
250 	nochpoll,			/* cb_chpoll */
251 	ddi_prop_op,			/* cb_prop_op */
252 	NULL,				/* cb_stream */
253 	D_MP,				/* cb_flag */
254 	CB_REV,				/* rev */
255 	nodev,				/* int (*cb_aread)() */
256 	nodev				/* int (*cb_awrite)() */
257 };
258 
259 static	struct	dev_ops	vsw_ops = {
260 	DEVO_REV,		/* devo_rev */
261 	0,			/* devo_refcnt */
262 	vsw_getinfo,		/* devo_getinfo */
263 	nulldev,		/* devo_identify */
264 	nulldev,		/* devo_probe */
265 	vsw_attach,		/* devo_attach */
266 	vsw_detach,		/* devo_detach */
267 	nodev,			/* devo_reset */
268 	&vsw_cb_ops,		/* devo_cb_ops */
269 	(struct bus_ops *)NULL,	/* devo_bus_ops */
270 	ddi_power		/* devo_power */
271 };
272 
273 extern	struct	mod_ops	mod_driverops;
274 static struct modldrv vswmodldrv = {
275 	&mod_driverops,
276 	"sun4v Virtual Switch Driver %I%",
277 	&vsw_ops,
278 };
279 
/*
 * Take / drop both per-channel locks of an ldc.
 *
 * LDC_ENTER_LOCK acquires ldc_cblock first and then ldc_txlock;
 * LDC_EXIT_LOCK releases them in the opposite order.
 *
 * Each macro is wrapped in do { } while (0) so that it expands to exactly
 * one statement and is safe as the body of an unbraced if/else. Callers
 * terminate the invocation with a semicolon, e.g. LDC_ENTER_LOCK(ldcp);
 */
#define	LDC_ENTER_LOCK(ldcp)					\
	do {							\
		mutex_enter(&((ldcp)->ldc_cblock));		\
		mutex_enter(&((ldcp)->ldc_txlock));		\
		_NOTE(CONSTCOND)				\
	} while (0)
#define	LDC_EXIT_LOCK(ldcp)					\
	do {							\
		mutex_exit(&((ldcp)->ldc_txlock));		\
		mutex_exit(&((ldcp)->ldc_cblock));		\
		_NOTE(CONSTCOND)				\
	} while (0)
286 
287 /* Driver soft state ptr  */
288 static void	*vsw_state;
289 
290 /*
291  * Linked list of "vsw_t" structures - one per instance.
292  */
293 vsw_t		*vsw_head = NULL;
294 krwlock_t	vsw_rw;
295 
/*
 * Names of the machine description (MD) and OBP properties and nodes
 * that this driver looks up.
 */
static char vdev_propname[] = "virtual-device";
static char vsw_propname[] = "virtual-network-switch";
static char physdev_propname[] = "vsw-phys-dev";
static char smode_propname[] = "vsw-switch-mode";
static char macaddr_propname[] = "local-mac-address";
static char remaddr_propname[] = "remote-mac-address";
static char ldcids_propname[] = "ldc-ids";
static char chan_propname[] = "channel-endpoint";
static char id_propname[] = "id";
static char reg_propname[] = "reg";
309 
310 /* supported versions */
311 static	ver_sup_t	vsw_versions[] = { {1, 0} };
312 
313 /*
314  * Matching criteria passed to the MDEG to register interest
315  * in changes to 'virtual-device-port' nodes identified by their
316  * 'id' property.
317  */
318 static md_prop_match_t vport_prop_match[] = {
319 	{ MDET_PROP_VAL,    "id"   },
320 	{ MDET_LIST_END,    NULL    }
321 };
322 
323 static mdeg_node_match_t vport_match = { "virtual-device-port",
324 						vport_prop_match };
325 
326 /*
327  * Specification of an MD node passed to the MDEG to filter any
328  * 'vport' nodes that do not belong to the specified node. This
329  * template is copied for each vsw instance and filled in with
330  * the appropriate 'cfg-handle' value before being passed to the MDEG.
331  */
332 static mdeg_prop_spec_t vsw_prop_template[] = {
333 	{ MDET_PROP_STR,    "name",		vsw_propname },
334 	{ MDET_PROP_VAL,    "cfg-handle",	NULL	},
335 	{ MDET_LIST_END,    NULL,		NULL	}
336 };
337 
338 #define	VSW_SET_MDEG_PROP_INST(specp, val)	(specp)[1].ps_val = (val);
339 
/*
 * Debug message mask, tested by the D1/D2/D3/DWARN/DERR macros in DEBUG
 * builds. 0x0 turns all messages off, 0x1f enables all of them.
 *
 * Bits:
 *	0x01	Function entry/exit tracing
 *	0x02	Internal function messages
 *	0x04	Verbose internal messages
 *	0x08	Warning messages
 *	0x10	Error messages
 */
int vswdbg = 0x0;
354 
355 static void
356 vswdebug(vsw_t *vswp, const char *fmt, ...)
357 {
358 	char buf[512];
359 	va_list ap;
360 
361 	va_start(ap, fmt);
362 	(void) vsprintf(buf, fmt, ap);
363 	va_end(ap);
364 
365 	if (vswp == NULL)
366 		cmn_err(CE_CONT, "%s\n", buf);
367 	else
368 		cmn_err(CE_CONT, "vsw%d: %s\n", vswp->instance, buf);
369 }
370 
/*
 * The state dump helpers are switched on and off by their own private
 * flag, DUMP_STATE, independently of vswdbg. With DUMP_STATE set to 0 all
 * of the DUMP_* / DISPLAY_STATE macros expand to nothing.
 */
#define	DUMP_STATE	0

#if DUMP_STATE

/* Print the three fields of a message tag passed by value. */
#define	DUMP_TAG(tag)						\
{								\
	D1(NULL, "DUMP_TAG: type 0x%llx", (tag).vio_msgtype);	\
	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag).vio_subtype);	\
	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag).vio_subtype_env);	\
}

/* As DUMP_TAG, but for a pointer to a message tag. */
#define	DUMP_TAG_PTR(tag)					\
{								\
	D1(NULL, "DUMP_TAG: type 0x%llx", (tag)->vio_msgtype);	\
	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag)->vio_subtype);	\
	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag)->vio_subtype_env);	\
}

#define	DUMP_FLAGS(flags) dump_flags(flags);
#define	DISPLAY_STATE()	display_state()

#else	/* !DUMP_STATE */

#define	DUMP_TAG(tag)
#define	DUMP_TAG_PTR(tag)
#define	DUMP_FLAGS(state)
#define	DISPLAY_STATE()

#endif	/* DUMP_STATE */
404 
/*
 * Debug print macros. Each one expands to an 'if (<condition>) vswdebug'
 * prefix, so the argument list written after the macro name becomes the
 * vswdebug() call. In DEBUG builds the condition tests the matching bit in
 * vswdbg; otherwise it is the constant 0 and nothing is printed.
 *
 * Because the expansion is a bare 'if' with no 'else', only use these
 * macros as a complete statement inside a braced block; an 'else' that
 * follows an unbraced use would bind to the macro's hidden 'if'.
 */
#ifdef DEBUG

#define	D1		if (vswdbg & 0x01)	vswdebug
#define	D2		if (vswdbg & 0x02)	vswdebug
#define	D3		if (vswdbg & 0x04)	vswdebug
#define	DWARN		if (vswdbg & 0x08)	vswdebug
#define	DERR		if (vswdbg & 0x10)	vswdebug

#else	/* !DEBUG */

#define	D1		if (0)	vswdebug
#define	D2		if (0)	vswdebug
#define	D3		if (0)	vswdebug
#define	DWARN		if (0)	vswdebug
#define	DERR		if (0)	vswdebug

#endif	/* DEBUG */
436 
437 static struct modlinkage modlinkage = {
438 	MODREV_1,
439 	&vswmodldrv,
440 	NULL
441 };
442 
443 int
444 _init(void)
445 {
446 	int status;
447 
448 	rw_init(&vsw_rw, NULL, RW_DRIVER, NULL);
449 
450 	status = ddi_soft_state_init(&vsw_state, sizeof (vsw_t), 1);
451 	if (status != 0) {
452 		return (status);
453 	}
454 
455 	mac_init_ops(&vsw_ops, "vsw");
456 	status = mod_install(&modlinkage);
457 	if (status != 0) {
458 		ddi_soft_state_fini(&vsw_state);
459 	}
460 	return (status);
461 }
462 
463 int
464 _fini(void)
465 {
466 	int status;
467 
468 	status = mod_remove(&modlinkage);
469 	if (status != 0)
470 		return (status);
471 	mac_fini_ops(&vsw_ops);
472 	ddi_soft_state_fini(&vsw_state);
473 
474 	rw_destroy(&vsw_rw);
475 
476 	return (status);
477 }
478 
479 int
480 _info(struct modinfo *modinfop)
481 {
482 	return (mod_info(&modlinkage, modinfop));
483 }
484 
485 static int
486 vsw_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
487 {
488 	vsw_t		*vswp;
489 	int		smode, instance, i;
490 	char		hashname[MAXNAMELEN];
491 	char		qname[TASKQ_NAMELEN];
492 	int		rv = 1;
493 	enum		{ PROG_init = 0x0, PROG_if_lock = 0x1,
494 				PROG_fdb = 0x2, PROG_mfdb = 0x4,
495 				PROG_report_dev = 0x8, PROG_plist = 0x10,
496 				PROG_taskq = 0x20}
497 			progress;
498 
499 	progress = PROG_init;
500 
501 	switch (cmd) {
502 	case DDI_ATTACH:
503 		break;
504 	case DDI_RESUME:
505 		/* nothing to do for this non-device */
506 		return (DDI_SUCCESS);
507 	case DDI_PM_RESUME:
508 	default:
509 		return (DDI_FAILURE);
510 	}
511 
512 	instance = ddi_get_instance(dip);
513 	if (ddi_soft_state_zalloc(vsw_state, instance) != DDI_SUCCESS) {
514 		DERR(NULL, "vsw%d: ddi_soft_state_zalloc failed", instance);
515 		return (DDI_FAILURE);
516 	}
517 	vswp = ddi_get_soft_state(vsw_state, instance);
518 
519 	if (vswp == NULL) {
520 		DERR(NULL, "vsw%d: ddi_get_soft_state failed", instance);
521 		goto vsw_attach_fail;
522 	}
523 
524 	vswp->dip = dip;
525 	vswp->instance = instance;
526 	ddi_set_driver_private(dip, (caddr_t)vswp);
527 
528 	rw_init(&vswp->if_lockrw, NULL, RW_DRIVER, NULL);
529 
530 	progress |= PROG_if_lock;
531 
532 	/*
533 	 * User specifies (via MD) an array of switching modes in
534 	 * decreasing order of preference. Default mode is always
535 	 * layer 2 (mac switching), so init array with that value.
536 	 */
537 	vswp->smode_idx = 0;
538 	for (i = 0; i < NUM_SMODES; i++)
539 		vswp->smode[i] = VSW_LAYER2;
540 
541 	/*
542 	 * Get the various properties such as physical device name
543 	 * (vsw-phys-dev), switch mode etc from the MD.
544 	 */
545 	vsw_get_md_properties(vswp);
546 
547 	/* setup the unicast forwarding database  */
548 	(void) snprintf(hashname, MAXNAMELEN, "vsw_unicst_table-%d",
549 							vswp->instance);
550 	D2(vswp, "creating unicast hash table (%s)...", hashname);
551 	vswp->fdb = mod_hash_create_ptrhash(hashname, VSW_NCHAINS,
552 		mod_hash_null_valdtor, sizeof (void *));
553 
554 	progress |= PROG_fdb;
555 
556 	/* setup the multicast fowarding database */
557 	(void) snprintf(hashname, MAXNAMELEN, "vsw_mcst_table-%d",
558 							vswp->instance);
559 	D2(vswp, "creating multicast hash table %s)...", hashname);
560 	rw_init(&vswp->mfdbrw, NULL, RW_DRIVER, NULL);
561 	vswp->mfdb = mod_hash_create_ptrhash(hashname, VSW_NCHAINS,
562 			mod_hash_null_valdtor, sizeof (void *));
563 
564 	progress |= PROG_mfdb;
565 
566 	/*
567 	 * create lock protecting list of multicast addresses
568 	 * which could come via m_multicst() entry point when plumbed.
569 	 */
570 	mutex_init(&vswp->mca_lock, NULL, MUTEX_DRIVER, NULL);
571 	vswp->mcap = NULL;
572 
573 	ddi_report_dev(vswp->dip);
574 
575 	progress |= PROG_report_dev;
576 
577 	WRITE_ENTER(&vsw_rw);
578 	vswp->next = vsw_head;
579 	vsw_head = vswp;
580 	RW_EXIT(&vsw_rw);
581 
582 	/* setup the port list */
583 	rw_init(&vswp->plist.lockrw, NULL, RW_DRIVER, NULL);
584 	vswp->plist.head = NULL;
585 
586 	progress |= PROG_plist;
587 
588 	/*
589 	 * Create the taskq which will process all the VIO
590 	 * control messages.
591 	 */
592 	(void) snprintf(qname, TASKQ_NAMELEN, "vsw_taskq%d", vswp->instance);
593 	if ((vswp->taskq_p = ddi_taskq_create(vswp->dip, qname, 1,
594 					TASKQ_DEFAULTPRI, 0)) == NULL) {
595 		cmn_err(CE_WARN, "Unable to create task queue");
596 		goto vsw_attach_fail;
597 	}
598 
599 	progress |= PROG_taskq;
600 
601 	/* select best switching mode */
602 	for (i = 0; i < NUM_SMODES; i++) {
603 		smode = vswp->smode[i];
604 		switch (smode) {
605 		case VSW_LAYER2:
606 			rv = vsw_setup_layer2(vswp);
607 			break;
608 
609 		case VSW_LAYER2_PROMISC:
610 			rv = vsw_setup_layer2(vswp);
611 			break;
612 
613 		case VSW_LAYER3:
614 			rv = vsw_setup_layer3(vswp);
615 			break;
616 
617 		default:
618 			DERR(vswp, "unknown switch mode");
619 			break;
620 		}
621 
622 		if (rv == 0) {
623 			vswp->smode_idx = i;
624 			break;
625 		}
626 	}
627 
628 	if (rv == 1) {
629 		cmn_err(CE_WARN, "Unable to setup switching mode");
630 		goto vsw_attach_fail;
631 	}
632 
633 	D2(vswp, "Operating in mode %d", vswp->smode[vswp->smode_idx]);
634 
635 	/*
636 	 * Register with the MAC layer as a network device so
637 	 * we can be plumbed if desired.
638 	 *
639 	 * Do this in both layer 2 and layer 3 mode.
640 	 */
641 	vswp->if_state &= ~VSW_IF_UP;
642 	if (vswp->mdprops & VSW_MD_MACADDR) {
643 		if (vsw_mac_register(vswp) != 0) {
644 			cmn_err(CE_WARN, "Unable to register as provider "
645 				" with MAC layer, continuing with attach");
646 		}
647 	}
648 
649 	/* prevent auto-detaching */
650 	if (ddi_prop_update_int(DDI_DEV_T_NONE, vswp->dip,
651 				DDI_NO_AUTODETACH, 1) != DDI_SUCCESS) {
652 		cmn_err(CE_NOTE, "Unable to set \"%s\" property for "
653 			"instance %u", DDI_NO_AUTODETACH, instance);
654 	}
655 
656 	/*
657 	 * Now we have everything setup, register for MD change
658 	 * events.
659 	 */
660 	vsw_mdeg_register(vswp);
661 
662 	return (DDI_SUCCESS);
663 
664 vsw_attach_fail:
665 	DERR(NULL, "vsw_attach: failed");
666 
667 	if (progress & PROG_taskq)
668 		ddi_taskq_destroy(vswp->taskq_p);
669 
670 	if (progress & PROG_plist)
671 		rw_destroy(&vswp->plist.lockrw);
672 
673 	if (progress & PROG_report_dev) {
674 		ddi_remove_minor_node(dip, NULL);
675 		mutex_destroy(&vswp->mca_lock);
676 	}
677 
678 	if (progress & PROG_mfdb) {
679 		mod_hash_destroy_hash(vswp->mfdb);
680 		vswp->mfdb = NULL;
681 		rw_destroy(&vswp->mfdbrw);
682 	}
683 
684 	if (progress & PROG_fdb) {
685 		mod_hash_destroy_hash(vswp->fdb);
686 		vswp->fdb = NULL;
687 	}
688 
689 	if (progress & PROG_if_lock)
690 		rw_destroy(&vswp->if_lockrw);
691 
692 	ddi_soft_state_free(vsw_state, instance);
693 	return (DDI_FAILURE);
694 }
695 
696 static int
697 vsw_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
698 {
699 	vio_mblk_pool_t		*poolp, *npoolp;
700 	vsw_t			**vswpp, *vswp;
701 	int 			instance;
702 
703 	instance = ddi_get_instance(dip);
704 	vswp = ddi_get_soft_state(vsw_state, instance);
705 
706 	if (vswp == NULL) {
707 		return (DDI_FAILURE);
708 	}
709 
710 	switch (cmd) {
711 	case DDI_DETACH:
712 		break;
713 	case DDI_SUSPEND:
714 	case DDI_PM_SUSPEND:
715 	default:
716 		return (DDI_FAILURE);
717 	}
718 
719 	D2(vswp, "detaching instance %d", instance);
720 
721 	if (vswp->mdprops & VSW_MD_MACADDR) {
722 		if (vsw_mac_unregister(vswp) != 0) {
723 			cmn_err(CE_WARN, "Unable to detach from MAC layer");
724 			return (DDI_FAILURE);
725 		}
726 		rw_destroy(&vswp->if_lockrw);
727 	}
728 
729 	vsw_mdeg_unregister(vswp);
730 
731 	if ((vswp->smode[vswp->smode_idx] == VSW_LAYER2) ||
732 		(vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) {
733 		vsw_mac_detach(vswp);
734 	}
735 
736 	if (vsw_detach_ports(vswp) != 0) {
737 		cmn_err(CE_WARN, "Unable to detach ports");
738 		return (DDI_FAILURE);
739 	}
740 
741 	/*
742 	 * Destroy any free pools that may still exist.
743 	 */
744 	poolp = vswp->rxh;
745 	while (poolp != NULL) {
746 		npoolp = vswp->rxh = poolp->nextp;
747 		if (vio_destroy_mblks(poolp) != 0) {
748 			vswp->rxh = poolp;
749 			return (DDI_FAILURE);
750 		}
751 		poolp = npoolp;
752 	}
753 
754 	/*
755 	 * Remove this instance from any entries it may be on in
756 	 * the hash table by using the list of addresses maintained
757 	 * in the vsw_t structure.
758 	 */
759 	vsw_del_mcst_vsw(vswp);
760 
761 	vswp->mcap = NULL;
762 	mutex_destroy(&vswp->mca_lock);
763 
764 	/*
765 	 * By now any pending tasks have finished and the underlying
766 	 * ldc's have been destroyed, so its safe to delete the control
767 	 * message taskq.
768 	 */
769 	if (vswp->taskq_p != NULL)
770 		ddi_taskq_destroy(vswp->taskq_p);
771 
772 	/*
773 	 * At this stage all the data pointers in the hash table
774 	 * should be NULL, as all the ports have been removed and will
775 	 * have deleted themselves from the port lists which the data
776 	 * pointers point to. Hence we can destroy the table using the
777 	 * default destructors.
778 	 */
779 	D2(vswp, "vsw_detach: destroying hash tables..");
780 	mod_hash_destroy_hash(vswp->fdb);
781 	vswp->fdb = NULL;
782 
783 	WRITE_ENTER(&vswp->mfdbrw);
784 	mod_hash_destroy_hash(vswp->mfdb);
785 	vswp->mfdb = NULL;
786 	RW_EXIT(&vswp->mfdbrw);
787 	rw_destroy(&vswp->mfdbrw);
788 
789 	ddi_remove_minor_node(dip, NULL);
790 
791 	rw_destroy(&vswp->plist.lockrw);
792 	WRITE_ENTER(&vsw_rw);
793 	for (vswpp = &vsw_head; *vswpp; vswpp = &(*vswpp)->next) {
794 		if (*vswpp == vswp) {
795 			*vswpp = vswp->next;
796 			break;
797 		}
798 	}
799 	RW_EXIT(&vsw_rw);
800 	ddi_soft_state_free(vsw_state, instance);
801 
802 	return (DDI_SUCCESS);
803 }
804 
805 static int
806 vsw_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
807 {
808 	_NOTE(ARGUNUSED(dip))
809 
810 	vsw_t	*vswp = NULL;
811 	dev_t	dev = (dev_t)arg;
812 	int	instance;
813 
814 	instance = getminor(dev);
815 
816 	switch (infocmd) {
817 	case DDI_INFO_DEVT2DEVINFO:
818 		if ((vswp = ddi_get_soft_state(vsw_state, instance)) == NULL) {
819 			*result = NULL;
820 			return (DDI_FAILURE);
821 		}
822 		*result = vswp->dip;
823 		return (DDI_SUCCESS);
824 
825 	case DDI_INFO_DEVT2INSTANCE:
826 		*result = (void *)(uintptr_t)instance;
827 		return (DDI_SUCCESS);
828 
829 	default:
830 		*result = NULL;
831 		return (DDI_FAILURE);
832 	}
833 }
834 
835 /*
836  * Get the properties from our MD node.
837  */
838 static void
839 vsw_get_md_properties(vsw_t *vswp)
840 {
841 	md_t		*mdp = NULL;
842 	int		num_nodes = 0;
843 	int		len = 0, listsz = 0;
844 	int		num_vdev = 0;
845 	int		i, idx;
846 	boolean_t	found_node = B_FALSE;
847 	char		*smode = NULL;
848 	char		*curr_mode = NULL;
849 	char		*physname = NULL;
850 	char		*node_name = NULL;
851 	char		*dev;
852 	uint64_t 	macaddr = 0;
853 	uint64_t	md_inst, obp_inst;
854 	mde_cookie_t	*listp = NULL;
855 	mde_cookie_t	rootnode;
856 
857 	D1(vswp, "%s: enter", __func__);
858 
859 	/*
860 	 * Further down we compare the obp 'reg' property to the
861 	 * 'cfg-handle' property in the vsw MD node to determine
862 	 * if the node refers to this particular instance. So if
863 	 * we can't read the obp value then there is no point
864 	 * in proceeding further.
865 	 */
866 	if (ddi_prop_exists(DDI_DEV_T_ANY, vswp->dip,
867 			DDI_PROP_DONTPASS, reg_propname) != 1) {
868 		cmn_err(CE_WARN, "Unable to read %s property "
869 			"from OBP device node", reg_propname);
870 		return;
871 	}
872 
873 	obp_inst = ddi_prop_get_int(DDI_DEV_T_ANY, vswp->dip,
874 		DDI_PROP_DONTPASS, reg_propname, 0);
875 
876 	D2(vswp, "%s: obp_inst 0x%llx", __func__, obp_inst);
877 
878 	if ((mdp = md_get_handle()) == NULL) {
879 		DERR(vswp, "%s: unable to init MD", __func__);
880 		return;
881 	}
882 
883 	if ((num_nodes = md_node_count(mdp)) <= 0) {
884 		DERR(vswp, "%s: invalid number of  nodes found %d",
885 			__func__, num_nodes);
886 		(void) md_fini_handle(mdp);
887 		return;
888 	}
889 
890 	D2(vswp, "%s: %d nodes in total in MD", __func__, num_nodes);
891 
892 	/* allocate enough space for node list */
893 	listsz = num_nodes * sizeof (mde_cookie_t);
894 	listp = kmem_zalloc(listsz, KM_SLEEP);
895 
896 	rootnode = md_root_node(mdp);
897 
898 	/* Get the list of virtual devices */
899 	num_vdev = md_scan_dag(mdp, rootnode,
900 		md_find_name(mdp, vdev_propname),
901 		md_find_name(mdp, "fwd"), listp);
902 
903 	if (num_vdev <= 0) {
904 		DERR(vswp, "%s: didn't find any virtual-device nodes in MD",
905 			__func__);
906 		goto md_prop_exit;
907 	}
908 
909 	D2(vswp, "%s: %d virtual-device nodes found", __func__, num_vdev);
910 
911 	/* Look for the virtual switch nodes in the list */
912 	for (idx = 0; idx < num_vdev; idx++) {
913 		if (md_get_prop_str(mdp, listp[idx],
914 				"name", &node_name) != 0) {
915 			DERR(vswp, "%s: unable to get node name", __func__);
916 			continue;
917 
918 		}
919 
920 		if (strcmp(node_name, vsw_propname) == 0) {
921 			/* Virtual switch node */
922 			if (md_get_prop_val(mdp, listp[idx],
923 				"cfg-handle", &md_inst) != 0) {
924 				DERR(vswp, "%s: unable to get cfg-handle from"
925 					" node %d", __func__, idx);
926 				goto md_prop_exit;
927 			} else if (md_inst == obp_inst) {
928 				D2(vswp, "%s: found matching node (%d)"
929 					" 0x%llx == 0x%llx", __func__, idx,
930 					md_inst, obp_inst);
931 				found_node = B_TRUE;
932 				break;
933 			}
934 		}
935 	}
936 
937 	if (!found_node) {
938 		DWARN(vswp, "%s: couldn't find correct vsw node", __func__);
939 		goto md_prop_exit;
940 	}
941 
942 	/*
943 	 * Now, having found the correct node, get the various properties.
944 	 */
945 
946 	if (md_get_prop_data(mdp, listp[idx], physdev_propname,
947 				(uint8_t **)(&physname), &len) != 0) {
948 		cmn_err(CE_WARN, "%s: unable to get name(s) of physical "
949 			"device(s) from MD", __func__);
950 	} else if ((strlen(physname) + 1) > LIFNAMSIZ) {
951 		cmn_err(CE_WARN, "%s is too long a device name", physname);
952 	} else {
953 		(void) strncpy(vswp->physname, physname, strlen(physname) + 1);
954 		vswp->mdprops |= VSW_MD_PHYSNAME;
955 		D2(vswp, "%s: using first device specified (%s)",
956 			__func__, vswp->physname);
957 	}
958 
959 #ifdef DEBUG
960 	/*
961 	 * As a temporary measure to aid testing we check to see if there
962 	 * is a vsw.conf file present. If there is we use the value of the
963 	 * vsw_physname property in the file as the name of the physical
964 	 * device, overriding the value from the MD.
965 	 *
966 	 * There may be multiple devices listed, but for the moment
967 	 * we just use the first one.
968 	 */
969 	if (ddi_prop_lookup_string(DDI_DEV_T_ANY, vswp->dip, 0,
970 		"vsw_physname", &dev) == DDI_PROP_SUCCESS) {
971 		if ((strlen(dev) + 1) > LIFNAMSIZ) {
972 			cmn_err(CE_WARN, "%s is too long a device name", dev);
973 		} else {
974 			cmn_err(CE_NOTE, "%s: using device name (%s) from "
975 				"config file", __func__, dev);
976 
977 			(void) strncpy(vswp->physname, dev, strlen(dev) + 1);
978 			vswp->mdprops |= VSW_MD_PHYSNAME;
979 		}
980 
981 		ddi_prop_free(dev);
982 
983 	}
984 #endif
985 
986 	/* local mac address */
987 	if (md_get_prop_val(mdp, listp[idx],
988 			macaddr_propname, &macaddr) != 0) {
989 		cmn_err(CE_WARN, "%s: unable to get local MAC address",
990 								__func__);
991 	} else {
992 		READ_ENTER(&vswp->if_lockrw);
993 		for (i = ETHERADDRL - 1; i >= 0; i--) {
994 			vswp->if_addr.ether_addr_octet[i] = macaddr & 0xFF;
995 			macaddr >>= 8;
996 		}
997 		RW_EXIT(&vswp->if_lockrw);
998 		vswp->mdprops |= VSW_MD_MACADDR;
999 	}
1000 
1001 	/*
1002 	 * Get the switch-mode property. The modes are listed in
1003 	 * decreasing order of preference, i.e. prefered mode is
1004 	 * first item in list.
1005 	 */
1006 	len = 0;
1007 	if (md_get_prop_data(mdp, listp[idx], smode_propname,
1008 				(uint8_t **)(&smode), &len) != 0) {
1009 		/*
1010 		 * Unable to get switch-mode property, so just use
1011 		 * default values which vswp->smode[] array has already
1012 		 * been pre-populated with, namely layer2.
1013 		 */
1014 		cmn_err(CE_WARN, "%s: unable to get switch mode property, "
1015 			"defaulting to layer 2 mode", __func__);
1016 	} else {
1017 		i = 0;
1018 		curr_mode = smode;
1019 		/*
1020 		 * Modes of operation:
1021 		 * 'switched'	 - layer 2 switching, underlying HW in
1022 		 *			non-promiscuous mode.
1023 		 * 'promiscuous' - layer 2 switching, underlying HW in
1024 		 *			promiscuous mode.
1025 		 * 'routed'	 - layer 3 (i.e. IP) routing, underlying HW
1026 		 *			in non-promiscuous mode.
1027 		 */
1028 		while ((curr_mode < (smode + len)) && (i < NUM_SMODES)) {
1029 			D2(vswp, "%s: curr_mode = [%s]", __func__, curr_mode);
1030 			if (strcmp(curr_mode, "switched") == 0)
1031 				vswp->smode[i] = VSW_LAYER2;
1032 			else if (strcmp(curr_mode, "promiscuous") == 0)
1033 				vswp->smode[i] = VSW_LAYER2_PROMISC;
1034 			else if (strcmp(curr_mode, "routed") == 0)
1035 				vswp->smode[i] = VSW_LAYER3;
1036 			else {
1037 				DERR(vswp, "%s: unknown mode %s",
1038 					__func__, curr_mode);
1039 				/* default to layer 2 */
1040 				vswp->smode[i] = VSW_LAYER2;
1041 			}
1042 			curr_mode += strlen(curr_mode) + 1;
1043 			i++;
1044 		}
1045 
1046 		vswp->mdprops |= VSW_MD_SMODE;
1047 	}
1048 
1049 md_prop_exit:
1050 	(void) md_fini_handle(mdp);
1051 
1052 	kmem_free(listp, listsz);
1053 
1054 	D1(vswp, "%s: exit", __func__);
1055 }
1056 
1057 static int
1058 vsw_setup_layer2(vsw_t *vswp)
1059 {
1060 	int		rv = 0;
1061 
1062 	D1(vswp, "%s: enter", __func__);
1063 
1064 	vsw_switch_frame = vsw_switch_l2_frame;
1065 
1066 	/*
1067 	 * Attempt to link into the MAC layer so we can get
1068 	 * and send packets out over the physical adapter.
1069 	 */
1070 	if (vswp->mdprops & VSW_MD_PHYSNAME) {
1071 		if (vsw_mac_attach(vswp) != 0) {
1072 			/*
1073 			 * Registration with the MAC layer has failed,
1074 			 * so return 1 so that can fall back to next
1075 			 * prefered switching method.
1076 			 */
1077 			cmn_err(CE_WARN, "!unable to join as MAC layer "
1078 				"client, continuing with attach");
1079 			rv = 1;
1080 		}
1081 	} else {
1082 		/* No physical device name found in MD */
1083 		DERR(vswp, "%s: no physical device name specified", __func__);
1084 		rv = 1;
1085 	}
1086 
1087 	D1(vswp, "%s: exit", __func__);
1088 
1089 	return (rv);
1090 }
1091 
1092 static int
1093 vsw_setup_layer3(vsw_t *vswp)
1094 {
1095 	D1(vswp, "%s: enter", __func__);
1096 
1097 	D2(vswp, "%s: operating in layer 3 mode", __func__);
1098 	vsw_switch_frame = vsw_switch_l3_frame;
1099 
1100 	D1(vswp, "%s: exit", __func__);
1101 
1102 	return (0);
1103 }
1104 
/*
 * Link into the MAC layer to gain access to the services provided by
 * the underlying physical device driver (which should also have
 * registered with the MAC layer).
 *
 * Only when in layer 2 mode.
 *
 * On success the MAC handle (mh), notify handle (mnh), receive handle
 * (mrh), transmit info (txinfo) and the device's original promiscuous
 * setting (init_promisc) are recorded in the vsw_t and 0 is returned.
 *
 * On any failure everything registered so far is torn down, the
 * handles and txinfo are reset to NULL, and 1 is returned.
 */
static int
vsw_mac_attach(vsw_t *vswp)
{
	char	drv[LIFNAMSIZ];
	uint_t	ddi_instance;

	D1(vswp, "vsw_mac_attach: enter");

	/* start from a known state so the failure path can test handles */
	vswp->mh = NULL;
	vswp->mrh = NULL;
	vswp->mnh = NULL;

	ASSERT(vswp->mdprops & VSW_MD_PHYSNAME);

	/* split the physical device name to obtain its instance number */
	if (ddi_parse(vswp->physname, drv, &ddi_instance) != DDI_SUCCESS) {
		cmn_err(CE_WARN, "invalid device name: %s", vswp->physname);
		goto mac_fail_exit;
	}
	if ((mac_open(vswp->physname, ddi_instance, &vswp->mh)) != 0) {
		cmn_err(CE_WARN, "mac_open %s failed", vswp->physname);
		goto mac_fail_exit;
	}

	D2(vswp, "vsw_mac_attach: using device %s", vswp->physname);

	/* register for changes in the interface */
	vswp->mnh = mac_notify_add(vswp->mh, vsw_notify_cb, (void *)vswp);

	/* register our rx callback function */
	vswp->mrh = mac_rx_add(vswp->mh, vsw_rx_cb, (void *)vswp);

	/* get the MAC tx fn */
	vswp->txinfo = mac_tx_get(vswp->mh);

	/* start the interface */
	if (mac_start(vswp->mh) != 0) {
		cmn_err(CE_WARN, "could not start mac interface");
		goto mac_fail_exit;
	}

	/* get and store original promisc setting */
	vswp->init_promisc = mac_promisc_get(vswp->mh, MAC_DEVPROMISC);

	/*
	 * FUTURE: When we have the ability to set multiple unicast
	 * mac addresses then we won't have to set the device into
	 * promisc mode, but for the moment it's the only way we
	 * can see pkts that logical domains we are serving are
	 * interested in.
	 */
	if ((vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC) &&
					(vswp->init_promisc == B_FALSE)) {
		DERR(vswp, "vsw_mac_attach: enabling promisc mode..");

		if (mac_promisc_set(vswp->mh, B_TRUE, MAC_DEVPROMISC) != 0) {
			DERR(vswp, "vsw_mac_attach: unable to set device"
				" into promiscuous mode");
			goto mac_fail_exit;
		}
	}

	D1(vswp, "vsw_mac_attach: exit");
	return (0);

mac_fail_exit:
	/*
	 * Common failure path: undo whatever was set up above.
	 *
	 * NOTE(review): when we arrive here because mac_start() failed,
	 * init_promisc has not been read from the device during this
	 * call, so the value passed to mac_promisc_set() below is
	 * whatever the field held beforehand - confirm this is benign.
	 * NOTE(review): a successful mac_start() is not paired with a
	 * stop call on this path - confirm whether one is required.
	 */
	if (vswp->mh != NULL) {
		mac_promisc_set(vswp->mh, vswp->init_promisc, MAC_DEVPROMISC);
		if (vswp->mrh != NULL)
			mac_rx_remove(vswp->mh, vswp->mrh);

		if (vswp->mnh != NULL)
			mac_notify_remove(vswp->mh, vswp->mnh);

		mac_close(vswp->mh);
	}

	vswp->mrh = NULL;
	vswp->mnh = NULL;
	vswp->mh = NULL;
	vswp->txinfo = NULL;

	D1(vswp, "vsw_mac_attach: fail exit");
	return (1);
}
1196 
1197 static void
1198 vsw_mac_detach(vsw_t *vswp)
1199 {
1200 	D1(vswp, "vsw_mac_detach: enter");
1201 
1202 	if (vswp->mh != NULL) {
1203 		/* restore promisc to original setting */
1204 		mac_promisc_set(vswp->mh, vswp->init_promisc, MAC_DEVPROMISC);
1205 		if (vswp->mrh != NULL)
1206 			mac_rx_remove(vswp->mh, vswp->mrh);
1207 
1208 		if (vswp->mnh != NULL)
1209 			mac_notify_remove(vswp->mh, vswp->mnh);
1210 
1211 		mac_close(vswp->mh);
1212 	}
1213 
1214 	vswp->mrh = NULL;
1215 	vswp->mnh = NULL;
1216 	vswp->mh = NULL;
1217 	vswp->txinfo = NULL;
1218 
1219 	D1(vswp, "vsw_mac_detach: exit");
1220 }
1221 
1222 /*
1223  * Get notified of changes to the interface.
1224  *
1225  * For the moment we brute force the interface back
1226  * into promisc mode if it is unset (e.g. by snoop).
1227  * When we have the ability to set multiple mac addresses,
1228  * we will need to see if this is necessary.
1229  */
1230 static void
1231 vsw_notify_cb(void *arg, mac_notify_type_t type)
1232 {
1233 	vsw_t		*vswp = (vsw_t *)arg;
1234 
1235 	switch (type) {
1236 	case MAC_NOTE_PROMISC:
1237 		vswp->txinfo = mac_tx_get(vswp->mh);
1238 		if (mac_promisc_get(vswp->mh, MAC_DEVPROMISC) == B_TRUE) {
1239 			D2(vswp, "%s: still in PROMISC mode", __func__);
1240 		} else {
1241 			D2(vswp, "%s: now in NON-PROMISC mode", __func__);
1242 			D2(vswp, "...re-enabling");
1243 			mac_promisc_set(vswp->mh, B_TRUE, MAC_DEVPROMISC);
1244 		}
1245 		break;
1246 	default:
1247 		break;
1248 	}
1249 }
1250 
1251 /*
1252  * receive callback routine. Invoked by MAC layer when there
1253  * are pkts being passed up from physical device.
1254  *
1255  * PERF: It may be more efficient when the card is in promisc
1256  * mode to check the dest address of the pkts here (against
1257  * the FDB) rather than checking later. Needs to be investigated.
1258  */
1259 static void
1260 vsw_rx_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp)
1261 {
1262 	_NOTE(ARGUNUSED(mrh))
1263 
1264 	vsw_t		*vswp = (vsw_t *)arg;
1265 
1266 	ASSERT(vswp != NULL);
1267 
1268 	D1(vswp, "vsw_rx_cb: enter");
1269 
1270 	/* switch the chain of packets received */
1271 	vsw_switch_frame(vswp, mp, VSW_PHYSDEV, NULL, NULL);
1272 
1273 	D1(vswp, "vsw_rx_cb: exit");
1274 }
1275 
1276 /*
1277  * Send a message out over the physical device via the MAC layer.
1278  *
1279  * Returns any mblks that it was unable to transmit.
1280  */
1281 static mblk_t *
1282 vsw_tx_msg(vsw_t *vswp, mblk_t *mp)
1283 {
1284 	const mac_txinfo_t	*mtp;
1285 	mblk_t			*nextp;
1286 
1287 	if (vswp->mh == NULL) {
1288 		DERR(vswp, "vsw_tx_msg: dropping pkts: no tx routine avail");
1289 		return (mp);
1290 	} else {
1291 		for (;;) {
1292 			nextp = mp->b_next;
1293 			mp->b_next = NULL;
1294 
1295 			mtp = vswp->txinfo;
1296 			if ((mp = mtp->mt_fn(mtp->mt_arg, mp)) != NULL) {
1297 				mp->b_next = nextp;
1298 				break;
1299 			}
1300 
1301 			if ((mp = nextp) == NULL)
1302 				break;
1303 
1304 		}
1305 
1306 	}
1307 
1308 	return (mp);
1309 }
1310 
1311 /*
1312  * Register with the MAC layer as a network device, so we
1313  * can be plumbed if necessary.
1314  */
1315 static int
1316 vsw_mac_register(vsw_t *vswp)
1317 {
1318 	mac_register_t	*macp;
1319 	int		rv;
1320 
1321 	D1(vswp, "%s: enter", __func__);
1322 
1323 	if ((macp = mac_alloc(MAC_VERSION)) == NULL)
1324 		return (EINVAL);
1325 	macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
1326 	macp->m_driver = vswp;
1327 	macp->m_dip = vswp->dip;
1328 	macp->m_src_addr = (uint8_t *)&vswp->if_addr;
1329 	macp->m_callbacks = &vsw_m_callbacks;
1330 	macp->m_min_sdu = 0;
1331 	macp->m_max_sdu = ETHERMTU;
1332 	rv = mac_register(macp, &vswp->if_mh);
1333 	mac_free(macp);
1334 	if (rv == 0)
1335 		vswp->if_state |= VSW_IF_REG;
1336 
1337 	D1(vswp, "%s: exit", __func__);
1338 
1339 	return (rv);
1340 }
1341 
1342 static int
1343 vsw_mac_unregister(vsw_t *vswp)
1344 {
1345 	int		rv = 0;
1346 
1347 	D1(vswp, "%s: enter", __func__);
1348 
1349 	WRITE_ENTER(&vswp->if_lockrw);
1350 
1351 	if (vswp->if_state & VSW_IF_REG) {
1352 		rv = mac_unregister(vswp->if_mh);
1353 		if (rv != 0) {
1354 			DWARN(vswp, "%s: unable to unregister from MAC "
1355 				"framework", __func__);
1356 
1357 			RW_EXIT(&vswp->if_lockrw);
1358 			D1(vswp, "%s: fail exit", __func__);
1359 			return (rv);
1360 		}
1361 
1362 		/* mark i/f as down and unregistered */
1363 		vswp->if_state &= ~(VSW_IF_UP | VSW_IF_REG);
1364 	}
1365 	RW_EXIT(&vswp->if_lockrw);
1366 
1367 	vswp->mdprops &= ~VSW_MD_MACADDR;
1368 
1369 	D1(vswp, "%s: exit", __func__);
1370 
1371 	return (rv);
1372 }
1373 
1374 static int
1375 vsw_m_stat(void *arg, uint_t stat, uint64_t *val)
1376 {
1377 	vsw_t			*vswp = (vsw_t *)arg;
1378 
1379 	D1(vswp, "%s: enter", __func__);
1380 
1381 	if (vswp->mh == NULL)
1382 		return (EINVAL);
1383 
1384 	/* return stats from underlying device */
1385 	*val = mac_stat_get(vswp->mh, stat);
1386 	return (0);
1387 }
1388 
1389 static void
1390 vsw_m_stop(void *arg)
1391 {
1392 	vsw_t		*vswp = (vsw_t *)arg;
1393 
1394 	D1(vswp, "%s: enter", __func__);
1395 
1396 	WRITE_ENTER(&vswp->if_lockrw);
1397 	vswp->if_state &= ~VSW_IF_UP;
1398 	RW_EXIT(&vswp->if_lockrw);
1399 
1400 	D1(vswp, "%s: exit (state = %d)", __func__, vswp->if_state);
1401 }
1402 
1403 static int
1404 vsw_m_start(void *arg)
1405 {
1406 	vsw_t		*vswp = (vsw_t *)arg;
1407 
1408 	D1(vswp, "%s: enter", __func__);
1409 
1410 	WRITE_ENTER(&vswp->if_lockrw);
1411 	vswp->if_state |= VSW_IF_UP;
1412 	RW_EXIT(&vswp->if_lockrw);
1413 
1414 	D1(vswp, "%s: exit (state = %d)", __func__, vswp->if_state);
1415 	return (0);
1416 }
1417 
1418 /*
1419  * Change the local interface address.
1420  */
1421 static int
1422 vsw_m_unicst(void *arg, const uint8_t *macaddr)
1423 {
1424 	vsw_t		*vswp = (vsw_t *)arg;
1425 
1426 	D1(vswp, "%s: enter", __func__);
1427 
1428 	WRITE_ENTER(&vswp->if_lockrw);
1429 	ether_copy(macaddr, &vswp->if_addr);
1430 	RW_EXIT(&vswp->if_lockrw);
1431 
1432 	D1(vswp, "%s: exit", __func__);
1433 
1434 	return (0);
1435 }
1436 
1437 static int
1438 vsw_m_multicst(void *arg, boolean_t add, const uint8_t *mca)
1439 {
1440 	vsw_t		*vswp = (vsw_t *)arg;
1441 	mcst_addr_t	*mcst_p = NULL;
1442 	uint64_t	addr = 0x0;
1443 	int		i;
1444 
1445 	D1(vswp, "%s: enter", __func__);
1446 
1447 	/*
1448 	 * Convert address into form that can be used
1449 	 * as hash table key.
1450 	 */
1451 	for (i = 0; i < ETHERADDRL; i++) {
1452 		addr = (addr << 8) | mca[i];
1453 	}
1454 
1455 	D2(vswp, "%s: addr = 0x%llx", __func__, addr);
1456 
1457 	if (add) {
1458 		D2(vswp, "%s: adding multicast", __func__);
1459 		if (vsw_add_mcst(vswp, VSW_LOCALDEV, addr, NULL) == 0) {
1460 			/*
1461 			 * Update the list of multicast addresses
1462 			 * contained within the vsw_t structure to
1463 			 * include this new one.
1464 			 */
1465 			mcst_p = kmem_zalloc(sizeof (mcst_addr_t), KM_NOSLEEP);
1466 			if (mcst_p == NULL) {
1467 				DERR(vswp, "%s unable to alloc mem", __func__);
1468 				return (1);
1469 			}
1470 			mcst_p->addr = addr;
1471 
1472 			mutex_enter(&vswp->mca_lock);
1473 			mcst_p->nextp = vswp->mcap;
1474 			vswp->mcap = mcst_p;
1475 			mutex_exit(&vswp->mca_lock);
1476 
1477 			/*
1478 			 * Call into the underlying driver to program the
1479 			 * address into HW.
1480 			 *
1481 			 * Note:
1482 			 * Can safely ignore the return value as the card
1483 			 * will for the moment always be in promisc mode.
1484 			 * When we can program multiple MAC addresses into the
1485 			 * HW then we will need to care about the return
1486 			 * value here.
1487 			 */
1488 			if (vswp->mh != NULL)
1489 				(void) mac_multicst_add(vswp->mh, mca);
1490 		}
1491 	} else {
1492 		D2(vswp, "%s: removing multicast", __func__);
1493 		/*
1494 		 * Remove the address from the hash table..
1495 		 */
1496 		if (vsw_del_mcst(vswp, VSW_LOCALDEV, addr, NULL) == 0) {
1497 
1498 			/*
1499 			 * ..and then from the list maintained in the
1500 			 * vsw_t structure.
1501 			 */
1502 			vsw_del_addr(VSW_LOCALDEV, vswp, addr);
1503 
1504 			if (vswp->mh != NULL)
1505 				(void) mac_multicst_remove(vswp->mh, mca);
1506 		}
1507 	}
1508 
1509 	D1(vswp, "%s: exit", __func__);
1510 
1511 	return (0);
1512 }
1513 
1514 static int
1515 vsw_m_promisc(void *arg, boolean_t on)
1516 {
1517 	vsw_t		*vswp = (vsw_t *)arg;
1518 
1519 	D1(vswp, "%s: enter", __func__);
1520 
1521 	WRITE_ENTER(&vswp->if_lockrw);
1522 	if (on)
1523 		vswp->if_state |= VSW_IF_PROMISC;
1524 	else
1525 		vswp->if_state &= ~VSW_IF_PROMISC;
1526 	RW_EXIT(&vswp->if_lockrw);
1527 
1528 	D1(vswp, "%s: exit", __func__);
1529 
1530 	return (0);
1531 }
1532 
1533 static mblk_t *
1534 vsw_m_tx(void *arg, mblk_t *mp)
1535 {
1536 	vsw_t		*vswp = (vsw_t *)arg;
1537 
1538 	D1(vswp, "%s: enter", __func__);
1539 
1540 	vsw_switch_frame(vswp, mp, VSW_LOCALDEV, NULL, NULL);
1541 
1542 	D1(vswp, "%s: exit", __func__);
1543 
1544 	return (NULL);
1545 }
1546 
1547 /*
1548  * Register for machine description (MD) updates.
1549  */
1550 static void
1551 vsw_mdeg_register(vsw_t *vswp)
1552 {
1553 	mdeg_prop_spec_t	*pspecp;
1554 	mdeg_node_spec_t	*inst_specp;
1555 	mdeg_handle_t		mdeg_hdl;
1556 	size_t			templatesz;
1557 	int			inst, rv;
1558 
1559 	D1(vswp, "%s: enter", __func__);
1560 
1561 	inst = ddi_prop_get_int(DDI_DEV_T_ANY, vswp->dip,
1562 		DDI_PROP_DONTPASS, reg_propname, -1);
1563 	if (inst == -1) {
1564 		DERR(vswp, "%s: unable to get %s property",
1565 						__func__, reg_propname);
1566 		return;
1567 	}
1568 
1569 	D2(vswp, "%s: instance %d registering with mdeg", __func__, inst);
1570 
1571 	/*
1572 	 * Allocate and initialize a per-instance copy
1573 	 * of the global property spec array that will
1574 	 * uniquely identify this vsw instance.
1575 	 */
1576 	templatesz = sizeof (vsw_prop_template);
1577 	pspecp = kmem_zalloc(templatesz, KM_SLEEP);
1578 
1579 	bcopy(vsw_prop_template, pspecp, templatesz);
1580 
1581 	VSW_SET_MDEG_PROP_INST(pspecp, inst);
1582 
1583 	/* initialize the complete prop spec structure */
1584 	inst_specp = kmem_zalloc(sizeof (mdeg_node_spec_t), KM_SLEEP);
1585 	inst_specp->namep = "virtual-device";
1586 	inst_specp->specp = pspecp;
1587 
1588 	/* perform the registration */
1589 	rv = mdeg_register(inst_specp, &vport_match, vsw_mdeg_cb,
1590 	    (void *)vswp, &mdeg_hdl);
1591 
1592 	if (rv != MDEG_SUCCESS) {
1593 		DERR(vswp, "%s: mdeg_register failed (%d)\n", __func__, rv);
1594 		kmem_free(inst_specp, sizeof (mdeg_node_spec_t));
1595 		kmem_free(pspecp, templatesz);
1596 		return;
1597 	}
1598 
1599 	/* save off data that will be needed later */
1600 	vswp->inst_spec = inst_specp;
1601 	vswp->mdeg_hdl = mdeg_hdl;
1602 
1603 	D1(vswp, "%s: exit", __func__);
1604 }
1605 
1606 static void
1607 vsw_mdeg_unregister(vsw_t *vswp)
1608 {
1609 	D1(vswp, "vsw_mdeg_unregister: enter");
1610 
1611 	(void) mdeg_unregister(vswp->mdeg_hdl);
1612 
1613 	if (vswp->inst_spec->specp != NULL) {
1614 		(void) kmem_free(vswp->inst_spec->specp,
1615 			sizeof (vsw_prop_template));
1616 		vswp->inst_spec->specp = NULL;
1617 	}
1618 
1619 	if (vswp->inst_spec != NULL) {
1620 		(void) kmem_free(vswp->inst_spec,
1621 			sizeof (mdeg_node_spec_t));
1622 		vswp->inst_spec = NULL;
1623 	}
1624 
1625 	D1(vswp, "vsw_mdeg_unregister: exit");
1626 }
1627 
1628 static int
1629 vsw_mdeg_cb(void *cb_argp, mdeg_result_t *resp)
1630 {
1631 	vsw_t		*vswp;
1632 	int		idx;
1633 	md_t		*mdp;
1634 	mde_cookie_t	node;
1635 	uint64_t	inst;
1636 
1637 	if (resp == NULL)
1638 		return (MDEG_FAILURE);
1639 
1640 	vswp = (vsw_t *)cb_argp;
1641 
1642 	D1(vswp, "%s: added %d : removed %d : matched %d",
1643 		__func__, resp->added.nelem, resp->removed.nelem,
1644 		resp->match_prev.nelem);
1645 
1646 	/* process added ports */
1647 	for (idx = 0; idx < resp->added.nelem; idx++) {
1648 		mdp = resp->added.mdp;
1649 		node = resp->added.mdep[idx];
1650 
1651 		D2(vswp, "%s: adding node(%d) 0x%lx", __func__, idx, node);
1652 
1653 		if (vsw_port_add(vswp, mdp, &node) != 0) {
1654 			cmn_err(CE_WARN, "Unable to add new port (0x%lx)",
1655 					node);
1656 		}
1657 	}
1658 
1659 	/* process removed ports */
1660 	for (idx = 0; idx < resp->removed.nelem; idx++) {
1661 		mdp = resp->removed.mdp;
1662 		node = resp->removed.mdep[idx];
1663 
1664 		if (md_get_prop_val(mdp, node, id_propname, &inst)) {
1665 			DERR(vswp, "%s: prop(%s) not found port(%d)",
1666 				__func__, id_propname, idx);
1667 			continue;
1668 		}
1669 
1670 		D2(vswp, "%s: removing node(%d) 0x%lx", __func__, idx, node);
1671 
1672 		if (vsw_port_detach(vswp, inst) != 0) {
1673 			cmn_err(CE_WARN, "Unable to remove port %ld", inst);
1674 		}
1675 	}
1676 
1677 	/*
1678 	 * Currently no support for updating already active ports.
1679 	 * So, ignore the match_curr and match_priv arrays for now.
1680 	 */
1681 
1682 	D1(vswp, "%s: exit", __func__);
1683 
1684 	return (MDEG_SUCCESS);
1685 }
1686 
1687 /*
1688  * Add a new port to the system.
1689  *
1690  * Returns 0 on success, 1 on failure.
1691  */
1692 int
1693 vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node)
1694 {
1695 	uint64_t		ldc_id;
1696 	uint8_t			*addrp;
1697 	int			i, addrsz;
1698 	int			num_nodes = 0, nchan = 0;
1699 	int			listsz = 0;
1700 	mde_cookie_t		*listp = NULL;
1701 	struct ether_addr	ea;
1702 	uint64_t		macaddr;
1703 	uint64_t		inst = 0;
1704 	vsw_port_t		*port;
1705 
1706 	if (md_get_prop_val(mdp, *node, id_propname, &inst)) {
1707 		DWARN(vswp, "%s: prop(%s) not found", __func__,
1708 			id_propname);
1709 		return (1);
1710 	}
1711 
1712 	/*
1713 	 * Find the channel endpoint node(s) (which should be under this
1714 	 * port node) which contain the channel id(s).
1715 	 */
1716 	if ((num_nodes = md_node_count(mdp)) <= 0) {
1717 		DERR(vswp, "%s: invalid number of nodes found (%d)",
1718 			__func__, num_nodes);
1719 		return (1);
1720 	}
1721 
1722 	/* allocate enough space for node list */
1723 	listsz = num_nodes * sizeof (mde_cookie_t);
1724 	listp = kmem_zalloc(listsz, KM_SLEEP);
1725 
1726 	nchan = md_scan_dag(mdp, *node,
1727 		md_find_name(mdp, chan_propname),
1728 		md_find_name(mdp, "fwd"), listp);
1729 
1730 	if (nchan <= 0) {
1731 		DWARN(vswp, "%s: no %s nodes found", __func__, chan_propname);
1732 		kmem_free(listp, listsz);
1733 		return (1);
1734 	}
1735 
1736 	D2(vswp, "%s: %d %s nodes found", __func__, nchan, chan_propname);
1737 
1738 	/* use property from first node found */
1739 	if (md_get_prop_val(mdp, listp[0], id_propname, &ldc_id)) {
1740 		DWARN(vswp, "%s: prop(%s) not found\n", __func__,
1741 			id_propname);
1742 		kmem_free(listp, listsz);
1743 		return (1);
1744 	}
1745 
1746 	/* don't need list any more */
1747 	kmem_free(listp, listsz);
1748 
1749 	D2(vswp, "%s: ldc_id 0x%llx", __func__, ldc_id);
1750 
1751 	/* read mac-address property */
1752 	if (md_get_prop_data(mdp, *node, remaddr_propname,
1753 					&addrp, &addrsz)) {
1754 		DWARN(vswp, "%s: prop(%s) not found",
1755 				__func__, remaddr_propname);
1756 		return (1);
1757 	}
1758 
1759 	if (addrsz < ETHERADDRL) {
1760 		DWARN(vswp, "%s: invalid address size", __func__);
1761 		return (1);
1762 	}
1763 
1764 	macaddr = *((uint64_t *)addrp);
1765 	D2(vswp, "%s: remote mac address 0x%llx", __func__, macaddr);
1766 
1767 	for (i = ETHERADDRL - 1; i >= 0; i--) {
1768 		ea.ether_addr_octet[i] = macaddr & 0xFF;
1769 		macaddr >>= 8;
1770 	}
1771 
1772 	if (vsw_port_attach(vswp, (int)inst, &ldc_id, 1, &ea) != 0) {
1773 		DERR(vswp, "%s: failed to attach port", __func__);
1774 		return (1);
1775 	}
1776 
1777 	port = vsw_lookup_port(vswp, (int)inst);
1778 
1779 	/* just successfuly created the port, so it should exist */
1780 	ASSERT(port != NULL);
1781 
1782 	return (0);
1783 }
1784 
1785 /*
1786  * Attach the specified port.
1787  *
1788  * Returns 0 on success, 1 on failure.
1789  */
1790 static int
1791 vsw_port_attach(vsw_t *vswp, int p_instance, uint64_t *ldcids, int nids,
1792 struct ether_addr *macaddr)
1793 {
1794 	vsw_port_list_t		*plist = &vswp->plist;
1795 	vsw_port_t		*port, **prev_port;
1796 	int			i;
1797 
1798 	D1(vswp, "%s: enter : port %d", __func__, p_instance);
1799 
1800 	/* port already exists? */
1801 	READ_ENTER(&plist->lockrw);
1802 	for (port = plist->head; port != NULL; port = port->p_next) {
1803 		if (port->p_instance == p_instance) {
1804 			DWARN(vswp, "%s: port instance %d already attached",
1805 				__func__, p_instance);
1806 			RW_EXIT(&plist->lockrw);
1807 			return (1);
1808 		}
1809 	}
1810 	RW_EXIT(&plist->lockrw);
1811 
1812 	port = kmem_zalloc(sizeof (vsw_port_t), KM_SLEEP);
1813 	port->p_vswp = vswp;
1814 	port->p_instance = p_instance;
1815 	port->p_ldclist.num_ldcs = 0;
1816 	port->p_ldclist.head = NULL;
1817 
1818 	rw_init(&port->p_ldclist.lockrw, NULL, RW_DRIVER, NULL);
1819 
1820 	mutex_init(&port->tx_lock, NULL, MUTEX_DRIVER, NULL);
1821 	mutex_init(&port->mca_lock, NULL, MUTEX_DRIVER, NULL);
1822 
1823 	mutex_init(&port->ref_lock, NULL, MUTEX_DRIVER, NULL);
1824 	cv_init(&port->ref_cv, NULL, CV_DRIVER, NULL);
1825 
1826 	mutex_init(&port->state_lock, NULL, MUTEX_DRIVER, NULL);
1827 	cv_init(&port->state_cv, NULL, CV_DRIVER, NULL);
1828 	port->state = VSW_PORT_INIT;
1829 
1830 	if (nids > VSW_PORT_MAX_LDCS) {
1831 		D2(vswp, "%s: using first of %d ldc ids",
1832 			__func__, nids);
1833 		nids = VSW_PORT_MAX_LDCS;
1834 	}
1835 
1836 	D2(vswp, "%s: %d nids", __func__, nids);
1837 	for (i = 0; i < nids; i++) {
1838 		D2(vswp, "%s: ldcid (%llx)", __func__, (uint64_t)ldcids[i]);
1839 		if (vsw_ldc_attach(port, (uint64_t)ldcids[i]) != 0) {
1840 			DERR(vswp, "%s: ldc_attach failed", __func__);
1841 
1842 			rw_destroy(&port->p_ldclist.lockrw);
1843 
1844 			cv_destroy(&port->ref_cv);
1845 			mutex_destroy(&port->ref_lock);
1846 
1847 			cv_destroy(&port->state_cv);
1848 			mutex_destroy(&port->state_lock);
1849 
1850 			mutex_destroy(&port->tx_lock);
1851 			mutex_destroy(&port->mca_lock);
1852 			kmem_free(port, sizeof (vsw_port_t));
1853 			return (1);
1854 		}
1855 	}
1856 
1857 	ether_copy(macaddr, &port->p_macaddr);
1858 
1859 	WRITE_ENTER(&plist->lockrw);
1860 
1861 	/* create the fdb entry for this port/mac address */
1862 	(void) vsw_add_fdb(vswp, port);
1863 
1864 	/* link it into the list of ports for this vsw instance */
1865 	prev_port = (vsw_port_t **)(&plist->head);
1866 	port->p_next = *prev_port;
1867 	*prev_port = port;
1868 	plist->num_ports++;
1869 	RW_EXIT(&plist->lockrw);
1870 
1871 	/*
1872 	 * Initialise the port and any ldc's under it.
1873 	 */
1874 	(void) vsw_init_ldcs(port);
1875 
1876 	D1(vswp, "%s: exit", __func__);
1877 	return (0);
1878 }
1879 
1880 /*
1881  * Detach the specified port.
1882  *
1883  * Returns 0 on success, 1 on failure.
1884  */
1885 static int
1886 vsw_port_detach(vsw_t *vswp, int p_instance)
1887 {
1888 	vsw_port_t	*port = NULL;
1889 	vsw_port_list_t	*plist = &vswp->plist;
1890 
1891 	D1(vswp, "%s: enter: port id %d", __func__, p_instance);
1892 
1893 	WRITE_ENTER(&plist->lockrw);
1894 
1895 	if ((port = vsw_lookup_port(vswp, p_instance)) == NULL) {
1896 		RW_EXIT(&plist->lockrw);
1897 		return (1);
1898 	}
1899 
1900 	if (vsw_plist_del_node(vswp, port)) {
1901 		RW_EXIT(&plist->lockrw);
1902 		return (1);
1903 	}
1904 
1905 	/* Remove the fdb entry for this port/mac address */
1906 	(void) vsw_del_fdb(vswp, port);
1907 
1908 	/* Remove any multicast addresses.. */
1909 	vsw_del_mcst_port(port);
1910 
1911 	/*
1912 	 * No longer need to hold lock on port list now that we
1913 	 * have unlinked the target port from the list.
1914 	 */
1915 	RW_EXIT(&plist->lockrw);
1916 
1917 	if (vsw_port_delete(port)) {
1918 		return (1);
1919 	}
1920 
1921 	D1(vswp, "%s: exit: p_instance(%d)", __func__, p_instance);
1922 	return (0);
1923 }
1924 
1925 /*
1926  * Detach all active ports.
1927  *
1928  * Returns 0 on success, 1 on failure.
1929  */
1930 static int
1931 vsw_detach_ports(vsw_t *vswp)
1932 {
1933 	vsw_port_list_t 	*plist = &vswp->plist;
1934 	vsw_port_t		*port = NULL;
1935 
1936 	D1(vswp, "%s: enter", __func__);
1937 
1938 	WRITE_ENTER(&plist->lockrw);
1939 
1940 	while ((port = plist->head) != NULL) {
1941 		if (vsw_plist_del_node(vswp, port)) {
1942 			DERR(vswp, "%s: Error deleting port %d"
1943 				" from port list", __func__,
1944 				port->p_instance);
1945 			RW_EXIT(&plist->lockrw);
1946 			return (1);
1947 		}
1948 
1949 		/* Remove the fdb entry for this port/mac address */
1950 		(void) vsw_del_fdb(vswp, port);
1951 
1952 		/* Remove any multicast addresses.. */
1953 		vsw_del_mcst_port(port);
1954 
1955 		/*
1956 		 * No longer need to hold the lock on the port list
1957 		 * now that we have unlinked the target port from the
1958 		 * list.
1959 		 */
1960 		RW_EXIT(&plist->lockrw);
1961 		if (vsw_port_delete(port)) {
1962 			DERR(vswp, "%s: Error deleting port %d",
1963 				__func__, port->p_instance);
1964 			return (1);
1965 		}
1966 		WRITE_ENTER(&plist->lockrw);
1967 	}
1968 	RW_EXIT(&plist->lockrw);
1969 
1970 	D1(vswp, "%s: exit", __func__);
1971 
1972 	return (0);
1973 }
1974 
1975 /*
1976  * Delete the specified port.
1977  *
1978  * Returns 0 on success, 1 on failure.
1979  */
1980 static int
1981 vsw_port_delete(vsw_port_t *port)
1982 {
1983 	vsw_ldc_list_t 		*ldcl;
1984 	vsw_t			*vswp = port->p_vswp;
1985 
1986 	D1(vswp, "%s: enter : port id %d", __func__, port->p_instance);
1987 
1988 	(void) vsw_uninit_ldcs(port);
1989 
1990 	/*
1991 	 * Wait for any pending ctrl msg tasks which reference this
1992 	 * port to finish.
1993 	 */
1994 	if (vsw_drain_port_taskq(port))
1995 		return (1);
1996 
1997 	/*
1998 	 * Wait for port reference count to hit zero.
1999 	 */
2000 	mutex_enter(&port->ref_lock);
2001 	while (port->ref_cnt != 0)
2002 		cv_wait(&port->ref_cv, &port->ref_lock);
2003 	mutex_exit(&port->ref_lock);
2004 
2005 	/*
2006 	 * Wait for any active callbacks to finish
2007 	 */
2008 	if (vsw_drain_ldcs(port))
2009 		return (1);
2010 
2011 	ldcl = &port->p_ldclist;
2012 	WRITE_ENTER(&ldcl->lockrw);
2013 	while (ldcl->num_ldcs > 0) {
2014 		if (vsw_ldc_detach(port, ldcl->head->ldc_id) != 0) {;
2015 			cmn_err(CE_WARN, "unable to detach ldc %ld",
2016 					ldcl->head->ldc_id);
2017 			RW_EXIT(&ldcl->lockrw);
2018 			return (1);
2019 		}
2020 	}
2021 	RW_EXIT(&ldcl->lockrw);
2022 
2023 	rw_destroy(&port->p_ldclist.lockrw);
2024 
2025 	mutex_destroy(&port->mca_lock);
2026 	mutex_destroy(&port->tx_lock);
2027 	cv_destroy(&port->ref_cv);
2028 	mutex_destroy(&port->ref_lock);
2029 
2030 	cv_destroy(&port->state_cv);
2031 	mutex_destroy(&port->state_lock);
2032 
2033 	kmem_free(port, sizeof (vsw_port_t));
2034 
2035 	D1(vswp, "%s: exit", __func__);
2036 
2037 	return (0);
2038 }
2039 
2040 /*
2041  * Attach a logical domain channel (ldc) under a specified port.
2042  *
2043  * Returns 0 on success, 1 on failure.
2044  */
2045 static int
2046 vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id)
2047 {
2048 	vsw_t 		*vswp = port->p_vswp;
2049 	vsw_ldc_list_t *ldcl = &port->p_ldclist;
2050 	vsw_ldc_t 	*ldcp = NULL;
2051 	ldc_attr_t 	attr;
2052 	ldc_status_t	istatus;
2053 	int 		status = DDI_FAILURE;
2054 	int		rv;
2055 
2056 	D1(vswp, "%s: enter", __func__);
2057 
2058 	ldcp = kmem_zalloc(sizeof (vsw_ldc_t), KM_NOSLEEP);
2059 	if (ldcp == NULL) {
2060 		DERR(vswp, "%s: kmem_zalloc failed", __func__);
2061 		return (1);
2062 	}
2063 	ldcp->ldc_id = ldc_id;
2064 
2065 	/* allocate pool of receive mblks */
2066 	rv = vio_create_mblks(vsw_num_mblks, vsw_mblk_size, &(ldcp->rxh));
2067 	if (rv) {
2068 		DWARN(vswp, "%s: unable to create free mblk pool for"
2069 			" channel %ld (rv %d)", __func__, ldc_id, rv);
2070 		kmem_free(ldcp, sizeof (vsw_ldc_t));
2071 		return (1);
2072 	}
2073 
2074 	mutex_init(&ldcp->ldc_txlock, NULL, MUTEX_DRIVER, NULL);
2075 	mutex_init(&ldcp->ldc_cblock, NULL, MUTEX_DRIVER, NULL);
2076 	mutex_init(&ldcp->drain_cv_lock, NULL, MUTEX_DRIVER, NULL);
2077 	cv_init(&ldcp->drain_cv, NULL, CV_DRIVER, NULL);
2078 
2079 	/* required for handshake with peer */
2080 	ldcp->local_session = (uint64_t)ddi_get_lbolt();
2081 	ldcp->peer_session = 0;
2082 	ldcp->session_status = 0;
2083 
2084 	mutex_init(&ldcp->hss_lock, NULL, MUTEX_DRIVER, NULL);
2085 	ldcp->hss_id = 1;	/* Initial handshake session id */
2086 
2087 	/* only set for outbound lane, inbound set by peer */
2088 	mutex_init(&ldcp->lane_in.seq_lock, NULL, MUTEX_DRIVER, NULL);
2089 	mutex_init(&ldcp->lane_out.seq_lock, NULL, MUTEX_DRIVER, NULL);
2090 	vsw_set_lane_attr(vswp, &ldcp->lane_out);
2091 
2092 	attr.devclass = LDC_DEV_NT_SVC;
2093 	attr.instance = ddi_get_instance(vswp->dip);
2094 	attr.mode = LDC_MODE_UNRELIABLE;
2095 	attr.qlen = VSW_LDC_QLEN;
2096 	status = ldc_init(ldc_id, &attr, &ldcp->ldc_handle);
2097 	if (status != 0) {
2098 		DERR(vswp, "%s(%lld): ldc_init failed, rv (%d)",
2099 		    __func__, ldc_id, status);
2100 		goto ldc_attach_fail;
2101 	}
2102 
2103 	status = ldc_reg_callback(ldcp->ldc_handle, vsw_ldc_cb, (caddr_t)ldcp);
2104 	if (status != 0) {
2105 		DERR(vswp, "%s(%lld): ldc_reg_callback failed, rv (%d)",
2106 		    __func__, ldc_id, status);
2107 		(void) ldc_fini(ldcp->ldc_handle);
2108 		goto ldc_attach_fail;
2109 	}
2110 
2111 
2112 	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
2113 		DERR(vswp, "%s: ldc_status failed", __func__);
2114 		return (1);
2115 	}
2116 
2117 	ldcp->ldc_status = istatus;
2118 	ldcp->ldc_port = port;
2119 	ldcp->ldc_vswp = vswp;
2120 
2121 	/* link it into the list of channels for this port */
2122 	WRITE_ENTER(&ldcl->lockrw);
2123 	ldcp->ldc_next = ldcl->head;
2124 	ldcl->head = ldcp;
2125 	ldcl->num_ldcs++;
2126 	RW_EXIT(&ldcl->lockrw);
2127 
2128 	D1(vswp, "%s: exit", __func__);
2129 	return (0);
2130 
2131 ldc_attach_fail:
2132 	mutex_destroy(&ldcp->ldc_txlock);
2133 	mutex_destroy(&ldcp->ldc_cblock);
2134 
2135 	cv_destroy(&ldcp->drain_cv);
2136 
2137 	if (ldcp->rxh != NULL) {
2138 		if (vio_destroy_mblks(ldcp->rxh) != 0) {
2139 			/*
2140 			 * Something odd has happened, as the destroy
2141 			 * will only fail if some mblks have been allocated
2142 			 * from the pool already (which shouldn't happen)
2143 			 * and have not been returned.
2144 			 *
2145 			 * Add the pool pointer to a list maintained in
2146 			 * the device instance. Another attempt will be made
2147 			 * to free the pool when the device itself detaches.
2148 			 */
2149 			cmn_err(CE_WARN, "Creation of ldc channel %ld failed"
2150 				" and cannot destroy associated mblk pool",
2151 				ldc_id);
2152 			ldcp->rxh->nextp =  vswp->rxh;
2153 			vswp->rxh = ldcp->rxh;
2154 		}
2155 	}
2156 	mutex_destroy(&ldcp->drain_cv_lock);
2157 	mutex_destroy(&ldcp->hss_lock);
2158 
2159 	mutex_destroy(&ldcp->lane_in.seq_lock);
2160 	mutex_destroy(&ldcp->lane_out.seq_lock);
2161 	kmem_free(ldcp, sizeof (vsw_ldc_t));
2162 
2163 	return (1);
2164 }
2165 
2166 /*
2167  * Detach a logical domain channel (ldc) belonging to a
2168  * particular port.
2169  *
2170  * Returns 0 on success, 1 on failure.
2171  */
2172 static int
2173 vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id)
2174 {
2175 	vsw_t 		*vswp = port->p_vswp;
2176 	vsw_ldc_t 	*ldcp, *prev_ldcp;
2177 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
2178 	int 		rv;
2179 
2180 	prev_ldcp = ldcl->head;
2181 	for (; (ldcp = prev_ldcp) != NULL; prev_ldcp = ldcp->ldc_next) {
2182 		if (ldcp->ldc_id == ldc_id) {
2183 			break;
2184 		}
2185 	}
2186 
2187 	/* specified ldc id not found */
2188 	if (ldcp == NULL) {
2189 		DERR(vswp, "%s: ldcp = NULL", __func__);
2190 		return (1);
2191 	}
2192 
2193 	D2(vswp, "%s: detaching channel %lld", __func__, ldcp->ldc_id);
2194 
2195 	/*
2196 	 * Before we can close the channel we must release any mapped
2197 	 * resources (e.g. drings).
2198 	 */
2199 	vsw_free_lane_resources(ldcp, INBOUND);
2200 	vsw_free_lane_resources(ldcp, OUTBOUND);
2201 
2202 	/*
2203 	 * If the close fails we are in serious trouble, as won't
2204 	 * be able to delete the parent port.
2205 	 */
2206 	if ((rv = ldc_close(ldcp->ldc_handle)) != 0) {
2207 		DERR(vswp, "%s: error %d closing channel %lld",
2208 			__func__, rv, ldcp->ldc_id);
2209 		return (1);
2210 	}
2211 
2212 	(void) ldc_fini(ldcp->ldc_handle);
2213 
2214 	ldcp->ldc_status = LDC_INIT;
2215 	ldcp->ldc_handle = NULL;
2216 	ldcp->ldc_vswp = NULL;
2217 
2218 	if (ldcp->rxh != NULL) {
2219 		if (vio_destroy_mblks(ldcp->rxh)) {
2220 			/*
2221 			 * Mostly likely some mblks are still in use and
2222 			 * have not been returned to the pool. Add the pool
2223 			 * to the list maintained in the device instance.
2224 			 * Another attempt will be made to destroy the pool
2225 			 * when the device detaches.
2226 			 */
2227 			ldcp->rxh->nextp =  vswp->rxh;
2228 			vswp->rxh = ldcp->rxh;
2229 		}
2230 	}
2231 
2232 	mutex_destroy(&ldcp->ldc_txlock);
2233 	mutex_destroy(&ldcp->ldc_cblock);
2234 	cv_destroy(&ldcp->drain_cv);
2235 	mutex_destroy(&ldcp->drain_cv_lock);
2236 	mutex_destroy(&ldcp->hss_lock);
2237 	mutex_destroy(&ldcp->lane_in.seq_lock);
2238 	mutex_destroy(&ldcp->lane_out.seq_lock);
2239 
2240 	/* unlink it from the list */
2241 	prev_ldcp = ldcp->ldc_next;
2242 	ldcl->num_ldcs--;
2243 	kmem_free(ldcp, sizeof (vsw_ldc_t));
2244 
2245 	return (0);
2246 }
2247 
2248 /*
2249  * Open and attempt to bring up the channel. Note that channel
2250  * can only be brought up if peer has also opened channel.
2251  *
2252  * Returns 0 if can open and bring up channel, otherwise
2253  * returns 1.
2254  */
2255 static int
2256 vsw_ldc_init(vsw_ldc_t *ldcp)
2257 {
2258 	vsw_t 		*vswp = ldcp->ldc_vswp;
2259 	ldc_status_t	istatus = 0;
2260 	int		rv;
2261 
2262 	D1(vswp, "%s: enter", __func__);
2263 
2264 	LDC_ENTER_LOCK(ldcp);
2265 
2266 	/* don't start at 0 in case clients don't like that */
2267 	ldcp->next_ident = 1;
2268 
2269 	rv = ldc_open(ldcp->ldc_handle);
2270 	if (rv != 0) {
2271 		DERR(vswp, "%s: ldc_open failed: id(%lld) rv(%d)",
2272 		    __func__, ldcp->ldc_id, rv);
2273 		LDC_EXIT_LOCK(ldcp);
2274 		return (1);
2275 	}
2276 
2277 	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
2278 		DERR(vswp, "%s: unable to get status", __func__);
2279 		LDC_EXIT_LOCK(ldcp);
2280 		return (1);
2281 
2282 	} else if (istatus != LDC_OPEN && istatus != LDC_READY) {
2283 		DERR(vswp, "%s: id (%lld) status(%d) is not OPEN/READY",
2284 		    __func__, ldcp->ldc_id, istatus);
2285 		LDC_EXIT_LOCK(ldcp);
2286 		return (1);
2287 	}
2288 
2289 	ldcp->ldc_status = istatus;
2290 	rv = ldc_up(ldcp->ldc_handle);
2291 	if (rv != 0) {
2292 		/*
2293 		 * Not a fatal error for ldc_up() to fail, as peer
2294 		 * end point may simply not be ready yet.
2295 		 */
2296 		D2(vswp, "%s: ldc_up err id(%lld) rv(%d)", __func__,
2297 			ldcp->ldc_id, rv);
2298 		LDC_EXIT_LOCK(ldcp);
2299 		return (1);
2300 	}
2301 
2302 	/*
2303 	 * ldc_up() call is non-blocking so need to explicitly
2304 	 * check channel status to see if in fact the channel
2305 	 * is UP.
2306 	 */
2307 	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
2308 		DERR(vswp, "%s: unable to get status", __func__);
2309 		LDC_EXIT_LOCK(ldcp);
2310 		return (1);
2311 
2312 	} else if (istatus != LDC_UP) {
2313 		DERR(vswp, "%s: id(%lld) status(%d) is not UP",
2314 		    __func__, ldcp->ldc_id, istatus);
2315 	} else {
2316 		ldcp->ldc_status = istatus;
2317 	}
2318 
2319 	LDC_EXIT_LOCK(ldcp);
2320 
2321 	D1(vswp, "%s: exit", __func__);
2322 	return (0);
2323 }
2324 
2325 /* disable callbacks on the channel */
2326 static int
2327 vsw_ldc_uninit(vsw_ldc_t *ldcp)
2328 {
2329 	vsw_t	*vswp = ldcp->ldc_vswp;
2330 	int	rv;
2331 
2332 	D1(vswp, "vsw_ldc_uninit: enter: id(%lx)\n", ldcp->ldc_id);
2333 
2334 	LDC_ENTER_LOCK(ldcp);
2335 
2336 	rv = ldc_set_cb_mode(ldcp->ldc_handle, LDC_CB_DISABLE);
2337 	if (rv != 0) {
2338 		DERR(vswp, "vsw_ldc_uninit(%lld): error disabling "
2339 			"interrupts (rv = %d)\n", ldcp->ldc_id, rv);
2340 		LDC_EXIT_LOCK(ldcp);
2341 		return (1);
2342 	}
2343 
2344 	ldcp->ldc_status = LDC_INIT;
2345 
2346 	LDC_EXIT_LOCK(ldcp);
2347 
2348 	D1(vswp, "vsw_ldc_uninit: exit: id(%lx)", ldcp->ldc_id);
2349 
2350 	return (0);
2351 }
2352 
2353 static int
2354 vsw_init_ldcs(vsw_port_t *port)
2355 {
2356 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
2357 	vsw_ldc_t	*ldcp;
2358 
2359 	READ_ENTER(&ldcl->lockrw);
2360 	ldcp =  ldcl->head;
2361 	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
2362 		(void) vsw_ldc_init(ldcp);
2363 	}
2364 	RW_EXIT(&ldcl->lockrw);
2365 
2366 	return (0);
2367 }
2368 
2369 static int
2370 vsw_uninit_ldcs(vsw_port_t *port)
2371 {
2372 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
2373 	vsw_ldc_t	*ldcp;
2374 
2375 	D1(NULL, "vsw_uninit_ldcs: enter\n");
2376 
2377 	READ_ENTER(&ldcl->lockrw);
2378 	ldcp =  ldcl->head;
2379 	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
2380 		(void) vsw_ldc_uninit(ldcp);
2381 	}
2382 	RW_EXIT(&ldcl->lockrw);
2383 
2384 	D1(NULL, "vsw_uninit_ldcs: exit\n");
2385 
2386 	return (0);
2387 }
2388 
2389 /*
2390  * Wait until the callback(s) associated with the ldcs under the specified
2391  * port have completed.
2392  *
2393  * Prior to this function being invoked each channel under this port
2394  * should have been quiesced via ldc_set_cb_mode(DISABLE).
2395  *
2396  * A short explaination of what we are doing below..
2397  *
2398  * The simplest approach would be to have a reference counter in
2399  * the ldc structure which is increment/decremented by the callbacks as
2400  * they use the channel. The drain function could then simply disable any
2401  * further callbacks and do a cv_wait for the ref to hit zero. Unfortunately
2402  * there is a tiny window here - before the callback is able to get the lock
2403  * on the channel it is interrupted and this function gets to execute. It
2404  * sees that the ref count is zero and believes its free to delete the
2405  * associated data structures.
2406  *
2407  * We get around this by taking advantage of the fact that before the ldc
2408  * framework invokes a callback it sets a flag to indicate that there is a
2409  * callback active (or about to become active). If when we attempt to
2410  * unregister a callback when this active flag is set then the unregister
2411  * will fail with EWOULDBLOCK.
2412  *
2413  * If the unregister fails we do a cv_timedwait. We will either be signaled
2414  * by the callback as it is exiting (note we have to wait a short period to
2415  * allow the callback to return fully to the ldc framework and it to clear
2416  * the active flag), or by the timer expiring. In either case we again attempt
2417  * the unregister. We repeat this until we can succesfully unregister the
2418  * callback.
2419  *
2420  * The reason we use a cv_timedwait rather than a simple cv_wait is to catch
2421  * the case where the callback has finished but the ldc framework has not yet
2422  * cleared the active flag. In this case we would never get a cv_signal.
2423  */
2424 static int
2425 vsw_drain_ldcs(vsw_port_t *port)
2426 {
2427 	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
2428 	vsw_ldc_t	*ldcp;
2429 	vsw_t		*vswp = port->p_vswp;
2430 
2431 	D1(vswp, "%s: enter", __func__);
2432 
2433 	READ_ENTER(&ldcl->lockrw);
2434 
2435 	ldcp = ldcl->head;
2436 
2437 	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
2438 		/*
2439 		 * If we can unregister the channel callback then we
2440 		 * know that there is no callback either running or
2441 		 * scheduled to run for this channel so move on to next
2442 		 * channel in the list.
2443 		 */
2444 		mutex_enter(&ldcp->drain_cv_lock);
2445 
2446 		/* prompt active callbacks to quit */
2447 		ldcp->drain_state = VSW_LDC_DRAINING;
2448 
2449 		if ((ldc_unreg_callback(ldcp->ldc_handle)) == 0) {
2450 			D2(vswp, "%s: unreg callback for chan %ld", __func__,
2451 				ldcp->ldc_id);
2452 			mutex_exit(&ldcp->drain_cv_lock);
2453 			continue;
2454 		} else {
2455 			/*
2456 			 * If we end up here we know that either 1) a callback
2457 			 * is currently executing, 2) is about to start (i.e.
2458 			 * the ldc framework has set the active flag but
2459 			 * has not actually invoked the callback yet, or 3)
2460 			 * has finished and has returned to the ldc framework
2461 			 * but the ldc framework has not yet cleared the
2462 			 * active bit.
2463 			 *
2464 			 * Wait for it to finish.
2465 			 */
2466 			while (ldc_unreg_callback(ldcp->ldc_handle)
2467 								== EWOULDBLOCK)
2468 				(void) cv_timedwait(&ldcp->drain_cv,
2469 					&ldcp->drain_cv_lock, lbolt + hz);
2470 
2471 			mutex_exit(&ldcp->drain_cv_lock);
2472 			D2(vswp, "%s: unreg callback for chan %ld after "
2473 				"timeout", __func__, ldcp->ldc_id);
2474 		}
2475 	}
2476 	RW_EXIT(&ldcl->lockrw);
2477 
2478 	D1(vswp, "%s: exit", __func__);
2479 	return (0);
2480 }
2481 
2482 /*
2483  * Wait until all tasks which reference this port have completed.
2484  *
2485  * Prior to this function being invoked each channel under this port
2486  * should have been quiesced via ldc_set_cb_mode(DISABLE).
2487  */
2488 static int
2489 vsw_drain_port_taskq(vsw_port_t *port)
2490 {
2491 	vsw_t		*vswp = port->p_vswp;
2492 
2493 	D1(vswp, "%s: enter", __func__);
2494 
2495 	/*
2496 	 * Mark the port as in the process of being detached, and
2497 	 * dispatch a marker task to the queue so we know when all
2498 	 * relevant tasks have completed.
2499 	 */
2500 	mutex_enter(&port->state_lock);
2501 	port->state = VSW_PORT_DETACHING;
2502 
2503 	if ((vswp->taskq_p == NULL) ||
2504 		(ddi_taskq_dispatch(vswp->taskq_p, vsw_marker_task,
2505 			port, DDI_NOSLEEP) != DDI_SUCCESS)) {
2506 		DERR(vswp, "%s: unable to dispatch marker task",
2507 			__func__);
2508 		mutex_exit(&port->state_lock);
2509 		return (1);
2510 	}
2511 
2512 	/*
2513 	 * Wait for the marker task to finish.
2514 	 */
2515 	while (port->state != VSW_PORT_DETACHABLE)
2516 		cv_wait(&port->state_cv, &port->state_lock);
2517 
2518 	mutex_exit(&port->state_lock);
2519 
2520 	D1(vswp, "%s: exit", __func__);
2521 
2522 	return (0);
2523 }
2524 
2525 static void
2526 vsw_marker_task(void *arg)
2527 {
2528 	vsw_port_t	*port = arg;
2529 	vsw_t		*vswp = port->p_vswp;
2530 
2531 	D1(vswp, "%s: enter", __func__);
2532 
2533 	mutex_enter(&port->state_lock);
2534 
2535 	/*
2536 	 * No further tasks should be dispatched which reference
2537 	 * this port so ok to mark it as safe to detach.
2538 	 */
2539 	port->state = VSW_PORT_DETACHABLE;
2540 
2541 	cv_signal(&port->state_cv);
2542 
2543 	mutex_exit(&port->state_lock);
2544 
2545 	D1(vswp, "%s: exit", __func__);
2546 }
2547 
2548 static vsw_port_t *
2549 vsw_lookup_port(vsw_t *vswp, int p_instance)
2550 {
2551 	vsw_port_list_t *plist = &vswp->plist;
2552 	vsw_port_t	*port;
2553 
2554 	for (port = plist->head; port != NULL; port = port->p_next) {
2555 		if (port->p_instance == p_instance) {
2556 			D2(vswp, "vsw_lookup_port: found p_instance\n");
2557 			return (port);
2558 		}
2559 	}
2560 
2561 	return (NULL);
2562 }
2563 
2564 /*
2565  * Search for and remove the specified port from the port
2566  * list. Returns 0 if able to locate and remove port, otherwise
2567  * returns 1.
2568  */
2569 static int
2570 vsw_plist_del_node(vsw_t *vswp, vsw_port_t *port)
2571 {
2572 	vsw_port_list_t *plist = &vswp->plist;
2573 	vsw_port_t	*curr_p, *prev_p;
2574 
2575 	if (plist->head == NULL)
2576 		return (1);
2577 
2578 	curr_p = prev_p = plist->head;
2579 
2580 	while (curr_p != NULL) {
2581 		if (curr_p == port) {
2582 			if (prev_p == curr_p) {
2583 				plist->head = curr_p->p_next;
2584 			} else {
2585 				prev_p->p_next = curr_p->p_next;
2586 			}
2587 			plist->num_ports--;
2588 			break;
2589 		} else {
2590 			prev_p = curr_p;
2591 			curr_p = curr_p->p_next;
2592 		}
2593 	}
2594 	return (0);
2595 }
2596 
2597 /*
2598  * Interrupt handler for ldc messages.
2599  */
2600 static uint_t
2601 vsw_ldc_cb(uint64_t event, caddr_t arg)
2602 {
2603 	vsw_ldc_t	*ldcp = (vsw_ldc_t  *)arg;
2604 	vsw_t 		*vswp = ldcp->ldc_vswp;
2605 	ldc_status_t	lstatus;
2606 	int		rv;
2607 
2608 	D1(vswp, "%s: enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);
2609 
2610 	mutex_enter(&ldcp->ldc_cblock);
2611 
2612 	if ((ldcp->ldc_status == LDC_INIT) || (ldcp->ldc_handle == NULL)) {
2613 		mutex_exit(&ldcp->ldc_cblock);
2614 		return (LDC_SUCCESS);
2615 	}
2616 
2617 	if (event & LDC_EVT_UP) {
2618 		/*
2619 		 * Channel has come up, get the state and then start
2620 		 * the handshake.
2621 		 */
2622 		rv = ldc_status(ldcp->ldc_handle, &lstatus);
2623 		if (rv != 0) {
2624 			cmn_err(CE_WARN, "Unable to read channel state");
2625 		}
2626 		ldcp->ldc_status = lstatus;
2627 
2628 		D2(vswp, "%s: id(%ld) event(%llx) UP:  status(%ld)",
2629 			__func__, ldcp->ldc_id, event, ldcp->ldc_status);
2630 
2631 		vsw_restart_handshake(ldcp);
2632 
2633 		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
2634 	}
2635 
2636 	if (event & LDC_EVT_READ) {
2637 		/*
2638 		 * Data available for reading.
2639 		 */
2640 		D2(vswp, "%s: id(ld) event(%llx) data READ",
2641 				__func__, ldcp->ldc_id, event);
2642 
2643 		vsw_process_pkt(ldcp);
2644 
2645 		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
2646 
2647 		goto vsw_cb_exit;
2648 	}
2649 
2650 	if (event & LDC_EVT_RESET) {
2651 		rv = ldc_status(ldcp->ldc_handle, &lstatus);
2652 		if (rv != 0) {
2653 			cmn_err(CE_WARN, "Unable to read channel state");
2654 		} else {
2655 			ldcp->ldc_status = lstatus;
2656 		}
2657 		D2(vswp, "%s: id(%ld) event(%llx) RESET:  status (%ld)",
2658 			__func__, ldcp->ldc_id, event, ldcp->ldc_status);
2659 	}
2660 
2661 	if (event & LDC_EVT_DOWN) {
2662 		rv = ldc_status(ldcp->ldc_handle, &lstatus);
2663 		if (rv != 0) {
2664 			cmn_err(CE_WARN, "Unable to read channel state");
2665 		} else {
2666 			ldcp->ldc_status = lstatus;
2667 		}
2668 
2669 		D2(vswp, "%s: id(%ld) event(%llx) DOWN:  status (%ld)",
2670 			__func__, ldcp->ldc_id, event, ldcp->ldc_status);
2671 
2672 	}
2673 
2674 	/*
2675 	 * Catch either LDC_EVT_WRITE which we don't support or any
2676 	 * unknown event.
2677 	 */
2678 	if (event & ~(LDC_EVT_UP | LDC_EVT_RESET
2679 					| LDC_EVT_DOWN | LDC_EVT_READ)) {
2680 
2681 		DERR(vswp, "%s: id(%ld) Unexpected event=(%llx) status(%ld)",
2682 			__func__, ldcp->ldc_id, event, ldcp->ldc_status);
2683 	}
2684 
2685 vsw_cb_exit:
2686 	mutex_exit(&ldcp->ldc_cblock);
2687 
2688 	/*
2689 	 * Let the drain function know we are finishing if it
2690 	 * is waiting.
2691 	 */
2692 	mutex_enter(&ldcp->drain_cv_lock);
2693 	if (ldcp->drain_state == VSW_LDC_DRAINING)
2694 		cv_signal(&ldcp->drain_cv);
2695 	mutex_exit(&ldcp->drain_cv_lock);
2696 
2697 	return (LDC_SUCCESS);
2698 }
2699 
2700 /*
2701  * (Re)start a handshake with our peer by sending them
2702  * our version info.
2703  */
2704 static void
2705 vsw_restart_handshake(vsw_ldc_t *ldcp)
2706 {
2707 	vsw_t		*vswp = ldcp->ldc_vswp;
2708 	vsw_port_t	*port;
2709 	vsw_ldc_list_t	*ldcl;
2710 
2711 	D1(vswp, "vsw_restart_handshake: enter");
2712 
2713 	port = ldcp->ldc_port;
2714 	ldcl = &port->p_ldclist;
2715 
2716 	WRITE_ENTER(&ldcl->lockrw);
2717 
2718 	D2(vswp, "%s: in 0x%llx : out 0x%llx", __func__,
2719 		ldcp->lane_in.lstate, ldcp->lane_out.lstate);
2720 
2721 	vsw_free_lane_resources(ldcp, INBOUND);
2722 	vsw_free_lane_resources(ldcp, OUTBOUND);
2723 	RW_EXIT(&ldcl->lockrw);
2724 
2725 	ldcp->lane_in.lstate = 0;
2726 	ldcp->lane_out.lstate = 0;
2727 
2728 	/*
2729 	 * Remove parent port from any multicast groups
2730 	 * it may have registered with. Client must resend
2731 	 * multicast add command after handshake completes.
2732 	 */
2733 	(void) vsw_del_fdb(vswp, port);
2734 
2735 	vsw_del_mcst_port(port);
2736 
2737 	ldcp->hphase = VSW_MILESTONE0;
2738 
2739 	ldcp->peer_session = 0;
2740 	ldcp->session_status = 0;
2741 
2742 	/*
2743 	 * We now increment the transaction group id. This allows
2744 	 * us to identify and disard any tasks which are still pending
2745 	 * on the taskq and refer to the handshake session we are about
2746 	 * to restart. These stale messages no longer have any real
2747 	 * meaning.
2748 	 */
2749 	mutex_enter(&ldcp->hss_lock);
2750 	ldcp->hss_id++;
2751 	mutex_exit(&ldcp->hss_lock);
2752 
2753 	if (ldcp->hcnt++ > vsw_num_handshakes) {
2754 		cmn_err(CE_WARN, "exceeded number of permitted "
2755 			"handshake attempts (%d) on channel %ld",
2756 			ldcp->hcnt, ldcp->ldc_id);
2757 		return;
2758 	}
2759 
2760 	vsw_send_ver(ldcp);
2761 
2762 	D1(vswp, "vsw_restart_handshake: exit");
2763 }
2764 
2765 /*
2766  * returns 0 if legal for event signified by flag to have
2767  * occured at the time it did. Otherwise returns 1.
2768  */
2769 int
2770 vsw_check_flag(vsw_ldc_t *ldcp, int dir, uint64_t flag)
2771 {
2772 	vsw_t		*vswp = ldcp->ldc_vswp;
2773 	uint64_t	state;
2774 	uint64_t	phase;
2775 
2776 	if (dir == INBOUND)
2777 		state = ldcp->lane_in.lstate;
2778 	else
2779 		state = ldcp->lane_out.lstate;
2780 
2781 	phase = ldcp->hphase;
2782 
2783 	switch (flag) {
2784 	case VSW_VER_INFO_RECV:
2785 		if (phase > VSW_MILESTONE0) {
2786 			DERR(vswp, "vsw_check_flag (%d): VER_INFO_RECV"
2787 				" when in state %d\n", ldcp->ldc_id, phase);
2788 			vsw_restart_handshake(ldcp);
2789 			return (1);
2790 		}
2791 		break;
2792 
2793 	case VSW_VER_ACK_RECV:
2794 	case VSW_VER_NACK_RECV:
2795 		if (!(state & VSW_VER_INFO_SENT)) {
2796 			DERR(vswp, "vsw_check_flag (%d): spurious VER_ACK"
2797 				" or VER_NACK when in state %d\n",
2798 				ldcp->ldc_id, phase);
2799 			vsw_restart_handshake(ldcp);
2800 			return (1);
2801 		} else
2802 			state &= ~VSW_VER_INFO_SENT;
2803 		break;
2804 
2805 	case VSW_ATTR_INFO_RECV:
2806 		if ((phase < VSW_MILESTONE1) || (phase >= VSW_MILESTONE2)) {
2807 			DERR(vswp, "vsw_check_flag (%d): ATTR_INFO_RECV"
2808 				" when in state %d\n", ldcp->ldc_id, phase);
2809 			vsw_restart_handshake(ldcp);
2810 			return (1);
2811 		}
2812 		break;
2813 
2814 	case VSW_ATTR_ACK_RECV:
2815 	case VSW_ATTR_NACK_RECV:
2816 		if (!(state & VSW_ATTR_INFO_SENT)) {
2817 			DERR(vswp, "vsw_check_flag (%d): spurious ATTR_ACK"
2818 				" or ATTR_NACK when in state %d\n",
2819 				ldcp->ldc_id, phase);
2820 			vsw_restart_handshake(ldcp);
2821 			return (1);
2822 		} else
2823 			state &= ~VSW_ATTR_INFO_SENT;
2824 		break;
2825 
2826 	case VSW_DRING_INFO_RECV:
2827 		if (phase < VSW_MILESTONE1) {
2828 			DERR(vswp, "vsw_check_flag (%d): DRING_INFO_RECV"
2829 				" when in state %d\n", ldcp->ldc_id, phase);
2830 			vsw_restart_handshake(ldcp);
2831 			return (1);
2832 		}
2833 		break;
2834 
2835 	case VSW_DRING_ACK_RECV:
2836 	case VSW_DRING_NACK_RECV:
2837 		if (!(state & VSW_DRING_INFO_SENT)) {
2838 			DERR(vswp, "vsw_check_flag (%d): spurious DRING_ACK"
2839 				" or DRING_NACK when in state %d\n",
2840 				ldcp->ldc_id, phase);
2841 			vsw_restart_handshake(ldcp);
2842 			return (1);
2843 		} else
2844 			state &= ~VSW_DRING_INFO_SENT;
2845 		break;
2846 
2847 	case VSW_RDX_INFO_RECV:
2848 		if (phase < VSW_MILESTONE3) {
2849 			DERR(vswp, "vsw_check_flag (%d): RDX_INFO_RECV"
2850 				" when in state %d\n", ldcp->ldc_id, phase);
2851 			vsw_restart_handshake(ldcp);
2852 			return (1);
2853 		}
2854 		break;
2855 
2856 	case VSW_RDX_ACK_RECV:
2857 	case VSW_RDX_NACK_RECV:
2858 		if (!(state & VSW_RDX_INFO_SENT)) {
2859 			DERR(vswp, "vsw_check_flag (%d): spurious RDX_ACK"
2860 				" or RDX_NACK when in state %d\n",
2861 				ldcp->ldc_id, phase);
2862 			vsw_restart_handshake(ldcp);
2863 			return (1);
2864 		} else
2865 			state &= ~VSW_RDX_INFO_SENT;
2866 		break;
2867 
2868 	case VSW_MCST_INFO_RECV:
2869 		if (phase < VSW_MILESTONE3) {
2870 			DERR(vswp, "vsw_check_flag (%d): VSW_MCST_INFO_RECV"
2871 				" when in state %d\n", ldcp->ldc_id, phase);
2872 			vsw_restart_handshake(ldcp);
2873 			return (1);
2874 		}
2875 		break;
2876 
2877 	default:
2878 		DERR(vswp, "vsw_check_flag (%lld): unknown flag (%llx)",
2879 				ldcp->ldc_id, flag);
2880 		return (1);
2881 	}
2882 
2883 	if (dir == INBOUND)
2884 		ldcp->lane_in.lstate = state;
2885 	else
2886 		ldcp->lane_out.lstate = state;
2887 
2888 	D1(vswp, "vsw_check_flag (chan %lld): exit", ldcp->ldc_id);
2889 
2890 	return (0);
2891 }
2892 
2893 void
2894 vsw_next_milestone(vsw_ldc_t *ldcp)
2895 {
2896 	vsw_t		*vswp = ldcp->ldc_vswp;
2897 
2898 	D1(vswp, "%s (chan %lld): enter (phase %ld)", __func__,
2899 		ldcp->ldc_id, ldcp->hphase);
2900 
2901 	DUMP_FLAGS(ldcp->lane_in.lstate);
2902 	DUMP_FLAGS(ldcp->lane_out.lstate);
2903 
2904 	switch (ldcp->hphase) {
2905 
2906 	case VSW_MILESTONE0:
2907 		/*
2908 		 * If we haven't started to handshake with our peer,
2909 		 * start to do so now.
2910 		 */
2911 		if (ldcp->lane_out.lstate == 0) {
2912 			D2(vswp, "%s: (chan %lld) starting handshake "
2913 				"with peer", __func__, ldcp->ldc_id);
2914 			vsw_restart_handshake(ldcp);
2915 		}
2916 
2917 		/*
2918 		 * Only way to pass this milestone is to have successfully
2919 		 * negotiated version info.
2920 		 */
2921 		if ((ldcp->lane_in.lstate & VSW_VER_ACK_SENT) &&
2922 			(ldcp->lane_out.lstate & VSW_VER_ACK_RECV)) {
2923 
2924 			D2(vswp, "%s: (chan %lld) leaving milestone 0",
2925 				__func__, ldcp->ldc_id);
2926 
2927 			/*
2928 			 * Next milestone is passed when attribute
2929 			 * information has been successfully exchanged.
2930 			 */
2931 			ldcp->hphase = VSW_MILESTONE1;
2932 			vsw_send_attr(ldcp);
2933 
2934 		}
2935 		break;
2936 
2937 	case VSW_MILESTONE1:
2938 		/*
2939 		 * Only way to pass this milestone is to have successfully
2940 		 * negotiated attribute information.
2941 		 */
2942 		if (ldcp->lane_in.lstate & VSW_ATTR_ACK_SENT) {
2943 
2944 			ldcp->hphase = VSW_MILESTONE2;
2945 
2946 			/*
2947 			 * If the peer device has said it wishes to
2948 			 * use descriptor rings then we send it our ring
2949 			 * info, otherwise we just set up a private ring
2950 			 * which we use an internal buffer
2951 			 */
2952 			if (ldcp->lane_in.xfer_mode == VIO_DRING_MODE)
2953 				vsw_send_dring_info(ldcp);
2954 		}
2955 		break;
2956 
2957 
2958 	case VSW_MILESTONE2:
2959 		/*
2960 		 * If peer has indicated in its attribute message that
2961 		 * it wishes to use descriptor rings then the only way
2962 		 * to pass this milestone is for us to have received
2963 		 * valid dring info.
2964 		 *
2965 		 * If peer is not using descriptor rings then just fall
2966 		 * through.
2967 		 */
2968 		if ((ldcp->lane_in.xfer_mode == VIO_DRING_MODE) &&
2969 			(!(ldcp->lane_in.lstate & VSW_DRING_ACK_SENT)))
2970 			break;
2971 
2972 		D2(vswp, "%s: (chan %lld) leaving milestone 2",
2973 				__func__, ldcp->ldc_id);
2974 
2975 		ldcp->hphase = VSW_MILESTONE3;
2976 		vsw_send_rdx(ldcp);
2977 		break;
2978 
2979 	case VSW_MILESTONE3:
2980 		/*
2981 		 * Pass this milestone when all paramaters have been
2982 		 * successfully exchanged and RDX sent in both directions.
2983 		 *
2984 		 * Mark outbound lane as available to transmit data.
2985 		 */
2986 		if ((ldcp->lane_in.lstate & VSW_RDX_ACK_SENT) &&
2987 			(ldcp->lane_out.lstate & VSW_RDX_ACK_RECV)) {
2988 
2989 			D2(vswp, "%s: (chan %lld) leaving milestone 3",
2990 				__func__, ldcp->ldc_id);
2991 			D2(vswp, "%s: ** handshake complete **", __func__);
2992 			ldcp->lane_out.lstate |= VSW_LANE_ACTIVE;
2993 			ldcp->hphase = VSW_MILESTONE4;
2994 			ldcp->hcnt = 0;
2995 			DISPLAY_STATE();
2996 		}
2997 		break;
2998 
2999 	case VSW_MILESTONE4:
3000 		D2(vswp, "%s: (chan %lld) in milestone 4", __func__,
3001 							ldcp->ldc_id);
3002 		break;
3003 
3004 	default:
3005 		DERR(vswp, "%s: (chan %lld) Unknown Phase %x", __func__,
3006 			ldcp->ldc_id, ldcp->hphase);
3007 	}
3008 
3009 	D1(vswp, "%s (chan %lld): exit (phase %ld)", __func__, ldcp->ldc_id,
3010 		ldcp->hphase);
3011 }
3012 
3013 /*
3014  * Check if major version is supported.
3015  *
3016  * Returns 0 if finds supported major number, and if necessary
3017  * adjusts the minor field.
3018  *
3019  * Returns 1 if can't match major number exactly. Sets mjor/minor
3020  * to next lowest support values, or to zero if no other values possible.
3021  */
3022 static int
3023 vsw_supported_version(vio_ver_msg_t *vp)
3024 {
3025 	int	i;
3026 
3027 	D1(NULL, "vsw_supported_version: enter");
3028 
3029 	for (i = 0; i < VSW_NUM_VER; i++) {
3030 		if (vsw_versions[i].ver_major == vp->ver_major) {
3031 			/*
3032 			 * Matching or lower major version found. Update
3033 			 * minor number if necessary.
3034 			 */
3035 			if (vp->ver_minor > vsw_versions[i].ver_minor) {
3036 				D2(NULL, "%s: adjusting minor value"
3037 					" from %d to %d", __func__,
3038 					vp->ver_minor,
3039 					vsw_versions[i].ver_minor);
3040 				vp->ver_minor = vsw_versions[i].ver_minor;
3041 			}
3042 
3043 			return (0);
3044 		}
3045 
3046 		if (vsw_versions[i].ver_major < vp->ver_major) {
3047 			if (vp->ver_minor > vsw_versions[i].ver_minor) {
3048 				D2(NULL, "%s: adjusting minor value"
3049 					" from %d to %d", __func__,
3050 					vp->ver_minor,
3051 					vsw_versions[i].ver_minor);
3052 				vp->ver_minor = vsw_versions[i].ver_minor;
3053 			}
3054 			return (1);
3055 		}
3056 	}
3057 
3058 	/* No match was possible, zero out fields */
3059 	vp->ver_major = 0;
3060 	vp->ver_minor = 0;
3061 
3062 	D1(NULL, "vsw_supported_version: exit");
3063 
3064 	return (1);
3065 }
3066 
3067 /*
3068  * Main routine for processing messages received over LDC.
3069  */
3070 static void
3071 vsw_process_pkt(void *arg)
3072 {
3073 	vsw_ldc_t	*ldcp = (vsw_ldc_t  *)arg;
3074 	vsw_t 		*vswp = ldcp->ldc_vswp;
3075 	size_t		msglen;
3076 	vio_msg_tag_t	tag;
3077 	def_msg_t	dmsg;
3078 	int 		rv = 0;
3079 
3080 	D1(vswp, "%s enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);
3081 
3082 	/*
3083 	 * If channel is up read messages until channel is empty.
3084 	 */
3085 	do {
3086 		msglen = sizeof (dmsg);
3087 		rv = ldc_read(ldcp->ldc_handle, (caddr_t)&dmsg, &msglen);
3088 
3089 		if (rv != 0) {
3090 			DERR(vswp, "%s :ldc_read err id(%lld) rv(%d) "
3091 				"len(%d)\n", __func__, ldcp->ldc_id,
3092 							rv, msglen);
3093 			break;
3094 		}
3095 
3096 		if (msglen == 0) {
3097 			D2(vswp, "%s: ldc_read id(%lld) NODATA", __func__,
3098 			ldcp->ldc_id);
3099 			break;
3100 		}
3101 
3102 		D2(vswp, "%s: ldc_read id(%lld): msglen(%d)", __func__,
3103 		    ldcp->ldc_id, msglen);
3104 
3105 		/*
3106 		 * Figure out what sort of packet we have gotten by
3107 		 * examining the msg tag, and then switch it appropriately.
3108 		 */
3109 		bcopy(&dmsg, &tag, sizeof (vio_msg_tag_t));
3110 
3111 		switch (tag.vio_msgtype) {
3112 		case VIO_TYPE_CTRL:
3113 			vsw_dispatch_ctrl_task(ldcp, &dmsg, tag);
3114 			break;
3115 		case VIO_TYPE_DATA:
3116 			vsw_process_data_pkt(ldcp, &dmsg, tag);
3117 			break;
3118 		case VIO_TYPE_ERR:
3119 			vsw_process_err_pkt(ldcp, &dmsg, tag);
3120 			break;
3121 		default:
3122 			DERR(vswp, "%s: Unknown tag(%lx) ", __func__,
3123 				"id(%lx)\n", tag.vio_msgtype, ldcp->ldc_id);
3124 			break;
3125 		}
3126 	} while (msglen);
3127 
3128 	D1(vswp, "%s exit: ldcid (%lld)\n", __func__, ldcp->ldc_id);
3129 }
3130 
3131 /*
3132  * Dispatch a task to process a VIO control message.
3133  */
3134 static void
3135 vsw_dispatch_ctrl_task(vsw_ldc_t *ldcp, void *cpkt, vio_msg_tag_t tag)
3136 {
3137 	vsw_ctrl_task_t		*ctaskp = NULL;
3138 	vsw_port_t		*port = ldcp->ldc_port;
3139 	vsw_t			*vswp = port->p_vswp;
3140 
3141 	D1(vswp, "%s: enter", __func__);
3142 
3143 	/*
3144 	 * We need to handle RDX ACK messages in-band as once they
3145 	 * are exchanged it is possible that we will get an
3146 	 * immediate (legitimate) data packet.
3147 	 */
3148 	if ((tag.vio_subtype_env == VIO_RDX) &&
3149 		(tag.vio_subtype == VIO_SUBTYPE_ACK)) {
3150 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_RDX_ACK_RECV))
3151 			return;
3152 
3153 		ldcp->lane_out.lstate |= VSW_RDX_ACK_RECV;
3154 		vsw_next_milestone(ldcp);
3155 		D2(vswp, "%s (%ld) handling RDX_ACK in place", __func__,
3156 			ldcp->ldc_id);
3157 		return;
3158 	}
3159 
3160 	ctaskp = kmem_alloc(sizeof (vsw_ctrl_task_t), KM_NOSLEEP);
3161 
3162 	if (ctaskp == NULL) {
3163 		DERR(vswp, "%s: unable to alloc space for ctrl"
3164 			" msg", __func__);
3165 		vsw_restart_handshake(ldcp);
3166 		return;
3167 	}
3168 
3169 	ctaskp->ldcp = ldcp;
3170 	bcopy((def_msg_t *)cpkt, &ctaskp->pktp, sizeof (def_msg_t));
3171 	mutex_enter(&ldcp->hss_lock);
3172 	ctaskp->hss_id = ldcp->hss_id;
3173 	mutex_exit(&ldcp->hss_lock);
3174 
3175 	/*
3176 	 * Dispatch task to processing taskq if port is not in
3177 	 * the process of being detached.
3178 	 */
3179 	mutex_enter(&port->state_lock);
3180 	if (port->state == VSW_PORT_INIT) {
3181 		if ((vswp->taskq_p == NULL) ||
3182 			(ddi_taskq_dispatch(vswp->taskq_p,
3183 			vsw_process_ctrl_pkt, ctaskp, DDI_NOSLEEP)
3184 							!= DDI_SUCCESS)) {
3185 			DERR(vswp, "%s: unable to dispatch task to taskq",
3186 				__func__);
3187 			kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
3188 			mutex_exit(&port->state_lock);
3189 			vsw_restart_handshake(ldcp);
3190 			return;
3191 		}
3192 	} else {
3193 		DWARN(vswp, "%s: port %d detaching, not dispatching "
3194 			"task", __func__, port->p_instance);
3195 	}
3196 
3197 	mutex_exit(&port->state_lock);
3198 
3199 	D2(vswp, "%s: dispatched task to taskq for chan %d", __func__,
3200 			ldcp->ldc_id);
3201 	D1(vswp, "%s: exit", __func__);
3202 }
3203 
3204 /*
3205  * Process a VIO ctrl message. Invoked from taskq.
3206  */
3207 static void
3208 vsw_process_ctrl_pkt(void *arg)
3209 {
3210 	vsw_ctrl_task_t	*ctaskp = (vsw_ctrl_task_t *)arg;
3211 	vsw_ldc_t	*ldcp = ctaskp->ldcp;
3212 	vsw_t 		*vswp = ldcp->ldc_vswp;
3213 	vio_msg_tag_t	tag;
3214 	uint16_t	env;
3215 
3216 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3217 
3218 	bcopy(&ctaskp->pktp, &tag, sizeof (vio_msg_tag_t));
3219 	env = tag.vio_subtype_env;
3220 
3221 	/* stale pkt check */
3222 	mutex_enter(&ldcp->hss_lock);
3223 	if (ctaskp->hss_id < ldcp->hss_id) {
3224 		DWARN(vswp, "%s: discarding stale packet belonging to"
3225 			" earlier (%ld) handshake session", __func__,
3226 			ctaskp->hss_id);
3227 		mutex_exit(&ldcp->hss_lock);
3228 		return;
3229 	}
3230 	mutex_exit(&ldcp->hss_lock);
3231 
3232 	/* session id check */
3233 	if (ldcp->session_status & VSW_PEER_SESSION) {
3234 		if (ldcp->peer_session != tag.vio_sid) {
3235 			DERR(vswp, "%s (chan %d): invalid session id (%llx)",
3236 				__func__, ldcp->ldc_id, tag.vio_sid);
3237 			kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
3238 			vsw_restart_handshake(ldcp);
3239 			return;
3240 		}
3241 	}
3242 
3243 	/*
3244 	 * Switch on vio_subtype envelope, then let lower routines
3245 	 * decide if its an INFO, ACK or NACK packet.
3246 	 */
3247 	switch (env) {
3248 	case VIO_VER_INFO:
3249 		vsw_process_ctrl_ver_pkt(ldcp, &ctaskp->pktp);
3250 		break;
3251 	case VIO_DRING_REG:
3252 		vsw_process_ctrl_dring_reg_pkt(ldcp, &ctaskp->pktp);
3253 		break;
3254 	case VIO_DRING_UNREG:
3255 		vsw_process_ctrl_dring_unreg_pkt(ldcp, &ctaskp->pktp);
3256 		break;
3257 	case VIO_ATTR_INFO:
3258 		vsw_process_ctrl_attr_pkt(ldcp, &ctaskp->pktp);
3259 		break;
3260 	case VNET_MCAST_INFO:
3261 		vsw_process_ctrl_mcst_pkt(ldcp, &ctaskp->pktp);
3262 		break;
3263 	case VIO_RDX:
3264 		vsw_process_ctrl_rdx_pkt(ldcp, &ctaskp->pktp);
3265 		break;
3266 	default:
3267 		DERR(vswp, "%s : unknown vio_subtype_env (%x)\n",
3268 							__func__, env);
3269 	}
3270 
3271 	kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
3272 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3273 }
3274 
3275 /*
3276  * Version negotiation. We can end up here either because our peer
3277  * has responded to a handshake message we have sent it, or our peer
3278  * has initiated a handshake with us. If its the former then can only
3279  * be ACK or NACK, if its the later can only be INFO.
3280  *
3281  * If its an ACK we move to the next stage of the handshake, namely
3282  * attribute exchange. If its a NACK we see if we can specify another
3283  * version, if we can't we stop.
3284  *
3285  * If it is an INFO we reset all params associated with communication
3286  * in that direction over this channel (remember connection is
3287  * essentially 2 independent simplex channels).
3288  */
3289 void
3290 vsw_process_ctrl_ver_pkt(vsw_ldc_t *ldcp, void *pkt)
3291 {
3292 	vio_ver_msg_t	*ver_pkt;
3293 	vsw_t 		*vswp = ldcp->ldc_vswp;
3294 
3295 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3296 
3297 	/*
3298 	 * We know this is a ctrl/version packet so
3299 	 * cast it into the correct structure.
3300 	 */
3301 	ver_pkt = (vio_ver_msg_t *)pkt;
3302 
3303 	switch (ver_pkt->tag.vio_subtype) {
3304 	case VIO_SUBTYPE_INFO:
3305 		D2(vswp, "vsw_process_ctrl_ver_pkt: VIO_SUBTYPE_INFO\n");
3306 
3307 		/*
3308 		 * Record the session id, which we will use from now
3309 		 * until we see another VER_INFO msg. Even then the
3310 		 * session id in most cases will be unchanged, execpt
3311 		 * if channel was reset.
3312 		 */
3313 		if ((ldcp->session_status & VSW_PEER_SESSION) &&
3314 			(ldcp->peer_session != ver_pkt->tag.vio_sid)) {
3315 			DERR(vswp, "%s: updating session id for chan %lld "
3316 				"from %llx to %llx", __func__, ldcp->ldc_id,
3317 				ldcp->peer_session, ver_pkt->tag.vio_sid);
3318 		}
3319 
3320 		ldcp->peer_session = ver_pkt->tag.vio_sid;
3321 		ldcp->session_status |= VSW_PEER_SESSION;
3322 
3323 		/* Legal message at this time ? */
3324 		if (vsw_check_flag(ldcp, INBOUND, VSW_VER_INFO_RECV))
3325 			return;
3326 
3327 		/*
3328 		 * First check the device class. Currently only expect
3329 		 * to be talking to a network device. In the future may
3330 		 * also talk to another switch.
3331 		 */
3332 		if (ver_pkt->dev_class != VDEV_NETWORK) {
3333 			DERR(vswp, "%s: illegal device class %d", __func__,
3334 				ver_pkt->dev_class);
3335 
3336 			ver_pkt->tag.vio_sid = ldcp->local_session;
3337 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
3338 
3339 			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
3340 
3341 			vsw_send_msg(ldcp, (void *)ver_pkt,
3342 					sizeof (vio_ver_msg_t));
3343 
3344 			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
3345 			vsw_next_milestone(ldcp);
3346 			return;
3347 		} else {
3348 			ldcp->dev_class = ver_pkt->dev_class;
3349 		}
3350 
3351 		/*
3352 		 * Now check the version.
3353 		 */
3354 		if (vsw_supported_version(ver_pkt) == 0) {
3355 			/*
3356 			 * Support this major version and possibly
3357 			 * adjusted minor version.
3358 			 */
3359 
3360 			D2(vswp, "%s: accepted ver %d:%d", __func__,
3361 				ver_pkt->ver_major, ver_pkt->ver_minor);
3362 
3363 			/* Store accepted values */
3364 			ldcp->lane_in.ver_major = ver_pkt->ver_major;
3365 			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
3366 
3367 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3368 
3369 			ldcp->lane_in.lstate |= VSW_VER_ACK_SENT;
3370 		} else {
3371 			/*
3372 			 * NACK back with the next lower major/minor
3373 			 * pairing we support (if don't suuport any more
3374 			 * versions then they will be set to zero.
3375 			 */
3376 
3377 			D2(vswp, "%s: replying with ver %d:%d", __func__,
3378 				ver_pkt->ver_major, ver_pkt->ver_minor);
3379 
3380 			/* Store updated values */
3381 			ldcp->lane_in.ver_major = ver_pkt->ver_major;
3382 			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
3383 
3384 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
3385 
3386 			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
3387 		}
3388 
3389 		DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
3390 		ver_pkt->tag.vio_sid = ldcp->local_session;
3391 		vsw_send_msg(ldcp, (void *)ver_pkt, sizeof (vio_ver_msg_t));
3392 
3393 		vsw_next_milestone(ldcp);
3394 		break;
3395 
3396 	case VIO_SUBTYPE_ACK:
3397 		D2(vswp, "%s: VIO_SUBTYPE_ACK\n", __func__);
3398 
3399 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_ACK_RECV))
3400 			return;
3401 
3402 		/* Store updated values */
3403 		ldcp->lane_in.ver_major = ver_pkt->ver_major;
3404 		ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
3405 
3406 
3407 		ldcp->lane_out.lstate |= VSW_VER_ACK_RECV;
3408 		vsw_next_milestone(ldcp);
3409 
3410 		break;
3411 
3412 	case VIO_SUBTYPE_NACK:
3413 		D2(vswp, "%s: VIO_SUBTYPE_NACK\n", __func__);
3414 
3415 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_NACK_RECV))
3416 			return;
3417 
3418 		/*
3419 		 * If our peer sent us a NACK with the ver fields set to
3420 		 * zero then there is nothing more we can do. Otherwise see
3421 		 * if we support either the version suggested, or a lesser
3422 		 * one.
3423 		 */
3424 		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
3425 			DERR(vswp, "%s: peer unable to negotiate any "
3426 				"further.", __func__);
3427 			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
3428 			vsw_next_milestone(ldcp);
3429 			return;
3430 		}
3431 
3432 		/*
3433 		 * Check to see if we support this major version or
3434 		 * a lower one. If we don't then maj/min will be set
3435 		 * to zero.
3436 		 */
3437 		(void) vsw_supported_version(ver_pkt);
3438 		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
3439 			/* Nothing more we can do */
3440 			DERR(vswp, "%s: version negotiation failed.\n",
3441 								__func__);
3442 			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
3443 			vsw_next_milestone(ldcp);
3444 		} else {
3445 			/* found a supported major version */
3446 			ldcp->lane_out.ver_major = ver_pkt->ver_major;
3447 			ldcp->lane_out.ver_minor = ver_pkt->ver_minor;
3448 
3449 			D2(vswp, "%s: resending with updated values (%x, %x)",
3450 				__func__, ver_pkt->ver_major,
3451 				ver_pkt->ver_minor);
3452 
3453 			ldcp->lane_out.lstate |= VSW_VER_INFO_SENT;
3454 			ver_pkt->tag.vio_sid = ldcp->local_session;
3455 			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
3456 
3457 			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
3458 
3459 			vsw_send_msg(ldcp, (void *)ver_pkt,
3460 					sizeof (vio_ver_msg_t));
3461 
3462 			vsw_next_milestone(ldcp);
3463 
3464 		}
3465 		break;
3466 
3467 	default:
3468 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
3469 			ver_pkt->tag.vio_subtype);
3470 	}
3471 
3472 	D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
3473 }
3474 
3475 /*
3476  * Process an attribute packet. We can end up here either because our peer
3477  * has ACK/NACK'ed back to an earlier ATTR msg we had sent it, or our
3478  * peer has sent us an attribute INFO message
3479  *
3480  * If its an ACK we then move to the next stage of the handshake which
3481  * is to send our descriptor ring info to our peer. If its a NACK then
3482  * there is nothing more we can (currently) do.
3483  *
3484  * If we get a valid/acceptable INFO packet (and we have already negotiated
3485  * a version) we ACK back and set channel state to ATTR_RECV, otherwise we
3486  * NACK back and reset channel state to INACTIV.
3487  *
3488  * FUTURE: in time we will probably negotiate over attributes, but for
3489  * the moment unacceptable attributes are regarded as a fatal error.
3490  *
3491  */
3492 void
3493 vsw_process_ctrl_attr_pkt(vsw_ldc_t *ldcp, void *pkt)
3494 {
3495 	vnet_attr_msg_t		*attr_pkt;
3496 	vsw_t			*vswp = ldcp->ldc_vswp;
3497 	vsw_port_t		*port = ldcp->ldc_port;
3498 	uint64_t		macaddr = 0;
3499 	int			i;
3500 
3501 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
3502 
3503 	/*
3504 	 * We know this is a ctrl/attr packet so
3505 	 * cast it into the correct structure.
3506 	 */
3507 	attr_pkt = (vnet_attr_msg_t *)pkt;
3508 
3509 	switch (attr_pkt->tag.vio_subtype) {
3510 	case VIO_SUBTYPE_INFO:
3511 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3512 
3513 		if (vsw_check_flag(ldcp, INBOUND, VSW_ATTR_INFO_RECV))
3514 			return;
3515 
3516 		/*
3517 		 * If the attributes are unacceptable then we NACK back.
3518 		 */
3519 		if (vsw_check_attr(attr_pkt, ldcp->ldc_port)) {
3520 
3521 			DERR(vswp, "%s (chan %d): invalid attributes",
3522 				__func__, ldcp->ldc_id);
3523 
3524 			vsw_free_lane_resources(ldcp, INBOUND);
3525 
3526 			attr_pkt->tag.vio_sid = ldcp->local_session;
3527 			attr_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
3528 
3529 			DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);
3530 			ldcp->lane_in.lstate |= VSW_ATTR_NACK_SENT;
3531 			vsw_send_msg(ldcp, (void *)attr_pkt,
3532 					sizeof (vnet_attr_msg_t));
3533 
3534 			vsw_next_milestone(ldcp);
3535 			return;
3536 		}
3537 
3538 		/*
3539 		 * Otherwise store attributes for this lane and update
3540 		 * lane state.
3541 		 */
3542 		ldcp->lane_in.mtu = attr_pkt->mtu;
3543 		ldcp->lane_in.addr = attr_pkt->addr;
3544 		ldcp->lane_in.addr_type = attr_pkt->addr_type;
3545 		ldcp->lane_in.xfer_mode = attr_pkt->xfer_mode;
3546 		ldcp->lane_in.ack_freq = attr_pkt->ack_freq;
3547 
3548 		macaddr = ldcp->lane_in.addr;
3549 		for (i = ETHERADDRL - 1; i >= 0; i--) {
3550 			port->p_macaddr.ether_addr_octet[i] = macaddr & 0xFF;
3551 			macaddr >>= 8;
3552 		}
3553 
3554 		/* create the fdb entry for this port/mac address */
3555 		(void) vsw_add_fdb(vswp, port);
3556 
3557 		/* setup device specifc xmit routines */
3558 		mutex_enter(&port->tx_lock);
3559 		if (ldcp->lane_in.xfer_mode == VIO_DRING_MODE) {
3560 			D2(vswp, "%s: mode = VIO_DRING_MODE", __func__);
3561 			port->transmit = vsw_dringsend;
3562 		} else if (ldcp->lane_in.xfer_mode == VIO_DESC_MODE) {
3563 			D2(vswp, "%s: mode = VIO_DESC_MODE", __func__);
3564 			vsw_create_privring(ldcp);
3565 			port->transmit = vsw_descrsend;
3566 		}
3567 		mutex_exit(&port->tx_lock);
3568 
3569 		attr_pkt->tag.vio_sid = ldcp->local_session;
3570 		attr_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3571 
3572 		DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);
3573 
3574 		ldcp->lane_in.lstate |= VSW_ATTR_ACK_SENT;
3575 
3576 		vsw_send_msg(ldcp, (void *)attr_pkt,
3577 					sizeof (vnet_attr_msg_t));
3578 
3579 		vsw_next_milestone(ldcp);
3580 		break;
3581 
3582 	case VIO_SUBTYPE_ACK:
3583 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3584 
3585 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_ACK_RECV))
3586 			return;
3587 
3588 		ldcp->lane_out.lstate |= VSW_ATTR_ACK_RECV;
3589 		vsw_next_milestone(ldcp);
3590 		break;
3591 
3592 	case VIO_SUBTYPE_NACK:
3593 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3594 
3595 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_NACK_RECV))
3596 			return;
3597 
3598 		ldcp->lane_out.lstate |= VSW_ATTR_NACK_RECV;
3599 		vsw_next_milestone(ldcp);
3600 		break;
3601 
3602 	default:
3603 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
3604 			attr_pkt->tag.vio_subtype);
3605 	}
3606 
3607 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
3608 }
3609 
3610 /*
3611  * Process a dring info packet. We can end up here either because our peer
3612  * has ACK/NACK'ed back to an earlier DRING msg we had sent it, or our
3613  * peer has sent us a dring INFO message.
3614  *
3615  * If we get a valid/acceptable INFO packet (and we have already negotiated
3616  * a version) we ACK back and update the lane state, otherwise we NACK back.
3617  *
3618  * FUTURE: nothing to stop client from sending us info on multiple dring's
3619  * but for the moment we will just use the first one we are given.
3620  *
3621  */
3622 void
3623 vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *ldcp, void *pkt)
3624 {
3625 	vio_dring_reg_msg_t	*dring_pkt;
3626 	vsw_t			*vswp = ldcp->ldc_vswp;
3627 	ldc_mem_info_t		minfo;
3628 	dring_info_t		*dp, *dbp;
3629 	int			dring_found = 0;
3630 
3631 	/*
3632 	 * We know this is a ctrl/dring packet so
3633 	 * cast it into the correct structure.
3634 	 */
3635 	dring_pkt = (vio_dring_reg_msg_t *)pkt;
3636 
3637 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
3638 
3639 	switch (dring_pkt->tag.vio_subtype) {
3640 	case VIO_SUBTYPE_INFO:
3641 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3642 
3643 		if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
3644 			return;
3645 
3646 		/*
3647 		 * If the dring params are unacceptable then we NACK back.
3648 		 */
3649 		if (vsw_check_dring_info(dring_pkt)) {
3650 
3651 			DERR(vswp, "%s (%lld): invalid dring info",
3652 				__func__, ldcp->ldc_id);
3653 
3654 			vsw_free_lane_resources(ldcp, INBOUND);
3655 
3656 			dring_pkt->tag.vio_sid = ldcp->local_session;
3657 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
3658 
3659 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
3660 
3661 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
3662 
3663 			vsw_send_msg(ldcp, (void *)dring_pkt,
3664 					sizeof (vio_dring_reg_msg_t));
3665 
3666 			vsw_next_milestone(ldcp);
3667 			return;
3668 		}
3669 
3670 		/*
3671 		 * Otherwise, attempt to map in the dring using the
3672 		 * cookie. If that succeeds we send back a unique dring
3673 		 * identifier that the sending side will use in future
3674 		 * to refer to this descriptor ring.
3675 		 */
3676 		dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
3677 
3678 		dp->num_descriptors = dring_pkt->num_descriptors;
3679 		dp->descriptor_size = dring_pkt->descriptor_size;
3680 		dp->options = dring_pkt->options;
3681 		dp->ncookies = dring_pkt->ncookies;
3682 
3683 		/*
3684 		 * Note: should only get one cookie. Enforced in
3685 		 * the ldc layer.
3686 		 */
3687 		bcopy(&dring_pkt->cookie[0], &dp->cookie[0],
3688 			sizeof (ldc_mem_cookie_t));
3689 
3690 		D2(vswp, "%s: num_desc %ld : desc_size %ld", __func__,
3691 			dp->num_descriptors, dp->descriptor_size);
3692 		D2(vswp, "%s: options 0x%lx: ncookies %ld", __func__,
3693 			dp->options, dp->ncookies);
3694 
3695 		if ((ldc_mem_dring_map(ldcp->ldc_handle, &dp->cookie[0],
3696 			dp->ncookies, dp->num_descriptors,
3697 			dp->descriptor_size, LDC_SHADOW_MAP,
3698 			&(dp->handle))) != 0) {
3699 
3700 			DERR(vswp, "%s: dring_map failed\n", __func__);
3701 
3702 			kmem_free(dp, sizeof (dring_info_t));
3703 			vsw_free_lane_resources(ldcp, INBOUND);
3704 
3705 			dring_pkt->tag.vio_sid = ldcp->local_session;
3706 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
3707 
3708 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
3709 
3710 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
3711 			vsw_send_msg(ldcp, (void *)dring_pkt,
3712 				sizeof (vio_dring_reg_msg_t));
3713 
3714 			vsw_next_milestone(ldcp);
3715 			return;
3716 		}
3717 
3718 		if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) {
3719 
3720 			DERR(vswp, "%s: dring_addr failed\n", __func__);
3721 
3722 			kmem_free(dp, sizeof (dring_info_t));
3723 			vsw_free_lane_resources(ldcp, INBOUND);
3724 
3725 			dring_pkt->tag.vio_sid = ldcp->local_session;
3726 			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
3727 
3728 			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
3729 
3730 			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
3731 			vsw_send_msg(ldcp, (void *)dring_pkt,
3732 				sizeof (vio_dring_reg_msg_t));
3733 
3734 			vsw_next_milestone(ldcp);
3735 			return;
3736 		} else {
3737 			/* store the address of the pub part of ring */
3738 			dp->pub_addr = minfo.vaddr;
3739 		}
3740 
3741 		/* no private section as we are importing */
3742 		dp->priv_addr = NULL;
3743 
3744 		/*
3745 		 * Using simple mono increasing int for ident at
3746 		 * the moment.
3747 		 */
3748 		dp->ident = ldcp->next_ident;
3749 		ldcp->next_ident++;
3750 
3751 		dp->end_idx = 0;
3752 		dp->next = NULL;
3753 
3754 		/*
3755 		 * Link it onto the end of the list of drings
3756 		 * for this lane.
3757 		 */
3758 		if (ldcp->lane_in.dringp == NULL) {
3759 			D2(vswp, "%s: adding first INBOUND dring", __func__);
3760 			ldcp->lane_in.dringp = dp;
3761 		} else {
3762 			dbp = ldcp->lane_in.dringp;
3763 
3764 			while (dbp->next != NULL)
3765 				dbp = dbp->next;
3766 
3767 			dbp->next = dp;
3768 		}
3769 
3770 		/* acknowledge it */
3771 		dring_pkt->tag.vio_sid = ldcp->local_session;
3772 		dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3773 		dring_pkt->dring_ident = dp->ident;
3774 
3775 		vsw_send_msg(ldcp, (void *)dring_pkt,
3776 				sizeof (vio_dring_reg_msg_t));
3777 
3778 		ldcp->lane_in.lstate |= VSW_DRING_ACK_SENT;
3779 		vsw_next_milestone(ldcp);
3780 		break;
3781 
3782 	case VIO_SUBTYPE_ACK:
3783 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3784 
3785 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_ACK_RECV))
3786 			return;
3787 
3788 		/*
3789 		 * Peer is acknowledging our dring info and will have
3790 		 * sent us a dring identifier which we will use to
3791 		 * refer to this ring w.r.t. our peer.
3792 		 */
3793 		dp = ldcp->lane_out.dringp;
3794 		if (dp != NULL) {
3795 			/*
3796 			 * Find the ring this ident should be associated
3797 			 * with.
3798 			 */
3799 			if (vsw_dring_match(dp, dring_pkt)) {
3800 				dring_found = 1;
3801 
3802 			} else while (dp != NULL) {
3803 				if (vsw_dring_match(dp, dring_pkt)) {
3804 					dring_found = 1;
3805 					break;
3806 				}
3807 				dp = dp->next;
3808 			}
3809 
3810 			if (dring_found == 0) {
3811 				DERR(NULL, "%s: unrecognised ring cookie",
3812 					__func__);
3813 				vsw_restart_handshake(ldcp);
3814 				return;
3815 			}
3816 
3817 		} else {
3818 			DERR(vswp, "%s: DRING ACK received but no drings "
3819 				"allocated", __func__);
3820 			vsw_restart_handshake(ldcp);
3821 			return;
3822 		}
3823 
3824 		/* store ident */
3825 		dp->ident = dring_pkt->dring_ident;
3826 		ldcp->lane_out.lstate |= VSW_DRING_ACK_RECV;
3827 		vsw_next_milestone(ldcp);
3828 		break;
3829 
3830 	case VIO_SUBTYPE_NACK:
3831 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3832 
3833 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_NACK_RECV))
3834 			return;
3835 
3836 		ldcp->lane_out.lstate |= VSW_DRING_NACK_RECV;
3837 		vsw_next_milestone(ldcp);
3838 		break;
3839 
3840 	default:
3841 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
3842 			dring_pkt->tag.vio_subtype);
3843 	}
3844 
3845 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
3846 }
3847 
3848 /*
3849  * Process a request from peer to unregister a dring.
3850  *
3851  * For the moment we just restart the handshake if our
3852  * peer endpoint attempts to unregister a dring.
3853  */
3854 void
3855 vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *ldcp, void *pkt)
3856 {
3857 	vsw_t			*vswp = ldcp->ldc_vswp;
3858 	vio_dring_unreg_msg_t	*dring_pkt;
3859 
3860 	/*
3861 	 * We know this is a ctrl/dring packet so
3862 	 * cast it into the correct structure.
3863 	 */
3864 	dring_pkt = (vio_dring_unreg_msg_t *)pkt;
3865 
3866 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3867 
3868 	switch (dring_pkt->tag.vio_subtype) {
3869 	case VIO_SUBTYPE_INFO:
3870 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3871 
3872 		DWARN(vswp, "%s: restarting handshake..", __func__);
3873 		vsw_restart_handshake(ldcp);
3874 		break;
3875 
3876 	case VIO_SUBTYPE_ACK:
3877 		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3878 
3879 		DWARN(vswp, "%s: restarting handshake..", __func__);
3880 		vsw_restart_handshake(ldcp);
3881 		break;
3882 
3883 	case VIO_SUBTYPE_NACK:
3884 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3885 
3886 		DWARN(vswp, "%s: restarting handshake..", __func__);
3887 		vsw_restart_handshake(ldcp);
3888 		break;
3889 
3890 	default:
3891 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
3892 			dring_pkt->tag.vio_subtype);
3893 		vsw_restart_handshake(ldcp);
3894 	}
3895 
3896 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3897 }
3898 
/*
 * Turn the multicast message 'pkt' into a NACK carrying our session id
 * and send it back over channel 'ldcp'. Wrapped in do/while (0) so the
 * macro behaves as a single statement; invoke it with a trailing ';'.
 */
#define	SND_MCST_NACK(ldcp, pkt) \
	do { \
		(pkt)->tag.vio_subtype = VIO_SUBTYPE_NACK; \
		(pkt)->tag.vio_sid = (ldcp)->local_session; \
		vsw_send_msg((ldcp), (void *)(pkt), \
		    sizeof (vnet_mcast_msg_t)); \
	} while (0)
3903 
3904 /*
3905  * Process a multicast request from a vnet.
3906  *
3907  * Vnet's specify a multicast address that they are interested in. This
3908  * address is used as a key into the hash table which forms the multicast
3909  * forwarding database (mFDB).
3910  *
3911  * The table keys are the multicast addresses, while the table entries
3912  * are pointers to lists of ports which wish to receive packets for the
3913  * specified multicast address.
3914  *
3915  * When a multicast packet is being switched we use the address as a key
3916  * into the hash table, and then walk the appropriate port list forwarding
3917  * the pkt to each port in turn.
3918  *
3919  * If a vnet is no longer interested in a particular multicast grouping
3920  * we simply find the correct location in the hash table and then delete
3921  * the relevant port from the port list.
3922  *
3923  * To deal with the case whereby a port is being deleted without first
3924  * removing itself from the lists in the hash table, we maintain a list
3925  * of multicast addresses the port has registered an interest in, within
3926  * the port structure itself. We then simply walk that list of addresses
3927  * using them as keys into the hash table and remove the port from the
3928  * appropriate lists.
3929  */
3930 static void
3931 vsw_process_ctrl_mcst_pkt(vsw_ldc_t *ldcp, void *pkt)
3932 {
3933 	vnet_mcast_msg_t	*mcst_pkt;
3934 	vsw_port_t		*port = ldcp->ldc_port;
3935 	vsw_t			*vswp = ldcp->ldc_vswp;
3936 	int			i;
3937 
3938 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3939 
3940 	/*
3941 	 * We know this is a ctrl/mcast packet so
3942 	 * cast it into the correct structure.
3943 	 */
3944 	mcst_pkt = (vnet_mcast_msg_t *)pkt;
3945 
3946 	switch (mcst_pkt->tag.vio_subtype) {
3947 	case VIO_SUBTYPE_INFO:
3948 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3949 
3950 		/*
3951 		 * Check if in correct state to receive a multicast
3952 		 * message (i.e. handshake complete). If not reset
3953 		 * the handshake.
3954 		 */
3955 		if (vsw_check_flag(ldcp, INBOUND, VSW_MCST_INFO_RECV))
3956 			return;
3957 
3958 		/*
3959 		 * Before attempting to add or remove address check
3960 		 * that they are valid multicast addresses.
3961 		 * If not, then NACK back.
3962 		 */
3963 		for (i = 0; i < mcst_pkt->count; i++) {
3964 			if ((mcst_pkt->mca[i].ether_addr_octet[0] & 01) != 1) {
3965 				DERR(vswp, "%s: invalid multicast address",
3966 								__func__);
3967 				SND_MCST_NACK(ldcp, mcst_pkt);
3968 				return;
3969 			}
3970 		}
3971 
3972 		/*
3973 		 * Now add/remove the addresses. If this fails we
3974 		 * NACK back.
3975 		 */
3976 		if (vsw_add_rem_mcst(mcst_pkt, port) != 0) {
3977 			SND_MCST_NACK(ldcp, mcst_pkt);
3978 			return;
3979 		}
3980 
3981 		mcst_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3982 		mcst_pkt->tag.vio_sid = ldcp->local_session;
3983 
3984 		DUMP_TAG_PTR((vio_msg_tag_t *)mcst_pkt);
3985 
3986 		vsw_send_msg(ldcp, (void *)mcst_pkt,
3987 					sizeof (vnet_mcast_msg_t));
3988 		break;
3989 
3990 	case VIO_SUBTYPE_ACK:
3991 		DWARN(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3992 
3993 		/*
3994 		 * We shouldn't ever get a multicast ACK message as
3995 		 * at the moment we never request multicast addresses
3996 		 * to be set on some other device. This may change in
3997 		 * the future if we have cascading switches.
3998 		 */
3999 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_ACK_RECV))
4000 			return;
4001 
4002 				/* Do nothing */
4003 		break;
4004 
4005 	case VIO_SUBTYPE_NACK:
4006 		DWARN(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
4007 
4008 		/*
4009 		 * We shouldn't get a multicast NACK packet for the
4010 		 * same reasons as we shouldn't get a ACK packet.
4011 		 */
4012 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_NACK_RECV))
4013 			return;
4014 
4015 				/* Do nothing */
4016 		break;
4017 
4018 	default:
4019 		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
4020 			mcst_pkt->tag.vio_subtype);
4021 	}
4022 
4023 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
4024 }
4025 
4026 static void
4027 vsw_process_ctrl_rdx_pkt(vsw_ldc_t *ldcp, void *pkt)
4028 {
4029 	vio_rdx_msg_t	*rdx_pkt;
4030 	vsw_t		*vswp = ldcp->ldc_vswp;
4031 
4032 	/*
4033 	 * We know this is a ctrl/rdx packet so
4034 	 * cast it into the correct structure.
4035 	 */
4036 	rdx_pkt = (vio_rdx_msg_t *)pkt;
4037 
4038 	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
4039 
4040 	switch (rdx_pkt->tag.vio_subtype) {
4041 	case VIO_SUBTYPE_INFO:
4042 		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
4043 
4044 		if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_INFO_RECV))
4045 			return;
4046 
4047 		rdx_pkt->tag.vio_sid = ldcp->local_session;
4048 		rdx_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
4049 
4050 		DUMP_TAG_PTR((vio_msg_tag_t *)rdx_pkt);
4051 
4052 		ldcp->lane_in.lstate |= VSW_RDX_ACK_SENT;
4053 
4054 		vsw_send_msg(ldcp, (void *)rdx_pkt,
4055 				sizeof (vio_rdx_msg_t));
4056 
4057 		vsw_next_milestone(ldcp);
4058 		break;
4059 
4060 	case VIO_SUBTYPE_ACK:
4061 		/*
4062 		 * Should be handled in-band by callback handler.
4063 		 */
4064 		DERR(vswp, "%s: Unexpected VIO_SUBTYPE_ACK", __func__);
4065 		vsw_restart_handshake(ldcp);
4066 		break;
4067 
4068 	case VIO_SUBTYPE_NACK:
4069 		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
4070 
4071 		if (vsw_check_flag(ldcp, OUTBOUND, VSW_RDX_NACK_RECV))
4072 			return;
4073 
4074 		ldcp->lane_out.lstate |= VSW_RDX_NACK_RECV;
4075 		vsw_next_milestone(ldcp);
4076 		break;
4077 
4078 	default:
4079 		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
4080 			rdx_pkt->tag.vio_subtype);
4081 	}
4082 
4083 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
4084 }
4085 
4086 static void
4087 vsw_process_data_pkt(vsw_ldc_t *ldcp, void *dpkt, vio_msg_tag_t tag)
4088 {
4089 	uint16_t	env = tag.vio_subtype_env;
4090 	vsw_t		*vswp = ldcp->ldc_vswp;
4091 
4092 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
4093 
4094 	/* session id check */
4095 	if (ldcp->session_status & VSW_PEER_SESSION) {
4096 		if (ldcp->peer_session != tag.vio_sid) {
4097 			DERR(vswp, "%s (chan %d): invalid session id (%llx)",
4098 				__func__, ldcp->ldc_id, tag.vio_sid);
4099 			vsw_restart_handshake(ldcp);
4100 			return;
4101 		}
4102 	}
4103 
4104 	/*
4105 	 * It is an error for us to be getting data packets
4106 	 * before the handshake has completed.
4107 	 */
4108 	if (ldcp->hphase != VSW_MILESTONE4) {
4109 		DERR(vswp, "%s: got data packet before handshake complete "
4110 			"hphase %d (%x: %x)", __func__, ldcp->hphase,
4111 			ldcp->lane_in.lstate, ldcp->lane_out.lstate);
4112 		DUMP_FLAGS(ldcp->lane_in.lstate);
4113 		DUMP_FLAGS(ldcp->lane_out.lstate);
4114 		vsw_restart_handshake(ldcp);
4115 		return;
4116 	}
4117 
4118 	/*
4119 	 * Switch on vio_subtype envelope, then let lower routines
4120 	 * decide if its an INFO, ACK or NACK packet.
4121 	 */
4122 	if (env == VIO_DRING_DATA) {
4123 		vsw_process_data_dring_pkt(ldcp, dpkt);
4124 	} else if (env == VIO_PKT_DATA) {
4125 		vsw_process_data_raw_pkt(ldcp, dpkt);
4126 	} else if (env == VIO_DESC_DATA) {
4127 		vsw_process_data_ibnd_pkt(ldcp, dpkt);
4128 	} else {
4129 		DERR(vswp, "%s : unknown vio_subtype_env (%x)\n",
4130 							__func__, env);
4131 	}
4132 
4133 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
4134 }
4135 
/*
 * Turn the dring data message 'pkt' into a NACK carrying our session id
 * and send it back over channel 'ldcp'. Wrapped in do/while (0) so the
 * macro behaves as a single statement; invoke it with a trailing ';'.
 */
#define	SND_DRING_NACK(ldcp, pkt) \
	do { \
		(pkt)->tag.vio_subtype = VIO_SUBTYPE_NACK; \
		(pkt)->tag.vio_sid = (ldcp)->local_session; \
		vsw_send_msg((ldcp), (void *)(pkt), \
		    sizeof (vio_dring_msg_t)); \
	} while (0)
4140 
4141 static void
4142 vsw_process_data_dring_pkt(vsw_ldc_t *ldcp, void *dpkt)
4143 {
4144 	vio_dring_msg_t		*dring_pkt;
4145 	vnet_public_desc_t	*pub_addr = NULL;
4146 	vsw_private_desc_t	*priv_addr = NULL;
4147 	dring_info_t		*dp = NULL;
4148 	vsw_t			*vswp = ldcp->ldc_vswp;
4149 	mblk_t			*mp = NULL;
4150 	mblk_t			*bp = NULL;
4151 	mblk_t			*bpt = NULL;
4152 	size_t			nbytes = 0;
4153 	size_t			off = 0;
4154 	uint64_t		ncookies = 0;
4155 	uint64_t		chain = 0;
4156 	uint64_t		j, len;
4157 	uint32_t		pos, start, datalen;
4158 	uint32_t		range_start, range_end;
4159 	int32_t			end, num, cnt = 0;
4160 	int			i, rv;
4161 	boolean_t		ack_needed = B_FALSE;
4162 	boolean_t		prev_desc_ack = B_FALSE;
4163 	int			read_attempts = 0;
4164 
4165 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
4166 
4167 	/*
4168 	 * We know this is a data/dring packet so
4169 	 * cast it into the correct structure.
4170 	 */
4171 	dring_pkt = (vio_dring_msg_t *)dpkt;
4172 
4173 	/*
4174 	 * Switch on the vio_subtype. If its INFO then we need to
4175 	 * process the data. If its an ACK we need to make sure
4176 	 * it makes sense (i.e did we send an earlier data/info),
4177 	 * and if its a NACK then we maybe attempt a retry.
4178 	 */
4179 	switch (dring_pkt->tag.vio_subtype) {
4180 	case VIO_SUBTYPE_INFO:
4181 		D2(vswp, "%s(%lld): VIO_SUBTYPE_INFO", __func__, ldcp->ldc_id);
4182 
4183 		if ((dp = vsw_ident2dring(&ldcp->lane_in,
4184 				dring_pkt->dring_ident)) == NULL) {
4185 
4186 			DERR(vswp, "%s(%lld): unable to find dring from "
4187 				"ident 0x%llx", __func__, ldcp->ldc_id,
4188 				dring_pkt->dring_ident);
4189 
4190 			SND_DRING_NACK(ldcp, dring_pkt);
4191 			return;
4192 		}
4193 
4194 		start = pos = dring_pkt->start_idx;
4195 		end = dring_pkt->end_idx;
4196 		len = dp->num_descriptors;
4197 
4198 		range_start = range_end = pos;
4199 
4200 		D2(vswp, "%s(%lld): start index %ld : end %ld\n",
4201 			__func__, ldcp->ldc_id, start, end);
4202 
4203 		if (end == -1) {
4204 			num = -1;
4205 		} else if (num >= 0) {
4206 			num = end >= pos ?
4207 				end - pos + 1: (len - pos + 1) + end;
4208 
4209 			/* basic sanity check */
4210 			if (end > len) {
4211 				DERR(vswp, "%s(%lld): endpoint %lld outside "
4212 					"ring length %lld", __func__,
4213 					ldcp->ldc_id, end, len);
4214 
4215 				SND_DRING_NACK(ldcp, dring_pkt);
4216 				return;
4217 			}
4218 		} else {
4219 			DERR(vswp, "%s(%lld): invalid endpoint %lld",
4220 				__func__, ldcp->ldc_id, end);
4221 			SND_DRING_NACK(ldcp, dring_pkt);
4222 			return;
4223 		}
4224 
4225 		while (cnt != num) {
4226 vsw_recheck_desc:
4227 			if ((rv = ldc_mem_dring_acquire(dp->handle,
4228 							pos, pos)) != 0) {
4229 				DERR(vswp, "%s(%lld): unable to acquire "
4230 					"descriptor at pos %d: err %d",
4231 					__func__, pos, ldcp->ldc_id, rv);
4232 				SND_DRING_NACK(ldcp, dring_pkt);
4233 				return;
4234 			}
4235 
4236 			pub_addr = (vnet_public_desc_t *)dp->pub_addr + pos;
4237 
4238 			/*
4239 			 * When given a bounded range of descriptors
4240 			 * to process, its an error to hit a descriptor
4241 			 * which is not ready. In the non-bounded case
4242 			 * (end_idx == -1) this simply indicates we have
4243 			 * reached the end of the current active range.
4244 			 */
4245 			if (pub_addr->hdr.dstate != VIO_DESC_READY) {
4246 				/* unbound - no error */
4247 				if (end == -1) {
4248 					if (read_attempts == vsw_read_attempts)
4249 						break;
4250 
4251 					delay(drv_usectohz(vsw_desc_delay));
4252 					read_attempts++;
4253 					goto vsw_recheck_desc;
4254 				}
4255 
4256 				/* bounded - error - so NACK back */
4257 				DERR(vswp, "%s(%lld): descriptor not READY "
4258 					"(%d)", __func__, ldcp->ldc_id,
4259 					pub_addr->hdr.dstate);
4260 				SND_DRING_NACK(ldcp, dring_pkt);
4261 				return;
4262 			}
4263 
4264 			DTRACE_PROBE1(read_attempts, int, read_attempts);
4265 
4266 			range_end = pos;
4267 
4268 			/*
4269 			 * If we ACK'd the previous descriptor then now
4270 			 * record the new range start position for later
4271 			 * ACK's.
4272 			 */
4273 			if (prev_desc_ack) {
4274 				range_start = pos;
4275 
4276 				D2(vswp, "%s(%lld): updating range start "
4277 					"to be %d", __func__, ldcp->ldc_id,
4278 					range_start);
4279 
4280 				prev_desc_ack = B_FALSE;
4281 			}
4282 
4283 			/*
4284 			 * Data is padded to align on 8 byte boundary,
4285 			 * datalen is actual data length, i.e. minus that
4286 			 * padding.
4287 			 */
4288 			datalen = pub_addr->nbytes;
4289 
4290 			/*
4291 			 * Does peer wish us to ACK when we have finished
4292 			 * with this descriptor ?
4293 			 */
4294 			if (pub_addr->hdr.ack)
4295 				ack_needed = B_TRUE;
4296 
4297 			D2(vswp, "%s(%lld): processing desc %lld at pos"
4298 				" 0x%llx : dstate 0x%lx : datalen 0x%lx",
4299 				__func__, ldcp->ldc_id, pos, pub_addr,
4300 				pub_addr->hdr.dstate, datalen);
4301 
4302 			/*
4303 			 * Mark that we are starting to process descriptor.
4304 			 */
4305 			pub_addr->hdr.dstate = VIO_DESC_ACCEPTED;
4306 
4307 			mp = vio_allocb(ldcp->rxh);
4308 			if (mp == NULL) {
4309 				/*
4310 				 * No free receive buffers available, so
4311 				 * fallback onto allocb(9F). Make sure that
4312 				 * we get a data buffer which is a multiple
4313 				 * of 8 as this is required by ldc_mem_copy.
4314 				 */
4315 				DTRACE_PROBE(allocb);
4316 				mp = allocb(datalen + VNET_IPALIGN + 8,
4317 								BPRI_MED);
4318 			}
4319 
4320 			/*
4321 			 * Ensure that we ask ldc for an aligned
4322 			 * number of bytes.
4323 			 */
4324 			nbytes = datalen + VNET_IPALIGN;
4325 			if (nbytes & 0x7) {
4326 				off = 8 - (nbytes & 0x7);
4327 				nbytes += off;
4328 			}
4329 
4330 			ncookies = pub_addr->ncookies;
4331 			rv = ldc_mem_copy(ldcp->ldc_handle,
4332 				(caddr_t)mp->b_rptr, 0, &nbytes,
4333 				pub_addr->memcookie, ncookies,
4334 				LDC_COPY_IN);
4335 
4336 			if (rv != 0) {
4337 				DERR(vswp, "%s(%d): unable to copy in "
4338 					"data from %d cookies in desc %d"
4339 					" (rv %d)", __func__, ldcp->ldc_id,
4340 					ncookies, pos, rv);
4341 				freemsg(mp);
4342 
4343 				pub_addr->hdr.dstate = VIO_DESC_DONE;
4344 				(void) ldc_mem_dring_release(dp->handle,
4345 								pos, pos);
4346 				break;
4347 			} else {
4348 				D2(vswp, "%s(%d): copied in %ld bytes"
4349 					" using %d cookies", __func__,
4350 					ldcp->ldc_id, nbytes, ncookies);
4351 			}
4352 
4353 			/* adjust the read pointer to skip over the padding */
4354 			mp->b_rptr += VNET_IPALIGN;
4355 
4356 			/* point to the actual end of data */
4357 			mp->b_wptr = mp->b_rptr + datalen;
4358 
4359 			/* build a chain of received packets */
4360 			if (bp == NULL) {
4361 				/* first pkt */
4362 				bp = mp;
4363 				bp->b_next = bp->b_prev = NULL;
4364 				bpt = bp;
4365 				chain = 1;
4366 			} else {
4367 				mp->b_next = NULL;
4368 				mp->b_prev = bpt;
4369 				bpt->b_next = mp;
4370 				bpt = mp;
4371 				chain++;
4372 			}
4373 
4374 			/* mark we are finished with this descriptor */
4375 			pub_addr->hdr.dstate = VIO_DESC_DONE;
4376 
4377 			(void) ldc_mem_dring_release(dp->handle, pos, pos);
4378 
4379 			/*
4380 			 * Send an ACK back to peer if requested.
4381 			 */
4382 			if (ack_needed) {
4383 				ack_needed = B_FALSE;
4384 
4385 				dring_pkt->start_idx = range_start;
4386 				dring_pkt->end_idx = range_end;
4387 
4388 				DERR(vswp, "%s(%lld): processed %d %d, ACK"
4389 					" requested", __func__, ldcp->ldc_id,
4390 					dring_pkt->start_idx,
4391 					dring_pkt->end_idx);
4392 
4393 				dring_pkt->dring_process_state = VIO_DP_ACTIVE;
4394 				dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
4395 				dring_pkt->tag.vio_sid = ldcp->local_session;
4396 				vsw_send_msg(ldcp, (void *)dring_pkt,
4397 					sizeof (vio_dring_msg_t));
4398 
4399 				prev_desc_ack = B_TRUE;
4400 				range_start = pos;
4401 			}
4402 
4403 			/* next descriptor */
4404 			pos = (pos + 1) % len;
4405 			cnt++;
4406 
4407 			/*
4408 			 * Break out of loop here and stop processing to
4409 			 * allow some other network device (or disk) to
4410 			 * get access to the cpu.
4411 			 */
4412 			/* send the chain of packets to be switched */
4413 			if (chain > vsw_chain_len) {
4414 				D3(vswp, "%s(%lld): switching chain of %d "
4415 					"msgs", __func__, ldcp->ldc_id, chain);
4416 				vsw_switch_frame(vswp, bp, VSW_VNETPORT,
4417 							ldcp->ldc_port, NULL);
4418 				bp = NULL;
4419 				break;
4420 			}
4421 		}
4422 
4423 		/* send the chain of packets to be switched */
4424 		if (bp != NULL) {
4425 			D3(vswp, "%s(%lld): switching chain of %d msgs",
4426 					__func__, ldcp->ldc_id, chain);
4427 			vsw_switch_frame(vswp, bp, VSW_VNETPORT,
4428 							ldcp->ldc_port, NULL);
4429 		}
4430 
4431 		DTRACE_PROBE1(msg_cnt, int, cnt);
4432 
4433 		/*
4434 		 * We are now finished so ACK back with the state
4435 		 * set to STOPPING so our peer knows we are finished
4436 		 */
4437 		dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
4438 		dring_pkt->tag.vio_sid = ldcp->local_session;
4439 
4440 		dring_pkt->dring_process_state = VIO_DP_STOPPED;
4441 
4442 		DTRACE_PROBE(stop_process_sent);
4443 
4444 		/*
4445 		 * We have not processed any more descriptors beyond
4446 		 * the last one we ACK'd.
4447 		 */
4448 		if (prev_desc_ack)
4449 			range_start = range_end;
4450 
4451 		dring_pkt->start_idx = range_start;
4452 		dring_pkt->end_idx = range_end;
4453 
4454 		D2(vswp, "%s(%lld) processed : %d : %d, now stopping",
4455 			__func__, ldcp->ldc_id, dring_pkt->start_idx,
4456 			dring_pkt->end_idx);
4457 
4458 		vsw_send_msg(ldcp, (void *)dring_pkt,
4459 					sizeof (vio_dring_msg_t));
4460 		break;
4461 
4462 	case VIO_SUBTYPE_ACK:
4463 		D2(vswp, "%s(%lld): VIO_SUBTYPE_ACK", __func__, ldcp->ldc_id);
4464 		/*
4465 		 * Verify that the relevant descriptors are all
4466 		 * marked as DONE
4467 		 */
4468 		if ((dp = vsw_ident2dring(&ldcp->lane_out,
4469 			dring_pkt->dring_ident)) == NULL) {
4470 			DERR(vswp, "%s: unknown ident in ACK", __func__);
4471 			return;
4472 		}
4473 
4474 		pub_addr = (vnet_public_desc_t *)dp->pub_addr;
4475 		priv_addr = (vsw_private_desc_t *)dp->priv_addr;
4476 
4477 		start = end = 0;
4478 		start = dring_pkt->start_idx;
4479 		end = dring_pkt->end_idx;
4480 		len = dp->num_descriptors;
4481 
4482 		j = num = 0;
4483 		/* calculate # descriptors taking into a/c wrap around */
4484 		num = end >= start ? end - start + 1: (len - start + 1) + end;
4485 
4486 		D2(vswp, "%s(%lld): start index %ld : end %ld : num %ld\n",
4487 			__func__, ldcp->ldc_id, start, end, num);
4488 
4489 		mutex_enter(&dp->dlock);
4490 		dp->last_ack_recv = end;
4491 		mutex_exit(&dp->dlock);
4492 
4493 		for (i = start; j < num; i = (i + 1) % len, j++) {
4494 			pub_addr = (vnet_public_desc_t *)dp->pub_addr + i;
4495 			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
4496 
4497 			/*
4498 			 * If the last descriptor in a range has the ACK
4499 			 * bit set then we will get two messages from our
4500 			 * peer relating to it. The normal ACK msg and then
4501 			 * a subsequent STOP msg. The first message will have
4502 			 * resulted in the descriptor being reclaimed and
4503 			 * its state set to FREE so when we encounter a non
4504 			 * DONE descriptor we need to check to see if its
4505 			 * because we have just reclaimed it.
4506 			 */
4507 			mutex_enter(&priv_addr->dstate_lock);
4508 			if (pub_addr->hdr.dstate == VIO_DESC_DONE) {
4509 				/* clear all the fields */
4510 				bzero(priv_addr->datap, priv_addr->datalen);
4511 				priv_addr->datalen = 0;
4512 
4513 				pub_addr->hdr.dstate = VIO_DESC_FREE;
4514 				pub_addr->hdr.ack = 0;
4515 
4516 				priv_addr->dstate = VIO_DESC_FREE;
4517 				mutex_exit(&priv_addr->dstate_lock);
4518 
4519 				D3(vswp, "clearing descp %d : pub state "
4520 					"0x%llx : priv state 0x%llx", i,
4521 					pub_addr->hdr.dstate,
4522 					priv_addr->dstate);
4523 
4524 			} else {
4525 				mutex_exit(&priv_addr->dstate_lock);
4526 
4527 				if (dring_pkt->dring_process_state !=
4528 							VIO_DP_STOPPED) {
4529 					DERR(vswp, "%s: descriptor %lld at pos "
4530 						" 0x%llx not DONE (0x%lx)\n",
4531 						__func__, i, pub_addr,
4532 						pub_addr->hdr.dstate);
4533 					return;
4534 				}
4535 			}
4536 		}
4537 
4538 		/*
4539 		 * If our peer is stopping processing descriptors then
4540 		 * we check to make sure it has processed all the descriptors
4541 		 * we have updated. If not then we send it a new message
4542 		 * to prompt it to restart.
4543 		 */
4544 		if (dring_pkt->dring_process_state == VIO_DP_STOPPED) {
4545 			DTRACE_PROBE(stop_process_recv);
4546 			D2(vswp, "%s(%lld): got stopping msg : %d : %d",
4547 				__func__, ldcp->ldc_id, dring_pkt->start_idx,
4548 				dring_pkt->end_idx);
4549 
4550 			/*
4551 			 * Check next descriptor in public section of ring.
4552 			 * If its marked as READY then we need to prompt our
4553 			 * peer to start processing the ring again.
4554 			 */
4555 			i = (end + 1) % len;
4556 			pub_addr = (vnet_public_desc_t *)dp->pub_addr + i;
4557 			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
4558 
4559 			/*
4560 			 * Hold the restart lock across all of this to
4561 			 * make sure that its not possible for us to
4562 			 * decide that a msg needs to be sent in the future
4563 			 * but the sending code having already checked is
4564 			 * about to exit.
4565 			 */
4566 			mutex_enter(&dp->restart_lock);
4567 			mutex_enter(&priv_addr->dstate_lock);
4568 			if (pub_addr->hdr.dstate == VIO_DESC_READY) {
4569 
4570 				mutex_exit(&priv_addr->dstate_lock);
4571 
4572 				dring_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
4573 				dring_pkt->tag.vio_sid = ldcp->local_session;
4574 
4575 				mutex_enter(&ldcp->lane_out.seq_lock);
4576 				dring_pkt->seq_num = ldcp->lane_out.seq_num++;
4577 				mutex_exit(&ldcp->lane_out.seq_lock);
4578 
4579 				dring_pkt->start_idx = (end + 1) % len;
4580 				dring_pkt->end_idx = -1;
4581 
4582 				D2(vswp, "%s(%lld) : sending restart msg:"
4583 					" %d : %d", __func__, ldcp->ldc_id,
4584 					dring_pkt->start_idx,
4585 					dring_pkt->end_idx);
4586 
4587 				vsw_send_msg(ldcp, (void *)dring_pkt,
4588 						sizeof (vio_dring_msg_t));
4589 			} else {
4590 				mutex_exit(&priv_addr->dstate_lock);
4591 				dp->restart_reqd = B_TRUE;
4592 			}
4593 			mutex_exit(&dp->restart_lock);
4594 		}
4595 		break;
4596 
4597 	case VIO_SUBTYPE_NACK:
4598 		DWARN(vswp, "%s(%lld): VIO_SUBTYPE_NACK",
4599 						__func__, ldcp->ldc_id);
4600 		/*
4601 		 * Something is badly wrong if we are getting NACK's
4602 		 * for our data pkts. So reset the channel.
4603 		 */
4604 		vsw_restart_handshake(ldcp);
4605 
4606 		break;
4607 
4608 	default:
4609 		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
4610 			ldcp->ldc_id, dring_pkt->tag.vio_subtype);
4611 	}
4612 
4613 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
4614 }
4615 
4616 /*
4617  * VIO_PKT_DATA (a.k.a raw data mode )
4618  *
4619  * Note - currently not supported. Do nothing.
4620  */
4621 static void
4622 vsw_process_data_raw_pkt(vsw_ldc_t *ldcp, void *dpkt)
4623 {
4624 	_NOTE(ARGUNUSED(dpkt))
4625 
4626 	D1(NULL, "%s (%lld): enter\n", __func__, ldcp->ldc_id);
4627 
4628 	DERR(NULL, "%s (%lld): currently  not supported",
4629 						__func__, ldcp->ldc_id);
4630 
4631 	D1(NULL, "%s (%lld): exit\n", __func__, ldcp->ldc_id);
4632 }
4633 
/*
 * Turn the given message into a NACK for the current local session
 * and send it back over the channel, using the size of an in-band
 * descriptor message.
 *
 * Wrapped in do { } while (0) so that the macro expands to a single
 * statement and can safely be used as the body of an unbraced
 * if/else. Invoke it with a trailing semicolon.
 *
 * NOTE(review): the macro accesses pkt->tag directly, whereas the
 * in-band handler in this file reaches the tag through hdr.tag;
 * confirm which layout callers of this macro are expected to pass.
 */
#define	SND_IBND_DESC_NACK(ldcp, pkt) \
	do { \
		(pkt)->tag.vio_subtype = VIO_SUBTYPE_NACK; \
		(pkt)->tag.vio_sid = (ldcp)->local_session; \
		vsw_send_msg((ldcp), (void *)(pkt), \
			sizeof (vio_ibnd_desc_t)); \
	} while (0)
4638 
4639 /*
4640  * Process an in-band descriptor message (most likely from
4641  * OBP).
4642  */
4643 static void
4644 vsw_process_data_ibnd_pkt(vsw_ldc_t *ldcp, void *pkt)
4645 {
4646 	vio_ibnd_desc_t		*ibnd_desc;
4647 	dring_info_t		*dp = NULL;
4648 	vsw_private_desc_t	*priv_addr = NULL;
4649 	vsw_t			*vswp = ldcp->ldc_vswp;
4650 	mblk_t			*mp = NULL;
4651 	size_t			nbytes = 0;
4652 	size_t			off = 0;
4653 	uint64_t		idx = 0;
4654 	uint32_t		datalen = 0;
4655 	uint64_t		ncookies = 0;
4656 	int			rv;
4657 
4658 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
4659 
4660 	ibnd_desc = (vio_ibnd_desc_t *)pkt;
4661 
4662 	switch (ibnd_desc->hdr.tag.vio_subtype) {
4663 	case VIO_SUBTYPE_INFO:
4664 		D1(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
4665 
4666 		if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
4667 			return;
4668 
4669 		/*
4670 		 * Data is padded to align on a 8 byte boundary,
4671 		 * nbytes is actual data length, i.e. minus that
4672 		 * padding.
4673 		 */
4674 		datalen = ibnd_desc->nbytes;
4675 
4676 		D2(vswp, "%s(%lld): processing inband desc : "
4677 			": datalen 0x%lx", __func__, ldcp->ldc_id, datalen);
4678 
4679 		ncookies = ibnd_desc->ncookies;
4680 
4681 		/*
4682 		 * allocb(9F) returns an aligned data block. We
4683 		 * need to ensure that we ask ldc for an aligned
4684 		 * number of bytes also.
4685 		 */
4686 		nbytes = datalen;
4687 		if (nbytes & 0x7) {
4688 			off = 8 - (nbytes & 0x7);
4689 			nbytes += off;
4690 		}
4691 
4692 		mp = allocb(datalen, BPRI_MED);
4693 		if (mp == NULL) {
4694 			DERR(vswp, "%s(%lld): allocb failed",
4695 					__func__, ldcp->ldc_id);
4696 			return;
4697 		}
4698 
4699 		rv = ldc_mem_copy(ldcp->ldc_handle, (caddr_t)mp->b_rptr,
4700 			0, &nbytes, ibnd_desc->memcookie, (uint64_t)ncookies,
4701 			LDC_COPY_IN);
4702 
4703 		if (rv != 0) {
4704 			DERR(vswp, "%s(%d): unable to copy in data from "
4705 				"%d cookie(s)", __func__,
4706 				ldcp->ldc_id, ncookies);
4707 			freemsg(mp);
4708 			return;
4709 		} else {
4710 			D2(vswp, "%s(%d): copied in %ld bytes using %d "
4711 				"cookies", __func__, ldcp->ldc_id, nbytes,
4712 				ncookies);
4713 		}
4714 
4715 		/* point to the actual end of data */
4716 		mp->b_wptr = mp->b_rptr + datalen;
4717 
4718 		/*
4719 		 * We ACK back every in-band descriptor message we process
4720 		 */
4721 		ibnd_desc->hdr.tag.vio_subtype = VIO_SUBTYPE_ACK;
4722 		ibnd_desc->hdr.tag.vio_sid = ldcp->local_session;
4723 		vsw_send_msg(ldcp, (void *)ibnd_desc,
4724 				sizeof (vio_ibnd_desc_t));
4725 
4726 		/* send the packet to be switched */
4727 		vsw_switch_frame(vswp, mp, VSW_VNETPORT,
4728 					ldcp->ldc_port, NULL);
4729 
4730 		break;
4731 
4732 	case VIO_SUBTYPE_ACK:
4733 		D1(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
4734 
4735 		/* Verify the ACK is valid */
4736 		idx = ibnd_desc->hdr.desc_handle;
4737 
4738 		if (idx >= VSW_RING_NUM_EL) {
4739 			cmn_err(CE_WARN, "%s: corrupted ACK received "
4740 				"(idx %ld)", __func__, idx);
4741 			return;
4742 		}
4743 
4744 		if ((dp = ldcp->lane_out.dringp) == NULL) {
4745 			DERR(vswp, "%s: no dring found", __func__);
4746 			return;
4747 		}
4748 
4749 		priv_addr = (vsw_private_desc_t *)dp->priv_addr;
4750 
4751 		/* move to correct location in ring */
4752 		priv_addr += idx;
4753 
4754 		/*
4755 		 * When we sent the in-band message to our peer we
4756 		 * marked the copy in our private ring as READY. We now
4757 		 * check that the descriptor we are being ACK'ed for is in
4758 		 * fact READY, i.e. it is one we have shared with our peer.
4759 		 */
4760 		mutex_enter(&priv_addr->dstate_lock);
4761 		if (priv_addr->dstate != VIO_DESC_READY) {
4762 			mutex_exit(&priv_addr->dstate_lock);
4763 			cmn_err(CE_WARN, "%s: (%ld) desc at index %ld not "
4764 				"READY (0x%lx)", __func__, ldcp->ldc_id, idx,
4765 				priv_addr->dstate);
4766 			cmn_err(CE_CONT, "%s: bound %d: ncookies %ld\n",
4767 				__func__, priv_addr->bound,
4768 				priv_addr->ncookies);
4769 			cmn_err(CE_CONT, "datalen %ld\n", priv_addr->datalen);
4770 			return;
4771 		} else {
4772 			D2(vswp, "%s: (%lld) freeing descp at %lld", __func__,
4773 				ldcp->ldc_id, idx);
4774 
4775 			/* release resources associated with sent msg */
4776 			bzero(priv_addr->datap, priv_addr->datalen);
4777 			priv_addr->datalen = 0;
4778 			priv_addr->dstate = VIO_DESC_FREE;
4779 			mutex_exit(&priv_addr->dstate_lock);
4780 		}
4781 		break;
4782 
4783 	case VIO_SUBTYPE_NACK:
4784 		DERR(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
4785 
4786 		/*
4787 		 * We should only get a NACK if our peer doesn't like
4788 		 * something about a message we have sent it. If this
4789 		 * happens we just release the resources associated with
4790 		 * the message. (We are relying on higher layers to decide
4791 		 * whether or not to resend.
4792 		 */
4793 
4794 		/* limit check */
4795 		idx = ibnd_desc->hdr.desc_handle;
4796 
4797 		if (idx >= VSW_RING_NUM_EL) {
4798 			DERR(vswp, "%s: corrupted NACK received (idx %lld)",
4799 				__func__, idx);
4800 			return;
4801 		}
4802 
4803 		if ((dp = ldcp->lane_out.dringp) == NULL) {
4804 			DERR(vswp, "%s: no dring found", __func__);
4805 			return;
4806 		}
4807 
4808 		priv_addr = (vsw_private_desc_t *)dp->priv_addr;
4809 
4810 		/* move to correct location in ring */
4811 		priv_addr += idx;
4812 
4813 		/* release resources associated with sent msg */
4814 		mutex_enter(&priv_addr->dstate_lock);
4815 		bzero(priv_addr->datap, priv_addr->datalen);
4816 		priv_addr->datalen = 0;
4817 		priv_addr->dstate = VIO_DESC_FREE;
4818 		mutex_exit(&priv_addr->dstate_lock);
4819 
4820 		break;
4821 
4822 	default:
4823 		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
4824 			ldcp->ldc_id, ibnd_desc->hdr.tag.vio_subtype);
4825 	}
4826 
4827 	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
4828 }
4829 
4830 static void
4831 vsw_process_err_pkt(vsw_ldc_t *ldcp, void *epkt, vio_msg_tag_t tag)
4832 {
4833 	_NOTE(ARGUNUSED(epkt))
4834 
4835 	vsw_t		*vswp = ldcp->ldc_vswp;
4836 	uint16_t	env = tag.vio_subtype_env;
4837 
4838 	D1(vswp, "%s (%lld): enter\n", __func__, ldcp->ldc_id);
4839 
4840 	/*
4841 	 * Error vio_subtypes have yet to be defined. So for
4842 	 * the moment we can't do anything.
4843 	 */
4844 	D2(vswp, "%s: (%x) vio_subtype env", __func__, env);
4845 
4846 	D1(vswp, "%s (%lld): exit\n", __func__, ldcp->ldc_id);
4847 }
4848 
4849 /*
4850  * Switch the given ethernet frame when operating in layer 2 mode.
4851  *
4852  * vswp: pointer to the vsw instance
4853  * mp: pointer to chain of ethernet frame(s) to be switched
4854  * caller: identifies the source of this frame as:
4855  * 		1. VSW_VNETPORT - a vsw port (connected to a vnet).
4856  *		2. VSW_PHYSDEV - the physical ethernet device
4857  *		3. VSW_LOCALDEV - vsw configured as a virtual interface
4858  * arg: argument provided by the caller.
4859  *		1. for VNETPORT - pointer to the corresponding vsw_port_t.
4860  *		2. for PHYSDEV - NULL
4861  *		3. for LOCALDEV - pointer to to this vsw_t(self)
4862  */
4863 void
4864 vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller,
4865 			vsw_port_t *arg, mac_resource_handle_t mrh)
4866 {
4867 	struct ether_header	*ehp;
4868 	vsw_port_t		*port = NULL;
4869 	mblk_t			*bp, *ret_m;
4870 	mblk_t			*nmp = NULL;
4871 	vsw_port_list_t		*plist = &vswp->plist;
4872 
4873 	D1(vswp, "%s: enter (caller %d)", __func__, caller);
4874 
4875 	/*
4876 	 * PERF: rather than breaking up the chain here, scan it
4877 	 * to find all mblks heading to same destination and then
4878 	 * pass that sub-chain to the lower transmit functions.
4879 	 */
4880 
4881 	/* process the chain of packets */
4882 	bp = mp;
4883 	while (bp) {
4884 		mp = bp;
4885 		bp = bp->b_next;
4886 		mp->b_next = mp->b_prev = NULL;
4887 		ehp = (struct ether_header *)mp->b_rptr;
4888 
4889 		D2(vswp, "%s: mblk data buffer %lld : actual data size %lld",
4890 			__func__, MBLKSIZE(mp), MBLKL(mp));
4891 
4892 		READ_ENTER(&vswp->if_lockrw);
4893 		if (ether_cmp(&ehp->ether_dhost, &vswp->if_addr) == 0) {
4894 			/*
4895 			 * If destination is VSW_LOCALDEV (vsw as an eth
4896 			 * interface) and if the device is up & running,
4897 			 * send the packet up the stack on this host.
4898 			 * If the virtual interface is down, drop the packet.
4899 			 */
4900 			if (caller != VSW_LOCALDEV) {
4901 				if (vswp->if_state & VSW_IF_UP) {
4902 					RW_EXIT(&vswp->if_lockrw);
4903 					mac_rx(vswp->if_mh, mrh, mp);
4904 				} else {
4905 					RW_EXIT(&vswp->if_lockrw);
4906 					/* Interface down, drop pkt */
4907 					freemsg(mp);
4908 				}
4909 			} else {
4910 				RW_EXIT(&vswp->if_lockrw);
4911 				freemsg(mp);
4912 			}
4913 			continue;
4914 		}
4915 		RW_EXIT(&vswp->if_lockrw);
4916 
4917 		READ_ENTER(&plist->lockrw);
4918 		port = vsw_lookup_fdb(vswp, ehp);
4919 		if (port) {
4920 			/*
4921 			 * Mark the port as in-use.
4922 			 */
4923 			mutex_enter(&port->ref_lock);
4924 			port->ref_cnt++;
4925 			mutex_exit(&port->ref_lock);
4926 			RW_EXIT(&plist->lockrw);
4927 
4928 			/*
4929 			 * If plumbed and in promisc mode then copy msg
4930 			 * and send up the stack.
4931 			 */
4932 			READ_ENTER(&vswp->if_lockrw);
4933 			if (VSW_U_P(vswp->if_state)) {
4934 				RW_EXIT(&vswp->if_lockrw);
4935 				nmp = copymsg(mp);
4936 				if (nmp)
4937 					mac_rx(vswp->if_mh, mrh, nmp);
4938 			} else {
4939 				RW_EXIT(&vswp->if_lockrw);
4940 			}
4941 
4942 			/*
4943 			 * If the destination is in FDB, the packet
4944 			 * should be forwarded to the correponding
4945 			 * vsw_port (connected to a vnet device -
4946 			 * VSW_VNETPORT)
4947 			 */
4948 			(void) vsw_portsend(port, mp);
4949 
4950 			/*
4951 			 * Decrement use count in port and check if
4952 			 * should wake delete thread.
4953 			 */
4954 			mutex_enter(&port->ref_lock);
4955 			port->ref_cnt--;
4956 			if (port->ref_cnt == 0)
4957 				cv_signal(&port->ref_cv);
4958 			mutex_exit(&port->ref_lock);
4959 		} else {
4960 			RW_EXIT(&plist->lockrw);
4961 			/*
4962 			 * Destination not in FDB.
4963 			 *
4964 			 * If the destination is broadcast or
4965 			 * multicast forward the packet to all
4966 			 * (VNETPORTs, PHYSDEV, LOCALDEV),
4967 			 * except the caller.
4968 			 */
4969 			if (IS_BROADCAST(ehp)) {
4970 				D3(vswp, "%s: BROADCAST pkt", __func__);
4971 				(void) vsw_forward_all(vswp, mp,
4972 								caller, arg);
4973 			} else if (IS_MULTICAST(ehp)) {
4974 				D3(vswp, "%s: MULTICAST pkt", __func__);
4975 				(void) vsw_forward_grp(vswp, mp,
4976 							caller, arg);
4977 			} else {
4978 				/*
4979 				 * If the destination is unicast, and came
4980 				 * from either a logical network device or
4981 				 * the switch itself when it is plumbed, then
4982 				 * send it out on the physical device and also
4983 				 * up the stack if the logical interface is
4984 				 * in promiscious mode.
4985 				 *
4986 				 * NOTE:  The assumption here is that if we
4987 				 * cannot find the destination in our fdb, its
4988 				 * a unicast address, and came from either a
4989 				 * vnet or down the stack (when plumbed) it
4990 				 * must be destinded for an ethernet device
4991 				 * outside our ldoms.
4992 				 */
4993 				if (caller == VSW_VNETPORT) {
4994 					READ_ENTER(&vswp->if_lockrw);
4995 					if (VSW_U_P(vswp->if_state)) {
4996 						RW_EXIT(&vswp->if_lockrw);
4997 						nmp = copymsg(mp);
4998 						if (nmp)
4999 							mac_rx(vswp->if_mh,
5000 								mrh, nmp);
5001 					} else {
5002 						RW_EXIT(&vswp->if_lockrw);
5003 					}
5004 					if ((ret_m = vsw_tx_msg(vswp, mp))
5005 								!= NULL) {
5006 						DERR(vswp, "%s: drop mblks to "
5007 							"phys dev", __func__);
5008 						freemsg(ret_m);
5009 					}
5010 
5011 				} else if (caller == VSW_PHYSDEV) {
5012 					/*
5013 					 * Pkt seen because card in promisc
5014 					 * mode. Send up stack if plumbed in
5015 					 * promisc mode, else drop it.
5016 					 */
5017 					READ_ENTER(&vswp->if_lockrw);
5018 					if (VSW_U_P(vswp->if_state)) {
5019 						RW_EXIT(&vswp->if_lockrw);
5020 						mac_rx(vswp->if_mh, mrh, mp);
5021 					} else {
5022 						RW_EXIT(&vswp->if_lockrw);
5023 						freemsg(mp);
5024 					}
5025 
5026 				} else if (caller == VSW_LOCALDEV) {
5027 					/*
5028 					 * Pkt came down the stack, send out
5029 					 * over physical device.
5030 					 */
5031 					if ((ret_m = vsw_tx_msg(vswp, mp))
5032 								!= NULL) {
5033 						DERR(vswp, "%s: drop mblks to "
5034 							"phys dev", __func__);
5035 						freemsg(ret_m);
5036 					}
5037 				}
5038 			}
5039 		}
5040 	}
5041 	D1(vswp, "%s: exit\n", __func__);
5042 }
5043 
5044 /*
5045  * Switch ethernet frame when in layer 3 mode (i.e. using IP
5046  * layer to do the routing).
5047  *
5048  * There is a large amount of overlap between this function and
5049  * vsw_switch_l2_frame. At some stage we need to revisit and refactor
5050  * both these functions.
5051  */
5052 void
5053 vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller,
5054 			vsw_port_t *arg, mac_resource_handle_t mrh)
5055 {
5056 	struct ether_header	*ehp;
5057 	vsw_port_t		*port = NULL;
5058 	mblk_t			*bp = NULL;
5059 	vsw_port_list_t		*plist = &vswp->plist;
5060 
5061 	D1(vswp, "%s: enter (caller %d)", __func__, caller);
5062 
5063 	/*
5064 	 * In layer 3 mode should only ever be switching packets
5065 	 * between IP layer and vnet devices. So make sure thats
5066 	 * who is invoking us.
5067 	 */
5068 	if ((caller != VSW_LOCALDEV) && (caller != VSW_VNETPORT)) {
5069 		DERR(vswp, "%s: unexpected caller (%d)", __func__, caller);
5070 		freemsgchain(mp);
5071 		return;
5072 	}
5073 
5074 	/* process the chain of packets */
5075 	bp = mp;
5076 	while (bp) {
5077 		mp = bp;
5078 		bp = bp->b_next;
5079 		mp->b_next = mp->b_prev = NULL;
5080 		ehp = (struct ether_header *)mp->b_rptr;
5081 
5082 		D2(vswp, "%s: mblk data buffer %lld : actual data size %lld",
5083 			__func__, MBLKSIZE(mp), MBLKL(mp));
5084 
5085 		READ_ENTER(&plist->lockrw);
5086 		port = vsw_lookup_fdb(vswp, ehp);
5087 		if (port) {
5088 			/*
5089 			 * Mark port as in-use.
5090 			 */
5091 			mutex_enter(&port->ref_lock);
5092 			port->ref_cnt++;
5093 			mutex_exit(&port->ref_lock);
5094 			RW_EXIT(&plist->lockrw);
5095 
5096 			D2(vswp, "%s: sending to target port", __func__);
5097 			(void) vsw_portsend(port, mp);
5098 
5099 			/*
5100 			 * Finished with port so decrement ref count and
5101 			 * check if should wake delete thread.
5102 			 */
5103 			mutex_enter(&port->ref_lock);
5104 			port->ref_cnt--;
5105 			if (port->ref_cnt == 0)
5106 				cv_signal(&port->ref_cv);
5107 			mutex_exit(&port->ref_lock);
5108 		} else {
5109 			RW_EXIT(&plist->lockrw);
5110 			/*
5111 			 * Destination not in FDB
5112 			 *
5113 			 * If the destination is broadcast or
5114 			 * multicast forward the packet to all
5115 			 * (VNETPORTs, PHYSDEV, LOCALDEV),
5116 			 * except the caller.
5117 			 */
5118 			if (IS_BROADCAST(ehp)) {
5119 				D2(vswp, "%s: BROADCAST pkt", __func__);
5120 				(void) vsw_forward_all(vswp, mp,
5121 								caller, arg);
5122 			} else if (IS_MULTICAST(ehp)) {
5123 				D2(vswp, "%s: MULTICAST pkt", __func__);
5124 				(void) vsw_forward_grp(vswp, mp,
5125 							caller, arg);
5126 			} else {
5127 				/*
5128 				 * Unicast pkt from vnet that we don't have
5129 				 * an FDB entry for, so must be destinded for
5130 				 * the outside world. Attempt to send up to the
5131 				 * IP layer to allow it to deal with it.
5132 				 */
5133 				if (caller == VSW_VNETPORT) {
5134 					READ_ENTER(&vswp->if_lockrw);
5135 					if (vswp->if_state & VSW_IF_UP) {
5136 						RW_EXIT(&vswp->if_lockrw);
5137 						D2(vswp, "%s: sending up",
5138 							__func__);
5139 						mac_rx(vswp->if_mh, mrh, mp);
5140 					} else {
5141 						RW_EXIT(&vswp->if_lockrw);
5142 						/* Interface down, drop pkt */
5143 						D2(vswp, "%s I/F down",
5144 								__func__);
5145 						freemsg(mp);
5146 					}
5147 				}
5148 			}
5149 		}
5150 	}
5151 
5152 	D1(vswp, "%s: exit", __func__);
5153 }
5154 
5155 /*
5156  * Forward the ethernet frame to all ports (VNETPORTs, PHYSDEV, LOCALDEV),
5157  * except the caller (port on which frame arrived).
5158  */
5159 static int
5160 vsw_forward_all(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg)
5161 {
5162 	vsw_port_list_t	*plist = &vswp->plist;
5163 	vsw_port_t	*portp;
5164 	mblk_t		*nmp = NULL;
5165 	mblk_t		*ret_m = NULL;
5166 	int		skip_port = 0;
5167 
5168 	D1(vswp, "vsw_forward_all: enter\n");
5169 
5170 	/*
5171 	 * Broadcast message from inside ldoms so send to outside
5172 	 * world if in either of layer 2 modes.
5173 	 */
5174 	if (((vswp->smode[vswp->smode_idx] == VSW_LAYER2) ||
5175 		(vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) &&
5176 		((caller == VSW_LOCALDEV) || (caller == VSW_VNETPORT))) {
5177 
5178 		nmp = dupmsg(mp);
5179 		if (nmp) {
5180 			if ((ret_m = vsw_tx_msg(vswp, nmp)) != NULL) {
5181 				DERR(vswp, "%s: dropping pkt(s) "
5182 				"consisting of %ld bytes of data for"
5183 				" physical device", __func__, MBLKL(ret_m));
5184 			freemsg(ret_m);
5185 			}
5186 		}
5187 	}
5188 
5189 	if (caller == VSW_VNETPORT)
5190 		skip_port = 1;
5191 
5192 	/*
5193 	 * Broadcast message from other vnet (layer 2 or 3) or outside
5194 	 * world (layer 2 only), send up stack if plumbed.
5195 	 */
5196 	if ((caller == VSW_PHYSDEV) || (caller == VSW_VNETPORT)) {
5197 		READ_ENTER(&vswp->if_lockrw);
5198 		if (vswp->if_state & VSW_IF_UP) {
5199 			RW_EXIT(&vswp->if_lockrw);
5200 			nmp = copymsg(mp);
5201 			if (nmp)
5202 				mac_rx(vswp->if_mh, NULL, nmp);
5203 		} else {
5204 			RW_EXIT(&vswp->if_lockrw);
5205 		}
5206 	}
5207 
5208 	/* send it to all VNETPORTs */
5209 	READ_ENTER(&plist->lockrw);
5210 	for (portp = plist->head; portp != NULL; portp = portp->p_next) {
5211 		D2(vswp, "vsw_forward_all: port %d", portp->p_instance);
5212 		/*
5213 		 * Caution ! - don't reorder these two checks as arg
5214 		 * will be NULL if the caller is PHYSDEV. skip_port is
5215 		 * only set if caller is VNETPORT.
5216 		 */
5217 		if ((skip_port) && (portp == arg))
5218 			continue;
5219 		else {
5220 			nmp = dupmsg(mp);
5221 			if (nmp) {
5222 				(void) vsw_portsend(portp, nmp);
5223 			} else {
5224 				DERR(vswp, "vsw_forward_all: nmp NULL");
5225 			}
5226 		}
5227 	}
5228 	RW_EXIT(&plist->lockrw);
5229 
5230 	freemsg(mp);
5231 
5232 	D1(vswp, "vsw_forward_all: exit\n");
5233 	return (0);
5234 }
5235 
5236 /*
5237  * Forward pkts to any devices or interfaces which have registered
5238  * an interest in them (i.e. multicast groups).
5239  */
5240 static int
5241 vsw_forward_grp(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg)
5242 {
5243 	struct ether_header	*ehp = (struct ether_header *)mp->b_rptr;
5244 	mfdb_ent_t		*entp = NULL;
5245 	mfdb_ent_t		*tpp = NULL;
5246 	vsw_port_t 		*port;
5247 	uint64_t		key = 0;
5248 	mblk_t			*nmp = NULL;
5249 	mblk_t			*ret_m = NULL;
5250 	boolean_t		check_if = B_TRUE;
5251 
5252 	/*
5253 	 * Convert address to hash table key
5254 	 */
5255 	KEY_HASH(key, ehp->ether_dhost);
5256 
5257 	D1(vswp, "%s: key 0x%llx", __func__, key);
5258 
5259 	/*
5260 	 * If pkt came from either a vnet or down the stack (if we are
5261 	 * plumbed) and we are in layer 2 mode, then we send the pkt out
5262 	 * over the physical adapter, and then check to see if any other
5263 	 * vnets are interested in it.
5264 	 */
5265 	if (((vswp->smode[vswp->smode_idx] == VSW_LAYER2) ||
5266 		(vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) &&
5267 		((caller == VSW_VNETPORT) || (caller == VSW_LOCALDEV))) {
5268 		nmp = dupmsg(mp);
5269 		if (nmp) {
5270 			if ((ret_m = vsw_tx_msg(vswp, nmp)) != NULL) {
5271 				DERR(vswp, "%s: dropping pkt(s) "
5272 					"consisting of %ld bytes of "
5273 					"data for physical device",
5274 					__func__, MBLKL(ret_m));
5275 				freemsg(ret_m);
5276 			}
5277 		}
5278 	}
5279 
5280 	READ_ENTER(&vswp->mfdbrw);
5281 	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)key,
5282 				(mod_hash_val_t *)&entp) != 0) {
5283 		D3(vswp, "%s: no table entry found for addr 0x%llx",
5284 								__func__, key);
5285 	} else {
5286 		/*
5287 		 * Send to list of devices associated with this address...
5288 		 */
5289 		for (tpp = entp; tpp != NULL; tpp = tpp->nextp) {
5290 
5291 			/* dont send to ourselves */
5292 			if ((caller == VSW_VNETPORT) &&
5293 				(tpp->d_addr == (void *)arg)) {
5294 				port = (vsw_port_t *)tpp->d_addr;
5295 				D3(vswp, "%s: not sending to ourselves"
5296 					" : port %d", __func__,
5297 					port->p_instance);
5298 				continue;
5299 
5300 			} else if ((caller == VSW_LOCALDEV) &&
5301 				(tpp->d_type == VSW_LOCALDEV)) {
5302 				D3(vswp, "%s: not sending back up stack",
5303 					__func__);
5304 				continue;
5305 			}
5306 
5307 			if (tpp->d_type == VSW_VNETPORT) {
5308 				port = (vsw_port_t *)tpp->d_addr;
5309 				D3(vswp, "%s: sending to port %ld for "
5310 					" addr 0x%llx", __func__,
5311 					port->p_instance, key);
5312 
5313 				nmp = dupmsg(mp);
5314 				if (nmp)
5315 					(void) vsw_portsend(port, nmp);
5316 			} else {
5317 				if (vswp->if_state & VSW_IF_UP) {
5318 					nmp = copymsg(mp);
5319 					if (nmp)
5320 						mac_rx(vswp->if_mh, NULL, nmp);
5321 					check_if = B_FALSE;
5322 					D3(vswp, "%s: sending up stack"
5323 						" for addr 0x%llx", __func__,
5324 						key);
5325 				}
5326 			}
5327 		}
5328 	}
5329 
5330 	RW_EXIT(&vswp->mfdbrw);
5331 
5332 	/*
5333 	 * If the pkt came from either a vnet or from physical device,
5334 	 * and if we havent already sent the pkt up the stack then we
5335 	 * check now if we can/should (i.e. the interface is plumbed
5336 	 * and in promisc mode).
5337 	 */
5338 	if ((check_if) &&
5339 		((caller == VSW_VNETPORT) || (caller == VSW_PHYSDEV))) {
5340 		READ_ENTER(&vswp->if_lockrw);
5341 		if (VSW_U_P(vswp->if_state)) {
5342 			RW_EXIT(&vswp->if_lockrw);
5343 			D3(vswp, "%s: (caller %d) finally sending up stack"
5344 				" for addr 0x%llx", __func__, caller, key);
5345 			nmp = copymsg(mp);
5346 			if (nmp)
5347 				mac_rx(vswp->if_mh, NULL, nmp);
5348 		} else {
5349 			RW_EXIT(&vswp->if_lockrw);
5350 		}
5351 	}
5352 
5353 	freemsg(mp);
5354 
5355 	D1(vswp, "%s: exit", __func__);
5356 
5357 	return (0);
5358 }
5359 
5360 /* transmit the packet over the given port */
5361 static int
5362 vsw_portsend(vsw_port_t *port, mblk_t *mp)
5363 {
5364 	vsw_ldc_list_t 	*ldcl = &port->p_ldclist;
5365 	vsw_ldc_t 	*ldcp;
5366 	int		status = 0;
5367 
5368 
5369 	READ_ENTER(&ldcl->lockrw);
5370 	/*
5371 	 * Note for now, we have a single channel.
5372 	 */
5373 	ldcp = ldcl->head;
5374 	if (ldcp == NULL) {
5375 		DERR(port->p_vswp, "vsw_portsend: no ldc: dropping packet\n");
5376 		freemsg(mp);
5377 		RW_EXIT(&ldcl->lockrw);
5378 		return (1);
5379 	}
5380 
5381 	/*
5382 	 * Send the message out using the appropriate
5383 	 * transmit function which will free mblock when it
5384 	 * is finished with it.
5385 	 */
5386 	mutex_enter(&port->tx_lock);
5387 	if (port->transmit != NULL)
5388 		status = (*port->transmit)(ldcp, mp);
5389 	else {
5390 		freemsg(mp);
5391 	}
5392 	mutex_exit(&port->tx_lock);
5393 
5394 	RW_EXIT(&ldcl->lockrw);
5395 
5396 	return (status);
5397 }
5398 
5399 /*
5400  * Send packet out via descriptor ring to a logical device.
5401  */
5402 static int
5403 vsw_dringsend(vsw_ldc_t *ldcp, mblk_t *mp)
5404 {
5405 	vio_dring_msg_t		dring_pkt;
5406 	dring_info_t		*dp = NULL;
5407 	vsw_private_desc_t	*priv_desc = NULL;
5408 	vnet_public_desc_t	*pub = NULL;
5409 	vsw_t			*vswp = ldcp->ldc_vswp;
5410 	mblk_t			*bp;
5411 	size_t			n, size;
5412 	caddr_t			bufp;
5413 	int			idx;
5414 	int			status = LDC_TX_SUCCESS;
5415 
5416 	D1(vswp, "%s(%lld): enter\n", __func__, ldcp->ldc_id);
5417 
5418 	/* TODO: make test a macro */
5419 	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
5420 		(ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
5421 		DWARN(vswp, "%s(%lld) status(%d) lstate(0x%llx), dropping "
5422 			"packet\n", __func__, ldcp->ldc_id, ldcp->ldc_status,
5423 			ldcp->lane_out.lstate);
5424 		freemsg(mp);
5425 		return (LDC_TX_FAILURE);
5426 	}
5427 
5428 	/*
5429 	 * Note - using first ring only, this may change
5430 	 * in the future.
5431 	 */
5432 	if ((dp = ldcp->lane_out.dringp) == NULL) {
5433 		DERR(vswp, "%s(%lld): no dring for outbound lane on"
5434 			" channel %d", __func__, ldcp->ldc_id, ldcp->ldc_id);
5435 		freemsg(mp);
5436 		return (LDC_TX_FAILURE);
5437 	}
5438 
5439 	size = msgsize(mp);
5440 	if (size > (size_t)ETHERMAX) {
5441 		DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
5442 		    ldcp->ldc_id, size);
5443 		freemsg(mp);
5444 		return (LDC_TX_FAILURE);
5445 	}
5446 
5447 	/*
5448 	 * Find a free descriptor
5449 	 *
5450 	 * Note: for the moment we are assuming that we will only
5451 	 * have one dring going from the switch to each of its
5452 	 * peers. This may change in the future.
5453 	 */
5454 	if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) {
5455 		D2(vswp, "%s(%lld): no descriptor available for ring "
5456 			"at 0x%llx", __func__, ldcp->ldc_id, dp);
5457 
5458 		/* nothing more we can do */
5459 		status = LDC_TX_NORESOURCES;
5460 		goto vsw_dringsend_free_exit;
5461 	} else {
5462 		D2(vswp, "%s(%lld): free private descriptor found at pos "
5463 			"%ld addr 0x%llx\n", __func__, ldcp->ldc_id, idx,
5464 			priv_desc);
5465 	}
5466 
5467 	/* copy data into the descriptor */
5468 	bufp = priv_desc->datap;
5469 	bufp += VNET_IPALIGN;
5470 	for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) {
5471 		n = MBLKL(bp);
5472 		bcopy(bp->b_rptr, bufp, n);
5473 		bufp += n;
5474 	}
5475 
5476 	priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;
5477 
5478 	pub = priv_desc->descp;
5479 	pub->nbytes = priv_desc->datalen;
5480 
5481 	mutex_enter(&priv_desc->dstate_lock);
5482 	pub->hdr.dstate = VIO_DESC_READY;
5483 	mutex_exit(&priv_desc->dstate_lock);
5484 
5485 	/*
5486 	 * Determine whether or not we need to send a message to our
5487 	 * peer prompting them to read our newly updated descriptor(s).
5488 	 */
5489 	mutex_enter(&dp->restart_lock);
5490 	if (dp->restart_reqd) {
5491 		dp->restart_reqd = B_FALSE;
5492 		mutex_exit(&dp->restart_lock);
5493 
5494 		/*
5495 		 * Send a vio_dring_msg to peer to prompt them to read
5496 		 * the updated descriptor ring.
5497 		 */
5498 		dring_pkt.tag.vio_msgtype = VIO_TYPE_DATA;
5499 		dring_pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
5500 		dring_pkt.tag.vio_subtype_env = VIO_DRING_DATA;
5501 		dring_pkt.tag.vio_sid = ldcp->local_session;
5502 
5503 		/* Note - for now using first ring */
5504 		dring_pkt.dring_ident = dp->ident;
5505 
5506 		mutex_enter(&ldcp->lane_out.seq_lock);
5507 		dring_pkt.seq_num = ldcp->lane_out.seq_num++;
5508 		mutex_exit(&ldcp->lane_out.seq_lock);
5509 
5510 		/*
5511 		 * If last_ack_recv is -1 then we know we've not
5512 		 * received any ack's yet, so this must be the first
5513 		 * msg sent, so set the start to the begining of the ring.
5514 		 */
5515 		mutex_enter(&dp->dlock);
5516 		if (dp->last_ack_recv == -1) {
5517 			dring_pkt.start_idx = 0;
5518 		} else {
5519 			dring_pkt.start_idx = (dp->last_ack_recv + 1) %
5520 						dp->num_descriptors;
5521 		}
5522 		dring_pkt.end_idx = -1;
5523 		mutex_exit(&dp->dlock);
5524 
5525 		D3(vswp, "%s(%lld): dring 0x%llx : ident 0x%llx\n", __func__,
5526 			ldcp->ldc_id, dp, dring_pkt.dring_ident);
5527 		D3(vswp, "%s(%lld): start %lld : end %lld : seq %lld\n",
5528 			__func__, ldcp->ldc_id, dring_pkt.start_idx,
5529 			dring_pkt.end_idx, dring_pkt.seq_num);
5530 
5531 		vsw_send_msg(ldcp, (void *)&dring_pkt,
5532 						sizeof (vio_dring_msg_t));
5533 	} else {
5534 		mutex_exit(&dp->restart_lock);
5535 		D2(vswp, "%s(%lld): updating descp %d", __func__,
5536 			ldcp->ldc_id, idx);
5537 	}
5538 
5539 vsw_dringsend_free_exit:
5540 
5541 	/* free the message block */
5542 	freemsg(mp);
5543 
5544 	D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
5545 	return (status);
5546 }
5547 
5548 /*
5549  * Send an in-band descriptor message over ldc.
5550  */
5551 static int
5552 vsw_descrsend(vsw_ldc_t *ldcp, mblk_t *mp)
5553 {
5554 	vsw_t			*vswp = ldcp->ldc_vswp;
5555 	vio_ibnd_desc_t		ibnd_msg;
5556 	vsw_private_desc_t	*priv_desc = NULL;
5557 	dring_info_t		*dp = NULL;
5558 	size_t			n, size = 0;
5559 	caddr_t			bufp;
5560 	mblk_t			*bp;
5561 	int			idx, i;
5562 	int			status = LDC_TX_SUCCESS;
5563 	static int		warn_msg = 1;
5564 
5565 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
5566 
5567 	ASSERT(mp != NULL);
5568 
5569 	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
5570 		(ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
5571 		DERR(vswp, "%s(%lld) status(%d) state (0x%llx), dropping pkt",
5572 			__func__, ldcp->ldc_id, ldcp->ldc_status,
5573 			ldcp->lane_out.lstate);
5574 		freemsg(mp);
5575 		return (LDC_TX_FAILURE);
5576 	}
5577 
5578 	/*
5579 	 * only expect single dring to exist, which we use
5580 	 * as an internal buffer, rather than a transfer channel.
5581 	 */
5582 	if ((dp = ldcp->lane_out.dringp) == NULL) {
5583 		DERR(vswp, "%s(%lld): no dring for outbound lane",
5584 			__func__, ldcp->ldc_id);
5585 		DERR(vswp, "%s(%lld) status(%d) state (0x%llx)",
5586 			__func__, ldcp->ldc_id, ldcp->ldc_status,
5587 			ldcp->lane_out.lstate);
5588 		freemsg(mp);
5589 		return (LDC_TX_FAILURE);
5590 	}
5591 
5592 	size = msgsize(mp);
5593 	if (size > (size_t)ETHERMAX) {
5594 		DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
5595 		    ldcp->ldc_id, size);
5596 		freemsg(mp);
5597 		return (LDC_TX_FAILURE);
5598 	}
5599 
5600 	/*
5601 	 * Find a free descriptor in our buffer ring
5602 	 */
5603 	if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) {
5604 		if (warn_msg) {
5605 			DERR(vswp, "%s(%lld): no descriptor available for ring "
5606 			"at 0x%llx", __func__, ldcp->ldc_id, dp);
5607 			warn_msg = 0;
5608 		}
5609 
5610 		/* nothing more we can do */
5611 		status = LDC_TX_NORESOURCES;
5612 		goto vsw_descrsend_free_exit;
5613 	} else {
5614 		D2(vswp, "%s(%lld): free private descriptor found at pos "
5615 			"%ld addr 0x%x\n", __func__, ldcp->ldc_id, idx,
5616 			priv_desc);
5617 		warn_msg = 1;
5618 	}
5619 
5620 	/* copy data into the descriptor */
5621 	bufp = priv_desc->datap;
5622 	for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) {
5623 		n = MBLKL(bp);
5624 		bcopy(bp->b_rptr, bufp, n);
5625 		bufp += n;
5626 	}
5627 
5628 	priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;
5629 
5630 	/* create and send the in-band descp msg */
5631 	ibnd_msg.hdr.tag.vio_msgtype = VIO_TYPE_DATA;
5632 	ibnd_msg.hdr.tag.vio_subtype = VIO_SUBTYPE_INFO;
5633 	ibnd_msg.hdr.tag.vio_subtype_env = VIO_DESC_DATA;
5634 	ibnd_msg.hdr.tag.vio_sid = ldcp->local_session;
5635 
5636 	mutex_enter(&ldcp->lane_out.seq_lock);
5637 	ibnd_msg.hdr.seq_num = ldcp->lane_out.seq_num++;
5638 	mutex_exit(&ldcp->lane_out.seq_lock);
5639 
5640 	/*
5641 	 * Copy the mem cookies describing the data from the
5642 	 * private region of the descriptor ring into the inband
5643 	 * descriptor.
5644 	 */
5645 	for (i = 0; i < priv_desc->ncookies; i++) {
5646 		bcopy(&priv_desc->memcookie[i], &ibnd_msg.memcookie[i],
5647 			sizeof (ldc_mem_cookie_t));
5648 	}
5649 
5650 	ibnd_msg.hdr.desc_handle = idx;
5651 	ibnd_msg.ncookies = priv_desc->ncookies;
5652 	ibnd_msg.nbytes = size;
5653 
5654 	vsw_send_msg(ldcp, (void *)&ibnd_msg, sizeof (vio_ibnd_desc_t));
5655 
5656 vsw_descrsend_free_exit:
5657 
5658 	/* free the allocated message blocks */
5659 	freemsg(mp);
5660 
5661 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
5662 	return (status);
5663 }
5664 
5665 static void
5666 vsw_send_ver(vsw_ldc_t *ldcp)
5667 {
5668 	vsw_t		*vswp = ldcp->ldc_vswp;
5669 	lane_t		*lp = &ldcp->lane_out;
5670 	vio_ver_msg_t	ver_msg;
5671 
5672 	D1(vswp, "%s enter", __func__);
5673 
5674 	ver_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
5675 	ver_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
5676 	ver_msg.tag.vio_subtype_env = VIO_VER_INFO;
5677 	ver_msg.tag.vio_sid = ldcp->local_session;
5678 
5679 	ver_msg.ver_major = vsw_versions[0].ver_major;
5680 	ver_msg.ver_minor = vsw_versions[0].ver_minor;
5681 	ver_msg.dev_class = VDEV_NETWORK_SWITCH;
5682 
5683 	lp->lstate |= VSW_VER_INFO_SENT;
5684 	lp->ver_major = ver_msg.ver_major;
5685 	lp->ver_minor = ver_msg.ver_minor;
5686 
5687 	DUMP_TAG(ver_msg.tag);
5688 
5689 	vsw_send_msg(ldcp, &ver_msg, sizeof (vio_ver_msg_t));
5690 
5691 	D1(vswp, "%s (%d): exit", __func__, ldcp->ldc_id);
5692 }
5693 
5694 static void
5695 vsw_send_attr(vsw_ldc_t *ldcp)
5696 {
5697 	vsw_t			*vswp = ldcp->ldc_vswp;
5698 	lane_t			*lp = &ldcp->lane_out;
5699 	vnet_attr_msg_t		attr_msg;
5700 
5701 	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
5702 
5703 	/*
5704 	 * Subtype is set to INFO by default
5705 	 */
5706 	attr_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
5707 	attr_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
5708 	attr_msg.tag.vio_subtype_env = VIO_ATTR_INFO;
5709 	attr_msg.tag.vio_sid = ldcp->local_session;
5710 
5711 	/* payload copied from default settings for lane */
5712 	attr_msg.mtu = lp->mtu;
5713 	attr_msg.addr_type = lp->addr_type;
5714 	attr_msg.xfer_mode = lp->xfer_mode;
5715 	attr_msg.ack_freq = lp->xfer_mode;
5716 
5717 	READ_ENTER(&vswp->if_lockrw);
5718 	bcopy(&(vswp->if_addr), &(attr_msg.addr), ETHERADDRL);
5719 	RW_EXIT(&vswp->if_lockrw);
5720 
5721 	ldcp->lane_out.lstate |= VSW_ATTR_INFO_SENT;
5722 
5723 	DUMP_TAG(attr_msg.tag);
5724 
5725 	vsw_send_msg(ldcp, &attr_msg, sizeof (vnet_attr_msg_t));
5726 
5727 	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
5728 }
5729 
5730 /*
5731  * Create dring info msg (which also results in the creation of
5732  * a dring).
5733  */
5734 static vio_dring_reg_msg_t *
5735 vsw_create_dring_info_pkt(vsw_ldc_t *ldcp)
5736 {
5737 	vio_dring_reg_msg_t	*mp;
5738 	dring_info_t		*dp;
5739 	vsw_t			*vswp = ldcp->ldc_vswp;
5740 
5741 	D1(vswp, "vsw_create_dring_info_pkt enter\n");
5742 
5743 	/*
5744 	 * If we can't create a dring, obviously no point sending
5745 	 * a message.
5746 	 */
5747 	if ((dp = vsw_create_dring(ldcp)) == NULL)
5748 		return (NULL);
5749 
5750 	mp = kmem_zalloc(sizeof (vio_dring_reg_msg_t), KM_SLEEP);
5751 
5752 	mp->tag.vio_msgtype = VIO_TYPE_CTRL;
5753 	mp->tag.vio_subtype = VIO_SUBTYPE_INFO;
5754 	mp->tag.vio_subtype_env = VIO_DRING_REG;
5755 	mp->tag.vio_sid = ldcp->local_session;
5756 
5757 	/* payload */
5758 	mp->num_descriptors = dp->num_descriptors;
5759 	mp->descriptor_size = dp->descriptor_size;
5760 	mp->options = dp->options;
5761 	mp->ncookies = dp->ncookies;
5762 	bcopy(&dp->cookie[0], &mp->cookie[0], sizeof (ldc_mem_cookie_t));
5763 
5764 	mp->dring_ident = 0;
5765 
5766 	D1(vswp, "vsw_create_dring_info_pkt exit\n");
5767 
5768 	return (mp);
5769 }
5770 
5771 static void
5772 vsw_send_dring_info(vsw_ldc_t *ldcp)
5773 {
5774 	vio_dring_reg_msg_t	*dring_msg;
5775 	vsw_t			*vswp = ldcp->ldc_vswp;
5776 
5777 	D1(vswp, "%s: (%ld) enter", __func__, ldcp->ldc_id);
5778 
5779 	dring_msg = vsw_create_dring_info_pkt(ldcp);
5780 	if (dring_msg == NULL) {
5781 		cmn_err(CE_WARN, "vsw_send_dring_info: error creating msg");
5782 		return;
5783 	}
5784 
5785 	ldcp->lane_out.lstate |= VSW_DRING_INFO_SENT;
5786 
5787 	DUMP_TAG_PTR((vio_msg_tag_t *)dring_msg);
5788 
5789 	vsw_send_msg(ldcp, dring_msg,
5790 		sizeof (vio_dring_reg_msg_t));
5791 
5792 	kmem_free(dring_msg, sizeof (vio_dring_reg_msg_t));
5793 
5794 	D1(vswp, "%s: (%ld) exit", __func__, ldcp->ldc_id);
5795 }
5796 
5797 static void
5798 vsw_send_rdx(vsw_ldc_t *ldcp)
5799 {
5800 	vsw_t		*vswp = ldcp->ldc_vswp;
5801 	vio_rdx_msg_t	rdx_msg;
5802 
5803 	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
5804 
5805 	rdx_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
5806 	rdx_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
5807 	rdx_msg.tag.vio_subtype_env = VIO_RDX;
5808 	rdx_msg.tag.vio_sid = ldcp->local_session;
5809 
5810 	ldcp->lane_out.lstate |= VSW_RDX_INFO_SENT;
5811 
5812 	DUMP_TAG(rdx_msg.tag);
5813 
5814 	vsw_send_msg(ldcp, &rdx_msg, sizeof (vio_rdx_msg_t));
5815 
5816 	D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
5817 }
5818 
5819 /*
5820  * Generic routine to send message out over ldc channel.
5821  */
5822 static void
5823 vsw_send_msg(vsw_ldc_t *ldcp, void *msgp, int size)
5824 {
5825 	int		rv;
5826 	size_t		msglen = size;
5827 	vio_msg_tag_t	*tag = (vio_msg_tag_t *)msgp;
5828 	vsw_t		*vswp = ldcp->ldc_vswp;
5829 
5830 	D1(vswp, "vsw_send_msg (%lld) enter : sending %d bytes",
5831 			ldcp->ldc_id, size);
5832 
5833 	D2(vswp, "send_msg: type 0x%llx", tag->vio_msgtype);
5834 	D2(vswp, "send_msg: stype 0x%llx", tag->vio_subtype);
5835 	D2(vswp, "send_msg: senv 0x%llx", tag->vio_subtype_env);
5836 
5837 	mutex_enter(&ldcp->ldc_txlock);
5838 	do {
5839 		msglen = size;
5840 		rv = ldc_write(ldcp->ldc_handle, (caddr_t)msgp, &msglen);
5841 	} while (rv == EWOULDBLOCK && --vsw_wretries > 0);
5842 
5843 	mutex_exit(&ldcp->ldc_txlock);
5844 
5845 	if ((rv != 0) || (msglen != size)) {
5846 		DERR(vswp, "vsw_send_msg:ldc_write failed: chan(%lld) "
5847 			"rv(%d) size (%d) msglen(%d)\n", ldcp->ldc_id,
5848 			rv, size, msglen);
5849 	}
5850 
5851 	D1(vswp, "vsw_send_msg (%lld) exit : sent %d bytes",
5852 			ldcp->ldc_id, msglen);
5853 }
5854 
5855 /*
5856  * Add an entry into FDB, for the given mac address and port_id.
5857  * Returns 0 on success, 1 on failure.
5858  *
5859  * Lock protecting FDB must be held by calling process.
5860  */
5861 static int
5862 vsw_add_fdb(vsw_t *vswp, vsw_port_t *port)
5863 {
5864 	uint64_t	addr = 0;
5865 
5866 	D1(vswp, "%s: enter", __func__);
5867 
5868 	KEY_HASH(addr, port->p_macaddr);
5869 
5870 	D2(vswp, "%s: key = 0x%llx", __func__, addr);
5871 
5872 	/*
5873 	 * Note: duplicate keys will be rejected by mod_hash.
5874 	 */
5875 	if (mod_hash_insert(vswp->fdb, (mod_hash_key_t)addr,
5876 				(mod_hash_val_t)port) != 0) {
5877 		DERR(vswp, "%s: unable to add entry into fdb.", __func__);
5878 		return (1);
5879 	}
5880 
5881 	D1(vswp, "%s: exit", __func__);
5882 	return (0);
5883 }
5884 
5885 /*
5886  * Remove an entry from FDB.
5887  * Returns 0 on success, 1 on failure.
5888  */
5889 static int
5890 vsw_del_fdb(vsw_t *vswp, vsw_port_t *port)
5891 {
5892 	uint64_t	addr = 0;
5893 
5894 	D1(vswp, "%s: enter", __func__);
5895 
5896 	KEY_HASH(addr, port->p_macaddr);
5897 
5898 	D2(vswp, "%s: key = 0x%llx", __func__, addr);
5899 
5900 	(void) mod_hash_destroy(vswp->fdb, (mod_hash_val_t)addr);
5901 
5902 	D1(vswp, "%s: enter", __func__);
5903 
5904 	return (0);
5905 }
5906 
5907 /*
5908  * Search fdb for a given mac address.
5909  * Returns pointer to the entry if found, else returns NULL.
5910  */
5911 static vsw_port_t *
5912 vsw_lookup_fdb(vsw_t *vswp, struct ether_header *ehp)
5913 {
5914 	uint64_t	key = 0;
5915 	vsw_port_t	*port = NULL;
5916 
5917 	D1(vswp, "%s: enter", __func__);
5918 
5919 	KEY_HASH(key, ehp->ether_dhost);
5920 
5921 	D2(vswp, "%s: key = 0x%llx", __func__, key);
5922 
5923 	if (mod_hash_find(vswp->fdb, (mod_hash_key_t)key,
5924 				(mod_hash_val_t *)&port) != 0) {
5925 		return (NULL);
5926 	}
5927 
5928 	D1(vswp, "%s: exit", __func__);
5929 
5930 	return (port);
5931 }
5932 
5933 /*
5934  * Add or remove multicast address(es).
5935  *
5936  * Returns 0 on success, 1 on failure.
5937  */
5938 static int
5939 vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port)
5940 {
5941 	mcst_addr_t		*mcst_p = NULL;
5942 	vsw_t			*vswp = port->p_vswp;
5943 	uint64_t		addr = 0x0;
5944 	int			i;
5945 
5946 	D1(vswp, "%s: enter", __func__);
5947 
5948 	D2(vswp, "%s: %d addresses", __func__, mcst_pkt->count);
5949 
5950 	for (i = 0; i < mcst_pkt->count; i++) {
5951 		/*
5952 		 * Convert address into form that can be used
5953 		 * as hash table key.
5954 		 */
5955 		KEY_HASH(addr, mcst_pkt->mca[i]);
5956 
5957 		/*
5958 		 * Add or delete the specified address/port combination.
5959 		 */
5960 		if (mcst_pkt->set == 0x1) {
5961 			D3(vswp, "%s: adding multicast address 0x%llx for "
5962 				"port %ld", __func__, addr, port->p_instance);
5963 			if (vsw_add_mcst(vswp, VSW_VNETPORT, addr, port) == 0) {
5964 				/*
5965 				 * Update the list of multicast
5966 				 * addresses contained within the
5967 				 * port structure to include this new
5968 				 * one.
5969 				 */
5970 				mcst_p = kmem_alloc(sizeof (mcst_addr_t),
5971 								KM_NOSLEEP);
5972 				if (mcst_p == NULL) {
5973 					DERR(vswp, "%s: unable to alloc mem",
5974 						__func__);
5975 					return (1);
5976 				}
5977 
5978 				mcst_p->nextp = NULL;
5979 				mcst_p->addr = addr;
5980 
5981 				mutex_enter(&port->mca_lock);
5982 				mcst_p->nextp = port->mcap;
5983 				port->mcap = mcst_p;
5984 				mutex_exit(&port->mca_lock);
5985 
5986 				/*
5987 				 * Program the address into HW. If the addr
5988 				 * has already been programmed then the MAC
5989 				 * just increments a ref counter (which is
5990 				 * used when the address is being deleted)
5991 				 *
5992 				 * Note:
5993 				 * For the moment we dont care if this
5994 				 * succeeds because the card must be in
5995 				 * promics mode. When we have the ability
5996 				 * to program multiple unicst address into
5997 				 * the card then we will need to check this
5998 				 * return value.
5999 				 */
6000 				if (vswp->mh != NULL)
6001 					(void) mac_multicst_add(vswp->mh,
6002 						(uchar_t *)&mcst_pkt->mca[i]);
6003 
6004 			} else {
6005 				DERR(vswp, "%s: error adding multicast "
6006 					"address 0x%llx for port %ld",
6007 					__func__, addr, port->p_instance);
6008 				return (1);
6009 			}
6010 		} else {
6011 			/*
6012 			 * Delete an entry from the multicast hash
6013 			 * table and update the address list
6014 			 * appropriately.
6015 			 */
6016 			if (vsw_del_mcst(vswp, VSW_VNETPORT, addr, port) == 0) {
6017 				D3(vswp, "%s: deleting multicast address "
6018 					"0x%llx for port %ld", __func__, addr,
6019 					port->p_instance);
6020 
6021 				vsw_del_addr(VSW_VNETPORT, port, addr);
6022 
6023 				/*
6024 				 * Remove the address from HW. The address
6025 				 * will actually only be removed once the ref
6026 				 * count within the MAC layer has dropped to
6027 				 * zero. I.e. we can safely call this fn even
6028 				 * if other ports are interested in this
6029 				 * address.
6030 				 */
6031 				if (vswp->mh != NULL)
6032 					(void) mac_multicst_remove(vswp->mh,
6033 						(uchar_t *)&mcst_pkt->mca[i]);
6034 
6035 			} else {
6036 				DERR(vswp, "%s: error deleting multicast "
6037 					"addr 0x%llx for port %ld",
6038 					__func__, addr, port->p_instance);
6039 				return (1);
6040 			}
6041 		}
6042 	}
6043 	D1(vswp, "%s: exit", __func__);
6044 	return (0);
6045 }
6046 
6047 /*
6048  * Add a new multicast entry.
6049  *
6050  * Search hash table based on address. If match found then
6051  * update associated val (which is chain of ports), otherwise
6052  * create new key/val (addr/port) pair and insert into table.
6053  */
6054 static int
6055 vsw_add_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg)
6056 {
6057 	int		dup = 0;
6058 	int		rv = 0;
6059 	mfdb_ent_t	*ment = NULL;
6060 	mfdb_ent_t	*tmp_ent = NULL;
6061 	mfdb_ent_t	*new_ent = NULL;
6062 	void		*tgt = NULL;
6063 
6064 	if (devtype == VSW_VNETPORT) {
6065 		/*
6066 		 * Being invoked from a vnet.
6067 		 */
6068 		ASSERT(arg != NULL);
6069 		tgt = arg;
6070 		D2(NULL, "%s: port %d : address 0x%llx", __func__,
6071 			((vsw_port_t *)arg)->p_instance, addr);
6072 	} else {
6073 		/*
6074 		 * We are being invoked via the m_multicst mac entry
6075 		 * point.
6076 		 */
6077 		D2(NULL, "%s: address 0x%llx", __func__, addr);
6078 		tgt = (void *)vswp;
6079 	}
6080 
6081 	WRITE_ENTER(&vswp->mfdbrw);
6082 	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr,
6083 				(mod_hash_val_t *)&ment) != 0) {
6084 
6085 		/* address not currently in table */
6086 		ment = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP);
6087 		ment->d_addr = (void *)tgt;
6088 		ment->d_type = devtype;
6089 		ment->nextp = NULL;
6090 
6091 		if (mod_hash_insert(vswp->mfdb, (mod_hash_key_t)addr,
6092 			(mod_hash_val_t)ment) != 0) {
6093 			DERR(vswp, "%s: hash table insertion failed", __func__);
6094 			kmem_free(ment, sizeof (mfdb_ent_t));
6095 			rv = 1;
6096 		} else {
6097 			D2(vswp, "%s: added initial entry for 0x%llx to "
6098 				"table", __func__, addr);
6099 		}
6100 	} else {
6101 		/*
6102 		 * Address in table. Check to see if specified port
6103 		 * is already associated with the address. If not add
6104 		 * it now.
6105 		 */
6106 		tmp_ent = ment;
6107 		while (tmp_ent != NULL) {
6108 			if (tmp_ent->d_addr == (void *)tgt) {
6109 				if (devtype == VSW_VNETPORT) {
6110 					DERR(vswp, "%s: duplicate port entry "
6111 						"found for portid %ld and key "
6112 						"0x%llx", __func__,
6113 						((vsw_port_t *)arg)->p_instance,
6114 						addr);
6115 				} else {
6116 					DERR(vswp, "%s: duplicate entry found"
6117 						"for key 0x%llx",
6118 						__func__, addr);
6119 				}
6120 				rv = 1;
6121 				dup = 1;
6122 				break;
6123 			}
6124 			tmp_ent = tmp_ent->nextp;
6125 		}
6126 
6127 		/*
6128 		 * Port not on list so add it to end now.
6129 		 */
6130 		if (0 == dup) {
6131 			D2(vswp, "%s: added entry for 0x%llx to table",
6132 				__func__, addr);
6133 			new_ent = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP);
6134 			new_ent->d_addr = (void *)tgt;
6135 			new_ent->d_type = devtype;
6136 			new_ent->nextp = NULL;
6137 
6138 			tmp_ent = ment;
6139 			while (tmp_ent->nextp != NULL)
6140 				tmp_ent = tmp_ent->nextp;
6141 
6142 			tmp_ent->nextp = new_ent;
6143 		}
6144 	}
6145 
6146 	RW_EXIT(&vswp->mfdbrw);
6147 	return (rv);
6148 }
6149 
6150 /*
6151  * Remove a multicast entry from the hashtable.
6152  *
6153  * Search hash table based on address. If match found, scan
6154  * list of ports associated with address. If specified port
6155  * found remove it from list.
6156  */
6157 static int
6158 vsw_del_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg)
6159 {
6160 	mfdb_ent_t	*ment = NULL;
6161 	mfdb_ent_t	*curr_p, *prev_p;
6162 	void		*tgt = NULL;
6163 
6164 	D1(vswp, "%s: enter", __func__);
6165 
6166 	if (devtype == VSW_VNETPORT) {
6167 		tgt = (vsw_port_t *)arg;
6168 		D2(vswp, "%s: removing port %d from mFDB for address"
6169 			" 0x%llx", __func__, ((vsw_port_t *)tgt)->p_instance,
6170 			addr);
6171 	} else {
6172 		D2(vswp, "%s: removing entry", __func__);
6173 		tgt = (void *)vswp;
6174 	}
6175 
6176 	WRITE_ENTER(&vswp->mfdbrw);
6177 	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr,
6178 				(mod_hash_val_t *)&ment) != 0) {
6179 		D2(vswp, "%s: address 0x%llx not in table", __func__, addr);
6180 		RW_EXIT(&vswp->mfdbrw);
6181 		return (1);
6182 	}
6183 
6184 	prev_p = curr_p = ment;
6185 
6186 	while (curr_p != NULL) {
6187 		if (curr_p->d_addr == (void *)tgt) {
6188 			if (devtype == VSW_VNETPORT) {
6189 				D2(vswp, "%s: port %d found", __func__,
6190 					((vsw_port_t *)tgt)->p_instance);
6191 			} else {
6192 				D2(vswp, "%s: instance found", __func__);
6193 			}
6194 
6195 			if (prev_p == curr_p) {
6196 				/*
6197 				 * head of list, if no other element is in
6198 				 * list then destroy this entry, otherwise
6199 				 * just replace it with updated value.
6200 				 */
6201 				ment = curr_p->nextp;
6202 				kmem_free(curr_p, sizeof (mfdb_ent_t));
6203 				if (ment == NULL) {
6204 					(void) mod_hash_destroy(vswp->mfdb,
6205 							(mod_hash_val_t)addr);
6206 				} else {
6207 					(void) mod_hash_replace(vswp->mfdb,
6208 							(mod_hash_key_t)addr,
6209 							(mod_hash_val_t)ment);
6210 				}
6211 			} else {
6212 				/*
6213 				 * Not head of list, no need to do
6214 				 * replacement, just adjust list pointers.
6215 				 */
6216 				prev_p->nextp = curr_p->nextp;
6217 				kmem_free(curr_p, sizeof (mfdb_ent_t));
6218 			}
6219 			break;
6220 		}
6221 
6222 		prev_p = curr_p;
6223 		curr_p = curr_p->nextp;
6224 	}
6225 
6226 	RW_EXIT(&vswp->mfdbrw);
6227 
6228 	D1(vswp, "%s: exit", __func__);
6229 
6230 	return (0);
6231 }
6232 
6233 /*
6234  * Port is being deleted, but has registered an interest in one
6235  * or more multicast groups. Using the list of addresses maintained
6236  * within the port structure find the appropriate entry in the hash
6237  * table and remove this port from the list of interested ports.
6238  */
6239 static void
6240 vsw_del_mcst_port(vsw_port_t *port)
6241 {
6242 	mcst_addr_t	*mcst_p = NULL;
6243 	vsw_t		*vswp = port->p_vswp;
6244 
6245 	D1(vswp, "%s: enter", __func__);
6246 
6247 	mutex_enter(&port->mca_lock);
6248 	while (port->mcap != NULL) {
6249 		(void) vsw_del_mcst(vswp, VSW_VNETPORT,
6250 					port->mcap->addr, port);
6251 
6252 		mcst_p = port->mcap->nextp;
6253 		kmem_free(port->mcap, sizeof (mcst_addr_t));
6254 		port->mcap = mcst_p;
6255 	}
6256 	mutex_exit(&port->mca_lock);
6257 
6258 	D1(vswp, "%s: exit", __func__);
6259 }
6260 
6261 /*
6262  * This vsw instance is detaching, but has registered an interest in one
6263  * or more multicast groups. Using the list of addresses maintained
6264  * within the vsw structure find the appropriate entry in the hash
6265  * table and remove this instance from the list of interested ports.
6266  */
6267 static void
6268 vsw_del_mcst_vsw(vsw_t *vswp)
6269 {
6270 	mcst_addr_t	*next_p = NULL;
6271 
6272 	D1(vswp, "%s: enter", __func__);
6273 
6274 	mutex_enter(&vswp->mca_lock);
6275 
6276 	while (vswp->mcap != NULL) {
6277 		DERR(vswp, "%s: deleting addr 0x%llx",
6278 			__func__, vswp->mcap->addr);
6279 		(void) vsw_del_mcst(vswp, VSW_LOCALDEV,
6280 				vswp->mcap->addr, NULL);
6281 
6282 		next_p = vswp->mcap->nextp;
6283 		kmem_free(vswp->mcap, sizeof (mcst_addr_t));
6284 		vswp->mcap = next_p;
6285 	}
6286 
6287 	vswp->mcap = NULL;
6288 	mutex_exit(&vswp->mca_lock);
6289 
6290 	D1(vswp, "%s: exit", __func__);
6291 }
6292 
6293 
6294 /*
6295  * Remove the specified address from the list of address maintained
6296  * in this port node.
6297  */
6298 static void
6299 vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr)
6300 {
6301 	vsw_t		*vswp = NULL;
6302 	vsw_port_t	*port = NULL;
6303 	mcst_addr_t	*prev_p = NULL;
6304 	mcst_addr_t	*curr_p = NULL;
6305 
6306 	D1(NULL, "%s: enter : devtype %d : addr 0x%llx",
6307 		__func__, devtype, addr);
6308 
6309 	if (devtype == VSW_VNETPORT) {
6310 		port = (vsw_port_t *)arg;
6311 		mutex_enter(&port->mca_lock);
6312 		prev_p = curr_p = port->mcap;
6313 	} else {
6314 		vswp = (vsw_t *)arg;
6315 		mutex_enter(&vswp->mca_lock);
6316 		prev_p = curr_p = vswp->mcap;
6317 	}
6318 
6319 	while (curr_p != NULL) {
6320 		if (curr_p->addr == addr) {
6321 			D2(NULL, "%s: address found", __func__);
6322 			/* match found */
6323 			if (prev_p == curr_p) {
6324 				/* list head */
6325 				if (devtype == VSW_VNETPORT)
6326 					port->mcap = curr_p->nextp;
6327 				else
6328 					vswp->mcap = curr_p->nextp;
6329 			} else {
6330 				prev_p->nextp = curr_p->nextp;
6331 			}
6332 			kmem_free(curr_p, sizeof (mcst_addr_t));
6333 			break;
6334 		} else {
6335 			prev_p = curr_p;
6336 			curr_p = curr_p->nextp;
6337 		}
6338 	}
6339 
6340 	if (devtype == VSW_VNETPORT)
6341 		mutex_exit(&port->mca_lock);
6342 	else
6343 		mutex_exit(&vswp->mca_lock);
6344 
6345 	D1(NULL, "%s: exit", __func__);
6346 }
6347 
6348 /*
6349  * Creates a descriptor ring (dring) and links it into the
6350  * link of outbound drings for this channel.
6351  *
6352  * Returns NULL if creation failed.
6353  */
6354 static dring_info_t *
6355 vsw_create_dring(vsw_ldc_t *ldcp)
6356 {
6357 	vsw_private_desc_t	*priv_addr = NULL;
6358 	vsw_t			*vswp = ldcp->ldc_vswp;
6359 	ldc_mem_info_t		minfo;
6360 	dring_info_t		*dp, *tp;
6361 	int			i;
6362 
6363 	dp = (dring_info_t *)kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
6364 
6365 	mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL);
6366 
6367 	/* create public section of ring */
6368 	if ((ldc_mem_dring_create(VSW_RING_NUM_EL,
6369 			VSW_PUB_SIZE, &dp->handle)) != 0) {
6370 
6371 		DERR(vswp, "vsw_create_dring(%lld): ldc dring create "
6372 			"failed", ldcp->ldc_id);
6373 		goto create_fail_exit;
6374 	}
6375 
6376 	ASSERT(dp->handle != NULL);
6377 
6378 	/*
6379 	 * Get the base address of the public section of the ring.
6380 	 */
6381 	if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) {
6382 		DERR(vswp, "vsw_create_dring(%lld): dring info failed\n",
6383 			ldcp->ldc_id);
6384 		goto dring_fail_exit;
6385 	} else {
6386 		ASSERT(minfo.vaddr != 0);
6387 		dp->pub_addr = minfo.vaddr;
6388 	}
6389 
6390 	dp->num_descriptors = VSW_RING_NUM_EL;
6391 	dp->descriptor_size = VSW_PUB_SIZE;
6392 	dp->options = VIO_TX_DRING;
6393 	dp->ncookies = 1;	/* guaranteed by ldc */
6394 
6395 	/*
6396 	 * create private portion of ring
6397 	 */
6398 	dp->priv_addr = (vsw_private_desc_t *)kmem_zalloc(
6399 		(sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL), KM_SLEEP);
6400 
6401 	if (vsw_setup_ring(ldcp, dp)) {
6402 		DERR(vswp, "%s: unable to setup ring", __func__);
6403 		goto dring_fail_exit;
6404 	}
6405 
6406 	/* haven't used any descriptors yet */
6407 	dp->end_idx = 0;
6408 	dp->last_ack_recv = -1;
6409 
6410 	/* bind dring to the channel */
6411 	if ((ldc_mem_dring_bind(ldcp->ldc_handle, dp->handle,
6412 		LDC_SHADOW_MAP, LDC_MEM_RW,
6413 		&dp->cookie[0], &dp->ncookies)) != 0) {
6414 		DERR(vswp, "vsw_create_dring: unable to bind to channel "
6415 			"%lld", ldcp->ldc_id);
6416 		goto dring_fail_exit;
6417 	}
6418 
6419 	mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL);
6420 	dp->restart_reqd = B_TRUE;
6421 
6422 	/*
6423 	 * Only ever create rings for outgoing lane. Link it onto
6424 	 * end of list.
6425 	 */
6426 	if (ldcp->lane_out.dringp == NULL) {
6427 		D2(vswp, "vsw_create_dring: adding first outbound ring");
6428 		ldcp->lane_out.dringp = dp;
6429 	} else {
6430 		tp = ldcp->lane_out.dringp;
6431 		while (tp->next != NULL)
6432 			tp = tp->next;
6433 
6434 		tp->next = dp;
6435 	}
6436 
6437 	return (dp);
6438 
6439 dring_fail_exit:
6440 	(void) ldc_mem_dring_destroy(dp->handle);
6441 
6442 create_fail_exit:
6443 	if (dp->priv_addr != NULL) {
6444 		priv_addr = dp->priv_addr;
6445 		for (i = 0; i < VSW_RING_NUM_EL; i++) {
6446 			if (priv_addr->memhandle != NULL)
6447 				(void) ldc_mem_free_handle(
6448 						priv_addr->memhandle);
6449 			priv_addr++;
6450 		}
6451 		kmem_free(dp->priv_addr,
6452 			(sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL));
6453 	}
6454 	mutex_destroy(&dp->dlock);
6455 
6456 	kmem_free(dp, sizeof (dring_info_t));
6457 	return (NULL);
6458 }
6459 
6460 /*
6461  * Create a ring consisting of just a private portion and link
6462  * it into the list of rings for the outbound lane.
6463  *
6464  * These type of rings are used primarily for temporary data
6465  * storage (i.e. as data buffers).
6466  */
6467 void
6468 vsw_create_privring(vsw_ldc_t *ldcp)
6469 {
6470 	dring_info_t		*dp, *tp;
6471 	vsw_t			*vswp = ldcp->ldc_vswp;
6472 
6473 	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
6474 
6475 	dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
6476 
6477 	mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL);
6478 
6479 	/* no public section */
6480 	dp->pub_addr = NULL;
6481 
6482 	dp->priv_addr = kmem_zalloc((sizeof (vsw_private_desc_t) *
6483 					VSW_RING_NUM_EL), KM_SLEEP);
6484 
6485 	if (vsw_setup_ring(ldcp, dp)) {
6486 		DERR(vswp, "%s: setup of ring failed", __func__);
6487 		kmem_free(dp->priv_addr,
6488 			(sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL));
6489 		mutex_destroy(&dp->dlock);
6490 		kmem_free(dp, sizeof (dring_info_t));
6491 		return;
6492 	}
6493 
6494 	/* haven't used any descriptors yet */
6495 	dp->end_idx = 0;
6496 
6497 	mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL);
6498 	dp->restart_reqd = B_TRUE;
6499 
6500 	/*
6501 	 * Only ever create rings for outgoing lane. Link it onto
6502 	 * end of list.
6503 	 */
6504 	if (ldcp->lane_out.dringp == NULL) {
6505 		D2(vswp, "%s: adding first outbound privring", __func__);
6506 		ldcp->lane_out.dringp = dp;
6507 	} else {
6508 		tp = ldcp->lane_out.dringp;
6509 		while (tp->next != NULL)
6510 			tp = tp->next;
6511 
6512 		tp->next = dp;
6513 	}
6514 
6515 	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
6516 }
6517 
6518 /*
6519  * Setup the descriptors in the dring. Returns 0 on success, 1 on
6520  * failure.
6521  */
6522 int
6523 vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp)
6524 {
6525 	vnet_public_desc_t	*pub_addr = NULL;
6526 	vsw_private_desc_t	*priv_addr = NULL;
6527 	vsw_t			*vswp = ldcp->ldc_vswp;
6528 	uint64_t		*tmpp;
6529 	uint64_t		offset = 0;
6530 	uint32_t		ncookies = 0;
6531 	static char		*name = "vsw_setup_ring";
6532 	int			i, j, nc, rv;
6533 
6534 	priv_addr = dp->priv_addr;
6535 	pub_addr = dp->pub_addr;
6536 
6537 	/* public section may be null but private should never be */
6538 	ASSERT(priv_addr != NULL);
6539 
6540 	/*
6541 	 * Allocate the region of memory which will be used to hold
6542 	 * the data the descriptors will refer to.
6543 	 */
6544 	dp->data_sz = (VSW_RING_NUM_EL * VSW_RING_EL_DATA_SZ);
6545 	dp->data_addr = kmem_alloc(dp->data_sz, KM_SLEEP);
6546 
6547 	D2(vswp, "%s: allocated %lld bytes at 0x%llx\n", name,
6548 		dp->data_sz, dp->data_addr);
6549 
6550 	tmpp = (uint64_t *)dp->data_addr;
6551 	offset = VSW_RING_EL_DATA_SZ / sizeof (tmpp);
6552 
6553 	/*
6554 	 * Initialise some of the private and public (if they exist)
6555 	 * descriptor fields.
6556 	 */
6557 	for (i = 0; i < VSW_RING_NUM_EL; i++) {
6558 		mutex_init(&priv_addr->dstate_lock, NULL, MUTEX_DRIVER, NULL);
6559 
6560 		if ((ldc_mem_alloc_handle(ldcp->ldc_handle,
6561 			&priv_addr->memhandle)) != 0) {
6562 			DERR(vswp, "%s: alloc mem handle failed", name);
6563 			goto setup_ring_cleanup;
6564 		}
6565 
6566 		priv_addr->datap = (void *)tmpp;
6567 
6568 		rv = ldc_mem_bind_handle(priv_addr->memhandle,
6569 			(caddr_t)priv_addr->datap, VSW_RING_EL_DATA_SZ,
6570 			LDC_SHADOW_MAP, LDC_MEM_R|LDC_MEM_W,
6571 			&(priv_addr->memcookie[0]), &ncookies);
6572 		if (rv != 0) {
6573 			DERR(vswp, "%s(%lld): ldc_mem_bind_handle failed "
6574 				"(rv %d)", name, ldcp->ldc_id, rv);
6575 			goto setup_ring_cleanup;
6576 		}
6577 		priv_addr->bound = 1;
6578 
6579 		D2(vswp, "%s: %d: memcookie 0 : addr 0x%llx : size 0x%llx",
6580 			name, i, priv_addr->memcookie[0].addr,
6581 			priv_addr->memcookie[0].size);
6582 
6583 		if (ncookies >= (uint32_t)(VSW_MAX_COOKIES + 1)) {
6584 			DERR(vswp, "%s(%lld) ldc_mem_bind_handle returned "
6585 				"invalid num of cookies (%d) for size 0x%llx",
6586 				name, ldcp->ldc_id, ncookies,
6587 				VSW_RING_EL_DATA_SZ);
6588 
6589 			goto setup_ring_cleanup;
6590 		} else {
6591 			for (j = 1; j < ncookies; j++) {
6592 				rv = ldc_mem_nextcookie(priv_addr->memhandle,
6593 					&(priv_addr->memcookie[j]));
6594 				if (rv != 0) {
6595 					DERR(vswp, "%s: ldc_mem_nextcookie "
6596 						"failed rv (%d)", name, rv);
6597 					goto setup_ring_cleanup;
6598 				}
6599 				D3(vswp, "%s: memcookie %d : addr 0x%llx : "
6600 					"size 0x%llx", name, j,
6601 					priv_addr->memcookie[j].addr,
6602 					priv_addr->memcookie[j].size);
6603 			}
6604 
6605 		}
6606 		priv_addr->ncookies = ncookies;
6607 		priv_addr->dstate = VIO_DESC_FREE;
6608 
6609 		if (pub_addr != NULL) {
6610 
6611 			/* link pub and private sides */
6612 			priv_addr->descp = pub_addr;
6613 
6614 			pub_addr->ncookies = priv_addr->ncookies;
6615 
6616 			for (nc = 0; nc < pub_addr->ncookies; nc++) {
6617 				bcopy(&priv_addr->memcookie[nc],
6618 					&pub_addr->memcookie[nc],
6619 					sizeof (ldc_mem_cookie_t));
6620 			}
6621 
6622 			pub_addr->hdr.dstate = VIO_DESC_FREE;
6623 			pub_addr++;
6624 		}
6625 
6626 		/*
6627 		 * move to next element in the dring and the next
6628 		 * position in the data buffer.
6629 		 */
6630 		priv_addr++;
6631 		tmpp += offset;
6632 	}
6633 
6634 	return (0);
6635 
6636 setup_ring_cleanup:
6637 	priv_addr = dp->priv_addr;
6638 
6639 	for (j = 0; j < i; j++) {
6640 		(void) ldc_mem_unbind_handle(priv_addr->memhandle);
6641 		(void) ldc_mem_free_handle(priv_addr->memhandle);
6642 
6643 		mutex_destroy(&priv_addr->dstate_lock);
6644 
6645 		priv_addr++;
6646 	}
6647 	kmem_free(dp->data_addr, dp->data_sz);
6648 
6649 	return (1);
6650 }
6651 
6652 /*
6653  * Searches the private section of a ring for a free descriptor,
6654  * starting at the location of the last free descriptor found
6655  * previously.
6656  *
6657  * Returns 0 if free descriptor is available, and updates state
6658  * of private descriptor to VIO_DESC_READY,  otherwise returns 1.
6659  *
6660  * FUTURE: might need to return contiguous range of descriptors
6661  * as dring info msg assumes all will be contiguous.
6662  */
6663 static int
6664 vsw_dring_find_free_desc(dring_info_t *dringp,
6665 		vsw_private_desc_t **priv_p, int *idx)
6666 {
6667 	vsw_private_desc_t	*addr = NULL;
6668 	int			num = VSW_RING_NUM_EL;
6669 	int			ret = 1;
6670 
6671 	D1(NULL, "%s enter\n", __func__);
6672 
6673 	ASSERT(dringp->priv_addr != NULL);
6674 
6675 	D2(NULL, "%s: searching ring, dringp 0x%llx : start pos %lld",
6676 			__func__, dringp, dringp->end_idx);
6677 
6678 	addr = (vsw_private_desc_t *)dringp->priv_addr + dringp->end_idx;
6679 
6680 	mutex_enter(&addr->dstate_lock);
6681 	if (addr->dstate == VIO_DESC_FREE) {
6682 		addr->dstate = VIO_DESC_READY;
6683 		*priv_p = addr;
6684 		*idx = dringp->end_idx;
6685 		dringp->end_idx = (dringp->end_idx + 1) % num;
6686 		ret = 0;
6687 
6688 	}
6689 	mutex_exit(&addr->dstate_lock);
6690 
6691 	/* ring full */
6692 	if (ret == 1) {
6693 		D2(NULL, "%s: no desp free: started at %d", __func__,
6694 			dringp->end_idx);
6695 	}
6696 
6697 	D1(NULL, "%s: exit\n", __func__);
6698 
6699 	return (ret);
6700 }
6701 
6702 /*
6703  * Map from a dring identifier to the ring itself. Returns
6704  * pointer to ring or NULL if no match found.
6705  */
6706 static dring_info_t *
6707 vsw_ident2dring(lane_t *lane, uint64_t ident)
6708 {
6709 	dring_info_t	*dp = NULL;
6710 
6711 	if ((dp = lane->dringp) == NULL) {
6712 		return (NULL);
6713 	} else {
6714 		if (dp->ident == ident)
6715 			return (dp);
6716 
6717 		while (dp != NULL) {
6718 			if (dp->ident == ident)
6719 				break;
6720 			dp = dp->next;
6721 		}
6722 	}
6723 
6724 	return (dp);
6725 }
6726 
6727 /*
6728  * Set the default lane attributes. These are copied into
6729  * the attr msg we send to our peer. If they are not acceptable
6730  * then (currently) the handshake ends.
6731  */
6732 static void
6733 vsw_set_lane_attr(vsw_t *vswp, lane_t *lp)
6734 {
6735 	bzero(lp, sizeof (lane_t));
6736 
6737 	READ_ENTER(&vswp->if_lockrw);
6738 	ether_copy(&(vswp->if_addr), &(lp->addr));
6739 	RW_EXIT(&vswp->if_lockrw);
6740 
6741 	lp->mtu = VSW_MTU;
6742 	lp->addr_type = ADDR_TYPE_MAC;
6743 	lp->xfer_mode = VIO_DRING_MODE;
6744 	lp->ack_freq = 0;	/* for shared mode */
6745 
6746 	mutex_enter(&lp->seq_lock);
6747 	lp->seq_num = VNET_ISS;
6748 	mutex_exit(&lp->seq_lock);
6749 }
6750 
6751 /*
6752  * Verify that the attributes are acceptable.
6753  *
6754  * FUTURE: If some attributes are not acceptable, change them
6755  * our desired values.
6756  */
6757 static int
6758 vsw_check_attr(vnet_attr_msg_t *pkt, vsw_port_t *port)
6759 {
6760 	int	ret = 0;
6761 
6762 	D1(NULL, "vsw_check_attr enter\n");
6763 
6764 	/*
6765 	 * Note we currently only support in-band descriptors
6766 	 * and descriptor rings, not packet based transfer (VIO_PKT_MODE)
6767 	 */
6768 	if ((pkt->xfer_mode != VIO_DESC_MODE) &&
6769 			(pkt->xfer_mode != VIO_DRING_MODE)) {
6770 		D2(NULL, "vsw_check_attr: unknown mode %x\n",
6771 			pkt->xfer_mode);
6772 		ret = 1;
6773 	}
6774 
6775 	/* Only support MAC addresses at moment. */
6776 	if ((pkt->addr_type != ADDR_TYPE_MAC) || (pkt->addr == 0)) {
6777 		D2(NULL, "vsw_check_attr: invalid addr_type %x, "
6778 			"or address 0x%llx\n", pkt->addr_type,
6779 			pkt->addr);
6780 		ret = 1;
6781 	}
6782 
6783 	/*
6784 	 * MAC address supplied by device should match that stored
6785 	 * in the vsw-port OBP node. Need to decide what to do if they
6786 	 * don't match, for the moment just warn but don't fail.
6787 	 */
6788 	if (bcmp(&pkt->addr, &port->p_macaddr, ETHERADDRL) != 0) {
6789 		DERR(NULL, "vsw_check_attr: device supplied address "
6790 			"0x%llx doesn't match node address 0x%llx\n",
6791 			pkt->addr, port->p_macaddr);
6792 	}
6793 
6794 	/*
6795 	 * Ack freq only makes sense in pkt mode, in shared
6796 	 * mode the ring descriptors say whether or not to
6797 	 * send back an ACK.
6798 	 */
6799 	if ((pkt->xfer_mode == VIO_DRING_MODE) &&
6800 				(pkt->ack_freq > 0)) {
6801 		D2(NULL, "vsw_check_attr: non zero ack freq "
6802 			" in SHM mode\n");
6803 		ret = 1;
6804 	}
6805 
6806 	/*
6807 	 * Note: for the moment we only support ETHER
6808 	 * frames. This may change in the future.
6809 	 */
6810 	if ((pkt->mtu > VSW_MTU) || (pkt->mtu <= 0)) {
6811 		D2(NULL, "vsw_check_attr: invalid MTU (0x%llx)\n",
6812 			pkt->mtu);
6813 		ret = 1;
6814 	}
6815 
6816 	D1(NULL, "vsw_check_attr exit\n");
6817 
6818 	return (ret);
6819 }
6820 
6821 /*
6822  * Returns 1 if there is a problem, 0 otherwise.
6823  */
6824 static int
6825 vsw_check_dring_info(vio_dring_reg_msg_t *pkt)
6826 {
6827 	_NOTE(ARGUNUSED(pkt))
6828 
6829 	int	ret = 0;
6830 
6831 	D1(NULL, "vsw_check_dring_info enter\n");
6832 
6833 	if ((pkt->num_descriptors == 0) ||
6834 		(pkt->descriptor_size == 0) ||
6835 		(pkt->ncookies != 1)) {
6836 		DERR(NULL, "vsw_check_dring_info: invalid dring msg");
6837 		ret = 1;
6838 	}
6839 
6840 	D1(NULL, "vsw_check_dring_info exit\n");
6841 
6842 	return (ret);
6843 }
6844 
6845 /*
6846  * Returns 1 if two memory cookies match. Otherwise returns 0.
6847  */
6848 static int
6849 vsw_mem_cookie_match(ldc_mem_cookie_t *m1, ldc_mem_cookie_t *m2)
6850 {
6851 	if ((m1->addr != m2->addr) ||
6852 		(m2->size != m2->size)) {
6853 		return (0);
6854 	} else {
6855 		return (1);
6856 	}
6857 }
6858 
6859 /*
6860  * Returns 1 if ring described in reg message matches that
6861  * described by dring_info structure. Otherwise returns 0.
6862  */
6863 static int
6864 vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg)
6865 {
6866 	if ((msg->descriptor_size != dp->descriptor_size) ||
6867 		(msg->num_descriptors != dp->num_descriptors) ||
6868 		(msg->ncookies != dp->ncookies) ||
6869 		!(vsw_mem_cookie_match(&msg->cookie[0], &dp->cookie[0]))) {
6870 		return (0);
6871 	} else {
6872 		return (1);
6873 	}
6874 
6875 }
6876 
6877 static caddr_t
6878 vsw_print_ethaddr(uint8_t *a, char *ebuf)
6879 {
6880 	(void) sprintf(ebuf, "%x:%x:%x:%x:%x:%x",
6881 	    a[0], a[1], a[2], a[3], a[4], a[5]);
6882 	return (ebuf);
6883 }
6884 
6885 /*
6886  * Reset and free all the resources associated with
6887  * the channel.
6888  */
6889 static void
6890 vsw_free_lane_resources(vsw_ldc_t *ldcp, uint64_t dir)
6891 {
6892 	dring_info_t		*dp, *dpp;
6893 	lane_t			*lp = NULL;
6894 	int			rv = 0;
6895 
6896 	ASSERT(ldcp != NULL);
6897 
6898 	D1(ldcp->ldc_vswp, "%s (%lld): enter", __func__, ldcp->ldc_id);
6899 
6900 	if (dir == INBOUND) {
6901 		D2(ldcp->ldc_vswp, "%s: freeing INBOUND lane"
6902 			" of channel %lld", __func__, ldcp->ldc_id);
6903 		lp = &ldcp->lane_in;
6904 	} else {
6905 		D2(ldcp->ldc_vswp, "%s: freeing OUTBOUND lane"
6906 			" of channel %lld", __func__, ldcp->ldc_id);
6907 		lp = &ldcp->lane_out;
6908 	}
6909 
6910 	lp->lstate = VSW_LANE_INACTIV;
6911 	mutex_enter(&lp->seq_lock);
6912 	lp->seq_num = VNET_ISS;
6913 	mutex_exit(&lp->seq_lock);
6914 	if (lp->dringp) {
6915 		if (dir == INBOUND) {
6916 			dp = lp->dringp;
6917 			while (dp != NULL) {
6918 				dpp = dp->next;
6919 				if (dp->handle != NULL)
6920 					(void) ldc_mem_dring_unmap(dp->handle);
6921 				kmem_free(dp, sizeof (dring_info_t));
6922 				dp = dpp;
6923 			}
6924 		} else {
6925 			/*
6926 			 * unbind, destroy exported dring, free dring struct
6927 			 */
6928 			dp = lp->dringp;
6929 			rv = vsw_free_ring(dp);
6930 		}
6931 		if (rv == 0) {
6932 			lp->dringp = NULL;
6933 		}
6934 	}
6935 
6936 	D1(ldcp->ldc_vswp, "%s (%lld): exit", __func__, ldcp->ldc_id);
6937 }
6938 
6939 /*
6940  * Free ring and all associated resources.
6941  */
6942 static int
6943 vsw_free_ring(dring_info_t *dp)
6944 {
6945 	vsw_private_desc_t	*paddr = NULL;
6946 	dring_info_t		*dpp;
6947 	int			i, rv = 1;
6948 
6949 	while (dp != NULL) {
6950 		mutex_enter(&dp->dlock);
6951 		dpp = dp->next;
6952 		if (dp->priv_addr != NULL) {
6953 			/*
6954 			 * First unbind and free the memory handles
6955 			 * stored in each descriptor within the ring.
6956 			 */
6957 			for (i = 0; i < VSW_RING_NUM_EL; i++) {
6958 				paddr = (vsw_private_desc_t *)
6959 						dp->priv_addr + i;
6960 				if (paddr->memhandle != NULL) {
6961 					if (paddr->bound == 1) {
6962 						rv = ldc_mem_unbind_handle(
6963 							paddr->memhandle);
6964 
6965 						if (rv != 0) {
6966 							DERR(NULL, "error "
6967 							"unbinding handle for "
6968 							"ring 0x%llx at pos %d",
6969 							dp, i);
6970 							mutex_exit(&dp->dlock);
6971 							return (rv);
6972 						}
6973 						paddr->bound = 0;
6974 					}
6975 
6976 					rv = ldc_mem_free_handle(
6977 							paddr->memhandle);
6978 					if (rv != 0) {
6979 						DERR(NULL, "error freeing "
6980 							"handle for ring "
6981 							"0x%llx at pos %d",
6982 							dp, i);
6983 						mutex_exit(&dp->dlock);
6984 						return (rv);
6985 					}
6986 					paddr->memhandle = NULL;
6987 				}
6988 				mutex_destroy(&paddr->dstate_lock);
6989 			}
6990 			kmem_free(dp->priv_addr, (sizeof (vsw_private_desc_t)
6991 					* VSW_RING_NUM_EL));
6992 		}
6993 
6994 		/*
6995 		 * Now unbind and destroy the ring itself.
6996 		 */
6997 		if (dp->handle != NULL) {
6998 			(void) ldc_mem_dring_unbind(dp->handle);
6999 			(void) ldc_mem_dring_destroy(dp->handle);
7000 		}
7001 
7002 		if (dp->data_addr != NULL) {
7003 			kmem_free(dp->data_addr, dp->data_sz);
7004 		}
7005 
7006 		mutex_exit(&dp->dlock);
7007 		mutex_destroy(&dp->dlock);
7008 		mutex_destroy(&dp->restart_lock);
7009 		kmem_free(dp, sizeof (dring_info_t));
7010 
7011 		dp = dpp;
7012 	}
7013 	return (0);
7014 }
7015 
7016 /*
7017  * Debugging routines
7018  */
7019 static void
7020 display_state(void)
7021 {
7022 	vsw_t		*vswp;
7023 	vsw_port_list_t	*plist;
7024 	vsw_port_t 	*port;
7025 	vsw_ldc_list_t	*ldcl;
7026 	vsw_ldc_t 	*ldcp;
7027 
7028 	cmn_err(CE_NOTE, "***** system state *****");
7029 
7030 	for (vswp = vsw_head; vswp; vswp = vswp->next) {
7031 		plist = &vswp->plist;
7032 		READ_ENTER(&plist->lockrw);
7033 		cmn_err(CE_CONT, "vsw instance %d has %d ports attached\n",
7034 			vswp->instance, plist->num_ports);
7035 
7036 		for (port = plist->head; port != NULL; port = port->p_next) {
7037 			ldcl = &port->p_ldclist;
7038 			cmn_err(CE_CONT, "port %d : %d ldcs attached\n",
7039 				port->p_instance, ldcl->num_ldcs);
7040 			READ_ENTER(&ldcl->lockrw);
7041 			ldcp = ldcl->head;
7042 			for (; ldcp != NULL; ldcp = ldcp->ldc_next) {
7043 				cmn_err(CE_CONT, "chan %lu : dev %d : "
7044 					"status %d : phase %u\n",
7045 					ldcp->ldc_id, ldcp->dev_class,
7046 					ldcp->ldc_status, ldcp->hphase);
7047 				cmn_err(CE_CONT, "chan %lu : lsession %lu : "
7048 					"psession %lu\n",
7049 					ldcp->ldc_id,
7050 					ldcp->local_session,
7051 					ldcp->peer_session);
7052 
7053 				cmn_err(CE_CONT, "Inbound lane:\n");
7054 				display_lane(&ldcp->lane_in);
7055 				cmn_err(CE_CONT, "Outbound lane:\n");
7056 				display_lane(&ldcp->lane_out);
7057 			}
7058 			RW_EXIT(&ldcl->lockrw);
7059 		}
7060 		RW_EXIT(&plist->lockrw);
7061 	}
7062 	cmn_err(CE_NOTE, "***** system state *****");
7063 }
7064 
7065 static void
7066 display_lane(lane_t *lp)
7067 {
7068 	dring_info_t	*drp;
7069 
7070 	cmn_err(CE_CONT, "ver 0x%x:0x%x : state %lx : mtu 0x%lx\n",
7071 		lp->ver_major, lp->ver_minor, lp->lstate, lp->mtu);
7072 	cmn_err(CE_CONT, "addr_type %d : addr 0x%lx : xmode %d\n",
7073 		lp->addr_type, lp->addr, lp->xfer_mode);
7074 	cmn_err(CE_CONT, "dringp 0x%lx\n", (uint64_t)lp->dringp);
7075 
7076 	cmn_err(CE_CONT, "Dring info:\n");
7077 	for (drp = lp->dringp; drp != NULL; drp = drp->next) {
7078 		cmn_err(CE_CONT, "\tnum_desc %u : dsize %u\n",
7079 			drp->num_descriptors, drp->descriptor_size);
7080 		cmn_err(CE_CONT, "\thandle 0x%lx\n", drp->handle);
7081 		cmn_err(CE_CONT, "\tpub_addr 0x%lx : priv_addr 0x%lx\n",
7082 			(uint64_t)drp->pub_addr, (uint64_t)drp->priv_addr);
7083 		cmn_err(CE_CONT, "\tident 0x%lx : end_idx %lu\n",
7084 			drp->ident, drp->end_idx);
7085 		display_ring(drp);
7086 	}
7087 }
7088 
7089 static void
7090 display_ring(dring_info_t *dringp)
7091 {
7092 	uint64_t		i;
7093 	uint64_t		priv_count = 0;
7094 	uint64_t		pub_count = 0;
7095 	vnet_public_desc_t	*pub_addr = NULL;
7096 	vsw_private_desc_t	*priv_addr = NULL;
7097 
7098 	for (i = 0; i < VSW_RING_NUM_EL; i++) {
7099 		if (dringp->pub_addr != NULL) {
7100 			pub_addr = (vnet_public_desc_t *)dringp->pub_addr + i;
7101 
7102 			if (pub_addr->hdr.dstate == VIO_DESC_FREE)
7103 				pub_count++;
7104 		}
7105 
7106 		if (dringp->priv_addr != NULL) {
7107 			priv_addr =
7108 				(vsw_private_desc_t *)dringp->priv_addr + i;
7109 
7110 			if (priv_addr->dstate == VIO_DESC_FREE)
7111 				priv_count++;
7112 		}
7113 	}
7114 	cmn_err(CE_CONT, "\t%lu elements: %lu priv free: %lu pub free\n",
7115 			i, priv_count, pub_count);
7116 }
7117 
7118 static void
7119 dump_flags(uint64_t state)
7120 {
7121 	int	i;
7122 
7123 	typedef struct flag_name {
7124 		int	flag_val;
7125 		char	*flag_name;
7126 	} flag_name_t;
7127 
7128 	flag_name_t	flags[] = {
7129 		VSW_VER_INFO_SENT, "VSW_VER_INFO_SENT",
7130 		VSW_VER_INFO_RECV, "VSW_VER_INFO_RECV",
7131 		VSW_VER_ACK_RECV, "VSW_VER_ACK_RECV",
7132 		VSW_VER_ACK_SENT, "VSW_VER_ACK_SENT",
7133 		VSW_VER_NACK_RECV, "VSW_VER_NACK_RECV",
7134 		VSW_VER_NACK_SENT, "VSW_VER_NACK_SENT",
7135 		VSW_ATTR_INFO_SENT, "VSW_ATTR_INFO_SENT",
7136 		VSW_ATTR_INFO_RECV, "VSW_ATTR_INFO_RECV",
7137 		VSW_ATTR_ACK_SENT, "VSW_ATTR_ACK_SENT",
7138 		VSW_ATTR_ACK_RECV, "VSW_ATTR_ACK_RECV",
7139 		VSW_ATTR_NACK_SENT, "VSW_ATTR_NACK_SENT",
7140 		VSW_ATTR_NACK_RECV, "VSW_ATTR_NACK_RECV",
7141 		VSW_DRING_INFO_SENT, "VSW_DRING_INFO_SENT",
7142 		VSW_DRING_INFO_RECV, "VSW_DRING_INFO_RECV",
7143 		VSW_DRING_ACK_SENT, "VSW_DRING_ACK_SENT",
7144 		VSW_DRING_ACK_RECV, "VSW_DRING_ACK_RECV",
7145 		VSW_DRING_NACK_SENT, "VSW_DRING_NACK_SENT",
7146 		VSW_DRING_NACK_RECV, "VSW_DRING_NACK_RECV",
7147 		VSW_RDX_INFO_SENT, "VSW_RDX_INFO_SENT",
7148 		VSW_RDX_INFO_RECV, "VSW_RDX_INFO_RECV",
7149 		VSW_RDX_ACK_SENT, "VSW_RDX_ACK_SENT",
7150 		VSW_RDX_ACK_RECV, "VSW_RDX_ACK_RECV",
7151 		VSW_RDX_NACK_SENT, "VSW_RDX_NACK_SENT",
7152 		VSW_RDX_NACK_RECV, "VSW_RDX_NACK_RECV",
7153 		VSW_MCST_INFO_SENT, "VSW_MCST_INFO_SENT",
7154 		VSW_MCST_INFO_RECV, "VSW_MCST_INFO_RECV",
7155 		VSW_MCST_ACK_SENT, "VSW_MCST_ACK_SENT",
7156 		VSW_MCST_ACK_RECV, "VSW_MCST_ACK_RECV",
7157 		VSW_MCST_NACK_SENT, "VSW_MCST_NACK_SENT",
7158 		VSW_MCST_NACK_RECV, "VSW_MCST_NACK_RECV",
7159 		VSW_LANE_ACTIVE, "VSW_LANE_ACTIVE"};
7160 
7161 	DERR(NULL, "DUMP_FLAGS: %llx\n", state);
7162 	for (i = 0; i < sizeof (flags)/sizeof (flag_name_t); i++) {
7163 		if (state & flags[i].flag_val)
7164 			DERR(NULL, "DUMP_FLAGS %s", flags[i].flag_name);
7165 	}
7166 }
7167