xref: /illumos-gate/usr/src/uts/sun4v/io/vsw_phys.c (revision ee03c681cedb48165922333190cdd8b230ffa073)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/errno.h>
31 #include <sys/debug.h>
32 #include <sys/time.h>
33 #include <sys/sysmacros.h>
34 #include <sys/systm.h>
35 #include <sys/user.h>
36 #include <sys/stropts.h>
37 #include <sys/stream.h>
38 #include <sys/strlog.h>
39 #include <sys/strsubr.h>
40 #include <sys/cmn_err.h>
41 #include <sys/cpu.h>
42 #include <sys/kmem.h>
43 #include <sys/conf.h>
44 #include <sys/ddi.h>
45 #include <sys/sunddi.h>
46 #include <sys/ksynch.h>
47 #include <sys/stat.h>
48 #include <sys/kstat.h>
49 #include <sys/vtrace.h>
50 #include <sys/strsun.h>
51 #include <sys/dlpi.h>
52 #include <sys/ethernet.h>
53 #include <net/if.h>
54 #include <sys/varargs.h>
55 #include <sys/machsystm.h>
56 #include <sys/modctl.h>
57 #include <sys/modhash.h>
58 #include <sys/mac.h>
59 #include <sys/mac_ether.h>
60 #include <sys/taskq.h>
61 #include <sys/note.h>
62 #include <sys/mach_descrip.h>
63 #include <sys/mac.h>
64 #include <sys/mdeg.h>
65 #include <sys/vsw.h>
66 
67 /* MAC Ring table functions. */
68 static void vsw_mac_ring_tbl_init(vsw_t *vswp);
69 static void vsw_mac_ring_tbl_destroy(vsw_t *vswp);
70 static void vsw_queue_worker(vsw_mac_ring_t *rrp);
71 static void vsw_queue_stop(vsw_queue_t *vqp);
72 static vsw_queue_t *vsw_queue_create();
73 static void vsw_queue_destroy(vsw_queue_t *vqp);
74 static void vsw_rx_queue_cb(void *, mac_resource_handle_t, mblk_t *);
75 static void vsw_rx_cb(void *, mac_resource_handle_t, mblk_t *);
76 
77 /* MAC layer routines */
78 static mac_resource_handle_t vsw_mac_ring_add_cb(void *arg,
79 		mac_resource_t *mrp);
80 static	int vsw_set_hw_addr(vsw_t *, mac_multi_addr_t *);
81 static	int vsw_set_hw_promisc(vsw_t *, vsw_port_t *, int);
82 static	int vsw_unset_hw_addr(vsw_t *, int);
83 static	int vsw_unset_hw_promisc(vsw_t *, vsw_port_t *, int);
84 static int vsw_prog_if(vsw_t *);
85 
86 /* Support functions */
87 static int vsw_prog_ports(vsw_t *);
88 int vsw_set_hw(vsw_t *, vsw_port_t *, int);
89 int vsw_unset_hw(vsw_t *, vsw_port_t *, int);
90 void vsw_reconfig_hw(vsw_t *);
91 int vsw_mac_attach(vsw_t *vswp);
92 void vsw_mac_detach(vsw_t *vswp);
93 int vsw_mac_open(vsw_t *vswp);
94 void vsw_mac_close(vsw_t *vswp);
95 void vsw_unset_addrs(vsw_t *vswp);
96 void vsw_set_addrs(vsw_t *vswp);
97 int vsw_get_hw_maddr(vsw_t *);
98 mblk_t *vsw_tx_msg(vsw_t *, mblk_t *);
99 
100 /*
101  * Tunables used in this file.
102  */
103 extern int vsw_mac_open_retries;
104 extern boolean_t vsw_multi_ring_enable;
105 extern int vsw_mac_rx_rings;
106 
107 /*
108  * Check to see if the card supports the setting of multiple unicst
109  * addresses.
110  *
111  * Returns 0 if card supports the programming of multiple unicast addresses,
112  * otherwise returns 1.
113  */
114 int
115 vsw_get_hw_maddr(vsw_t *vswp)
116 {
117 	D1(vswp, "%s: enter", __func__);
118 
119 	ASSERT(MUTEX_HELD(&vswp->mac_lock));
120 
121 	if (vswp->mh == NULL)
122 		return (1);
123 
124 	if (!mac_capab_get(vswp->mh, MAC_CAPAB_MULTIADDRESS, &vswp->maddr)) {
125 		cmn_err(CE_WARN, "!vsw%d: device (%s) does not support "
126 		    "setting multiple unicast addresses", vswp->instance,
127 		    vswp->physname);
128 		return (1);
129 	}
130 
131 	D2(vswp, "%s: %d addrs : %d free", __func__,
132 	    vswp->maddr.maddr_naddr, vswp->maddr.maddr_naddrfree);
133 
134 	D1(vswp, "%s: exit", __func__);
135 
136 	return (0);
137 }
138 
139 /*
140  * Program unicast and multicast addresses of vsw interface and the ports
141  * into the physical device.
142  */
143 void
144 vsw_set_addrs(vsw_t *vswp)
145 {
146 	vsw_port_list_t	*plist = &vswp->plist;
147 	vsw_port_t	*port;
148 	mcst_addr_t	*mcap;
149 	int		rv;
150 
151 	READ_ENTER(&vswp->if_lockrw);
152 
153 	if (vswp->if_state & VSW_IF_UP) {
154 
155 		/* program unicst addr of vsw interface in the physdev */
156 		if (vswp->addr_set == VSW_ADDR_UNSET) {
157 			mutex_enter(&vswp->hw_lock);
158 			rv = vsw_set_hw(vswp, NULL, VSW_LOCALDEV);
159 			mutex_exit(&vswp->hw_lock);
160 			if (rv != 0) {
161 				cmn_err(CE_NOTE,
162 				    "!vsw%d: failed to program interface "
163 				    "unicast address\n", vswp->instance);
164 			}
165 			/*
166 			 * Notify the MAC layer of the changed address.
167 			 */
168 			mac_unicst_update(vswp->if_mh,
169 			    (uint8_t *)&vswp->if_addr);
170 		}
171 
172 		/* program mcast addrs of vsw interface in the physdev */
173 		mutex_enter(&vswp->mca_lock);
174 		mutex_enter(&vswp->mac_lock);
175 		for (mcap = vswp->mcap; mcap != NULL; mcap = mcap->nextp) {
176 			if (mcap->mac_added)
177 				continue;
178 			rv = mac_multicst_add(vswp->mh, (uchar_t *)&mcap->mca);
179 			if (rv == 0) {
180 				mcap->mac_added = B_TRUE;
181 			} else {
182 				cmn_err(CE_WARN, "!vsw%d: unable to add "
183 				    "multicast address: %s\n", vswp->instance,
184 				    ether_sprintf((void *)&mcap->mca));
185 			}
186 		}
187 		mutex_exit(&vswp->mac_lock);
188 		mutex_exit(&vswp->mca_lock);
189 
190 	}
191 
192 	RW_EXIT(&vswp->if_lockrw);
193 
194 	WRITE_ENTER(&plist->lockrw);
195 
196 	/* program unicast address of ports in the physical device */
197 	mutex_enter(&vswp->hw_lock);
198 	for (port = plist->head; port != NULL; port = port->p_next) {
199 		if (port->addr_set != VSW_ADDR_UNSET) /* addr already set */
200 			continue;
201 		if (vsw_set_hw(vswp, port, VSW_VNETPORT)) {
202 			cmn_err(CE_NOTE,
203 			    "!vsw%d: port:%d failed to set unicast address\n",
204 			    vswp->instance, port->p_instance);
205 		}
206 	}
207 	mutex_exit(&vswp->hw_lock);
208 
209 	/* program multicast addresses of ports in the physdev */
210 	for (port = plist->head; port != NULL; port = port->p_next) {
211 		mutex_enter(&port->mca_lock);
212 		mutex_enter(&vswp->mac_lock);
213 		for (mcap = port->mcap; mcap != NULL; mcap = mcap->nextp) {
214 			if (mcap->mac_added)
215 				continue;
216 			rv = mac_multicst_add(vswp->mh, (uchar_t *)&mcap->mca);
217 			if (rv == 0) {
218 				mcap->mac_added = B_TRUE;
219 			} else {
220 				cmn_err(CE_WARN, "!vsw%d: unable to add "
221 				    "multicast address: %s\n", vswp->instance,
222 				    ether_sprintf((void *)&mcap->mca));
223 			}
224 		}
225 		mutex_exit(&vswp->mac_lock);
226 		mutex_exit(&port->mca_lock);
227 	}
228 
229 	RW_EXIT(&plist->lockrw);
230 }
231 
232 /*
233  * Remove unicast and multicast addresses of vsw interface and the ports
234  * from the physical device.
235  */
236 void
237 vsw_unset_addrs(vsw_t *vswp)
238 {
239 	vsw_port_list_t	*plist = &vswp->plist;
240 	vsw_port_t	*port;
241 	mcst_addr_t	*mcap;
242 
243 	READ_ENTER(&vswp->if_lockrw);
244 
245 	if (vswp->if_state & VSW_IF_UP) {
246 
247 		/*
248 		 * Remove unicast addr of vsw interfce
249 		 * from current physdev
250 		 */
251 		mutex_enter(&vswp->hw_lock);
252 		(void) vsw_unset_hw(vswp, NULL, VSW_LOCALDEV);
253 		mutex_exit(&vswp->hw_lock);
254 
255 		/*
256 		 * Remove mcast addrs of vsw interface
257 		 * from current physdev
258 		 */
259 		mutex_enter(&vswp->mca_lock);
260 		mutex_enter(&vswp->mac_lock);
261 		for (mcap = vswp->mcap; mcap != NULL; mcap = mcap->nextp) {
262 			if (!mcap->mac_added)
263 				continue;
264 			(void) mac_multicst_remove(vswp->mh,
265 			    (uchar_t *)&mcap->mca);
266 			mcap->mac_added = B_FALSE;
267 		}
268 		mutex_exit(&vswp->mac_lock);
269 		mutex_exit(&vswp->mca_lock);
270 
271 	}
272 
273 	RW_EXIT(&vswp->if_lockrw);
274 
275 	WRITE_ENTER(&plist->lockrw);
276 
277 	/*
278 	 * Remove unicast address of ports from the current physical device
279 	 */
280 	mutex_enter(&vswp->hw_lock);
281 	for (port = plist->head; port != NULL; port = port->p_next) {
282 		/* Remove address if was programmed into HW. */
283 		if (port->addr_set == VSW_ADDR_UNSET)
284 			continue;
285 		(void) vsw_unset_hw(vswp, port, VSW_VNETPORT);
286 	}
287 	mutex_exit(&vswp->hw_lock);
288 
289 	/* Remove multicast addresses of ports from the current physdev */
290 	for (port = plist->head; port != NULL; port = port->p_next) {
291 		mutex_enter(&port->mca_lock);
292 		mutex_enter(&vswp->mac_lock);
293 		for (mcap = port->mcap; mcap != NULL; mcap = mcap->nextp) {
294 			if (!mcap->mac_added)
295 				continue;
296 			(void) mac_multicst_remove(vswp->mh,
297 			    (uchar_t *)&mcap->mca);
298 			mcap->mac_added = B_FALSE;
299 		}
300 		mutex_exit(&vswp->mac_lock);
301 		mutex_exit(&port->mca_lock);
302 	}
303 
304 	RW_EXIT(&plist->lockrw);
305 }
306 
307 /*
308  * Open the underlying physical device for access in layer2 mode.
309  * Returns:
310  * 0 on success
311  * EAGAIN if mac_open() fails due to the device being not available yet.
312  * EIO on any other failures.
313  */
314 int
315 vsw_mac_open(vsw_t *vswp)
316 {
317 	int	rv;
318 
319 	ASSERT(MUTEX_HELD(&vswp->mac_lock));
320 
321 	if (vswp->mh != NULL) {
322 		/* already open */
323 		return (0);
324 	}
325 
326 	if (vswp->mac_open_retries++ >= vsw_mac_open_retries) {
327 		/* exceeded max retries */
328 		return (EIO);
329 	}
330 
331 	rv = mac_open(vswp->physname, &vswp->mh);
332 	if (rv != 0) {
333 		/*
334 		 * If mac_open() failed and the error indicates that the
335 		 * device is not available yet, then, we return EAGAIN to
336 		 * indicate that it needs to be retried.
337 		 * For example, this may happen during boot up, as the
338 		 * required link aggregation groups(devices) have not been
339 		 * created yet.
340 		 */
341 		if (rv == ENOENT) {
342 			return (EAGAIN);
343 		} else {
344 			cmn_err(CE_WARN, "vsw%d: mac_open %s failed rv:%x",
345 			    vswp->instance, vswp->physname, rv);
346 			return (EIO);
347 		}
348 	}
349 
350 	vswp->mac_open_retries = 0;
351 
352 	return (0);
353 }
354 
355 /*
356  * Close the underlying physical device.
357  */
358 void
359 vsw_mac_close(vsw_t *vswp)
360 {
361 	ASSERT(MUTEX_HELD(&vswp->mac_lock));
362 
363 	if (vswp->mh != NULL) {
364 		mac_close(vswp->mh);
365 		vswp->mh = NULL;
366 	}
367 }
368 
369 /*
370  * Link into the MAC layer to gain access to the services provided by
371  * the underlying physical device driver (which should also have
372  * registered with the MAC layer).
373  *
374  * Only when in layer 2 mode.
375  */
376 int
377 vsw_mac_attach(vsw_t *vswp)
378 {
379 	D1(vswp, "%s: enter", __func__);
380 
381 	ASSERT(vswp->mrh == NULL);
382 	ASSERT(vswp->mstarted == B_FALSE);
383 	ASSERT(vswp->mresources == B_FALSE);
384 
385 	ASSERT(MUTEX_HELD(&vswp->mac_lock));
386 
387 	ASSERT(vswp->mh != NULL);
388 
389 	D2(vswp, "vsw_mac_attach: using device %s", vswp->physname);
390 
391 	if (vsw_multi_ring_enable) {
392 		/*
393 		 * Initialize the ring table.
394 		 */
395 		vsw_mac_ring_tbl_init(vswp);
396 
397 		/*
398 		 * Register our rx callback function.
399 		 */
400 		vswp->mrh = mac_rx_add(vswp->mh,
401 		    vsw_rx_queue_cb, (void *)vswp);
402 		ASSERT(vswp->mrh != NULL);
403 
404 		/*
405 		 * Register our mac resource callback.
406 		 */
407 		mac_resource_set(vswp->mh, vsw_mac_ring_add_cb, (void *)vswp);
408 		vswp->mresources = B_TRUE;
409 
410 		/*
411 		 * Get the ring resources available to us from
412 		 * the mac below us.
413 		 */
414 		mac_resources(vswp->mh);
415 	} else {
416 		/*
417 		 * Just register our rx callback function
418 		 */
419 		vswp->mrh = mac_rx_add(vswp->mh, vsw_rx_cb, (void *)vswp);
420 		ASSERT(vswp->mrh != NULL);
421 	}
422 
423 	/* Get the MAC tx fn */
424 	vswp->txinfo = mac_tx_get(vswp->mh);
425 
426 	/* start the interface */
427 	if (mac_start(vswp->mh) != 0) {
428 		cmn_err(CE_WARN, "!vsw%d: Could not start mac interface",
429 		    vswp->instance);
430 		goto mac_fail_exit;
431 	}
432 
433 	vswp->mstarted = B_TRUE;
434 
435 	D1(vswp, "%s: exit", __func__);
436 	return (0);
437 
438 mac_fail_exit:
439 	vsw_mac_detach(vswp);
440 
441 	D1(vswp, "%s: exit", __func__);
442 	return (1);
443 }
444 
445 void
446 vsw_mac_detach(vsw_t *vswp)
447 {
448 	D1(vswp, "vsw_mac_detach: enter");
449 
450 	ASSERT(vswp != NULL);
451 	ASSERT(MUTEX_HELD(&vswp->mac_lock));
452 
453 	if (vsw_multi_ring_enable) {
454 		vsw_mac_ring_tbl_destroy(vswp);
455 	}
456 
457 	if (vswp->mh != NULL) {
458 		if (vswp->mstarted)
459 			mac_stop(vswp->mh);
460 		if (vswp->mrh != NULL)
461 			mac_rx_remove(vswp->mh, vswp->mrh, B_TRUE);
462 		if (vswp->mresources)
463 			mac_resource_set(vswp->mh, NULL, NULL);
464 	}
465 
466 	vswp->mrh = NULL;
467 	vswp->txinfo = NULL;
468 	vswp->mstarted = B_FALSE;
469 
470 	D1(vswp, "vsw_mac_detach: exit");
471 }
472 
473 /*
474  * Depending on the mode specified, the capabilites and capacity
475  * of the underlying device setup the physical device.
476  *
477  * If in layer 3 mode, then do nothing.
478  *
479  * If in layer 2 programmed mode attempt to program the unicast address
480  * associated with the port into the physical device. If this is not
481  * possible due to resource exhaustion or simply because the device does
482  * not support multiple unicast addresses then if required fallback onto
483  * putting the card into promisc mode.
484  *
485  * If in promisc mode then simply set the card into promisc mode.
486  *
487  * Returns 0 success, 1 on failure.
488  */
489 int
490 vsw_set_hw(vsw_t *vswp, vsw_port_t *port, int type)
491 {
492 	mac_multi_addr_t	mac_addr;
493 	int			err;
494 
495 	D1(vswp, "%s: enter", __func__);
496 
497 	ASSERT(MUTEX_HELD(&vswp->hw_lock));
498 	ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));
499 
500 	if (vswp->smode[vswp->smode_idx] == VSW_LAYER3)
501 		return (0);
502 
503 	if (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC) {
504 		return (vsw_set_hw_promisc(vswp, port, type));
505 	}
506 
507 	/*
508 	 * Attempt to program the unicast address into the HW.
509 	 */
510 	mac_addr.mma_addrlen = ETHERADDRL;
511 	if (type == VSW_VNETPORT) {
512 		ASSERT(port != NULL);
513 		ether_copy(&port->p_macaddr, &mac_addr.mma_addr);
514 	} else {
515 		ether_copy(&vswp->if_addr, &mac_addr.mma_addr);
516 	}
517 
518 	err = vsw_set_hw_addr(vswp, &mac_addr);
519 	if (err == ENOSPC) {
520 		/*
521 		 * Mark that attempt should be made to re-config sometime
522 		 * in future if a port is deleted.
523 		 */
524 		vswp->recfg_reqd = B_TRUE;
525 
526 		/*
527 		 * Only 1 mode specified, nothing more to do.
528 		 */
529 		if (vswp->smode_num == 1)
530 			return (err);
531 
532 		/*
533 		 * If promiscuous was next mode specified try to
534 		 * set the card into that mode.
535 		 */
536 		if ((vswp->smode_idx <= (vswp->smode_num - 2)) &&
537 		    (vswp->smode[vswp->smode_idx + 1] ==
538 		    VSW_LAYER2_PROMISC)) {
539 			vswp->smode_idx += 1;
540 			return (vsw_set_hw_promisc(vswp, port, type));
541 		}
542 		return (err);
543 	}
544 
545 	if (err != 0)
546 		return (err);
547 
548 	if (type == VSW_VNETPORT) {
549 		port->addr_slot = mac_addr.mma_slot;
550 		port->addr_set = VSW_ADDR_HW;
551 	} else {
552 		vswp->addr_slot = mac_addr.mma_slot;
553 		vswp->addr_set = VSW_ADDR_HW;
554 	}
555 
556 	D2(vswp, "programmed addr %s into slot %d "
557 	"of device %s", ether_sprintf((void *)mac_addr.mma_addr),
558 	    mac_addr.mma_slot, vswp->physname);
559 
560 	D1(vswp, "%s: exit", __func__);
561 
562 	return (0);
563 }
564 
565 /*
566  * If in layer 3 mode do nothing.
567  *
568  * If in layer 2 switched mode remove the address from the physical
569  * device.
570  *
571  * If in layer 2 promiscuous mode disable promisc mode.
572  *
573  * Returns 0 on success.
574  */
575 int
576 vsw_unset_hw(vsw_t *vswp, vsw_port_t *port, int type)
577 {
578 	mac_addr_slot_t	slot;
579 	int		rv;
580 
581 	D1(vswp, "%s: enter", __func__);
582 
583 	ASSERT(MUTEX_HELD(&vswp->hw_lock));
584 
585 	if (vswp->smode[vswp->smode_idx] == VSW_LAYER3)
586 		return (0);
587 
588 	switch (type) {
589 	case VSW_VNETPORT:
590 		ASSERT(port != NULL);
591 
592 		if (port->addr_set == VSW_ADDR_PROMISC) {
593 			return (vsw_unset_hw_promisc(vswp, port, type));
594 
595 		} else if (port->addr_set == VSW_ADDR_HW) {
596 			slot = port->addr_slot;
597 			if ((rv = vsw_unset_hw_addr(vswp, slot)) == 0)
598 				port->addr_set = VSW_ADDR_UNSET;
599 		}
600 
601 		break;
602 
603 	case VSW_LOCALDEV:
604 		if (vswp->addr_set == VSW_ADDR_PROMISC) {
605 			return (vsw_unset_hw_promisc(vswp, NULL, type));
606 
607 		} else if (vswp->addr_set == VSW_ADDR_HW) {
608 			slot = vswp->addr_slot;
609 			if ((rv = vsw_unset_hw_addr(vswp, slot)) == 0)
610 				vswp->addr_set = VSW_ADDR_UNSET;
611 		}
612 
613 		break;
614 
615 	default:
616 		/* should never happen */
617 		DERR(vswp, "%s: unknown type %d", __func__, type);
618 		ASSERT(0);
619 		return (1);
620 	}
621 
622 	D1(vswp, "%s: exit", __func__);
623 	return (rv);
624 }
625 
626 /*
627  * Attempt to program a unicast address into HW.
628  *
629  * Returns 0 on sucess, 1 on failure.
630  */
631 static int
632 vsw_set_hw_addr(vsw_t *vswp, mac_multi_addr_t *mac)
633 {
634 	void	*mah;
635 	int	rv = EINVAL;
636 
637 	D1(vswp, "%s: enter", __func__);
638 
639 	ASSERT(MUTEX_HELD(&vswp->hw_lock));
640 
641 	if (vswp->maddr.maddr_handle == NULL)
642 		return (rv);
643 
644 	mah = vswp->maddr.maddr_handle;
645 
646 	rv = vswp->maddr.maddr_add(mah, mac);
647 
648 	if (rv == 0)
649 		return (rv);
650 
651 	/*
652 	 * Its okay for the add to fail because we have exhausted
653 	 * all the resouces in the hardware device. Any other error
654 	 * we want to flag.
655 	 */
656 	if (rv != ENOSPC) {
657 		cmn_err(CE_WARN, "!vsw%d: error programming "
658 		    "address %s into HW err (%d)",
659 		    vswp->instance, ether_sprintf((void *)mac->mma_addr), rv);
660 	}
661 	D1(vswp, "%s: exit", __func__);
662 	return (rv);
663 }
664 
665 /*
666  * Remove a unicast mac address which has previously been programmed
667  * into HW.
668  *
669  * Returns 0 on sucess, 1 on failure.
670  */
671 static int
672 vsw_unset_hw_addr(vsw_t *vswp, int slot)
673 {
674 	void	*mah;
675 	int	rv;
676 
677 	D1(vswp, "%s: enter", __func__);
678 
679 	ASSERT(MUTEX_HELD(&vswp->hw_lock));
680 	ASSERT(slot >= 0);
681 
682 	if (vswp->maddr.maddr_handle == NULL)
683 		return (1);
684 
685 	mah = vswp->maddr.maddr_handle;
686 
687 	rv = vswp->maddr.maddr_remove(mah, slot);
688 	if (rv != 0) {
689 		cmn_err(CE_WARN, "!vsw%d: unable to remove address "
690 		    "from slot %d in device %s (err %d)",
691 		    vswp->instance, slot, vswp->physname, rv);
692 		return (1);
693 	}
694 
695 	D2(vswp, "removed addr from slot %d in device %s",
696 	    slot, vswp->physname);
697 
698 	D1(vswp, "%s: exit", __func__);
699 	return (0);
700 }
701 
702 /*
703  * Set network card into promisc mode.
704  *
705  * Returns 0 on success, 1 on failure.
706  */
707 static int
708 vsw_set_hw_promisc(vsw_t *vswp, vsw_port_t *port, int type)
709 {
710 	D1(vswp, "%s: enter", __func__);
711 
712 	ASSERT(MUTEX_HELD(&vswp->hw_lock));
713 	ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));
714 
715 	mutex_enter(&vswp->mac_lock);
716 	if (vswp->mh == NULL) {
717 		mutex_exit(&vswp->mac_lock);
718 		return (1);
719 	}
720 
721 	if (vswp->promisc_cnt++ == 0) {
722 		if (mac_promisc_set(vswp->mh, B_TRUE, MAC_DEVPROMISC) != 0) {
723 			vswp->promisc_cnt--;
724 			mutex_exit(&vswp->mac_lock);
725 			return (1);
726 		}
727 		cmn_err(CE_NOTE, "!vsw%d: switching device %s into "
728 		    "promiscuous mode", vswp->instance, vswp->physname);
729 	}
730 	mutex_exit(&vswp->mac_lock);
731 
732 	if (type == VSW_VNETPORT) {
733 		ASSERT(port != NULL);
734 		port->addr_set = VSW_ADDR_PROMISC;
735 	} else {
736 		vswp->addr_set = VSW_ADDR_PROMISC;
737 	}
738 
739 	D1(vswp, "%s: exit", __func__);
740 
741 	return (0);
742 }
743 
744 /*
745  * Turn off promiscuous mode on network card.
746  *
747  * Returns 0 on success, 1 on failure.
748  */
749 static int
750 vsw_unset_hw_promisc(vsw_t *vswp, vsw_port_t *port, int type)
751 {
752 	vsw_port_list_t 	*plist = &vswp->plist;
753 
754 	D2(vswp, "%s: enter", __func__);
755 
756 	ASSERT(MUTEX_HELD(&vswp->hw_lock));
757 	ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));
758 
759 	mutex_enter(&vswp->mac_lock);
760 	if (vswp->mh == NULL) {
761 		mutex_exit(&vswp->mac_lock);
762 		return (1);
763 	}
764 
765 	if (--vswp->promisc_cnt == 0) {
766 		if (mac_promisc_set(vswp->mh, B_FALSE, MAC_DEVPROMISC) != 0) {
767 			vswp->promisc_cnt++;
768 			mutex_exit(&vswp->mac_lock);
769 			return (1);
770 		}
771 
772 		/*
773 		 * We are exiting promisc mode either because we were
774 		 * only in promisc mode because we had failed over from
775 		 * switched mode due to HW resource issues, or the user
776 		 * wanted the card in promisc mode for all the ports and
777 		 * the last port is now being deleted. Tweak the message
778 		 * accordingly.
779 		 */
780 		if (plist->num_ports != 0) {
781 			cmn_err(CE_NOTE, "!vsw%d: switching device %s back to "
782 			    "programmed mode", vswp->instance, vswp->physname);
783 		} else {
784 			cmn_err(CE_NOTE, "!vsw%d: switching device %s out of "
785 			    "promiscuous mode", vswp->instance, vswp->physname);
786 		}
787 	}
788 	mutex_exit(&vswp->mac_lock);
789 
790 	if (type == VSW_VNETPORT) {
791 		ASSERT(port != NULL);
792 		ASSERT(port->addr_set == VSW_ADDR_PROMISC);
793 		port->addr_set = VSW_ADDR_UNSET;
794 	} else {
795 		ASSERT(vswp->addr_set == VSW_ADDR_PROMISC);
796 		vswp->addr_set = VSW_ADDR_UNSET;
797 	}
798 
799 	D1(vswp, "%s: exit", __func__);
800 	return (0);
801 }
802 
803 /*
804  * Determine whether or not we are operating in our prefered
805  * mode and if not whether the physical resources now allow us
806  * to operate in it.
807  *
808  * If a port is being removed should only be invoked after port has been
809  * removed from the port list.
810  */
811 void
812 vsw_reconfig_hw(vsw_t *vswp)
813 {
814 	int			s_idx;
815 
816 	D1(vswp, "%s: enter", __func__);
817 
818 	ASSERT(MUTEX_HELD(&vswp->hw_lock));
819 
820 	if (vswp->maddr.maddr_handle == NULL) {
821 		return;
822 	}
823 
824 	/*
825 	 * If we are in layer 2 (i.e. switched) or would like to be
826 	 * in layer 2 then check if any ports or the vswitch itself
827 	 * need to be programmed into the HW.
828 	 *
829 	 * This can happen in two cases - switched was specified as
830 	 * the prefered mode of operation but we exhausted the HW
831 	 * resources and so failed over to the next specifed mode,
832 	 * or switched was the only mode specified so after HW
833 	 * resources were exhausted there was nothing more we
834 	 * could do.
835 	 */
836 	if (vswp->smode_idx > 0)
837 		s_idx = vswp->smode_idx - 1;
838 	else
839 		s_idx = vswp->smode_idx;
840 
841 	if (vswp->smode[s_idx] != VSW_LAYER2) {
842 		return;
843 	}
844 
845 	D2(vswp, "%s: attempting reconfig..", __func__);
846 
847 	/*
848 	 * First, attempt to set the vswitch mac address into HW,
849 	 * if required.
850 	 */
851 	if (vsw_prog_if(vswp)) {
852 		return;
853 	}
854 
855 	/*
856 	 * Next, attempt to set any ports which have not yet been
857 	 * programmed into HW.
858 	 */
859 	if (vsw_prog_ports(vswp)) {
860 		return;
861 	}
862 
863 	/*
864 	 * By now we know that have programmed all desired ports etc
865 	 * into HW, so safe to mark reconfiguration as complete.
866 	 */
867 	vswp->recfg_reqd = B_FALSE;
868 
869 	vswp->smode_idx = s_idx;
870 
871 	D1(vswp, "%s: exit", __func__);
872 }
873 
874 /*
875  * Check to see if vsw itself is plumbed, and if so whether or not
876  * its mac address should be written into HW.
877  *
878  * Returns 0 if could set address, or didn't have to set it.
879  * Returns 1 if failed to set address.
880  */
881 static int
882 vsw_prog_if(vsw_t *vswp)
883 {
884 	mac_multi_addr_t	addr;
885 
886 	D1(vswp, "%s: enter", __func__);
887 
888 	ASSERT(MUTEX_HELD(&vswp->hw_lock));
889 
890 	READ_ENTER(&vswp->if_lockrw);
891 	if ((vswp->if_state & VSW_IF_UP) &&
892 	    (vswp->addr_set != VSW_ADDR_HW)) {
893 
894 		addr.mma_addrlen = ETHERADDRL;
895 		ether_copy(&vswp->if_addr, &addr.mma_addr);
896 
897 		if (vsw_set_hw_addr(vswp, &addr) != 0) {
898 			RW_EXIT(&vswp->if_lockrw);
899 			return (1);
900 		}
901 
902 		vswp->addr_slot = addr.mma_slot;
903 
904 		/*
905 		 * If previously when plumbed had had to place
906 		 * interface into promisc mode, now reverse that.
907 		 *
908 		 * Note that interface will only actually be set into
909 		 * non-promisc mode when last port/interface has been
910 		 * programmed into HW.
911 		 */
912 		if (vswp->addr_set == VSW_ADDR_PROMISC)
913 			(void) vsw_unset_hw_promisc(vswp, NULL, VSW_LOCALDEV);
914 
915 		vswp->addr_set = VSW_ADDR_HW;
916 	}
917 	RW_EXIT(&vswp->if_lockrw);
918 
919 	D1(vswp, "%s: exit", __func__);
920 	return (0);
921 }
922 
923 /*
924  * Scan the port list for any ports which have not yet been set
925  * into HW. For those found attempt to program their mac addresses
926  * into the physical device.
927  *
928  * Returns 0 if able to program all required ports (can be 0) into HW.
929  * Returns 1 if failed to set at least one mac address.
930  */
931 static int
932 vsw_prog_ports(vsw_t *vswp)
933 {
934 	mac_multi_addr_t	addr;
935 	vsw_port_list_t		*plist = &vswp->plist;
936 	vsw_port_t		*tp;
937 	int			rv = 0;
938 
939 	D1(vswp, "%s: enter", __func__);
940 
941 	ASSERT(MUTEX_HELD(&vswp->hw_lock));
942 
943 	READ_ENTER(&plist->lockrw);
944 	for (tp = plist->head; tp != NULL; tp = tp->p_next) {
945 		if (tp->addr_set != VSW_ADDR_HW) {
946 			addr.mma_addrlen = ETHERADDRL;
947 			ether_copy(&tp->p_macaddr, &addr.mma_addr);
948 
949 			if (vsw_set_hw_addr(vswp, &addr) != 0) {
950 				rv = 1;
951 				break;
952 			}
953 
954 			tp->addr_slot = addr.mma_slot;
955 
956 			/*
957 			 * If when this port had first attached we had
958 			 * had to place the interface into promisc mode,
959 			 * then now reverse that.
960 			 *
961 			 * Note that the interface will not actually
962 			 * change to non-promisc mode until all ports
963 			 * have been programmed.
964 			 */
965 			if (tp->addr_set == VSW_ADDR_PROMISC)
966 				(void) vsw_unset_hw_promisc(vswp,
967 				    tp, VSW_VNETPORT);
968 
969 			tp->addr_set = VSW_ADDR_HW;
970 		}
971 	}
972 	RW_EXIT(&plist->lockrw);
973 
974 	D1(vswp, "%s: exit", __func__);
975 	return (rv);
976 }
977 
978 static void
979 vsw_mac_ring_tbl_entry_init(vsw_t *vswp, vsw_mac_ring_t *ringp)
980 {
981 	ringp->ring_state = VSW_MAC_RING_FREE;
982 	ringp->ring_arg = NULL;
983 	ringp->ring_blank = NULL;
984 	ringp->ring_vqp = NULL;
985 	ringp->ring_vswp = vswp;
986 }
987 
988 static void
989 vsw_mac_ring_tbl_init(vsw_t *vswp)
990 {
991 	int		i;
992 
993 	mutex_init(&vswp->mac_ring_lock, NULL, MUTEX_DRIVER, NULL);
994 
995 	vswp->mac_ring_tbl_sz = vsw_mac_rx_rings;
996 	vswp->mac_ring_tbl  =
997 	    kmem_alloc(vsw_mac_rx_rings * sizeof (vsw_mac_ring_t), KM_SLEEP);
998 
999 	for (i = 0; i < vswp->mac_ring_tbl_sz; i++)
1000 		vsw_mac_ring_tbl_entry_init(vswp, &vswp->mac_ring_tbl[i]);
1001 }
1002 
1003 static void
1004 vsw_mac_ring_tbl_destroy(vsw_t *vswp)
1005 {
1006 	int		i;
1007 	vsw_mac_ring_t	*ringp;
1008 
1009 	mutex_enter(&vswp->mac_ring_lock);
1010 	for (i = 0; i < vswp->mac_ring_tbl_sz; i++) {
1011 		ringp = &vswp->mac_ring_tbl[i];
1012 
1013 		if (ringp->ring_state != VSW_MAC_RING_FREE) {
1014 			/*
1015 			 * Destroy the queue.
1016 			 */
1017 			vsw_queue_stop(ringp->ring_vqp);
1018 			vsw_queue_destroy(ringp->ring_vqp);
1019 
1020 			/*
1021 			 * Re-initialize the structure.
1022 			 */
1023 			vsw_mac_ring_tbl_entry_init(vswp, ringp);
1024 		}
1025 	}
1026 	mutex_exit(&vswp->mac_ring_lock);
1027 
1028 	mutex_destroy(&vswp->mac_ring_lock);
1029 	kmem_free(vswp->mac_ring_tbl,
1030 	    vswp->mac_ring_tbl_sz * sizeof (vsw_mac_ring_t));
1031 	vswp->mac_ring_tbl_sz = 0;
1032 }
1033 
1034 /*
1035  * Handle resource add callbacks from the driver below.
1036  */
1037 static mac_resource_handle_t
1038 vsw_mac_ring_add_cb(void *arg, mac_resource_t *mrp)
1039 {
1040 	vsw_t		*vswp = (vsw_t *)arg;
1041 	mac_rx_fifo_t	*mrfp = (mac_rx_fifo_t *)mrp;
1042 	vsw_mac_ring_t	*ringp;
1043 	vsw_queue_t	*vqp;
1044 	int		i;
1045 
1046 	ASSERT(vswp != NULL);
1047 	ASSERT(mrp != NULL);
1048 	ASSERT(vswp->mac_ring_tbl != NULL);
1049 
1050 	D1(vswp, "%s: enter", __func__);
1051 
1052 	/*
1053 	 * Check to make sure we have the correct resource type.
1054 	 */
1055 	if (mrp->mr_type != MAC_RX_FIFO)
1056 		return (NULL);
1057 
1058 	/*
1059 	 * Find a open entry in the ring table.
1060 	 */
1061 	mutex_enter(&vswp->mac_ring_lock);
1062 	for (i = 0; i < vswp->mac_ring_tbl_sz; i++) {
1063 		ringp = &vswp->mac_ring_tbl[i];
1064 
1065 		/*
1066 		 * Check for an empty slot, if found, then setup queue
1067 		 * and thread.
1068 		 */
1069 		if (ringp->ring_state == VSW_MAC_RING_FREE) {
1070 			/*
1071 			 * Create the queue for this ring.
1072 			 */
1073 			vqp = vsw_queue_create();
1074 
1075 			/*
1076 			 * Initialize the ring data structure.
1077 			 */
1078 			ringp->ring_vqp = vqp;
1079 			ringp->ring_arg = mrfp->mrf_arg;
1080 			ringp->ring_blank = mrfp->mrf_blank;
1081 			ringp->ring_state = VSW_MAC_RING_INUSE;
1082 
1083 			/*
1084 			 * Create the worker thread.
1085 			 */
1086 			vqp->vq_worker = thread_create(NULL, 0,
1087 			    vsw_queue_worker, ringp, 0, &p0,
1088 			    TS_RUN, minclsyspri);
1089 			if (vqp->vq_worker == NULL) {
1090 				vsw_queue_destroy(vqp);
1091 				vsw_mac_ring_tbl_entry_init(vswp, ringp);
1092 				ringp = NULL;
1093 			}
1094 
1095 			if (ringp != NULL) {
1096 				/*
1097 				 * Make sure thread get's running state for
1098 				 * this ring.
1099 				 */
1100 				mutex_enter(&vqp->vq_lock);
1101 				while ((vqp->vq_state != VSW_QUEUE_RUNNING) &&
1102 				    (vqp->vq_state != VSW_QUEUE_DRAINED)) {
1103 					cv_wait(&vqp->vq_cv, &vqp->vq_lock);
1104 				}
1105 
1106 				/*
1107 				 * If the thread is not running, cleanup.
1108 				 */
1109 				if (vqp->vq_state == VSW_QUEUE_DRAINED) {
1110 					vsw_queue_destroy(vqp);
1111 					vsw_mac_ring_tbl_entry_init(vswp,
1112 					    ringp);
1113 					ringp = NULL;
1114 				}
1115 				mutex_exit(&vqp->vq_lock);
1116 			}
1117 
1118 			mutex_exit(&vswp->mac_ring_lock);
1119 			D1(vswp, "%s: exit", __func__);
1120 			return ((mac_resource_handle_t)ringp);
1121 		}
1122 	}
1123 	mutex_exit(&vswp->mac_ring_lock);
1124 
1125 	/*
1126 	 * No slots in the ring table available.
1127 	 */
1128 	D1(vswp, "%s: exit", __func__);
1129 	return (NULL);
1130 }
1131 
1132 static void
1133 vsw_queue_stop(vsw_queue_t *vqp)
1134 {
1135 	mutex_enter(&vqp->vq_lock);
1136 
1137 	if (vqp->vq_state == VSW_QUEUE_RUNNING) {
1138 		vqp->vq_state = VSW_QUEUE_STOP;
1139 		cv_signal(&vqp->vq_cv);
1140 
1141 		while (vqp->vq_state != VSW_QUEUE_DRAINED)
1142 			cv_wait(&vqp->vq_cv, &vqp->vq_lock);
1143 	}
1144 
1145 	vqp->vq_state = VSW_QUEUE_STOPPED;
1146 
1147 	mutex_exit(&vqp->vq_lock);
1148 }
1149 
1150 static vsw_queue_t *
1151 vsw_queue_create()
1152 {
1153 	vsw_queue_t *vqp;
1154 
1155 	vqp = kmem_zalloc(sizeof (vsw_queue_t), KM_SLEEP);
1156 
1157 	mutex_init(&vqp->vq_lock, NULL, MUTEX_DRIVER, NULL);
1158 	cv_init(&vqp->vq_cv, NULL, CV_DRIVER, NULL);
1159 	vqp->vq_first = NULL;
1160 	vqp->vq_last = NULL;
1161 	vqp->vq_state = VSW_QUEUE_STOPPED;
1162 
1163 	return (vqp);
1164 }
1165 
1166 static void
1167 vsw_queue_destroy(vsw_queue_t *vqp)
1168 {
1169 	cv_destroy(&vqp->vq_cv);
1170 	mutex_destroy(&vqp->vq_lock);
1171 	kmem_free(vqp, sizeof (vsw_queue_t));
1172 }
1173 
1174 static void
1175 vsw_queue_worker(vsw_mac_ring_t *rrp)
1176 {
1177 	mblk_t		*mp;
1178 	vsw_queue_t	*vqp = rrp->ring_vqp;
1179 	vsw_t		*vswp = rrp->ring_vswp;
1180 
1181 	mutex_enter(&vqp->vq_lock);
1182 
1183 	ASSERT(vqp->vq_state == VSW_QUEUE_STOPPED);
1184 
1185 	/*
1186 	 * Set the state to running, since the thread is now active.
1187 	 */
1188 	vqp->vq_state = VSW_QUEUE_RUNNING;
1189 	cv_signal(&vqp->vq_cv);
1190 
1191 	while (vqp->vq_state == VSW_QUEUE_RUNNING) {
1192 		/*
1193 		 * Wait for work to do or the state has changed
1194 		 * to not running.
1195 		 */
1196 		while ((vqp->vq_state == VSW_QUEUE_RUNNING) &&
1197 		    (vqp->vq_first == NULL)) {
1198 			cv_wait(&vqp->vq_cv, &vqp->vq_lock);
1199 		}
1200 
1201 		/*
1202 		 * Process packets that we received from the interface.
1203 		 */
1204 		if (vqp->vq_first != NULL) {
1205 			mp = vqp->vq_first;
1206 
1207 			vqp->vq_first = NULL;
1208 			vqp->vq_last = NULL;
1209 
1210 			mutex_exit(&vqp->vq_lock);
1211 
1212 			/* switch the chain of packets received */
1213 			vswp->vsw_switch_frame(vswp, mp,
1214 			    VSW_PHYSDEV, NULL, NULL);
1215 
1216 			mutex_enter(&vqp->vq_lock);
1217 		}
1218 	}
1219 
1220 	/*
1221 	 * We are drained and signal we are done.
1222 	 */
1223 	vqp->vq_state = VSW_QUEUE_DRAINED;
1224 	cv_signal(&vqp->vq_cv);
1225 
1226 	/*
1227 	 * Exit lock and drain the remaining packets.
1228 	 */
1229 	mutex_exit(&vqp->vq_lock);
1230 
1231 	/*
1232 	 * Exit the thread
1233 	 */
1234 	thread_exit();
1235 }
1236 
1237 /*
1238  * static void
1239  * vsw_rx_queue_cb() - Receive callback routine when
1240  *	vsw_multi_ring_enable is non-zero.  Queue the packets
1241  *	to a packet queue for a worker thread to process.
1242  */
1243 static void
1244 vsw_rx_queue_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp)
1245 {
1246 	vsw_mac_ring_t	*ringp = (vsw_mac_ring_t *)mrh;
1247 	vsw_t		*vswp = (vsw_t *)arg;
1248 	vsw_queue_t	*vqp;
1249 	mblk_t		*bp, *last;
1250 
1251 	ASSERT(mrh != NULL);
1252 	ASSERT(vswp != NULL);
1253 	ASSERT(mp != NULL);
1254 
1255 	D1(vswp, "%s: enter", __func__);
1256 
1257 	/*
1258 	 * Find the last element in the mblk chain.
1259 	 */
1260 	bp = mp;
1261 	do {
1262 		last = bp;
1263 		bp = bp->b_next;
1264 	} while (bp != NULL);
1265 
1266 	/* Get the queue for the packets */
1267 	vqp = ringp->ring_vqp;
1268 
1269 	/*
1270 	 * Grab the lock such we can queue the packets.
1271 	 */
1272 	mutex_enter(&vqp->vq_lock);
1273 
1274 	if (vqp->vq_state != VSW_QUEUE_RUNNING) {
1275 		freemsgchain(mp);
1276 		mutex_exit(&vqp->vq_lock);
1277 		goto vsw_rx_queue_cb_exit;
1278 	}
1279 
1280 	/*
1281 	 * Add the mblk chain to the queue.  If there
1282 	 * is some mblks in the queue, then add the new
1283 	 * chain to the end.
1284 	 */
1285 	if (vqp->vq_first == NULL)
1286 		vqp->vq_first = mp;
1287 	else
1288 		vqp->vq_last->b_next = mp;
1289 
1290 	vqp->vq_last = last;
1291 
1292 	/*
1293 	 * Signal the worker thread that there is work to
1294 	 * do.
1295 	 */
1296 	cv_signal(&vqp->vq_cv);
1297 
1298 	/*
1299 	 * Let go of the lock and exit.
1300 	 */
1301 	mutex_exit(&vqp->vq_lock);
1302 
1303 vsw_rx_queue_cb_exit:
1304 	D1(vswp, "%s: exit", __func__);
1305 }
1306 
1307 /*
1308  * receive callback routine. Invoked by MAC layer when there
1309  * are pkts being passed up from physical device.
1310  *
1311  * PERF: It may be more efficient when the card is in promisc
1312  * mode to check the dest address of the pkts here (against
1313  * the FDB) rather than checking later. Needs to be investigated.
1314  */
1315 static void
1316 vsw_rx_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp)
1317 {
1318 	_NOTE(ARGUNUSED(mrh))
1319 
1320 	vsw_t		*vswp = (vsw_t *)arg;
1321 
1322 	ASSERT(vswp != NULL);
1323 
1324 	D1(vswp, "vsw_rx_cb: enter");
1325 
1326 	/* switch the chain of packets received */
1327 	vswp->vsw_switch_frame(vswp, mp, VSW_PHYSDEV, NULL, NULL);
1328 
1329 	D1(vswp, "vsw_rx_cb: exit");
1330 }
1331 
1332 /*
1333  * Send a message out over the physical device via the MAC layer.
1334  *
1335  * Returns any mblks that it was unable to transmit.
1336  */
1337 mblk_t *
1338 vsw_tx_msg(vsw_t *vswp, mblk_t *mp)
1339 {
1340 	const mac_txinfo_t	*mtp;
1341 
1342 	mutex_enter(&vswp->mac_lock);
1343 	if ((vswp->mh == NULL) || (vswp->mstarted == B_FALSE)) {
1344 
1345 		DERR(vswp, "vsw_tx_msg: dropping pkts: no tx routine avail");
1346 		mutex_exit(&vswp->mac_lock);
1347 		return (mp);
1348 	} else {
1349 		mtp = vswp->txinfo;
1350 		mp = mtp->mt_fn(mtp->mt_arg, mp);
1351 	}
1352 	mutex_exit(&vswp->mac_lock);
1353 
1354 	return (mp);
1355 }
1356