xref: /titanic_50/usr/src/uts/sun4v/io/vsw_phys.c (revision 15a2c75300554c829663ead6bc3489d86c83cde2)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/errno.h>
31 #include <sys/debug.h>
32 #include <sys/time.h>
33 #include <sys/sysmacros.h>
34 #include <sys/systm.h>
35 #include <sys/user.h>
36 #include <sys/stropts.h>
37 #include <sys/stream.h>
38 #include <sys/strlog.h>
39 #include <sys/strsubr.h>
40 #include <sys/cmn_err.h>
41 #include <sys/cpu.h>
42 #include <sys/kmem.h>
43 #include <sys/conf.h>
44 #include <sys/ddi.h>
45 #include <sys/sunddi.h>
46 #include <sys/ksynch.h>
47 #include <sys/stat.h>
48 #include <sys/kstat.h>
49 #include <sys/vtrace.h>
50 #include <sys/strsun.h>
51 #include <sys/dlpi.h>
52 #include <sys/ethernet.h>
53 #include <net/if.h>
54 #include <sys/varargs.h>
55 #include <sys/machsystm.h>
56 #include <sys/modctl.h>
57 #include <sys/modhash.h>
58 #include <sys/mac.h>
59 #include <sys/mac_ether.h>
60 #include <sys/taskq.h>
61 #include <sys/note.h>
62 #include <sys/mach_descrip.h>
63 #include <sys/mac.h>
64 #include <sys/mdeg.h>
65 #include <sys/vsw.h>
66 
/*
 * MAC Ring table functions.
 *
 * Helpers for the ring table and the per-ring packet queues, together
 * with the two receive callbacks that vsw_mac_attach() hands to
 * mac_rx_add() (vsw_rx_queue_cb when vsw_multi_ring_enable is set,
 * vsw_rx_cb otherwise).
 */
static void vsw_mac_ring_tbl_init(vsw_t *vswp);
static void vsw_mac_ring_tbl_destroy(vsw_t *vswp);
static void vsw_queue_worker(vsw_mac_ring_t *rrp);
static void vsw_queue_stop(vsw_queue_t *vqp);
static vsw_queue_t *vsw_queue_create();
static void vsw_queue_destroy(vsw_queue_t *vqp);
static void vsw_rx_queue_cb(void *, mac_resource_handle_t, mblk_t *);
static void vsw_rx_cb(void *, mac_resource_handle_t, mblk_t *);

/*
 * MAC layer routines.
 *
 * vsw_mac_ring_add_cb() is the resource callback registered through
 * mac_resource_set(); the remaining routines program or remove unicast
 * addresses and promiscuous mode on the physical device.
 */
static mac_resource_handle_t vsw_mac_ring_add_cb(void *arg,
		mac_resource_t *mrp);
static	int vsw_set_hw_addr(vsw_t *, mac_multi_addr_t *);
static	int vsw_set_hw_promisc(vsw_t *, vsw_port_t *, int);
static	int vsw_unset_hw_addr(vsw_t *, int);
static	int vsw_unset_hw_promisc(vsw_t *, vsw_port_t *, int);
static int vsw_prog_if(vsw_t *);

/*
 * Support functions.
 *
 * Apart from vsw_prog_ports() these are not static, so they are
 * visible outside this file.
 */
static int vsw_prog_ports(vsw_t *);
int vsw_set_hw(vsw_t *, vsw_port_t *, int);
int vsw_unset_hw(vsw_t *, vsw_port_t *, int);
void vsw_reconfig_hw(vsw_t *);
int vsw_mac_attach(vsw_t *vswp);
void vsw_mac_detach(vsw_t *vswp);
int vsw_mac_open(vsw_t *vswp);
void vsw_mac_close(vsw_t *vswp);
void vsw_unset_addrs(vsw_t *vswp);
void vsw_set_addrs(vsw_t *vswp);
int vsw_get_hw_maddr(vsw_t *);
mblk_t *vsw_tx_msg(vsw_t *, mblk_t *);

/*
 * Tunables used in this file (defined elsewhere):
 *   vsw_mac_open_retries  - upper bound on vsw_mac_open() attempts.
 *   vsw_multi_ring_enable - selects the ring-table/queue receive path
 *                           in vsw_mac_attach()/vsw_mac_detach().
 *   vsw_mac_rx_rings      - number of entries in the ring table.
 */
extern int vsw_mac_open_retries;
extern boolean_t vsw_multi_ring_enable;
extern int vsw_mac_rx_rings;
106 
107 /*
108  * Check to see if the card supports the setting of multiple unicst
109  * addresses.
110  *
111  * Returns 0 if card supports the programming of multiple unicast addresses,
112  * otherwise returns 1.
113  */
114 int
115 vsw_get_hw_maddr(vsw_t *vswp)
116 {
117 	D1(vswp, "%s: enter", __func__);
118 
119 	ASSERT(RW_LOCK_HELD(&vswp->mac_rwlock));
120 
121 	if (vswp->mh == NULL)
122 		return (1);
123 
124 	if (!mac_capab_get(vswp->mh, MAC_CAPAB_MULTIADDRESS, &vswp->maddr)) {
125 		cmn_err(CE_NOTE, "!vsw%d: device (%s) does not support "
126 		    "programming multiple addresses", vswp->instance,
127 		    vswp->physname);
128 		return (1);
129 	}
130 
131 	D2(vswp, "%s: %d addrs : %d free", __func__,
132 	    vswp->maddr.maddr_naddr, vswp->maddr.maddr_naddrfree);
133 
134 	D1(vswp, "%s: exit", __func__);
135 
136 	return (0);
137 }
138 
139 /*
140  * Program unicast and multicast addresses of vsw interface and the ports
141  * into the physical device.
142  */
143 void
144 vsw_set_addrs(vsw_t *vswp)
145 {
146 	vsw_port_list_t	*plist = &vswp->plist;
147 	vsw_port_t	*port;
148 	mcst_addr_t	*mcap;
149 	int		rv;
150 
151 	READ_ENTER(&vswp->if_lockrw);
152 
153 	if (vswp->if_state & VSW_IF_UP) {
154 
155 		/* program unicst addr of vsw interface in the physdev */
156 		if (vswp->addr_set == VSW_ADDR_UNSET) {
157 			mutex_enter(&vswp->hw_lock);
158 			rv = vsw_set_hw(vswp, NULL, VSW_LOCALDEV);
159 			mutex_exit(&vswp->hw_lock);
160 			if (rv != 0) {
161 				cmn_err(CE_NOTE,
162 				    "!vsw%d: failed to program interface "
163 				    "unicast address\n", vswp->instance);
164 			}
165 			/*
166 			 * Notify the MAC layer of the changed address.
167 			 */
168 			mac_unicst_update(vswp->if_mh,
169 			    (uint8_t *)&vswp->if_addr);
170 		}
171 
172 		/* program mcast addrs of vsw interface in the physdev */
173 		mutex_enter(&vswp->mca_lock);
174 		WRITE_ENTER(&vswp->mac_rwlock);
175 		for (mcap = vswp->mcap; mcap != NULL; mcap = mcap->nextp) {
176 			if (mcap->mac_added)
177 				continue;
178 			rv = mac_multicst_add(vswp->mh, (uchar_t *)&mcap->mca);
179 			if (rv == 0) {
180 				mcap->mac_added = B_TRUE;
181 			} else {
182 				cmn_err(CE_NOTE, "!vsw%d: unable to add "
183 				    "multicast address: %s\n", vswp->instance,
184 				    ether_sprintf((void *)&mcap->mca));
185 			}
186 		}
187 		RW_EXIT(&vswp->mac_rwlock);
188 		mutex_exit(&vswp->mca_lock);
189 
190 	}
191 
192 	RW_EXIT(&vswp->if_lockrw);
193 
194 	WRITE_ENTER(&plist->lockrw);
195 
196 	/* program unicast address of ports in the physical device */
197 	mutex_enter(&vswp->hw_lock);
198 	for (port = plist->head; port != NULL; port = port->p_next) {
199 		if (port->addr_set != VSW_ADDR_UNSET) /* addr already set */
200 			continue;
201 		if (vsw_set_hw(vswp, port, VSW_VNETPORT)) {
202 			cmn_err(CE_NOTE,
203 			    "!vsw%d: port:%d failed to set unicast address\n",
204 			    vswp->instance, port->p_instance);
205 		}
206 	}
207 	mutex_exit(&vswp->hw_lock);
208 
209 	/* program multicast addresses of ports in the physdev */
210 	for (port = plist->head; port != NULL; port = port->p_next) {
211 		mutex_enter(&port->mca_lock);
212 		WRITE_ENTER(&vswp->mac_rwlock);
213 		for (mcap = port->mcap; mcap != NULL; mcap = mcap->nextp) {
214 			if (mcap->mac_added)
215 				continue;
216 			rv = mac_multicst_add(vswp->mh, (uchar_t *)&mcap->mca);
217 			if (rv == 0) {
218 				mcap->mac_added = B_TRUE;
219 			} else {
220 				cmn_err(CE_NOTE, "!vsw%d: unable to add "
221 				    "multicast address: %s\n", vswp->instance,
222 				    ether_sprintf((void *)&mcap->mca));
223 			}
224 		}
225 		RW_EXIT(&vswp->mac_rwlock);
226 		mutex_exit(&port->mca_lock);
227 	}
228 
229 	RW_EXIT(&plist->lockrw);
230 }
231 
232 /*
233  * Remove unicast and multicast addresses of vsw interface and the ports
234  * from the physical device.
235  */
236 void
237 vsw_unset_addrs(vsw_t *vswp)
238 {
239 	vsw_port_list_t	*plist = &vswp->plist;
240 	vsw_port_t	*port;
241 	mcst_addr_t	*mcap;
242 
243 	READ_ENTER(&vswp->if_lockrw);
244 
245 	if (vswp->if_state & VSW_IF_UP) {
246 
247 		/*
248 		 * Remove unicast addr of vsw interfce
249 		 * from current physdev
250 		 */
251 		mutex_enter(&vswp->hw_lock);
252 		(void) vsw_unset_hw(vswp, NULL, VSW_LOCALDEV);
253 		mutex_exit(&vswp->hw_lock);
254 
255 		/*
256 		 * Remove mcast addrs of vsw interface
257 		 * from current physdev
258 		 */
259 		mutex_enter(&vswp->mca_lock);
260 		WRITE_ENTER(&vswp->mac_rwlock);
261 		for (mcap = vswp->mcap; mcap != NULL; mcap = mcap->nextp) {
262 			if (!mcap->mac_added)
263 				continue;
264 			(void) mac_multicst_remove(vswp->mh,
265 			    (uchar_t *)&mcap->mca);
266 			mcap->mac_added = B_FALSE;
267 		}
268 		RW_EXIT(&vswp->mac_rwlock);
269 		mutex_exit(&vswp->mca_lock);
270 
271 	}
272 
273 	RW_EXIT(&vswp->if_lockrw);
274 
275 	WRITE_ENTER(&plist->lockrw);
276 
277 	/*
278 	 * Remove unicast address of ports from the current physical device
279 	 */
280 	mutex_enter(&vswp->hw_lock);
281 	for (port = plist->head; port != NULL; port = port->p_next) {
282 		/* Remove address if was programmed into HW. */
283 		if (port->addr_set == VSW_ADDR_UNSET)
284 			continue;
285 		(void) vsw_unset_hw(vswp, port, VSW_VNETPORT);
286 	}
287 	mutex_exit(&vswp->hw_lock);
288 
289 	/* Remove multicast addresses of ports from the current physdev */
290 	for (port = plist->head; port != NULL; port = port->p_next) {
291 		mutex_enter(&port->mca_lock);
292 		WRITE_ENTER(&vswp->mac_rwlock);
293 		for (mcap = port->mcap; mcap != NULL; mcap = mcap->nextp) {
294 			if (!mcap->mac_added)
295 				continue;
296 			(void) mac_multicst_remove(vswp->mh,
297 			    (uchar_t *)&mcap->mca);
298 			mcap->mac_added = B_FALSE;
299 		}
300 		RW_EXIT(&vswp->mac_rwlock);
301 		mutex_exit(&port->mca_lock);
302 	}
303 
304 	RW_EXIT(&plist->lockrw);
305 }
306 
307 /*
308  * Open the underlying physical device for access in layer2 mode.
309  * Returns:
310  * 0 on success
311  * EAGAIN if mac_open() fails due to the device being not available yet.
312  * EIO on any other failures.
313  */
314 int
315 vsw_mac_open(vsw_t *vswp)
316 {
317 	int	rv;
318 
319 	ASSERT(RW_LOCK_HELD(&vswp->mac_rwlock));
320 
321 	if (vswp->mh != NULL) {
322 		/* already open */
323 		return (0);
324 	}
325 
326 	if (vswp->mac_open_retries++ >= vsw_mac_open_retries) {
327 		/* exceeded max retries */
328 		return (EIO);
329 	}
330 
331 	if ((rv = mac_open_by_linkname(vswp->physname, &vswp->mh)) != 0) {
332 		/*
333 		 * If mac_open() failed and the error indicates that either
334 		 * the dlmgmtd door or the device is not available yet, we
335 		 * return EAGAIN to indicate that mac_open() needs to be
336 		 * retried. For example, this may happen during boot up, if
337 		 * the required link aggregation groups(devices) have not
338 		 * been created yet.
339 		 */
340 		if (rv == ENOENT || rv == EBADF) {
341 			return (EAGAIN);
342 		} else {
343 			cmn_err(CE_WARN, "vsw%d: device (%s) open failed rv:%x",
344 			    vswp->instance, vswp->physname, rv);
345 			return (EIO);
346 		}
347 	}
348 
349 	vswp->mac_open_retries = 0;
350 
351 	return (0);
352 }
353 
354 /*
355  * Close the underlying physical device.
356  */
357 void
358 vsw_mac_close(vsw_t *vswp)
359 {
360 	ASSERT(RW_LOCK_HELD(&vswp->mac_rwlock));
361 
362 	if (vswp->mh != NULL) {
363 		mac_close(vswp->mh);
364 		vswp->mh = NULL;
365 	}
366 }
367 
368 /*
369  * Link into the MAC layer to gain access to the services provided by
370  * the underlying physical device driver (which should also have
371  * registered with the MAC layer).
372  *
373  * Only when in layer 2 mode.
374  */
375 int
376 vsw_mac_attach(vsw_t *vswp)
377 {
378 	D1(vswp, "%s: enter", __func__);
379 
380 	ASSERT(vswp->mrh == NULL);
381 	ASSERT(vswp->mstarted == B_FALSE);
382 	ASSERT(vswp->mresources == B_FALSE);
383 
384 	ASSERT(RW_LOCK_HELD(&vswp->mac_rwlock));
385 
386 	ASSERT(vswp->mh != NULL);
387 
388 	D2(vswp, "vsw_mac_attach: using device %s", vswp->physname);
389 
390 	if (vsw_multi_ring_enable) {
391 		/*
392 		 * Initialize the ring table.
393 		 */
394 		vsw_mac_ring_tbl_init(vswp);
395 
396 		/*
397 		 * Register our rx callback function.
398 		 */
399 		vswp->mrh = mac_rx_add(vswp->mh,
400 		    vsw_rx_queue_cb, (void *)vswp);
401 		ASSERT(vswp->mrh != NULL);
402 
403 		/*
404 		 * Register our mac resource callback.
405 		 */
406 		mac_resource_set(vswp->mh, vsw_mac_ring_add_cb, (void *)vswp);
407 		vswp->mresources = B_TRUE;
408 
409 		/*
410 		 * Get the ring resources available to us from
411 		 * the mac below us.
412 		 */
413 		mac_resources(vswp->mh);
414 	} else {
415 		/*
416 		 * Just register our rx callback function
417 		 */
418 		vswp->mrh = mac_rx_add(vswp->mh, vsw_rx_cb, (void *)vswp);
419 		ASSERT(vswp->mrh != NULL);
420 	}
421 
422 	/* Get the MAC tx fn */
423 	vswp->txinfo = mac_tx_get(vswp->mh);
424 
425 	/* start the interface */
426 	if (mac_start(vswp->mh) != 0) {
427 		cmn_err(CE_WARN, "!vsw%d: Could not start mac interface",
428 		    vswp->instance);
429 		goto mac_fail_exit;
430 	}
431 
432 	vswp->mstarted = B_TRUE;
433 
434 	D1(vswp, "%s: exit", __func__);
435 	return (0);
436 
437 mac_fail_exit:
438 	vsw_mac_detach(vswp);
439 
440 	D1(vswp, "%s: exit", __func__);
441 	return (1);
442 }
443 
444 void
445 vsw_mac_detach(vsw_t *vswp)
446 {
447 	D1(vswp, "vsw_mac_detach: enter");
448 
449 	ASSERT(vswp != NULL);
450 	ASSERT(RW_LOCK_HELD(&vswp->mac_rwlock));
451 
452 	if (vsw_multi_ring_enable) {
453 		vsw_mac_ring_tbl_destroy(vswp);
454 	}
455 
456 	if (vswp->mh != NULL) {
457 		if (vswp->mstarted)
458 			mac_stop(vswp->mh);
459 		if (vswp->mrh != NULL)
460 			mac_rx_remove(vswp->mh, vswp->mrh, B_TRUE);
461 		if (vswp->mresources)
462 			mac_resource_set(vswp->mh, NULL, NULL);
463 	}
464 
465 	vswp->mrh = NULL;
466 	vswp->txinfo = NULL;
467 	vswp->mstarted = B_FALSE;
468 
469 	D1(vswp, "vsw_mac_detach: exit");
470 }
471 
472 /*
473  * Depending on the mode specified, the capabilites and capacity
474  * of the underlying device setup the physical device.
475  *
476  * If in layer 3 mode, then do nothing.
477  *
478  * If in layer 2 programmed mode attempt to program the unicast address
479  * associated with the port into the physical device. If this is not
480  * possible due to resource exhaustion or simply because the device does
481  * not support multiple unicast addresses then if required fallback onto
482  * putting the card into promisc mode.
483  *
484  * If in promisc mode then simply set the card into promisc mode.
485  *
486  * Returns 0 success, 1 on failure.
487  */
488 int
489 vsw_set_hw(vsw_t *vswp, vsw_port_t *port, int type)
490 {
491 	mac_multi_addr_t	mac_addr;
492 	int			err;
493 
494 	D1(vswp, "%s: enter", __func__);
495 
496 	ASSERT(MUTEX_HELD(&vswp->hw_lock));
497 	ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));
498 
499 	if (vswp->smode[vswp->smode_idx] == VSW_LAYER3)
500 		return (0);
501 
502 	if (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC) {
503 		return (vsw_set_hw_promisc(vswp, port, type));
504 	}
505 
506 	/*
507 	 * Attempt to program the unicast address into the HW.
508 	 */
509 	mac_addr.mma_addrlen = ETHERADDRL;
510 	if (type == VSW_VNETPORT) {
511 		ASSERT(port != NULL);
512 		ether_copy(&port->p_macaddr, &mac_addr.mma_addr);
513 	} else {
514 		ether_copy(&vswp->if_addr, &mac_addr.mma_addr);
515 	}
516 
517 	err = vsw_set_hw_addr(vswp, &mac_addr);
518 	if (err == ENOSPC) {
519 		/*
520 		 * Mark that attempt should be made to re-config sometime
521 		 * in future if a port is deleted.
522 		 */
523 		vswp->recfg_reqd = B_TRUE;
524 
525 		/*
526 		 * Only 1 mode specified, nothing more to do.
527 		 */
528 		if (vswp->smode_num == 1)
529 			return (err);
530 
531 		/*
532 		 * If promiscuous was next mode specified try to
533 		 * set the card into that mode.
534 		 */
535 		if ((vswp->smode_idx <= (vswp->smode_num - 2)) &&
536 		    (vswp->smode[vswp->smode_idx + 1] ==
537 		    VSW_LAYER2_PROMISC)) {
538 			vswp->smode_idx += 1;
539 			return (vsw_set_hw_promisc(vswp, port, type));
540 		}
541 		return (err);
542 	}
543 
544 	if (err != 0)
545 		return (err);
546 
547 	if (type == VSW_VNETPORT) {
548 		port->addr_slot = mac_addr.mma_slot;
549 		port->addr_set = VSW_ADDR_HW;
550 	} else {
551 		vswp->addr_slot = mac_addr.mma_slot;
552 		vswp->addr_set = VSW_ADDR_HW;
553 	}
554 
555 	D2(vswp, "programmed addr %s into slot %d "
556 	"of device %s", ether_sprintf((void *)mac_addr.mma_addr),
557 	    mac_addr.mma_slot, vswp->physname);
558 
559 	D1(vswp, "%s: exit", __func__);
560 
561 	return (0);
562 }
563 
564 /*
565  * If in layer 3 mode do nothing.
566  *
567  * If in layer 2 switched mode remove the address from the physical
568  * device.
569  *
570  * If in layer 2 promiscuous mode disable promisc mode.
571  *
572  * Returns 0 on success.
573  */
574 int
575 vsw_unset_hw(vsw_t *vswp, vsw_port_t *port, int type)
576 {
577 	mac_addr_slot_t	slot;
578 	int		rv;
579 
580 	D1(vswp, "%s: enter", __func__);
581 
582 	ASSERT(MUTEX_HELD(&vswp->hw_lock));
583 
584 	if (vswp->smode[vswp->smode_idx] == VSW_LAYER3)
585 		return (0);
586 
587 	switch (type) {
588 	case VSW_VNETPORT:
589 		ASSERT(port != NULL);
590 
591 		if (port->addr_set == VSW_ADDR_PROMISC) {
592 			return (vsw_unset_hw_promisc(vswp, port, type));
593 
594 		} else if (port->addr_set == VSW_ADDR_HW) {
595 			slot = port->addr_slot;
596 			if ((rv = vsw_unset_hw_addr(vswp, slot)) == 0)
597 				port->addr_set = VSW_ADDR_UNSET;
598 		}
599 
600 		break;
601 
602 	case VSW_LOCALDEV:
603 		if (vswp->addr_set == VSW_ADDR_PROMISC) {
604 			return (vsw_unset_hw_promisc(vswp, NULL, type));
605 
606 		} else if (vswp->addr_set == VSW_ADDR_HW) {
607 			slot = vswp->addr_slot;
608 			if ((rv = vsw_unset_hw_addr(vswp, slot)) == 0)
609 				vswp->addr_set = VSW_ADDR_UNSET;
610 		}
611 
612 		break;
613 
614 	default:
615 		/* should never happen */
616 		DERR(vswp, "%s: unknown type %d", __func__, type);
617 		ASSERT(0);
618 		return (1);
619 	}
620 
621 	D1(vswp, "%s: exit", __func__);
622 	return (rv);
623 }
624 
625 /*
626  * Attempt to program a unicast address into HW.
627  *
628  * Returns 0 on sucess, 1 on failure.
629  */
630 static int
631 vsw_set_hw_addr(vsw_t *vswp, mac_multi_addr_t *mac)
632 {
633 	void	*mah;
634 	int	rv = EINVAL;
635 
636 	D1(vswp, "%s: enter", __func__);
637 
638 	ASSERT(MUTEX_HELD(&vswp->hw_lock));
639 
640 	if (vswp->maddr.maddr_handle == NULL)
641 		return (rv);
642 
643 	mah = vswp->maddr.maddr_handle;
644 
645 	rv = vswp->maddr.maddr_add(mah, mac);
646 
647 	if (rv == 0)
648 		return (rv);
649 
650 	/*
651 	 * Its okay for the add to fail because we have exhausted
652 	 * all the resouces in the hardware device. Any other error
653 	 * we want to flag.
654 	 */
655 	if (rv != ENOSPC) {
656 		cmn_err(CE_NOTE, "!vsw%d: error programming "
657 		    "address %s into HW err (%d)",
658 		    vswp->instance, ether_sprintf((void *)mac->mma_addr), rv);
659 	}
660 	D1(vswp, "%s: exit", __func__);
661 	return (rv);
662 }
663 
664 /*
665  * Remove a unicast mac address which has previously been programmed
666  * into HW.
667  *
668  * Returns 0 on sucess, 1 on failure.
669  */
670 static int
671 vsw_unset_hw_addr(vsw_t *vswp, int slot)
672 {
673 	void	*mah;
674 	int	rv;
675 
676 	D1(vswp, "%s: enter", __func__);
677 
678 	ASSERT(MUTEX_HELD(&vswp->hw_lock));
679 	ASSERT(slot >= 0);
680 
681 	if (vswp->maddr.maddr_handle == NULL)
682 		return (1);
683 
684 	mah = vswp->maddr.maddr_handle;
685 
686 	rv = vswp->maddr.maddr_remove(mah, slot);
687 	if (rv != 0) {
688 		DWARN(vswp, "%s: unable to remove address "
689 		    "from slot %d in device %s (err %d)",
690 		    __func__, slot, vswp->physname, rv);
691 		return (1);
692 	}
693 
694 	D2(vswp, "removed addr from slot %d in device %s",
695 	    slot, vswp->physname);
696 
697 	D1(vswp, "%s: exit", __func__);
698 	return (0);
699 }
700 
701 /*
702  * Set network card into promisc mode.
703  *
704  * Returns 0 on success, 1 on failure.
705  */
706 static int
707 vsw_set_hw_promisc(vsw_t *vswp, vsw_port_t *port, int type)
708 {
709 	D1(vswp, "%s: enter", __func__);
710 
711 	ASSERT(MUTEX_HELD(&vswp->hw_lock));
712 	ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));
713 
714 	WRITE_ENTER(&vswp->mac_rwlock);
715 	if (vswp->mh == NULL) {
716 		RW_EXIT(&vswp->mac_rwlock);
717 		return (1);
718 	}
719 
720 	if (vswp->promisc_cnt++ == 0) {
721 		if (mac_promisc_set(vswp->mh, B_TRUE, MAC_DEVPROMISC) != 0) {
722 			vswp->promisc_cnt--;
723 			RW_EXIT(&vswp->mac_rwlock);
724 			return (1);
725 		}
726 		cmn_err(CE_NOTE, "!vsw%d: switching device %s into "
727 		    "promiscuous mode", vswp->instance, vswp->physname);
728 	}
729 	RW_EXIT(&vswp->mac_rwlock);
730 
731 	if (type == VSW_VNETPORT) {
732 		ASSERT(port != NULL);
733 		port->addr_set = VSW_ADDR_PROMISC;
734 	} else {
735 		vswp->addr_set = VSW_ADDR_PROMISC;
736 	}
737 
738 	D1(vswp, "%s: exit", __func__);
739 
740 	return (0);
741 }
742 
743 /*
744  * Turn off promiscuous mode on network card.
745  *
746  * Returns 0 on success, 1 on failure.
747  */
748 static int
749 vsw_unset_hw_promisc(vsw_t *vswp, vsw_port_t *port, int type)
750 {
751 	vsw_port_list_t 	*plist = &vswp->plist;
752 
753 	D2(vswp, "%s: enter", __func__);
754 
755 	ASSERT(MUTEX_HELD(&vswp->hw_lock));
756 	ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));
757 
758 	WRITE_ENTER(&vswp->mac_rwlock);
759 	if (vswp->mh == NULL) {
760 		RW_EXIT(&vswp->mac_rwlock);
761 		return (1);
762 	}
763 
764 	if (--vswp->promisc_cnt == 0) {
765 		if (mac_promisc_set(vswp->mh, B_FALSE, MAC_DEVPROMISC) != 0) {
766 			vswp->promisc_cnt++;
767 			RW_EXIT(&vswp->mac_rwlock);
768 			return (1);
769 		}
770 
771 		/*
772 		 * We are exiting promisc mode either because we were
773 		 * only in promisc mode because we had failed over from
774 		 * switched mode due to HW resource issues, or the user
775 		 * wanted the card in promisc mode for all the ports and
776 		 * the last port is now being deleted. Tweak the message
777 		 * accordingly.
778 		 */
779 		if (plist->num_ports != 0) {
780 			cmn_err(CE_NOTE, "!vsw%d: switching device %s back to "
781 			    "programmed mode", vswp->instance, vswp->physname);
782 		} else {
783 			cmn_err(CE_NOTE, "!vsw%d: switching device %s out of "
784 			    "promiscuous mode", vswp->instance, vswp->physname);
785 		}
786 	}
787 	RW_EXIT(&vswp->mac_rwlock);
788 
789 	if (type == VSW_VNETPORT) {
790 		ASSERT(port != NULL);
791 		ASSERT(port->addr_set == VSW_ADDR_PROMISC);
792 		port->addr_set = VSW_ADDR_UNSET;
793 	} else {
794 		ASSERT(vswp->addr_set == VSW_ADDR_PROMISC);
795 		vswp->addr_set = VSW_ADDR_UNSET;
796 	}
797 
798 	D1(vswp, "%s: exit", __func__);
799 	return (0);
800 }
801 
802 /*
803  * Determine whether or not we are operating in our prefered
804  * mode and if not whether the physical resources now allow us
805  * to operate in it.
806  *
807  * If a port is being removed should only be invoked after port has been
808  * removed from the port list.
809  */
810 void
811 vsw_reconfig_hw(vsw_t *vswp)
812 {
813 	int			s_idx;
814 
815 	D1(vswp, "%s: enter", __func__);
816 
817 	ASSERT(MUTEX_HELD(&vswp->hw_lock));
818 
819 	if (vswp->maddr.maddr_handle == NULL) {
820 		return;
821 	}
822 
823 	/*
824 	 * If we are in layer 2 (i.e. switched) or would like to be
825 	 * in layer 2 then check if any ports or the vswitch itself
826 	 * need to be programmed into the HW.
827 	 *
828 	 * This can happen in two cases - switched was specified as
829 	 * the prefered mode of operation but we exhausted the HW
830 	 * resources and so failed over to the next specifed mode,
831 	 * or switched was the only mode specified so after HW
832 	 * resources were exhausted there was nothing more we
833 	 * could do.
834 	 */
835 	if (vswp->smode_idx > 0)
836 		s_idx = vswp->smode_idx - 1;
837 	else
838 		s_idx = vswp->smode_idx;
839 
840 	if (vswp->smode[s_idx] != VSW_LAYER2) {
841 		return;
842 	}
843 
844 	D2(vswp, "%s: attempting reconfig..", __func__);
845 
846 	/*
847 	 * First, attempt to set the vswitch mac address into HW,
848 	 * if required.
849 	 */
850 	if (vsw_prog_if(vswp)) {
851 		return;
852 	}
853 
854 	/*
855 	 * Next, attempt to set any ports which have not yet been
856 	 * programmed into HW.
857 	 */
858 	if (vsw_prog_ports(vswp)) {
859 		return;
860 	}
861 
862 	/*
863 	 * By now we know that have programmed all desired ports etc
864 	 * into HW, so safe to mark reconfiguration as complete.
865 	 */
866 	vswp->recfg_reqd = B_FALSE;
867 
868 	vswp->smode_idx = s_idx;
869 
870 	D1(vswp, "%s: exit", __func__);
871 }
872 
873 /*
874  * Check to see if vsw itself is plumbed, and if so whether or not
875  * its mac address should be written into HW.
876  *
877  * Returns 0 if could set address, or didn't have to set it.
878  * Returns 1 if failed to set address.
879  */
880 static int
881 vsw_prog_if(vsw_t *vswp)
882 {
883 	mac_multi_addr_t	addr;
884 
885 	D1(vswp, "%s: enter", __func__);
886 
887 	ASSERT(MUTEX_HELD(&vswp->hw_lock));
888 
889 	READ_ENTER(&vswp->if_lockrw);
890 	if ((vswp->if_state & VSW_IF_UP) &&
891 	    (vswp->addr_set != VSW_ADDR_HW)) {
892 
893 		addr.mma_addrlen = ETHERADDRL;
894 		ether_copy(&vswp->if_addr, &addr.mma_addr);
895 
896 		if (vsw_set_hw_addr(vswp, &addr) != 0) {
897 			RW_EXIT(&vswp->if_lockrw);
898 			return (1);
899 		}
900 
901 		vswp->addr_slot = addr.mma_slot;
902 
903 		/*
904 		 * If previously when plumbed had had to place
905 		 * interface into promisc mode, now reverse that.
906 		 *
907 		 * Note that interface will only actually be set into
908 		 * non-promisc mode when last port/interface has been
909 		 * programmed into HW.
910 		 */
911 		if (vswp->addr_set == VSW_ADDR_PROMISC)
912 			(void) vsw_unset_hw_promisc(vswp, NULL, VSW_LOCALDEV);
913 
914 		vswp->addr_set = VSW_ADDR_HW;
915 	}
916 	RW_EXIT(&vswp->if_lockrw);
917 
918 	D1(vswp, "%s: exit", __func__);
919 	return (0);
920 }
921 
922 /*
923  * Scan the port list for any ports which have not yet been set
924  * into HW. For those found attempt to program their mac addresses
925  * into the physical device.
926  *
927  * Returns 0 if able to program all required ports (can be 0) into HW.
928  * Returns 1 if failed to set at least one mac address.
929  */
930 static int
931 vsw_prog_ports(vsw_t *vswp)
932 {
933 	mac_multi_addr_t	addr;
934 	vsw_port_list_t		*plist = &vswp->plist;
935 	vsw_port_t		*tp;
936 	int			rv = 0;
937 
938 	D1(vswp, "%s: enter", __func__);
939 
940 	ASSERT(MUTEX_HELD(&vswp->hw_lock));
941 
942 	READ_ENTER(&plist->lockrw);
943 	for (tp = plist->head; tp != NULL; tp = tp->p_next) {
944 		if (tp->addr_set != VSW_ADDR_HW) {
945 			addr.mma_addrlen = ETHERADDRL;
946 			ether_copy(&tp->p_macaddr, &addr.mma_addr);
947 
948 			if (vsw_set_hw_addr(vswp, &addr) != 0) {
949 				rv = 1;
950 				break;
951 			}
952 
953 			tp->addr_slot = addr.mma_slot;
954 
955 			/*
956 			 * If when this port had first attached we had
957 			 * had to place the interface into promisc mode,
958 			 * then now reverse that.
959 			 *
960 			 * Note that the interface will not actually
961 			 * change to non-promisc mode until all ports
962 			 * have been programmed.
963 			 */
964 			if (tp->addr_set == VSW_ADDR_PROMISC)
965 				(void) vsw_unset_hw_promisc(vswp,
966 				    tp, VSW_VNETPORT);
967 
968 			tp->addr_set = VSW_ADDR_HW;
969 		}
970 	}
971 	RW_EXIT(&plist->lockrw);
972 
973 	D1(vswp, "%s: exit", __func__);
974 	return (rv);
975 }
976 
977 static void
978 vsw_mac_ring_tbl_entry_init(vsw_t *vswp, vsw_mac_ring_t *ringp)
979 {
980 	ringp->ring_state = VSW_MAC_RING_FREE;
981 	ringp->ring_arg = NULL;
982 	ringp->ring_blank = NULL;
983 	ringp->ring_vqp = NULL;
984 	ringp->ring_vswp = vswp;
985 }
986 
987 static void
988 vsw_mac_ring_tbl_init(vsw_t *vswp)
989 {
990 	int		i;
991 
992 	mutex_init(&vswp->mac_ring_lock, NULL, MUTEX_DRIVER, NULL);
993 
994 	vswp->mac_ring_tbl_sz = vsw_mac_rx_rings;
995 	vswp->mac_ring_tbl  =
996 	    kmem_alloc(vsw_mac_rx_rings * sizeof (vsw_mac_ring_t), KM_SLEEP);
997 
998 	for (i = 0; i < vswp->mac_ring_tbl_sz; i++)
999 		vsw_mac_ring_tbl_entry_init(vswp, &vswp->mac_ring_tbl[i]);
1000 }
1001 
1002 static void
1003 vsw_mac_ring_tbl_destroy(vsw_t *vswp)
1004 {
1005 	int		i;
1006 	vsw_mac_ring_t	*ringp;
1007 
1008 	mutex_enter(&vswp->mac_ring_lock);
1009 	for (i = 0; i < vswp->mac_ring_tbl_sz; i++) {
1010 		ringp = &vswp->mac_ring_tbl[i];
1011 
1012 		if (ringp->ring_state != VSW_MAC_RING_FREE) {
1013 			/*
1014 			 * Destroy the queue.
1015 			 */
1016 			vsw_queue_stop(ringp->ring_vqp);
1017 			vsw_queue_destroy(ringp->ring_vqp);
1018 
1019 			/*
1020 			 * Re-initialize the structure.
1021 			 */
1022 			vsw_mac_ring_tbl_entry_init(vswp, ringp);
1023 		}
1024 	}
1025 	mutex_exit(&vswp->mac_ring_lock);
1026 
1027 	mutex_destroy(&vswp->mac_ring_lock);
1028 	kmem_free(vswp->mac_ring_tbl,
1029 	    vswp->mac_ring_tbl_sz * sizeof (vsw_mac_ring_t));
1030 	vswp->mac_ring_tbl_sz = 0;
1031 }
1032 
1033 /*
1034  * Handle resource add callbacks from the driver below.
1035  */
1036 static mac_resource_handle_t
1037 vsw_mac_ring_add_cb(void *arg, mac_resource_t *mrp)
1038 {
1039 	vsw_t		*vswp = (vsw_t *)arg;
1040 	mac_rx_fifo_t	*mrfp = (mac_rx_fifo_t *)mrp;
1041 	vsw_mac_ring_t	*ringp;
1042 	vsw_queue_t	*vqp;
1043 	int		i;
1044 
1045 	ASSERT(vswp != NULL);
1046 	ASSERT(mrp != NULL);
1047 	ASSERT(vswp->mac_ring_tbl != NULL);
1048 
1049 	D1(vswp, "%s: enter", __func__);
1050 
1051 	/*
1052 	 * Check to make sure we have the correct resource type.
1053 	 */
1054 	if (mrp->mr_type != MAC_RX_FIFO)
1055 		return (NULL);
1056 
1057 	/*
1058 	 * Find a open entry in the ring table.
1059 	 */
1060 	mutex_enter(&vswp->mac_ring_lock);
1061 	for (i = 0; i < vswp->mac_ring_tbl_sz; i++) {
1062 		ringp = &vswp->mac_ring_tbl[i];
1063 
1064 		/*
1065 		 * Check for an empty slot, if found, then setup queue
1066 		 * and thread.
1067 		 */
1068 		if (ringp->ring_state == VSW_MAC_RING_FREE) {
1069 			/*
1070 			 * Create the queue for this ring.
1071 			 */
1072 			vqp = vsw_queue_create();
1073 
1074 			/*
1075 			 * Initialize the ring data structure.
1076 			 */
1077 			ringp->ring_vqp = vqp;
1078 			ringp->ring_arg = mrfp->mrf_arg;
1079 			ringp->ring_blank = mrfp->mrf_blank;
1080 			ringp->ring_state = VSW_MAC_RING_INUSE;
1081 
1082 			/*
1083 			 * Create the worker thread.
1084 			 */
1085 			vqp->vq_worker = thread_create(NULL, 0,
1086 			    vsw_queue_worker, ringp, 0, &p0,
1087 			    TS_RUN, minclsyspri);
1088 			if (vqp->vq_worker == NULL) {
1089 				vsw_queue_destroy(vqp);
1090 				vsw_mac_ring_tbl_entry_init(vswp, ringp);
1091 				ringp = NULL;
1092 			}
1093 
1094 			if (ringp != NULL) {
1095 				/*
1096 				 * Make sure thread get's running state for
1097 				 * this ring.
1098 				 */
1099 				mutex_enter(&vqp->vq_lock);
1100 				while ((vqp->vq_state != VSW_QUEUE_RUNNING) &&
1101 				    (vqp->vq_state != VSW_QUEUE_DRAINED)) {
1102 					cv_wait(&vqp->vq_cv, &vqp->vq_lock);
1103 				}
1104 
1105 				/*
1106 				 * If the thread is not running, cleanup.
1107 				 */
1108 				if (vqp->vq_state == VSW_QUEUE_DRAINED) {
1109 					vsw_queue_destroy(vqp);
1110 					vsw_mac_ring_tbl_entry_init(vswp,
1111 					    ringp);
1112 					ringp = NULL;
1113 				}
1114 				mutex_exit(&vqp->vq_lock);
1115 			}
1116 
1117 			mutex_exit(&vswp->mac_ring_lock);
1118 			D1(vswp, "%s: exit", __func__);
1119 			return ((mac_resource_handle_t)ringp);
1120 		}
1121 	}
1122 	mutex_exit(&vswp->mac_ring_lock);
1123 
1124 	/*
1125 	 * No slots in the ring table available.
1126 	 */
1127 	D1(vswp, "%s: exit", __func__);
1128 	return (NULL);
1129 }
1130 
1131 static void
1132 vsw_queue_stop(vsw_queue_t *vqp)
1133 {
1134 	mutex_enter(&vqp->vq_lock);
1135 
1136 	if (vqp->vq_state == VSW_QUEUE_RUNNING) {
1137 		vqp->vq_state = VSW_QUEUE_STOP;
1138 		cv_signal(&vqp->vq_cv);
1139 
1140 		while (vqp->vq_state != VSW_QUEUE_DRAINED)
1141 			cv_wait(&vqp->vq_cv, &vqp->vq_lock);
1142 	}
1143 
1144 	vqp->vq_state = VSW_QUEUE_STOPPED;
1145 
1146 	mutex_exit(&vqp->vq_lock);
1147 }
1148 
1149 static vsw_queue_t *
1150 vsw_queue_create()
1151 {
1152 	vsw_queue_t *vqp;
1153 
1154 	vqp = kmem_zalloc(sizeof (vsw_queue_t), KM_SLEEP);
1155 
1156 	mutex_init(&vqp->vq_lock, NULL, MUTEX_DRIVER, NULL);
1157 	cv_init(&vqp->vq_cv, NULL, CV_DRIVER, NULL);
1158 	vqp->vq_first = NULL;
1159 	vqp->vq_last = NULL;
1160 	vqp->vq_state = VSW_QUEUE_STOPPED;
1161 
1162 	return (vqp);
1163 }
1164 
1165 static void
1166 vsw_queue_destroy(vsw_queue_t *vqp)
1167 {
1168 	cv_destroy(&vqp->vq_cv);
1169 	mutex_destroy(&vqp->vq_lock);
1170 	kmem_free(vqp, sizeof (vsw_queue_t));
1171 }
1172 
1173 static void
1174 vsw_queue_worker(vsw_mac_ring_t *rrp)
1175 {
1176 	mblk_t		*mp;
1177 	vsw_queue_t	*vqp = rrp->ring_vqp;
1178 	vsw_t		*vswp = rrp->ring_vswp;
1179 
1180 	mutex_enter(&vqp->vq_lock);
1181 
1182 	ASSERT(vqp->vq_state == VSW_QUEUE_STOPPED);
1183 
1184 	/*
1185 	 * Set the state to running, since the thread is now active.
1186 	 */
1187 	vqp->vq_state = VSW_QUEUE_RUNNING;
1188 	cv_signal(&vqp->vq_cv);
1189 
1190 	while (vqp->vq_state == VSW_QUEUE_RUNNING) {
1191 		/*
1192 		 * Wait for work to do or the state has changed
1193 		 * to not running.
1194 		 */
1195 		while ((vqp->vq_state == VSW_QUEUE_RUNNING) &&
1196 		    (vqp->vq_first == NULL)) {
1197 			cv_wait(&vqp->vq_cv, &vqp->vq_lock);
1198 		}
1199 
1200 		/*
1201 		 * Process packets that we received from the interface.
1202 		 */
1203 		if (vqp->vq_first != NULL) {
1204 			mp = vqp->vq_first;
1205 
1206 			vqp->vq_first = NULL;
1207 			vqp->vq_last = NULL;
1208 
1209 			mutex_exit(&vqp->vq_lock);
1210 
1211 			/* switch the chain of packets received */
1212 			vswp->vsw_switch_frame(vswp, mp,
1213 			    VSW_PHYSDEV, NULL, NULL);
1214 
1215 			mutex_enter(&vqp->vq_lock);
1216 		}
1217 	}
1218 
1219 	/*
1220 	 * We are drained and signal we are done.
1221 	 */
1222 	vqp->vq_state = VSW_QUEUE_DRAINED;
1223 	cv_signal(&vqp->vq_cv);
1224 
1225 	/*
1226 	 * Exit lock and drain the remaining packets.
1227 	 */
1228 	mutex_exit(&vqp->vq_lock);
1229 
1230 	/*
1231 	 * Exit the thread
1232 	 */
1233 	thread_exit();
1234 }
1235 
1236 /*
1237  * static void
1238  * vsw_rx_queue_cb() - Receive callback routine when
1239  *	vsw_multi_ring_enable is non-zero.  Queue the packets
1240  *	to a packet queue for a worker thread to process.
1241  */
1242 static void
1243 vsw_rx_queue_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp)
1244 {
1245 	vsw_mac_ring_t	*ringp = (vsw_mac_ring_t *)mrh;
1246 	vsw_t		*vswp = (vsw_t *)arg;
1247 	vsw_queue_t	*vqp;
1248 	mblk_t		*bp, *last;
1249 
1250 	ASSERT(mrh != NULL);
1251 	ASSERT(vswp != NULL);
1252 	ASSERT(mp != NULL);
1253 
1254 	D1(vswp, "%s: enter", __func__);
1255 
1256 	/*
1257 	 * Find the last element in the mblk chain.
1258 	 */
1259 	bp = mp;
1260 	do {
1261 		last = bp;
1262 		bp = bp->b_next;
1263 	} while (bp != NULL);
1264 
1265 	/* Get the queue for the packets */
1266 	vqp = ringp->ring_vqp;
1267 
1268 	/*
1269 	 * Grab the lock such we can queue the packets.
1270 	 */
1271 	mutex_enter(&vqp->vq_lock);
1272 
1273 	if (vqp->vq_state != VSW_QUEUE_RUNNING) {
1274 		freemsgchain(mp);
1275 		mutex_exit(&vqp->vq_lock);
1276 		goto vsw_rx_queue_cb_exit;
1277 	}
1278 
1279 	/*
1280 	 * Add the mblk chain to the queue.  If there
1281 	 * is some mblks in the queue, then add the new
1282 	 * chain to the end.
1283 	 */
1284 	if (vqp->vq_first == NULL)
1285 		vqp->vq_first = mp;
1286 	else
1287 		vqp->vq_last->b_next = mp;
1288 
1289 	vqp->vq_last = last;
1290 
1291 	/*
1292 	 * Signal the worker thread that there is work to
1293 	 * do.
1294 	 */
1295 	cv_signal(&vqp->vq_cv);
1296 
1297 	/*
1298 	 * Let go of the lock and exit.
1299 	 */
1300 	mutex_exit(&vqp->vq_lock);
1301 
1302 vsw_rx_queue_cb_exit:
1303 	D1(vswp, "%s: exit", __func__);
1304 }
1305 
1306 /*
1307  * receive callback routine. Invoked by MAC layer when there
1308  * are pkts being passed up from physical device.
1309  *
1310  * PERF: It may be more efficient when the card is in promisc
1311  * mode to check the dest address of the pkts here (against
1312  * the FDB) rather than checking later. Needs to be investigated.
1313  */
1314 static void
1315 vsw_rx_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp)
1316 {
1317 	_NOTE(ARGUNUSED(mrh))
1318 
1319 	vsw_t		*vswp = (vsw_t *)arg;
1320 
1321 	ASSERT(vswp != NULL);
1322 
1323 	D1(vswp, "vsw_rx_cb: enter");
1324 
1325 	/* switch the chain of packets received */
1326 	vswp->vsw_switch_frame(vswp, mp, VSW_PHYSDEV, NULL, NULL);
1327 
1328 	D1(vswp, "vsw_rx_cb: exit");
1329 }
1330 
1331 /*
1332  * Send a message out over the physical device via the MAC layer.
1333  *
1334  * Returns any mblks that it was unable to transmit.
1335  */
1336 mblk_t *
1337 vsw_tx_msg(vsw_t *vswp, mblk_t *mp)
1338 {
1339 	const mac_txinfo_t	*mtp;
1340 
1341 	READ_ENTER(&vswp->mac_rwlock);
1342 	if ((vswp->mh == NULL) || (vswp->mstarted == B_FALSE)) {
1343 
1344 		DERR(vswp, "vsw_tx_msg: dropping pkts: no tx routine avail");
1345 		RW_EXIT(&vswp->mac_rwlock);
1346 		return (mp);
1347 	} else {
1348 		mtp = vswp->txinfo;
1349 		mp = mtp->mt_fn(mtp->mt_arg, mp);
1350 	}
1351 	RW_EXIT(&vswp->mac_rwlock);
1352 
1353 	return (mp);
1354 }
1355