xref: /titanic_50/usr/src/uts/sun4v/io/vsw_phys.c (revision 2e83744e07e0937d9ade0801c0a4d8316ac3071e)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/errno.h>
31 #include <sys/debug.h>
32 #include <sys/time.h>
33 #include <sys/sysmacros.h>
34 #include <sys/systm.h>
35 #include <sys/user.h>
36 #include <sys/stropts.h>
37 #include <sys/stream.h>
38 #include <sys/strlog.h>
39 #include <sys/strsubr.h>
40 #include <sys/cmn_err.h>
41 #include <sys/cpu.h>
42 #include <sys/kmem.h>
43 #include <sys/conf.h>
44 #include <sys/ddi.h>
45 #include <sys/sunddi.h>
46 #include <sys/ksynch.h>
47 #include <sys/stat.h>
48 #include <sys/kstat.h>
49 #include <sys/vtrace.h>
50 #include <sys/strsun.h>
51 #include <sys/dlpi.h>
52 #include <sys/ethernet.h>
53 #include <net/if.h>
54 #include <netinet/arp.h>
55 #include <inet/arp.h>
56 #include <sys/varargs.h>
57 #include <sys/machsystm.h>
58 #include <sys/modctl.h>
59 #include <sys/modhash.h>
60 #include <sys/mac.h>
61 #include <sys/mac_ether.h>
62 #include <sys/taskq.h>
63 #include <sys/note.h>
64 #include <sys/mach_descrip.h>
65 #include <sys/mac.h>
66 #include <sys/mdeg.h>
67 #include <sys/vsw.h>
68 
69 /* MAC Ring table functions. */
70 static void vsw_mac_ring_tbl_init(vsw_t *vswp);
71 static void vsw_mac_ring_tbl_destroy(vsw_t *vswp);
72 static void vsw_queue_worker(vsw_mac_ring_t *rrp);
73 static void vsw_queue_stop(vsw_queue_t *vqp);
74 static vsw_queue_t *vsw_queue_create();
75 static void vsw_queue_destroy(vsw_queue_t *vqp);
76 static void vsw_rx_queue_cb(void *, mac_resource_handle_t, mblk_t *);
77 static void vsw_rx_cb(void *, mac_resource_handle_t, mblk_t *);
78 
79 /* MAC layer routines */
80 static mac_resource_handle_t vsw_mac_ring_add_cb(void *arg,
81 		mac_resource_t *mrp);
82 static	int vsw_set_hw_addr(vsw_t *, mac_multi_addr_t *);
83 static	int vsw_set_hw_promisc(vsw_t *, vsw_port_t *, int);
84 static	int vsw_unset_hw_addr(vsw_t *, int);
85 static	int vsw_unset_hw_promisc(vsw_t *, vsw_port_t *, int);
86 static int vsw_prog_if(vsw_t *);
87 
88 /* Support functions */
89 static int vsw_prog_ports(vsw_t *);
90 int vsw_set_hw(vsw_t *, vsw_port_t *, int);
91 int vsw_unset_hw(vsw_t *, vsw_port_t *, int);
92 void vsw_reconfig_hw(vsw_t *);
93 int vsw_mac_attach(vsw_t *vswp);
94 void vsw_mac_detach(vsw_t *vswp);
95 int vsw_mac_open(vsw_t *vswp);
96 void vsw_mac_close(vsw_t *vswp);
97 void vsw_unset_addrs(vsw_t *vswp);
98 void vsw_set_addrs(vsw_t *vswp);
99 int vsw_get_hw_maddr(vsw_t *);
100 mblk_t *vsw_tx_msg(vsw_t *, mblk_t *);
101 void vsw_publish_macaddr(vsw_t *vswp, uint8_t *addr);
102 
103 /*
104  * Tunables used in this file.
105  */
106 extern int vsw_mac_open_retries;
107 extern boolean_t vsw_multi_ring_enable;
108 extern int vsw_mac_rx_rings;
109 extern uint32_t vsw_publish_macaddr_count;
110 
111 /*
112  * Check to see if the card supports the setting of multiple unicst
113  * addresses.
114  *
115  * Returns 0 if card supports the programming of multiple unicast addresses,
116  * otherwise returns 1.
117  */
118 int
119 vsw_get_hw_maddr(vsw_t *vswp)
120 {
121 	D1(vswp, "%s: enter", __func__);
122 
123 	ASSERT(RW_LOCK_HELD(&vswp->mac_rwlock));
124 
125 	if (vswp->mh == NULL)
126 		return (1);
127 
128 	if (!mac_capab_get(vswp->mh, MAC_CAPAB_MULTIADDRESS, &vswp->maddr)) {
129 		cmn_err(CE_NOTE, "!vsw%d: device (%s) does not support "
130 		    "programming multiple addresses", vswp->instance,
131 		    vswp->physname);
132 		return (1);
133 	}
134 
135 	D2(vswp, "%s: %d addrs : %d free", __func__,
136 	    vswp->maddr.maddr_naddr, vswp->maddr.maddr_naddrfree);
137 
138 	D1(vswp, "%s: exit", __func__);
139 
140 	return (0);
141 }
142 
143 /*
144  * Program unicast and multicast addresses of vsw interface and the ports
145  * into the physical device.
146  */
147 void
148 vsw_set_addrs(vsw_t *vswp)
149 {
150 	vsw_port_list_t	*plist = &vswp->plist;
151 	vsw_port_t	*port;
152 	mcst_addr_t	*mcap;
153 	int		rv;
154 
155 	READ_ENTER(&vswp->if_lockrw);
156 
157 	if (vswp->if_state & VSW_IF_UP) {
158 
159 		/* program unicst addr of vsw interface in the physdev */
160 		if (vswp->addr_set == VSW_ADDR_UNSET) {
161 			mutex_enter(&vswp->hw_lock);
162 			rv = vsw_set_hw(vswp, NULL, VSW_LOCALDEV);
163 			mutex_exit(&vswp->hw_lock);
164 			if (rv != 0) {
165 				cmn_err(CE_NOTE,
166 				    "!vsw%d: failed to program interface "
167 				    "unicast address\n", vswp->instance);
168 			}
169 			/*
170 			 * Notify the MAC layer of the changed address.
171 			 */
172 			mac_unicst_update(vswp->if_mh,
173 			    (uint8_t *)&vswp->if_addr);
174 		}
175 
176 		/* program mcast addrs of vsw interface in the physdev */
177 		mutex_enter(&vswp->mca_lock);
178 		WRITE_ENTER(&vswp->mac_rwlock);
179 		for (mcap = vswp->mcap; mcap != NULL; mcap = mcap->nextp) {
180 			if (mcap->mac_added)
181 				continue;
182 			rv = mac_multicst_add(vswp->mh, (uchar_t *)&mcap->mca);
183 			if (rv == 0) {
184 				mcap->mac_added = B_TRUE;
185 			} else {
186 				cmn_err(CE_NOTE, "!vsw%d: unable to add "
187 				    "multicast address: %s\n", vswp->instance,
188 				    ether_sprintf((void *)&mcap->mca));
189 			}
190 		}
191 		RW_EXIT(&vswp->mac_rwlock);
192 		mutex_exit(&vswp->mca_lock);
193 
194 	}
195 
196 	RW_EXIT(&vswp->if_lockrw);
197 
198 	WRITE_ENTER(&plist->lockrw);
199 
200 	/* program unicast address of ports in the physical device */
201 	mutex_enter(&vswp->hw_lock);
202 	for (port = plist->head; port != NULL; port = port->p_next) {
203 		if (port->addr_set != VSW_ADDR_UNSET) /* addr already set */
204 			continue;
205 		if (vsw_set_hw(vswp, port, VSW_VNETPORT)) {
206 			cmn_err(CE_NOTE,
207 			    "!vsw%d: port:%d failed to set unicast address\n",
208 			    vswp->instance, port->p_instance);
209 		}
210 	}
211 	mutex_exit(&vswp->hw_lock);
212 
213 	/* program multicast addresses of ports in the physdev */
214 	for (port = plist->head; port != NULL; port = port->p_next) {
215 		mutex_enter(&port->mca_lock);
216 		WRITE_ENTER(&vswp->mac_rwlock);
217 		for (mcap = port->mcap; mcap != NULL; mcap = mcap->nextp) {
218 			if (mcap->mac_added)
219 				continue;
220 			rv = mac_multicst_add(vswp->mh, (uchar_t *)&mcap->mca);
221 			if (rv == 0) {
222 				mcap->mac_added = B_TRUE;
223 			} else {
224 				cmn_err(CE_NOTE, "!vsw%d: unable to add "
225 				    "multicast address: %s\n", vswp->instance,
226 				    ether_sprintf((void *)&mcap->mca));
227 			}
228 		}
229 		RW_EXIT(&vswp->mac_rwlock);
230 		mutex_exit(&port->mca_lock);
231 	}
232 
233 	/* announce macaddr of vnets to the physical switch */
234 	if (vsw_publish_macaddr_count != 0) {	/* enabled */
235 		for (port = plist->head; port != NULL; port = port->p_next) {
236 			vsw_publish_macaddr(vswp, (uint8_t *)&port->p_macaddr);
237 		}
238 	}
239 
240 	RW_EXIT(&plist->lockrw);
241 }
242 
243 /*
244  * Remove unicast and multicast addresses of vsw interface and the ports
245  * from the physical device.
246  */
247 void
248 vsw_unset_addrs(vsw_t *vswp)
249 {
250 	vsw_port_list_t	*plist = &vswp->plist;
251 	vsw_port_t	*port;
252 	mcst_addr_t	*mcap;
253 
254 	READ_ENTER(&vswp->if_lockrw);
255 
256 	if (vswp->if_state & VSW_IF_UP) {
257 
258 		/*
259 		 * Remove unicast addr of vsw interfce
260 		 * from current physdev
261 		 */
262 		mutex_enter(&vswp->hw_lock);
263 		(void) vsw_unset_hw(vswp, NULL, VSW_LOCALDEV);
264 		mutex_exit(&vswp->hw_lock);
265 
266 		/*
267 		 * Remove mcast addrs of vsw interface
268 		 * from current physdev
269 		 */
270 		mutex_enter(&vswp->mca_lock);
271 		WRITE_ENTER(&vswp->mac_rwlock);
272 		for (mcap = vswp->mcap; mcap != NULL; mcap = mcap->nextp) {
273 			if (!mcap->mac_added)
274 				continue;
275 			(void) mac_multicst_remove(vswp->mh,
276 			    (uchar_t *)&mcap->mca);
277 			mcap->mac_added = B_FALSE;
278 		}
279 		RW_EXIT(&vswp->mac_rwlock);
280 		mutex_exit(&vswp->mca_lock);
281 
282 	}
283 
284 	RW_EXIT(&vswp->if_lockrw);
285 
286 	WRITE_ENTER(&plist->lockrw);
287 
288 	/*
289 	 * Remove unicast address of ports from the current physical device
290 	 */
291 	mutex_enter(&vswp->hw_lock);
292 	for (port = plist->head; port != NULL; port = port->p_next) {
293 		/* Remove address if was programmed into HW. */
294 		if (port->addr_set == VSW_ADDR_UNSET)
295 			continue;
296 		(void) vsw_unset_hw(vswp, port, VSW_VNETPORT);
297 	}
298 	mutex_exit(&vswp->hw_lock);
299 
300 	/* Remove multicast addresses of ports from the current physdev */
301 	for (port = plist->head; port != NULL; port = port->p_next) {
302 		mutex_enter(&port->mca_lock);
303 		WRITE_ENTER(&vswp->mac_rwlock);
304 		for (mcap = port->mcap; mcap != NULL; mcap = mcap->nextp) {
305 			if (!mcap->mac_added)
306 				continue;
307 			(void) mac_multicst_remove(vswp->mh,
308 			    (uchar_t *)&mcap->mca);
309 			mcap->mac_added = B_FALSE;
310 		}
311 		RW_EXIT(&vswp->mac_rwlock);
312 		mutex_exit(&port->mca_lock);
313 	}
314 
315 	RW_EXIT(&plist->lockrw);
316 }
317 
318 /*
319  * Open the underlying physical device for access in layer2 mode.
320  * Returns:
321  * 0 on success
322  * EAGAIN if mac_open() fails due to the device being not available yet.
323  * EIO on any other failures.
324  */
325 int
326 vsw_mac_open(vsw_t *vswp)
327 {
328 	int	rv;
329 
330 	ASSERT(RW_LOCK_HELD(&vswp->mac_rwlock));
331 
332 	if (vswp->mh != NULL) {
333 		/* already open */
334 		return (0);
335 	}
336 
337 	if (vswp->mac_open_retries++ >= vsw_mac_open_retries) {
338 		/* exceeded max retries */
339 		return (EIO);
340 	}
341 
342 	if ((rv = mac_open_by_linkname(vswp->physname, &vswp->mh)) != 0) {
343 		/*
344 		 * If mac_open() failed and the error indicates that either
345 		 * the dlmgmtd door or the device is not available yet, we
346 		 * return EAGAIN to indicate that mac_open() needs to be
347 		 * retried. For example, this may happen during boot up, if
348 		 * the required link aggregation groups(devices) have not
349 		 * been created yet.
350 		 */
351 		if (rv == ENOENT || rv == EBADF) {
352 			return (EAGAIN);
353 		} else {
354 			cmn_err(CE_WARN, "vsw%d: device (%s) open failed rv:%x",
355 			    vswp->instance, vswp->physname, rv);
356 			return (EIO);
357 		}
358 	}
359 
360 	vswp->mac_open_retries = 0;
361 
362 	return (0);
363 }
364 
365 /*
366  * Close the underlying physical device.
367  */
368 void
369 vsw_mac_close(vsw_t *vswp)
370 {
371 	ASSERT(RW_LOCK_HELD(&vswp->mac_rwlock));
372 
373 	if (vswp->mh != NULL) {
374 		mac_close(vswp->mh);
375 		vswp->mh = NULL;
376 	}
377 }
378 
379 /*
380  * Link into the MAC layer to gain access to the services provided by
381  * the underlying physical device driver (which should also have
382  * registered with the MAC layer).
383  *
384  * Only when in layer 2 mode.
385  */
386 int
387 vsw_mac_attach(vsw_t *vswp)
388 {
389 	D1(vswp, "%s: enter", __func__);
390 
391 	ASSERT(vswp->mrh == NULL);
392 	ASSERT(vswp->mstarted == B_FALSE);
393 	ASSERT(vswp->mresources == B_FALSE);
394 
395 	ASSERT(RW_LOCK_HELD(&vswp->mac_rwlock));
396 
397 	ASSERT(vswp->mh != NULL);
398 
399 	D2(vswp, "vsw_mac_attach: using device %s", vswp->physname);
400 
401 	if (vsw_multi_ring_enable) {
402 		/*
403 		 * Initialize the ring table.
404 		 */
405 		vsw_mac_ring_tbl_init(vswp);
406 
407 		/*
408 		 * Register our rx callback function.
409 		 */
410 		vswp->mrh = mac_rx_add(vswp->mh,
411 		    vsw_rx_queue_cb, (void *)vswp);
412 		ASSERT(vswp->mrh != NULL);
413 
414 		/*
415 		 * Register our mac resource callback.
416 		 */
417 		mac_resource_set(vswp->mh, vsw_mac_ring_add_cb, (void *)vswp);
418 		vswp->mresources = B_TRUE;
419 
420 		/*
421 		 * Get the ring resources available to us from
422 		 * the mac below us.
423 		 */
424 		mac_resources(vswp->mh);
425 	} else {
426 		/*
427 		 * Just register our rx callback function
428 		 */
429 		vswp->mrh = mac_rx_add(vswp->mh, vsw_rx_cb, (void *)vswp);
430 		ASSERT(vswp->mrh != NULL);
431 	}
432 
433 	/* Get the MAC tx fn */
434 	vswp->txinfo = mac_tx_get(vswp->mh);
435 
436 	/* start the interface */
437 	if (mac_start(vswp->mh) != 0) {
438 		cmn_err(CE_WARN, "!vsw%d: Could not start mac interface",
439 		    vswp->instance);
440 		goto mac_fail_exit;
441 	}
442 
443 	vswp->mstarted = B_TRUE;
444 
445 	D1(vswp, "%s: exit", __func__);
446 	return (0);
447 
448 mac_fail_exit:
449 	vsw_mac_detach(vswp);
450 
451 	D1(vswp, "%s: exit", __func__);
452 	return (1);
453 }
454 
455 void
456 vsw_mac_detach(vsw_t *vswp)
457 {
458 	D1(vswp, "vsw_mac_detach: enter");
459 
460 	ASSERT(vswp != NULL);
461 	ASSERT(RW_LOCK_HELD(&vswp->mac_rwlock));
462 
463 	if (vsw_multi_ring_enable) {
464 		vsw_mac_ring_tbl_destroy(vswp);
465 	}
466 
467 	if (vswp->mh != NULL) {
468 		if (vswp->mstarted)
469 			mac_stop(vswp->mh);
470 		if (vswp->mrh != NULL)
471 			mac_rx_remove(vswp->mh, vswp->mrh, B_TRUE);
472 		if (vswp->mresources)
473 			mac_resource_set(vswp->mh, NULL, NULL);
474 	}
475 
476 	vswp->mrh = NULL;
477 	vswp->txinfo = NULL;
478 	vswp->mstarted = B_FALSE;
479 
480 	D1(vswp, "vsw_mac_detach: exit");
481 }
482 
483 /*
484  * Depending on the mode specified, the capabilites and capacity
485  * of the underlying device setup the physical device.
486  *
487  * If in layer 3 mode, then do nothing.
488  *
489  * If in layer 2 programmed mode attempt to program the unicast address
490  * associated with the port into the physical device. If this is not
491  * possible due to resource exhaustion or simply because the device does
492  * not support multiple unicast addresses then if required fallback onto
493  * putting the card into promisc mode.
494  *
495  * If in promisc mode then simply set the card into promisc mode.
496  *
497  * Returns 0 success, 1 on failure.
498  */
499 int
500 vsw_set_hw(vsw_t *vswp, vsw_port_t *port, int type)
501 {
502 	mac_multi_addr_t	mac_addr;
503 	int			err;
504 
505 	D1(vswp, "%s: enter", __func__);
506 
507 	ASSERT(MUTEX_HELD(&vswp->hw_lock));
508 	ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));
509 
510 	if (vswp->smode[vswp->smode_idx] == VSW_LAYER3)
511 		return (0);
512 
513 	if (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC) {
514 		return (vsw_set_hw_promisc(vswp, port, type));
515 	}
516 
517 	/*
518 	 * Attempt to program the unicast address into the HW.
519 	 */
520 	mac_addr.mma_addrlen = ETHERADDRL;
521 	if (type == VSW_VNETPORT) {
522 		ASSERT(port != NULL);
523 		ether_copy(&port->p_macaddr, &mac_addr.mma_addr);
524 	} else {
525 		ether_copy(&vswp->if_addr, &mac_addr.mma_addr);
526 	}
527 
528 	err = vsw_set_hw_addr(vswp, &mac_addr);
529 	if (err == ENOSPC) {
530 		/*
531 		 * Mark that attempt should be made to re-config sometime
532 		 * in future if a port is deleted.
533 		 */
534 		vswp->recfg_reqd = B_TRUE;
535 
536 		/*
537 		 * Only 1 mode specified, nothing more to do.
538 		 */
539 		if (vswp->smode_num == 1)
540 			return (err);
541 
542 		/*
543 		 * If promiscuous was next mode specified try to
544 		 * set the card into that mode.
545 		 */
546 		if ((vswp->smode_idx <= (vswp->smode_num - 2)) &&
547 		    (vswp->smode[vswp->smode_idx + 1] ==
548 		    VSW_LAYER2_PROMISC)) {
549 			vswp->smode_idx += 1;
550 			return (vsw_set_hw_promisc(vswp, port, type));
551 		}
552 		return (err);
553 	}
554 
555 	if (err != 0)
556 		return (err);
557 
558 	if (type == VSW_VNETPORT) {
559 		port->addr_slot = mac_addr.mma_slot;
560 		port->addr_set = VSW_ADDR_HW;
561 	} else {
562 		vswp->addr_slot = mac_addr.mma_slot;
563 		vswp->addr_set = VSW_ADDR_HW;
564 	}
565 
566 	D2(vswp, "programmed addr %s into slot %d "
567 	"of device %s", ether_sprintf((void *)mac_addr.mma_addr),
568 	    mac_addr.mma_slot, vswp->physname);
569 
570 	D1(vswp, "%s: exit", __func__);
571 
572 	return (0);
573 }
574 
575 /*
576  * If in layer 3 mode do nothing.
577  *
578  * If in layer 2 switched mode remove the address from the physical
579  * device.
580  *
581  * If in layer 2 promiscuous mode disable promisc mode.
582  *
583  * Returns 0 on success.
584  */
585 int
586 vsw_unset_hw(vsw_t *vswp, vsw_port_t *port, int type)
587 {
588 	mac_addr_slot_t	slot;
589 	int		rv;
590 
591 	D1(vswp, "%s: enter", __func__);
592 
593 	ASSERT(MUTEX_HELD(&vswp->hw_lock));
594 
595 	if (vswp->smode[vswp->smode_idx] == VSW_LAYER3)
596 		return (0);
597 
598 	switch (type) {
599 	case VSW_VNETPORT:
600 		ASSERT(port != NULL);
601 
602 		if (port->addr_set == VSW_ADDR_PROMISC) {
603 			return (vsw_unset_hw_promisc(vswp, port, type));
604 
605 		} else if (port->addr_set == VSW_ADDR_HW) {
606 			slot = port->addr_slot;
607 			if ((rv = vsw_unset_hw_addr(vswp, slot)) == 0)
608 				port->addr_set = VSW_ADDR_UNSET;
609 		}
610 
611 		break;
612 
613 	case VSW_LOCALDEV:
614 		if (vswp->addr_set == VSW_ADDR_PROMISC) {
615 			return (vsw_unset_hw_promisc(vswp, NULL, type));
616 
617 		} else if (vswp->addr_set == VSW_ADDR_HW) {
618 			slot = vswp->addr_slot;
619 			if ((rv = vsw_unset_hw_addr(vswp, slot)) == 0)
620 				vswp->addr_set = VSW_ADDR_UNSET;
621 		}
622 
623 		break;
624 
625 	default:
626 		/* should never happen */
627 		DERR(vswp, "%s: unknown type %d", __func__, type);
628 		ASSERT(0);
629 		return (1);
630 	}
631 
632 	D1(vswp, "%s: exit", __func__);
633 	return (rv);
634 }
635 
636 /*
637  * Attempt to program a unicast address into HW.
638  *
639  * Returns 0 on sucess, 1 on failure.
640  */
641 static int
642 vsw_set_hw_addr(vsw_t *vswp, mac_multi_addr_t *mac)
643 {
644 	void	*mah;
645 	int	rv = EINVAL;
646 
647 	D1(vswp, "%s: enter", __func__);
648 
649 	ASSERT(MUTEX_HELD(&vswp->hw_lock));
650 
651 	if (vswp->maddr.maddr_handle == NULL)
652 		return (rv);
653 
654 	mah = vswp->maddr.maddr_handle;
655 
656 	rv = vswp->maddr.maddr_add(mah, mac);
657 
658 	if (rv == 0)
659 		return (rv);
660 
661 	/*
662 	 * Its okay for the add to fail because we have exhausted
663 	 * all the resouces in the hardware device. Any other error
664 	 * we want to flag.
665 	 */
666 	if (rv != ENOSPC) {
667 		cmn_err(CE_NOTE, "!vsw%d: error programming "
668 		    "address %s into HW err (%d)",
669 		    vswp->instance, ether_sprintf((void *)mac->mma_addr), rv);
670 	}
671 	D1(vswp, "%s: exit", __func__);
672 	return (rv);
673 }
674 
675 /*
676  * Remove a unicast mac address which has previously been programmed
677  * into HW.
678  *
679  * Returns 0 on sucess, 1 on failure.
680  */
681 static int
682 vsw_unset_hw_addr(vsw_t *vswp, int slot)
683 {
684 	void	*mah;
685 	int	rv;
686 
687 	D1(vswp, "%s: enter", __func__);
688 
689 	ASSERT(MUTEX_HELD(&vswp->hw_lock));
690 	ASSERT(slot >= 0);
691 
692 	if (vswp->maddr.maddr_handle == NULL)
693 		return (1);
694 
695 	mah = vswp->maddr.maddr_handle;
696 
697 	rv = vswp->maddr.maddr_remove(mah, slot);
698 	if (rv != 0) {
699 		DWARN(vswp, "%s: unable to remove address "
700 		    "from slot %d in device %s (err %d)",
701 		    __func__, slot, vswp->physname, rv);
702 		return (1);
703 	}
704 
705 	D2(vswp, "removed addr from slot %d in device %s",
706 	    slot, vswp->physname);
707 
708 	D1(vswp, "%s: exit", __func__);
709 	return (0);
710 }
711 
712 /*
713  * Set network card into promisc mode.
714  *
715  * Returns 0 on success, 1 on failure.
716  */
717 static int
718 vsw_set_hw_promisc(vsw_t *vswp, vsw_port_t *port, int type)
719 {
720 	D1(vswp, "%s: enter", __func__);
721 
722 	ASSERT(MUTEX_HELD(&vswp->hw_lock));
723 	ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));
724 
725 	WRITE_ENTER(&vswp->mac_rwlock);
726 	if (vswp->mh == NULL) {
727 		RW_EXIT(&vswp->mac_rwlock);
728 		return (1);
729 	}
730 
731 	if (vswp->promisc_cnt++ == 0) {
732 		if (mac_promisc_set(vswp->mh, B_TRUE, MAC_DEVPROMISC) != 0) {
733 			vswp->promisc_cnt--;
734 			RW_EXIT(&vswp->mac_rwlock);
735 			return (1);
736 		}
737 		cmn_err(CE_NOTE, "!vsw%d: switching device %s into "
738 		    "promiscuous mode", vswp->instance, vswp->physname);
739 	}
740 	RW_EXIT(&vswp->mac_rwlock);
741 
742 	if (type == VSW_VNETPORT) {
743 		ASSERT(port != NULL);
744 		port->addr_set = VSW_ADDR_PROMISC;
745 	} else {
746 		vswp->addr_set = VSW_ADDR_PROMISC;
747 	}
748 
749 	D1(vswp, "%s: exit", __func__);
750 
751 	return (0);
752 }
753 
754 /*
755  * Turn off promiscuous mode on network card.
756  *
757  * Returns 0 on success, 1 on failure.
758  */
759 static int
760 vsw_unset_hw_promisc(vsw_t *vswp, vsw_port_t *port, int type)
761 {
762 	vsw_port_list_t 	*plist = &vswp->plist;
763 
764 	D2(vswp, "%s: enter", __func__);
765 
766 	ASSERT(MUTEX_HELD(&vswp->hw_lock));
767 	ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));
768 
769 	WRITE_ENTER(&vswp->mac_rwlock);
770 	if (vswp->mh == NULL) {
771 		RW_EXIT(&vswp->mac_rwlock);
772 		return (1);
773 	}
774 
775 	if (--vswp->promisc_cnt == 0) {
776 		if (mac_promisc_set(vswp->mh, B_FALSE, MAC_DEVPROMISC) != 0) {
777 			vswp->promisc_cnt++;
778 			RW_EXIT(&vswp->mac_rwlock);
779 			return (1);
780 		}
781 
782 		/*
783 		 * We are exiting promisc mode either because we were
784 		 * only in promisc mode because we had failed over from
785 		 * switched mode due to HW resource issues, or the user
786 		 * wanted the card in promisc mode for all the ports and
787 		 * the last port is now being deleted. Tweak the message
788 		 * accordingly.
789 		 */
790 		if (plist->num_ports != 0) {
791 			cmn_err(CE_NOTE, "!vsw%d: switching device %s back to "
792 			    "programmed mode", vswp->instance, vswp->physname);
793 		} else {
794 			cmn_err(CE_NOTE, "!vsw%d: switching device %s out of "
795 			    "promiscuous mode", vswp->instance, vswp->physname);
796 		}
797 	}
798 	RW_EXIT(&vswp->mac_rwlock);
799 
800 	if (type == VSW_VNETPORT) {
801 		ASSERT(port != NULL);
802 		ASSERT(port->addr_set == VSW_ADDR_PROMISC);
803 		port->addr_set = VSW_ADDR_UNSET;
804 	} else {
805 		ASSERT(vswp->addr_set == VSW_ADDR_PROMISC);
806 		vswp->addr_set = VSW_ADDR_UNSET;
807 	}
808 
809 	D1(vswp, "%s: exit", __func__);
810 	return (0);
811 }
812 
813 /*
814  * Determine whether or not we are operating in our prefered
815  * mode and if not whether the physical resources now allow us
816  * to operate in it.
817  *
818  * If a port is being removed should only be invoked after port has been
819  * removed from the port list.
820  */
821 void
822 vsw_reconfig_hw(vsw_t *vswp)
823 {
824 	int			s_idx;
825 
826 	D1(vswp, "%s: enter", __func__);
827 
828 	ASSERT(MUTEX_HELD(&vswp->hw_lock));
829 
830 	if (vswp->maddr.maddr_handle == NULL) {
831 		return;
832 	}
833 
834 	/*
835 	 * If we are in layer 2 (i.e. switched) or would like to be
836 	 * in layer 2 then check if any ports or the vswitch itself
837 	 * need to be programmed into the HW.
838 	 *
839 	 * This can happen in two cases - switched was specified as
840 	 * the prefered mode of operation but we exhausted the HW
841 	 * resources and so failed over to the next specifed mode,
842 	 * or switched was the only mode specified so after HW
843 	 * resources were exhausted there was nothing more we
844 	 * could do.
845 	 */
846 	if (vswp->smode_idx > 0)
847 		s_idx = vswp->smode_idx - 1;
848 	else
849 		s_idx = vswp->smode_idx;
850 
851 	if (vswp->smode[s_idx] != VSW_LAYER2) {
852 		return;
853 	}
854 
855 	D2(vswp, "%s: attempting reconfig..", __func__);
856 
857 	/*
858 	 * First, attempt to set the vswitch mac address into HW,
859 	 * if required.
860 	 */
861 	if (vsw_prog_if(vswp)) {
862 		return;
863 	}
864 
865 	/*
866 	 * Next, attempt to set any ports which have not yet been
867 	 * programmed into HW.
868 	 */
869 	if (vsw_prog_ports(vswp)) {
870 		return;
871 	}
872 
873 	/*
874 	 * By now we know that have programmed all desired ports etc
875 	 * into HW, so safe to mark reconfiguration as complete.
876 	 */
877 	vswp->recfg_reqd = B_FALSE;
878 
879 	vswp->smode_idx = s_idx;
880 
881 	D1(vswp, "%s: exit", __func__);
882 }
883 
884 /*
885  * Check to see if vsw itself is plumbed, and if so whether or not
886  * its mac address should be written into HW.
887  *
888  * Returns 0 if could set address, or didn't have to set it.
889  * Returns 1 if failed to set address.
890  */
891 static int
892 vsw_prog_if(vsw_t *vswp)
893 {
894 	mac_multi_addr_t	addr;
895 
896 	D1(vswp, "%s: enter", __func__);
897 
898 	ASSERT(MUTEX_HELD(&vswp->hw_lock));
899 
900 	READ_ENTER(&vswp->if_lockrw);
901 	if ((vswp->if_state & VSW_IF_UP) &&
902 	    (vswp->addr_set != VSW_ADDR_HW)) {
903 
904 		addr.mma_addrlen = ETHERADDRL;
905 		ether_copy(&vswp->if_addr, &addr.mma_addr);
906 
907 		if (vsw_set_hw_addr(vswp, &addr) != 0) {
908 			RW_EXIT(&vswp->if_lockrw);
909 			return (1);
910 		}
911 
912 		vswp->addr_slot = addr.mma_slot;
913 
914 		/*
915 		 * If previously when plumbed had had to place
916 		 * interface into promisc mode, now reverse that.
917 		 *
918 		 * Note that interface will only actually be set into
919 		 * non-promisc mode when last port/interface has been
920 		 * programmed into HW.
921 		 */
922 		if (vswp->addr_set == VSW_ADDR_PROMISC)
923 			(void) vsw_unset_hw_promisc(vswp, NULL, VSW_LOCALDEV);
924 
925 		vswp->addr_set = VSW_ADDR_HW;
926 	}
927 	RW_EXIT(&vswp->if_lockrw);
928 
929 	D1(vswp, "%s: exit", __func__);
930 	return (0);
931 }
932 
933 /*
934  * Scan the port list for any ports which have not yet been set
935  * into HW. For those found attempt to program their mac addresses
936  * into the physical device.
937  *
938  * Returns 0 if able to program all required ports (can be 0) into HW.
939  * Returns 1 if failed to set at least one mac address.
940  */
941 static int
942 vsw_prog_ports(vsw_t *vswp)
943 {
944 	mac_multi_addr_t	addr;
945 	vsw_port_list_t		*plist = &vswp->plist;
946 	vsw_port_t		*tp;
947 	int			rv = 0;
948 
949 	D1(vswp, "%s: enter", __func__);
950 
951 	ASSERT(MUTEX_HELD(&vswp->hw_lock));
952 
953 	READ_ENTER(&plist->lockrw);
954 	for (tp = plist->head; tp != NULL; tp = tp->p_next) {
955 		if (tp->addr_set != VSW_ADDR_HW) {
956 			addr.mma_addrlen = ETHERADDRL;
957 			ether_copy(&tp->p_macaddr, &addr.mma_addr);
958 
959 			if (vsw_set_hw_addr(vswp, &addr) != 0) {
960 				rv = 1;
961 				break;
962 			}
963 
964 			tp->addr_slot = addr.mma_slot;
965 
966 			/*
967 			 * If when this port had first attached we had
968 			 * had to place the interface into promisc mode,
969 			 * then now reverse that.
970 			 *
971 			 * Note that the interface will not actually
972 			 * change to non-promisc mode until all ports
973 			 * have been programmed.
974 			 */
975 			if (tp->addr_set == VSW_ADDR_PROMISC)
976 				(void) vsw_unset_hw_promisc(vswp,
977 				    tp, VSW_VNETPORT);
978 
979 			tp->addr_set = VSW_ADDR_HW;
980 		}
981 	}
982 	RW_EXIT(&plist->lockrw);
983 
984 	D1(vswp, "%s: exit", __func__);
985 	return (rv);
986 }
987 
988 static void
989 vsw_mac_ring_tbl_entry_init(vsw_t *vswp, vsw_mac_ring_t *ringp)
990 {
991 	ringp->ring_state = VSW_MAC_RING_FREE;
992 	ringp->ring_arg = NULL;
993 	ringp->ring_blank = NULL;
994 	ringp->ring_vqp = NULL;
995 	ringp->ring_vswp = vswp;
996 }
997 
998 static void
999 vsw_mac_ring_tbl_init(vsw_t *vswp)
1000 {
1001 	int		i;
1002 
1003 	mutex_init(&vswp->mac_ring_lock, NULL, MUTEX_DRIVER, NULL);
1004 
1005 	vswp->mac_ring_tbl_sz = vsw_mac_rx_rings;
1006 	vswp->mac_ring_tbl  =
1007 	    kmem_alloc(vsw_mac_rx_rings * sizeof (vsw_mac_ring_t), KM_SLEEP);
1008 
1009 	for (i = 0; i < vswp->mac_ring_tbl_sz; i++)
1010 		vsw_mac_ring_tbl_entry_init(vswp, &vswp->mac_ring_tbl[i]);
1011 }
1012 
1013 static void
1014 vsw_mac_ring_tbl_destroy(vsw_t *vswp)
1015 {
1016 	int		i;
1017 	vsw_mac_ring_t	*ringp;
1018 
1019 	mutex_enter(&vswp->mac_ring_lock);
1020 	for (i = 0; i < vswp->mac_ring_tbl_sz; i++) {
1021 		ringp = &vswp->mac_ring_tbl[i];
1022 
1023 		if (ringp->ring_state != VSW_MAC_RING_FREE) {
1024 			/*
1025 			 * Destroy the queue.
1026 			 */
1027 			vsw_queue_stop(ringp->ring_vqp);
1028 			vsw_queue_destroy(ringp->ring_vqp);
1029 
1030 			/*
1031 			 * Re-initialize the structure.
1032 			 */
1033 			vsw_mac_ring_tbl_entry_init(vswp, ringp);
1034 		}
1035 	}
1036 	mutex_exit(&vswp->mac_ring_lock);
1037 
1038 	mutex_destroy(&vswp->mac_ring_lock);
1039 	kmem_free(vswp->mac_ring_tbl,
1040 	    vswp->mac_ring_tbl_sz * sizeof (vsw_mac_ring_t));
1041 	vswp->mac_ring_tbl_sz = 0;
1042 }
1043 
1044 /*
1045  * Handle resource add callbacks from the driver below.
1046  */
1047 static mac_resource_handle_t
1048 vsw_mac_ring_add_cb(void *arg, mac_resource_t *mrp)
1049 {
1050 	vsw_t		*vswp = (vsw_t *)arg;
1051 	mac_rx_fifo_t	*mrfp = (mac_rx_fifo_t *)mrp;
1052 	vsw_mac_ring_t	*ringp;
1053 	vsw_queue_t	*vqp;
1054 	int		i;
1055 
1056 	ASSERT(vswp != NULL);
1057 	ASSERT(mrp != NULL);
1058 	ASSERT(vswp->mac_ring_tbl != NULL);
1059 
1060 	D1(vswp, "%s: enter", __func__);
1061 
1062 	/*
1063 	 * Check to make sure we have the correct resource type.
1064 	 */
1065 	if (mrp->mr_type != MAC_RX_FIFO)
1066 		return (NULL);
1067 
1068 	/*
1069 	 * Find a open entry in the ring table.
1070 	 */
1071 	mutex_enter(&vswp->mac_ring_lock);
1072 	for (i = 0; i < vswp->mac_ring_tbl_sz; i++) {
1073 		ringp = &vswp->mac_ring_tbl[i];
1074 
1075 		/*
1076 		 * Check for an empty slot, if found, then setup queue
1077 		 * and thread.
1078 		 */
1079 		if (ringp->ring_state == VSW_MAC_RING_FREE) {
1080 			/*
1081 			 * Create the queue for this ring.
1082 			 */
1083 			vqp = vsw_queue_create();
1084 
1085 			/*
1086 			 * Initialize the ring data structure.
1087 			 */
1088 			ringp->ring_vqp = vqp;
1089 			ringp->ring_arg = mrfp->mrf_arg;
1090 			ringp->ring_blank = mrfp->mrf_blank;
1091 			ringp->ring_state = VSW_MAC_RING_INUSE;
1092 
1093 			/*
1094 			 * Create the worker thread.
1095 			 */
1096 			vqp->vq_worker = thread_create(NULL, 0,
1097 			    vsw_queue_worker, ringp, 0, &p0,
1098 			    TS_RUN, minclsyspri);
1099 			if (vqp->vq_worker == NULL) {
1100 				vsw_queue_destroy(vqp);
1101 				vsw_mac_ring_tbl_entry_init(vswp, ringp);
1102 				ringp = NULL;
1103 			}
1104 
1105 			if (ringp != NULL) {
1106 				/*
1107 				 * Make sure thread get's running state for
1108 				 * this ring.
1109 				 */
1110 				mutex_enter(&vqp->vq_lock);
1111 				while ((vqp->vq_state != VSW_QUEUE_RUNNING) &&
1112 				    (vqp->vq_state != VSW_QUEUE_DRAINED)) {
1113 					cv_wait(&vqp->vq_cv, &vqp->vq_lock);
1114 				}
1115 
1116 				/*
1117 				 * If the thread is not running, cleanup.
1118 				 */
1119 				if (vqp->vq_state == VSW_QUEUE_DRAINED) {
1120 					vsw_queue_destroy(vqp);
1121 					vsw_mac_ring_tbl_entry_init(vswp,
1122 					    ringp);
1123 					ringp = NULL;
1124 				}
1125 				mutex_exit(&vqp->vq_lock);
1126 			}
1127 
1128 			mutex_exit(&vswp->mac_ring_lock);
1129 			D1(vswp, "%s: exit", __func__);
1130 			return ((mac_resource_handle_t)ringp);
1131 		}
1132 	}
1133 	mutex_exit(&vswp->mac_ring_lock);
1134 
1135 	/*
1136 	 * No slots in the ring table available.
1137 	 */
1138 	D1(vswp, "%s: exit", __func__);
1139 	return (NULL);
1140 }
1141 
1142 static void
1143 vsw_queue_stop(vsw_queue_t *vqp)
1144 {
1145 	mutex_enter(&vqp->vq_lock);
1146 
1147 	if (vqp->vq_state == VSW_QUEUE_RUNNING) {
1148 		vqp->vq_state = VSW_QUEUE_STOP;
1149 		cv_signal(&vqp->vq_cv);
1150 
1151 		while (vqp->vq_state != VSW_QUEUE_DRAINED)
1152 			cv_wait(&vqp->vq_cv, &vqp->vq_lock);
1153 	}
1154 
1155 	vqp->vq_state = VSW_QUEUE_STOPPED;
1156 
1157 	mutex_exit(&vqp->vq_lock);
1158 }
1159 
1160 static vsw_queue_t *
1161 vsw_queue_create()
1162 {
1163 	vsw_queue_t *vqp;
1164 
1165 	vqp = kmem_zalloc(sizeof (vsw_queue_t), KM_SLEEP);
1166 
1167 	mutex_init(&vqp->vq_lock, NULL, MUTEX_DRIVER, NULL);
1168 	cv_init(&vqp->vq_cv, NULL, CV_DRIVER, NULL);
1169 	vqp->vq_first = NULL;
1170 	vqp->vq_last = NULL;
1171 	vqp->vq_state = VSW_QUEUE_STOPPED;
1172 
1173 	return (vqp);
1174 }
1175 
1176 static void
1177 vsw_queue_destroy(vsw_queue_t *vqp)
1178 {
1179 	cv_destroy(&vqp->vq_cv);
1180 	mutex_destroy(&vqp->vq_lock);
1181 	kmem_free(vqp, sizeof (vsw_queue_t));
1182 }
1183 
1184 static void
1185 vsw_queue_worker(vsw_mac_ring_t *rrp)
1186 {
1187 	mblk_t		*mp;
1188 	vsw_queue_t	*vqp = rrp->ring_vqp;
1189 	vsw_t		*vswp = rrp->ring_vswp;
1190 
1191 	mutex_enter(&vqp->vq_lock);
1192 
1193 	ASSERT(vqp->vq_state == VSW_QUEUE_STOPPED);
1194 
1195 	/*
1196 	 * Set the state to running, since the thread is now active.
1197 	 */
1198 	vqp->vq_state = VSW_QUEUE_RUNNING;
1199 	cv_signal(&vqp->vq_cv);
1200 
1201 	while (vqp->vq_state == VSW_QUEUE_RUNNING) {
1202 		/*
1203 		 * Wait for work to do or the state has changed
1204 		 * to not running.
1205 		 */
1206 		while ((vqp->vq_state == VSW_QUEUE_RUNNING) &&
1207 		    (vqp->vq_first == NULL)) {
1208 			cv_wait(&vqp->vq_cv, &vqp->vq_lock);
1209 		}
1210 
1211 		/*
1212 		 * Process packets that we received from the interface.
1213 		 */
1214 		if (vqp->vq_first != NULL) {
1215 			mp = vqp->vq_first;
1216 
1217 			vqp->vq_first = NULL;
1218 			vqp->vq_last = NULL;
1219 
1220 			mutex_exit(&vqp->vq_lock);
1221 
1222 			/* switch the chain of packets received */
1223 			vswp->vsw_switch_frame(vswp, mp,
1224 			    VSW_PHYSDEV, NULL, NULL);
1225 
1226 			mutex_enter(&vqp->vq_lock);
1227 		}
1228 	}
1229 
1230 	/*
1231 	 * We are drained and signal we are done.
1232 	 */
1233 	vqp->vq_state = VSW_QUEUE_DRAINED;
1234 	cv_signal(&vqp->vq_cv);
1235 
1236 	/*
1237 	 * Exit lock and drain the remaining packets.
1238 	 */
1239 	mutex_exit(&vqp->vq_lock);
1240 
1241 	/*
1242 	 * Exit the thread
1243 	 */
1244 	thread_exit();
1245 }
1246 
1247 /*
1248  * static void
1249  * vsw_rx_queue_cb() - Receive callback routine when
1250  *	vsw_multi_ring_enable is non-zero.  Queue the packets
1251  *	to a packet queue for a worker thread to process.
1252  */
1253 static void
1254 vsw_rx_queue_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp)
1255 {
1256 	vsw_mac_ring_t	*ringp = (vsw_mac_ring_t *)mrh;
1257 	vsw_t		*vswp = (vsw_t *)arg;
1258 	vsw_queue_t	*vqp;
1259 	mblk_t		*bp, *last;
1260 
1261 	ASSERT(mrh != NULL);
1262 	ASSERT(vswp != NULL);
1263 	ASSERT(mp != NULL);
1264 
1265 	D1(vswp, "%s: enter", __func__);
1266 
1267 	/*
1268 	 * Find the last element in the mblk chain.
1269 	 */
1270 	bp = mp;
1271 	do {
1272 		last = bp;
1273 		bp = bp->b_next;
1274 	} while (bp != NULL);
1275 
1276 	/* Get the queue for the packets */
1277 	vqp = ringp->ring_vqp;
1278 
1279 	/*
1280 	 * Grab the lock such we can queue the packets.
1281 	 */
1282 	mutex_enter(&vqp->vq_lock);
1283 
1284 	if (vqp->vq_state != VSW_QUEUE_RUNNING) {
1285 		freemsgchain(mp);
1286 		mutex_exit(&vqp->vq_lock);
1287 		goto vsw_rx_queue_cb_exit;
1288 	}
1289 
1290 	/*
1291 	 * Add the mblk chain to the queue.  If there
1292 	 * is some mblks in the queue, then add the new
1293 	 * chain to the end.
1294 	 */
1295 	if (vqp->vq_first == NULL)
1296 		vqp->vq_first = mp;
1297 	else
1298 		vqp->vq_last->b_next = mp;
1299 
1300 	vqp->vq_last = last;
1301 
1302 	/*
1303 	 * Signal the worker thread that there is work to
1304 	 * do.
1305 	 */
1306 	cv_signal(&vqp->vq_cv);
1307 
1308 	/*
1309 	 * Let go of the lock and exit.
1310 	 */
1311 	mutex_exit(&vqp->vq_lock);
1312 
1313 vsw_rx_queue_cb_exit:
1314 	D1(vswp, "%s: exit", __func__);
1315 }
1316 
1317 /*
1318  * receive callback routine. Invoked by MAC layer when there
1319  * are pkts being passed up from physical device.
1320  *
1321  * PERF: It may be more efficient when the card is in promisc
1322  * mode to check the dest address of the pkts here (against
1323  * the FDB) rather than checking later. Needs to be investigated.
1324  */
1325 static void
1326 vsw_rx_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp)
1327 {
1328 	_NOTE(ARGUNUSED(mrh))
1329 
1330 	vsw_t		*vswp = (vsw_t *)arg;
1331 
1332 	ASSERT(vswp != NULL);
1333 
1334 	D1(vswp, "vsw_rx_cb: enter");
1335 
1336 	/* switch the chain of packets received */
1337 	vswp->vsw_switch_frame(vswp, mp, VSW_PHYSDEV, NULL, NULL);
1338 
1339 	D1(vswp, "vsw_rx_cb: exit");
1340 }
1341 
1342 /*
1343  * Send a message out over the physical device via the MAC layer.
1344  *
1345  * Returns any mblks that it was unable to transmit.
1346  */
1347 mblk_t *
1348 vsw_tx_msg(vsw_t *vswp, mblk_t *mp)
1349 {
1350 	const mac_txinfo_t	*mtp;
1351 
1352 	READ_ENTER(&vswp->mac_rwlock);
1353 	if ((vswp->mh == NULL) || (vswp->mstarted == B_FALSE)) {
1354 
1355 		DERR(vswp, "vsw_tx_msg: dropping pkts: no tx routine avail");
1356 		RW_EXIT(&vswp->mac_rwlock);
1357 		return (mp);
1358 	} else {
1359 		mtp = vswp->txinfo;
1360 		mp = mtp->mt_fn(mtp->mt_arg, mp);
1361 	}
1362 	RW_EXIT(&vswp->mac_rwlock);
1363 
1364 	return (mp);
1365 }
1366 
1367 #define	ARH_FIXED_LEN	8    /* Length of fixed part of ARP header(see arp.h) */
1368 
1369 /*
1370  * Send a gratuitous RARP packet to notify the physical switch to update its
1371  * Layer2 forwarding table for the given mac address. This is done to allow the
1372  * switch to quickly learn the macaddr-port association when a guest is live
1373  * migrated or when vsw's physical device is changed dynamically. Any protocol
1374  * packet would serve this purpose, but we choose RARP, as it allows us to
1375  * accomplish this within L2 (ie, no need to specify IP addr etc in the packet)
1376  * The macaddr of vnet is retained across migration. Hence, we don't need to
1377  * update the arp cache of other hosts within the broadcast domain. Note that
1378  * it is harmless to send these RARP packets during normal port attach of a
1379  * client vnet. This can can be turned off if needed, by setting
1380  * vsw_publish_macaddr_count to zero in /etc/system.
1381  */
1382 void
1383 vsw_publish_macaddr(vsw_t *vswp, uint8_t *addr)
1384 {
1385 	mblk_t			*mp;
1386 	mblk_t			*bp;
1387 	struct arphdr		*arh;
1388 	struct	ether_header 	*ehp;
1389 	int			count = 0;
1390 	int			plen = 4;
1391 	uint8_t			*cp;
1392 
1393 	mp = allocb(ETHERMIN, BPRI_MED);
1394 	if (mp == NULL) {
1395 		return;
1396 	}
1397 
1398 	/* Initialize eth header */
1399 	ehp = (struct  ether_header *)mp->b_rptr;
1400 	bcopy(&etherbroadcastaddr, &ehp->ether_dhost, ETHERADDRL);
1401 	bcopy(addr, &ehp->ether_shost, ETHERADDRL);
1402 	ehp->ether_type = htons(ETHERTYPE_REVARP);
1403 
1404 	/* Initialize arp packet */
1405 	arh = (struct arphdr *)(mp->b_rptr + sizeof (struct ether_header));
1406 	cp = (uint8_t *)arh;
1407 
1408 	arh->ar_hrd = htons(ARPHRD_ETHER);	/* Hardware type:  ethernet */
1409 	arh->ar_pro = htons(ETHERTYPE_IP);	/* Protocol type:  IP */
1410 	arh->ar_hln = ETHERADDRL;	/* Length of hardware address:  6 */
1411 	arh->ar_pln = plen;		/* Length of protocol address:  4 */
1412 	arh->ar_op = htons(REVARP_REQUEST);	/* Opcode: REVARP Request */
1413 
1414 	cp += ARH_FIXED_LEN;
1415 
1416 	/* Sender's hardware address and protocol address */
1417 	bcopy(addr, cp, ETHERADDRL);
1418 	cp += ETHERADDRL;
1419 	bzero(cp, plen);	/* INADDR_ANY */
1420 	cp += plen;
1421 
1422 	/* Target hardware address and protocol address */
1423 	bcopy(addr, cp, ETHERADDRL);
1424 	cp += ETHERADDRL;
1425 	bzero(cp, plen);	/* INADDR_ANY */
1426 	cp += plen;
1427 
1428 	mp->b_wptr += ETHERMIN;	/* total size is 42; round up to ETHERMIN */
1429 
1430 	for (count = 0; count < vsw_publish_macaddr_count; count++) {
1431 
1432 		bp = dupmsg(mp);
1433 		if (bp == NULL) {
1434 			continue;
1435 		}
1436 
1437 		/* transmit the packet */
1438 		bp = vsw_tx_msg(vswp, bp);
1439 		if (bp != NULL) {
1440 			freemsg(bp);
1441 		}
1442 	}
1443 
1444 	freemsg(mp);
1445 }
1446