xref: /titanic_44/usr/src/uts/sun4v/io/vsw_phys.c (revision 3f1e69bef33050bee99ea1e9992af13fc467281f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/types.h>
28 #include <sys/errno.h>
29 #include <sys/debug.h>
30 #include <sys/time.h>
31 #include <sys/sysmacros.h>
32 #include <sys/systm.h>
33 #include <sys/user.h>
34 #include <sys/stropts.h>
35 #include <sys/stream.h>
36 #include <sys/strlog.h>
37 #include <sys/strsubr.h>
38 #include <sys/cmn_err.h>
39 #include <sys/cpu.h>
40 #include <sys/kmem.h>
41 #include <sys/conf.h>
42 #include <sys/ddi.h>
43 #include <sys/sunddi.h>
44 #include <sys/ksynch.h>
45 #include <sys/stat.h>
46 #include <sys/kstat.h>
47 #include <sys/vtrace.h>
48 #include <sys/strsun.h>
49 #include <sys/dlpi.h>
50 #include <sys/ethernet.h>
51 #include <net/if.h>
52 #include <netinet/arp.h>
53 #include <inet/arp.h>
54 #include <sys/varargs.h>
55 #include <sys/machsystm.h>
56 #include <sys/modctl.h>
57 #include <sys/modhash.h>
58 #include <sys/mac.h>
59 #include <sys/mac_ether.h>
60 #include <sys/taskq.h>
61 #include <sys/note.h>
62 #include <sys/mach_descrip.h>
63 #include <sys/mac.h>
64 #include <sys/mdeg.h>
65 #include <sys/vsw.h>
66 
/*
 * MAC Ring table functions.
 *
 * Forward declarations for the receive ring table, the per-ring queues
 * and their worker threads, and the two receive callbacks that
 * vsw_mac_attach() registers through mac_rx_add().
 */
static void vsw_mac_ring_tbl_init(vsw_t *vswp);
static void vsw_mac_ring_tbl_destroy(vsw_t *vswp);
static void vsw_queue_worker(vsw_mac_ring_t *rrp);
static void vsw_queue_stop(vsw_queue_t *vqp);
static vsw_queue_t *vsw_queue_create();
static void vsw_queue_destroy(vsw_queue_t *vqp);
static void vsw_rx_queue_cb(void *, mac_resource_handle_t, mblk_t *);
static void vsw_rx_cb(void *, mac_resource_handle_t, mblk_t *);

/*
 * MAC layer routines.
 *
 * File-private helpers that program or remove unicast addresses and
 * promiscuous mode on the underlying physical device.
 */
static mac_resource_handle_t vsw_mac_ring_add_cb(void *arg,
		mac_resource_t *mrp);
static	int vsw_set_hw_addr(vsw_t *, mac_multi_addr_t *);
static	int vsw_set_hw_promisc(vsw_t *, vsw_port_t *, int);
static	int vsw_unset_hw_addr(vsw_t *, int);
static	int vsw_unset_hw_promisc(vsw_t *, vsw_port_t *, int);
static int vsw_prog_if(vsw_t *);
static	void vsw_mac_set_mtu(vsw_t *vswp, uint32_t mtu);

/*
 * Support functions.
 *
 * Apart from vsw_prog_ports() these have external linkage, so they may
 * be called from other vsw source files.
 */
static int vsw_prog_ports(vsw_t *);
int vsw_set_hw(vsw_t *, vsw_port_t *, int);
int vsw_unset_hw(vsw_t *, vsw_port_t *, int);
void vsw_reconfig_hw(vsw_t *);
int vsw_mac_attach(vsw_t *vswp);
void vsw_mac_detach(vsw_t *vswp);
int vsw_mac_open(vsw_t *vswp);
void vsw_mac_close(vsw_t *vswp);
void vsw_unset_addrs(vsw_t *vswp);
void vsw_set_addrs(vsw_t *vswp);
int vsw_get_hw_maddr(vsw_t *);
mblk_t *vsw_tx_msg(vsw_t *, mblk_t *);
void vsw_publish_macaddr(vsw_t *vswp, uint8_t *addr);

/*
 * Name of the MTU property.
 * NOTE(review): presumably consumed by vsw_mac_set_mtu(); its use is not
 * visible in this part of the file - confirm.
 */
static char mac_mtu_propname[] = "mtu";

/*
 * Tunables used in this file. They are defined elsewhere and only
 * referenced here.
 */
extern int vsw_mac_open_retries;
extern boolean_t vsw_multi_ring_enable;
extern int vsw_mac_rx_rings;
extern uint32_t vsw_publish_macaddr_count;
111 
112 /*
113  * Check to see if the card supports the setting of multiple unicst
114  * addresses.
115  *
116  * Returns 0 if card supports the programming of multiple unicast addresses,
117  * otherwise returns 1.
118  */
119 int
120 vsw_get_hw_maddr(vsw_t *vswp)
121 {
122 	D1(vswp, "%s: enter", __func__);
123 
124 	ASSERT(RW_LOCK_HELD(&vswp->mac_rwlock));
125 
126 	if (vswp->mh == NULL)
127 		return (1);
128 
129 	if (!mac_capab_get(vswp->mh, MAC_CAPAB_MULTIADDRESS, &vswp->maddr)) {
130 		cmn_err(CE_NOTE, "!vsw%d: device (%s) does not support "
131 		    "programming multiple addresses", vswp->instance,
132 		    vswp->physname);
133 		return (1);
134 	}
135 
136 	D2(vswp, "%s: %d addrs : %d free", __func__,
137 	    vswp->maddr.maddr_naddr, vswp->maddr.maddr_naddrfree);
138 
139 	D1(vswp, "%s: exit", __func__);
140 
141 	return (0);
142 }
143 
144 /*
145  * Program unicast and multicast addresses of vsw interface and the ports
146  * into the physical device.
147  */
148 void
149 vsw_set_addrs(vsw_t *vswp)
150 {
151 	vsw_port_list_t	*plist = &vswp->plist;
152 	vsw_port_t	*port;
153 	mcst_addr_t	*mcap;
154 	int		rv;
155 
156 	READ_ENTER(&vswp->if_lockrw);
157 
158 	if (vswp->if_state & VSW_IF_UP) {
159 
160 		/* program unicst addr of vsw interface in the physdev */
161 		if (vswp->addr_set == VSW_ADDR_UNSET) {
162 			mutex_enter(&vswp->hw_lock);
163 			rv = vsw_set_hw(vswp, NULL, VSW_LOCALDEV);
164 			mutex_exit(&vswp->hw_lock);
165 			if (rv != 0) {
166 				cmn_err(CE_NOTE,
167 				    "!vsw%d: failed to program interface "
168 				    "unicast address\n", vswp->instance);
169 			}
170 			/*
171 			 * Notify the MAC layer of the changed address.
172 			 */
173 			mac_unicst_update(vswp->if_mh,
174 			    (uint8_t *)&vswp->if_addr);
175 		}
176 
177 		/* program mcast addrs of vsw interface in the physdev */
178 		mutex_enter(&vswp->mca_lock);
179 		WRITE_ENTER(&vswp->mac_rwlock);
180 		for (mcap = vswp->mcap; mcap != NULL; mcap = mcap->nextp) {
181 			if (mcap->mac_added)
182 				continue;
183 			rv = mac_multicst_add(vswp->mh, (uchar_t *)&mcap->mca);
184 			if (rv == 0) {
185 				mcap->mac_added = B_TRUE;
186 			} else {
187 				cmn_err(CE_NOTE, "!vsw%d: unable to add "
188 				    "multicast address: %s\n", vswp->instance,
189 				    ether_sprintf((void *)&mcap->mca));
190 			}
191 		}
192 		RW_EXIT(&vswp->mac_rwlock);
193 		mutex_exit(&vswp->mca_lock);
194 
195 	}
196 
197 	RW_EXIT(&vswp->if_lockrw);
198 
199 	WRITE_ENTER(&plist->lockrw);
200 
201 	/* program unicast address of ports in the physical device */
202 	mutex_enter(&vswp->hw_lock);
203 	for (port = plist->head; port != NULL; port = port->p_next) {
204 		if (port->addr_set != VSW_ADDR_UNSET) /* addr already set */
205 			continue;
206 		if (vsw_set_hw(vswp, port, VSW_VNETPORT)) {
207 			cmn_err(CE_NOTE,
208 			    "!vsw%d: port:%d failed to set unicast address\n",
209 			    vswp->instance, port->p_instance);
210 		}
211 	}
212 	mutex_exit(&vswp->hw_lock);
213 
214 	/* program multicast addresses of ports in the physdev */
215 	for (port = plist->head; port != NULL; port = port->p_next) {
216 		mutex_enter(&port->mca_lock);
217 		WRITE_ENTER(&vswp->mac_rwlock);
218 		for (mcap = port->mcap; mcap != NULL; mcap = mcap->nextp) {
219 			if (mcap->mac_added)
220 				continue;
221 			rv = mac_multicst_add(vswp->mh, (uchar_t *)&mcap->mca);
222 			if (rv == 0) {
223 				mcap->mac_added = B_TRUE;
224 			} else {
225 				cmn_err(CE_NOTE, "!vsw%d: unable to add "
226 				    "multicast address: %s\n", vswp->instance,
227 				    ether_sprintf((void *)&mcap->mca));
228 			}
229 		}
230 		RW_EXIT(&vswp->mac_rwlock);
231 		mutex_exit(&port->mca_lock);
232 	}
233 
234 	/* announce macaddr of vnets to the physical switch */
235 	if (vsw_publish_macaddr_count != 0) {	/* enabled */
236 		for (port = plist->head; port != NULL; port = port->p_next) {
237 			vsw_publish_macaddr(vswp, (uint8_t *)&port->p_macaddr);
238 		}
239 	}
240 
241 	RW_EXIT(&plist->lockrw);
242 }
243 
244 /*
245  * Remove unicast and multicast addresses of vsw interface and the ports
246  * from the physical device.
247  */
248 void
249 vsw_unset_addrs(vsw_t *vswp)
250 {
251 	vsw_port_list_t	*plist = &vswp->plist;
252 	vsw_port_t	*port;
253 	mcst_addr_t	*mcap;
254 
255 	READ_ENTER(&vswp->if_lockrw);
256 
257 	if (vswp->if_state & VSW_IF_UP) {
258 
259 		/*
260 		 * Remove unicast addr of vsw interfce
261 		 * from current physdev
262 		 */
263 		mutex_enter(&vswp->hw_lock);
264 		(void) vsw_unset_hw(vswp, NULL, VSW_LOCALDEV);
265 		mutex_exit(&vswp->hw_lock);
266 
267 		/*
268 		 * Remove mcast addrs of vsw interface
269 		 * from current physdev
270 		 */
271 		mutex_enter(&vswp->mca_lock);
272 		WRITE_ENTER(&vswp->mac_rwlock);
273 		for (mcap = vswp->mcap; mcap != NULL; mcap = mcap->nextp) {
274 			if (!mcap->mac_added)
275 				continue;
276 			(void) mac_multicst_remove(vswp->mh,
277 			    (uchar_t *)&mcap->mca);
278 			mcap->mac_added = B_FALSE;
279 		}
280 		RW_EXIT(&vswp->mac_rwlock);
281 		mutex_exit(&vswp->mca_lock);
282 
283 	}
284 
285 	RW_EXIT(&vswp->if_lockrw);
286 
287 	WRITE_ENTER(&plist->lockrw);
288 
289 	/*
290 	 * Remove unicast address of ports from the current physical device
291 	 */
292 	mutex_enter(&vswp->hw_lock);
293 	for (port = plist->head; port != NULL; port = port->p_next) {
294 		/* Remove address if was programmed into HW. */
295 		if (port->addr_set == VSW_ADDR_UNSET)
296 			continue;
297 		(void) vsw_unset_hw(vswp, port, VSW_VNETPORT);
298 	}
299 	mutex_exit(&vswp->hw_lock);
300 
301 	/* Remove multicast addresses of ports from the current physdev */
302 	for (port = plist->head; port != NULL; port = port->p_next) {
303 		mutex_enter(&port->mca_lock);
304 		WRITE_ENTER(&vswp->mac_rwlock);
305 		for (mcap = port->mcap; mcap != NULL; mcap = mcap->nextp) {
306 			if (!mcap->mac_added)
307 				continue;
308 			(void) mac_multicst_remove(vswp->mh,
309 			    (uchar_t *)&mcap->mca);
310 			mcap->mac_added = B_FALSE;
311 		}
312 		RW_EXIT(&vswp->mac_rwlock);
313 		mutex_exit(&port->mca_lock);
314 	}
315 
316 	RW_EXIT(&plist->lockrw);
317 }
318 
319 /*
320  * Open the underlying physical device for access in layer2 mode.
321  * Returns:
322  * 0 on success
323  * EAGAIN if mac_open() fails due to the device being not available yet.
324  * EIO on any other failures.
325  */
326 int
327 vsw_mac_open(vsw_t *vswp)
328 {
329 	int	rv;
330 
331 	ASSERT(RW_LOCK_HELD(&vswp->mac_rwlock));
332 
333 	if (vswp->mh != NULL) {
334 		/* already open */
335 		return (0);
336 	}
337 
338 	if (vswp->mac_open_retries++ >= vsw_mac_open_retries) {
339 		/* exceeded max retries */
340 		return (EIO);
341 	}
342 
343 	if ((rv = mac_open_by_linkname(vswp->physname, &vswp->mh)) != 0) {
344 		/*
345 		 * If mac_open() failed and the error indicates that either
346 		 * the dlmgmtd door or the device is not available yet, we
347 		 * return EAGAIN to indicate that mac_open() needs to be
348 		 * retried. For example, this may happen during boot up, if
349 		 * the required link aggregation groups(devices) have not
350 		 * been created yet.
351 		 */
352 		if (rv == ENOENT || rv == EBADF) {
353 			return (EAGAIN);
354 		} else {
355 			cmn_err(CE_WARN, "vsw%d: device (%s) open failed rv:%x",
356 			    vswp->instance, vswp->physname, rv);
357 			return (EIO);
358 		}
359 	}
360 
361 	vswp->mac_open_retries = 0;
362 
363 	return (0);
364 }
365 
366 /*
367  * Close the underlying physical device.
368  */
369 void
370 vsw_mac_close(vsw_t *vswp)
371 {
372 	ASSERT(RW_LOCK_HELD(&vswp->mac_rwlock));
373 
374 	if (vswp->mh != NULL) {
375 		mac_close(vswp->mh);
376 		vswp->mh = NULL;
377 	}
378 }
379 
380 /*
381  * Link into the MAC layer to gain access to the services provided by
382  * the underlying physical device driver (which should also have
383  * registered with the MAC layer).
384  *
385  * Only when in layer 2 mode.
386  */
387 int
388 vsw_mac_attach(vsw_t *vswp)
389 {
390 	D1(vswp, "%s: enter", __func__);
391 
392 	ASSERT(vswp->mrh == NULL);
393 	ASSERT(vswp->mstarted == B_FALSE);
394 	ASSERT(vswp->mresources == B_FALSE);
395 
396 	ASSERT(RW_LOCK_HELD(&vswp->mac_rwlock));
397 
398 	ASSERT(vswp->mh != NULL);
399 
400 	D2(vswp, "vsw_mac_attach: using device %s", vswp->physname);
401 
402 	vsw_mac_set_mtu(vswp, vswp->mtu);
403 
404 	if (vsw_multi_ring_enable) {
405 		/*
406 		 * Initialize the ring table.
407 		 */
408 		vsw_mac_ring_tbl_init(vswp);
409 
410 		/*
411 		 * Register our rx callback function.
412 		 */
413 		vswp->mrh = mac_rx_add(vswp->mh,
414 		    vsw_rx_queue_cb, (void *)vswp);
415 		ASSERT(vswp->mrh != NULL);
416 
417 		/*
418 		 * Register our mac resource callback.
419 		 */
420 		mac_resource_set(vswp->mh, vsw_mac_ring_add_cb, (void *)vswp);
421 		vswp->mresources = B_TRUE;
422 
423 		/*
424 		 * Get the ring resources available to us from
425 		 * the mac below us.
426 		 */
427 		mac_resources(vswp->mh);
428 	} else {
429 		/*
430 		 * Just register our rx callback function
431 		 */
432 		vswp->mrh = mac_rx_add(vswp->mh, vsw_rx_cb, (void *)vswp);
433 		ASSERT(vswp->mrh != NULL);
434 	}
435 
436 	/* Get the MAC tx fn */
437 	vswp->txinfo = mac_tx_get(vswp->mh);
438 
439 	/* start the interface */
440 	if (mac_start(vswp->mh) != 0) {
441 		cmn_err(CE_WARN, "!vsw%d: Could not start mac interface",
442 		    vswp->instance);
443 		goto mac_fail_exit;
444 	}
445 
446 	vswp->mstarted = B_TRUE;
447 
448 	D1(vswp, "%s: exit", __func__);
449 	return (0);
450 
451 mac_fail_exit:
452 	vsw_mac_detach(vswp);
453 
454 	D1(vswp, "%s: exit", __func__);
455 	return (1);
456 }
457 
458 void
459 vsw_mac_detach(vsw_t *vswp)
460 {
461 	D1(vswp, "vsw_mac_detach: enter");
462 
463 	ASSERT(vswp != NULL);
464 	ASSERT(RW_LOCK_HELD(&vswp->mac_rwlock));
465 
466 	if (vsw_multi_ring_enable) {
467 		vsw_mac_ring_tbl_destroy(vswp);
468 	}
469 
470 	if (vswp->mh != NULL) {
471 		if (vswp->mstarted)
472 			mac_stop(vswp->mh);
473 		if (vswp->mrh != NULL)
474 			mac_rx_remove(vswp->mh, vswp->mrh, B_TRUE);
475 		if (vswp->mresources)
476 			mac_resource_set(vswp->mh, NULL, NULL);
477 		if (vswp->mtu != vswp->mtu_physdev_orig) {
478 			vsw_mac_set_mtu(vswp, vswp->mtu_physdev_orig);
479 		}
480 	}
481 
482 	vswp->mrh = NULL;
483 	vswp->txinfo = NULL;
484 	vswp->mstarted = B_FALSE;
485 
486 	D1(vswp, "vsw_mac_detach: exit");
487 }
488 
489 /*
490  * Depending on the mode specified, the capabilites and capacity
491  * of the underlying device setup the physical device.
492  *
493  * If in layer 3 mode, then do nothing.
494  *
495  * If in layer 2 programmed mode attempt to program the unicast address
496  * associated with the port into the physical device. If this is not
497  * possible due to resource exhaustion or simply because the device does
498  * not support multiple unicast addresses then if required fallback onto
499  * putting the card into promisc mode.
500  *
501  * If in promisc mode then simply set the card into promisc mode.
502  *
503  * Returns 0 success, 1 on failure.
504  */
505 int
506 vsw_set_hw(vsw_t *vswp, vsw_port_t *port, int type)
507 {
508 	mac_multi_addr_t	mac_addr;
509 	int			err;
510 
511 	D1(vswp, "%s: enter", __func__);
512 
513 	ASSERT(MUTEX_HELD(&vswp->hw_lock));
514 	ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));
515 
516 	if (vswp->smode[vswp->smode_idx] == VSW_LAYER3)
517 		return (0);
518 
519 	if (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC) {
520 		return (vsw_set_hw_promisc(vswp, port, type));
521 	}
522 
523 	/*
524 	 * Attempt to program the unicast address into the HW.
525 	 */
526 	mac_addr.mma_addrlen = ETHERADDRL;
527 	if (type == VSW_VNETPORT) {
528 		ASSERT(port != NULL);
529 		ether_copy(&port->p_macaddr, &mac_addr.mma_addr);
530 	} else {
531 		ether_copy(&vswp->if_addr, &mac_addr.mma_addr);
532 	}
533 
534 	err = vsw_set_hw_addr(vswp, &mac_addr);
535 	if (err == ENOSPC) {
536 		/*
537 		 * Mark that attempt should be made to re-config sometime
538 		 * in future if a port is deleted.
539 		 */
540 		vswp->recfg_reqd = B_TRUE;
541 
542 		/*
543 		 * Only 1 mode specified, nothing more to do.
544 		 */
545 		if (vswp->smode_num == 1)
546 			return (err);
547 
548 		/*
549 		 * If promiscuous was next mode specified try to
550 		 * set the card into that mode.
551 		 */
552 		if ((vswp->smode_idx <= (vswp->smode_num - 2)) &&
553 		    (vswp->smode[vswp->smode_idx + 1] ==
554 		    VSW_LAYER2_PROMISC)) {
555 			vswp->smode_idx += 1;
556 			return (vsw_set_hw_promisc(vswp, port, type));
557 		}
558 		return (err);
559 	}
560 
561 	if (err != 0)
562 		return (err);
563 
564 	if (type == VSW_VNETPORT) {
565 		port->addr_slot = mac_addr.mma_slot;
566 		port->addr_set = VSW_ADDR_HW;
567 	} else {
568 		vswp->addr_slot = mac_addr.mma_slot;
569 		vswp->addr_set = VSW_ADDR_HW;
570 	}
571 
572 	D2(vswp, "programmed addr %s into slot %d "
573 	"of device %s", ether_sprintf((void *)mac_addr.mma_addr),
574 	    mac_addr.mma_slot, vswp->physname);
575 
576 	D1(vswp, "%s: exit", __func__);
577 
578 	return (0);
579 }
580 
581 /*
582  * If in layer 3 mode do nothing.
583  *
584  * If in layer 2 switched mode remove the address from the physical
585  * device.
586  *
587  * If in layer 2 promiscuous mode disable promisc mode.
588  *
589  * Returns 0 on success.
590  */
591 int
592 vsw_unset_hw(vsw_t *vswp, vsw_port_t *port, int type)
593 {
594 	mac_addr_slot_t	slot;
595 	int		rv;
596 
597 	D1(vswp, "%s: enter", __func__);
598 
599 	ASSERT(MUTEX_HELD(&vswp->hw_lock));
600 
601 	if (vswp->smode[vswp->smode_idx] == VSW_LAYER3)
602 		return (0);
603 
604 	switch (type) {
605 	case VSW_VNETPORT:
606 		ASSERT(port != NULL);
607 
608 		if (port->addr_set == VSW_ADDR_PROMISC) {
609 			return (vsw_unset_hw_promisc(vswp, port, type));
610 
611 		} else if (port->addr_set == VSW_ADDR_HW) {
612 			slot = port->addr_slot;
613 			if ((rv = vsw_unset_hw_addr(vswp, slot)) == 0)
614 				port->addr_set = VSW_ADDR_UNSET;
615 		}
616 
617 		break;
618 
619 	case VSW_LOCALDEV:
620 		if (vswp->addr_set == VSW_ADDR_PROMISC) {
621 			return (vsw_unset_hw_promisc(vswp, NULL, type));
622 
623 		} else if (vswp->addr_set == VSW_ADDR_HW) {
624 			slot = vswp->addr_slot;
625 			if ((rv = vsw_unset_hw_addr(vswp, slot)) == 0)
626 				vswp->addr_set = VSW_ADDR_UNSET;
627 		}
628 
629 		break;
630 
631 	default:
632 		/* should never happen */
633 		DERR(vswp, "%s: unknown type %d", __func__, type);
634 		ASSERT(0);
635 		return (1);
636 	}
637 
638 	D1(vswp, "%s: exit", __func__);
639 	return (rv);
640 }
641 
642 /*
643  * Attempt to program a unicast address into HW.
644  *
645  * Returns 0 on sucess, 1 on failure.
646  */
647 static int
648 vsw_set_hw_addr(vsw_t *vswp, mac_multi_addr_t *mac)
649 {
650 	void	*mah;
651 	int	rv = EINVAL;
652 
653 	D1(vswp, "%s: enter", __func__);
654 
655 	ASSERT(MUTEX_HELD(&vswp->hw_lock));
656 
657 	if (vswp->maddr.maddr_handle == NULL)
658 		return (rv);
659 
660 	mah = vswp->maddr.maddr_handle;
661 
662 	rv = vswp->maddr.maddr_add(mah, mac);
663 
664 	if (rv == 0)
665 		return (rv);
666 
667 	/*
668 	 * Its okay for the add to fail because we have exhausted
669 	 * all the resouces in the hardware device. Any other error
670 	 * we want to flag.
671 	 */
672 	if (rv != ENOSPC) {
673 		cmn_err(CE_NOTE, "!vsw%d: error programming "
674 		    "address %s into HW err (%d)",
675 		    vswp->instance, ether_sprintf((void *)mac->mma_addr), rv);
676 	}
677 	D1(vswp, "%s: exit", __func__);
678 	return (rv);
679 }
680 
681 /*
682  * Remove a unicast mac address which has previously been programmed
683  * into HW.
684  *
685  * Returns 0 on sucess, 1 on failure.
686  */
687 static int
688 vsw_unset_hw_addr(vsw_t *vswp, int slot)
689 {
690 	void	*mah;
691 	int	rv;
692 
693 	D1(vswp, "%s: enter", __func__);
694 
695 	ASSERT(MUTEX_HELD(&vswp->hw_lock));
696 	ASSERT(slot >= 0);
697 
698 	if (vswp->maddr.maddr_handle == NULL)
699 		return (1);
700 
701 	mah = vswp->maddr.maddr_handle;
702 
703 	rv = vswp->maddr.maddr_remove(mah, slot);
704 	if (rv != 0) {
705 		DWARN(vswp, "%s: unable to remove address "
706 		    "from slot %d in device %s (err %d)",
707 		    __func__, slot, vswp->physname, rv);
708 		return (1);
709 	}
710 
711 	D2(vswp, "removed addr from slot %d in device %s",
712 	    slot, vswp->physname);
713 
714 	D1(vswp, "%s: exit", __func__);
715 	return (0);
716 }
717 
718 /*
719  * Set network card into promisc mode.
720  *
721  * Returns 0 on success, 1 on failure.
722  */
723 static int
724 vsw_set_hw_promisc(vsw_t *vswp, vsw_port_t *port, int type)
725 {
726 	D1(vswp, "%s: enter", __func__);
727 
728 	ASSERT(MUTEX_HELD(&vswp->hw_lock));
729 	ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));
730 
731 	WRITE_ENTER(&vswp->mac_rwlock);
732 	if (vswp->mh == NULL) {
733 		RW_EXIT(&vswp->mac_rwlock);
734 		return (1);
735 	}
736 
737 	if (vswp->promisc_cnt++ == 0) {
738 		if (mac_promisc_set(vswp->mh, B_TRUE, MAC_DEVPROMISC) != 0) {
739 			vswp->promisc_cnt--;
740 			RW_EXIT(&vswp->mac_rwlock);
741 			return (1);
742 		}
743 		cmn_err(CE_NOTE, "!vsw%d: switching device %s into "
744 		    "promiscuous mode", vswp->instance, vswp->physname);
745 	}
746 	RW_EXIT(&vswp->mac_rwlock);
747 
748 	if (type == VSW_VNETPORT) {
749 		ASSERT(port != NULL);
750 		port->addr_set = VSW_ADDR_PROMISC;
751 	} else {
752 		vswp->addr_set = VSW_ADDR_PROMISC;
753 	}
754 
755 	D1(vswp, "%s: exit", __func__);
756 
757 	return (0);
758 }
759 
760 /*
761  * Turn off promiscuous mode on network card.
762  *
763  * Returns 0 on success, 1 on failure.
764  */
765 static int
766 vsw_unset_hw_promisc(vsw_t *vswp, vsw_port_t *port, int type)
767 {
768 	vsw_port_list_t 	*plist = &vswp->plist;
769 
770 	D2(vswp, "%s: enter", __func__);
771 
772 	ASSERT(MUTEX_HELD(&vswp->hw_lock));
773 	ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));
774 
775 	WRITE_ENTER(&vswp->mac_rwlock);
776 	if (vswp->mh == NULL) {
777 		RW_EXIT(&vswp->mac_rwlock);
778 		return (1);
779 	}
780 
781 	if (--vswp->promisc_cnt == 0) {
782 		if (mac_promisc_set(vswp->mh, B_FALSE, MAC_DEVPROMISC) != 0) {
783 			vswp->promisc_cnt++;
784 			RW_EXIT(&vswp->mac_rwlock);
785 			return (1);
786 		}
787 
788 		/*
789 		 * We are exiting promisc mode either because we were
790 		 * only in promisc mode because we had failed over from
791 		 * switched mode due to HW resource issues, or the user
792 		 * wanted the card in promisc mode for all the ports and
793 		 * the last port is now being deleted. Tweak the message
794 		 * accordingly.
795 		 */
796 		if (plist->num_ports != 0) {
797 			cmn_err(CE_NOTE, "!vsw%d: switching device %s back to "
798 			    "programmed mode", vswp->instance, vswp->physname);
799 		} else {
800 			cmn_err(CE_NOTE, "!vsw%d: switching device %s out of "
801 			    "promiscuous mode", vswp->instance, vswp->physname);
802 		}
803 	}
804 	RW_EXIT(&vswp->mac_rwlock);
805 
806 	if (type == VSW_VNETPORT) {
807 		ASSERT(port != NULL);
808 		ASSERT(port->addr_set == VSW_ADDR_PROMISC);
809 		port->addr_set = VSW_ADDR_UNSET;
810 	} else {
811 		ASSERT(vswp->addr_set == VSW_ADDR_PROMISC);
812 		vswp->addr_set = VSW_ADDR_UNSET;
813 	}
814 
815 	D1(vswp, "%s: exit", __func__);
816 	return (0);
817 }
818 
819 /*
820  * Determine whether or not we are operating in our prefered
821  * mode and if not whether the physical resources now allow us
822  * to operate in it.
823  *
824  * If a port is being removed should only be invoked after port has been
825  * removed from the port list.
826  */
827 void
828 vsw_reconfig_hw(vsw_t *vswp)
829 {
830 	int			s_idx;
831 
832 	D1(vswp, "%s: enter", __func__);
833 
834 	ASSERT(MUTEX_HELD(&vswp->hw_lock));
835 
836 	if (vswp->maddr.maddr_handle == NULL) {
837 		return;
838 	}
839 
840 	/*
841 	 * If we are in layer 2 (i.e. switched) or would like to be
842 	 * in layer 2 then check if any ports or the vswitch itself
843 	 * need to be programmed into the HW.
844 	 *
845 	 * This can happen in two cases - switched was specified as
846 	 * the prefered mode of operation but we exhausted the HW
847 	 * resources and so failed over to the next specifed mode,
848 	 * or switched was the only mode specified so after HW
849 	 * resources were exhausted there was nothing more we
850 	 * could do.
851 	 */
852 	if (vswp->smode_idx > 0)
853 		s_idx = vswp->smode_idx - 1;
854 	else
855 		s_idx = vswp->smode_idx;
856 
857 	if (vswp->smode[s_idx] != VSW_LAYER2) {
858 		return;
859 	}
860 
861 	D2(vswp, "%s: attempting reconfig..", __func__);
862 
863 	/*
864 	 * First, attempt to set the vswitch mac address into HW,
865 	 * if required.
866 	 */
867 	if (vsw_prog_if(vswp)) {
868 		return;
869 	}
870 
871 	/*
872 	 * Next, attempt to set any ports which have not yet been
873 	 * programmed into HW.
874 	 */
875 	if (vsw_prog_ports(vswp)) {
876 		return;
877 	}
878 
879 	/*
880 	 * By now we know that have programmed all desired ports etc
881 	 * into HW, so safe to mark reconfiguration as complete.
882 	 */
883 	vswp->recfg_reqd = B_FALSE;
884 
885 	vswp->smode_idx = s_idx;
886 
887 	D1(vswp, "%s: exit", __func__);
888 }
889 
890 /*
891  * Check to see if vsw itself is plumbed, and if so whether or not
892  * its mac address should be written into HW.
893  *
894  * Returns 0 if could set address, or didn't have to set it.
895  * Returns 1 if failed to set address.
896  */
897 static int
898 vsw_prog_if(vsw_t *vswp)
899 {
900 	mac_multi_addr_t	addr;
901 
902 	D1(vswp, "%s: enter", __func__);
903 
904 	ASSERT(MUTEX_HELD(&vswp->hw_lock));
905 
906 	READ_ENTER(&vswp->if_lockrw);
907 	if ((vswp->if_state & VSW_IF_UP) &&
908 	    (vswp->addr_set != VSW_ADDR_HW)) {
909 
910 		addr.mma_addrlen = ETHERADDRL;
911 		ether_copy(&vswp->if_addr, &addr.mma_addr);
912 
913 		if (vsw_set_hw_addr(vswp, &addr) != 0) {
914 			RW_EXIT(&vswp->if_lockrw);
915 			return (1);
916 		}
917 
918 		vswp->addr_slot = addr.mma_slot;
919 
920 		/*
921 		 * If previously when plumbed had had to place
922 		 * interface into promisc mode, now reverse that.
923 		 *
924 		 * Note that interface will only actually be set into
925 		 * non-promisc mode when last port/interface has been
926 		 * programmed into HW.
927 		 */
928 		if (vswp->addr_set == VSW_ADDR_PROMISC)
929 			(void) vsw_unset_hw_promisc(vswp, NULL, VSW_LOCALDEV);
930 
931 		vswp->addr_set = VSW_ADDR_HW;
932 	}
933 	RW_EXIT(&vswp->if_lockrw);
934 
935 	D1(vswp, "%s: exit", __func__);
936 	return (0);
937 }
938 
939 /*
940  * Scan the port list for any ports which have not yet been set
941  * into HW. For those found attempt to program their mac addresses
942  * into the physical device.
943  *
944  * Returns 0 if able to program all required ports (can be 0) into HW.
945  * Returns 1 if failed to set at least one mac address.
946  */
947 static int
948 vsw_prog_ports(vsw_t *vswp)
949 {
950 	mac_multi_addr_t	addr;
951 	vsw_port_list_t		*plist = &vswp->plist;
952 	vsw_port_t		*tp;
953 	int			rv = 0;
954 
955 	D1(vswp, "%s: enter", __func__);
956 
957 	ASSERT(MUTEX_HELD(&vswp->hw_lock));
958 
959 	READ_ENTER(&plist->lockrw);
960 	for (tp = plist->head; tp != NULL; tp = tp->p_next) {
961 		if (tp->addr_set != VSW_ADDR_HW) {
962 			addr.mma_addrlen = ETHERADDRL;
963 			ether_copy(&tp->p_macaddr, &addr.mma_addr);
964 
965 			if (vsw_set_hw_addr(vswp, &addr) != 0) {
966 				rv = 1;
967 				break;
968 			}
969 
970 			tp->addr_slot = addr.mma_slot;
971 
972 			/*
973 			 * If when this port had first attached we had
974 			 * had to place the interface into promisc mode,
975 			 * then now reverse that.
976 			 *
977 			 * Note that the interface will not actually
978 			 * change to non-promisc mode until all ports
979 			 * have been programmed.
980 			 */
981 			if (tp->addr_set == VSW_ADDR_PROMISC)
982 				(void) vsw_unset_hw_promisc(vswp,
983 				    tp, VSW_VNETPORT);
984 
985 			tp->addr_set = VSW_ADDR_HW;
986 		}
987 	}
988 	RW_EXIT(&plist->lockrw);
989 
990 	D1(vswp, "%s: exit", __func__);
991 	return (rv);
992 }
993 
994 static void
995 vsw_mac_ring_tbl_entry_init(vsw_t *vswp, vsw_mac_ring_t *ringp)
996 {
997 	ringp->ring_state = VSW_MAC_RING_FREE;
998 	ringp->ring_arg = NULL;
999 	ringp->ring_blank = NULL;
1000 	ringp->ring_vqp = NULL;
1001 	ringp->ring_vswp = vswp;
1002 }
1003 
1004 static void
1005 vsw_mac_ring_tbl_init(vsw_t *vswp)
1006 {
1007 	int		i;
1008 
1009 	mutex_init(&vswp->mac_ring_lock, NULL, MUTEX_DRIVER, NULL);
1010 
1011 	vswp->mac_ring_tbl_sz = vsw_mac_rx_rings;
1012 	vswp->mac_ring_tbl  =
1013 	    kmem_alloc(vsw_mac_rx_rings * sizeof (vsw_mac_ring_t), KM_SLEEP);
1014 
1015 	for (i = 0; i < vswp->mac_ring_tbl_sz; i++)
1016 		vsw_mac_ring_tbl_entry_init(vswp, &vswp->mac_ring_tbl[i]);
1017 }
1018 
1019 static void
1020 vsw_mac_ring_tbl_destroy(vsw_t *vswp)
1021 {
1022 	int		i;
1023 	vsw_mac_ring_t	*ringp;
1024 
1025 	mutex_enter(&vswp->mac_ring_lock);
1026 	for (i = 0; i < vswp->mac_ring_tbl_sz; i++) {
1027 		ringp = &vswp->mac_ring_tbl[i];
1028 
1029 		if (ringp->ring_state != VSW_MAC_RING_FREE) {
1030 			/*
1031 			 * Destroy the queue.
1032 			 */
1033 			vsw_queue_stop(ringp->ring_vqp);
1034 			vsw_queue_destroy(ringp->ring_vqp);
1035 
1036 			/*
1037 			 * Re-initialize the structure.
1038 			 */
1039 			vsw_mac_ring_tbl_entry_init(vswp, ringp);
1040 		}
1041 	}
1042 	mutex_exit(&vswp->mac_ring_lock);
1043 
1044 	mutex_destroy(&vswp->mac_ring_lock);
1045 	kmem_free(vswp->mac_ring_tbl,
1046 	    vswp->mac_ring_tbl_sz * sizeof (vsw_mac_ring_t));
1047 	vswp->mac_ring_tbl_sz = 0;
1048 }
1049 
1050 /*
1051  * Handle resource add callbacks from the driver below.
1052  */
1053 static mac_resource_handle_t
1054 vsw_mac_ring_add_cb(void *arg, mac_resource_t *mrp)
1055 {
1056 	vsw_t		*vswp = (vsw_t *)arg;
1057 	mac_rx_fifo_t	*mrfp = (mac_rx_fifo_t *)mrp;
1058 	vsw_mac_ring_t	*ringp;
1059 	vsw_queue_t	*vqp;
1060 	int		i;
1061 
1062 	ASSERT(vswp != NULL);
1063 	ASSERT(mrp != NULL);
1064 	ASSERT(vswp->mac_ring_tbl != NULL);
1065 
1066 	D1(vswp, "%s: enter", __func__);
1067 
1068 	/*
1069 	 * Check to make sure we have the correct resource type.
1070 	 */
1071 	if (mrp->mr_type != MAC_RX_FIFO)
1072 		return (NULL);
1073 
1074 	/*
1075 	 * Find a open entry in the ring table.
1076 	 */
1077 	mutex_enter(&vswp->mac_ring_lock);
1078 	for (i = 0; i < vswp->mac_ring_tbl_sz; i++) {
1079 		ringp = &vswp->mac_ring_tbl[i];
1080 
1081 		/*
1082 		 * Check for an empty slot, if found, then setup queue
1083 		 * and thread.
1084 		 */
1085 		if (ringp->ring_state == VSW_MAC_RING_FREE) {
1086 			/*
1087 			 * Create the queue for this ring.
1088 			 */
1089 			vqp = vsw_queue_create();
1090 
1091 			/*
1092 			 * Initialize the ring data structure.
1093 			 */
1094 			ringp->ring_vqp = vqp;
1095 			ringp->ring_arg = mrfp->mrf_arg;
1096 			ringp->ring_blank = mrfp->mrf_blank;
1097 			ringp->ring_state = VSW_MAC_RING_INUSE;
1098 
1099 			/*
1100 			 * Create the worker thread.
1101 			 */
1102 			vqp->vq_worker = thread_create(NULL, 0,
1103 			    vsw_queue_worker, ringp, 0, &p0,
1104 			    TS_RUN, minclsyspri);
1105 			if (vqp->vq_worker == NULL) {
1106 				vsw_queue_destroy(vqp);
1107 				vsw_mac_ring_tbl_entry_init(vswp, ringp);
1108 				ringp = NULL;
1109 			}
1110 
1111 			if (ringp != NULL) {
1112 				/*
1113 				 * Make sure thread get's running state for
1114 				 * this ring.
1115 				 */
1116 				mutex_enter(&vqp->vq_lock);
1117 				while ((vqp->vq_state != VSW_QUEUE_RUNNING) &&
1118 				    (vqp->vq_state != VSW_QUEUE_DRAINED)) {
1119 					cv_wait(&vqp->vq_cv, &vqp->vq_lock);
1120 				}
1121 
1122 				/*
1123 				 * If the thread is not running, cleanup.
1124 				 */
1125 				if (vqp->vq_state == VSW_QUEUE_DRAINED) {
1126 					vsw_queue_destroy(vqp);
1127 					vsw_mac_ring_tbl_entry_init(vswp,
1128 					    ringp);
1129 					ringp = NULL;
1130 				}
1131 				mutex_exit(&vqp->vq_lock);
1132 			}
1133 
1134 			mutex_exit(&vswp->mac_ring_lock);
1135 			D1(vswp, "%s: exit", __func__);
1136 			return ((mac_resource_handle_t)ringp);
1137 		}
1138 	}
1139 	mutex_exit(&vswp->mac_ring_lock);
1140 
1141 	/*
1142 	 * No slots in the ring table available.
1143 	 */
1144 	D1(vswp, "%s: exit", __func__);
1145 	return (NULL);
1146 }
1147 
1148 static void
1149 vsw_queue_stop(vsw_queue_t *vqp)
1150 {
1151 	mutex_enter(&vqp->vq_lock);
1152 
1153 	if (vqp->vq_state == VSW_QUEUE_RUNNING) {
1154 		vqp->vq_state = VSW_QUEUE_STOP;
1155 		cv_signal(&vqp->vq_cv);
1156 
1157 		while (vqp->vq_state != VSW_QUEUE_DRAINED)
1158 			cv_wait(&vqp->vq_cv, &vqp->vq_lock);
1159 	}
1160 
1161 	vqp->vq_state = VSW_QUEUE_STOPPED;
1162 
1163 	mutex_exit(&vqp->vq_lock);
1164 }
1165 
1166 static vsw_queue_t *
1167 vsw_queue_create()
1168 {
1169 	vsw_queue_t *vqp;
1170 
1171 	vqp = kmem_zalloc(sizeof (vsw_queue_t), KM_SLEEP);
1172 
1173 	mutex_init(&vqp->vq_lock, NULL, MUTEX_DRIVER, NULL);
1174 	cv_init(&vqp->vq_cv, NULL, CV_DRIVER, NULL);
1175 	vqp->vq_first = NULL;
1176 	vqp->vq_last = NULL;
1177 	vqp->vq_state = VSW_QUEUE_STOPPED;
1178 
1179 	return (vqp);
1180 }
1181 
1182 static void
1183 vsw_queue_destroy(vsw_queue_t *vqp)
1184 {
1185 	cv_destroy(&vqp->vq_cv);
1186 	mutex_destroy(&vqp->vq_lock);
1187 	kmem_free(vqp, sizeof (vsw_queue_t));
1188 }
1189 
1190 static void
1191 vsw_queue_worker(vsw_mac_ring_t *rrp)
1192 {
1193 	mblk_t		*mp;
1194 	vsw_queue_t	*vqp = rrp->ring_vqp;
1195 	vsw_t		*vswp = rrp->ring_vswp;
1196 
1197 	mutex_enter(&vqp->vq_lock);
1198 
1199 	ASSERT(vqp->vq_state == VSW_QUEUE_STOPPED);
1200 
1201 	/*
1202 	 * Set the state to running, since the thread is now active.
1203 	 */
1204 	vqp->vq_state = VSW_QUEUE_RUNNING;
1205 	cv_signal(&vqp->vq_cv);
1206 
1207 	while (vqp->vq_state == VSW_QUEUE_RUNNING) {
1208 		/*
1209 		 * Wait for work to do or the state has changed
1210 		 * to not running.
1211 		 */
1212 		while ((vqp->vq_state == VSW_QUEUE_RUNNING) &&
1213 		    (vqp->vq_first == NULL)) {
1214 			cv_wait(&vqp->vq_cv, &vqp->vq_lock);
1215 		}
1216 
1217 		/*
1218 		 * Process packets that we received from the interface.
1219 		 */
1220 		if (vqp->vq_first != NULL) {
1221 			mp = vqp->vq_first;
1222 
1223 			vqp->vq_first = NULL;
1224 			vqp->vq_last = NULL;
1225 
1226 			mutex_exit(&vqp->vq_lock);
1227 
1228 			/* switch the chain of packets received */
1229 			vswp->vsw_switch_frame(vswp, mp,
1230 			    VSW_PHYSDEV, NULL, NULL);
1231 
1232 			mutex_enter(&vqp->vq_lock);
1233 		}
1234 	}
1235 
1236 	/*
1237 	 * We are drained and signal we are done.
1238 	 */
1239 	vqp->vq_state = VSW_QUEUE_DRAINED;
1240 	cv_signal(&vqp->vq_cv);
1241 
1242 	/*
1243 	 * Exit lock and drain the remaining packets.
1244 	 */
1245 	mutex_exit(&vqp->vq_lock);
1246 
1247 	/*
1248 	 * Exit the thread
1249 	 */
1250 	thread_exit();
1251 }
1252 
1253 /*
1254  * static void
1255  * vsw_rx_queue_cb() - Receive callback routine when
1256  *	vsw_multi_ring_enable is non-zero.  Queue the packets
1257  *	to a packet queue for a worker thread to process.
1258  */
1259 static void
1260 vsw_rx_queue_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp)
1261 {
1262 	vsw_mac_ring_t	*ringp = (vsw_mac_ring_t *)mrh;
1263 	vsw_t		*vswp = (vsw_t *)arg;
1264 	vsw_queue_t	*vqp;
1265 	mblk_t		*bp, *last;
1266 
1267 	ASSERT(mrh != NULL);
1268 	ASSERT(vswp != NULL);
1269 	ASSERT(mp != NULL);
1270 
1271 	D1(vswp, "%s: enter", __func__);
1272 
1273 	/*
1274 	 * Find the last element in the mblk chain.
1275 	 */
1276 	bp = mp;
1277 	do {
1278 		last = bp;
1279 		bp = bp->b_next;
1280 	} while (bp != NULL);
1281 
1282 	/* Get the queue for the packets */
1283 	vqp = ringp->ring_vqp;
1284 
1285 	/*
1286 	 * Grab the lock such we can queue the packets.
1287 	 */
1288 	mutex_enter(&vqp->vq_lock);
1289 
1290 	if (vqp->vq_state != VSW_QUEUE_RUNNING) {
1291 		freemsgchain(mp);
1292 		mutex_exit(&vqp->vq_lock);
1293 		goto vsw_rx_queue_cb_exit;
1294 	}
1295 
1296 	/*
1297 	 * Add the mblk chain to the queue.  If there
1298 	 * is some mblks in the queue, then add the new
1299 	 * chain to the end.
1300 	 */
1301 	if (vqp->vq_first == NULL)
1302 		vqp->vq_first = mp;
1303 	else
1304 		vqp->vq_last->b_next = mp;
1305 
1306 	vqp->vq_last = last;
1307 
1308 	/*
1309 	 * Signal the worker thread that there is work to
1310 	 * do.
1311 	 */
1312 	cv_signal(&vqp->vq_cv);
1313 
1314 	/*
1315 	 * Let go of the lock and exit.
1316 	 */
1317 	mutex_exit(&vqp->vq_lock);
1318 
1319 vsw_rx_queue_cb_exit:
1320 	D1(vswp, "%s: exit", __func__);
1321 }
1322 
1323 /*
1324  * receive callback routine. Invoked by MAC layer when there
1325  * are pkts being passed up from physical device.
1326  *
1327  * PERF: It may be more efficient when the card is in promisc
1328  * mode to check the dest address of the pkts here (against
1329  * the FDB) rather than checking later. Needs to be investigated.
1330  */
1331 static void
1332 vsw_rx_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp)
1333 {
1334 	_NOTE(ARGUNUSED(mrh))
1335 
1336 	vsw_t		*vswp = (vsw_t *)arg;
1337 
1338 	ASSERT(vswp != NULL);
1339 
1340 	D1(vswp, "vsw_rx_cb: enter");
1341 
1342 	/* switch the chain of packets received */
1343 	vswp->vsw_switch_frame(vswp, mp, VSW_PHYSDEV, NULL, NULL);
1344 
1345 	D1(vswp, "vsw_rx_cb: exit");
1346 }
1347 
1348 /*
1349  * Send a message out over the physical device via the MAC layer.
1350  *
1351  * Returns any mblks that it was unable to transmit.
1352  */
1353 mblk_t *
1354 vsw_tx_msg(vsw_t *vswp, mblk_t *mp)
1355 {
1356 	const mac_txinfo_t	*mtp;
1357 
1358 	READ_ENTER(&vswp->mac_rwlock);
1359 	if ((vswp->mh == NULL) || (vswp->mstarted == B_FALSE)) {
1360 
1361 		DERR(vswp, "vsw_tx_msg: dropping pkts: no tx routine avail");
1362 		RW_EXIT(&vswp->mac_rwlock);
1363 		return (mp);
1364 	} else {
1365 		mtp = vswp->txinfo;
1366 		mp = mtp->mt_fn(mtp->mt_arg, mp);
1367 	}
1368 	RW_EXIT(&vswp->mac_rwlock);
1369 
1370 	return (mp);
1371 }
1372 
1373 #define	ARH_FIXED_LEN	8    /* Length of fixed part of ARP header(see arp.h) */
1374 
1375 /*
1376  * Send a gratuitous RARP packet to notify the physical switch to update its
1377  * Layer2 forwarding table for the given mac address. This is done to allow the
1378  * switch to quickly learn the macaddr-port association when a guest is live
1379  * migrated or when vsw's physical device is changed dynamically. Any protocol
1380  * packet would serve this purpose, but we choose RARP, as it allows us to
1381  * accomplish this within L2 (ie, no need to specify IP addr etc in the packet)
1382  * The macaddr of vnet is retained across migration. Hence, we don't need to
1383  * update the arp cache of other hosts within the broadcast domain. Note that
1384  * it is harmless to send these RARP packets during normal port attach of a
1385  * client vnet. This can can be turned off if needed, by setting
1386  * vsw_publish_macaddr_count to zero in /etc/system.
1387  */
1388 void
1389 vsw_publish_macaddr(vsw_t *vswp, uint8_t *addr)
1390 {
1391 	mblk_t			*mp;
1392 	mblk_t			*bp;
1393 	struct arphdr		*arh;
1394 	struct	ether_header 	*ehp;
1395 	int			count = 0;
1396 	int			plen = 4;
1397 	uint8_t			*cp;
1398 
1399 	mp = allocb(ETHERMIN, BPRI_MED);
1400 	if (mp == NULL) {
1401 		return;
1402 	}
1403 
1404 	/* Initialize eth header */
1405 	ehp = (struct  ether_header *)mp->b_rptr;
1406 	bcopy(&etherbroadcastaddr, &ehp->ether_dhost, ETHERADDRL);
1407 	bcopy(addr, &ehp->ether_shost, ETHERADDRL);
1408 	ehp->ether_type = htons(ETHERTYPE_REVARP);
1409 
1410 	/* Initialize arp packet */
1411 	arh = (struct arphdr *)(mp->b_rptr + sizeof (struct ether_header));
1412 	cp = (uint8_t *)arh;
1413 
1414 	arh->ar_hrd = htons(ARPHRD_ETHER);	/* Hardware type:  ethernet */
1415 	arh->ar_pro = htons(ETHERTYPE_IP);	/* Protocol type:  IP */
1416 	arh->ar_hln = ETHERADDRL;	/* Length of hardware address:  6 */
1417 	arh->ar_pln = plen;		/* Length of protocol address:  4 */
1418 	arh->ar_op = htons(REVARP_REQUEST);	/* Opcode: REVARP Request */
1419 
1420 	cp += ARH_FIXED_LEN;
1421 
1422 	/* Sender's hardware address and protocol address */
1423 	bcopy(addr, cp, ETHERADDRL);
1424 	cp += ETHERADDRL;
1425 	bzero(cp, plen);	/* INADDR_ANY */
1426 	cp += plen;
1427 
1428 	/* Target hardware address and protocol address */
1429 	bcopy(addr, cp, ETHERADDRL);
1430 	cp += ETHERADDRL;
1431 	bzero(cp, plen);	/* INADDR_ANY */
1432 	cp += plen;
1433 
1434 	mp->b_wptr += ETHERMIN;	/* total size is 42; round up to ETHERMIN */
1435 
1436 	for (count = 0; count < vsw_publish_macaddr_count; count++) {
1437 
1438 		bp = dupmsg(mp);
1439 		if (bp == NULL) {
1440 			continue;
1441 		}
1442 
1443 		/* transmit the packet */
1444 		bp = vsw_tx_msg(vswp, bp);
1445 		if (bp != NULL) {
1446 			freemsg(bp);
1447 		}
1448 	}
1449 
1450 	freemsg(mp);
1451 }
1452 
1453 static void
1454 vsw_mac_set_mtu(vsw_t *vswp, uint32_t mtu)
1455 {
1456 	mac_prop_t	mp;
1457 	uint32_t	val;
1458 	int		rv;
1459 	uint_t		perm_flags = MAC_PROP_PERM_RW;
1460 	mp.mp_id = MAC_PROP_MTU;
1461 	mp.mp_name = mac_mtu_propname;
1462 	mp.mp_flags = 0;
1463 
1464 	/* Get the mtu of the physical device */
1465 	rv = mac_get_prop(vswp->mh, &mp, (void *)&val, sizeof (uint32_t),
1466 	    &perm_flags);
1467 	if (rv != 0) {
1468 		cmn_err(CE_NOTE,
1469 		    "!vsw%d: Unable to get the mtu of the physical device:%s\n",
1470 		    vswp->instance, vswp->physname);
1471 		return;
1472 	}
1473 
1474 	/* Return if the mtu is read-only */
1475 	if (perm_flags != MAC_PROP_PERM_RW) {
1476 		cmn_err(CE_NOTE,
1477 		    "!vsw%d: Read-only mtu of the physical device:%s\n",
1478 		    vswp->instance, vswp->physname);
1479 		return;
1480 	}
1481 
1482 	/* save the original mtu of physdev to reset it back later if needed */
1483 	vswp->mtu_physdev_orig = val;
1484 
1485 	if (val == mtu) {
1486 		/* no need to set, as the device already has the right mtu */
1487 		return;
1488 	}
1489 
1490 	mp.mp_id = MAC_PROP_MTU;
1491 	mp.mp_name = mac_mtu_propname;
1492 	mp.mp_flags = 0;
1493 
1494 	/* Set the mtu in the physical device */
1495 	rv = mac_set_prop(vswp->mh, &mp, &mtu, sizeof (uint32_t));
1496 	if (rv != 0) {
1497 		cmn_err(CE_NOTE,
1498 		    "!vsw%d: Unable to set the mtu:%d, in the "
1499 		    "physical device:%s\n",
1500 		    vswp->instance, mtu, vswp->physname);
1501 	}
1502 }
1503