xref: /illumos-gate/usr/src/uts/sun4v/io/vsw_switching.c (revision e7cbe64f7a72dae5cb44f100db60ca88f3313c65)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/errno.h>
31 #include <sys/debug.h>
32 #include <sys/time.h>
33 #include <sys/sysmacros.h>
34 #include <sys/systm.h>
35 #include <sys/user.h>
36 #include <sys/stropts.h>
37 #include <sys/stream.h>
38 #include <sys/strlog.h>
39 #include <sys/strsubr.h>
40 #include <sys/cmn_err.h>
41 #include <sys/cpu.h>
42 #include <sys/kmem.h>
43 #include <sys/conf.h>
44 #include <sys/ddi.h>
45 #include <sys/sunddi.h>
46 #include <sys/ksynch.h>
47 #include <sys/stat.h>
48 #include <sys/kstat.h>
49 #include <sys/vtrace.h>
50 #include <sys/strsun.h>
51 #include <sys/dlpi.h>
52 #include <sys/ethernet.h>
53 #include <net/if.h>
54 #include <sys/varargs.h>
55 #include <sys/machsystm.h>
56 #include <sys/modctl.h>
57 #include <sys/modhash.h>
58 #include <sys/mac.h>
59 #include <sys/mac_ether.h>
60 #include <sys/taskq.h>
61 #include <sys/note.h>
62 #include <sys/mach_descrip.h>
63 #include <sys/mac.h>
64 #include <sys/mdeg.h>
65 #include <sys/ldc.h>
66 #include <sys/vsw_fdb.h>
67 #include <sys/vsw.h>
68 #include <sys/vio_mailbox.h>
69 #include <sys/vnet_mailbox.h>
70 #include <sys/vnet_common.h>
71 #include <sys/vio_util.h>
72 #include <sys/sdt.h>
73 #include <sys/atomic.h>
74 #include <sys/vlan.h>
75 
76 /* Switching setup routines */
77 void vsw_setup_switching_timeout(void *arg);
78 void vsw_stop_switching_timeout(vsw_t *vswp);
79 int vsw_setup_switching(vsw_t *);
80 static	int vsw_setup_layer2(vsw_t *);
81 static	int vsw_setup_layer3(vsw_t *);
82 
83 /* Switching/data transmit routines */
84 static	void vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller,
85 	vsw_port_t *port, mac_resource_handle_t);
86 static	void vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller,
87 	vsw_port_t *port, mac_resource_handle_t);
88 static	int vsw_forward_all(vsw_t *vswp, mblk_t *mp,
89 	int caller, vsw_port_t *port);
90 static	int vsw_forward_grp(vsw_t *vswp, mblk_t *mp,
91     int caller, vsw_port_t *port);
92 
93 /* VLAN routines */
94 void vsw_create_vlans(void *arg, int type);
95 void vsw_destroy_vlans(void *arg, int type);
96 void vsw_vlan_add_ids(void *arg, int type);
97 void vsw_vlan_remove_ids(void *arg, int type);
98 static	void vsw_vlan_create_hash(void *arg, int type);
99 static	void vsw_vlan_destroy_hash(void *arg, int type);
100 boolean_t vsw_frame_lookup_vid(void *arg, int caller, struct ether_header *ehp,
101 	uint16_t *vidp);
102 mblk_t *vsw_vlan_frame_pretag(void *arg, int type, mblk_t *mp);
103 uint32_t vsw_vlan_frames_untag(void *arg, int type, mblk_t **np, mblk_t **npt);
104 boolean_t vsw_vlan_lookup(mod_hash_t *vlan_hashp, uint16_t vid);
105 
106 /* Forwarding database (FDB) routines */
107 void vsw_fdbe_add(vsw_t *vswp, void *port);
108 void vsw_fdbe_del(vsw_t *vswp, struct ether_addr *eaddr);
109 static	vsw_fdbe_t *vsw_fdbe_find(vsw_t *vswp, struct ether_addr *);
110 static void vsw_fdbe_find_cb(mod_hash_key_t key, mod_hash_val_t val);
111 
112 int vsw_add_rem_mcst(vnet_mcast_msg_t *, vsw_port_t *);
113 int vsw_add_mcst(vsw_t *, uint8_t, uint64_t, void *);
114 int vsw_del_mcst(vsw_t *, uint8_t, uint64_t, void *);
115 void vsw_del_mcst_vsw(vsw_t *);
116 
117 /* Support functions */
118 static mblk_t *vsw_dupmsgchain(mblk_t *mp);
119 static uint32_t vsw_get_same_dest_list(struct ether_header *ehp,
120     mblk_t **rhead, mblk_t **rtail, mblk_t **mpp);
121 
122 
123 /*
124  * Functions imported from other files.
125  */
126 extern mblk_t *vsw_tx_msg(vsw_t *, mblk_t *);
127 extern mcst_addr_t *vsw_del_addr(uint8_t, void *, uint64_t);
128 extern int vsw_mac_open(vsw_t *vswp);
129 extern void vsw_mac_close(vsw_t *vswp);
130 extern void vsw_mac_rx(vsw_t *vswp, mac_resource_handle_t mrh,
131     mblk_t *mp, vsw_macrx_flags_t flags);
132 extern void vsw_set_addrs(vsw_t *vswp);
133 extern int vsw_get_hw_maddr(vsw_t *);
134 extern int vsw_mac_attach(vsw_t *vswp);
135 extern int vsw_portsend(vsw_port_t *port, mblk_t *mp, mblk_t *mpt,
136 	uint32_t count);
137 
138 /*
139  * Tunables used in this file.
140  */
141 extern	int vsw_setup_switching_delay;
142 extern	uint32_t vsw_vlan_nchains;
143 extern	uint32_t vsw_fdbe_refcnt_delay;
144 
/*
 * Take a reference on the fdb entry 'p' by atomically incrementing its
 * refcnt. The ASSERT catches the counter wrapping around to zero.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement and can be used safely in an unbraced if/else.
 */
#define	VSW_FDBE_REFHOLD(p)						\
do {									\
	atomic_inc_32(&(p)->refcnt);					\
	ASSERT((p)->refcnt != 0);					\
	_NOTE(CONSTCOND)						\
} while (0)
150 
/*
 * Drop a reference on the fdb entry 'p' by atomically decrementing its
 * refcnt. The ASSERT catches a release without a matching hold.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement and can be used safely in an unbraced if/else.
 */
#define	VSW_FDBE_REFRELE(p)						\
do {									\
	ASSERT((p)->refcnt != 0);					\
	atomic_dec_32(&(p)->refcnt);					\
	_NOTE(CONSTCOND)						\
} while (0)
156 
157 /*
158  * Timeout routine to setup switching mode:
159  * vsw_setup_switching() is invoked from vsw_attach() or vsw_update_md_prop()
160  * initially. If it fails and the error is EAGAIN, then this timeout handler
161  * is started to retry vsw_setup_switching(). vsw_setup_switching() is retried
162  * until we successfully finish it; or the returned error is not EAGAIN.
163  */
164 void
165 vsw_setup_switching_timeout(void *arg)
166 {
167 	vsw_t		*vswp = (vsw_t *)arg;
168 	int		rv;
169 
170 	if (vswp->swtmout_enabled == B_FALSE)
171 		return;
172 
173 	rv = vsw_setup_switching(vswp);
174 
175 	if (rv == 0) {
176 		/*
177 		 * Successfully setup switching mode.
178 		 * Program unicst, mcst addrs of vsw
179 		 * interface and ports in the physdev.
180 		 */
181 		vsw_set_addrs(vswp);
182 	}
183 
184 	mutex_enter(&vswp->swtmout_lock);
185 
186 	if (rv == EAGAIN && vswp->swtmout_enabled == B_TRUE) {
187 		/*
188 		 * Reschedule timeout() if the error is EAGAIN and the
189 		 * timeout is still enabled. For errors other than EAGAIN,
190 		 * we simply return without rescheduling timeout().
191 		 */
192 		vswp->swtmout_id =
193 		    timeout(vsw_setup_switching_timeout, vswp,
194 		    (vsw_setup_switching_delay * drv_usectohz(MICROSEC)));
195 		goto exit;
196 	}
197 
198 	/* timeout handler completed */
199 	vswp->swtmout_enabled = B_FALSE;
200 	vswp->swtmout_id = 0;
201 
202 exit:
203 	mutex_exit(&vswp->swtmout_lock);
204 }
205 
206 /*
207  * Cancel the timeout handler to setup switching mode.
208  */
209 void
210 vsw_stop_switching_timeout(vsw_t *vswp)
211 {
212 	timeout_id_t tid;
213 
214 	mutex_enter(&vswp->swtmout_lock);
215 
216 	tid = vswp->swtmout_id;
217 
218 	if (tid != 0) {
219 		/* signal timeout handler to stop */
220 		vswp->swtmout_enabled = B_FALSE;
221 		vswp->swtmout_id = 0;
222 		mutex_exit(&vswp->swtmout_lock);
223 
224 		(void) untimeout(tid);
225 	} else {
226 		mutex_exit(&vswp->swtmout_lock);
227 	}
228 
229 	(void) atomic_swap_32(&vswp->switching_setup_done, B_FALSE);
230 
231 	mutex_enter(&vswp->mac_lock);
232 	vswp->mac_open_retries = 0;
233 	mutex_exit(&vswp->mac_lock);
234 }
235 
236 /*
237  * Setup the required switching mode.
238  * This routine is invoked from vsw_attach() or vsw_update_md_prop()
239  * initially. If it fails and the error is EAGAIN, then a timeout handler
240  * is started to retry vsw_setup_switching(), until it successfully finishes;
241  * or the returned error is not EAGAIN.
242  *
243  * Returns:
244  *  0 on success.
245  *  EAGAIN if retry is needed.
246  *  1 on all other failures.
247  */
248 int
249 vsw_setup_switching(vsw_t *vswp)
250 {
251 	int	i, rv = 1;
252 
253 	D1(vswp, "%s: enter", __func__);
254 
255 	/*
256 	 * Select best switching mode.
257 	 * Note that we start from the saved smode_idx. This is done as
258 	 * this routine can be called from the timeout handler to retry
259 	 * setting up a specific mode. Currently only the function which
260 	 * sets up layer2/promisc mode returns EAGAIN if the underlying
261 	 * physical device is not available yet, causing retries.
262 	 */
263 	for (i = vswp->smode_idx; i < vswp->smode_num; i++) {
264 		vswp->smode_idx = i;
265 		switch (vswp->smode[i]) {
266 		case VSW_LAYER2:
267 		case VSW_LAYER2_PROMISC:
268 			rv = vsw_setup_layer2(vswp);
269 			break;
270 
271 		case VSW_LAYER3:
272 			rv = vsw_setup_layer3(vswp);
273 			break;
274 
275 		default:
276 			DERR(vswp, "unknown switch mode");
277 			break;
278 		}
279 
280 		if ((rv == 0) || (rv == EAGAIN))
281 			break;
282 
283 		/* all other errors(rv != 0): continue & select the next mode */
284 		rv = 1;
285 	}
286 
287 	if (rv && (rv != EAGAIN)) {
288 		cmn_err(CE_WARN, "!vsw%d: Unable to setup specified "
289 		    "switching mode", vswp->instance);
290 	} else if (rv == 0) {
291 		(void) atomic_swap_32(&vswp->switching_setup_done, B_TRUE);
292 	}
293 
294 	D2(vswp, "%s: Operating in mode %d", __func__,
295 	    vswp->smode[vswp->smode_idx]);
296 
297 	D1(vswp, "%s: exit", __func__);
298 
299 	return (rv);
300 }
301 
302 /*
303  * Setup for layer 2 switching.
304  *
305  * Returns:
306  *  0 on success.
307  *  EAGAIN if retry is needed.
308  *  EIO on all other failures.
309  */
310 static int
311 vsw_setup_layer2(vsw_t *vswp)
312 {
313 	int	rv;
314 
315 	D1(vswp, "%s: enter", __func__);
316 
317 	vswp->vsw_switch_frame = vsw_switch_l2_frame;
318 
319 	rv = strlen(vswp->physname);
320 	if (rv == 0) {
321 		/*
322 		 * Physical device name is NULL, which is
323 		 * required for layer 2.
324 		 */
325 		cmn_err(CE_WARN, "!vsw%d: no physical device name specified",
326 		    vswp->instance);
327 		return (EIO);
328 	}
329 
330 	mutex_enter(&vswp->mac_lock);
331 
332 	rv = vsw_mac_open(vswp);
333 	if (rv != 0) {
334 		if (rv != EAGAIN) {
335 			cmn_err(CE_WARN, "!vsw%d: Unable to open physical "
336 			    "device: %s\n", vswp->instance, vswp->physname);
337 		}
338 		mutex_exit(&vswp->mac_lock);
339 		return (rv);
340 	}
341 
342 	if (vswp->smode[vswp->smode_idx] == VSW_LAYER2) {
343 		/*
344 		 * Verify that underlying device can support multiple
345 		 * unicast mac addresses.
346 		 */
347 		rv = vsw_get_hw_maddr(vswp);
348 		if (rv != 0) {
349 			goto exit_error;
350 		}
351 	}
352 
353 	/*
354 	 * Attempt to link into the MAC layer so we can get
355 	 * and send packets out over the physical adapter.
356 	 */
357 	rv = vsw_mac_attach(vswp);
358 	if (rv != 0) {
359 		/*
360 		 * Registration with the MAC layer has failed,
361 		 * so return error so that can fall back to next
362 		 * prefered switching method.
363 		 */
364 		cmn_err(CE_WARN, "!vsw%d: Unable to setup physical device: "
365 		    "%s\n", vswp->instance, vswp->physname);
366 		goto exit_error;
367 	}
368 
369 	D1(vswp, "%s: exit", __func__);
370 
371 	mutex_exit(&vswp->mac_lock);
372 	return (0);
373 
374 exit_error:
375 	vsw_mac_close(vswp);
376 	mutex_exit(&vswp->mac_lock);
377 	return (EIO);
378 }
379 
380 static int
381 vsw_setup_layer3(vsw_t *vswp)
382 {
383 	D1(vswp, "%s: enter", __func__);
384 
385 	D2(vswp, "%s: operating in layer 3 mode", __func__);
386 	vswp->vsw_switch_frame = vsw_switch_l3_frame;
387 
388 	D1(vswp, "%s: exit", __func__);
389 
390 	return (0);
391 }
392 
393 /*
394  * Switch the given ethernet frame when operating in layer 2 mode.
395  *
396  * vswp: pointer to the vsw instance
397  * mp: pointer to chain of ethernet frame(s) to be switched
398  * caller: identifies the source of this frame as:
399  * 		1. VSW_VNETPORT - a vsw port (connected to a vnet).
400  *		2. VSW_PHYSDEV - the physical ethernet device
401  *		3. VSW_LOCALDEV - vsw configured as a virtual interface
402  * arg: argument provided by the caller.
403  *		1. for VNETPORT - pointer to the corresponding vsw_port_t.
404  *		2. for PHYSDEV - NULL
405  *		3. for LOCALDEV - pointer to to this vsw_t(self)
406  */
407 void
408 vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller,
409 			vsw_port_t *arg, mac_resource_handle_t mrh)
410 {
411 	struct ether_header	*ehp;
412 	mblk_t			*bp, *ret_m;
413 	mblk_t			*mpt = NULL;
414 	uint32_t		count;
415 	vsw_fdbe_t		*fp;
416 
417 	D1(vswp, "%s: enter (caller %d)", __func__, caller);
418 
419 	/*
420 	 * PERF: rather than breaking up the chain here, scan it
421 	 * to find all mblks heading to same destination and then
422 	 * pass that sub-chain to the lower transmit functions.
423 	 */
424 
425 	/* process the chain of packets */
426 	bp = mp;
427 	while (bp) {
428 		ehp = (struct ether_header *)bp->b_rptr;
429 		count = vsw_get_same_dest_list(ehp, &mp, &mpt, &bp);
430 		ASSERT(count != 0);
431 
432 		D2(vswp, "%s: mblk data buffer %lld : actual data size %lld",
433 		    __func__, MBLKSIZE(mp), MBLKL(mp));
434 
435 		if (ether_cmp(&ehp->ether_dhost, &vswp->if_addr) == 0) {
436 			/*
437 			 * If destination is VSW_LOCALDEV (vsw as an eth
438 			 * interface) and if the device is up & running,
439 			 * send the packet up the stack on this host.
440 			 * If the virtual interface is down, drop the packet.
441 			 */
442 			if (caller != VSW_LOCALDEV) {
443 				vsw_mac_rx(vswp, mrh, mp, VSW_MACRX_FREEMSG);
444 			} else {
445 				freemsgchain(mp);
446 			}
447 			continue;
448 		}
449 
450 		/*
451 		 * Find fdb entry for the destination
452 		 * and hold a reference to it.
453 		 */
454 		fp = vsw_fdbe_find(vswp, &ehp->ether_dhost);
455 		if (fp != NULL) {
456 
457 			/*
458 			 * If plumbed and in promisc mode then copy msg
459 			 * and send up the stack.
460 			 */
461 			vsw_mac_rx(vswp, mrh, mp,
462 			    VSW_MACRX_PROMISC | VSW_MACRX_COPYMSG);
463 
464 			/*
465 			 * If the destination is in FDB, the packet
466 			 * should be forwarded to the correponding
467 			 * vsw_port (connected to a vnet device -
468 			 * VSW_VNETPORT)
469 			 */
470 			(void) vsw_portsend(fp->portp, mp, mpt, count);
471 
472 			/* Release the reference on the fdb entry */
473 			VSW_FDBE_REFRELE(fp);
474 		} else {
475 			/*
476 			 * Destination not in FDB.
477 			 *
478 			 * If the destination is broadcast or
479 			 * multicast forward the packet to all
480 			 * (VNETPORTs, PHYSDEV, LOCALDEV),
481 			 * except the caller.
482 			 */
483 			if (IS_BROADCAST(ehp)) {
484 				D2(vswp, "%s: BROADCAST pkt", __func__);
485 				(void) vsw_forward_all(vswp, mp, caller, arg);
486 			} else if (IS_MULTICAST(ehp)) {
487 				D2(vswp, "%s: MULTICAST pkt", __func__);
488 				(void) vsw_forward_grp(vswp, mp, caller, arg);
489 			} else {
490 				/*
491 				 * If the destination is unicast, and came
492 				 * from either a logical network device or
493 				 * the switch itself when it is plumbed, then
494 				 * send it out on the physical device and also
495 				 * up the stack if the logical interface is
496 				 * in promiscious mode.
497 				 *
498 				 * NOTE:  The assumption here is that if we
499 				 * cannot find the destination in our fdb, its
500 				 * a unicast address, and came from either a
501 				 * vnet or down the stack (when plumbed) it
502 				 * must be destinded for an ethernet device
503 				 * outside our ldoms.
504 				 */
505 				if (caller == VSW_VNETPORT) {
506 					/* promisc check copy etc */
507 					vsw_mac_rx(vswp, mrh, mp,
508 					    VSW_MACRX_PROMISC |
509 					    VSW_MACRX_COPYMSG);
510 
511 					if ((ret_m = vsw_tx_msg(vswp, mp))
512 					    != NULL) {
513 						DERR(vswp, "%s: drop mblks to "
514 						    "phys dev", __func__);
515 						freemsgchain(ret_m);
516 					}
517 
518 				} else if (caller == VSW_PHYSDEV) {
519 					/*
520 					 * Pkt seen because card in promisc
521 					 * mode. Send up stack if plumbed in
522 					 * promisc mode, else drop it.
523 					 */
524 					vsw_mac_rx(vswp, mrh, mp,
525 					    VSW_MACRX_PROMISC |
526 					    VSW_MACRX_FREEMSG);
527 
528 				} else if (caller == VSW_LOCALDEV) {
529 					/*
530 					 * Pkt came down the stack, send out
531 					 * over physical device.
532 					 */
533 					if ((ret_m = vsw_tx_msg(vswp, mp))
534 					    != NULL) {
535 						DERR(vswp, "%s: drop mblks to "
536 						    "phys dev", __func__);
537 						freemsgchain(ret_m);
538 					}
539 				}
540 			}
541 		}
542 	}
543 	D1(vswp, "%s: exit\n", __func__);
544 }
545 
546 /*
547  * Switch ethernet frame when in layer 3 mode (i.e. using IP
548  * layer to do the routing).
549  *
550  * There is a large amount of overlap between this function and
551  * vsw_switch_l2_frame. At some stage we need to revisit and refactor
552  * both these functions.
553  */
554 void
555 vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller,
556 			vsw_port_t *arg, mac_resource_handle_t mrh)
557 {
558 	struct ether_header	*ehp;
559 	mblk_t			*bp = NULL;
560 	mblk_t			*mpt;
561 	uint32_t		count;
562 	vsw_fdbe_t		*fp;
563 
564 	D1(vswp, "%s: enter (caller %d)", __func__, caller);
565 
566 	/*
567 	 * In layer 3 mode should only ever be switching packets
568 	 * between IP layer and vnet devices. So make sure thats
569 	 * who is invoking us.
570 	 */
571 	if ((caller != VSW_LOCALDEV) && (caller != VSW_VNETPORT)) {
572 		DERR(vswp, "%s: unexpected caller (%d)", __func__, caller);
573 		freemsgchain(mp);
574 		return;
575 	}
576 
577 	/* process the chain of packets */
578 	bp = mp;
579 	while (bp) {
580 		ehp = (struct ether_header *)bp->b_rptr;
581 		count = vsw_get_same_dest_list(ehp, &mp, &mpt, &bp);
582 		ASSERT(count != 0);
583 
584 		D2(vswp, "%s: mblk data buffer %lld : actual data size %lld",
585 		    __func__, MBLKSIZE(mp), MBLKL(mp));
586 
587 		/*
588 		 * Find fdb entry for the destination
589 		 * and hold a reference to it.
590 		 */
591 		fp = vsw_fdbe_find(vswp, &ehp->ether_dhost);
592 		if (fp != NULL) {
593 
594 			D2(vswp, "%s: sending to target port", __func__);
595 			(void) vsw_portsend(fp->portp, mp, mpt, count);
596 
597 			/* Release the reference on the fdb entry */
598 			VSW_FDBE_REFRELE(fp);
599 		} else {
600 			/*
601 			 * Destination not in FDB
602 			 *
603 			 * If the destination is broadcast or
604 			 * multicast forward the packet to all
605 			 * (VNETPORTs, PHYSDEV, LOCALDEV),
606 			 * except the caller.
607 			 */
608 			if (IS_BROADCAST(ehp)) {
609 				D2(vswp, "%s: BROADCAST pkt", __func__);
610 				(void) vsw_forward_all(vswp, mp, caller, arg);
611 			} else if (IS_MULTICAST(ehp)) {
612 				D2(vswp, "%s: MULTICAST pkt", __func__);
613 				(void) vsw_forward_grp(vswp, mp, caller, arg);
614 			} else {
615 				/*
616 				 * Unicast pkt from vnet that we don't have
617 				 * an FDB entry for, so must be destinded for
618 				 * the outside world. Attempt to send up to the
619 				 * IP layer to allow it to deal with it.
620 				 */
621 				if (caller == VSW_VNETPORT) {
622 					vsw_mac_rx(vswp, mrh,
623 					    mp, VSW_MACRX_FREEMSG);
624 				}
625 			}
626 		}
627 	}
628 
629 	D1(vswp, "%s: exit", __func__);
630 }
631 
632 /*
633  * Forward the ethernet frame to all ports (VNETPORTs, PHYSDEV, LOCALDEV),
634  * except the caller (port on which frame arrived).
635  */
636 static int
637 vsw_forward_all(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg)
638 {
639 	vsw_port_list_t	*plist = &vswp->plist;
640 	vsw_port_t	*portp;
641 	mblk_t		*nmp = NULL;
642 	mblk_t		*ret_m = NULL;
643 	int		skip_port = 0;
644 
645 	D1(vswp, "vsw_forward_all: enter\n");
646 
647 	/*
648 	 * Broadcast message from inside ldoms so send to outside
649 	 * world if in either of layer 2 modes.
650 	 */
651 	if (((vswp->smode[vswp->smode_idx] == VSW_LAYER2) ||
652 	    (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) &&
653 	    ((caller == VSW_LOCALDEV) || (caller == VSW_VNETPORT))) {
654 
655 		nmp = vsw_dupmsgchain(mp);
656 		if (nmp) {
657 			if ((ret_m = vsw_tx_msg(vswp, nmp)) != NULL) {
658 				DERR(vswp, "%s: dropping pkt(s) "
659 				    "consisting of %ld bytes of data for"
660 				    " physical device", __func__, MBLKL(ret_m));
661 				freemsgchain(ret_m);
662 			}
663 		}
664 	}
665 
666 	if (caller == VSW_VNETPORT)
667 		skip_port = 1;
668 
669 	/*
670 	 * Broadcast message from other vnet (layer 2 or 3) or outside
671 	 * world (layer 2 only), send up stack if plumbed.
672 	 */
673 	if ((caller == VSW_PHYSDEV) || (caller == VSW_VNETPORT)) {
674 		vsw_mac_rx(vswp, NULL, mp, VSW_MACRX_COPYMSG);
675 	}
676 
677 	/* send it to all VNETPORTs */
678 	READ_ENTER(&plist->lockrw);
679 	for (portp = plist->head; portp != NULL; portp = portp->p_next) {
680 		D2(vswp, "vsw_forward_all: port %d", portp->p_instance);
681 		/*
682 		 * Caution ! - don't reorder these two checks as arg
683 		 * will be NULL if the caller is PHYSDEV. skip_port is
684 		 * only set if caller is VNETPORT.
685 		 */
686 		if ((skip_port) && (portp == arg)) {
687 			continue;
688 		} else {
689 			nmp = vsw_dupmsgchain(mp);
690 			if (nmp) {
691 				mblk_t	*mpt = nmp;
692 				uint32_t count = 1;
693 
694 				/* Find tail */
695 				while (mpt->b_next != NULL) {
696 					mpt = mpt->b_next;
697 					count++;
698 				}
699 				/*
700 				 * The plist->lockrw is protecting the
701 				 * portp from getting destroyed here.
702 				 * So, no ref_cnt is incremented here.
703 				 */
704 				(void) vsw_portsend(portp, nmp, mpt, count);
705 			} else {
706 				DERR(vswp, "vsw_forward_all: nmp NULL");
707 			}
708 		}
709 	}
710 	RW_EXIT(&plist->lockrw);
711 
712 	freemsgchain(mp);
713 
714 	D1(vswp, "vsw_forward_all: exit\n");
715 	return (0);
716 }
717 
718 /*
719  * Forward pkts to any devices or interfaces which have registered
720  * an interest in them (i.e. multicast groups).
721  */
722 static int
723 vsw_forward_grp(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg)
724 {
725 	struct ether_header	*ehp = (struct ether_header *)mp->b_rptr;
726 	mfdb_ent_t		*entp = NULL;
727 	mfdb_ent_t		*tpp = NULL;
728 	vsw_port_t 		*port;
729 	uint64_t		key = 0;
730 	mblk_t			*nmp = NULL;
731 	mblk_t			*ret_m = NULL;
732 	boolean_t		check_if = B_TRUE;
733 
734 	/*
735 	 * Convert address to hash table key
736 	 */
737 	KEY_HASH(key, &ehp->ether_dhost);
738 
739 	D1(vswp, "%s: key 0x%llx", __func__, key);
740 
741 	/*
742 	 * If pkt came from either a vnet or down the stack (if we are
743 	 * plumbed) and we are in layer 2 mode, then we send the pkt out
744 	 * over the physical adapter, and then check to see if any other
745 	 * vnets are interested in it.
746 	 */
747 	if (((vswp->smode[vswp->smode_idx] == VSW_LAYER2) ||
748 	    (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) &&
749 	    ((caller == VSW_VNETPORT) || (caller == VSW_LOCALDEV))) {
750 		nmp = vsw_dupmsgchain(mp);
751 		if (nmp) {
752 			if ((ret_m = vsw_tx_msg(vswp, nmp)) != NULL) {
753 				DERR(vswp, "%s: dropping pkt(s) consisting of "
754 				    "%ld bytes of data for physical device",
755 				    __func__, MBLKL(ret_m));
756 				freemsgchain(ret_m);
757 			}
758 		}
759 	}
760 
761 	READ_ENTER(&vswp->mfdbrw);
762 	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)key,
763 	    (mod_hash_val_t *)&entp) != 0) {
764 		D3(vswp, "%s: no table entry found for addr 0x%llx",
765 		    __func__, key);
766 	} else {
767 		/*
768 		 * Send to list of devices associated with this address...
769 		 */
770 		for (tpp = entp; tpp != NULL; tpp = tpp->nextp) {
771 
772 			/* dont send to ourselves */
773 			if ((caller == VSW_VNETPORT) &&
774 			    (tpp->d_addr == (void *)arg)) {
775 				port = (vsw_port_t *)tpp->d_addr;
776 				D3(vswp, "%s: not sending to ourselves"
777 				    " : port %d", __func__, port->p_instance);
778 				continue;
779 
780 			} else if ((caller == VSW_LOCALDEV) &&
781 			    (tpp->d_type == VSW_LOCALDEV)) {
782 				D2(vswp, "%s: not sending back up stack",
783 				    __func__);
784 				continue;
785 			}
786 
787 			if (tpp->d_type == VSW_VNETPORT) {
788 				port = (vsw_port_t *)tpp->d_addr;
789 				D3(vswp, "%s: sending to port %ld for addr "
790 				    "0x%llx", __func__, port->p_instance, key);
791 
792 				nmp = vsw_dupmsgchain(mp);
793 				if (nmp) {
794 					mblk_t	*mpt = nmp;
795 					uint32_t count = 1;
796 
797 					/* Find tail */
798 					while (mpt->b_next != NULL) {
799 						mpt = mpt->b_next;
800 						count++;
801 					}
802 					/*
803 					 * The vswp->mfdbrw is protecting the
804 					 * portp from getting destroyed here.
805 					 * So, no ref_cnt is incremented here.
806 					 */
807 					(void) vsw_portsend(port, nmp, mpt,
808 					    count);
809 				}
810 			} else {
811 				vsw_mac_rx(vswp, NULL,
812 				    mp, VSW_MACRX_COPYMSG);
813 				D2(vswp, "%s: sending up stack"
814 				    " for addr 0x%llx", __func__, key);
815 				check_if = B_FALSE;
816 			}
817 		}
818 	}
819 
820 	RW_EXIT(&vswp->mfdbrw);
821 
822 	/*
823 	 * If the pkt came from either a vnet or from physical device,
824 	 * and if we havent already sent the pkt up the stack then we
825 	 * check now if we can/should (i.e. the interface is plumbed
826 	 * and in promisc mode).
827 	 */
828 	if ((check_if) &&
829 	    ((caller == VSW_VNETPORT) || (caller == VSW_PHYSDEV))) {
830 		vsw_mac_rx(vswp, NULL, mp,
831 		    VSW_MACRX_PROMISC | VSW_MACRX_COPYMSG);
832 	}
833 
834 	freemsgchain(mp);
835 
836 	D1(vswp, "%s: exit", __func__);
837 
838 	return (0);
839 }
840 
/*
 * Build the vlan id hash table of a vsw device or port: create the empty
 * table, then insert every vlan id the device or port has been assigned.
 *
 * Arguments:
 *   arg:  vsw device or port.
 *   type: type of arg; VSW_LOCALDEV(vsw device) or VSW_VNETPORT(port).
 */
void
vsw_create_vlans(void *arg, int type)
{
	/* step 1: an empty hash table */
	vsw_vlan_create_hash(arg, type);

	/* step 2: populate it with the assigned vlan ids */
	vsw_vlan_add_ids(arg, type);
}
858 
/*
 * Tear down the vlan id hash table of a vsw device or port: remove the
 * vlan ids it holds, then destroy the table itself.
 *
 * Arguments:
 *   arg:  vsw device or port.
 *   type: type of arg; VSW_LOCALDEV(vsw device) or VSW_VNETPORT(port).
 */
void
vsw_destroy_vlans(void *arg, int type)
{
	/* step 1: empty the hash table */
	vsw_vlan_remove_ids(arg, type);

	/* step 2: destroy the now empty table */
	vsw_vlan_destroy_hash(arg, type);
}
875 
876 /*
877  * Create a vlan-id hash table for the given vsw device or port.
878  */
879 static void
880 vsw_vlan_create_hash(void *arg, int type)
881 {
882 	char		hashname[MAXNAMELEN];
883 
884 	if (type == VSW_LOCALDEV) {
885 		vsw_t		*vswp = (vsw_t *)arg;
886 
887 		(void) snprintf(hashname, MAXNAMELEN, "vsw%d-vlan-hash",
888 		    vswp->instance);
889 
890 		vswp->vlan_nchains = vsw_vlan_nchains;
891 		vswp->vlan_hashp = mod_hash_create_idhash(hashname,
892 		    vswp->vlan_nchains, mod_hash_null_valdtor);
893 
894 	} else if (type == VSW_VNETPORT) {
895 		vsw_port_t	*portp = (vsw_port_t *)arg;
896 
897 		(void) snprintf(hashname, MAXNAMELEN, "port%d-vlan-hash",
898 		    portp->p_instance);
899 
900 		portp->vlan_nchains = vsw_vlan_nchains;
901 		portp->vlan_hashp = mod_hash_create_idhash(hashname,
902 		    portp->vlan_nchains, mod_hash_null_valdtor);
903 
904 	} else {
905 		return;
906 	}
907 }
908 
909 /*
910  * Destroy the vlan-id hash table for the given vsw device or port.
911  */
912 static void
913 vsw_vlan_destroy_hash(void *arg, int type)
914 {
915 	if (type == VSW_LOCALDEV) {
916 		vsw_t		*vswp = (vsw_t *)arg;
917 
918 		mod_hash_destroy_hash(vswp->vlan_hashp);
919 		vswp->vlan_nchains = 0;
920 	} else if (type == VSW_VNETPORT) {
921 		vsw_port_t	*portp = (vsw_port_t *)arg;
922 
923 		mod_hash_destroy_hash(portp->vlan_hashp);
924 		portp->vlan_nchains = 0;
925 	} else {
926 		return;
927 	}
928 }
929 
930 /*
931  * Add vlan ids of the given vsw device or port into its hash table.
932  */
933 void
934 vsw_vlan_add_ids(void *arg, int type)
935 {
936 	int	rv;
937 	int	i;
938 
939 	if (type == VSW_LOCALDEV) {
940 		vsw_t		*vswp = (vsw_t *)arg;
941 
942 		rv = mod_hash_insert(vswp->vlan_hashp,
943 		    (mod_hash_key_t)VLAN_ID_KEY(vswp->pvid),
944 		    (mod_hash_val_t)B_TRUE);
945 		ASSERT(rv == 0);
946 
947 		for (i = 0; i < vswp->nvids; i++) {
948 			rv = mod_hash_insert(vswp->vlan_hashp,
949 			    (mod_hash_key_t)VLAN_ID_KEY(vswp->vids[i]),
950 			    (mod_hash_val_t)B_TRUE);
951 			ASSERT(rv == 0);
952 		}
953 
954 	} else if (type == VSW_VNETPORT) {
955 		vsw_port_t	*portp = (vsw_port_t *)arg;
956 
957 		rv = mod_hash_insert(portp->vlan_hashp,
958 		    (mod_hash_key_t)VLAN_ID_KEY(portp->pvid),
959 		    (mod_hash_val_t)B_TRUE);
960 		ASSERT(rv == 0);
961 
962 		for (i = 0; i < portp->nvids; i++) {
963 			rv = mod_hash_insert(portp->vlan_hashp,
964 			    (mod_hash_key_t)VLAN_ID_KEY(portp->vids[i]),
965 			    (mod_hash_val_t)B_TRUE);
966 			ASSERT(rv == 0);
967 		}
968 
969 	} else {
970 		return;
971 	}
972 }
973 
974 /*
975  * Remove vlan ids of the given vsw device or port from its hash table.
976  */
977 void
978 vsw_vlan_remove_ids(void *arg, int type)
979 {
980 	mod_hash_val_t	vp;
981 	int		rv;
982 	int		i;
983 
984 	if (type == VSW_LOCALDEV) {
985 		vsw_t		*vswp = (vsw_t *)arg;
986 
987 		rv = vsw_vlan_lookup(vswp->vlan_hashp, vswp->pvid);
988 		if (rv == B_TRUE) {
989 			rv = mod_hash_remove(vswp->vlan_hashp,
990 			    (mod_hash_key_t)VLAN_ID_KEY(vswp->pvid),
991 			    (mod_hash_val_t *)&vp);
992 			ASSERT(rv == 0);
993 		}
994 
995 		for (i = 0; i < vswp->nvids; i++) {
996 			rv = vsw_vlan_lookup(vswp->vlan_hashp, vswp->vids[i]);
997 			if (rv == B_TRUE) {
998 				rv = mod_hash_remove(vswp->vlan_hashp,
999 				    (mod_hash_key_t)VLAN_ID_KEY(vswp->vids[i]),
1000 				    (mod_hash_val_t *)&vp);
1001 				ASSERT(rv == 0);
1002 			}
1003 		}
1004 
1005 	} else if (type == VSW_VNETPORT) {
1006 		vsw_port_t	*portp = (vsw_port_t *)arg;
1007 
1008 		portp = (vsw_port_t *)arg;
1009 		rv = vsw_vlan_lookup(portp->vlan_hashp, portp->pvid);
1010 		if (rv == B_TRUE) {
1011 			rv = mod_hash_remove(portp->vlan_hashp,
1012 			    (mod_hash_key_t)VLAN_ID_KEY(portp->pvid),
1013 			    (mod_hash_val_t *)&vp);
1014 			ASSERT(rv == 0);
1015 		}
1016 
1017 		for (i = 0; i < portp->nvids; i++) {
1018 			rv = vsw_vlan_lookup(portp->vlan_hashp, portp->vids[i]);
1019 			if (rv == B_TRUE) {
1020 				rv = mod_hash_remove(portp->vlan_hashp,
1021 				    (mod_hash_key_t)VLAN_ID_KEY(portp->vids[i]),
1022 				    (mod_hash_val_t *)&vp);
1023 				ASSERT(rv == 0);
1024 			}
1025 		}
1026 
1027 	} else {
1028 		return;
1029 	}
1030 }
1031 
1032 /*
1033  * Find the given vlan id in the hash table.
1034  * Return: B_TRUE if the id is found; B_FALSE if not found.
1035  */
1036 boolean_t
1037 vsw_vlan_lookup(mod_hash_t *vlan_hashp, uint16_t vid)
1038 {
1039 	int		rv;
1040 	mod_hash_val_t	vp;
1041 
1042 	rv = mod_hash_find(vlan_hashp, VLAN_ID_KEY(vid), (mod_hash_val_t *)&vp);
1043 
1044 	if (rv != 0)
1045 		return (B_FALSE);
1046 
1047 	return (B_TRUE);
1048 }
1049 
1050 /*
1051  * Add an entry into FDB for the given vsw.
1052  */
1053 void
1054 vsw_fdbe_add(vsw_t *vswp, void *port)
1055 {
1056 	uint64_t	addr = 0;
1057 	vsw_port_t	*portp;
1058 	vsw_fdbe_t	*fp;
1059 	int		rv;
1060 
1061 	portp = (vsw_port_t *)port;
1062 	KEY_HASH(addr, &portp->p_macaddr);
1063 
1064 	fp = kmem_zalloc(sizeof (vsw_fdbe_t), KM_SLEEP);
1065 	fp->portp = port;
1066 
1067 	/*
1068 	 * Note: duplicate keys will be rejected by mod_hash.
1069 	 */
1070 	rv = mod_hash_insert(vswp->fdb_hashp, (mod_hash_key_t)addr,
1071 	    (mod_hash_val_t)fp);
1072 	ASSERT(rv == 0);
1073 }
1074 
1075 /*
1076  * Remove an entry from FDB.
1077  */
1078 void
1079 vsw_fdbe_del(vsw_t *vswp, struct ether_addr *eaddr)
1080 {
1081 	uint64_t	addr = 0;
1082 	vsw_fdbe_t	*fp;
1083 	int		rv;
1084 
1085 	KEY_HASH(addr, eaddr);
1086 
1087 	/*
1088 	 * Remove the entry from fdb hash table.
1089 	 * This prevents further references to this fdb entry.
1090 	 */
1091 	rv = mod_hash_remove(vswp->fdb_hashp, (mod_hash_key_t)addr,
1092 	    (mod_hash_val_t *)&fp);
1093 	if (rv != 0) {
1094 		/* invalid key? */
1095 		return;
1096 	}
1097 
1098 	/*
1099 	 * If there are threads already ref holding before the entry was
1100 	 * removed from hash table, then wait for ref count to drop to zero.
1101 	 */
1102 	while (fp->refcnt != 0) {
1103 		delay(drv_usectohz(vsw_fdbe_refcnt_delay));
1104 	}
1105 
1106 	kmem_free(fp, sizeof (*fp));
1107 }
1108 
1109 /*
1110  * Search fdb for a given mac address. If an entry is found, hold
1111  * a reference to it and return the entry, else returns NULL.
1112  */
1113 static vsw_fdbe_t *
1114 vsw_fdbe_find(vsw_t *vswp, struct ether_addr *addrp)
1115 {
1116 	uint64_t	key = 0;
1117 	vsw_fdbe_t	*fp;
1118 	int		rv;
1119 
1120 	KEY_HASH(key, addrp);
1121 
1122 	rv = mod_hash_find_cb(vswp->fdb_hashp, (mod_hash_key_t)key,
1123 	    (mod_hash_val_t *)&fp, vsw_fdbe_find_cb);
1124 
1125 	if (rv != 0)
1126 		return (NULL);
1127 
1128 	return (fp);
1129 }
1130 
1131 /*
1132  * Callback function provided to mod_hash_find_cb(). After finding the fdb
1133  * entry corresponding to the key (macaddr), this callback will be invoked by
1134  * mod_hash_find_cb() to atomically increment the reference count on the fdb
1135  * entry before returning the found entry.
1136  */
1137 static void
1138 vsw_fdbe_find_cb(mod_hash_key_t key, mod_hash_val_t val)
1139 {
1140 	_NOTE(ARGUNUSED(key))
1141 	VSW_FDBE_REFHOLD((vsw_fdbe_t *)val);
1142 }
1143 
1144 /*
1145  * A given frame must be always tagged with the appropriate vlan id (unless it
1146  * is in the default-vlan) before the mac address switching function is called.
1147  * Otherwise, after switching function determines the destination, we cannot
1148  * figure out if the destination belongs to the the same vlan that the frame
1149  * originated from and if it needs tag/untag. Frames which are inbound from
1150  * the external(physical) network over a vlan trunk link are always tagged.
1151  * However frames which are received from a vnet-port over ldc or frames which
1152  * are coming down the stack on the service domain over vsw interface may be
1153  * untagged. These frames must be tagged with the appropriate pvid of the
1154  * sender (vnet-port or vsw device), before invoking the switching function.
1155  *
1156  * Arguments:
1157  *   arg:    caller of the function.
1158  *   type:   type of arg(caller): VSW_LOCALDEV(vsw) or VSW_VNETPORT(port)
1159  *   mp:     frame(s) to be tagged.
1160  */
1161 mblk_t *
1162 vsw_vlan_frame_pretag(void *arg, int type, mblk_t *mp)
1163 {
1164 	vsw_t			*vswp;
1165 	vsw_port_t		*portp;
1166 	struct ether_header	*ehp;
1167 	mblk_t			*bp;
1168 	mblk_t			*bpt;
1169 	mblk_t			*bph;
1170 	mblk_t			*bpn;
1171 	uint16_t		pvid;
1172 
1173 	ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));
1174 
1175 	if (type == VSW_LOCALDEV) {
1176 		vswp = (vsw_t *)arg;
1177 		pvid = vswp->pvid;
1178 		portp = NULL;
1179 	} else {
1180 		/* VSW_VNETPORT */
1181 		portp = (vsw_port_t *)arg;
1182 		pvid = portp->pvid;
1183 		vswp = portp->p_vswp;
1184 	}
1185 
1186 	bpn = bph = bpt = NULL;
1187 
1188 	for (bp = mp; bp != NULL; bp = bpn) {
1189 
1190 		bpn = bp->b_next;
1191 		bp->b_next = bp->b_prev = NULL;
1192 
1193 		/* Determine if it is an untagged frame */
1194 		ehp = (struct ether_header *)bp->b_rptr;
1195 
1196 		if (ehp->ether_type != ETHERTYPE_VLAN) {	/* untagged */
1197 
1198 			/* no need to tag if the frame is in default vlan */
1199 			if (pvid != vswp->default_vlan_id) {
1200 				bp = vnet_vlan_insert_tag(bp, pvid);
1201 				if (bp == NULL) {
1202 					continue;
1203 				}
1204 			}
1205 		}
1206 
1207 		/* build a chain of processed packets */
1208 		if (bph == NULL) {
1209 			bph = bpt = bp;
1210 		} else {
1211 			bpt->b_next = bp;
1212 			bpt = bp;
1213 		}
1214 
1215 	}
1216 
1217 	return (bph);
1218 }
1219 
1220 /*
1221  * Frames destined to a vnet-port or to the local vsw interface, must be
1222  * untagged if necessary before sending. This function first checks that the
1223  * frame can be sent to the destination in the vlan identified by the frame
1224  * tag. Note that when this function is invoked the frame must have been
1225  * already tagged (unless it is in the default-vlan). Because, this function is
1226  * called when the switching function determines the destination and invokes
1227  * its send function (vnet-port or vsw interface) and all frames would have
1228  * been tagged by this time (see comments in vsw_vlan_frame_pretag()).
1229  *
1230  * Arguments:
1231  *   arg:    destination device.
1232  *   type:   type of arg(destination): VSW_LOCALDEV(vsw) or VSW_VNETPORT(port)
1233  *   np:     head of pkt chain to be validated and untagged.
1234  *   npt:    tail of pkt chain to be validated and untagged.
1235  *
1236  * Returns:
1237  *   np:     head of updated chain of packets
1238  *   npt:    tail of updated chain of packets
1239  *   rv:     count of any packets dropped
1240  */
1241 uint32_t
1242 vsw_vlan_frame_untag(void *arg, int type, mblk_t **np, mblk_t **npt)
1243 {
1244 	mblk_t			*bp;
1245 	mblk_t			*bpt;
1246 	mblk_t			*bph;
1247 	mblk_t			*bpn;
1248 	vsw_port_t		*portp;
1249 	vsw_t			*vswp;
1250 	uint32_t		count;
1251 	struct ether_header	*ehp;
1252 	boolean_t		is_tagged;
1253 	boolean_t		rv;
1254 	uint16_t		vlan_id;
1255 	uint16_t		pvid;
1256 	mod_hash_t		*vlan_hashp;
1257 
1258 	ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));
1259 
1260 	if (type == VSW_LOCALDEV) {
1261 		vswp = (vsw_t *)arg;
1262 		pvid = vswp->pvid;
1263 		vlan_hashp = vswp->vlan_hashp;
1264 		portp = NULL;
1265 	} else {
1266 		/* type == VSW_VNETPORT */
1267 		portp = (vsw_port_t *)arg;
1268 		vswp = portp->p_vswp;
1269 		vlan_hashp = portp->vlan_hashp;
1270 		pvid = portp->pvid;
1271 	}
1272 
1273 	bpn = bph = bpt = NULL;
1274 	count = 0;
1275 
1276 	for (bp = *np; bp != NULL; bp = bpn) {
1277 
1278 		bpn = bp->b_next;
1279 		bp->b_next = bp->b_prev = NULL;
1280 
1281 		/*
1282 		 * Determine the vlan id that the frame belongs to.
1283 		 */
1284 		ehp = (struct ether_header *)bp->b_rptr;
1285 		is_tagged = vsw_frame_lookup_vid(arg, type, ehp, &vlan_id);
1286 
1287 		/*
1288 		 * Check if the destination is in the same vlan.
1289 		 */
1290 		rv = vsw_vlan_lookup(vlan_hashp, vlan_id);
1291 		if (rv == B_FALSE) {
1292 			/* drop the packet */
1293 			freemsg(bp);
1294 			count++;
1295 			continue;
1296 		}
1297 
1298 		/*
1299 		 * Check the frame header if tag/untag is  needed.
1300 		 */
1301 		if (is_tagged == B_FALSE) {
1302 			/*
1303 			 * Untagged frame. We shouldn't have an untagged
1304 			 * packet at this point, unless the destination's
1305 			 * vlan id is default-vlan-id; if it is not the
1306 			 * default-vlan-id, we drop the packet.
1307 			 */
1308 			if (vlan_id != vswp->default_vlan_id) {
1309 				/* drop the packet */
1310 				freemsg(bp);
1311 				count++;
1312 				continue;
1313 			}
1314 		} else {
1315 			/*
1316 			 * Tagged frame, untag if it's the destination's pvid.
1317 			 */
1318 			if (vlan_id == pvid) {
1319 
1320 				bp = vnet_vlan_remove_tag(bp);
1321 				if (bp == NULL) {
1322 					/* packet dropped */
1323 					count++;
1324 					continue;
1325 				}
1326 			}
1327 		}
1328 
1329 		/* build a chain of processed packets */
1330 		if (bph == NULL) {
1331 			bph = bpt = bp;
1332 		} else {
1333 			bpt->b_next = bp;
1334 			bpt = bp;
1335 		}
1336 
1337 	}
1338 
1339 	*np = bph;
1340 	*npt = bpt;
1341 
1342 	return (count);
1343 }
1344 
1345 /*
1346  * Lookup the vlan id of the given frame. If it is a vlan-tagged frame,
1347  * then the vlan-id is available in the tag; otherwise, its vlan id is
1348  * implicitly obtained based on the caller (destination of the frame:
1349  * VSW_VNETPORT or VSW_LOCALDEV).
1350  * The vlan id determined is returned in vidp.
1351  * Returns: B_TRUE if it is a tagged frame; B_FALSE if it is untagged.
1352  */
1353 boolean_t
1354 vsw_frame_lookup_vid(void *arg, int caller, struct ether_header *ehp,
1355 	uint16_t *vidp)
1356 {
1357 	struct ether_vlan_header	*evhp;
1358 	vsw_t				*vswp;
1359 	vsw_port_t			*portp;
1360 
1361 	/* If it's a tagged frame, get the vid from vlan header */
1362 	if (ehp->ether_type == ETHERTYPE_VLAN) {
1363 
1364 		evhp = (struct ether_vlan_header *)ehp;
1365 		*vidp = VLAN_ID(ntohs(evhp->ether_tci));
1366 		return (B_TRUE);
1367 	}
1368 
1369 	/* Untagged frame; determine vlan id based on caller */
1370 	switch (caller) {
1371 
1372 	case VSW_VNETPORT:
1373 		/*
1374 		 * packet destined to a vnet; vlan-id is pvid of vnet-port.
1375 		 */
1376 		portp = (vsw_port_t *)arg;
1377 		*vidp = portp->pvid;
1378 		break;
1379 
1380 	case VSW_LOCALDEV:
1381 
1382 		/*
1383 		 * packet destined to vsw interface;
1384 		 * vlan-id is port-vlan-id of vsw device.
1385 		 */
1386 		vswp = (vsw_t *)arg;
1387 		*vidp = vswp->pvid;
1388 		break;
1389 	}
1390 
1391 	return (B_FALSE);
1392 }
1393 
1394 /*
1395  * Add or remove multicast address(es).
1396  *
1397  * Returns 0 on success, 1 on failure.
1398  */
1399 int
1400 vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port)
1401 {
1402 	mcst_addr_t		*mcst_p = NULL;
1403 	vsw_t			*vswp = port->p_vswp;
1404 	uint64_t		addr = 0x0;
1405 	int			i;
1406 
1407 	D1(vswp, "%s: enter", __func__);
1408 
1409 	D2(vswp, "%s: %d addresses", __func__, mcst_pkt->count);
1410 
1411 	for (i = 0; i < mcst_pkt->count; i++) {
1412 		/*
1413 		 * Convert address into form that can be used
1414 		 * as hash table key.
1415 		 */
1416 		KEY_HASH(addr, &(mcst_pkt->mca[i]));
1417 
1418 		/*
1419 		 * Add or delete the specified address/port combination.
1420 		 */
1421 		if (mcst_pkt->set == 0x1) {
1422 			D3(vswp, "%s: adding multicast address 0x%llx for "
1423 			    "port %ld", __func__, addr, port->p_instance);
1424 			if (vsw_add_mcst(vswp, VSW_VNETPORT, addr, port) == 0) {
1425 				/*
1426 				 * Update the list of multicast
1427 				 * addresses contained within the
1428 				 * port structure to include this new
1429 				 * one.
1430 				 */
1431 				mcst_p = kmem_zalloc(sizeof (mcst_addr_t),
1432 				    KM_NOSLEEP);
1433 				if (mcst_p == NULL) {
1434 					DERR(vswp, "%s: unable to alloc mem",
1435 					    __func__);
1436 					(void) vsw_del_mcst(vswp,
1437 					    VSW_VNETPORT, addr, port);
1438 					return (1);
1439 				}
1440 
1441 				mcst_p->nextp = NULL;
1442 				mcst_p->addr = addr;
1443 				ether_copy(&mcst_pkt->mca[i], &mcst_p->mca);
1444 
1445 				/*
1446 				 * Program the address into HW. If the addr
1447 				 * has already been programmed then the MAC
1448 				 * just increments a ref counter (which is
1449 				 * used when the address is being deleted)
1450 				 */
1451 				mutex_enter(&vswp->mac_lock);
1452 				if (vswp->mh != NULL) {
1453 					if (mac_multicst_add(vswp->mh,
1454 					    (uchar_t *)&mcst_pkt->mca[i])) {
1455 						mutex_exit(&vswp->mac_lock);
1456 						cmn_err(CE_WARN, "!vsw%d: "
1457 						    "unable to add multicast "
1458 						    "address: %s\n",
1459 						    vswp->instance,
1460 						    ether_sprintf((void *)
1461 						    &mcst_p->mca));
1462 						(void) vsw_del_mcst(vswp,
1463 						    VSW_VNETPORT, addr, port);
1464 						kmem_free(mcst_p,
1465 						    sizeof (*mcst_p));
1466 						return (1);
1467 					}
1468 					mcst_p->mac_added = B_TRUE;
1469 				}
1470 				mutex_exit(&vswp->mac_lock);
1471 
1472 				mutex_enter(&port->mca_lock);
1473 				mcst_p->nextp = port->mcap;
1474 				port->mcap = mcst_p;
1475 				mutex_exit(&port->mca_lock);
1476 
1477 			} else {
1478 				DERR(vswp, "%s: error adding multicast "
1479 				    "address 0x%llx for port %ld",
1480 				    __func__, addr, port->p_instance);
1481 				return (1);
1482 			}
1483 		} else {
1484 			/*
1485 			 * Delete an entry from the multicast hash
1486 			 * table and update the address list
1487 			 * appropriately.
1488 			 */
1489 			if (vsw_del_mcst(vswp, VSW_VNETPORT, addr, port) == 0) {
1490 				D3(vswp, "%s: deleting multicast address "
1491 				    "0x%llx for port %ld", __func__, addr,
1492 				    port->p_instance);
1493 
1494 				mcst_p = vsw_del_addr(VSW_VNETPORT, port, addr);
1495 				ASSERT(mcst_p != NULL);
1496 
1497 				/*
1498 				 * Remove the address from HW. The address
1499 				 * will actually only be removed once the ref
1500 				 * count within the MAC layer has dropped to
1501 				 * zero. I.e. we can safely call this fn even
1502 				 * if other ports are interested in this
1503 				 * address.
1504 				 */
1505 				mutex_enter(&vswp->mac_lock);
1506 				if (vswp->mh != NULL && mcst_p->mac_added) {
1507 					if (mac_multicst_remove(vswp->mh,
1508 					    (uchar_t *)&mcst_pkt->mca[i])) {
1509 						mutex_exit(&vswp->mac_lock);
1510 						cmn_err(CE_WARN, "!vsw%d: "
1511 						    "unable to remove mcast "
1512 						    "address: %s\n",
1513 						    vswp->instance,
1514 						    ether_sprintf((void *)
1515 						    &mcst_p->mca));
1516 						kmem_free(mcst_p,
1517 						    sizeof (*mcst_p));
1518 						return (1);
1519 					}
1520 					mcst_p->mac_added = B_FALSE;
1521 				}
1522 				mutex_exit(&vswp->mac_lock);
1523 				kmem_free(mcst_p, sizeof (*mcst_p));
1524 
1525 			} else {
1526 				DERR(vswp, "%s: error deleting multicast "
1527 				    "addr 0x%llx for port %ld",
1528 				    __func__, addr, port->p_instance);
1529 				return (1);
1530 			}
1531 		}
1532 	}
1533 	D1(vswp, "%s: exit", __func__);
1534 	return (0);
1535 }
1536 
1537 /*
1538  * Add a new multicast entry.
1539  *
1540  * Search hash table based on address. If match found then
1541  * update associated val (which is chain of ports), otherwise
1542  * create new key/val (addr/port) pair and insert into table.
1543  */
1544 int
1545 vsw_add_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg)
1546 {
1547 	int		dup = 0;
1548 	int		rv = 0;
1549 	mfdb_ent_t	*ment = NULL;
1550 	mfdb_ent_t	*tmp_ent = NULL;
1551 	mfdb_ent_t	*new_ent = NULL;
1552 	void		*tgt = NULL;
1553 
1554 	if (devtype == VSW_VNETPORT) {
1555 		/*
1556 		 * Being invoked from a vnet.
1557 		 */
1558 		ASSERT(arg != NULL);
1559 		tgt = arg;
1560 		D2(NULL, "%s: port %d : address 0x%llx", __func__,
1561 		    ((vsw_port_t *)arg)->p_instance, addr);
1562 	} else {
1563 		/*
1564 		 * We are being invoked via the m_multicst mac entry
1565 		 * point.
1566 		 */
1567 		D2(NULL, "%s: address 0x%llx", __func__, addr);
1568 		tgt = (void *)vswp;
1569 	}
1570 
1571 	WRITE_ENTER(&vswp->mfdbrw);
1572 	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr,
1573 	    (mod_hash_val_t *)&ment) != 0) {
1574 
1575 		/* address not currently in table */
1576 		ment = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP);
1577 		ment->d_addr = (void *)tgt;
1578 		ment->d_type = devtype;
1579 		ment->nextp = NULL;
1580 
1581 		if (mod_hash_insert(vswp->mfdb, (mod_hash_key_t)addr,
1582 		    (mod_hash_val_t)ment) != 0) {
1583 			DERR(vswp, "%s: hash table insertion failed", __func__);
1584 			kmem_free(ment, sizeof (mfdb_ent_t));
1585 			rv = 1;
1586 		} else {
1587 			D2(vswp, "%s: added initial entry for 0x%llx to "
1588 			    "table", __func__, addr);
1589 		}
1590 	} else {
1591 		/*
1592 		 * Address in table. Check to see if specified port
1593 		 * is already associated with the address. If not add
1594 		 * it now.
1595 		 */
1596 		tmp_ent = ment;
1597 		while (tmp_ent != NULL) {
1598 			if (tmp_ent->d_addr == (void *)tgt) {
1599 				if (devtype == VSW_VNETPORT) {
1600 					DERR(vswp, "%s: duplicate port entry "
1601 					    "found for portid %ld and key "
1602 					    "0x%llx", __func__,
1603 					    ((vsw_port_t *)arg)->p_instance,
1604 					    addr);
1605 				} else {
1606 					DERR(vswp, "%s: duplicate entry found"
1607 					    "for key 0x%llx", __func__, addr);
1608 				}
1609 				rv = 1;
1610 				dup = 1;
1611 				break;
1612 			}
1613 			tmp_ent = tmp_ent->nextp;
1614 		}
1615 
1616 		/*
1617 		 * Port not on list so add it to end now.
1618 		 */
1619 		if (0 == dup) {
1620 			D2(vswp, "%s: added entry for 0x%llx to table",
1621 			    __func__, addr);
1622 			new_ent = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP);
1623 			new_ent->d_addr = (void *)tgt;
1624 			new_ent->d_type = devtype;
1625 			new_ent->nextp = NULL;
1626 
1627 			tmp_ent = ment;
1628 			while (tmp_ent->nextp != NULL)
1629 				tmp_ent = tmp_ent->nextp;
1630 
1631 			tmp_ent->nextp = new_ent;
1632 		}
1633 	}
1634 
1635 	RW_EXIT(&vswp->mfdbrw);
1636 	return (rv);
1637 }
1638 
1639 /*
1640  * Remove a multicast entry from the hashtable.
1641  *
1642  * Search hash table based on address. If match found, scan
1643  * list of ports associated with address. If specified port
1644  * found remove it from list.
1645  */
1646 int
1647 vsw_del_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg)
1648 {
1649 	mfdb_ent_t	*ment = NULL;
1650 	mfdb_ent_t	*curr_p, *prev_p;
1651 	void		*tgt = NULL;
1652 
1653 	D1(vswp, "%s: enter", __func__);
1654 
1655 	if (devtype == VSW_VNETPORT) {
1656 		tgt = (vsw_port_t *)arg;
1657 		D2(vswp, "%s: removing port %d from mFDB for address"
1658 		    " 0x%llx", __func__, ((vsw_port_t *)tgt)->p_instance, addr);
1659 	} else {
1660 		D2(vswp, "%s: removing entry", __func__);
1661 		tgt = (void *)vswp;
1662 	}
1663 
1664 	WRITE_ENTER(&vswp->mfdbrw);
1665 	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr,
1666 	    (mod_hash_val_t *)&ment) != 0) {
1667 		D2(vswp, "%s: address 0x%llx not in table", __func__, addr);
1668 		RW_EXIT(&vswp->mfdbrw);
1669 		return (1);
1670 	}
1671 
1672 	prev_p = curr_p = ment;
1673 
1674 	while (curr_p != NULL) {
1675 		if (curr_p->d_addr == (void *)tgt) {
1676 			if (devtype == VSW_VNETPORT) {
1677 				D2(vswp, "%s: port %d found", __func__,
1678 				    ((vsw_port_t *)tgt)->p_instance);
1679 			} else {
1680 				D2(vswp, "%s: instance found", __func__);
1681 			}
1682 
1683 			if (prev_p == curr_p) {
1684 				/*
1685 				 * head of list, if no other element is in
1686 				 * list then destroy this entry, otherwise
1687 				 * just replace it with updated value.
1688 				 */
1689 				ment = curr_p->nextp;
1690 				if (ment == NULL) {
1691 					(void) mod_hash_destroy(vswp->mfdb,
1692 					    (mod_hash_val_t)addr);
1693 				} else {
1694 					(void) mod_hash_replace(vswp->mfdb,
1695 					    (mod_hash_key_t)addr,
1696 					    (mod_hash_val_t)ment);
1697 				}
1698 			} else {
1699 				/*
1700 				 * Not head of list, no need to do
1701 				 * replacement, just adjust list pointers.
1702 				 */
1703 				prev_p->nextp = curr_p->nextp;
1704 			}
1705 			break;
1706 		}
1707 
1708 		prev_p = curr_p;
1709 		curr_p = curr_p->nextp;
1710 	}
1711 
1712 	RW_EXIT(&vswp->mfdbrw);
1713 
1714 	D1(vswp, "%s: exit", __func__);
1715 
1716 	if (curr_p == NULL)
1717 		return (1);
1718 	kmem_free(curr_p, sizeof (mfdb_ent_t));
1719 	return (0);
1720 }
1721 
1722 /*
1723  * Port is being deleted, but has registered an interest in one
1724  * or more multicast groups. Using the list of addresses maintained
1725  * within the port structure find the appropriate entry in the hash
1726  * table and remove this port from the list of interested ports.
1727  */
1728 void
1729 vsw_del_mcst_port(vsw_port_t *port)
1730 {
1731 	mcst_addr_t	*mcap = NULL;
1732 	vsw_t		*vswp = port->p_vswp;
1733 
1734 	D1(vswp, "%s: enter", __func__);
1735 
1736 	mutex_enter(&port->mca_lock);
1737 
1738 	while ((mcap = port->mcap) != NULL) {
1739 
1740 		port->mcap = mcap->nextp;
1741 
1742 		mutex_exit(&port->mca_lock);
1743 
1744 		(void) vsw_del_mcst(vswp, VSW_VNETPORT,
1745 		    mcap->addr, port);
1746 
1747 		/*
1748 		 * Remove the address from HW. The address
1749 		 * will actually only be removed once the ref
1750 		 * count within the MAC layer has dropped to
1751 		 * zero. I.e. we can safely call this fn even
1752 		 * if other ports are interested in this
1753 		 * address.
1754 		 */
1755 		mutex_enter(&vswp->mac_lock);
1756 		if (vswp->mh != NULL && mcap->mac_added) {
1757 			(void) mac_multicst_remove(vswp->mh,
1758 			    (uchar_t *)&mcap->mca);
1759 		}
1760 		mutex_exit(&vswp->mac_lock);
1761 
1762 		kmem_free(mcap, sizeof (*mcap));
1763 
1764 		mutex_enter(&port->mca_lock);
1765 
1766 	}
1767 
1768 	mutex_exit(&port->mca_lock);
1769 
1770 	D1(vswp, "%s: exit", __func__);
1771 }
1772 
1773 /*
1774  * This vsw instance is detaching, but has registered an interest in one
1775  * or more multicast groups. Using the list of addresses maintained
1776  * within the vsw structure find the appropriate entry in the hash
1777  * table and remove this instance from the list of interested ports.
1778  */
1779 void
1780 vsw_del_mcst_vsw(vsw_t *vswp)
1781 {
1782 	mcst_addr_t	*next_p = NULL;
1783 
1784 	D1(vswp, "%s: enter", __func__);
1785 
1786 	mutex_enter(&vswp->mca_lock);
1787 
1788 	while (vswp->mcap != NULL) {
1789 		DERR(vswp, "%s: deleting addr 0x%llx",
1790 		    __func__, vswp->mcap->addr);
1791 		(void) vsw_del_mcst(vswp, VSW_LOCALDEV, vswp->mcap->addr, NULL);
1792 
1793 		next_p = vswp->mcap->nextp;
1794 		kmem_free(vswp->mcap, sizeof (mcst_addr_t));
1795 		vswp->mcap = next_p;
1796 	}
1797 
1798 	vswp->mcap = NULL;
1799 	mutex_exit(&vswp->mca_lock);
1800 
1801 	D1(vswp, "%s: exit", __func__);
1802 }
1803 
1804 static uint32_t
1805 vsw_get_same_dest_list(struct ether_header *ehp,
1806     mblk_t **rhead, mblk_t **rtail, mblk_t **mpp)
1807 {
1808 	uint32_t		count = 0;
1809 	mblk_t			*bp;
1810 	mblk_t			*nbp;
1811 	mblk_t			*head = NULL;
1812 	mblk_t			*tail = NULL;
1813 	mblk_t			*prev = NULL;
1814 	struct ether_header	*behp;
1815 
1816 	/* process the chain of packets */
1817 	bp = *mpp;
1818 	while (bp) {
1819 		nbp = bp->b_next;
1820 		behp = (struct ether_header *)bp->b_rptr;
1821 		bp->b_prev = NULL;
1822 		if (ether_cmp(&ehp->ether_dhost, &behp->ether_dhost) == 0) {
1823 			if (prev == NULL) {
1824 				*mpp = nbp;
1825 			} else {
1826 				prev->b_next = nbp;
1827 			}
1828 			bp->b_next =  NULL;
1829 			if (head == NULL) {
1830 				head = tail = bp;
1831 			} else {
1832 				tail->b_next = bp;
1833 				tail = bp;
1834 			}
1835 			count++;
1836 		} else {
1837 			prev = bp;
1838 		}
1839 		bp = nbp;
1840 	}
1841 	*rhead = head;
1842 	*rtail = tail;
1843 	DTRACE_PROBE1(vsw_same_dest, int, count);
1844 	return (count);
1845 }
1846 
1847 static mblk_t *
1848 vsw_dupmsgchain(mblk_t *mp)
1849 {
1850 	mblk_t	*nmp = NULL;
1851 	mblk_t	**nmpp = &nmp;
1852 
1853 	for (; mp != NULL; mp = mp->b_next) {
1854 		if ((*nmpp = dupmsg(mp)) == NULL) {
1855 			freemsgchain(nmp);
1856 			return (NULL);
1857 		}
1858 
1859 		nmpp = &((*nmpp)->b_next);
1860 	}
1861 
1862 	return (nmp);
1863 }
1864