xref: /titanic_51/usr/src/uts/sun4v/io/vsw_switching.c (revision 4045d94132614e1de2073685a6cdd4fbd86bec33)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/errno.h>
31 #include <sys/debug.h>
32 #include <sys/time.h>
33 #include <sys/sysmacros.h>
34 #include <sys/systm.h>
35 #include <sys/user.h>
36 #include <sys/stropts.h>
37 #include <sys/stream.h>
38 #include <sys/strlog.h>
39 #include <sys/strsubr.h>
40 #include <sys/cmn_err.h>
41 #include <sys/cpu.h>
42 #include <sys/kmem.h>
43 #include <sys/conf.h>
44 #include <sys/ddi.h>
45 #include <sys/sunddi.h>
46 #include <sys/ksynch.h>
47 #include <sys/stat.h>
48 #include <sys/kstat.h>
49 #include <sys/vtrace.h>
50 #include <sys/strsun.h>
51 #include <sys/dlpi.h>
52 #include <sys/ethernet.h>
53 #include <net/if.h>
54 #include <sys/varargs.h>
55 #include <sys/machsystm.h>
56 #include <sys/modctl.h>
57 #include <sys/modhash.h>
58 #include <sys/mac.h>
59 #include <sys/mac_ether.h>
60 #include <sys/taskq.h>
61 #include <sys/note.h>
62 #include <sys/mach_descrip.h>
63 #include <sys/mac.h>
64 #include <sys/mdeg.h>
65 #include <sys/ldc.h>
66 #include <sys/vsw_fdb.h>
67 #include <sys/vsw.h>
68 #include <sys/vio_mailbox.h>
69 #include <sys/vnet_mailbox.h>
70 #include <sys/vnet_common.h>
71 #include <sys/vio_util.h>
72 #include <sys/sdt.h>
73 #include <sys/atomic.h>
74 #include <sys/vlan.h>
75 
76 /* Switching setup routines */
77 void vsw_setup_switching_timeout(void *arg);
78 void vsw_stop_switching_timeout(vsw_t *vswp);
79 int vsw_setup_switching(vsw_t *);
80 static	int vsw_setup_layer2(vsw_t *);
81 static	int vsw_setup_layer3(vsw_t *);
82 
83 /* Switching/data transmit routines */
84 static	void vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller,
85 	vsw_port_t *port, mac_resource_handle_t);
86 static	void vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller,
87 	vsw_port_t *port, mac_resource_handle_t);
88 static	int vsw_forward_all(vsw_t *vswp, mblk_t *mp,
89 	int caller, vsw_port_t *port);
90 static	int vsw_forward_grp(vsw_t *vswp, mblk_t *mp,
91     int caller, vsw_port_t *port);
92 
93 /* VLAN routines */
94 void vsw_create_vlans(void *arg, int type);
95 void vsw_destroy_vlans(void *arg, int type);
96 void vsw_vlan_add_ids(void *arg, int type);
97 void vsw_vlan_remove_ids(void *arg, int type);
98 static	void vsw_vlan_create_hash(void *arg, int type);
99 static	void vsw_vlan_destroy_hash(void *arg, int type);
100 boolean_t vsw_frame_lookup_vid(void *arg, int caller, struct ether_header *ehp,
101 	uint16_t *vidp);
102 mblk_t *vsw_vlan_frame_pretag(void *arg, int type, mblk_t *mp);
103 uint32_t vsw_vlan_frames_untag(void *arg, int type, mblk_t **np, mblk_t **npt);
104 boolean_t vsw_vlan_lookup(mod_hash_t *vlan_hashp, uint16_t vid);
105 
106 /* Forwarding database (FDB) routines */
107 void vsw_fdbe_add(vsw_t *vswp, void *port);
108 void vsw_fdbe_del(vsw_t *vswp, struct ether_addr *eaddr);
109 static	vsw_fdbe_t *vsw_fdbe_find(vsw_t *vswp, struct ether_addr *);
110 static void vsw_fdbe_find_cb(mod_hash_key_t key, mod_hash_val_t val);
111 
112 int vsw_add_rem_mcst(vnet_mcast_msg_t *, vsw_port_t *);
113 int vsw_add_mcst(vsw_t *, uint8_t, uint64_t, void *);
114 int vsw_del_mcst(vsw_t *, uint8_t, uint64_t, void *);
115 void vsw_del_mcst_vsw(vsw_t *);
116 
117 /* Support functions */
118 static mblk_t *vsw_dupmsgchain(mblk_t *mp);
119 static uint32_t vsw_get_same_dest_list(struct ether_header *ehp,
120     mblk_t **rhead, mblk_t **rtail, mblk_t **mpp);
121 
122 
123 /*
124  * Functions imported from other files.
125  */
126 extern mblk_t *vsw_tx_msg(vsw_t *, mblk_t *);
127 extern mcst_addr_t *vsw_del_addr(uint8_t, void *, uint64_t);
128 extern int vsw_mac_open(vsw_t *vswp);
129 extern void vsw_mac_close(vsw_t *vswp);
130 extern void vsw_mac_rx(vsw_t *vswp, mac_resource_handle_t mrh,
131     mblk_t *mp, vsw_macrx_flags_t flags);
132 extern void vsw_set_addrs(vsw_t *vswp);
133 extern int vsw_get_hw_maddr(vsw_t *);
134 extern int vsw_mac_attach(vsw_t *vswp);
135 extern int vsw_portsend(vsw_port_t *port, mblk_t *mp, mblk_t *mpt,
136 	uint32_t count);
137 extern void vsw_hio_init(vsw_t *vswp);
138 extern void vsw_hio_start_ports(vsw_t *vswp);
139 
140 /*
141  * Tunables used in this file.
142  */
143 extern	int vsw_setup_switching_delay;
144 extern	uint32_t vsw_vlan_nchains;
145 extern	uint32_t vsw_fdbe_refcnt_delay;
146 
/*
 * Take a reference on an fdb entry: atomically increment its refcnt and
 * assert that the count is non-zero afterwards.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement and can be used safely (followed by ';') in an unbraced
 * if/else body.
 */
#define	VSW_FDBE_REFHOLD(p)						\
do {									\
	atomic_inc_32(&(p)->refcnt);					\
	ASSERT((p)->refcnt != 0);					\
} while (0)
152 
/*
 * Drop a reference on an fdb entry: assert that a reference is actually
 * held (refcnt is non-zero) and then atomically decrement refcnt.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement and can be used safely (followed by ';') in an unbraced
 * if/else body.
 */
#define	VSW_FDBE_REFRELE(p)						\
do {									\
	ASSERT((p)->refcnt != 0);					\
	atomic_dec_32(&(p)->refcnt);					\
} while (0)
158 
159 /*
160  * Timeout routine to setup switching mode:
161  * vsw_setup_switching() is invoked from vsw_attach() or vsw_update_md_prop()
162  * initially. If it fails and the error is EAGAIN, then this timeout handler
163  * is started to retry vsw_setup_switching(). vsw_setup_switching() is retried
164  * until we successfully finish it; or the returned error is not EAGAIN.
165  */
166 void
167 vsw_setup_switching_timeout(void *arg)
168 {
169 	vsw_t		*vswp = (vsw_t *)arg;
170 	int		rv;
171 
172 	if (vswp->swtmout_enabled == B_FALSE)
173 		return;
174 
175 	rv = vsw_setup_switching(vswp);
176 
177 	if (rv == 0) {
178 		/*
179 		 * Successfully setup switching mode.
180 		 * Program unicst, mcst addrs of vsw
181 		 * interface and ports in the physdev.
182 		 */
183 		vsw_set_addrs(vswp);
184 
185 		/* Start HIO for ports that have already connected */
186 		vsw_hio_start_ports(vswp);
187 	}
188 
189 	mutex_enter(&vswp->swtmout_lock);
190 
191 	if (rv == EAGAIN && vswp->swtmout_enabled == B_TRUE) {
192 		/*
193 		 * Reschedule timeout() if the error is EAGAIN and the
194 		 * timeout is still enabled. For errors other than EAGAIN,
195 		 * we simply return without rescheduling timeout().
196 		 */
197 		vswp->swtmout_id =
198 		    timeout(vsw_setup_switching_timeout, vswp,
199 		    (vsw_setup_switching_delay * drv_usectohz(MICROSEC)));
200 		goto exit;
201 	}
202 
203 	/* timeout handler completed */
204 	vswp->swtmout_enabled = B_FALSE;
205 	vswp->swtmout_id = 0;
206 
207 exit:
208 	mutex_exit(&vswp->swtmout_lock);
209 }
210 
211 /*
212  * Cancel the timeout handler to setup switching mode.
213  */
214 void
215 vsw_stop_switching_timeout(vsw_t *vswp)
216 {
217 	timeout_id_t tid;
218 
219 	mutex_enter(&vswp->swtmout_lock);
220 
221 	tid = vswp->swtmout_id;
222 
223 	if (tid != 0) {
224 		/* signal timeout handler to stop */
225 		vswp->swtmout_enabled = B_FALSE;
226 		vswp->swtmout_id = 0;
227 		mutex_exit(&vswp->swtmout_lock);
228 
229 		(void) untimeout(tid);
230 	} else {
231 		mutex_exit(&vswp->swtmout_lock);
232 	}
233 
234 	(void) atomic_swap_32(&vswp->switching_setup_done, B_FALSE);
235 
236 	mutex_enter(&vswp->mac_lock);
237 	vswp->mac_open_retries = 0;
238 	mutex_exit(&vswp->mac_lock);
239 }
240 
241 /*
242  * Setup the required switching mode.
243  * This routine is invoked from vsw_attach() or vsw_update_md_prop()
244  * initially. If it fails and the error is EAGAIN, then a timeout handler
245  * is started to retry vsw_setup_switching(), until it successfully finishes;
246  * or the returned error is not EAGAIN.
247  *
248  * Returns:
249  *  0 on success.
250  *  EAGAIN if retry is needed.
251  *  1 on all other failures.
252  */
253 int
254 vsw_setup_switching(vsw_t *vswp)
255 {
256 	int	i, rv = 1;
257 
258 	D1(vswp, "%s: enter", __func__);
259 
260 	/*
261 	 * Select best switching mode.
262 	 * Note that we start from the saved smode_idx. This is done as
263 	 * this routine can be called from the timeout handler to retry
264 	 * setting up a specific mode. Currently only the function which
265 	 * sets up layer2/promisc mode returns EAGAIN if the underlying
266 	 * physical device is not available yet, causing retries.
267 	 */
268 	for (i = vswp->smode_idx; i < vswp->smode_num; i++) {
269 		vswp->smode_idx = i;
270 		switch (vswp->smode[i]) {
271 		case VSW_LAYER2:
272 		case VSW_LAYER2_PROMISC:
273 			rv = vsw_setup_layer2(vswp);
274 			break;
275 
276 		case VSW_LAYER3:
277 			rv = vsw_setup_layer3(vswp);
278 			break;
279 
280 		default:
281 			DERR(vswp, "unknown switch mode");
282 			break;
283 		}
284 
285 		if ((rv == 0) || (rv == EAGAIN))
286 			break;
287 
288 		/* all other errors(rv != 0): continue & select the next mode */
289 		rv = 1;
290 	}
291 
292 	if (rv && (rv != EAGAIN)) {
293 		cmn_err(CE_WARN, "!vsw%d: Unable to setup specified "
294 		    "switching mode", vswp->instance);
295 	} else if (rv == 0) {
296 		(void) atomic_swap_32(&vswp->switching_setup_done, B_TRUE);
297 	}
298 
299 	D2(vswp, "%s: Operating in mode %d", __func__,
300 	    vswp->smode[vswp->smode_idx]);
301 
302 	D1(vswp, "%s: exit", __func__);
303 
304 	return (rv);
305 }
306 
307 /*
308  * Setup for layer 2 switching.
309  *
310  * Returns:
311  *  0 on success.
312  *  EAGAIN if retry is needed.
313  *  EIO on all other failures.
314  */
315 static int
316 vsw_setup_layer2(vsw_t *vswp)
317 {
318 	int	rv;
319 
320 	D1(vswp, "%s: enter", __func__);
321 
322 	vswp->vsw_switch_frame = vsw_switch_l2_frame;
323 
324 	rv = strlen(vswp->physname);
325 	if (rv == 0) {
326 		/*
327 		 * Physical device name is NULL, which is
328 		 * required for layer 2.
329 		 */
330 		cmn_err(CE_WARN, "!vsw%d: no physical device name specified",
331 		    vswp->instance);
332 		return (EIO);
333 	}
334 
335 	mutex_enter(&vswp->mac_lock);
336 
337 	rv = vsw_mac_open(vswp);
338 	if (rv != 0) {
339 		if (rv != EAGAIN) {
340 			cmn_err(CE_WARN, "!vsw%d: Unable to open physical "
341 			    "device: %s\n", vswp->instance, vswp->physname);
342 		}
343 		mutex_exit(&vswp->mac_lock);
344 		return (rv);
345 	}
346 
347 	if (vswp->smode[vswp->smode_idx] == VSW_LAYER2) {
348 		/*
349 		 * Verify that underlying device can support multiple
350 		 * unicast mac addresses.
351 		 */
352 		rv = vsw_get_hw_maddr(vswp);
353 		if (rv != 0) {
354 			goto exit_error;
355 		}
356 	}
357 
358 	/*
359 	 * Attempt to link into the MAC layer so we can get
360 	 * and send packets out over the physical adapter.
361 	 */
362 	rv = vsw_mac_attach(vswp);
363 	if (rv != 0) {
364 		/*
365 		 * Registration with the MAC layer has failed,
366 		 * so return error so that can fall back to next
367 		 * prefered switching method.
368 		 */
369 		cmn_err(CE_WARN, "!vsw%d: Unable to setup physical device: "
370 		    "%s\n", vswp->instance, vswp->physname);
371 		goto exit_error;
372 	}
373 
374 	D1(vswp, "%s: exit", __func__);
375 
376 	mutex_exit(&vswp->mac_lock);
377 
378 	/* Initialize HybridIO related stuff */
379 	vsw_hio_init(vswp);
380 	return (0);
381 
382 exit_error:
383 	vsw_mac_close(vswp);
384 	mutex_exit(&vswp->mac_lock);
385 	return (EIO);
386 }
387 
388 static int
389 vsw_setup_layer3(vsw_t *vswp)
390 {
391 	D1(vswp, "%s: enter", __func__);
392 
393 	D2(vswp, "%s: operating in layer 3 mode", __func__);
394 	vswp->vsw_switch_frame = vsw_switch_l3_frame;
395 
396 	D1(vswp, "%s: exit", __func__);
397 
398 	return (0);
399 }
400 
401 /*
402  * Switch the given ethernet frame when operating in layer 2 mode.
403  *
404  * vswp: pointer to the vsw instance
405  * mp: pointer to chain of ethernet frame(s) to be switched
406  * caller: identifies the source of this frame as:
407  * 		1. VSW_VNETPORT - a vsw port (connected to a vnet).
408  *		2. VSW_PHYSDEV - the physical ethernet device
409  *		3. VSW_LOCALDEV - vsw configured as a virtual interface
410  * arg: argument provided by the caller.
411  *		1. for VNETPORT - pointer to the corresponding vsw_port_t.
412  *		2. for PHYSDEV - NULL
413  *		3. for LOCALDEV - pointer to to this vsw_t(self)
414  */
415 void
416 vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller,
417 			vsw_port_t *arg, mac_resource_handle_t mrh)
418 {
419 	struct ether_header	*ehp;
420 	mblk_t			*bp, *ret_m;
421 	mblk_t			*mpt = NULL;
422 	uint32_t		count;
423 	vsw_fdbe_t		*fp;
424 
425 	D1(vswp, "%s: enter (caller %d)", __func__, caller);
426 
427 	/*
428 	 * PERF: rather than breaking up the chain here, scan it
429 	 * to find all mblks heading to same destination and then
430 	 * pass that sub-chain to the lower transmit functions.
431 	 */
432 
433 	/* process the chain of packets */
434 	bp = mp;
435 	while (bp) {
436 		ehp = (struct ether_header *)bp->b_rptr;
437 		count = vsw_get_same_dest_list(ehp, &mp, &mpt, &bp);
438 		ASSERT(count != 0);
439 
440 		D2(vswp, "%s: mblk data buffer %lld : actual data size %lld",
441 		    __func__, MBLKSIZE(mp), MBLKL(mp));
442 
443 		if (ether_cmp(&ehp->ether_dhost, &vswp->if_addr) == 0) {
444 			/*
445 			 * If destination is VSW_LOCALDEV (vsw as an eth
446 			 * interface) and if the device is up & running,
447 			 * send the packet up the stack on this host.
448 			 * If the virtual interface is down, drop the packet.
449 			 */
450 			if (caller != VSW_LOCALDEV) {
451 				vsw_mac_rx(vswp, mrh, mp, VSW_MACRX_FREEMSG);
452 			} else {
453 				freemsgchain(mp);
454 			}
455 			continue;
456 		}
457 
458 		/*
459 		 * Find fdb entry for the destination
460 		 * and hold a reference to it.
461 		 */
462 		fp = vsw_fdbe_find(vswp, &ehp->ether_dhost);
463 		if (fp != NULL) {
464 
465 			/*
466 			 * If plumbed and in promisc mode then copy msg
467 			 * and send up the stack.
468 			 */
469 			vsw_mac_rx(vswp, mrh, mp,
470 			    VSW_MACRX_PROMISC | VSW_MACRX_COPYMSG);
471 
472 			/*
473 			 * If the destination is in FDB, the packet
474 			 * should be forwarded to the correponding
475 			 * vsw_port (connected to a vnet device -
476 			 * VSW_VNETPORT)
477 			 */
478 			(void) vsw_portsend(fp->portp, mp, mpt, count);
479 
480 			/* Release the reference on the fdb entry */
481 			VSW_FDBE_REFRELE(fp);
482 		} else {
483 			/*
484 			 * Destination not in FDB.
485 			 *
486 			 * If the destination is broadcast or
487 			 * multicast forward the packet to all
488 			 * (VNETPORTs, PHYSDEV, LOCALDEV),
489 			 * except the caller.
490 			 */
491 			if (IS_BROADCAST(ehp)) {
492 				D2(vswp, "%s: BROADCAST pkt", __func__);
493 				(void) vsw_forward_all(vswp, mp, caller, arg);
494 			} else if (IS_MULTICAST(ehp)) {
495 				D2(vswp, "%s: MULTICAST pkt", __func__);
496 				(void) vsw_forward_grp(vswp, mp, caller, arg);
497 			} else {
498 				/*
499 				 * If the destination is unicast, and came
500 				 * from either a logical network device or
501 				 * the switch itself when it is plumbed, then
502 				 * send it out on the physical device and also
503 				 * up the stack if the logical interface is
504 				 * in promiscious mode.
505 				 *
506 				 * NOTE:  The assumption here is that if we
507 				 * cannot find the destination in our fdb, its
508 				 * a unicast address, and came from either a
509 				 * vnet or down the stack (when plumbed) it
510 				 * must be destinded for an ethernet device
511 				 * outside our ldoms.
512 				 */
513 				if (caller == VSW_VNETPORT) {
514 					/* promisc check copy etc */
515 					vsw_mac_rx(vswp, mrh, mp,
516 					    VSW_MACRX_PROMISC |
517 					    VSW_MACRX_COPYMSG);
518 
519 					if ((ret_m = vsw_tx_msg(vswp, mp))
520 					    != NULL) {
521 						DERR(vswp, "%s: drop mblks to "
522 						    "phys dev", __func__);
523 						freemsgchain(ret_m);
524 					}
525 
526 				} else if (caller == VSW_PHYSDEV) {
527 					/*
528 					 * Pkt seen because card in promisc
529 					 * mode. Send up stack if plumbed in
530 					 * promisc mode, else drop it.
531 					 */
532 					vsw_mac_rx(vswp, mrh, mp,
533 					    VSW_MACRX_PROMISC |
534 					    VSW_MACRX_FREEMSG);
535 
536 				} else if (caller == VSW_LOCALDEV) {
537 					/*
538 					 * Pkt came down the stack, send out
539 					 * over physical device.
540 					 */
541 					if ((ret_m = vsw_tx_msg(vswp, mp))
542 					    != NULL) {
543 						DERR(vswp, "%s: drop mblks to "
544 						    "phys dev", __func__);
545 						freemsgchain(ret_m);
546 					}
547 				}
548 			}
549 		}
550 	}
551 	D1(vswp, "%s: exit\n", __func__);
552 }
553 
554 /*
555  * Switch ethernet frame when in layer 3 mode (i.e. using IP
556  * layer to do the routing).
557  *
558  * There is a large amount of overlap between this function and
559  * vsw_switch_l2_frame. At some stage we need to revisit and refactor
560  * both these functions.
561  */
562 void
563 vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller,
564 			vsw_port_t *arg, mac_resource_handle_t mrh)
565 {
566 	struct ether_header	*ehp;
567 	mblk_t			*bp = NULL;
568 	mblk_t			*mpt;
569 	uint32_t		count;
570 	vsw_fdbe_t		*fp;
571 
572 	D1(vswp, "%s: enter (caller %d)", __func__, caller);
573 
574 	/*
575 	 * In layer 3 mode should only ever be switching packets
576 	 * between IP layer and vnet devices. So make sure thats
577 	 * who is invoking us.
578 	 */
579 	if ((caller != VSW_LOCALDEV) && (caller != VSW_VNETPORT)) {
580 		DERR(vswp, "%s: unexpected caller (%d)", __func__, caller);
581 		freemsgchain(mp);
582 		return;
583 	}
584 
585 	/* process the chain of packets */
586 	bp = mp;
587 	while (bp) {
588 		ehp = (struct ether_header *)bp->b_rptr;
589 		count = vsw_get_same_dest_list(ehp, &mp, &mpt, &bp);
590 		ASSERT(count != 0);
591 
592 		D2(vswp, "%s: mblk data buffer %lld : actual data size %lld",
593 		    __func__, MBLKSIZE(mp), MBLKL(mp));
594 
595 		/*
596 		 * Find fdb entry for the destination
597 		 * and hold a reference to it.
598 		 */
599 		fp = vsw_fdbe_find(vswp, &ehp->ether_dhost);
600 		if (fp != NULL) {
601 
602 			D2(vswp, "%s: sending to target port", __func__);
603 			(void) vsw_portsend(fp->portp, mp, mpt, count);
604 
605 			/* Release the reference on the fdb entry */
606 			VSW_FDBE_REFRELE(fp);
607 		} else {
608 			/*
609 			 * Destination not in FDB
610 			 *
611 			 * If the destination is broadcast or
612 			 * multicast forward the packet to all
613 			 * (VNETPORTs, PHYSDEV, LOCALDEV),
614 			 * except the caller.
615 			 */
616 			if (IS_BROADCAST(ehp)) {
617 				D2(vswp, "%s: BROADCAST pkt", __func__);
618 				(void) vsw_forward_all(vswp, mp, caller, arg);
619 			} else if (IS_MULTICAST(ehp)) {
620 				D2(vswp, "%s: MULTICAST pkt", __func__);
621 				(void) vsw_forward_grp(vswp, mp, caller, arg);
622 			} else {
623 				/*
624 				 * Unicast pkt from vnet that we don't have
625 				 * an FDB entry for, so must be destinded for
626 				 * the outside world. Attempt to send up to the
627 				 * IP layer to allow it to deal with it.
628 				 */
629 				if (caller == VSW_VNETPORT) {
630 					vsw_mac_rx(vswp, mrh,
631 					    mp, VSW_MACRX_FREEMSG);
632 				}
633 			}
634 		}
635 	}
636 
637 	D1(vswp, "%s: exit", __func__);
638 }
639 
640 /*
641  * Forward the ethernet frame to all ports (VNETPORTs, PHYSDEV, LOCALDEV),
642  * except the caller (port on which frame arrived).
643  */
644 static int
645 vsw_forward_all(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg)
646 {
647 	vsw_port_list_t	*plist = &vswp->plist;
648 	vsw_port_t	*portp;
649 	mblk_t		*nmp = NULL;
650 	mblk_t		*ret_m = NULL;
651 	int		skip_port = 0;
652 
653 	D1(vswp, "vsw_forward_all: enter\n");
654 
655 	/*
656 	 * Broadcast message from inside ldoms so send to outside
657 	 * world if in either of layer 2 modes.
658 	 */
659 	if (((vswp->smode[vswp->smode_idx] == VSW_LAYER2) ||
660 	    (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) &&
661 	    ((caller == VSW_LOCALDEV) || (caller == VSW_VNETPORT))) {
662 
663 		nmp = vsw_dupmsgchain(mp);
664 		if (nmp) {
665 			if ((ret_m = vsw_tx_msg(vswp, nmp)) != NULL) {
666 				DERR(vswp, "%s: dropping pkt(s) "
667 				    "consisting of %ld bytes of data for"
668 				    " physical device", __func__, MBLKL(ret_m));
669 				freemsgchain(ret_m);
670 			}
671 		}
672 	}
673 
674 	if (caller == VSW_VNETPORT)
675 		skip_port = 1;
676 
677 	/*
678 	 * Broadcast message from other vnet (layer 2 or 3) or outside
679 	 * world (layer 2 only), send up stack if plumbed.
680 	 */
681 	if ((caller == VSW_PHYSDEV) || (caller == VSW_VNETPORT)) {
682 		vsw_mac_rx(vswp, NULL, mp, VSW_MACRX_COPYMSG);
683 	}
684 
685 	/* send it to all VNETPORTs */
686 	READ_ENTER(&plist->lockrw);
687 	for (portp = plist->head; portp != NULL; portp = portp->p_next) {
688 		D2(vswp, "vsw_forward_all: port %d", portp->p_instance);
689 		/*
690 		 * Caution ! - don't reorder these two checks as arg
691 		 * will be NULL if the caller is PHYSDEV. skip_port is
692 		 * only set if caller is VNETPORT.
693 		 */
694 		if ((skip_port) && (portp == arg)) {
695 			continue;
696 		} else {
697 			nmp = vsw_dupmsgchain(mp);
698 			if (nmp) {
699 				mblk_t	*mpt = nmp;
700 				uint32_t count = 1;
701 
702 				/* Find tail */
703 				while (mpt->b_next != NULL) {
704 					mpt = mpt->b_next;
705 					count++;
706 				}
707 				/*
708 				 * The plist->lockrw is protecting the
709 				 * portp from getting destroyed here.
710 				 * So, no ref_cnt is incremented here.
711 				 */
712 				(void) vsw_portsend(portp, nmp, mpt, count);
713 			} else {
714 				DERR(vswp, "vsw_forward_all: nmp NULL");
715 			}
716 		}
717 	}
718 	RW_EXIT(&plist->lockrw);
719 
720 	freemsgchain(mp);
721 
722 	D1(vswp, "vsw_forward_all: exit\n");
723 	return (0);
724 }
725 
726 /*
727  * Forward pkts to any devices or interfaces which have registered
728  * an interest in them (i.e. multicast groups).
729  */
730 static int
731 vsw_forward_grp(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg)
732 {
733 	struct ether_header	*ehp = (struct ether_header *)mp->b_rptr;
734 	mfdb_ent_t		*entp = NULL;
735 	mfdb_ent_t		*tpp = NULL;
736 	vsw_port_t 		*port;
737 	uint64_t		key = 0;
738 	mblk_t			*nmp = NULL;
739 	mblk_t			*ret_m = NULL;
740 	boolean_t		check_if = B_TRUE;
741 
742 	/*
743 	 * Convert address to hash table key
744 	 */
745 	KEY_HASH(key, &ehp->ether_dhost);
746 
747 	D1(vswp, "%s: key 0x%llx", __func__, key);
748 
749 	/*
750 	 * If pkt came from either a vnet or down the stack (if we are
751 	 * plumbed) and we are in layer 2 mode, then we send the pkt out
752 	 * over the physical adapter, and then check to see if any other
753 	 * vnets are interested in it.
754 	 */
755 	if (((vswp->smode[vswp->smode_idx] == VSW_LAYER2) ||
756 	    (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) &&
757 	    ((caller == VSW_VNETPORT) || (caller == VSW_LOCALDEV))) {
758 		nmp = vsw_dupmsgchain(mp);
759 		if (nmp) {
760 			if ((ret_m = vsw_tx_msg(vswp, nmp)) != NULL) {
761 				DERR(vswp, "%s: dropping pkt(s) consisting of "
762 				    "%ld bytes of data for physical device",
763 				    __func__, MBLKL(ret_m));
764 				freemsgchain(ret_m);
765 			}
766 		}
767 	}
768 
769 	READ_ENTER(&vswp->mfdbrw);
770 	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)key,
771 	    (mod_hash_val_t *)&entp) != 0) {
772 		D3(vswp, "%s: no table entry found for addr 0x%llx",
773 		    __func__, key);
774 	} else {
775 		/*
776 		 * Send to list of devices associated with this address...
777 		 */
778 		for (tpp = entp; tpp != NULL; tpp = tpp->nextp) {
779 
780 			/* dont send to ourselves */
781 			if ((caller == VSW_VNETPORT) &&
782 			    (tpp->d_addr == (void *)arg)) {
783 				port = (vsw_port_t *)tpp->d_addr;
784 				D3(vswp, "%s: not sending to ourselves"
785 				    " : port %d", __func__, port->p_instance);
786 				continue;
787 
788 			} else if ((caller == VSW_LOCALDEV) &&
789 			    (tpp->d_type == VSW_LOCALDEV)) {
790 				D2(vswp, "%s: not sending back up stack",
791 				    __func__);
792 				continue;
793 			}
794 
795 			if (tpp->d_type == VSW_VNETPORT) {
796 				port = (vsw_port_t *)tpp->d_addr;
797 				D3(vswp, "%s: sending to port %ld for addr "
798 				    "0x%llx", __func__, port->p_instance, key);
799 
800 				nmp = vsw_dupmsgchain(mp);
801 				if (nmp) {
802 					mblk_t	*mpt = nmp;
803 					uint32_t count = 1;
804 
805 					/* Find tail */
806 					while (mpt->b_next != NULL) {
807 						mpt = mpt->b_next;
808 						count++;
809 					}
810 					/*
811 					 * The vswp->mfdbrw is protecting the
812 					 * portp from getting destroyed here.
813 					 * So, no ref_cnt is incremented here.
814 					 */
815 					(void) vsw_portsend(port, nmp, mpt,
816 					    count);
817 				}
818 			} else {
819 				vsw_mac_rx(vswp, NULL,
820 				    mp, VSW_MACRX_COPYMSG);
821 				D2(vswp, "%s: sending up stack"
822 				    " for addr 0x%llx", __func__, key);
823 				check_if = B_FALSE;
824 			}
825 		}
826 	}
827 
828 	RW_EXIT(&vswp->mfdbrw);
829 
830 	/*
831 	 * If the pkt came from either a vnet or from physical device,
832 	 * and if we havent already sent the pkt up the stack then we
833 	 * check now if we can/should (i.e. the interface is plumbed
834 	 * and in promisc mode).
835 	 */
836 	if ((check_if) &&
837 	    ((caller == VSW_VNETPORT) || (caller == VSW_PHYSDEV))) {
838 		vsw_mac_rx(vswp, NULL, mp,
839 		    VSW_MACRX_PROMISC | VSW_MACRX_COPYMSG);
840 	}
841 
842 	freemsgchain(mp);
843 
844 	D1(vswp, "%s: exit", __func__);
845 
846 	return (0);
847 }
848 
/*
 * Create the vlan id hash table of the given vsw device or port and
 * populate it with every vlan id assigned to that device or port.
 *
 * Arguments:
 *   arg:  vsw device or port.
 *   type: type of arg; VSW_LOCALDEV(vsw device) or VSW_VNETPORT(port).
 */
void
vsw_create_vlans(void *arg, int type)
{
	/* step 1: the (empty) hash table itself */
	vsw_vlan_create_hash(arg, type);

	/* step 2: the vlan ids assigned to the device or port */
	vsw_vlan_add_ids(arg, type);
}
866 
/*
 * Empty the vlan id hash table of the given vsw device or port and then
 * destroy the table.
 *
 * Arguments:
 *   arg:  vsw device or port.
 *   type: type of arg; VSW_LOCALDEV(vsw device) or VSW_VNETPORT(port).
 */
void
vsw_destroy_vlans(void *arg, int type)
{
	/* step 1: take the vlan ids out of the hash table */
	vsw_vlan_remove_ids(arg, type);

	/* step 2: tear down the hash table itself */
	vsw_vlan_destroy_hash(arg, type);
}
883 
884 /*
885  * Create a vlan-id hash table for the given vsw device or port.
886  */
887 static void
888 vsw_vlan_create_hash(void *arg, int type)
889 {
890 	char		hashname[MAXNAMELEN];
891 
892 	if (type == VSW_LOCALDEV) {
893 		vsw_t		*vswp = (vsw_t *)arg;
894 
895 		(void) snprintf(hashname, MAXNAMELEN, "vsw%d-vlan-hash",
896 		    vswp->instance);
897 
898 		vswp->vlan_nchains = vsw_vlan_nchains;
899 		vswp->vlan_hashp = mod_hash_create_idhash(hashname,
900 		    vswp->vlan_nchains, mod_hash_null_valdtor);
901 
902 	} else if (type == VSW_VNETPORT) {
903 		vsw_port_t	*portp = (vsw_port_t *)arg;
904 
905 		(void) snprintf(hashname, MAXNAMELEN, "port%d-vlan-hash",
906 		    portp->p_instance);
907 
908 		portp->vlan_nchains = vsw_vlan_nchains;
909 		portp->vlan_hashp = mod_hash_create_idhash(hashname,
910 		    portp->vlan_nchains, mod_hash_null_valdtor);
911 
912 	} else {
913 		return;
914 	}
915 }
916 
917 /*
918  * Destroy the vlan-id hash table for the given vsw device or port.
919  */
920 static void
921 vsw_vlan_destroy_hash(void *arg, int type)
922 {
923 	if (type == VSW_LOCALDEV) {
924 		vsw_t		*vswp = (vsw_t *)arg;
925 
926 		mod_hash_destroy_hash(vswp->vlan_hashp);
927 		vswp->vlan_nchains = 0;
928 	} else if (type == VSW_VNETPORT) {
929 		vsw_port_t	*portp = (vsw_port_t *)arg;
930 
931 		mod_hash_destroy_hash(portp->vlan_hashp);
932 		portp->vlan_nchains = 0;
933 	} else {
934 		return;
935 	}
936 }
937 
938 /*
939  * Add vlan ids of the given vsw device or port into its hash table.
940  */
941 void
942 vsw_vlan_add_ids(void *arg, int type)
943 {
944 	int	rv;
945 	int	i;
946 
947 	if (type == VSW_LOCALDEV) {
948 		vsw_t		*vswp = (vsw_t *)arg;
949 
950 		rv = mod_hash_insert(vswp->vlan_hashp,
951 		    (mod_hash_key_t)VLAN_ID_KEY(vswp->pvid),
952 		    (mod_hash_val_t)B_TRUE);
953 		ASSERT(rv == 0);
954 
955 		for (i = 0; i < vswp->nvids; i++) {
956 			rv = mod_hash_insert(vswp->vlan_hashp,
957 			    (mod_hash_key_t)VLAN_ID_KEY(vswp->vids[i]),
958 			    (mod_hash_val_t)B_TRUE);
959 			ASSERT(rv == 0);
960 		}
961 
962 	} else if (type == VSW_VNETPORT) {
963 		vsw_port_t	*portp = (vsw_port_t *)arg;
964 
965 		rv = mod_hash_insert(portp->vlan_hashp,
966 		    (mod_hash_key_t)VLAN_ID_KEY(portp->pvid),
967 		    (mod_hash_val_t)B_TRUE);
968 		ASSERT(rv == 0);
969 
970 		for (i = 0; i < portp->nvids; i++) {
971 			rv = mod_hash_insert(portp->vlan_hashp,
972 			    (mod_hash_key_t)VLAN_ID_KEY(portp->vids[i]),
973 			    (mod_hash_val_t)B_TRUE);
974 			ASSERT(rv == 0);
975 		}
976 
977 	} else {
978 		return;
979 	}
980 }
981 
982 /*
983  * Remove vlan ids of the given vsw device or port from its hash table.
984  */
985 void
986 vsw_vlan_remove_ids(void *arg, int type)
987 {
988 	mod_hash_val_t	vp;
989 	int		rv;
990 	int		i;
991 
992 	if (type == VSW_LOCALDEV) {
993 		vsw_t		*vswp = (vsw_t *)arg;
994 
995 		rv = vsw_vlan_lookup(vswp->vlan_hashp, vswp->pvid);
996 		if (rv == B_TRUE) {
997 			rv = mod_hash_remove(vswp->vlan_hashp,
998 			    (mod_hash_key_t)VLAN_ID_KEY(vswp->pvid),
999 			    (mod_hash_val_t *)&vp);
1000 			ASSERT(rv == 0);
1001 		}
1002 
1003 		for (i = 0; i < vswp->nvids; i++) {
1004 			rv = vsw_vlan_lookup(vswp->vlan_hashp, vswp->vids[i]);
1005 			if (rv == B_TRUE) {
1006 				rv = mod_hash_remove(vswp->vlan_hashp,
1007 				    (mod_hash_key_t)VLAN_ID_KEY(vswp->vids[i]),
1008 				    (mod_hash_val_t *)&vp);
1009 				ASSERT(rv == 0);
1010 			}
1011 		}
1012 
1013 	} else if (type == VSW_VNETPORT) {
1014 		vsw_port_t	*portp = (vsw_port_t *)arg;
1015 
1016 		portp = (vsw_port_t *)arg;
1017 		rv = vsw_vlan_lookup(portp->vlan_hashp, portp->pvid);
1018 		if (rv == B_TRUE) {
1019 			rv = mod_hash_remove(portp->vlan_hashp,
1020 			    (mod_hash_key_t)VLAN_ID_KEY(portp->pvid),
1021 			    (mod_hash_val_t *)&vp);
1022 			ASSERT(rv == 0);
1023 		}
1024 
1025 		for (i = 0; i < portp->nvids; i++) {
1026 			rv = vsw_vlan_lookup(portp->vlan_hashp, portp->vids[i]);
1027 			if (rv == B_TRUE) {
1028 				rv = mod_hash_remove(portp->vlan_hashp,
1029 				    (mod_hash_key_t)VLAN_ID_KEY(portp->vids[i]),
1030 				    (mod_hash_val_t *)&vp);
1031 				ASSERT(rv == 0);
1032 			}
1033 		}
1034 
1035 	} else {
1036 		return;
1037 	}
1038 }
1039 
1040 /*
1041  * Find the given vlan id in the hash table.
1042  * Return: B_TRUE if the id is found; B_FALSE if not found.
1043  */
1044 boolean_t
1045 vsw_vlan_lookup(mod_hash_t *vlan_hashp, uint16_t vid)
1046 {
1047 	int		rv;
1048 	mod_hash_val_t	vp;
1049 
1050 	rv = mod_hash_find(vlan_hashp, VLAN_ID_KEY(vid), (mod_hash_val_t *)&vp);
1051 
1052 	if (rv != 0)
1053 		return (B_FALSE);
1054 
1055 	return (B_TRUE);
1056 }
1057 
1058 /*
1059  * Add an entry into FDB for the given vsw.
1060  */
1061 void
1062 vsw_fdbe_add(vsw_t *vswp, void *port)
1063 {
1064 	uint64_t	addr = 0;
1065 	vsw_port_t	*portp;
1066 	vsw_fdbe_t	*fp;
1067 	int		rv;
1068 
1069 	portp = (vsw_port_t *)port;
1070 	KEY_HASH(addr, &portp->p_macaddr);
1071 
1072 	fp = kmem_zalloc(sizeof (vsw_fdbe_t), KM_SLEEP);
1073 	fp->portp = port;
1074 
1075 	/*
1076 	 * Note: duplicate keys will be rejected by mod_hash.
1077 	 */
1078 	rv = mod_hash_insert(vswp->fdb_hashp, (mod_hash_key_t)addr,
1079 	    (mod_hash_val_t)fp);
1080 	ASSERT(rv == 0);
1081 }
1082 
1083 /*
1084  * Remove an entry from FDB.
1085  */
1086 void
1087 vsw_fdbe_del(vsw_t *vswp, struct ether_addr *eaddr)
1088 {
1089 	uint64_t	addr = 0;
1090 	vsw_fdbe_t	*fp;
1091 	int		rv;
1092 
1093 	KEY_HASH(addr, eaddr);
1094 
1095 	/*
1096 	 * Remove the entry from fdb hash table.
1097 	 * This prevents further references to this fdb entry.
1098 	 */
1099 	rv = mod_hash_remove(vswp->fdb_hashp, (mod_hash_key_t)addr,
1100 	    (mod_hash_val_t *)&fp);
1101 	if (rv != 0) {
1102 		/* invalid key? */
1103 		return;
1104 	}
1105 
1106 	/*
1107 	 * If there are threads already ref holding before the entry was
1108 	 * removed from hash table, then wait for ref count to drop to zero.
1109 	 */
1110 	while (fp->refcnt != 0) {
1111 		delay(drv_usectohz(vsw_fdbe_refcnt_delay));
1112 	}
1113 
1114 	kmem_free(fp, sizeof (*fp));
1115 }
1116 
1117 /*
1118  * Search fdb for a given mac address. If an entry is found, hold
1119  * a reference to it and return the entry, else returns NULL.
1120  */
1121 static vsw_fdbe_t *
1122 vsw_fdbe_find(vsw_t *vswp, struct ether_addr *addrp)
1123 {
1124 	uint64_t	key = 0;
1125 	vsw_fdbe_t	*fp;
1126 	int		rv;
1127 
1128 	KEY_HASH(key, addrp);
1129 
1130 	rv = mod_hash_find_cb(vswp->fdb_hashp, (mod_hash_key_t)key,
1131 	    (mod_hash_val_t *)&fp, vsw_fdbe_find_cb);
1132 
1133 	if (rv != 0)
1134 		return (NULL);
1135 
1136 	return (fp);
1137 }
1138 
1139 /*
1140  * Callback function provided to mod_hash_find_cb(). After finding the fdb
1141  * entry corresponding to the key (macaddr), this callback will be invoked by
1142  * mod_hash_find_cb() to atomically increment the reference count on the fdb
1143  * entry before returning the found entry.
1144  */
1145 static void
1146 vsw_fdbe_find_cb(mod_hash_key_t key, mod_hash_val_t val)
1147 {
1148 	_NOTE(ARGUNUSED(key))
1149 	VSW_FDBE_REFHOLD((vsw_fdbe_t *)val);
1150 }
1151 
1152 /*
1153  * A given frame must be always tagged with the appropriate vlan id (unless it
1154  * is in the default-vlan) before the mac address switching function is called.
1155  * Otherwise, after switching function determines the destination, we cannot
1156  * figure out if the destination belongs to the the same vlan that the frame
1157  * originated from and if it needs tag/untag. Frames which are inbound from
1158  * the external(physical) network over a vlan trunk link are always tagged.
1159  * However frames which are received from a vnet-port over ldc or frames which
1160  * are coming down the stack on the service domain over vsw interface may be
1161  * untagged. These frames must be tagged with the appropriate pvid of the
1162  * sender (vnet-port or vsw device), before invoking the switching function.
1163  *
1164  * Arguments:
1165  *   arg:    caller of the function.
1166  *   type:   type of arg(caller): VSW_LOCALDEV(vsw) or VSW_VNETPORT(port)
1167  *   mp:     frame(s) to be tagged.
1168  */
1169 mblk_t *
1170 vsw_vlan_frame_pretag(void *arg, int type, mblk_t *mp)
1171 {
1172 	vsw_t			*vswp;
1173 	vsw_port_t		*portp;
1174 	struct ether_header	*ehp;
1175 	mblk_t			*bp;
1176 	mblk_t			*bpt;
1177 	mblk_t			*bph;
1178 	mblk_t			*bpn;
1179 	uint16_t		pvid;
1180 
1181 	ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));
1182 
1183 	if (type == VSW_LOCALDEV) {
1184 		vswp = (vsw_t *)arg;
1185 		pvid = vswp->pvid;
1186 		portp = NULL;
1187 	} else {
1188 		/* VSW_VNETPORT */
1189 		portp = (vsw_port_t *)arg;
1190 		pvid = portp->pvid;
1191 		vswp = portp->p_vswp;
1192 	}
1193 
1194 	bpn = bph = bpt = NULL;
1195 
1196 	for (bp = mp; bp != NULL; bp = bpn) {
1197 
1198 		bpn = bp->b_next;
1199 		bp->b_next = bp->b_prev = NULL;
1200 
1201 		/* Determine if it is an untagged frame */
1202 		ehp = (struct ether_header *)bp->b_rptr;
1203 
1204 		if (ehp->ether_type != ETHERTYPE_VLAN) {	/* untagged */
1205 
1206 			/* no need to tag if the frame is in default vlan */
1207 			if (pvid != vswp->default_vlan_id) {
1208 				bp = vnet_vlan_insert_tag(bp, pvid);
1209 				if (bp == NULL) {
1210 					continue;
1211 				}
1212 			}
1213 		}
1214 
1215 		/* build a chain of processed packets */
1216 		if (bph == NULL) {
1217 			bph = bpt = bp;
1218 		} else {
1219 			bpt->b_next = bp;
1220 			bpt = bp;
1221 		}
1222 
1223 	}
1224 
1225 	return (bph);
1226 }
1227 
1228 /*
1229  * Frames destined to a vnet-port or to the local vsw interface, must be
1230  * untagged if necessary before sending. This function first checks that the
1231  * frame can be sent to the destination in the vlan identified by the frame
1232  * tag. Note that when this function is invoked the frame must have been
1233  * already tagged (unless it is in the default-vlan). Because, this function is
1234  * called when the switching function determines the destination and invokes
1235  * its send function (vnet-port or vsw interface) and all frames would have
1236  * been tagged by this time (see comments in vsw_vlan_frame_pretag()).
1237  *
1238  * Arguments:
1239  *   arg:    destination device.
1240  *   type:   type of arg(destination): VSW_LOCALDEV(vsw) or VSW_VNETPORT(port)
1241  *   np:     head of pkt chain to be validated and untagged.
1242  *   npt:    tail of pkt chain to be validated and untagged.
1243  *
1244  * Returns:
1245  *   np:     head of updated chain of packets
1246  *   npt:    tail of updated chain of packets
1247  *   rv:     count of any packets dropped
1248  */
1249 uint32_t
1250 vsw_vlan_frame_untag(void *arg, int type, mblk_t **np, mblk_t **npt)
1251 {
1252 	mblk_t			*bp;
1253 	mblk_t			*bpt;
1254 	mblk_t			*bph;
1255 	mblk_t			*bpn;
1256 	vsw_port_t		*portp;
1257 	vsw_t			*vswp;
1258 	uint32_t		count;
1259 	struct ether_header	*ehp;
1260 	boolean_t		is_tagged;
1261 	boolean_t		rv;
1262 	uint16_t		vlan_id;
1263 	uint16_t		pvid;
1264 	mod_hash_t		*vlan_hashp;
1265 
1266 	ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));
1267 
1268 	if (type == VSW_LOCALDEV) {
1269 		vswp = (vsw_t *)arg;
1270 		pvid = vswp->pvid;
1271 		vlan_hashp = vswp->vlan_hashp;
1272 		portp = NULL;
1273 	} else {
1274 		/* type == VSW_VNETPORT */
1275 		portp = (vsw_port_t *)arg;
1276 		vswp = portp->p_vswp;
1277 		vlan_hashp = portp->vlan_hashp;
1278 		pvid = portp->pvid;
1279 	}
1280 
1281 	bpn = bph = bpt = NULL;
1282 	count = 0;
1283 
1284 	for (bp = *np; bp != NULL; bp = bpn) {
1285 
1286 		bpn = bp->b_next;
1287 		bp->b_next = bp->b_prev = NULL;
1288 
1289 		/*
1290 		 * Determine the vlan id that the frame belongs to.
1291 		 */
1292 		ehp = (struct ether_header *)bp->b_rptr;
1293 		is_tagged = vsw_frame_lookup_vid(arg, type, ehp, &vlan_id);
1294 
1295 		/*
1296 		 * Check if the destination is in the same vlan.
1297 		 */
1298 		rv = vsw_vlan_lookup(vlan_hashp, vlan_id);
1299 		if (rv == B_FALSE) {
1300 			/* drop the packet */
1301 			freemsg(bp);
1302 			count++;
1303 			continue;
1304 		}
1305 
1306 		/*
1307 		 * Check the frame header if tag/untag is  needed.
1308 		 */
1309 		if (is_tagged == B_FALSE) {
1310 			/*
1311 			 * Untagged frame. We shouldn't have an untagged
1312 			 * packet at this point, unless the destination's
1313 			 * vlan id is default-vlan-id; if it is not the
1314 			 * default-vlan-id, we drop the packet.
1315 			 */
1316 			if (vlan_id != vswp->default_vlan_id) {
1317 				/* drop the packet */
1318 				freemsg(bp);
1319 				count++;
1320 				continue;
1321 			}
1322 		} else {
1323 			/*
1324 			 * Tagged frame, untag if it's the destination's pvid.
1325 			 */
1326 			if (vlan_id == pvid) {
1327 
1328 				bp = vnet_vlan_remove_tag(bp);
1329 				if (bp == NULL) {
1330 					/* packet dropped */
1331 					count++;
1332 					continue;
1333 				}
1334 			}
1335 		}
1336 
1337 		/* build a chain of processed packets */
1338 		if (bph == NULL) {
1339 			bph = bpt = bp;
1340 		} else {
1341 			bpt->b_next = bp;
1342 			bpt = bp;
1343 		}
1344 
1345 	}
1346 
1347 	*np = bph;
1348 	*npt = bpt;
1349 
1350 	return (count);
1351 }
1352 
1353 /*
1354  * Lookup the vlan id of the given frame. If it is a vlan-tagged frame,
1355  * then the vlan-id is available in the tag; otherwise, its vlan id is
1356  * implicitly obtained based on the caller (destination of the frame:
1357  * VSW_VNETPORT or VSW_LOCALDEV).
1358  * The vlan id determined is returned in vidp.
1359  * Returns: B_TRUE if it is a tagged frame; B_FALSE if it is untagged.
1360  */
1361 boolean_t
1362 vsw_frame_lookup_vid(void *arg, int caller, struct ether_header *ehp,
1363 	uint16_t *vidp)
1364 {
1365 	struct ether_vlan_header	*evhp;
1366 	vsw_t				*vswp;
1367 	vsw_port_t			*portp;
1368 
1369 	/* If it's a tagged frame, get the vid from vlan header */
1370 	if (ehp->ether_type == ETHERTYPE_VLAN) {
1371 
1372 		evhp = (struct ether_vlan_header *)ehp;
1373 		*vidp = VLAN_ID(ntohs(evhp->ether_tci));
1374 		return (B_TRUE);
1375 	}
1376 
1377 	/* Untagged frame; determine vlan id based on caller */
1378 	switch (caller) {
1379 
1380 	case VSW_VNETPORT:
1381 		/*
1382 		 * packet destined to a vnet; vlan-id is pvid of vnet-port.
1383 		 */
1384 		portp = (vsw_port_t *)arg;
1385 		*vidp = portp->pvid;
1386 		break;
1387 
1388 	case VSW_LOCALDEV:
1389 
1390 		/*
1391 		 * packet destined to vsw interface;
1392 		 * vlan-id is port-vlan-id of vsw device.
1393 		 */
1394 		vswp = (vsw_t *)arg;
1395 		*vidp = vswp->pvid;
1396 		break;
1397 	}
1398 
1399 	return (B_FALSE);
1400 }
1401 
1402 /*
1403  * Add or remove multicast address(es).
1404  *
1405  * Returns 0 on success, 1 on failure.
1406  */
1407 int
1408 vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port)
1409 {
1410 	mcst_addr_t		*mcst_p = NULL;
1411 	vsw_t			*vswp = port->p_vswp;
1412 	uint64_t		addr = 0x0;
1413 	int			i;
1414 
1415 	D1(vswp, "%s: enter", __func__);
1416 
1417 	D2(vswp, "%s: %d addresses", __func__, mcst_pkt->count);
1418 
1419 	for (i = 0; i < mcst_pkt->count; i++) {
1420 		/*
1421 		 * Convert address into form that can be used
1422 		 * as hash table key.
1423 		 */
1424 		KEY_HASH(addr, &(mcst_pkt->mca[i]));
1425 
1426 		/*
1427 		 * Add or delete the specified address/port combination.
1428 		 */
1429 		if (mcst_pkt->set == 0x1) {
1430 			D3(vswp, "%s: adding multicast address 0x%llx for "
1431 			    "port %ld", __func__, addr, port->p_instance);
1432 			if (vsw_add_mcst(vswp, VSW_VNETPORT, addr, port) == 0) {
1433 				/*
1434 				 * Update the list of multicast
1435 				 * addresses contained within the
1436 				 * port structure to include this new
1437 				 * one.
1438 				 */
1439 				mcst_p = kmem_zalloc(sizeof (mcst_addr_t),
1440 				    KM_NOSLEEP);
1441 				if (mcst_p == NULL) {
1442 					DERR(vswp, "%s: unable to alloc mem",
1443 					    __func__);
1444 					(void) vsw_del_mcst(vswp,
1445 					    VSW_VNETPORT, addr, port);
1446 					return (1);
1447 				}
1448 
1449 				mcst_p->nextp = NULL;
1450 				mcst_p->addr = addr;
1451 				ether_copy(&mcst_pkt->mca[i], &mcst_p->mca);
1452 
1453 				/*
1454 				 * Program the address into HW. If the addr
1455 				 * has already been programmed then the MAC
1456 				 * just increments a ref counter (which is
1457 				 * used when the address is being deleted)
1458 				 */
1459 				mutex_enter(&vswp->mac_lock);
1460 				if (vswp->mh != NULL) {
1461 					if (mac_multicst_add(vswp->mh,
1462 					    (uchar_t *)&mcst_pkt->mca[i])) {
1463 						mutex_exit(&vswp->mac_lock);
1464 						cmn_err(CE_WARN, "!vsw%d: "
1465 						    "unable to add multicast "
1466 						    "address: %s\n",
1467 						    vswp->instance,
1468 						    ether_sprintf((void *)
1469 						    &mcst_p->mca));
1470 						(void) vsw_del_mcst(vswp,
1471 						    VSW_VNETPORT, addr, port);
1472 						kmem_free(mcst_p,
1473 						    sizeof (*mcst_p));
1474 						return (1);
1475 					}
1476 					mcst_p->mac_added = B_TRUE;
1477 				}
1478 				mutex_exit(&vswp->mac_lock);
1479 
1480 				mutex_enter(&port->mca_lock);
1481 				mcst_p->nextp = port->mcap;
1482 				port->mcap = mcst_p;
1483 				mutex_exit(&port->mca_lock);
1484 
1485 			} else {
1486 				DERR(vswp, "%s: error adding multicast "
1487 				    "address 0x%llx for port %ld",
1488 				    __func__, addr, port->p_instance);
1489 				return (1);
1490 			}
1491 		} else {
1492 			/*
1493 			 * Delete an entry from the multicast hash
1494 			 * table and update the address list
1495 			 * appropriately.
1496 			 */
1497 			if (vsw_del_mcst(vswp, VSW_VNETPORT, addr, port) == 0) {
1498 				D3(vswp, "%s: deleting multicast address "
1499 				    "0x%llx for port %ld", __func__, addr,
1500 				    port->p_instance);
1501 
1502 				mcst_p = vsw_del_addr(VSW_VNETPORT, port, addr);
1503 				ASSERT(mcst_p != NULL);
1504 
1505 				/*
1506 				 * Remove the address from HW. The address
1507 				 * will actually only be removed once the ref
1508 				 * count within the MAC layer has dropped to
1509 				 * zero. I.e. we can safely call this fn even
1510 				 * if other ports are interested in this
1511 				 * address.
1512 				 */
1513 				mutex_enter(&vswp->mac_lock);
1514 				if (vswp->mh != NULL && mcst_p->mac_added) {
1515 					if (mac_multicst_remove(vswp->mh,
1516 					    (uchar_t *)&mcst_pkt->mca[i])) {
1517 						mutex_exit(&vswp->mac_lock);
1518 						cmn_err(CE_WARN, "!vsw%d: "
1519 						    "unable to remove mcast "
1520 						    "address: %s\n",
1521 						    vswp->instance,
1522 						    ether_sprintf((void *)
1523 						    &mcst_p->mca));
1524 						kmem_free(mcst_p,
1525 						    sizeof (*mcst_p));
1526 						return (1);
1527 					}
1528 					mcst_p->mac_added = B_FALSE;
1529 				}
1530 				mutex_exit(&vswp->mac_lock);
1531 				kmem_free(mcst_p, sizeof (*mcst_p));
1532 
1533 			} else {
1534 				DERR(vswp, "%s: error deleting multicast "
1535 				    "addr 0x%llx for port %ld",
1536 				    __func__, addr, port->p_instance);
1537 				return (1);
1538 			}
1539 		}
1540 	}
1541 	D1(vswp, "%s: exit", __func__);
1542 	return (0);
1543 }
1544 
1545 /*
1546  * Add a new multicast entry.
1547  *
1548  * Search hash table based on address. If match found then
1549  * update associated val (which is chain of ports), otherwise
1550  * create new key/val (addr/port) pair and insert into table.
1551  */
1552 int
1553 vsw_add_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg)
1554 {
1555 	int		dup = 0;
1556 	int		rv = 0;
1557 	mfdb_ent_t	*ment = NULL;
1558 	mfdb_ent_t	*tmp_ent = NULL;
1559 	mfdb_ent_t	*new_ent = NULL;
1560 	void		*tgt = NULL;
1561 
1562 	if (devtype == VSW_VNETPORT) {
1563 		/*
1564 		 * Being invoked from a vnet.
1565 		 */
1566 		ASSERT(arg != NULL);
1567 		tgt = arg;
1568 		D2(NULL, "%s: port %d : address 0x%llx", __func__,
1569 		    ((vsw_port_t *)arg)->p_instance, addr);
1570 	} else {
1571 		/*
1572 		 * We are being invoked via the m_multicst mac entry
1573 		 * point.
1574 		 */
1575 		D2(NULL, "%s: address 0x%llx", __func__, addr);
1576 		tgt = (void *)vswp;
1577 	}
1578 
1579 	WRITE_ENTER(&vswp->mfdbrw);
1580 	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr,
1581 	    (mod_hash_val_t *)&ment) != 0) {
1582 
1583 		/* address not currently in table */
1584 		ment = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP);
1585 		ment->d_addr = (void *)tgt;
1586 		ment->d_type = devtype;
1587 		ment->nextp = NULL;
1588 
1589 		if (mod_hash_insert(vswp->mfdb, (mod_hash_key_t)addr,
1590 		    (mod_hash_val_t)ment) != 0) {
1591 			DERR(vswp, "%s: hash table insertion failed", __func__);
1592 			kmem_free(ment, sizeof (mfdb_ent_t));
1593 			rv = 1;
1594 		} else {
1595 			D2(vswp, "%s: added initial entry for 0x%llx to "
1596 			    "table", __func__, addr);
1597 		}
1598 	} else {
1599 		/*
1600 		 * Address in table. Check to see if specified port
1601 		 * is already associated with the address. If not add
1602 		 * it now.
1603 		 */
1604 		tmp_ent = ment;
1605 		while (tmp_ent != NULL) {
1606 			if (tmp_ent->d_addr == (void *)tgt) {
1607 				if (devtype == VSW_VNETPORT) {
1608 					DERR(vswp, "%s: duplicate port entry "
1609 					    "found for portid %ld and key "
1610 					    "0x%llx", __func__,
1611 					    ((vsw_port_t *)arg)->p_instance,
1612 					    addr);
1613 				} else {
1614 					DERR(vswp, "%s: duplicate entry found"
1615 					    "for key 0x%llx", __func__, addr);
1616 				}
1617 				rv = 1;
1618 				dup = 1;
1619 				break;
1620 			}
1621 			tmp_ent = tmp_ent->nextp;
1622 		}
1623 
1624 		/*
1625 		 * Port not on list so add it to end now.
1626 		 */
1627 		if (0 == dup) {
1628 			D2(vswp, "%s: added entry for 0x%llx to table",
1629 			    __func__, addr);
1630 			new_ent = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP);
1631 			new_ent->d_addr = (void *)tgt;
1632 			new_ent->d_type = devtype;
1633 			new_ent->nextp = NULL;
1634 
1635 			tmp_ent = ment;
1636 			while (tmp_ent->nextp != NULL)
1637 				tmp_ent = tmp_ent->nextp;
1638 
1639 			tmp_ent->nextp = new_ent;
1640 		}
1641 	}
1642 
1643 	RW_EXIT(&vswp->mfdbrw);
1644 	return (rv);
1645 }
1646 
1647 /*
1648  * Remove a multicast entry from the hashtable.
1649  *
1650  * Search hash table based on address. If match found, scan
1651  * list of ports associated with address. If specified port
1652  * found remove it from list.
1653  */
1654 int
1655 vsw_del_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg)
1656 {
1657 	mfdb_ent_t	*ment = NULL;
1658 	mfdb_ent_t	*curr_p, *prev_p;
1659 	void		*tgt = NULL;
1660 
1661 	D1(vswp, "%s: enter", __func__);
1662 
1663 	if (devtype == VSW_VNETPORT) {
1664 		tgt = (vsw_port_t *)arg;
1665 		D2(vswp, "%s: removing port %d from mFDB for address"
1666 		    " 0x%llx", __func__, ((vsw_port_t *)tgt)->p_instance, addr);
1667 	} else {
1668 		D2(vswp, "%s: removing entry", __func__);
1669 		tgt = (void *)vswp;
1670 	}
1671 
1672 	WRITE_ENTER(&vswp->mfdbrw);
1673 	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr,
1674 	    (mod_hash_val_t *)&ment) != 0) {
1675 		D2(vswp, "%s: address 0x%llx not in table", __func__, addr);
1676 		RW_EXIT(&vswp->mfdbrw);
1677 		return (1);
1678 	}
1679 
1680 	prev_p = curr_p = ment;
1681 
1682 	while (curr_p != NULL) {
1683 		if (curr_p->d_addr == (void *)tgt) {
1684 			if (devtype == VSW_VNETPORT) {
1685 				D2(vswp, "%s: port %d found", __func__,
1686 				    ((vsw_port_t *)tgt)->p_instance);
1687 			} else {
1688 				D2(vswp, "%s: instance found", __func__);
1689 			}
1690 
1691 			if (prev_p == curr_p) {
1692 				/*
1693 				 * head of list, if no other element is in
1694 				 * list then destroy this entry, otherwise
1695 				 * just replace it with updated value.
1696 				 */
1697 				ment = curr_p->nextp;
1698 				if (ment == NULL) {
1699 					(void) mod_hash_destroy(vswp->mfdb,
1700 					    (mod_hash_val_t)addr);
1701 				} else {
1702 					(void) mod_hash_replace(vswp->mfdb,
1703 					    (mod_hash_key_t)addr,
1704 					    (mod_hash_val_t)ment);
1705 				}
1706 			} else {
1707 				/*
1708 				 * Not head of list, no need to do
1709 				 * replacement, just adjust list pointers.
1710 				 */
1711 				prev_p->nextp = curr_p->nextp;
1712 			}
1713 			break;
1714 		}
1715 
1716 		prev_p = curr_p;
1717 		curr_p = curr_p->nextp;
1718 	}
1719 
1720 	RW_EXIT(&vswp->mfdbrw);
1721 
1722 	D1(vswp, "%s: exit", __func__);
1723 
1724 	if (curr_p == NULL)
1725 		return (1);
1726 	kmem_free(curr_p, sizeof (mfdb_ent_t));
1727 	return (0);
1728 }
1729 
1730 /*
1731  * Port is being deleted, but has registered an interest in one
1732  * or more multicast groups. Using the list of addresses maintained
1733  * within the port structure find the appropriate entry in the hash
1734  * table and remove this port from the list of interested ports.
1735  */
1736 void
1737 vsw_del_mcst_port(vsw_port_t *port)
1738 {
1739 	mcst_addr_t	*mcap = NULL;
1740 	vsw_t		*vswp = port->p_vswp;
1741 
1742 	D1(vswp, "%s: enter", __func__);
1743 
1744 	mutex_enter(&port->mca_lock);
1745 
1746 	while ((mcap = port->mcap) != NULL) {
1747 
1748 		port->mcap = mcap->nextp;
1749 
1750 		mutex_exit(&port->mca_lock);
1751 
1752 		(void) vsw_del_mcst(vswp, VSW_VNETPORT,
1753 		    mcap->addr, port);
1754 
1755 		/*
1756 		 * Remove the address from HW. The address
1757 		 * will actually only be removed once the ref
1758 		 * count within the MAC layer has dropped to
1759 		 * zero. I.e. we can safely call this fn even
1760 		 * if other ports are interested in this
1761 		 * address.
1762 		 */
1763 		mutex_enter(&vswp->mac_lock);
1764 		if (vswp->mh != NULL && mcap->mac_added) {
1765 			(void) mac_multicst_remove(vswp->mh,
1766 			    (uchar_t *)&mcap->mca);
1767 		}
1768 		mutex_exit(&vswp->mac_lock);
1769 
1770 		kmem_free(mcap, sizeof (*mcap));
1771 
1772 		mutex_enter(&port->mca_lock);
1773 
1774 	}
1775 
1776 	mutex_exit(&port->mca_lock);
1777 
1778 	D1(vswp, "%s: exit", __func__);
1779 }
1780 
1781 /*
1782  * This vsw instance is detaching, but has registered an interest in one
1783  * or more multicast groups. Using the list of addresses maintained
1784  * within the vsw structure find the appropriate entry in the hash
1785  * table and remove this instance from the list of interested ports.
1786  */
1787 void
1788 vsw_del_mcst_vsw(vsw_t *vswp)
1789 {
1790 	mcst_addr_t	*next_p = NULL;
1791 
1792 	D1(vswp, "%s: enter", __func__);
1793 
1794 	mutex_enter(&vswp->mca_lock);
1795 
1796 	while (vswp->mcap != NULL) {
1797 		DERR(vswp, "%s: deleting addr 0x%llx",
1798 		    __func__, vswp->mcap->addr);
1799 		(void) vsw_del_mcst(vswp, VSW_LOCALDEV, vswp->mcap->addr, NULL);
1800 
1801 		next_p = vswp->mcap->nextp;
1802 		kmem_free(vswp->mcap, sizeof (mcst_addr_t));
1803 		vswp->mcap = next_p;
1804 	}
1805 
1806 	vswp->mcap = NULL;
1807 	mutex_exit(&vswp->mca_lock);
1808 
1809 	D1(vswp, "%s: exit", __func__);
1810 }
1811 
1812 static uint32_t
1813 vsw_get_same_dest_list(struct ether_header *ehp,
1814     mblk_t **rhead, mblk_t **rtail, mblk_t **mpp)
1815 {
1816 	uint32_t		count = 0;
1817 	mblk_t			*bp;
1818 	mblk_t			*nbp;
1819 	mblk_t			*head = NULL;
1820 	mblk_t			*tail = NULL;
1821 	mblk_t			*prev = NULL;
1822 	struct ether_header	*behp;
1823 
1824 	/* process the chain of packets */
1825 	bp = *mpp;
1826 	while (bp) {
1827 		nbp = bp->b_next;
1828 		behp = (struct ether_header *)bp->b_rptr;
1829 		bp->b_prev = NULL;
1830 		if (ether_cmp(&ehp->ether_dhost, &behp->ether_dhost) == 0) {
1831 			if (prev == NULL) {
1832 				*mpp = nbp;
1833 			} else {
1834 				prev->b_next = nbp;
1835 			}
1836 			bp->b_next =  NULL;
1837 			if (head == NULL) {
1838 				head = tail = bp;
1839 			} else {
1840 				tail->b_next = bp;
1841 				tail = bp;
1842 			}
1843 			count++;
1844 		} else {
1845 			prev = bp;
1846 		}
1847 		bp = nbp;
1848 	}
1849 	*rhead = head;
1850 	*rtail = tail;
1851 	DTRACE_PROBE1(vsw_same_dest, int, count);
1852 	return (count);
1853 }
1854 
1855 static mblk_t *
1856 vsw_dupmsgchain(mblk_t *mp)
1857 {
1858 	mblk_t	*nmp = NULL;
1859 	mblk_t	**nmpp = &nmp;
1860 
1861 	for (; mp != NULL; mp = mp->b_next) {
1862 		if ((*nmpp = dupmsg(mp)) == NULL) {
1863 			freemsgchain(nmp);
1864 			return (NULL);
1865 		}
1866 
1867 		nmpp = &((*nmpp)->b_next);
1868 	}
1869 
1870 	return (nmp);
1871 }
1872