xref: /titanic_41/usr/src/uts/sun4v/io/vsw_switching.c (revision 40db2e2b777b79f3dd0d6d9629593a07f86b9c0a)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/errno.h>
31 #include <sys/debug.h>
32 #include <sys/time.h>
33 #include <sys/sysmacros.h>
34 #include <sys/systm.h>
35 #include <sys/user.h>
36 #include <sys/stropts.h>
37 #include <sys/stream.h>
38 #include <sys/strlog.h>
39 #include <sys/strsubr.h>
40 #include <sys/cmn_err.h>
41 #include <sys/cpu.h>
42 #include <sys/kmem.h>
43 #include <sys/conf.h>
44 #include <sys/ddi.h>
45 #include <sys/sunddi.h>
46 #include <sys/ksynch.h>
47 #include <sys/stat.h>
48 #include <sys/kstat.h>
49 #include <sys/vtrace.h>
50 #include <sys/strsun.h>
51 #include <sys/dlpi.h>
52 #include <sys/ethernet.h>
53 #include <net/if.h>
54 #include <sys/varargs.h>
55 #include <sys/machsystm.h>
56 #include <sys/modctl.h>
57 #include <sys/modhash.h>
58 #include <sys/mac.h>
59 #include <sys/mac_ether.h>
60 #include <sys/taskq.h>
61 #include <sys/note.h>
62 #include <sys/mach_descrip.h>
63 #include <sys/mac.h>
64 #include <sys/mdeg.h>
65 #include <sys/ldc.h>
66 #include <sys/vsw_fdb.h>
67 #include <sys/vsw.h>
68 #include <sys/vio_mailbox.h>
69 #include <sys/vnet_mailbox.h>
70 #include <sys/vnet_common.h>
71 #include <sys/vio_util.h>
72 #include <sys/sdt.h>
73 #include <sys/atomic.h>
74 #include <sys/vlan.h>
75 
76 /* Switching setup routines */
77 void vsw_setup_switching_timeout(void *arg);
78 void vsw_stop_switching_timeout(vsw_t *vswp);
79 int vsw_setup_switching(vsw_t *);
80 void vsw_switch_frame_nop(vsw_t *vswp, mblk_t *mp, int caller,
81     vsw_port_t *port, mac_resource_handle_t mrh);
82 static	int vsw_setup_layer2(vsw_t *);
83 static	int vsw_setup_layer3(vsw_t *);
84 
85 /* Switching/data transmit routines */
86 static	void vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller,
87 	vsw_port_t *port, mac_resource_handle_t);
88 static	void vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller,
89 	vsw_port_t *port, mac_resource_handle_t);
90 static	int vsw_forward_all(vsw_t *vswp, mblk_t *mp,
91 	int caller, vsw_port_t *port);
92 static	int vsw_forward_grp(vsw_t *vswp, mblk_t *mp,
93     int caller, vsw_port_t *port);
94 
95 /* VLAN routines */
96 void vsw_create_vlans(void *arg, int type);
97 void vsw_destroy_vlans(void *arg, int type);
98 void vsw_vlan_add_ids(void *arg, int type);
99 void vsw_vlan_remove_ids(void *arg, int type);
100 static	void vsw_vlan_create_hash(void *arg, int type);
101 static	void vsw_vlan_destroy_hash(void *arg, int type);
102 boolean_t vsw_frame_lookup_vid(void *arg, int caller, struct ether_header *ehp,
103 	uint16_t *vidp);
104 mblk_t *vsw_vlan_frame_pretag(void *arg, int type, mblk_t *mp);
105 uint32_t vsw_vlan_frames_untag(void *arg, int type, mblk_t **np, mblk_t **npt);
106 boolean_t vsw_vlan_lookup(mod_hash_t *vlan_hashp, uint16_t vid);
107 
108 /* Forwarding database (FDB) routines */
109 void vsw_fdbe_add(vsw_t *vswp, void *port);
110 void vsw_fdbe_del(vsw_t *vswp, struct ether_addr *eaddr);
111 static	vsw_fdbe_t *vsw_fdbe_find(vsw_t *vswp, struct ether_addr *);
112 static void vsw_fdbe_find_cb(mod_hash_key_t key, mod_hash_val_t val);
113 
114 int vsw_add_rem_mcst(vnet_mcast_msg_t *, vsw_port_t *);
115 int vsw_add_mcst(vsw_t *, uint8_t, uint64_t, void *);
116 int vsw_del_mcst(vsw_t *, uint8_t, uint64_t, void *);
117 void vsw_del_mcst_vsw(vsw_t *);
118 
119 /* Support functions */
120 static mblk_t *vsw_dupmsgchain(mblk_t *mp);
121 static uint32_t vsw_get_same_dest_list(struct ether_header *ehp,
122     mblk_t **rhead, mblk_t **rtail, mblk_t **mpp);
123 
124 
125 /*
126  * Functions imported from other files.
127  */
128 extern mblk_t *vsw_tx_msg(vsw_t *, mblk_t *);
129 extern mcst_addr_t *vsw_del_addr(uint8_t, void *, uint64_t);
130 extern int vsw_mac_open(vsw_t *vswp);
131 extern void vsw_mac_close(vsw_t *vswp);
132 extern void vsw_mac_rx(vsw_t *vswp, mac_resource_handle_t mrh,
133     mblk_t *mp, vsw_macrx_flags_t flags);
134 extern void vsw_set_addrs(vsw_t *vswp);
135 extern int vsw_get_hw_maddr(vsw_t *);
136 extern int vsw_mac_attach(vsw_t *vswp);
137 extern int vsw_portsend(vsw_port_t *port, mblk_t *mp, mblk_t *mpt,
138 	uint32_t count);
139 extern void vsw_hio_init(vsw_t *vswp);
140 extern void vsw_hio_start_ports(vsw_t *vswp);
141 
142 /*
143  * Tunables used in this file.
144  */
145 extern	int vsw_setup_switching_delay;
146 extern	uint32_t vsw_vlan_nchains;
147 extern	uint32_t vsw_fdbe_refcnt_delay;
148 
/*
 * Take a reference on an fdb entry by atomically incrementing its refcnt.
 * The ASSERT fires if the counter reads zero after the increment.
 *
 * The body is wrapped in do { } while (0) so that "VSW_FDBE_REFHOLD(p);"
 * is exactly one statement and can be used in an unbraced if/else.
 */
#define	VSW_FDBE_REFHOLD(p)						\
do {									\
	atomic_inc_32(&(p)->refcnt);					\
	ASSERT((p)->refcnt != 0);					\
	_NOTE(CONSTCOND)						\
} while (0)

/*
 * Drop a reference on an fdb entry by atomically decrementing its refcnt.
 * The ASSERT checks that the counter is non-zero before the decrement.
 */
#define	VSW_FDBE_REFRELE(p)						\
do {									\
	ASSERT((p)->refcnt != 0);					\
	atomic_dec_32(&(p)->refcnt);					\
	_NOTE(CONSTCOND)						\
} while (0)
160 
161 /*
162  * Timeout routine to setup switching mode:
163  * vsw_setup_switching() is invoked from vsw_attach() or vsw_update_md_prop()
164  * initially. If it fails and the error is EAGAIN, then this timeout handler
165  * is started to retry vsw_setup_switching(). vsw_setup_switching() is retried
166  * until we successfully finish it; or the returned error is not EAGAIN.
167  */
168 void
169 vsw_setup_switching_timeout(void *arg)
170 {
171 	vsw_t		*vswp = (vsw_t *)arg;
172 	int		rv;
173 
174 	if (vswp->swtmout_enabled == B_FALSE)
175 		return;
176 
177 	rv = vsw_setup_switching(vswp);
178 
179 	if (rv == 0) {
180 		/*
181 		 * Successfully setup switching mode.
182 		 * Program unicst, mcst addrs of vsw
183 		 * interface and ports in the physdev.
184 		 */
185 		vsw_set_addrs(vswp);
186 
187 		/* Start HIO for ports that have already connected */
188 		vsw_hio_start_ports(vswp);
189 	}
190 
191 	mutex_enter(&vswp->swtmout_lock);
192 
193 	if (rv == EAGAIN && vswp->swtmout_enabled == B_TRUE) {
194 		/*
195 		 * Reschedule timeout() if the error is EAGAIN and the
196 		 * timeout is still enabled. For errors other than EAGAIN,
197 		 * we simply return without rescheduling timeout().
198 		 */
199 		vswp->swtmout_id =
200 		    timeout(vsw_setup_switching_timeout, vswp,
201 		    (vsw_setup_switching_delay * drv_usectohz(MICROSEC)));
202 		goto exit;
203 	}
204 
205 	/* timeout handler completed */
206 	vswp->swtmout_enabled = B_FALSE;
207 	vswp->swtmout_id = 0;
208 
209 exit:
210 	mutex_exit(&vswp->swtmout_lock);
211 }
212 
213 /*
214  * Cancel the timeout handler to setup switching mode.
215  */
216 void
217 vsw_stop_switching_timeout(vsw_t *vswp)
218 {
219 	timeout_id_t tid;
220 
221 	mutex_enter(&vswp->swtmout_lock);
222 
223 	tid = vswp->swtmout_id;
224 
225 	if (tid != 0) {
226 		/* signal timeout handler to stop */
227 		vswp->swtmout_enabled = B_FALSE;
228 		vswp->swtmout_id = 0;
229 		mutex_exit(&vswp->swtmout_lock);
230 
231 		(void) untimeout(tid);
232 	} else {
233 		mutex_exit(&vswp->swtmout_lock);
234 	}
235 
236 	(void) atomic_swap_32(&vswp->switching_setup_done, B_FALSE);
237 
238 	WRITE_ENTER(&vswp->mac_rwlock);
239 	vswp->mac_open_retries = 0;
240 	RW_EXIT(&vswp->mac_rwlock);
241 }
242 
243 /*
244  * Setup the required switching mode.
245  * This routine is invoked from vsw_attach() or vsw_update_md_prop()
246  * initially. If it fails and the error is EAGAIN, then a timeout handler
247  * is started to retry vsw_setup_switching(), until it successfully finishes;
248  * or the returned error is not EAGAIN.
249  *
250  * Returns:
251  *  0 on success.
252  *  EAGAIN if retry is needed.
253  *  1 on all other failures.
254  */
255 int
256 vsw_setup_switching(vsw_t *vswp)
257 {
258 	int	i, rv = 1;
259 
260 	D1(vswp, "%s: enter", __func__);
261 
262 	/*
263 	 * Select best switching mode.
264 	 * Note that we start from the saved smode_idx. This is done as
265 	 * this routine can be called from the timeout handler to retry
266 	 * setting up a specific mode. Currently only the function which
267 	 * sets up layer2/promisc mode returns EAGAIN if the underlying
268 	 * physical device is not available yet, causing retries.
269 	 */
270 	for (i = vswp->smode_idx; i < vswp->smode_num; i++) {
271 		vswp->smode_idx = i;
272 		switch (vswp->smode[i]) {
273 		case VSW_LAYER2:
274 		case VSW_LAYER2_PROMISC:
275 			rv = vsw_setup_layer2(vswp);
276 			break;
277 
278 		case VSW_LAYER3:
279 			rv = vsw_setup_layer3(vswp);
280 			break;
281 
282 		default:
283 			DERR(vswp, "unknown switch mode");
284 			break;
285 		}
286 
287 		if ((rv == 0) || (rv == EAGAIN))
288 			break;
289 
290 		/* all other errors(rv != 0): continue & select the next mode */
291 		rv = 1;
292 	}
293 
294 	if (rv && (rv != EAGAIN)) {
295 		cmn_err(CE_WARN, "!vsw%d: Unable to setup specified "
296 		    "switching mode", vswp->instance);
297 	} else if (rv == 0) {
298 		(void) atomic_swap_32(&vswp->switching_setup_done, B_TRUE);
299 	}
300 
301 	D2(vswp, "%s: Operating in mode %d", __func__,
302 	    vswp->smode[vswp->smode_idx]);
303 
304 	D1(vswp, "%s: exit", __func__);
305 
306 	return (rv);
307 }
308 
309 /*
310  * Setup for layer 2 switching.
311  *
312  * Returns:
313  *  0 on success.
314  *  EAGAIN if retry is needed.
315  *  EIO on all other failures.
316  */
317 static int
318 vsw_setup_layer2(vsw_t *vswp)
319 {
320 	int	rv;
321 
322 	D1(vswp, "%s: enter", __func__);
323 
324 	vswp->vsw_switch_frame = vsw_switch_l2_frame;
325 
326 	rv = strlen(vswp->physname);
327 	if (rv == 0) {
328 		/*
329 		 * Physical device name is NULL, which is
330 		 * required for layer 2.
331 		 */
332 		cmn_err(CE_WARN, "!vsw%d: no physical device name specified",
333 		    vswp->instance);
334 		return (EIO);
335 	}
336 
337 	WRITE_ENTER(&vswp->mac_rwlock);
338 
339 	rv = vsw_mac_open(vswp);
340 	if (rv != 0) {
341 		if (rv != EAGAIN) {
342 			cmn_err(CE_WARN, "!vsw%d: Unable to open physical "
343 			    "device: %s\n", vswp->instance, vswp->physname);
344 		}
345 		RW_EXIT(&vswp->mac_rwlock);
346 		return (rv);
347 	}
348 
349 	if (vswp->smode[vswp->smode_idx] == VSW_LAYER2) {
350 		/*
351 		 * Verify that underlying device can support multiple
352 		 * unicast mac addresses.
353 		 */
354 		rv = vsw_get_hw_maddr(vswp);
355 		if (rv != 0) {
356 			goto exit_error;
357 		}
358 	}
359 
360 	/*
361 	 * Attempt to link into the MAC layer so we can get
362 	 * and send packets out over the physical adapter.
363 	 */
364 	rv = vsw_mac_attach(vswp);
365 	if (rv != 0) {
366 		/*
367 		 * Registration with the MAC layer has failed,
368 		 * so return error so that can fall back to next
369 		 * prefered switching method.
370 		 */
371 		cmn_err(CE_WARN, "!vsw%d: Unable to setup physical device: "
372 		    "%s\n", vswp->instance, vswp->physname);
373 		goto exit_error;
374 	}
375 
376 	D1(vswp, "%s: exit", __func__);
377 
378 	RW_EXIT(&vswp->mac_rwlock);
379 
380 	/* Initialize HybridIO related stuff */
381 	vsw_hio_init(vswp);
382 	return (0);
383 
384 exit_error:
385 	vsw_mac_close(vswp);
386 	RW_EXIT(&vswp->mac_rwlock);
387 	return (EIO);
388 }
389 
390 static int
391 vsw_setup_layer3(vsw_t *vswp)
392 {
393 	D1(vswp, "%s: enter", __func__);
394 
395 	D2(vswp, "%s: operating in layer 3 mode", __func__);
396 	vswp->vsw_switch_frame = vsw_switch_l3_frame;
397 
398 	D1(vswp, "%s: exit", __func__);
399 
400 	return (0);
401 }
402 
403 /* ARGSUSED */
404 void
405 vsw_switch_frame_nop(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *port,
406 			mac_resource_handle_t mrh)
407 {
408 	freemsgchain(mp);
409 }
410 
411 /*
412  * Switch the given ethernet frame when operating in layer 2 mode.
413  *
414  * vswp: pointer to the vsw instance
415  * mp: pointer to chain of ethernet frame(s) to be switched
416  * caller: identifies the source of this frame as:
417  * 		1. VSW_VNETPORT - a vsw port (connected to a vnet).
418  *		2. VSW_PHYSDEV - the physical ethernet device
419  *		3. VSW_LOCALDEV - vsw configured as a virtual interface
420  * arg: argument provided by the caller.
421  *		1. for VNETPORT - pointer to the corresponding vsw_port_t.
422  *		2. for PHYSDEV - NULL
423  *		3. for LOCALDEV - pointer to to this vsw_t(self)
424  */
425 void
426 vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller,
427 			vsw_port_t *arg, mac_resource_handle_t mrh)
428 {
429 	struct ether_header	*ehp;
430 	mblk_t			*bp, *ret_m;
431 	mblk_t			*mpt = NULL;
432 	uint32_t		count;
433 	vsw_fdbe_t		*fp;
434 
435 	D1(vswp, "%s: enter (caller %d)", __func__, caller);
436 
437 	/*
438 	 * PERF: rather than breaking up the chain here, scan it
439 	 * to find all mblks heading to same destination and then
440 	 * pass that sub-chain to the lower transmit functions.
441 	 */
442 
443 	/* process the chain of packets */
444 	bp = mp;
445 	while (bp) {
446 		ehp = (struct ether_header *)bp->b_rptr;
447 		count = vsw_get_same_dest_list(ehp, &mp, &mpt, &bp);
448 		ASSERT(count != 0);
449 
450 		D2(vswp, "%s: mblk data buffer %lld : actual data size %lld",
451 		    __func__, MBLKSIZE(mp), MBLKL(mp));
452 
453 		if (ether_cmp(&ehp->ether_dhost, &vswp->if_addr) == 0) {
454 			/*
455 			 * If destination is VSW_LOCALDEV (vsw as an eth
456 			 * interface) and if the device is up & running,
457 			 * send the packet up the stack on this host.
458 			 * If the virtual interface is down, drop the packet.
459 			 */
460 			if (caller != VSW_LOCALDEV) {
461 				vsw_mac_rx(vswp, mrh, mp, VSW_MACRX_FREEMSG);
462 			} else {
463 				freemsgchain(mp);
464 			}
465 			continue;
466 		}
467 
468 		/*
469 		 * Find fdb entry for the destination
470 		 * and hold a reference to it.
471 		 */
472 		fp = vsw_fdbe_find(vswp, &ehp->ether_dhost);
473 		if (fp != NULL) {
474 
475 			/*
476 			 * If plumbed and in promisc mode then copy msg
477 			 * and send up the stack.
478 			 */
479 			vsw_mac_rx(vswp, mrh, mp,
480 			    VSW_MACRX_PROMISC | VSW_MACRX_COPYMSG);
481 
482 			/*
483 			 * If the destination is in FDB, the packet
484 			 * should be forwarded to the correponding
485 			 * vsw_port (connected to a vnet device -
486 			 * VSW_VNETPORT)
487 			 */
488 			(void) vsw_portsend(fp->portp, mp, mpt, count);
489 
490 			/* Release the reference on the fdb entry */
491 			VSW_FDBE_REFRELE(fp);
492 		} else {
493 			/*
494 			 * Destination not in FDB.
495 			 *
496 			 * If the destination is broadcast or
497 			 * multicast forward the packet to all
498 			 * (VNETPORTs, PHYSDEV, LOCALDEV),
499 			 * except the caller.
500 			 */
501 			if (IS_BROADCAST(ehp)) {
502 				D2(vswp, "%s: BROADCAST pkt", __func__);
503 				(void) vsw_forward_all(vswp, mp, caller, arg);
504 			} else if (IS_MULTICAST(ehp)) {
505 				D2(vswp, "%s: MULTICAST pkt", __func__);
506 				(void) vsw_forward_grp(vswp, mp, caller, arg);
507 			} else {
508 				/*
509 				 * If the destination is unicast, and came
510 				 * from either a logical network device or
511 				 * the switch itself when it is plumbed, then
512 				 * send it out on the physical device and also
513 				 * up the stack if the logical interface is
514 				 * in promiscious mode.
515 				 *
516 				 * NOTE:  The assumption here is that if we
517 				 * cannot find the destination in our fdb, its
518 				 * a unicast address, and came from either a
519 				 * vnet or down the stack (when plumbed) it
520 				 * must be destinded for an ethernet device
521 				 * outside our ldoms.
522 				 */
523 				if (caller == VSW_VNETPORT) {
524 					/* promisc check copy etc */
525 					vsw_mac_rx(vswp, mrh, mp,
526 					    VSW_MACRX_PROMISC |
527 					    VSW_MACRX_COPYMSG);
528 
529 					if ((ret_m = vsw_tx_msg(vswp, mp))
530 					    != NULL) {
531 						DERR(vswp, "%s: drop mblks to "
532 						    "phys dev", __func__);
533 						freemsgchain(ret_m);
534 					}
535 
536 				} else if (caller == VSW_PHYSDEV) {
537 					/*
538 					 * Pkt seen because card in promisc
539 					 * mode. Send up stack if plumbed in
540 					 * promisc mode, else drop it.
541 					 */
542 					vsw_mac_rx(vswp, mrh, mp,
543 					    VSW_MACRX_PROMISC |
544 					    VSW_MACRX_FREEMSG);
545 
546 				} else if (caller == VSW_LOCALDEV) {
547 					/*
548 					 * Pkt came down the stack, send out
549 					 * over physical device.
550 					 */
551 					if ((ret_m = vsw_tx_msg(vswp, mp))
552 					    != NULL) {
553 						DERR(vswp, "%s: drop mblks to "
554 						    "phys dev", __func__);
555 						freemsgchain(ret_m);
556 					}
557 				}
558 			}
559 		}
560 	}
561 	D1(vswp, "%s: exit\n", __func__);
562 }
563 
564 /*
565  * Switch ethernet frame when in layer 3 mode (i.e. using IP
566  * layer to do the routing).
567  *
568  * There is a large amount of overlap between this function and
569  * vsw_switch_l2_frame. At some stage we need to revisit and refactor
570  * both these functions.
571  */
572 void
573 vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller,
574 			vsw_port_t *arg, mac_resource_handle_t mrh)
575 {
576 	struct ether_header	*ehp;
577 	mblk_t			*bp = NULL;
578 	mblk_t			*mpt;
579 	uint32_t		count;
580 	vsw_fdbe_t		*fp;
581 
582 	D1(vswp, "%s: enter (caller %d)", __func__, caller);
583 
584 	/*
585 	 * In layer 3 mode should only ever be switching packets
586 	 * between IP layer and vnet devices. So make sure thats
587 	 * who is invoking us.
588 	 */
589 	if ((caller != VSW_LOCALDEV) && (caller != VSW_VNETPORT)) {
590 		DERR(vswp, "%s: unexpected caller (%d)", __func__, caller);
591 		freemsgchain(mp);
592 		return;
593 	}
594 
595 	/* process the chain of packets */
596 	bp = mp;
597 	while (bp) {
598 		ehp = (struct ether_header *)bp->b_rptr;
599 		count = vsw_get_same_dest_list(ehp, &mp, &mpt, &bp);
600 		ASSERT(count != 0);
601 
602 		D2(vswp, "%s: mblk data buffer %lld : actual data size %lld",
603 		    __func__, MBLKSIZE(mp), MBLKL(mp));
604 
605 		/*
606 		 * Find fdb entry for the destination
607 		 * and hold a reference to it.
608 		 */
609 		fp = vsw_fdbe_find(vswp, &ehp->ether_dhost);
610 		if (fp != NULL) {
611 
612 			D2(vswp, "%s: sending to target port", __func__);
613 			(void) vsw_portsend(fp->portp, mp, mpt, count);
614 
615 			/* Release the reference on the fdb entry */
616 			VSW_FDBE_REFRELE(fp);
617 		} else {
618 			/*
619 			 * Destination not in FDB
620 			 *
621 			 * If the destination is broadcast or
622 			 * multicast forward the packet to all
623 			 * (VNETPORTs, PHYSDEV, LOCALDEV),
624 			 * except the caller.
625 			 */
626 			if (IS_BROADCAST(ehp)) {
627 				D2(vswp, "%s: BROADCAST pkt", __func__);
628 				(void) vsw_forward_all(vswp, mp, caller, arg);
629 			} else if (IS_MULTICAST(ehp)) {
630 				D2(vswp, "%s: MULTICAST pkt", __func__);
631 				(void) vsw_forward_grp(vswp, mp, caller, arg);
632 			} else {
633 				/*
634 				 * Unicast pkt from vnet that we don't have
635 				 * an FDB entry for, so must be destinded for
636 				 * the outside world. Attempt to send up to the
637 				 * IP layer to allow it to deal with it.
638 				 */
639 				if (caller == VSW_VNETPORT) {
640 					vsw_mac_rx(vswp, mrh,
641 					    mp, VSW_MACRX_FREEMSG);
642 				}
643 			}
644 		}
645 	}
646 
647 	D1(vswp, "%s: exit", __func__);
648 }
649 
650 /*
651  * Forward the ethernet frame to all ports (VNETPORTs, PHYSDEV, LOCALDEV),
652  * except the caller (port on which frame arrived).
653  */
654 static int
655 vsw_forward_all(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg)
656 {
657 	vsw_port_list_t	*plist = &vswp->plist;
658 	vsw_port_t	*portp;
659 	mblk_t		*nmp = NULL;
660 	mblk_t		*ret_m = NULL;
661 	int		skip_port = 0;
662 
663 	D1(vswp, "vsw_forward_all: enter\n");
664 
665 	/*
666 	 * Broadcast message from inside ldoms so send to outside
667 	 * world if in either of layer 2 modes.
668 	 */
669 	if (((vswp->smode[vswp->smode_idx] == VSW_LAYER2) ||
670 	    (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) &&
671 	    ((caller == VSW_LOCALDEV) || (caller == VSW_VNETPORT))) {
672 
673 		nmp = vsw_dupmsgchain(mp);
674 		if (nmp) {
675 			if ((ret_m = vsw_tx_msg(vswp, nmp)) != NULL) {
676 				DERR(vswp, "%s: dropping pkt(s) "
677 				    "consisting of %ld bytes of data for"
678 				    " physical device", __func__, MBLKL(ret_m));
679 				freemsgchain(ret_m);
680 			}
681 		}
682 	}
683 
684 	if (caller == VSW_VNETPORT)
685 		skip_port = 1;
686 
687 	/*
688 	 * Broadcast message from other vnet (layer 2 or 3) or outside
689 	 * world (layer 2 only), send up stack if plumbed.
690 	 */
691 	if ((caller == VSW_PHYSDEV) || (caller == VSW_VNETPORT)) {
692 		vsw_mac_rx(vswp, NULL, mp, VSW_MACRX_COPYMSG);
693 	}
694 
695 	/* send it to all VNETPORTs */
696 	READ_ENTER(&plist->lockrw);
697 	for (portp = plist->head; portp != NULL; portp = portp->p_next) {
698 		D2(vswp, "vsw_forward_all: port %d", portp->p_instance);
699 		/*
700 		 * Caution ! - don't reorder these two checks as arg
701 		 * will be NULL if the caller is PHYSDEV. skip_port is
702 		 * only set if caller is VNETPORT.
703 		 */
704 		if ((skip_port) && (portp == arg)) {
705 			continue;
706 		} else {
707 			nmp = vsw_dupmsgchain(mp);
708 			if (nmp) {
709 				mblk_t	*mpt = nmp;
710 				uint32_t count = 1;
711 
712 				/* Find tail */
713 				while (mpt->b_next != NULL) {
714 					mpt = mpt->b_next;
715 					count++;
716 				}
717 				/*
718 				 * The plist->lockrw is protecting the
719 				 * portp from getting destroyed here.
720 				 * So, no ref_cnt is incremented here.
721 				 */
722 				(void) vsw_portsend(portp, nmp, mpt, count);
723 			} else {
724 				DERR(vswp, "vsw_forward_all: nmp NULL");
725 			}
726 		}
727 	}
728 	RW_EXIT(&plist->lockrw);
729 
730 	freemsgchain(mp);
731 
732 	D1(vswp, "vsw_forward_all: exit\n");
733 	return (0);
734 }
735 
736 /*
737  * Forward pkts to any devices or interfaces which have registered
738  * an interest in them (i.e. multicast groups).
739  */
740 static int
741 vsw_forward_grp(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg)
742 {
743 	struct ether_header	*ehp = (struct ether_header *)mp->b_rptr;
744 	mfdb_ent_t		*entp = NULL;
745 	mfdb_ent_t		*tpp = NULL;
746 	vsw_port_t 		*port;
747 	uint64_t		key = 0;
748 	mblk_t			*nmp = NULL;
749 	mblk_t			*ret_m = NULL;
750 	boolean_t		check_if = B_TRUE;
751 
752 	/*
753 	 * Convert address to hash table key
754 	 */
755 	KEY_HASH(key, &ehp->ether_dhost);
756 
757 	D1(vswp, "%s: key 0x%llx", __func__, key);
758 
759 	/*
760 	 * If pkt came from either a vnet or down the stack (if we are
761 	 * plumbed) and we are in layer 2 mode, then we send the pkt out
762 	 * over the physical adapter, and then check to see if any other
763 	 * vnets are interested in it.
764 	 */
765 	if (((vswp->smode[vswp->smode_idx] == VSW_LAYER2) ||
766 	    (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) &&
767 	    ((caller == VSW_VNETPORT) || (caller == VSW_LOCALDEV))) {
768 		nmp = vsw_dupmsgchain(mp);
769 		if (nmp) {
770 			if ((ret_m = vsw_tx_msg(vswp, nmp)) != NULL) {
771 				DERR(vswp, "%s: dropping pkt(s) consisting of "
772 				    "%ld bytes of data for physical device",
773 				    __func__, MBLKL(ret_m));
774 				freemsgchain(ret_m);
775 			}
776 		}
777 	}
778 
779 	READ_ENTER(&vswp->mfdbrw);
780 	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)key,
781 	    (mod_hash_val_t *)&entp) != 0) {
782 		D3(vswp, "%s: no table entry found for addr 0x%llx",
783 		    __func__, key);
784 	} else {
785 		/*
786 		 * Send to list of devices associated with this address...
787 		 */
788 		for (tpp = entp; tpp != NULL; tpp = tpp->nextp) {
789 
790 			/* dont send to ourselves */
791 			if ((caller == VSW_VNETPORT) &&
792 			    (tpp->d_addr == (void *)arg)) {
793 				port = (vsw_port_t *)tpp->d_addr;
794 				D3(vswp, "%s: not sending to ourselves"
795 				    " : port %d", __func__, port->p_instance);
796 				continue;
797 
798 			} else if ((caller == VSW_LOCALDEV) &&
799 			    (tpp->d_type == VSW_LOCALDEV)) {
800 				D2(vswp, "%s: not sending back up stack",
801 				    __func__);
802 				continue;
803 			}
804 
805 			if (tpp->d_type == VSW_VNETPORT) {
806 				port = (vsw_port_t *)tpp->d_addr;
807 				D3(vswp, "%s: sending to port %ld for addr "
808 				    "0x%llx", __func__, port->p_instance, key);
809 
810 				nmp = vsw_dupmsgchain(mp);
811 				if (nmp) {
812 					mblk_t	*mpt = nmp;
813 					uint32_t count = 1;
814 
815 					/* Find tail */
816 					while (mpt->b_next != NULL) {
817 						mpt = mpt->b_next;
818 						count++;
819 					}
820 					/*
821 					 * The vswp->mfdbrw is protecting the
822 					 * portp from getting destroyed here.
823 					 * So, no ref_cnt is incremented here.
824 					 */
825 					(void) vsw_portsend(port, nmp, mpt,
826 					    count);
827 				}
828 			} else {
829 				vsw_mac_rx(vswp, NULL,
830 				    mp, VSW_MACRX_COPYMSG);
831 				D2(vswp, "%s: sending up stack"
832 				    " for addr 0x%llx", __func__, key);
833 				check_if = B_FALSE;
834 			}
835 		}
836 	}
837 
838 	RW_EXIT(&vswp->mfdbrw);
839 
840 	/*
841 	 * If the pkt came from either a vnet or from physical device,
842 	 * and if we havent already sent the pkt up the stack then we
843 	 * check now if we can/should (i.e. the interface is plumbed
844 	 * and in promisc mode).
845 	 */
846 	if ((check_if) &&
847 	    ((caller == VSW_VNETPORT) || (caller == VSW_PHYSDEV))) {
848 		vsw_mac_rx(vswp, NULL, mp,
849 		    VSW_MACRX_PROMISC | VSW_MACRX_COPYMSG);
850 	}
851 
852 	freemsgchain(mp);
853 
854 	D1(vswp, "%s: exit", __func__);
855 
856 	return (0);
857 }
858 
/*
 * Create the vlan id hash table for the given vsw device or port and
 * populate it with every vlan id assigned to that device or port.
 *
 * Arguments:
 *   arg:  vsw device or port.
 *   type: type of arg; VSW_LOCALDEV(vsw device) or VSW_VNETPORT(port).
 */
void
vsw_create_vlans(void *arg, int type)
{
	/* the table must exist before any ids can be inserted */
	vsw_vlan_create_hash(arg, type);
	vsw_vlan_add_ids(arg, type);
}
876 
/*
 * Remove the vlan ids of the given vsw device or port from its vlan hash
 * table, then destroy the table itself.
 *
 * Arguments:
 *   arg:  vsw device or port.
 *   type: type of arg; VSW_LOCALDEV(vsw device) or VSW_VNETPORT(port).
 */
void
vsw_destroy_vlans(void *arg, int type)
{
	/* empty the table first, then tear it down */
	vsw_vlan_remove_ids(arg, type);
	vsw_vlan_destroy_hash(arg, type);
}
893 
894 /*
895  * Create a vlan-id hash table for the given vsw device or port.
896  */
897 static void
898 vsw_vlan_create_hash(void *arg, int type)
899 {
900 	char		hashname[MAXNAMELEN];
901 
902 	if (type == VSW_LOCALDEV) {
903 		vsw_t		*vswp = (vsw_t *)arg;
904 
905 		(void) snprintf(hashname, MAXNAMELEN, "vsw%d-vlan-hash",
906 		    vswp->instance);
907 
908 		vswp->vlan_nchains = vsw_vlan_nchains;
909 		vswp->vlan_hashp = mod_hash_create_idhash(hashname,
910 		    vswp->vlan_nchains, mod_hash_null_valdtor);
911 
912 	} else if (type == VSW_VNETPORT) {
913 		vsw_port_t	*portp = (vsw_port_t *)arg;
914 
915 		(void) snprintf(hashname, MAXNAMELEN, "port%d-vlan-hash",
916 		    portp->p_instance);
917 
918 		portp->vlan_nchains = vsw_vlan_nchains;
919 		portp->vlan_hashp = mod_hash_create_idhash(hashname,
920 		    portp->vlan_nchains, mod_hash_null_valdtor);
921 
922 	} else {
923 		return;
924 	}
925 }
926 
927 /*
928  * Destroy the vlan-id hash table for the given vsw device or port.
929  */
930 static void
931 vsw_vlan_destroy_hash(void *arg, int type)
932 {
933 	if (type == VSW_LOCALDEV) {
934 		vsw_t		*vswp = (vsw_t *)arg;
935 
936 		mod_hash_destroy_hash(vswp->vlan_hashp);
937 		vswp->vlan_nchains = 0;
938 	} else if (type == VSW_VNETPORT) {
939 		vsw_port_t	*portp = (vsw_port_t *)arg;
940 
941 		mod_hash_destroy_hash(portp->vlan_hashp);
942 		portp->vlan_nchains = 0;
943 	} else {
944 		return;
945 	}
946 }
947 
948 /*
949  * Add vlan ids of the given vsw device or port into its hash table.
950  */
951 void
952 vsw_vlan_add_ids(void *arg, int type)
953 {
954 	int	rv;
955 	int	i;
956 
957 	if (type == VSW_LOCALDEV) {
958 		vsw_t		*vswp = (vsw_t *)arg;
959 
960 		rv = mod_hash_insert(vswp->vlan_hashp,
961 		    (mod_hash_key_t)VLAN_ID_KEY(vswp->pvid),
962 		    (mod_hash_val_t)B_TRUE);
963 		ASSERT(rv == 0);
964 
965 		for (i = 0; i < vswp->nvids; i++) {
966 			rv = mod_hash_insert(vswp->vlan_hashp,
967 			    (mod_hash_key_t)VLAN_ID_KEY(vswp->vids[i]),
968 			    (mod_hash_val_t)B_TRUE);
969 			ASSERT(rv == 0);
970 		}
971 
972 	} else if (type == VSW_VNETPORT) {
973 		vsw_port_t	*portp = (vsw_port_t *)arg;
974 
975 		rv = mod_hash_insert(portp->vlan_hashp,
976 		    (mod_hash_key_t)VLAN_ID_KEY(portp->pvid),
977 		    (mod_hash_val_t)B_TRUE);
978 		ASSERT(rv == 0);
979 
980 		for (i = 0; i < portp->nvids; i++) {
981 			rv = mod_hash_insert(portp->vlan_hashp,
982 			    (mod_hash_key_t)VLAN_ID_KEY(portp->vids[i]),
983 			    (mod_hash_val_t)B_TRUE);
984 			ASSERT(rv == 0);
985 		}
986 
987 	} else {
988 		return;
989 	}
990 }
991 
992 /*
993  * Remove vlan ids of the given vsw device or port from its hash table.
994  */
995 void
996 vsw_vlan_remove_ids(void *arg, int type)
997 {
998 	mod_hash_val_t	vp;
999 	int		rv;
1000 	int		i;
1001 
1002 	if (type == VSW_LOCALDEV) {
1003 		vsw_t		*vswp = (vsw_t *)arg;
1004 
1005 		rv = vsw_vlan_lookup(vswp->vlan_hashp, vswp->pvid);
1006 		if (rv == B_TRUE) {
1007 			rv = mod_hash_remove(vswp->vlan_hashp,
1008 			    (mod_hash_key_t)VLAN_ID_KEY(vswp->pvid),
1009 			    (mod_hash_val_t *)&vp);
1010 			ASSERT(rv == 0);
1011 		}
1012 
1013 		for (i = 0; i < vswp->nvids; i++) {
1014 			rv = vsw_vlan_lookup(vswp->vlan_hashp, vswp->vids[i]);
1015 			if (rv == B_TRUE) {
1016 				rv = mod_hash_remove(vswp->vlan_hashp,
1017 				    (mod_hash_key_t)VLAN_ID_KEY(vswp->vids[i]),
1018 				    (mod_hash_val_t *)&vp);
1019 				ASSERT(rv == 0);
1020 			}
1021 		}
1022 
1023 	} else if (type == VSW_VNETPORT) {
1024 		vsw_port_t	*portp = (vsw_port_t *)arg;
1025 
1026 		portp = (vsw_port_t *)arg;
1027 		rv = vsw_vlan_lookup(portp->vlan_hashp, portp->pvid);
1028 		if (rv == B_TRUE) {
1029 			rv = mod_hash_remove(portp->vlan_hashp,
1030 			    (mod_hash_key_t)VLAN_ID_KEY(portp->pvid),
1031 			    (mod_hash_val_t *)&vp);
1032 			ASSERT(rv == 0);
1033 		}
1034 
1035 		for (i = 0; i < portp->nvids; i++) {
1036 			rv = vsw_vlan_lookup(portp->vlan_hashp, portp->vids[i]);
1037 			if (rv == B_TRUE) {
1038 				rv = mod_hash_remove(portp->vlan_hashp,
1039 				    (mod_hash_key_t)VLAN_ID_KEY(portp->vids[i]),
1040 				    (mod_hash_val_t *)&vp);
1041 				ASSERT(rv == 0);
1042 			}
1043 		}
1044 
1045 	} else {
1046 		return;
1047 	}
1048 }
1049 
1050 /*
1051  * Find the given vlan id in the hash table.
1052  * Return: B_TRUE if the id is found; B_FALSE if not found.
1053  */
1054 boolean_t
1055 vsw_vlan_lookup(mod_hash_t *vlan_hashp, uint16_t vid)
1056 {
1057 	int		rv;
1058 	mod_hash_val_t	vp;
1059 
1060 	rv = mod_hash_find(vlan_hashp, VLAN_ID_KEY(vid), (mod_hash_val_t *)&vp);
1061 
1062 	if (rv != 0)
1063 		return (B_FALSE);
1064 
1065 	return (B_TRUE);
1066 }
1067 
1068 /*
1069  * Add an entry into FDB for the given vsw.
1070  */
1071 void
1072 vsw_fdbe_add(vsw_t *vswp, void *port)
1073 {
1074 	uint64_t	addr = 0;
1075 	vsw_port_t	*portp;
1076 	vsw_fdbe_t	*fp;
1077 	int		rv;
1078 
1079 	portp = (vsw_port_t *)port;
1080 	KEY_HASH(addr, &portp->p_macaddr);
1081 
1082 	fp = kmem_zalloc(sizeof (vsw_fdbe_t), KM_SLEEP);
1083 	fp->portp = port;
1084 
1085 	/*
1086 	 * Note: duplicate keys will be rejected by mod_hash.
1087 	 */
1088 	rv = mod_hash_insert(vswp->fdb_hashp, (mod_hash_key_t)addr,
1089 	    (mod_hash_val_t)fp);
1090 	ASSERT(rv == 0);
1091 }
1092 
1093 /*
1094  * Remove an entry from FDB.
1095  */
1096 void
1097 vsw_fdbe_del(vsw_t *vswp, struct ether_addr *eaddr)
1098 {
1099 	uint64_t	addr = 0;
1100 	vsw_fdbe_t	*fp;
1101 	int		rv;
1102 
1103 	KEY_HASH(addr, eaddr);
1104 
1105 	/*
1106 	 * Remove the entry from fdb hash table.
1107 	 * This prevents further references to this fdb entry.
1108 	 */
1109 	rv = mod_hash_remove(vswp->fdb_hashp, (mod_hash_key_t)addr,
1110 	    (mod_hash_val_t *)&fp);
1111 	if (rv != 0) {
1112 		/* invalid key? */
1113 		return;
1114 	}
1115 
1116 	/*
1117 	 * If there are threads already ref holding before the entry was
1118 	 * removed from hash table, then wait for ref count to drop to zero.
1119 	 */
1120 	while (fp->refcnt != 0) {
1121 		delay(drv_usectohz(vsw_fdbe_refcnt_delay));
1122 	}
1123 
1124 	kmem_free(fp, sizeof (*fp));
1125 }
1126 
1127 /*
1128  * Search fdb for a given mac address. If an entry is found, hold
1129  * a reference to it and return the entry, else returns NULL.
1130  */
1131 static vsw_fdbe_t *
1132 vsw_fdbe_find(vsw_t *vswp, struct ether_addr *addrp)
1133 {
1134 	uint64_t	key = 0;
1135 	vsw_fdbe_t	*fp;
1136 	int		rv;
1137 
1138 	KEY_HASH(key, addrp);
1139 
1140 	rv = mod_hash_find_cb(vswp->fdb_hashp, (mod_hash_key_t)key,
1141 	    (mod_hash_val_t *)&fp, vsw_fdbe_find_cb);
1142 
1143 	if (rv != 0)
1144 		return (NULL);
1145 
1146 	return (fp);
1147 }
1148 
1149 /*
1150  * Callback function provided to mod_hash_find_cb(). After finding the fdb
1151  * entry corresponding to the key (macaddr), this callback will be invoked by
1152  * mod_hash_find_cb() to atomically increment the reference count on the fdb
1153  * entry before returning the found entry.
1154  */
1155 static void
1156 vsw_fdbe_find_cb(mod_hash_key_t key, mod_hash_val_t val)
1157 {
1158 	_NOTE(ARGUNUSED(key))
1159 	VSW_FDBE_REFHOLD((vsw_fdbe_t *)val);
1160 }
1161 
1162 /*
1163  * A given frame must be always tagged with the appropriate vlan id (unless it
1164  * is in the default-vlan) before the mac address switching function is called.
1165  * Otherwise, after switching function determines the destination, we cannot
1166  * figure out if the destination belongs to the the same vlan that the frame
1167  * originated from and if it needs tag/untag. Frames which are inbound from
1168  * the external(physical) network over a vlan trunk link are always tagged.
1169  * However frames which are received from a vnet-port over ldc or frames which
1170  * are coming down the stack on the service domain over vsw interface may be
1171  * untagged. These frames must be tagged with the appropriate pvid of the
1172  * sender (vnet-port or vsw device), before invoking the switching function.
1173  *
1174  * Arguments:
1175  *   arg:    caller of the function.
1176  *   type:   type of arg(caller): VSW_LOCALDEV(vsw) or VSW_VNETPORT(port)
1177  *   mp:     frame(s) to be tagged.
1178  */
1179 mblk_t *
1180 vsw_vlan_frame_pretag(void *arg, int type, mblk_t *mp)
1181 {
1182 	vsw_t			*vswp;
1183 	vsw_port_t		*portp;
1184 	struct ether_header	*ehp;
1185 	mblk_t			*bp;
1186 	mblk_t			*bpt;
1187 	mblk_t			*bph;
1188 	mblk_t			*bpn;
1189 	uint16_t		pvid;
1190 
1191 	ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));
1192 
1193 	if (type == VSW_LOCALDEV) {
1194 		vswp = (vsw_t *)arg;
1195 		pvid = vswp->pvid;
1196 		portp = NULL;
1197 	} else {
1198 		/* VSW_VNETPORT */
1199 		portp = (vsw_port_t *)arg;
1200 		pvid = portp->pvid;
1201 		vswp = portp->p_vswp;
1202 	}
1203 
1204 	bpn = bph = bpt = NULL;
1205 
1206 	for (bp = mp; bp != NULL; bp = bpn) {
1207 
1208 		bpn = bp->b_next;
1209 		bp->b_next = bp->b_prev = NULL;
1210 
1211 		/* Determine if it is an untagged frame */
1212 		ehp = (struct ether_header *)bp->b_rptr;
1213 
1214 		if (ehp->ether_type != ETHERTYPE_VLAN) {	/* untagged */
1215 
1216 			/* no need to tag if the frame is in default vlan */
1217 			if (pvid != vswp->default_vlan_id) {
1218 				bp = vnet_vlan_insert_tag(bp, pvid);
1219 				if (bp == NULL) {
1220 					continue;
1221 				}
1222 			}
1223 		}
1224 
1225 		/* build a chain of processed packets */
1226 		if (bph == NULL) {
1227 			bph = bpt = bp;
1228 		} else {
1229 			bpt->b_next = bp;
1230 			bpt = bp;
1231 		}
1232 
1233 	}
1234 
1235 	return (bph);
1236 }
1237 
1238 /*
1239  * Frames destined to a vnet-port or to the local vsw interface, must be
1240  * untagged if necessary before sending. This function first checks that the
1241  * frame can be sent to the destination in the vlan identified by the frame
1242  * tag. Note that when this function is invoked the frame must have been
1243  * already tagged (unless it is in the default-vlan). Because, this function is
1244  * called when the switching function determines the destination and invokes
1245  * its send function (vnet-port or vsw interface) and all frames would have
1246  * been tagged by this time (see comments in vsw_vlan_frame_pretag()).
1247  *
1248  * Arguments:
1249  *   arg:    destination device.
1250  *   type:   type of arg(destination): VSW_LOCALDEV(vsw) or VSW_VNETPORT(port)
1251  *   np:     head of pkt chain to be validated and untagged.
1252  *   npt:    tail of pkt chain to be validated and untagged.
1253  *
1254  * Returns:
1255  *   np:     head of updated chain of packets
1256  *   npt:    tail of updated chain of packets
1257  *   rv:     count of any packets dropped
1258  */
1259 uint32_t
1260 vsw_vlan_frame_untag(void *arg, int type, mblk_t **np, mblk_t **npt)
1261 {
1262 	mblk_t			*bp;
1263 	mblk_t			*bpt;
1264 	mblk_t			*bph;
1265 	mblk_t			*bpn;
1266 	vsw_port_t		*portp;
1267 	vsw_t			*vswp;
1268 	uint32_t		count;
1269 	struct ether_header	*ehp;
1270 	boolean_t		is_tagged;
1271 	boolean_t		rv;
1272 	uint16_t		vlan_id;
1273 	uint16_t		pvid;
1274 	mod_hash_t		*vlan_hashp;
1275 
1276 	ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));
1277 
1278 	if (type == VSW_LOCALDEV) {
1279 		vswp = (vsw_t *)arg;
1280 		pvid = vswp->pvid;
1281 		vlan_hashp = vswp->vlan_hashp;
1282 		portp = NULL;
1283 	} else {
1284 		/* type == VSW_VNETPORT */
1285 		portp = (vsw_port_t *)arg;
1286 		vswp = portp->p_vswp;
1287 		vlan_hashp = portp->vlan_hashp;
1288 		pvid = portp->pvid;
1289 	}
1290 
1291 	bpn = bph = bpt = NULL;
1292 	count = 0;
1293 
1294 	for (bp = *np; bp != NULL; bp = bpn) {
1295 
1296 		bpn = bp->b_next;
1297 		bp->b_next = bp->b_prev = NULL;
1298 
1299 		/*
1300 		 * Determine the vlan id that the frame belongs to.
1301 		 */
1302 		ehp = (struct ether_header *)bp->b_rptr;
1303 		is_tagged = vsw_frame_lookup_vid(arg, type, ehp, &vlan_id);
1304 
1305 		/*
1306 		 * Check if the destination is in the same vlan.
1307 		 */
1308 		rv = vsw_vlan_lookup(vlan_hashp, vlan_id);
1309 		if (rv == B_FALSE) {
1310 			/* drop the packet */
1311 			freemsg(bp);
1312 			count++;
1313 			continue;
1314 		}
1315 
1316 		/*
1317 		 * Check the frame header if tag/untag is  needed.
1318 		 */
1319 		if (is_tagged == B_FALSE) {
1320 			/*
1321 			 * Untagged frame. We shouldn't have an untagged
1322 			 * packet at this point, unless the destination's
1323 			 * vlan id is default-vlan-id; if it is not the
1324 			 * default-vlan-id, we drop the packet.
1325 			 */
1326 			if (vlan_id != vswp->default_vlan_id) {
1327 				/* drop the packet */
1328 				freemsg(bp);
1329 				count++;
1330 				continue;
1331 			}
1332 		} else {
1333 			/*
1334 			 * Tagged frame, untag if it's the destination's pvid.
1335 			 */
1336 			if (vlan_id == pvid) {
1337 
1338 				bp = vnet_vlan_remove_tag(bp);
1339 				if (bp == NULL) {
1340 					/* packet dropped */
1341 					count++;
1342 					continue;
1343 				}
1344 			}
1345 		}
1346 
1347 		/* build a chain of processed packets */
1348 		if (bph == NULL) {
1349 			bph = bpt = bp;
1350 		} else {
1351 			bpt->b_next = bp;
1352 			bpt = bp;
1353 		}
1354 
1355 	}
1356 
1357 	*np = bph;
1358 	*npt = bpt;
1359 
1360 	return (count);
1361 }
1362 
1363 /*
1364  * Lookup the vlan id of the given frame. If it is a vlan-tagged frame,
1365  * then the vlan-id is available in the tag; otherwise, its vlan id is
1366  * implicitly obtained based on the caller (destination of the frame:
1367  * VSW_VNETPORT or VSW_LOCALDEV).
1368  * The vlan id determined is returned in vidp.
1369  * Returns: B_TRUE if it is a tagged frame; B_FALSE if it is untagged.
1370  */
1371 boolean_t
1372 vsw_frame_lookup_vid(void *arg, int caller, struct ether_header *ehp,
1373 	uint16_t *vidp)
1374 {
1375 	struct ether_vlan_header	*evhp;
1376 	vsw_t				*vswp;
1377 	vsw_port_t			*portp;
1378 
1379 	/* If it's a tagged frame, get the vid from vlan header */
1380 	if (ehp->ether_type == ETHERTYPE_VLAN) {
1381 
1382 		evhp = (struct ether_vlan_header *)ehp;
1383 		*vidp = VLAN_ID(ntohs(evhp->ether_tci));
1384 		return (B_TRUE);
1385 	}
1386 
1387 	/* Untagged frame; determine vlan id based on caller */
1388 	switch (caller) {
1389 
1390 	case VSW_VNETPORT:
1391 		/*
1392 		 * packet destined to a vnet; vlan-id is pvid of vnet-port.
1393 		 */
1394 		portp = (vsw_port_t *)arg;
1395 		*vidp = portp->pvid;
1396 		break;
1397 
1398 	case VSW_LOCALDEV:
1399 
1400 		/*
1401 		 * packet destined to vsw interface;
1402 		 * vlan-id is port-vlan-id of vsw device.
1403 		 */
1404 		vswp = (vsw_t *)arg;
1405 		*vidp = vswp->pvid;
1406 		break;
1407 	}
1408 
1409 	return (B_FALSE);
1410 }
1411 
1412 /*
1413  * Add or remove multicast address(es).
1414  *
1415  * Returns 0 on success, 1 on failure.
1416  */
1417 int
1418 vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port)
1419 {
1420 	mcst_addr_t		*mcst_p = NULL;
1421 	vsw_t			*vswp = port->p_vswp;
1422 	uint64_t		addr = 0x0;
1423 	int			i;
1424 
1425 	D1(vswp, "%s: enter", __func__);
1426 
1427 	D2(vswp, "%s: %d addresses", __func__, mcst_pkt->count);
1428 
1429 	for (i = 0; i < mcst_pkt->count; i++) {
1430 		/*
1431 		 * Convert address into form that can be used
1432 		 * as hash table key.
1433 		 */
1434 		KEY_HASH(addr, &(mcst_pkt->mca[i]));
1435 
1436 		/*
1437 		 * Add or delete the specified address/port combination.
1438 		 */
1439 		if (mcst_pkt->set == 0x1) {
1440 			D3(vswp, "%s: adding multicast address 0x%llx for "
1441 			    "port %ld", __func__, addr, port->p_instance);
1442 			if (vsw_add_mcst(vswp, VSW_VNETPORT, addr, port) == 0) {
1443 				/*
1444 				 * Update the list of multicast
1445 				 * addresses contained within the
1446 				 * port structure to include this new
1447 				 * one.
1448 				 */
1449 				mcst_p = kmem_zalloc(sizeof (mcst_addr_t),
1450 				    KM_NOSLEEP);
1451 				if (mcst_p == NULL) {
1452 					DERR(vswp, "%s: unable to alloc mem",
1453 					    __func__);
1454 					(void) vsw_del_mcst(vswp,
1455 					    VSW_VNETPORT, addr, port);
1456 					return (1);
1457 				}
1458 
1459 				mcst_p->nextp = NULL;
1460 				mcst_p->addr = addr;
1461 				ether_copy(&mcst_pkt->mca[i], &mcst_p->mca);
1462 
1463 				/*
1464 				 * Program the address into HW. If the addr
1465 				 * has already been programmed then the MAC
1466 				 * just increments a ref counter (which is
1467 				 * used when the address is being deleted)
1468 				 */
1469 				WRITE_ENTER(&vswp->mac_rwlock);
1470 				if (vswp->mh != NULL) {
1471 					if (mac_multicst_add(vswp->mh,
1472 					    (uchar_t *)&mcst_pkt->mca[i])) {
1473 						RW_EXIT(&vswp->mac_rwlock);
1474 						cmn_err(CE_WARN, "!vsw%d: "
1475 						    "unable to add multicast "
1476 						    "address: %s\n",
1477 						    vswp->instance,
1478 						    ether_sprintf((void *)
1479 						    &mcst_p->mca));
1480 						(void) vsw_del_mcst(vswp,
1481 						    VSW_VNETPORT, addr, port);
1482 						kmem_free(mcst_p,
1483 						    sizeof (*mcst_p));
1484 						return (1);
1485 					}
1486 					mcst_p->mac_added = B_TRUE;
1487 				}
1488 				RW_EXIT(&vswp->mac_rwlock);
1489 
1490 				mutex_enter(&port->mca_lock);
1491 				mcst_p->nextp = port->mcap;
1492 				port->mcap = mcst_p;
1493 				mutex_exit(&port->mca_lock);
1494 
1495 			} else {
1496 				DERR(vswp, "%s: error adding multicast "
1497 				    "address 0x%llx for port %ld",
1498 				    __func__, addr, port->p_instance);
1499 				return (1);
1500 			}
1501 		} else {
1502 			/*
1503 			 * Delete an entry from the multicast hash
1504 			 * table and update the address list
1505 			 * appropriately.
1506 			 */
1507 			if (vsw_del_mcst(vswp, VSW_VNETPORT, addr, port) == 0) {
1508 				D3(vswp, "%s: deleting multicast address "
1509 				    "0x%llx for port %ld", __func__, addr,
1510 				    port->p_instance);
1511 
1512 				mcst_p = vsw_del_addr(VSW_VNETPORT, port, addr);
1513 				ASSERT(mcst_p != NULL);
1514 
1515 				/*
1516 				 * Remove the address from HW. The address
1517 				 * will actually only be removed once the ref
1518 				 * count within the MAC layer has dropped to
1519 				 * zero. I.e. we can safely call this fn even
1520 				 * if other ports are interested in this
1521 				 * address.
1522 				 */
1523 				WRITE_ENTER(&vswp->mac_rwlock);
1524 				if (vswp->mh != NULL && mcst_p->mac_added) {
1525 					if (mac_multicst_remove(vswp->mh,
1526 					    (uchar_t *)&mcst_pkt->mca[i])) {
1527 						RW_EXIT(&vswp->mac_rwlock);
1528 						cmn_err(CE_WARN, "!vsw%d: "
1529 						    "unable to remove mcast "
1530 						    "address: %s\n",
1531 						    vswp->instance,
1532 						    ether_sprintf((void *)
1533 						    &mcst_p->mca));
1534 						kmem_free(mcst_p,
1535 						    sizeof (*mcst_p));
1536 						return (1);
1537 					}
1538 					mcst_p->mac_added = B_FALSE;
1539 				}
1540 				RW_EXIT(&vswp->mac_rwlock);
1541 				kmem_free(mcst_p, sizeof (*mcst_p));
1542 
1543 			} else {
1544 				DERR(vswp, "%s: error deleting multicast "
1545 				    "addr 0x%llx for port %ld",
1546 				    __func__, addr, port->p_instance);
1547 				return (1);
1548 			}
1549 		}
1550 	}
1551 	D1(vswp, "%s: exit", __func__);
1552 	return (0);
1553 }
1554 
1555 /*
1556  * Add a new multicast entry.
1557  *
1558  * Search hash table based on address. If match found then
1559  * update associated val (which is chain of ports), otherwise
1560  * create new key/val (addr/port) pair and insert into table.
1561  */
1562 int
1563 vsw_add_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg)
1564 {
1565 	int		dup = 0;
1566 	int		rv = 0;
1567 	mfdb_ent_t	*ment = NULL;
1568 	mfdb_ent_t	*tmp_ent = NULL;
1569 	mfdb_ent_t	*new_ent = NULL;
1570 	void		*tgt = NULL;
1571 
1572 	if (devtype == VSW_VNETPORT) {
1573 		/*
1574 		 * Being invoked from a vnet.
1575 		 */
1576 		ASSERT(arg != NULL);
1577 		tgt = arg;
1578 		D2(NULL, "%s: port %d : address 0x%llx", __func__,
1579 		    ((vsw_port_t *)arg)->p_instance, addr);
1580 	} else {
1581 		/*
1582 		 * We are being invoked via the m_multicst mac entry
1583 		 * point.
1584 		 */
1585 		D2(NULL, "%s: address 0x%llx", __func__, addr);
1586 		tgt = (void *)vswp;
1587 	}
1588 
1589 	WRITE_ENTER(&vswp->mfdbrw);
1590 	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr,
1591 	    (mod_hash_val_t *)&ment) != 0) {
1592 
1593 		/* address not currently in table */
1594 		ment = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP);
1595 		ment->d_addr = (void *)tgt;
1596 		ment->d_type = devtype;
1597 		ment->nextp = NULL;
1598 
1599 		if (mod_hash_insert(vswp->mfdb, (mod_hash_key_t)addr,
1600 		    (mod_hash_val_t)ment) != 0) {
1601 			DERR(vswp, "%s: hash table insertion failed", __func__);
1602 			kmem_free(ment, sizeof (mfdb_ent_t));
1603 			rv = 1;
1604 		} else {
1605 			D2(vswp, "%s: added initial entry for 0x%llx to "
1606 			    "table", __func__, addr);
1607 		}
1608 	} else {
1609 		/*
1610 		 * Address in table. Check to see if specified port
1611 		 * is already associated with the address. If not add
1612 		 * it now.
1613 		 */
1614 		tmp_ent = ment;
1615 		while (tmp_ent != NULL) {
1616 			if (tmp_ent->d_addr == (void *)tgt) {
1617 				if (devtype == VSW_VNETPORT) {
1618 					DERR(vswp, "%s: duplicate port entry "
1619 					    "found for portid %ld and key "
1620 					    "0x%llx", __func__,
1621 					    ((vsw_port_t *)arg)->p_instance,
1622 					    addr);
1623 				} else {
1624 					DERR(vswp, "%s: duplicate entry found"
1625 					    "for key 0x%llx", __func__, addr);
1626 				}
1627 				rv = 1;
1628 				dup = 1;
1629 				break;
1630 			}
1631 			tmp_ent = tmp_ent->nextp;
1632 		}
1633 
1634 		/*
1635 		 * Port not on list so add it to end now.
1636 		 */
1637 		if (0 == dup) {
1638 			D2(vswp, "%s: added entry for 0x%llx to table",
1639 			    __func__, addr);
1640 			new_ent = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP);
1641 			new_ent->d_addr = (void *)tgt;
1642 			new_ent->d_type = devtype;
1643 			new_ent->nextp = NULL;
1644 
1645 			tmp_ent = ment;
1646 			while (tmp_ent->nextp != NULL)
1647 				tmp_ent = tmp_ent->nextp;
1648 
1649 			tmp_ent->nextp = new_ent;
1650 		}
1651 	}
1652 
1653 	RW_EXIT(&vswp->mfdbrw);
1654 	return (rv);
1655 }
1656 
1657 /*
1658  * Remove a multicast entry from the hashtable.
1659  *
1660  * Search hash table based on address. If match found, scan
1661  * list of ports associated with address. If specified port
1662  * found remove it from list.
1663  */
1664 int
1665 vsw_del_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg)
1666 {
1667 	mfdb_ent_t	*ment = NULL;
1668 	mfdb_ent_t	*curr_p, *prev_p;
1669 	void		*tgt = NULL;
1670 
1671 	D1(vswp, "%s: enter", __func__);
1672 
1673 	if (devtype == VSW_VNETPORT) {
1674 		tgt = (vsw_port_t *)arg;
1675 		D2(vswp, "%s: removing port %d from mFDB for address"
1676 		    " 0x%llx", __func__, ((vsw_port_t *)tgt)->p_instance, addr);
1677 	} else {
1678 		D2(vswp, "%s: removing entry", __func__);
1679 		tgt = (void *)vswp;
1680 	}
1681 
1682 	WRITE_ENTER(&vswp->mfdbrw);
1683 	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr,
1684 	    (mod_hash_val_t *)&ment) != 0) {
1685 		D2(vswp, "%s: address 0x%llx not in table", __func__, addr);
1686 		RW_EXIT(&vswp->mfdbrw);
1687 		return (1);
1688 	}
1689 
1690 	prev_p = curr_p = ment;
1691 
1692 	while (curr_p != NULL) {
1693 		if (curr_p->d_addr == (void *)tgt) {
1694 			if (devtype == VSW_VNETPORT) {
1695 				D2(vswp, "%s: port %d found", __func__,
1696 				    ((vsw_port_t *)tgt)->p_instance);
1697 			} else {
1698 				D2(vswp, "%s: instance found", __func__);
1699 			}
1700 
1701 			if (prev_p == curr_p) {
1702 				/*
1703 				 * head of list, if no other element is in
1704 				 * list then destroy this entry, otherwise
1705 				 * just replace it with updated value.
1706 				 */
1707 				ment = curr_p->nextp;
1708 				if (ment == NULL) {
1709 					(void) mod_hash_destroy(vswp->mfdb,
1710 					    (mod_hash_val_t)addr);
1711 				} else {
1712 					(void) mod_hash_replace(vswp->mfdb,
1713 					    (mod_hash_key_t)addr,
1714 					    (mod_hash_val_t)ment);
1715 				}
1716 			} else {
1717 				/*
1718 				 * Not head of list, no need to do
1719 				 * replacement, just adjust list pointers.
1720 				 */
1721 				prev_p->nextp = curr_p->nextp;
1722 			}
1723 			break;
1724 		}
1725 
1726 		prev_p = curr_p;
1727 		curr_p = curr_p->nextp;
1728 	}
1729 
1730 	RW_EXIT(&vswp->mfdbrw);
1731 
1732 	D1(vswp, "%s: exit", __func__);
1733 
1734 	if (curr_p == NULL)
1735 		return (1);
1736 	kmem_free(curr_p, sizeof (mfdb_ent_t));
1737 	return (0);
1738 }
1739 
1740 /*
1741  * Port is being deleted, but has registered an interest in one
1742  * or more multicast groups. Using the list of addresses maintained
1743  * within the port structure find the appropriate entry in the hash
1744  * table and remove this port from the list of interested ports.
1745  */
1746 void
1747 vsw_del_mcst_port(vsw_port_t *port)
1748 {
1749 	mcst_addr_t	*mcap = NULL;
1750 	vsw_t		*vswp = port->p_vswp;
1751 
1752 	D1(vswp, "%s: enter", __func__);
1753 
1754 	mutex_enter(&port->mca_lock);
1755 
1756 	while ((mcap = port->mcap) != NULL) {
1757 
1758 		port->mcap = mcap->nextp;
1759 
1760 		mutex_exit(&port->mca_lock);
1761 
1762 		(void) vsw_del_mcst(vswp, VSW_VNETPORT,
1763 		    mcap->addr, port);
1764 
1765 		/*
1766 		 * Remove the address from HW. The address
1767 		 * will actually only be removed once the ref
1768 		 * count within the MAC layer has dropped to
1769 		 * zero. I.e. we can safely call this fn even
1770 		 * if other ports are interested in this
1771 		 * address.
1772 		 */
1773 		WRITE_ENTER(&vswp->mac_rwlock);
1774 		if (vswp->mh != NULL && mcap->mac_added) {
1775 			(void) mac_multicst_remove(vswp->mh,
1776 			    (uchar_t *)&mcap->mca);
1777 		}
1778 		RW_EXIT(&vswp->mac_rwlock);
1779 
1780 		kmem_free(mcap, sizeof (*mcap));
1781 
1782 		mutex_enter(&port->mca_lock);
1783 
1784 	}
1785 
1786 	mutex_exit(&port->mca_lock);
1787 
1788 	D1(vswp, "%s: exit", __func__);
1789 }
1790 
1791 /*
1792  * This vsw instance is detaching, but has registered an interest in one
1793  * or more multicast groups. Using the list of addresses maintained
1794  * within the vsw structure find the appropriate entry in the hash
1795  * table and remove this instance from the list of interested ports.
1796  */
1797 void
1798 vsw_del_mcst_vsw(vsw_t *vswp)
1799 {
1800 	mcst_addr_t	*next_p = NULL;
1801 
1802 	D1(vswp, "%s: enter", __func__);
1803 
1804 	mutex_enter(&vswp->mca_lock);
1805 
1806 	while (vswp->mcap != NULL) {
1807 		DERR(vswp, "%s: deleting addr 0x%llx",
1808 		    __func__, vswp->mcap->addr);
1809 		(void) vsw_del_mcst(vswp, VSW_LOCALDEV, vswp->mcap->addr, NULL);
1810 
1811 		next_p = vswp->mcap->nextp;
1812 		kmem_free(vswp->mcap, sizeof (mcst_addr_t));
1813 		vswp->mcap = next_p;
1814 	}
1815 
1816 	vswp->mcap = NULL;
1817 	mutex_exit(&vswp->mca_lock);
1818 
1819 	D1(vswp, "%s: exit", __func__);
1820 }
1821 
1822 static uint32_t
1823 vsw_get_same_dest_list(struct ether_header *ehp,
1824     mblk_t **rhead, mblk_t **rtail, mblk_t **mpp)
1825 {
1826 	uint32_t		count = 0;
1827 	mblk_t			*bp;
1828 	mblk_t			*nbp;
1829 	mblk_t			*head = NULL;
1830 	mblk_t			*tail = NULL;
1831 	mblk_t			*prev = NULL;
1832 	struct ether_header	*behp;
1833 
1834 	/* process the chain of packets */
1835 	bp = *mpp;
1836 	while (bp) {
1837 		nbp = bp->b_next;
1838 		behp = (struct ether_header *)bp->b_rptr;
1839 		bp->b_prev = NULL;
1840 		if (ether_cmp(&ehp->ether_dhost, &behp->ether_dhost) == 0) {
1841 			if (prev == NULL) {
1842 				*mpp = nbp;
1843 			} else {
1844 				prev->b_next = nbp;
1845 			}
1846 			bp->b_next =  NULL;
1847 			if (head == NULL) {
1848 				head = tail = bp;
1849 			} else {
1850 				tail->b_next = bp;
1851 				tail = bp;
1852 			}
1853 			count++;
1854 		} else {
1855 			prev = bp;
1856 		}
1857 		bp = nbp;
1858 	}
1859 	*rhead = head;
1860 	*rtail = tail;
1861 	DTRACE_PROBE1(vsw_same_dest, int, count);
1862 	return (count);
1863 }
1864 
1865 static mblk_t *
1866 vsw_dupmsgchain(mblk_t *mp)
1867 {
1868 	mblk_t	*nmp = NULL;
1869 	mblk_t	**nmpp = &nmp;
1870 
1871 	for (; mp != NULL; mp = mp->b_next) {
1872 		if ((*nmpp = dupmsg(mp)) == NULL) {
1873 			freemsgchain(nmp);
1874 			return (NULL);
1875 		}
1876 
1877 		nmpp = &((*nmpp)->b_next);
1878 	}
1879 
1880 	return (nmp);
1881 }
1882