xref: /titanic_44/usr/src/uts/sun4v/io/vsw_switching.c (revision 73a0bd151c1115bf39cc2caa30c7cbfdd86361c1)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/types.h>
28 #include <sys/errno.h>
29 #include <sys/debug.h>
30 #include <sys/time.h>
31 #include <sys/sysmacros.h>
32 #include <sys/systm.h>
33 #include <sys/user.h>
34 #include <sys/stropts.h>
35 #include <sys/stream.h>
36 #include <sys/strlog.h>
37 #include <sys/strsubr.h>
38 #include <sys/cmn_err.h>
39 #include <sys/cpu.h>
40 #include <sys/kmem.h>
41 #include <sys/conf.h>
42 #include <sys/ddi.h>
43 #include <sys/sunddi.h>
44 #include <sys/ksynch.h>
45 #include <sys/stat.h>
46 #include <sys/kstat.h>
47 #include <sys/vtrace.h>
48 #include <sys/strsun.h>
49 #include <sys/dlpi.h>
50 #include <sys/ethernet.h>
51 #include <net/if.h>
52 #include <sys/varargs.h>
53 #include <sys/machsystm.h>
54 #include <sys/modctl.h>
55 #include <sys/modhash.h>
56 #include <sys/mac.h>
57 #include <sys/mac_ether.h>
58 #include <sys/taskq.h>
59 #include <sys/note.h>
60 #include <sys/mach_descrip.h>
61 #include <sys/mdeg.h>
62 #include <sys/ldc.h>
63 #include <sys/vsw_fdb.h>
64 #include <sys/vsw.h>
65 #include <sys/vio_mailbox.h>
66 #include <sys/vnet_mailbox.h>
67 #include <sys/vnet_common.h>
68 #include <sys/vio_util.h>
69 #include <sys/sdt.h>
70 #include <sys/atomic.h>
71 #include <sys/vlan.h>
72 
73 /* Switching setup routines */
74 void vsw_setup_switching_thread(void *arg);
75 int vsw_setup_switching_start(vsw_t *vswp);
76 void vsw_setup_switching_stop(vsw_t *vswp);
77 int vsw_setup_switching(vsw_t *);
78 void vsw_setup_layer2_post_process(vsw_t *vswp);
79 void vsw_switch_frame_nop(vsw_t *vswp, mblk_t *mp, int caller,
80     vsw_port_t *port, mac_resource_handle_t mrh);
81 static	int vsw_setup_layer2(vsw_t *);
82 static	int vsw_setup_layer3(vsw_t *);
83 
84 /* Switching/data transmit routines */
85 static	void vsw_switch_l2_frame_mac_client(vsw_t *vswp, mblk_t *mp, int caller,
86     vsw_port_t *port, mac_resource_handle_t);
87 static	void vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller,
88 	vsw_port_t *port, mac_resource_handle_t);
89 static	void vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller,
90 	vsw_port_t *port, mac_resource_handle_t);
91 static	int vsw_forward_all(vsw_t *vswp, mblk_t *mp,
92 	int caller, vsw_port_t *port);
93 static	int vsw_forward_grp(vsw_t *vswp, mblk_t *mp,
94     int caller, vsw_port_t *port);
95 
96 /* VLAN routines */
97 void vsw_create_vlans(void *arg, int type);
98 void vsw_destroy_vlans(void *arg, int type);
99 void vsw_vlan_add_ids(void *arg, int type);
100 void vsw_vlan_remove_ids(void *arg, int type);
101 static	void vsw_vlan_create_hash(void *arg, int type);
102 static	void vsw_vlan_destroy_hash(void *arg, int type);
103 boolean_t vsw_frame_lookup_vid(void *arg, int caller, struct ether_header *ehp,
104 	uint16_t *vidp);
105 mblk_t *vsw_vlan_frame_pretag(void *arg, int type, mblk_t *mp);
106 uint32_t vsw_vlan_frames_untag(void *arg, int type, mblk_t **np, mblk_t **npt);
107 boolean_t vsw_vlan_lookup(mod_hash_t *vlan_hashp, uint16_t vid);
108 
109 /* Forwarding database (FDB) routines */
110 void vsw_fdbe_add(vsw_t *vswp, void *port);
111 void vsw_fdbe_del(vsw_t *vswp, struct ether_addr *eaddr);
112 static	vsw_fdbe_t *vsw_fdbe_find(vsw_t *vswp, struct ether_addr *);
113 static void vsw_fdbe_find_cb(mod_hash_key_t key, mod_hash_val_t val);
114 
115 int vsw_add_rem_mcst(vnet_mcast_msg_t *, vsw_port_t *);
116 int vsw_add_mcst(vsw_t *, uint8_t, uint64_t, void *);
117 int vsw_del_mcst(vsw_t *, uint8_t, uint64_t, void *);
118 void vsw_del_mcst_vsw(vsw_t *);
119 
120 /* Support functions */
121 static mblk_t *vsw_dupmsgchain(mblk_t *mp);
122 static mblk_t *vsw_get_same_dest_list(struct ether_header *ehp, mblk_t **mpp);
123 
124 
125 /*
126  * Functions imported from other files.
127  */
128 extern mblk_t *vsw_tx_msg(vsw_t *, mblk_t *, int, vsw_port_t *);
129 extern mcst_addr_t *vsw_del_addr(uint8_t, void *, uint64_t);
130 extern int vsw_mac_open(vsw_t *vswp);
131 extern void vsw_mac_close(vsw_t *vswp);
132 extern void vsw_mac_rx(vsw_t *vswp, mac_resource_handle_t mrh,
133     mblk_t *mp, vsw_macrx_flags_t flags);
134 extern void vsw_set_addrs(vsw_t *vswp);
135 extern int vsw_portsend(vsw_port_t *port, mblk_t *mp);
136 extern void vsw_hio_init(vsw_t *vswp);
137 extern void vsw_hio_start_ports(vsw_t *vswp);
138 extern int vsw_mac_multicast_add(vsw_t *vswp, vsw_port_t *port,
139     mcst_addr_t *mcst_p, int type);
140 extern void vsw_mac_multicast_remove(vsw_t *vswp, vsw_port_t *port,
141     mcst_addr_t *mcst_p, int type);
142 
143 /*
144  * Tunables used in this file.
145  */
146 extern	int vsw_setup_switching_delay;
147 extern	uint32_t vsw_vlan_nchains;
148 extern	uint32_t vsw_fdbe_refcnt_delay;
149 
/*
 * Take a reference on an FDB entry by atomically incrementing its refcnt.
 * The ASSERT fires if the counter reads zero after the increment.
 *
 * The body is wrapped in do/while (0) so that the macro expands to exactly
 * one statement and remains safe in an unbraced if/else.
 */
#define	VSW_FDBE_REFHOLD(p)						\
do {									\
	atomic_inc_32(&(p)->refcnt);					\
	ASSERT((p)->refcnt != 0);					\
} while (0)
155 
/*
 * Drop a reference on an FDB entry by atomically decrementing its refcnt.
 * The ASSERT fires if the counter is already zero before the decrement.
 *
 * The body is wrapped in do/while (0) so that the macro expands to exactly
 * one statement and remains safe in an unbraced if/else.
 */
#define	VSW_FDBE_REFRELE(p)						\
do {									\
	ASSERT((p)->refcnt != 0);					\
	atomic_dec_32(&(p)->refcnt);					\
} while (0)
161 
162 /*
163  * Thread to setup switching mode. This thread is created during vsw_attach()
164  * initially. It invokes vsw_setup_switching() and keeps retrying while the
165  * returned value is EAGAIN. The thread exits when the switching mode setup is
166  * done successfully or when the error returned is not EAGAIN. This thread may
167  * also get created from vsw_update_md_prop() if the switching mode needs to be
168  * updated.
169  */
170 void
171 vsw_setup_switching_thread(void *arg)
172 {
173 	callb_cpr_t	cprinfo;
174 	vsw_t		*vswp =  (vsw_t *)arg;
175 	clock_t		wait_time;
176 	clock_t		xwait;
177 	clock_t		wait_rv;
178 	int		rv;
179 
180 	/* wait time used on successive retries */
181 	xwait = drv_usectohz(vsw_setup_switching_delay * MICROSEC);
182 
183 	CALLB_CPR_INIT(&cprinfo, &vswp->sw_thr_lock, callb_generic_cpr,
184 	    "vsw_setup_sw_thread");
185 
186 	mutex_enter(&vswp->sw_thr_lock);
187 
188 	while ((vswp->sw_thr_flags & VSW_SWTHR_STOP) == 0) {
189 
190 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
191 
192 		/* Wait for sometime before (re)trying setup_switching() */
193 		wait_time = ddi_get_lbolt() + xwait;
194 		while ((vswp->sw_thr_flags & VSW_SWTHR_STOP) == 0) {
195 			wait_rv = cv_timedwait(&vswp->sw_thr_cv,
196 			    &vswp->sw_thr_lock, wait_time);
197 			if (wait_rv == -1) {	/* timed out */
198 				break;
199 			}
200 		}
201 
202 		CALLB_CPR_SAFE_END(&cprinfo, &vswp->sw_thr_lock)
203 
204 		if ((vswp->sw_thr_flags & VSW_SWTHR_STOP) != 0) {
205 			/*
206 			 * If there is a stop request, process that first and
207 			 * exit the loop. Continue to hold the mutex which gets
208 			 * released in CALLB_CPR_EXIT().
209 			 */
210 			break;
211 		}
212 
213 		mutex_exit(&vswp->sw_thr_lock);
214 		rv = vsw_setup_switching(vswp);
215 		if (rv == 0) {
216 			vsw_setup_layer2_post_process(vswp);
217 		}
218 		mutex_enter(&vswp->sw_thr_lock);
219 		if (rv != EAGAIN) {
220 			break;
221 		}
222 
223 	}
224 
225 	vswp->sw_thr_flags &= ~VSW_SWTHR_STOP;
226 	vswp->sw_thread = NULL;
227 	CALLB_CPR_EXIT(&cprinfo);
228 	thread_exit();
229 }
230 
231 /*
232  * Create a thread to setup the switching mode.
233  * Returns 0 on success; 1 on failure.
234  */
235 int
236 vsw_setup_switching_start(vsw_t *vswp)
237 {
238 	mutex_enter(&vswp->sw_thr_lock);
239 
240 	vswp->sw_thread = thread_create(NULL, 2 * DEFAULTSTKSZ,
241 	    vsw_setup_switching_thread, vswp, 0, &p0, TS_RUN, minclsyspri);
242 
243 	if (vswp->sw_thread == NULL) {
244 		mutex_exit(&vswp->sw_thr_lock);
245 		return (1);
246 	}
247 
248 	mutex_exit(&vswp->sw_thr_lock);
249 	return (0);
250 }
251 
252 /*
253  * Stop the thread to setup switching mode.
254  */
255 void
256 vsw_setup_switching_stop(vsw_t *vswp)
257 {
258 	kt_did_t	tid = 0;
259 
260 	/*
261 	 * Signal the setup_switching thread to stop and wait until it stops.
262 	 */
263 	mutex_enter(&vswp->sw_thr_lock);
264 
265 	if (vswp->sw_thread != NULL) {
266 		tid = vswp->sw_thread->t_did;
267 		vswp->sw_thr_flags |= VSW_SWTHR_STOP;
268 		cv_signal(&vswp->sw_thr_cv);
269 	}
270 
271 	mutex_exit(&vswp->sw_thr_lock);
272 
273 	if (tid != 0)
274 		thread_join(tid);
275 
276 	(void) atomic_swap_32(&vswp->switching_setup_done, B_FALSE);
277 
278 	vswp->mac_open_retries = 0;
279 }
280 
281 /*
282  * Setup the required switching mode.
283  * Returns:
284  *  0 on success.
285  *  EAGAIN if retry is needed.
286  *  1 on all other failures.
287  */
288 int
289 vsw_setup_switching(vsw_t *vswp)
290 {
291 	int	rv = 1;
292 
293 	D1(vswp, "%s: enter", __func__);
294 
295 	/*
296 	 * Select best switching mode.
297 	 * This is done as this routine can be called from the timeout
298 	 * handler to retry setting up a specific mode. Currently only
299 	 * the function which sets up layer2/promisc mode returns EAGAIN
300 	 * if the underlying network device is not available yet, causing
301 	 * retries.
302 	 */
303 	if (vswp->smode & VSW_LAYER2) {
304 		rv = vsw_setup_layer2(vswp);
305 	} else if (vswp->smode & VSW_LAYER3) {
306 		rv = vsw_setup_layer3(vswp);
307 	} else {
308 		DERR(vswp, "unknown switch mode");
309 		rv = 1;
310 	}
311 
312 	if (rv && (rv != EAGAIN)) {
313 		cmn_err(CE_WARN, "!vsw%d: Unable to setup specified "
314 		    "switching mode", vswp->instance);
315 	} else if (rv == 0) {
316 		(void) atomic_swap_32(&vswp->switching_setup_done, B_TRUE);
317 	}
318 
319 	D2(vswp, "%s: Operating in mode %d", __func__,
320 	    vswp->smode);
321 
322 	D1(vswp, "%s: exit", __func__);
323 
324 	return (rv);
325 }
326 
327 /*
328  * Setup for layer 2 switching.
329  *
330  * Returns:
331  *  0 on success.
332  *  EAGAIN if retry is needed.
333  *  EIO on all other failures.
334  */
335 static int
336 vsw_setup_layer2(vsw_t *vswp)
337 {
338 	int	rv;
339 
340 	D1(vswp, "%s: enter", __func__);
341 
342 	/*
343 	 * Until the network device is successfully opened,
344 	 * set the switching to use vsw_switch_l2_frame.
345 	 */
346 	vswp->vsw_switch_frame = vsw_switch_l2_frame;
347 	vswp->mac_cl_switching = B_FALSE;
348 
349 	rv = strlen(vswp->physname);
350 	if (rv == 0) {
351 		/*
352 		 * Physical device name is NULL, which is
353 		 * required for layer 2.
354 		 */
355 		cmn_err(CE_WARN, "!vsw%d: no network device name specified",
356 		    vswp->instance);
357 		return (EIO);
358 	}
359 
360 	mutex_enter(&vswp->mac_lock);
361 
362 	rv = vsw_mac_open(vswp);
363 	if (rv != 0) {
364 		if (rv != EAGAIN) {
365 			cmn_err(CE_WARN, "!vsw%d: Unable to open network "
366 			    "device: %s\n", vswp->instance, vswp->physname);
367 		}
368 		mutex_exit(&vswp->mac_lock);
369 		return (rv);
370 	}
371 
372 	/*
373 	 * Now we can use the mac client switching, so set the switching
374 	 * function to use vsw_switch_l2_frame_mac_client(), which simply
375 	 * sends the packets to MAC layer for switching.
376 	 */
377 	vswp->vsw_switch_frame = vsw_switch_l2_frame_mac_client;
378 	vswp->mac_cl_switching = B_TRUE;
379 
380 	D1(vswp, "%s: exit", __func__);
381 
382 	/* Initialize HybridIO related stuff */
383 	vsw_hio_init(vswp);
384 
385 	mutex_exit(&vswp->mac_lock);
386 	return (0);
387 
388 exit_error:
389 	vsw_mac_close(vswp);
390 	mutex_exit(&vswp->mac_lock);
391 	return (EIO);
392 }
393 
394 static int
395 vsw_setup_layer3(vsw_t *vswp)
396 {
397 	D1(vswp, "%s: enter", __func__);
398 
399 	D2(vswp, "%s: operating in layer 3 mode", __func__);
400 	vswp->vsw_switch_frame = vsw_switch_l3_frame;
401 
402 	D1(vswp, "%s: exit", __func__);
403 
404 	return (0);
405 }
406 
407 /* ARGSUSED */
408 void
409 vsw_switch_frame_nop(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *port,
410 			mac_resource_handle_t mrh)
411 {
412 	freemsgchain(mp);
413 }
414 
415 /*
416  * Use mac client for layer 2 switching .
417  */
418 static void
419 vsw_switch_l2_frame_mac_client(vsw_t *vswp, mblk_t *mp, int caller,
420     vsw_port_t *port, mac_resource_handle_t mrh)
421 {
422 	_NOTE(ARGUNUSED(mrh))
423 
424 	mblk_t		*ret_m;
425 
426 	/*
427 	 * This switching function is expected to be called by
428 	 * the ports or the interface only. The packets from
429 	 * physical interface already switched.
430 	 */
431 	ASSERT((caller == VSW_VNETPORT) || (caller == VSW_LOCALDEV));
432 
433 	if ((ret_m = vsw_tx_msg(vswp, mp, caller, port)) != NULL) {
434 		DERR(vswp, "%s: drop mblks to "
435 		    "phys dev", __func__);
436 		freemsgchain(ret_m);
437 	}
438 }
439 
440 /*
441  * Switch the given ethernet frame when operating in layer 2 mode.
442  *
443  * vswp: pointer to the vsw instance
444  * mp: pointer to chain of ethernet frame(s) to be switched
445  * caller: identifies the source of this frame as:
446  * 		1. VSW_VNETPORT - a vsw port (connected to a vnet).
447  *		2. VSW_PHYSDEV - the physical ethernet device
448  *		3. VSW_LOCALDEV - vsw configured as a virtual interface
449  * arg: argument provided by the caller.
450  *		1. for VNETPORT - pointer to the corresponding vsw_port_t.
451  *		2. for PHYSDEV - NULL
452  *		3. for LOCALDEV - pointer to to this vsw_t(self)
453  */
454 void
455 vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller,
456 			vsw_port_t *arg, mac_resource_handle_t mrh)
457 {
458 	struct ether_header	*ehp;
459 	mblk_t			*bp, *ret_m;
460 	vsw_fdbe_t		*fp;
461 
462 	D1(vswp, "%s: enter (caller %d)", __func__, caller);
463 
464 	/*
465 	 * PERF: rather than breaking up the chain here, scan it
466 	 * to find all mblks heading to same destination and then
467 	 * pass that sub-chain to the lower transmit functions.
468 	 */
469 
470 	/* process the chain of packets */
471 	bp = mp;
472 	while (bp) {
473 		ehp = (struct ether_header *)bp->b_rptr;
474 		mp = vsw_get_same_dest_list(ehp, &bp);
475 		ASSERT(mp != NULL);
476 
477 		D2(vswp, "%s: mblk data buffer %lld : actual data size %lld",
478 		    __func__, MBLKSIZE(mp), MBLKL(mp));
479 
480 		if (ether_cmp(&ehp->ether_dhost, &vswp->if_addr) == 0) {
481 			/*
482 			 * If destination is VSW_LOCALDEV (vsw as an eth
483 			 * interface) and if the device is up & running,
484 			 * send the packet up the stack on this host.
485 			 * If the virtual interface is down, drop the packet.
486 			 */
487 			if (caller != VSW_LOCALDEV) {
488 				vsw_mac_rx(vswp, mrh, mp, VSW_MACRX_FREEMSG);
489 			} else {
490 				freemsgchain(mp);
491 			}
492 			continue;
493 		}
494 
495 		/*
496 		 * Find fdb entry for the destination
497 		 * and hold a reference to it.
498 		 */
499 		fp = vsw_fdbe_find(vswp, &ehp->ether_dhost);
500 		if (fp != NULL) {
501 
502 			/*
503 			 * If plumbed and in promisc mode then copy msg
504 			 * and send up the stack.
505 			 */
506 			vsw_mac_rx(vswp, mrh, mp,
507 			    VSW_MACRX_PROMISC | VSW_MACRX_COPYMSG);
508 
509 			/*
510 			 * If the destination is in FDB, the packet
511 			 * should be forwarded to the correponding
512 			 * vsw_port (connected to a vnet device -
513 			 * VSW_VNETPORT)
514 			 */
515 			(void) vsw_portsend(fp->portp, mp);
516 
517 			/* Release the reference on the fdb entry */
518 			VSW_FDBE_REFRELE(fp);
519 		} else {
520 			/*
521 			 * Destination not in FDB.
522 			 *
523 			 * If the destination is broadcast or
524 			 * multicast forward the packet to all
525 			 * (VNETPORTs, PHYSDEV, LOCALDEV),
526 			 * except the caller.
527 			 */
528 			if (IS_BROADCAST(ehp)) {
529 				D2(vswp, "%s: BROADCAST pkt", __func__);
530 				(void) vsw_forward_all(vswp, mp, caller, arg);
531 			} else if (IS_MULTICAST(ehp)) {
532 				D2(vswp, "%s: MULTICAST pkt", __func__);
533 				(void) vsw_forward_grp(vswp, mp, caller, arg);
534 			} else {
535 				/*
536 				 * If the destination is unicast, and came
537 				 * from either a logical network device or
538 				 * the switch itself when it is plumbed, then
539 				 * send it out on the physical device and also
540 				 * up the stack if the logical interface is
541 				 * in promiscious mode.
542 				 *
543 				 * NOTE:  The assumption here is that if we
544 				 * cannot find the destination in our fdb, its
545 				 * a unicast address, and came from either a
546 				 * vnet or down the stack (when plumbed) it
547 				 * must be destinded for an ethernet device
548 				 * outside our ldoms.
549 				 */
550 				if (caller == VSW_VNETPORT) {
551 					/* promisc check copy etc */
552 					vsw_mac_rx(vswp, mrh, mp,
553 					    VSW_MACRX_PROMISC |
554 					    VSW_MACRX_COPYMSG);
555 
556 					if ((ret_m = vsw_tx_msg(vswp, mp,
557 					    caller, arg)) != NULL) {
558 						DERR(vswp, "%s: drop mblks to "
559 						    "phys dev", __func__);
560 						freemsgchain(ret_m);
561 					}
562 
563 				} else if (caller == VSW_PHYSDEV) {
564 					/*
565 					 * Pkt seen because card in promisc
566 					 * mode. Send up stack if plumbed in
567 					 * promisc mode, else drop it.
568 					 */
569 					vsw_mac_rx(vswp, mrh, mp,
570 					    VSW_MACRX_PROMISC |
571 					    VSW_MACRX_FREEMSG);
572 
573 				} else if (caller == VSW_LOCALDEV) {
574 					/*
575 					 * Pkt came down the stack, send out
576 					 * over physical device.
577 					 */
578 					if ((ret_m = vsw_tx_msg(vswp, mp,
579 					    caller, NULL)) != NULL) {
580 						DERR(vswp, "%s: drop mblks to "
581 						    "phys dev", __func__);
582 						freemsgchain(ret_m);
583 					}
584 				}
585 			}
586 		}
587 	}
588 	D1(vswp, "%s: exit\n", __func__);
589 }
590 
591 /*
592  * Switch ethernet frame when in layer 3 mode (i.e. using IP
593  * layer to do the routing).
594  *
595  * There is a large amount of overlap between this function and
596  * vsw_switch_l2_frame. At some stage we need to revisit and refactor
597  * both these functions.
598  */
599 void
600 vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller,
601 			vsw_port_t *arg, mac_resource_handle_t mrh)
602 {
603 	struct ether_header	*ehp;
604 	mblk_t			*bp = NULL;
605 	vsw_fdbe_t		*fp;
606 
607 	D1(vswp, "%s: enter (caller %d)", __func__, caller);
608 
609 	/*
610 	 * In layer 3 mode should only ever be switching packets
611 	 * between IP layer and vnet devices. So make sure thats
612 	 * who is invoking us.
613 	 */
614 	if ((caller != VSW_LOCALDEV) && (caller != VSW_VNETPORT)) {
615 		DERR(vswp, "%s: unexpected caller (%d)", __func__, caller);
616 		freemsgchain(mp);
617 		return;
618 	}
619 
620 	/* process the chain of packets */
621 	bp = mp;
622 	while (bp) {
623 		ehp = (struct ether_header *)bp->b_rptr;
624 		mp = vsw_get_same_dest_list(ehp, &bp);
625 		ASSERT(mp != NULL);
626 
627 		D2(vswp, "%s: mblk data buffer %lld : actual data size %lld",
628 		    __func__, MBLKSIZE(mp), MBLKL(mp));
629 
630 		/*
631 		 * Find fdb entry for the destination
632 		 * and hold a reference to it.
633 		 */
634 		fp = vsw_fdbe_find(vswp, &ehp->ether_dhost);
635 		if (fp != NULL) {
636 
637 			D2(vswp, "%s: sending to target port", __func__);
638 			(void) vsw_portsend(fp->portp, mp);
639 
640 			/* Release the reference on the fdb entry */
641 			VSW_FDBE_REFRELE(fp);
642 		} else {
643 			/*
644 			 * Destination not in FDB
645 			 *
646 			 * If the destination is broadcast or
647 			 * multicast forward the packet to all
648 			 * (VNETPORTs, PHYSDEV, LOCALDEV),
649 			 * except the caller.
650 			 */
651 			if (IS_BROADCAST(ehp)) {
652 				D2(vswp, "%s: BROADCAST pkt", __func__);
653 				(void) vsw_forward_all(vswp, mp, caller, arg);
654 			} else if (IS_MULTICAST(ehp)) {
655 				D2(vswp, "%s: MULTICAST pkt", __func__);
656 				(void) vsw_forward_grp(vswp, mp, caller, arg);
657 			} else {
658 				/*
659 				 * Unicast pkt from vnet that we don't have
660 				 * an FDB entry for, so must be destinded for
661 				 * the outside world. Attempt to send up to the
662 				 * IP layer to allow it to deal with it.
663 				 */
664 				if (caller == VSW_VNETPORT) {
665 					vsw_mac_rx(vswp, mrh,
666 					    mp, VSW_MACRX_FREEMSG);
667 				}
668 			}
669 		}
670 	}
671 
672 	D1(vswp, "%s: exit", __func__);
673 }
674 
675 /*
676  * Setup mac addrs and hio resources for layer 2 switching only.
677  */
678 void
679 vsw_setup_layer2_post_process(vsw_t *vswp)
680 {
681 	if (vswp->smode & VSW_LAYER2) {
682 		/*
683 		 * Program unicst, mcst addrs of vsw
684 		 * interface and ports in the physdev.
685 		 */
686 		vsw_set_addrs(vswp);
687 
688 		/* Start HIO for ports that have already connected */
689 		vsw_hio_start_ports(vswp);
690 	}
691 }
692 
693 /*
694  * Forward the ethernet frame to all ports (VNETPORTs, PHYSDEV, LOCALDEV),
695  * except the caller (port on which frame arrived).
696  */
697 static int
698 vsw_forward_all(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg)
699 {
700 	vsw_port_list_t	*plist = &vswp->plist;
701 	vsw_port_t	*portp;
702 	mblk_t		*nmp = NULL;
703 	mblk_t		*ret_m = NULL;
704 	int		skip_port = 0;
705 
706 	D1(vswp, "vsw_forward_all: enter\n");
707 
708 	/*
709 	 * Broadcast message from inside ldoms so send to outside
710 	 * world if in either of layer 2 modes.
711 	 */
712 	if ((vswp->smode & VSW_LAYER2) &&
713 	    ((caller == VSW_LOCALDEV) || (caller == VSW_VNETPORT))) {
714 
715 		nmp = vsw_dupmsgchain(mp);
716 		if (nmp) {
717 			if ((ret_m = vsw_tx_msg(vswp, nmp, caller, arg))
718 			    != NULL) {
719 				DERR(vswp, "%s: dropping pkt(s) "
720 				    "consisting of %ld bytes of data for"
721 				    " physical device", __func__, MBLKL(ret_m));
722 				freemsgchain(ret_m);
723 			}
724 		}
725 	}
726 
727 	if (caller == VSW_VNETPORT)
728 		skip_port = 1;
729 
730 	/*
731 	 * Broadcast message from other vnet (layer 2 or 3) or outside
732 	 * world (layer 2 only), send up stack if plumbed.
733 	 */
734 	if ((caller == VSW_PHYSDEV) || (caller == VSW_VNETPORT)) {
735 		vsw_mac_rx(vswp, NULL, mp, VSW_MACRX_COPYMSG);
736 	}
737 
738 	/* send it to all VNETPORTs */
739 	READ_ENTER(&plist->lockrw);
740 	for (portp = plist->head; portp != NULL; portp = portp->p_next) {
741 		D2(vswp, "vsw_forward_all: port %d", portp->p_instance);
742 		/*
743 		 * Caution ! - don't reorder these two checks as arg
744 		 * will be NULL if the caller is PHYSDEV. skip_port is
745 		 * only set if caller is VNETPORT.
746 		 */
747 		if ((skip_port) && (portp == arg)) {
748 			continue;
749 		} else {
750 			nmp = vsw_dupmsgchain(mp);
751 			if (nmp) {
752 				/*
753 				 * The plist->lockrw is protecting the
754 				 * portp from getting destroyed here.
755 				 * So, no ref_cnt is incremented here.
756 				 */
757 				(void) vsw_portsend(portp, nmp);
758 			} else {
759 				DERR(vswp, "vsw_forward_all: nmp NULL");
760 			}
761 		}
762 	}
763 	RW_EXIT(&plist->lockrw);
764 
765 	freemsgchain(mp);
766 
767 	D1(vswp, "vsw_forward_all: exit\n");
768 	return (0);
769 }
770 
771 /*
772  * Forward pkts to any devices or interfaces which have registered
773  * an interest in them (i.e. multicast groups).
774  */
775 static int
776 vsw_forward_grp(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg)
777 {
778 	struct ether_header	*ehp = (struct ether_header *)mp->b_rptr;
779 	mfdb_ent_t		*entp = NULL;
780 	mfdb_ent_t		*tpp = NULL;
781 	vsw_port_t 		*port;
782 	uint64_t		key = 0;
783 	mblk_t			*nmp = NULL;
784 	mblk_t			*ret_m = NULL;
785 	boolean_t		check_if = B_TRUE;
786 
787 	/*
788 	 * Convert address to hash table key
789 	 */
790 	KEY_HASH(key, &ehp->ether_dhost);
791 
792 	D1(vswp, "%s: key 0x%llx", __func__, key);
793 
794 	/*
795 	 * If pkt came from either a vnet or down the stack (if we are
796 	 * plumbed) and we are in layer 2 mode, then we send the pkt out
797 	 * over the physical adapter, and then check to see if any other
798 	 * vnets are interested in it.
799 	 */
800 	if ((vswp->smode & VSW_LAYER2) &&
801 	    ((caller == VSW_VNETPORT) || (caller == VSW_LOCALDEV))) {
802 		nmp = vsw_dupmsgchain(mp);
803 		if (nmp) {
804 			if ((ret_m = vsw_tx_msg(vswp, nmp, caller, arg))
805 			    != NULL) {
806 				DERR(vswp, "%s: dropping pkt(s) consisting of "
807 				    "%ld bytes of data for physical device",
808 				    __func__, MBLKL(ret_m));
809 				freemsgchain(ret_m);
810 			}
811 		}
812 	}
813 
814 	READ_ENTER(&vswp->mfdbrw);
815 	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)key,
816 	    (mod_hash_val_t *)&entp) != 0) {
817 		D3(vswp, "%s: no table entry found for addr 0x%llx",
818 		    __func__, key);
819 	} else {
820 		/*
821 		 * Send to list of devices associated with this address...
822 		 */
823 		for (tpp = entp; tpp != NULL; tpp = tpp->nextp) {
824 
825 			/* dont send to ourselves */
826 			if ((caller == VSW_VNETPORT) &&
827 			    (tpp->d_addr == (void *)arg)) {
828 				port = (vsw_port_t *)tpp->d_addr;
829 				D3(vswp, "%s: not sending to ourselves"
830 				    " : port %d", __func__, port->p_instance);
831 				continue;
832 
833 			} else if ((caller == VSW_LOCALDEV) &&
834 			    (tpp->d_type == VSW_LOCALDEV)) {
835 				D2(vswp, "%s: not sending back up stack",
836 				    __func__);
837 				continue;
838 			}
839 
840 			if (tpp->d_type == VSW_VNETPORT) {
841 				port = (vsw_port_t *)tpp->d_addr;
842 				D3(vswp, "%s: sending to port %ld for addr "
843 				    "0x%llx", __func__, port->p_instance, key);
844 
845 				nmp = vsw_dupmsgchain(mp);
846 				if (nmp) {
847 					/*
848 					 * The vswp->mfdbrw is protecting the
849 					 * portp from getting destroyed here.
850 					 * So, no ref_cnt is incremented here.
851 					 */
852 					(void) vsw_portsend(port, nmp);
853 				}
854 			} else {
855 				vsw_mac_rx(vswp, NULL,
856 				    mp, VSW_MACRX_COPYMSG);
857 				D2(vswp, "%s: sending up stack"
858 				    " for addr 0x%llx", __func__, key);
859 				check_if = B_FALSE;
860 			}
861 		}
862 	}
863 
864 	RW_EXIT(&vswp->mfdbrw);
865 
866 	/*
867 	 * If the pkt came from either a vnet or from physical device,
868 	 * and if we havent already sent the pkt up the stack then we
869 	 * check now if we can/should (i.e. the interface is plumbed
870 	 * and in promisc mode).
871 	 */
872 	if ((check_if) &&
873 	    ((caller == VSW_VNETPORT) || (caller == VSW_PHYSDEV))) {
874 		vsw_mac_rx(vswp, NULL, mp,
875 		    VSW_MACRX_PROMISC | VSW_MACRX_COPYMSG);
876 	}
877 
878 	freemsgchain(mp);
879 
880 	D1(vswp, "%s: exit", __func__);
881 
882 	return (0);
883 }
884 
/*
 * This function creates the vlan id hash table for the given vsw device or
 * port. It then adds each vlan that the device or port has been assigned,
 * into this hash table.
 * Arguments:
 *   arg:  vsw device or port.
 *   type: type of arg; VSW_LOCALDEV(vsw device) or VSW_VNETPORT(port).
 */
void
vsw_create_vlans(void *arg, int type)
{
	/* create the vlan hash table for the device or port */
	vsw_vlan_create_hash(arg, type);

	/* add the vlan ids of the device or port into its hash table */
	vsw_vlan_add_ids(arg, type);
}
902 
/*
 * This function removes the vlan ids of the vsw device or port from its hash
 * table. It then destroys the vlan hash table.
 * Arguments:
 *   arg:  vsw device or port.
 *   type: type of arg; VSW_LOCALDEV(vsw device) or VSW_VNETPORT(port).
 */
void
vsw_destroy_vlans(void *arg, int type)
{
	/* remove the vlan ids of the device or port from its hash table */
	vsw_vlan_remove_ids(arg, type);

	/* destroy the now emptied vlan hash table */
	vsw_vlan_destroy_hash(arg, type);
}
919 
920 /*
921  * Create a vlan-id hash table for the given vsw device or port.
922  */
923 static void
924 vsw_vlan_create_hash(void *arg, int type)
925 {
926 	char		hashname[MAXNAMELEN];
927 
928 	if (type == VSW_LOCALDEV) {
929 		vsw_t		*vswp = (vsw_t *)arg;
930 
931 		(void) snprintf(hashname, MAXNAMELEN, "vsw%d-vlan-hash",
932 		    vswp->instance);
933 
934 		vswp->vlan_nchains = vsw_vlan_nchains;
935 		vswp->vlan_hashp = mod_hash_create_idhash(hashname,
936 		    vswp->vlan_nchains, mod_hash_null_valdtor);
937 
938 	} else if (type == VSW_VNETPORT) {
939 		vsw_port_t	*portp = (vsw_port_t *)arg;
940 
941 		(void) snprintf(hashname, MAXNAMELEN, "port%d-vlan-hash",
942 		    portp->p_instance);
943 
944 		portp->vlan_nchains = vsw_vlan_nchains;
945 		portp->vlan_hashp = mod_hash_create_idhash(hashname,
946 		    portp->vlan_nchains, mod_hash_null_valdtor);
947 
948 	} else {
949 		return;
950 	}
951 }
952 
953 /*
954  * Destroy the vlan-id hash table for the given vsw device or port.
955  */
956 static void
957 vsw_vlan_destroy_hash(void *arg, int type)
958 {
959 	if (type == VSW_LOCALDEV) {
960 		vsw_t		*vswp = (vsw_t *)arg;
961 
962 		mod_hash_destroy_hash(vswp->vlan_hashp);
963 		vswp->vlan_nchains = 0;
964 	} else if (type == VSW_VNETPORT) {
965 		vsw_port_t	*portp = (vsw_port_t *)arg;
966 
967 		mod_hash_destroy_hash(portp->vlan_hashp);
968 		portp->vlan_nchains = 0;
969 	} else {
970 		return;
971 	}
972 }
973 
974 /*
975  * Add vlan ids of the given vsw device or port into its hash table.
976  */
977 void
978 vsw_vlan_add_ids(void *arg, int type)
979 {
980 	int	rv;
981 	int	i;
982 
983 	if (type == VSW_LOCALDEV) {
984 		vsw_t		*vswp = (vsw_t *)arg;
985 
986 		rv = mod_hash_insert(vswp->vlan_hashp,
987 		    (mod_hash_key_t)VLAN_ID_KEY(vswp->pvid),
988 		    (mod_hash_val_t)B_TRUE);
989 		if (rv != 0) {
990 			cmn_err(CE_WARN, "vsw%d: Duplicate vlan-id(%d) for "
991 			    "the interface", vswp->instance, vswp->pvid);
992 		}
993 
994 		for (i = 0; i < vswp->nvids; i++) {
995 			rv = mod_hash_insert(vswp->vlan_hashp,
996 			    (mod_hash_key_t)VLAN_ID_KEY(vswp->vids[i].vl_vid),
997 			    (mod_hash_val_t)B_TRUE);
998 			if (rv != 0) {
999 				cmn_err(CE_WARN, "vsw%d: Duplicate vlan-id(%d)"
1000 				    " for the interface", vswp->instance,
1001 				    vswp->pvid);
1002 			}
1003 		}
1004 
1005 	} else if (type == VSW_VNETPORT) {
1006 		vsw_port_t	*portp = (vsw_port_t *)arg;
1007 		vsw_t		*vswp = portp->p_vswp;
1008 
1009 		rv = mod_hash_insert(portp->vlan_hashp,
1010 		    (mod_hash_key_t)VLAN_ID_KEY(portp->pvid),
1011 		    (mod_hash_val_t)B_TRUE);
1012 		if (rv != 0) {
1013 			cmn_err(CE_WARN, "vsw%d: Duplicate vlan-id(%d) for "
1014 			    "the port(%d)", vswp->instance, vswp->pvid,
1015 			    portp->p_instance);
1016 		}
1017 
1018 		for (i = 0; i < portp->nvids; i++) {
1019 			rv = mod_hash_insert(portp->vlan_hashp,
1020 			    (mod_hash_key_t)VLAN_ID_KEY(portp->vids[i].vl_vid),
1021 			    (mod_hash_val_t)B_TRUE);
1022 			if (rv != 0) {
1023 				cmn_err(CE_WARN, "vsw%d: Duplicate vlan-id(%d)"
1024 				    " for the port(%d)", vswp->instance,
1025 				    vswp->pvid, portp->p_instance);
1026 			}
1027 		}
1028 
1029 	}
1030 }
1031 
1032 /*
1033  * Remove vlan ids of the given vsw device or port from its hash table.
1034  */
1035 void
1036 vsw_vlan_remove_ids(void *arg, int type)
1037 {
1038 	mod_hash_val_t	vp;
1039 	int		rv;
1040 	int		i;
1041 
1042 	if (type == VSW_LOCALDEV) {
1043 		vsw_t		*vswp = (vsw_t *)arg;
1044 
1045 		rv = vsw_vlan_lookup(vswp->vlan_hashp, vswp->pvid);
1046 		if (rv == B_TRUE) {
1047 			rv = mod_hash_remove(vswp->vlan_hashp,
1048 			    (mod_hash_key_t)VLAN_ID_KEY(vswp->pvid),
1049 			    (mod_hash_val_t *)&vp);
1050 			ASSERT(rv == 0);
1051 		}
1052 
1053 		for (i = 0; i < vswp->nvids; i++) {
1054 			rv = vsw_vlan_lookup(vswp->vlan_hashp,
1055 			    vswp->vids[i].vl_vid);
1056 			if (rv == B_TRUE) {
1057 				rv = mod_hash_remove(vswp->vlan_hashp,
1058 				    (mod_hash_key_t)VLAN_ID_KEY(
1059 				    vswp->vids[i].vl_vid),
1060 				    (mod_hash_val_t *)&vp);
1061 				ASSERT(rv == 0);
1062 			}
1063 		}
1064 
1065 	} else if (type == VSW_VNETPORT) {
1066 		vsw_port_t	*portp = (vsw_port_t *)arg;
1067 
1068 		portp = (vsw_port_t *)arg;
1069 		rv = vsw_vlan_lookup(portp->vlan_hashp, portp->pvid);
1070 		if (rv == B_TRUE) {
1071 			rv = mod_hash_remove(portp->vlan_hashp,
1072 			    (mod_hash_key_t)VLAN_ID_KEY(portp->pvid),
1073 			    (mod_hash_val_t *)&vp);
1074 			ASSERT(rv == 0);
1075 		}
1076 
1077 		for (i = 0; i < portp->nvids; i++) {
1078 			rv = vsw_vlan_lookup(portp->vlan_hashp,
1079 			    portp->vids[i].vl_vid);
1080 			if (rv == B_TRUE) {
1081 				rv = mod_hash_remove(portp->vlan_hashp,
1082 				    (mod_hash_key_t)VLAN_ID_KEY(
1083 				    portp->vids[i].vl_vid),
1084 				    (mod_hash_val_t *)&vp);
1085 				ASSERT(rv == 0);
1086 			}
1087 		}
1088 
1089 	} else {
1090 		return;
1091 	}
1092 }
1093 
1094 /*
1095  * Find the given vlan id in the hash table.
1096  * Return: B_TRUE if the id is found; B_FALSE if not found.
1097  */
1098 boolean_t
1099 vsw_vlan_lookup(mod_hash_t *vlan_hashp, uint16_t vid)
1100 {
1101 	int		rv;
1102 	mod_hash_val_t	vp;
1103 
1104 	rv = mod_hash_find(vlan_hashp, VLAN_ID_KEY(vid), (mod_hash_val_t *)&vp);
1105 
1106 	if (rv != 0)
1107 		return (B_FALSE);
1108 
1109 	return (B_TRUE);
1110 }
1111 
1112 /*
1113  * Add an entry into FDB for the given vsw.
1114  */
1115 void
1116 vsw_fdbe_add(vsw_t *vswp, void *port)
1117 {
1118 	uint64_t	addr = 0;
1119 	vsw_port_t	*portp;
1120 	vsw_fdbe_t	*fp;
1121 	int		rv;
1122 
1123 	portp = (vsw_port_t *)port;
1124 	KEY_HASH(addr, &portp->p_macaddr);
1125 
1126 	fp = kmem_zalloc(sizeof (vsw_fdbe_t), KM_SLEEP);
1127 	fp->portp = port;
1128 
1129 	/*
1130 	 * Note: duplicate keys will be rejected by mod_hash.
1131 	 */
1132 	rv = mod_hash_insert(vswp->fdb_hashp, (mod_hash_key_t)addr,
1133 	    (mod_hash_val_t)fp);
1134 	if (rv != 0) {
1135 		cmn_err(CE_WARN, "vsw%d: Duplicate mac-address(%s) for "
1136 		    "the port(%d)", vswp->instance,
1137 		    ether_sprintf(&portp->p_macaddr), portp->p_instance);
1138 	}
1139 }
1140 
1141 /*
1142  * Remove an entry from FDB.
1143  */
1144 void
1145 vsw_fdbe_del(vsw_t *vswp, struct ether_addr *eaddr)
1146 {
1147 	uint64_t	addr = 0;
1148 	vsw_fdbe_t	*fp;
1149 	int		rv;
1150 
1151 	KEY_HASH(addr, eaddr);
1152 
1153 	/*
1154 	 * Remove the entry from fdb hash table.
1155 	 * This prevents further references to this fdb entry.
1156 	 */
1157 	rv = mod_hash_remove(vswp->fdb_hashp, (mod_hash_key_t)addr,
1158 	    (mod_hash_val_t *)&fp);
1159 	if (rv != 0) {
1160 		/* invalid key? */
1161 		return;
1162 	}
1163 
1164 	/*
1165 	 * If there are threads already ref holding before the entry was
1166 	 * removed from hash table, then wait for ref count to drop to zero.
1167 	 */
1168 	while (fp->refcnt != 0) {
1169 		delay(drv_usectohz(vsw_fdbe_refcnt_delay));
1170 	}
1171 
1172 	kmem_free(fp, sizeof (*fp));
1173 }
1174 
1175 /*
1176  * Search fdb for a given mac address. If an entry is found, hold
1177  * a reference to it and return the entry, else returns NULL.
1178  */
1179 static vsw_fdbe_t *
1180 vsw_fdbe_find(vsw_t *vswp, struct ether_addr *addrp)
1181 {
1182 	uint64_t	key = 0;
1183 	vsw_fdbe_t	*fp;
1184 	int		rv;
1185 
1186 	KEY_HASH(key, addrp);
1187 
1188 	rv = mod_hash_find_cb(vswp->fdb_hashp, (mod_hash_key_t)key,
1189 	    (mod_hash_val_t *)&fp, vsw_fdbe_find_cb);
1190 
1191 	if (rv != 0)
1192 		return (NULL);
1193 
1194 	return (fp);
1195 }
1196 
1197 /*
1198  * Callback function provided to mod_hash_find_cb(). After finding the fdb
1199  * entry corresponding to the key (macaddr), this callback will be invoked by
1200  * mod_hash_find_cb() to atomically increment the reference count on the fdb
1201  * entry before returning the found entry.
1202  */
1203 static void
1204 vsw_fdbe_find_cb(mod_hash_key_t key, mod_hash_val_t val)
1205 {
1206 	_NOTE(ARGUNUSED(key))
1207 	VSW_FDBE_REFHOLD((vsw_fdbe_t *)val);
1208 }
1209 
1210 /*
1211  * A given frame must be always tagged with the appropriate vlan id (unless it
1212  * is in the default-vlan) before the mac address switching function is called.
1213  * Otherwise, after switching function determines the destination, we cannot
1214  * figure out if the destination belongs to the the same vlan that the frame
1215  * originated from and if it needs tag/untag. Frames which are inbound from
1216  * the external(physical) network over a vlan trunk link are always tagged.
1217  * However frames which are received from a vnet-port over ldc or frames which
1218  * are coming down the stack on the service domain over vsw interface may be
1219  * untagged. These frames must be tagged with the appropriate pvid of the
1220  * sender (vnet-port or vsw device), before invoking the switching function.
1221  *
1222  * Arguments:
1223  *   arg:    caller of the function.
1224  *   type:   type of arg(caller): VSW_LOCALDEV(vsw) or VSW_VNETPORT(port)
1225  *   mp:     frame(s) to be tagged.
1226  */
1227 mblk_t *
1228 vsw_vlan_frame_pretag(void *arg, int type, mblk_t *mp)
1229 {
1230 	vsw_t			*vswp;
1231 	vsw_port_t		*portp;
1232 	struct ether_header	*ehp;
1233 	mblk_t			*bp;
1234 	mblk_t			*bpt;
1235 	mblk_t			*bph;
1236 	mblk_t			*bpn;
1237 	uint16_t		pvid;
1238 
1239 	ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));
1240 
1241 	if (type == VSW_LOCALDEV) {
1242 		vswp = (vsw_t *)arg;
1243 		pvid = vswp->pvid;
1244 		portp = NULL;
1245 	} else {
1246 		/* VSW_VNETPORT */
1247 		portp = (vsw_port_t *)arg;
1248 		pvid = portp->pvid;
1249 		vswp = portp->p_vswp;
1250 	}
1251 
1252 	bpn = bph = bpt = NULL;
1253 
1254 	for (bp = mp; bp != NULL; bp = bpn) {
1255 
1256 		bpn = bp->b_next;
1257 		bp->b_next = bp->b_prev = NULL;
1258 
1259 		/* Determine if it is an untagged frame */
1260 		ehp = (struct ether_header *)bp->b_rptr;
1261 
1262 		if (ehp->ether_type != ETHERTYPE_VLAN) {	/* untagged */
1263 
1264 			/* no need to tag if the frame is in default vlan */
1265 			if (pvid != vswp->default_vlan_id) {
1266 				bp = vnet_vlan_insert_tag(bp, pvid);
1267 				if (bp == NULL) {
1268 					continue;
1269 				}
1270 			}
1271 		}
1272 
1273 		/* build a chain of processed packets */
1274 		if (bph == NULL) {
1275 			bph = bpt = bp;
1276 		} else {
1277 			bpt->b_next = bp;
1278 			bpt = bp;
1279 		}
1280 
1281 	}
1282 
1283 	return (bph);
1284 }
1285 
1286 /*
1287  * Frames destined to a vnet-port or to the local vsw interface, must be
1288  * untagged if necessary before sending. This function first checks that the
1289  * frame can be sent to the destination in the vlan identified by the frame
1290  * tag. Note that when this function is invoked the frame must have been
1291  * already tagged (unless it is in the default-vlan). Because, this function is
1292  * called when the switching function determines the destination and invokes
1293  * its send function (vnet-port or vsw interface) and all frames would have
1294  * been tagged by this time (see comments in vsw_vlan_frame_pretag()).
1295  *
1296  * Arguments:
1297  *   arg:    destination device.
1298  *   type:   type of arg(destination): VSW_LOCALDEV(vsw) or VSW_VNETPORT(port)
1299  *   np:     head of pkt chain to be validated and untagged.
1300  *   npt:    tail of pkt chain to be validated and untagged.
1301  *
1302  * Returns:
1303  *   np:     head of updated chain of packets
1304  *   npt:    tail of updated chain of packets
1305  *   rv:     count of the packets in the returned list
1306  */
1307 uint32_t
1308 vsw_vlan_frame_untag(void *arg, int type, mblk_t **np, mblk_t **npt)
1309 {
1310 	mblk_t			*bp;
1311 	mblk_t			*bpt;
1312 	mblk_t			*bph;
1313 	mblk_t			*bpn;
1314 	vsw_port_t		*portp;
1315 	vsw_t			*vswp;
1316 	uint32_t		count;
1317 	struct ether_header	*ehp;
1318 	boolean_t		is_tagged;
1319 	boolean_t		rv;
1320 	uint16_t		vlan_id;
1321 	uint16_t		pvid;
1322 	mod_hash_t		*vlan_hashp;
1323 
1324 	ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));
1325 
1326 
1327 	if (type == VSW_LOCALDEV) {
1328 		vswp = (vsw_t *)arg;
1329 		pvid = vswp->pvid;
1330 		vlan_hashp = vswp->vlan_hashp;
1331 		portp = NULL;
1332 	} else {
1333 		/* type == VSW_VNETPORT */
1334 		portp = (vsw_port_t *)arg;
1335 		vswp = portp->p_vswp;
1336 		vlan_hashp = portp->vlan_hashp;
1337 		pvid = portp->pvid;
1338 	}
1339 
1340 	/*
1341 	 * If the MAC layer switching in place, then
1342 	 * untagging required only if the pvid is not
1343 	 * the same as default_vlan_id. This is because,
1344 	 * the MAC layer will send packets for the
1345 	 * registered vlans only.
1346 	 */
1347 	if ((vswp->mac_cl_switching == B_TRUE) &&
1348 	    (pvid == vswp->default_vlan_id)) {
1349 		/* simply count and set the tail */
1350 		count = 1;
1351 		bp = *np;
1352 		ASSERT(bp != NULL);
1353 		while (bp->b_next != NULL) {
1354 			bp = bp->b_next;
1355 			count++;
1356 		}
1357 		*npt = bp;
1358 		return (count);
1359 	}
1360 
1361 	bpn = bph = bpt = NULL;
1362 	count = 0;
1363 
1364 	for (bp = *np; bp != NULL; bp = bpn) {
1365 
1366 		bpn = bp->b_next;
1367 		bp->b_next = bp->b_prev = NULL;
1368 
1369 		/*
1370 		 * Determine the vlan id that the frame belongs to.
1371 		 */
1372 		ehp = (struct ether_header *)bp->b_rptr;
1373 		is_tagged = vsw_frame_lookup_vid(arg, type, ehp, &vlan_id);
1374 
1375 		/*
1376 		 * If MAC layer switching in place, then we
1377 		 * need to untag only if the tagged packet has
1378 		 * vlan-id same as the pvid.
1379 		 */
1380 		if (vswp->mac_cl_switching == B_TRUE) {
1381 
1382 			/* only tagged packets expected here */
1383 			ASSERT(is_tagged == B_TRUE);
1384 			if (vlan_id == pvid) {
1385 				bp = vnet_vlan_remove_tag(bp);
1386 				if (bp == NULL) {
1387 					/* packet dropped */
1388 					continue;
1389 				}
1390 			}
1391 		} else { /* No MAC layer switching */
1392 
1393 			/*
1394 			 * Check the frame header if tag/untag is  needed.
1395 			 */
1396 			if (is_tagged == B_FALSE) {
1397 				/*
1398 				 * Untagged frame. We shouldn't have an
1399 				 * untagged packet at this point, unless
1400 				 * the destination's  vlan id is
1401 				 * default-vlan-id; if it is not the
1402 				 * default-vlan-id, we drop the packet.
1403 				 */
1404 				if (vlan_id != vswp->default_vlan_id) {
1405 					/* drop the packet */
1406 					freemsg(bp);
1407 					continue;
1408 				}
1409 			} else {	/* Tagged */
1410 				/*
1411 				 * Tagged frame, untag if it's the
1412 				 * destination's pvid.
1413 				 */
1414 				if (vlan_id == pvid) {
1415 
1416 					bp = vnet_vlan_remove_tag(bp);
1417 					if (bp == NULL) {
1418 						/* packet dropped */
1419 						continue;
1420 					}
1421 				} else {
1422 
1423 					/*
1424 					 * Check if the destination is in the
1425 					 * same vlan.
1426 					 */
1427 					rv = vsw_vlan_lookup(vlan_hashp,
1428 					    vlan_id);
1429 					if (rv == B_FALSE) {
1430 						/* drop the packet */
1431 						freemsg(bp);
1432 						continue;
1433 					}
1434 				}
1435 
1436 			}
1437 		}
1438 
1439 		/* build a chain of processed packets */
1440 		if (bph == NULL) {
1441 			bph = bpt = bp;
1442 		} else {
1443 			bpt->b_next = bp;
1444 			bpt = bp;
1445 		}
1446 		count++;
1447 	}
1448 
1449 	*np = bph;
1450 	*npt = bpt;
1451 	return (count);
1452 }
1453 
1454 /*
1455  * Lookup the vlan id of the given frame. If it is a vlan-tagged frame,
1456  * then the vlan-id is available in the tag; otherwise, its vlan id is
1457  * implicitly obtained based on the caller (destination of the frame:
1458  * VSW_VNETPORT or VSW_LOCALDEV).
1459  * The vlan id determined is returned in vidp.
1460  * Returns: B_TRUE if it is a tagged frame; B_FALSE if it is untagged.
1461  */
1462 boolean_t
1463 vsw_frame_lookup_vid(void *arg, int caller, struct ether_header *ehp,
1464 	uint16_t *vidp)
1465 {
1466 	struct ether_vlan_header	*evhp;
1467 	vsw_t				*vswp;
1468 	vsw_port_t			*portp;
1469 
1470 	/* If it's a tagged frame, get the vid from vlan header */
1471 	if (ehp->ether_type == ETHERTYPE_VLAN) {
1472 
1473 		evhp = (struct ether_vlan_header *)ehp;
1474 		*vidp = VLAN_ID(ntohs(evhp->ether_tci));
1475 		return (B_TRUE);
1476 	}
1477 
1478 	/* Untagged frame; determine vlan id based on caller */
1479 	switch (caller) {
1480 
1481 	case VSW_VNETPORT:
1482 		/*
1483 		 * packet destined to a vnet; vlan-id is pvid of vnet-port.
1484 		 */
1485 		portp = (vsw_port_t *)arg;
1486 		*vidp = portp->pvid;
1487 		break;
1488 
1489 	case VSW_LOCALDEV:
1490 
1491 		/*
1492 		 * packet destined to vsw interface;
1493 		 * vlan-id is port-vlan-id of vsw device.
1494 		 */
1495 		vswp = (vsw_t *)arg;
1496 		*vidp = vswp->pvid;
1497 		break;
1498 	}
1499 
1500 	return (B_FALSE);
1501 }
1502 
1503 /*
1504  * Add or remove multicast address(es).
1505  *
1506  * Returns 0 on success, 1 on failure.
1507  */
1508 int
1509 vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port)
1510 {
1511 	mcst_addr_t		*mcst_p = NULL;
1512 	vsw_t			*vswp = port->p_vswp;
1513 	uint64_t		addr = 0x0;
1514 	int			i;
1515 
1516 	D1(vswp, "%s: enter", __func__);
1517 
1518 	D2(vswp, "%s: %d addresses", __func__, mcst_pkt->count);
1519 
1520 	for (i = 0; i < mcst_pkt->count; i++) {
1521 		/*
1522 		 * Convert address into form that can be used
1523 		 * as hash table key.
1524 		 */
1525 		KEY_HASH(addr, &(mcst_pkt->mca[i]));
1526 
1527 		/*
1528 		 * Add or delete the specified address/port combination.
1529 		 */
1530 		if (mcst_pkt->set == 0x1) {
1531 			D3(vswp, "%s: adding multicast address 0x%llx for "
1532 			    "port %ld", __func__, addr, port->p_instance);
1533 			if (vsw_add_mcst(vswp, VSW_VNETPORT, addr, port) == 0) {
1534 				/*
1535 				 * Update the list of multicast
1536 				 * addresses contained within the
1537 				 * port structure to include this new
1538 				 * one.
1539 				 */
1540 				mcst_p = kmem_zalloc(sizeof (mcst_addr_t),
1541 				    KM_NOSLEEP);
1542 				if (mcst_p == NULL) {
1543 					DERR(vswp, "%s: unable to alloc mem",
1544 					    __func__);
1545 					(void) vsw_del_mcst(vswp,
1546 					    VSW_VNETPORT, addr, port);
1547 					return (1);
1548 				}
1549 
1550 				mcst_p->nextp = NULL;
1551 				mcst_p->addr = addr;
1552 				ether_copy(&mcst_pkt->mca[i], &mcst_p->mca);
1553 
1554 				/*
1555 				 * Program the address into HW. If the addr
1556 				 * has already been programmed then the MAC
1557 				 * just increments a ref counter (which is
1558 				 * used when the address is being deleted)
1559 				 */
1560 				if (vsw_mac_multicast_add(vswp, port, mcst_p,
1561 				    VSW_VNETPORT)) {
1562 					(void) vsw_del_mcst(vswp,
1563 					    VSW_VNETPORT, addr, port);
1564 					kmem_free(mcst_p, sizeof (*mcst_p));
1565 					return (1);
1566 				}
1567 
1568 				mutex_enter(&port->mca_lock);
1569 				mcst_p->nextp = port->mcap;
1570 				port->mcap = mcst_p;
1571 				mutex_exit(&port->mca_lock);
1572 
1573 			} else {
1574 				DERR(vswp, "%s: error adding multicast "
1575 				    "address 0x%llx for port %ld",
1576 				    __func__, addr, port->p_instance);
1577 				return (1);
1578 			}
1579 		} else {
1580 			/*
1581 			 * Delete an entry from the multicast hash
1582 			 * table and update the address list
1583 			 * appropriately.
1584 			 */
1585 			if (vsw_del_mcst(vswp, VSW_VNETPORT, addr, port) == 0) {
1586 				D3(vswp, "%s: deleting multicast address "
1587 				    "0x%llx for port %ld", __func__, addr,
1588 				    port->p_instance);
1589 
1590 				mcst_p = vsw_del_addr(VSW_VNETPORT, port, addr);
1591 				ASSERT(mcst_p != NULL);
1592 
1593 				/*
1594 				 * Remove the address from HW. The address
1595 				 * will actually only be removed once the ref
1596 				 * count within the MAC layer has dropped to
1597 				 * zero. I.e. we can safely call this fn even
1598 				 * if other ports are interested in this
1599 				 * address.
1600 				 */
1601 				vsw_mac_multicast_remove(vswp, port, mcst_p,
1602 				    VSW_VNETPORT);
1603 				kmem_free(mcst_p, sizeof (*mcst_p));
1604 
1605 			} else {
1606 				DERR(vswp, "%s: error deleting multicast "
1607 				    "addr 0x%llx for port %ld",
1608 				    __func__, addr, port->p_instance);
1609 				return (1);
1610 			}
1611 		}
1612 	}
1613 	D1(vswp, "%s: exit", __func__);
1614 	return (0);
1615 }
1616 
1617 /*
1618  * Add a new multicast entry.
1619  *
1620  * Search hash table based on address. If match found then
1621  * update associated val (which is chain of ports), otherwise
1622  * create new key/val (addr/port) pair and insert into table.
1623  */
1624 int
1625 vsw_add_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg)
1626 {
1627 	int		dup = 0;
1628 	int		rv = 0;
1629 	mfdb_ent_t	*ment = NULL;
1630 	mfdb_ent_t	*tmp_ent = NULL;
1631 	mfdb_ent_t	*new_ent = NULL;
1632 	void		*tgt = NULL;
1633 
1634 	if (devtype == VSW_VNETPORT) {
1635 		/*
1636 		 * Being invoked from a vnet.
1637 		 */
1638 		ASSERT(arg != NULL);
1639 		tgt = arg;
1640 		D2(NULL, "%s: port %d : address 0x%llx", __func__,
1641 		    ((vsw_port_t *)arg)->p_instance, addr);
1642 	} else {
1643 		/*
1644 		 * We are being invoked via the m_multicst mac entry
1645 		 * point.
1646 		 */
1647 		D2(NULL, "%s: address 0x%llx", __func__, addr);
1648 		tgt = (void *)vswp;
1649 	}
1650 
1651 	WRITE_ENTER(&vswp->mfdbrw);
1652 	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr,
1653 	    (mod_hash_val_t *)&ment) != 0) {
1654 
1655 		/* address not currently in table */
1656 		ment = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP);
1657 		ment->d_addr = (void *)tgt;
1658 		ment->d_type = devtype;
1659 		ment->nextp = NULL;
1660 
1661 		if (mod_hash_insert(vswp->mfdb, (mod_hash_key_t)addr,
1662 		    (mod_hash_val_t)ment) != 0) {
1663 			DERR(vswp, "%s: hash table insertion failed", __func__);
1664 			kmem_free(ment, sizeof (mfdb_ent_t));
1665 			rv = 1;
1666 		} else {
1667 			D2(vswp, "%s: added initial entry for 0x%llx to "
1668 			    "table", __func__, addr);
1669 		}
1670 	} else {
1671 		/*
1672 		 * Address in table. Check to see if specified port
1673 		 * is already associated with the address. If not add
1674 		 * it now.
1675 		 */
1676 		tmp_ent = ment;
1677 		while (tmp_ent != NULL) {
1678 			if (tmp_ent->d_addr == (void *)tgt) {
1679 				if (devtype == VSW_VNETPORT) {
1680 					DERR(vswp, "%s: duplicate port entry "
1681 					    "found for portid %ld and key "
1682 					    "0x%llx", __func__,
1683 					    ((vsw_port_t *)arg)->p_instance,
1684 					    addr);
1685 				} else {
1686 					DERR(vswp, "%s: duplicate entry found"
1687 					    "for key 0x%llx", __func__, addr);
1688 				}
1689 				rv = 1;
1690 				dup = 1;
1691 				break;
1692 			}
1693 			tmp_ent = tmp_ent->nextp;
1694 		}
1695 
1696 		/*
1697 		 * Port not on list so add it to end now.
1698 		 */
1699 		if (0 == dup) {
1700 			D2(vswp, "%s: added entry for 0x%llx to table",
1701 			    __func__, addr);
1702 			new_ent = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP);
1703 			new_ent->d_addr = (void *)tgt;
1704 			new_ent->d_type = devtype;
1705 			new_ent->nextp = NULL;
1706 
1707 			tmp_ent = ment;
1708 			while (tmp_ent->nextp != NULL)
1709 				tmp_ent = tmp_ent->nextp;
1710 
1711 			tmp_ent->nextp = new_ent;
1712 		}
1713 	}
1714 
1715 	RW_EXIT(&vswp->mfdbrw);
1716 	return (rv);
1717 }
1718 
1719 /*
1720  * Remove a multicast entry from the hashtable.
1721  *
1722  * Search hash table based on address. If match found, scan
1723  * list of ports associated with address. If specified port
1724  * found remove it from list.
1725  */
1726 int
1727 vsw_del_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg)
1728 {
1729 	mfdb_ent_t	*ment = NULL;
1730 	mfdb_ent_t	*curr_p, *prev_p;
1731 	void		*tgt = NULL;
1732 
1733 	D1(vswp, "%s: enter", __func__);
1734 
1735 	if (devtype == VSW_VNETPORT) {
1736 		tgt = (vsw_port_t *)arg;
1737 		D2(vswp, "%s: removing port %d from mFDB for address"
1738 		    " 0x%llx", __func__, ((vsw_port_t *)tgt)->p_instance, addr);
1739 	} else {
1740 		D2(vswp, "%s: removing entry", __func__);
1741 		tgt = (void *)vswp;
1742 	}
1743 
1744 	WRITE_ENTER(&vswp->mfdbrw);
1745 	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr,
1746 	    (mod_hash_val_t *)&ment) != 0) {
1747 		D2(vswp, "%s: address 0x%llx not in table", __func__, addr);
1748 		RW_EXIT(&vswp->mfdbrw);
1749 		return (1);
1750 	}
1751 
1752 	prev_p = curr_p = ment;
1753 
1754 	while (curr_p != NULL) {
1755 		if (curr_p->d_addr == (void *)tgt) {
1756 			if (devtype == VSW_VNETPORT) {
1757 				D2(vswp, "%s: port %d found", __func__,
1758 				    ((vsw_port_t *)tgt)->p_instance);
1759 			} else {
1760 				D2(vswp, "%s: instance found", __func__);
1761 			}
1762 
1763 			if (prev_p == curr_p) {
1764 				/*
1765 				 * head of list, if no other element is in
1766 				 * list then destroy this entry, otherwise
1767 				 * just replace it with updated value.
1768 				 */
1769 				ment = curr_p->nextp;
1770 				if (ment == NULL) {
1771 					(void) mod_hash_destroy(vswp->mfdb,
1772 					    (mod_hash_val_t)addr);
1773 				} else {
1774 					(void) mod_hash_replace(vswp->mfdb,
1775 					    (mod_hash_key_t)addr,
1776 					    (mod_hash_val_t)ment);
1777 				}
1778 			} else {
1779 				/*
1780 				 * Not head of list, no need to do
1781 				 * replacement, just adjust list pointers.
1782 				 */
1783 				prev_p->nextp = curr_p->nextp;
1784 			}
1785 			break;
1786 		}
1787 
1788 		prev_p = curr_p;
1789 		curr_p = curr_p->nextp;
1790 	}
1791 
1792 	RW_EXIT(&vswp->mfdbrw);
1793 
1794 	D1(vswp, "%s: exit", __func__);
1795 
1796 	if (curr_p == NULL)
1797 		return (1);
1798 	kmem_free(curr_p, sizeof (mfdb_ent_t));
1799 	return (0);
1800 }
1801 
1802 /*
1803  * Port is being deleted, but has registered an interest in one
1804  * or more multicast groups. Using the list of addresses maintained
1805  * within the port structure find the appropriate entry in the hash
1806  * table and remove this port from the list of interested ports.
1807  */
1808 void
1809 vsw_del_mcst_port(vsw_port_t *port)
1810 {
1811 	mcst_addr_t	*mcap = NULL;
1812 	vsw_t		*vswp = port->p_vswp;
1813 
1814 	D1(vswp, "%s: enter", __func__);
1815 
1816 	mutex_enter(&port->mca_lock);
1817 
1818 	while ((mcap = port->mcap) != NULL) {
1819 
1820 		port->mcap = mcap->nextp;
1821 
1822 		mutex_exit(&port->mca_lock);
1823 
1824 		(void) vsw_del_mcst(vswp, VSW_VNETPORT,
1825 		    mcap->addr, port);
1826 
1827 		/*
1828 		 * Remove the address from HW. The address
1829 		 * will actually only be removed once the ref
1830 		 * count within the MAC layer has dropped to
1831 		 * zero. I.e. we can safely call this fn even
1832 		 * if other ports are interested in this
1833 		 * address.
1834 		 */
1835 		vsw_mac_multicast_remove(vswp, port, mcap, VSW_VNETPORT);
1836 		kmem_free(mcap, sizeof (*mcap));
1837 
1838 		mutex_enter(&port->mca_lock);
1839 
1840 	}
1841 
1842 	mutex_exit(&port->mca_lock);
1843 
1844 	D1(vswp, "%s: exit", __func__);
1845 }
1846 
1847 /*
1848  * This vsw instance is detaching, but has registered an interest in one
1849  * or more multicast groups. Using the list of addresses maintained
1850  * within the vsw structure find the appropriate entry in the hash
1851  * table and remove this instance from the list of interested ports.
1852  */
1853 void
1854 vsw_del_mcst_vsw(vsw_t *vswp)
1855 {
1856 	mcst_addr_t	*next_p = NULL;
1857 
1858 	D1(vswp, "%s: enter", __func__);
1859 
1860 	mutex_enter(&vswp->mca_lock);
1861 
1862 	while (vswp->mcap != NULL) {
1863 		DERR(vswp, "%s: deleting addr 0x%llx",
1864 		    __func__, vswp->mcap->addr);
1865 		(void) vsw_del_mcst(vswp, VSW_LOCALDEV, vswp->mcap->addr, NULL);
1866 
1867 		next_p = vswp->mcap->nextp;
1868 		kmem_free(vswp->mcap, sizeof (mcst_addr_t));
1869 		vswp->mcap = next_p;
1870 	}
1871 
1872 	vswp->mcap = NULL;
1873 	mutex_exit(&vswp->mca_lock);
1874 
1875 	D1(vswp, "%s: exit", __func__);
1876 }
1877 
1878 mblk_t *
1879 vsw_get_same_dest_list(struct ether_header *ehp, mblk_t **mpp)
1880 {
1881 	mblk_t			*bp;
1882 	mblk_t			*nbp;
1883 	mblk_t			*head = NULL;
1884 	mblk_t			*tail = NULL;
1885 	mblk_t			*prev = NULL;
1886 	struct ether_header	*behp;
1887 
1888 	/* process the chain of packets */
1889 	bp = *mpp;
1890 	while (bp) {
1891 		nbp = bp->b_next;
1892 		behp = (struct ether_header *)bp->b_rptr;
1893 		bp->b_prev = NULL;
1894 		if (ether_cmp(&ehp->ether_dhost, &behp->ether_dhost) == 0) {
1895 			if (prev == NULL) {
1896 				*mpp = nbp;
1897 			} else {
1898 				prev->b_next = nbp;
1899 			}
1900 			bp->b_next =  NULL;
1901 			if (head == NULL) {
1902 				head = tail = bp;
1903 			} else {
1904 				tail->b_next = bp;
1905 				tail = bp;
1906 			}
1907 		} else {
1908 			prev = bp;
1909 		}
1910 		bp = nbp;
1911 	}
1912 	return (head);
1913 }
1914 
1915 static mblk_t *
1916 vsw_dupmsgchain(mblk_t *mp)
1917 {
1918 	mblk_t	*nmp = NULL;
1919 	mblk_t	**nmpp = &nmp;
1920 
1921 	for (; mp != NULL; mp = mp->b_next) {
1922 		if ((*nmpp = dupmsg(mp)) == NULL) {
1923 			freemsgchain(nmp);
1924 			return (NULL);
1925 		}
1926 
1927 		nmpp = &((*nmpp)->b_next);
1928 	}
1929 
1930 	return (nmp);
1931 }
1932