xref: /illumos-gate/usr/src/uts/sun4v/io/vsw_switching.c (revision 4eaa471005973e11a6110b69fe990530b3b95a38)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/types.h>
28 #include <sys/errno.h>
29 #include <sys/debug.h>
30 #include <sys/time.h>
31 #include <sys/sysmacros.h>
32 #include <sys/systm.h>
33 #include <sys/user.h>
34 #include <sys/stropts.h>
35 #include <sys/stream.h>
36 #include <sys/strlog.h>
37 #include <sys/strsubr.h>
38 #include <sys/cmn_err.h>
39 #include <sys/cpu.h>
40 #include <sys/kmem.h>
41 #include <sys/conf.h>
42 #include <sys/ddi.h>
43 #include <sys/sunddi.h>
44 #include <sys/ksynch.h>
45 #include <sys/stat.h>
46 #include <sys/kstat.h>
47 #include <sys/vtrace.h>
48 #include <sys/strsun.h>
49 #include <sys/dlpi.h>
50 #include <sys/ethernet.h>
51 #include <net/if.h>
52 #include <sys/varargs.h>
53 #include <sys/machsystm.h>
54 #include <sys/modctl.h>
55 #include <sys/modhash.h>
56 #include <sys/mac.h>
57 #include <sys/mac_ether.h>
58 #include <sys/taskq.h>
59 #include <sys/note.h>
60 #include <sys/mach_descrip.h>
61 #include <sys/mdeg.h>
62 #include <sys/ldc.h>
63 #include <sys/vsw_fdb.h>
64 #include <sys/vsw.h>
65 #include <sys/vio_mailbox.h>
66 #include <sys/vnet_mailbox.h>
67 #include <sys/vnet_common.h>
68 #include <sys/vio_util.h>
69 #include <sys/sdt.h>
70 #include <sys/atomic.h>
71 #include <sys/vlan.h>
72 
73 /* Switching setup routines */
74 void vsw_setup_switching_thread(void *arg);
75 int vsw_setup_switching_start(vsw_t *vswp);
76 void vsw_setup_switching_stop(vsw_t *vswp);
77 int vsw_setup_switching(vsw_t *);
78 void vsw_setup_switching_post_process(vsw_t *vswp);
79 void vsw_switch_frame_nop(vsw_t *vswp, mblk_t *mp, int caller,
80     vsw_port_t *port, mac_resource_handle_t mrh);
81 static	int vsw_setup_layer2(vsw_t *);
82 static	int vsw_setup_layer3(vsw_t *);
83 
84 /* Switching/data transmit routines */
85 static	void vsw_switch_l2_frame_mac_client(vsw_t *vswp, mblk_t *mp, int caller,
86     vsw_port_t *port, mac_resource_handle_t);
87 static	void vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller,
88 	vsw_port_t *port, mac_resource_handle_t);
89 static	void vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller,
90 	vsw_port_t *port, mac_resource_handle_t);
91 static	int vsw_forward_all(vsw_t *vswp, mblk_t *mp,
92 	int caller, vsw_port_t *port);
93 static	int vsw_forward_grp(vsw_t *vswp, mblk_t *mp,
94     int caller, vsw_port_t *port);
95 
96 /* VLAN routines */
97 void vsw_create_vlans(void *arg, int type);
98 void vsw_destroy_vlans(void *arg, int type);
99 void vsw_vlan_add_ids(void *arg, int type);
100 void vsw_vlan_remove_ids(void *arg, int type);
101 static	void vsw_vlan_create_hash(void *arg, int type);
102 static	void vsw_vlan_destroy_hash(void *arg, int type);
103 boolean_t vsw_frame_lookup_vid(void *arg, int caller, struct ether_header *ehp,
104 	uint16_t *vidp);
105 mblk_t *vsw_vlan_frame_pretag(void *arg, int type, mblk_t *mp);
106 uint32_t vsw_vlan_frames_untag(void *arg, int type, mblk_t **np, mblk_t **npt);
107 boolean_t vsw_vlan_lookup(mod_hash_t *vlan_hashp, uint16_t vid);
108 
109 /* Forwarding database (FDB) routines */
110 void vsw_fdbe_add(vsw_t *vswp, void *port);
111 void vsw_fdbe_del(vsw_t *vswp, struct ether_addr *eaddr);
112 static	vsw_fdbe_t *vsw_fdbe_find(vsw_t *vswp, struct ether_addr *);
113 static void vsw_fdbe_find_cb(mod_hash_key_t key, mod_hash_val_t val);
114 
115 int vsw_add_rem_mcst(vnet_mcast_msg_t *, vsw_port_t *);
116 int vsw_add_mcst(vsw_t *, uint8_t, uint64_t, void *);
117 int vsw_del_mcst(vsw_t *, uint8_t, uint64_t, void *);
118 void vsw_del_mcst_vsw(vsw_t *);
119 
120 /* Support functions */
121 static mblk_t *vsw_dupmsgchain(mblk_t *mp);
122 static mblk_t *vsw_get_same_dest_list(struct ether_header *ehp, mblk_t **mpp);
123 
124 
125 /*
126  * Functions imported from other files.
127  */
128 extern mblk_t *vsw_tx_msg(vsw_t *, mblk_t *, int, vsw_port_t *);
129 extern mcst_addr_t *vsw_del_addr(uint8_t, void *, uint64_t);
130 extern int vsw_mac_open(vsw_t *vswp);
131 extern void vsw_mac_close(vsw_t *vswp);
132 extern void vsw_mac_rx(vsw_t *vswp, mac_resource_handle_t mrh,
133     mblk_t *mp, vsw_macrx_flags_t flags);
134 extern void vsw_set_addrs(vsw_t *vswp);
135 extern int vsw_portsend(vsw_port_t *port, mblk_t *mp);
136 extern void vsw_hio_init(vsw_t *vswp);
137 extern void vsw_hio_start_ports(vsw_t *vswp);
138 extern int vsw_mac_multicast_add(vsw_t *vswp, vsw_port_t *port,
139     mcst_addr_t *mcst_p, int type);
140 extern void vsw_mac_multicast_remove(vsw_t *vswp, vsw_port_t *port,
141     mcst_addr_t *mcst_p, int type);
142 extern void vsw_mac_link_update(vsw_t *vswp, link_state_t link_state);
143 extern void vsw_physlink_update_ports(vsw_t *vswp);
144 
145 /*
146  * Tunables used in this file.
147  */
148 extern	int vsw_setup_switching_delay;
149 extern	uint32_t vsw_vlan_nchains;
150 extern	uint32_t vsw_fdbe_refcnt_delay;
151 
/*
 * Take a reference on the FDB entry (p): atomically increment its refcnt
 * and then assert that the resulting count is non-zero.
 *
 * NOTE: this expands to a bare brace block (not do { } while (0)) and
 * evaluates (p) more than once, so pass a side-effect free expression.
 */
#define	VSW_FDBE_REFHOLD(p)						\
{									\
	atomic_inc_32(&(p)->refcnt);					\
	ASSERT((p)->refcnt != 0);					\
}
157 
/*
 * Drop a reference on the FDB entry (p): assert that a reference is
 * currently held (refcnt is non-zero) and then atomically decrement it.
 *
 * NOTE: this expands to a bare brace block (not do { } while (0)) and
 * evaluates (p) more than once, so pass a side-effect free expression.
 */
#define	VSW_FDBE_REFRELE(p)						\
{									\
	ASSERT((p)->refcnt != 0);					\
	atomic_dec_32(&(p)->refcnt);					\
}
163 
/*
 * Thread to setup switching mode. This thread is created during vsw_attach()
 * initially. It invokes vsw_setup_switching() and keeps retrying while the
 * returned value is EAGAIN. The thread exits when the switching mode setup is
 * done successfully or when the error returned is not EAGAIN. This thread may
 * also get created from vsw_update_md_prop() if the switching mode needs to be
 * updated.
 *
 * arg: pointer to the vsw instance (vsw_t *).
 *
 * Every attempt, including the first one, is preceded by a wait of
 * vsw_setup_switching_delay (scaled by MICROSEC before conversion to ticks).
 * The wait is cut short only when VSW_SWTHR_STOP is set in sw_thr_flags.
 * On exit the thread clears VSW_SWTHR_STOP and sw_thread while still holding
 * sw_thr_lock.
 */
void
vsw_setup_switching_thread(void *arg)
{
	callb_cpr_t	cprinfo;
	vsw_t		*vswp =  (vsw_t *)arg;
	clock_t		wait_time;
	clock_t		xwait;
	clock_t		wait_rv;
	int		rv;

	/* wait time (in ticks) used before each (re)try */
	xwait = drv_usectohz(vsw_setup_switching_delay * MICROSEC);

	CALLB_CPR_INIT(&cprinfo, &vswp->sw_thr_lock, callb_generic_cpr,
	    "vsw_setup_sw_thread");

	mutex_enter(&vswp->sw_thr_lock);

	while ((vswp->sw_thr_flags & VSW_SWTHR_STOP) == 0) {

		CALLB_CPR_SAFE_BEGIN(&cprinfo);

		/*
		 * Wait for some time before (re)trying setup_switching().
		 * A wakeup that is not a timeout only ends the wait if the
		 * stop flag has been set; otherwise we keep waiting until
		 * the absolute deadline computed here.
		 */
		wait_time = ddi_get_lbolt() + xwait;
		while ((vswp->sw_thr_flags & VSW_SWTHR_STOP) == 0) {
			wait_rv = cv_timedwait(&vswp->sw_thr_cv,
			    &vswp->sw_thr_lock, wait_time);
			if (wait_rv == -1) {	/* timed out */
				break;
			}
		}

		CALLB_CPR_SAFE_END(&cprinfo, &vswp->sw_thr_lock)

		if ((vswp->sw_thr_flags & VSW_SWTHR_STOP) != 0) {
			/*
			 * If there is a stop request, process that first and
			 * exit the loop. Continue to hold the mutex which gets
			 * released in CALLB_CPR_EXIT().
			 */
			break;
		}

		/*
		 * Drop the thread lock while doing the actual setup work;
		 * it is re-acquired before the result is examined.
		 */
		mutex_exit(&vswp->sw_thr_lock);
		rv = vsw_setup_switching(vswp);
		if (rv == 0) {
			vsw_setup_switching_post_process(vswp);
		}
		mutex_enter(&vswp->sw_thr_lock);
		if (rv != EAGAIN) {
			/* done: either success or a non-retryable failure */
			break;
		}

	}

	/* sw_thr_lock is held here on every path out of the loop */
	vswp->sw_thr_flags &= ~VSW_SWTHR_STOP;
	vswp->sw_thread = NULL;
	CALLB_CPR_EXIT(&cprinfo);
	thread_exit();
}
232 
233 /*
234  * Create a thread to setup the switching mode.
235  * Returns 0 on success; 1 on failure.
236  */
237 int
238 vsw_setup_switching_start(vsw_t *vswp)
239 {
240 	mutex_enter(&vswp->sw_thr_lock);
241 
242 	vswp->sw_thread = thread_create(NULL, 2 * DEFAULTSTKSZ,
243 	    vsw_setup_switching_thread, vswp, 0, &p0, TS_RUN, minclsyspri);
244 
245 	if (vswp->sw_thread == NULL) {
246 		mutex_exit(&vswp->sw_thr_lock);
247 		return (1);
248 	}
249 
250 	mutex_exit(&vswp->sw_thr_lock);
251 	return (0);
252 }
253 
254 /*
255  * Stop the thread to setup switching mode.
256  */
257 void
258 vsw_setup_switching_stop(vsw_t *vswp)
259 {
260 	kt_did_t	tid = 0;
261 
262 	/*
263 	 * Signal the setup_switching thread to stop and wait until it stops.
264 	 */
265 	mutex_enter(&vswp->sw_thr_lock);
266 
267 	if (vswp->sw_thread != NULL) {
268 		tid = vswp->sw_thread->t_did;
269 		vswp->sw_thr_flags |= VSW_SWTHR_STOP;
270 		cv_signal(&vswp->sw_thr_cv);
271 	}
272 
273 	mutex_exit(&vswp->sw_thr_lock);
274 
275 	if (tid != 0)
276 		thread_join(tid);
277 
278 	(void) atomic_swap_32(&vswp->switching_setup_done, B_FALSE);
279 
280 	vswp->mac_open_retries = 0;
281 }
282 
283 /*
284  * Setup the required switching mode.
285  * Returns:
286  *  0 on success.
287  *  EAGAIN if retry is needed.
288  *  1 on all other failures.
289  */
290 int
291 vsw_setup_switching(vsw_t *vswp)
292 {
293 	int	rv = 1;
294 
295 	D1(vswp, "%s: enter", __func__);
296 
297 	/*
298 	 * Select best switching mode.
299 	 * This is done as this routine can be called from the timeout
300 	 * handler to retry setting up a specific mode. Currently only
301 	 * the function which sets up layer2/promisc mode returns EAGAIN
302 	 * if the underlying network device is not available yet, causing
303 	 * retries.
304 	 */
305 	if (vswp->smode & VSW_LAYER2) {
306 		rv = vsw_setup_layer2(vswp);
307 	} else if (vswp->smode & VSW_LAYER3) {
308 		rv = vsw_setup_layer3(vswp);
309 	} else {
310 		DERR(vswp, "unknown switch mode");
311 		rv = 1;
312 	}
313 
314 	if (rv && (rv != EAGAIN)) {
315 		cmn_err(CE_WARN, "!vsw%d: Unable to setup specified "
316 		    "switching mode", vswp->instance);
317 	} else if (rv == 0) {
318 		(void) atomic_swap_32(&vswp->switching_setup_done, B_TRUE);
319 	}
320 
321 	D2(vswp, "%s: Operating in mode %d", __func__,
322 	    vswp->smode);
323 
324 	D1(vswp, "%s: exit", __func__);
325 
326 	return (rv);
327 }
328 
329 /*
330  * Setup for layer 2 switching.
331  *
332  * Returns:
333  *  0 on success.
334  *  EAGAIN if retry is needed.
335  *  EIO on all other failures.
336  */
337 static int
338 vsw_setup_layer2(vsw_t *vswp)
339 {
340 	int	rv;
341 
342 	D1(vswp, "%s: enter", __func__);
343 
344 	/*
345 	 * Until the network device is successfully opened,
346 	 * set the switching to use vsw_switch_l2_frame.
347 	 */
348 	vswp->vsw_switch_frame = vsw_switch_l2_frame;
349 	vswp->mac_cl_switching = B_FALSE;
350 
351 	rv = strlen(vswp->physname);
352 	if (rv == 0) {
353 		/*
354 		 * Physical device name is NULL, which is
355 		 * required for layer 2.
356 		 */
357 		cmn_err(CE_WARN, "!vsw%d: no network device name specified",
358 		    vswp->instance);
359 		return (EIO);
360 	}
361 
362 	mutex_enter(&vswp->mac_lock);
363 
364 	rv = vsw_mac_open(vswp);
365 	if (rv != 0) {
366 		if (rv != EAGAIN) {
367 			cmn_err(CE_WARN, "!vsw%d: Unable to open network "
368 			    "device: %s\n", vswp->instance, vswp->physname);
369 		}
370 		mutex_exit(&vswp->mac_lock);
371 		return (rv);
372 	}
373 
374 	/*
375 	 * Now we can use the mac client switching, so set the switching
376 	 * function to use vsw_switch_l2_frame_mac_client(), which simply
377 	 * sends the packets to MAC layer for switching.
378 	 */
379 	vswp->vsw_switch_frame = vsw_switch_l2_frame_mac_client;
380 	vswp->mac_cl_switching = B_TRUE;
381 
382 	D1(vswp, "%s: exit", __func__);
383 
384 	/* Initialize HybridIO related stuff */
385 	vsw_hio_init(vswp);
386 
387 	mutex_exit(&vswp->mac_lock);
388 	return (0);
389 
390 exit_error:
391 	vsw_mac_close(vswp);
392 	mutex_exit(&vswp->mac_lock);
393 	return (EIO);
394 }
395 
/*
 * Setup for layer 3 switching.
 *
 * Only installs vsw_switch_l3_frame() as the frame switching function of
 * the given vsw instance; nothing else is done here.
 *
 * Returns: 0 always.
 */
static int
vsw_setup_layer3(vsw_t *vswp)
{
	D1(vswp, "%s: enter", __func__);

	D2(vswp, "%s: operating in layer 3 mode", __func__);
	vswp->vsw_switch_frame = vsw_switch_l3_frame;

	D1(vswp, "%s: exit", __func__);

	return (0);
}
408 
/*
 * No-op switching function: the whole message chain (mp) is freed without
 * being forwarded anywhere. All other arguments are ignored.
 */
/* ARGSUSED */
void
vsw_switch_frame_nop(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *port,
			mac_resource_handle_t mrh)
{
	freemsgchain(mp);
}
416 
417 /*
418  * Use mac client for layer 2 switching .
419  */
420 static void
421 vsw_switch_l2_frame_mac_client(vsw_t *vswp, mblk_t *mp, int caller,
422     vsw_port_t *port, mac_resource_handle_t mrh)
423 {
424 	_NOTE(ARGUNUSED(mrh))
425 
426 	mblk_t		*ret_m;
427 
428 	/*
429 	 * This switching function is expected to be called by
430 	 * the ports or the interface only. The packets from
431 	 * physical interface already switched.
432 	 */
433 	ASSERT((caller == VSW_VNETPORT) || (caller == VSW_LOCALDEV));
434 
435 	if ((ret_m = vsw_tx_msg(vswp, mp, caller, port)) != NULL) {
436 		DERR(vswp, "%s: drop mblks to "
437 		    "phys dev", __func__);
438 		freemsgchain(ret_m);
439 	}
440 }
441 
/*
 * Switch the given ethernet frame when operating in layer 2 mode.
 *
 * vswp: pointer to the vsw instance
 * mp: pointer to chain of ethernet frame(s) to be switched
 * caller: identifies the source of this frame as:
 *		1. VSW_VNETPORT - a vsw port (connected to a vnet).
 *		2. VSW_PHYSDEV - the physical ethernet device
 *		3. VSW_LOCALDEV - vsw configured as a virtual interface
 * arg: argument provided by the caller.
 *		1. for VNETPORT - pointer to the corresponding vsw_port_t.
 *		2. for PHYSDEV - NULL
 *		3. for LOCALDEV - pointer to this vsw_t(self)
 * mrh: mac resource handle; it is only passed through to vsw_mac_rx().
 *
 * The chain is processed one sub-chain at a time, as returned by
 * vsw_get_same_dest_list() for the ethernet header of the first remaining
 * frame. Each sub-chain is either handed to a transmit/receive routine or
 * freed here.
 */
void
vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller,
			vsw_port_t *arg, mac_resource_handle_t mrh)
{
	struct ether_header	*ehp;
	mblk_t			*bp, *ret_m;
	vsw_fdbe_t		*fp;

	D1(vswp, "%s: enter (caller %d)", __func__, caller);

	/*
	 * PERF: rather than breaking up the chain here, scan it
	 * to find all mblks heading to same destination and then
	 * pass that sub-chain to the lower transmit functions.
	 */

	/* process the chain of packets */
	bp = mp;
	while (bp) {
		/*
		 * ehp is taken from the head of the remaining chain; mp is
		 * then reused to hold the sub-chain returned for it, and bp
		 * is updated through the pointer passed in.
		 */
		ehp = (struct ether_header *)bp->b_rptr;
		mp = vsw_get_same_dest_list(ehp, &bp);
		ASSERT(mp != NULL);

		D2(vswp, "%s: mblk data buffer %lld : actual data size %lld",
		    __func__, MBLKSIZE(mp), MBLKL(mp));

		if (ether_cmp(&ehp->ether_dhost, &vswp->if_addr) == 0) {
			/*
			 * Destination is VSW_LOCALDEV (vsw as an eth
			 * interface): hand the packet to vsw_mac_rx() so it
			 * can go up the stack on this host, unless it came
			 * from the local device itself, in which case it is
			 * dropped.
			 */
			if (caller != VSW_LOCALDEV) {
				vsw_mac_rx(vswp, mrh, mp, VSW_MACRX_FREEMSG);
			} else {
				freemsgchain(mp);
			}
			continue;
		}

		/*
		 * Find fdb entry for the destination
		 * and hold a reference to it.
		 */
		fp = vsw_fdbe_find(vswp, &ehp->ether_dhost);
		if (fp != NULL) {

			/*
			 * If plumbed and in promisc mode then copy msg
			 * and send up the stack.
			 */
			vsw_mac_rx(vswp, mrh, mp,
			    VSW_MACRX_PROMISC | VSW_MACRX_COPYMSG);

			/*
			 * If the destination is in FDB, the packet
			 * should be forwarded to the corresponding
			 * vsw_port (connected to a vnet device -
			 * VSW_VNETPORT)
			 */
			(void) vsw_portsend(fp->portp, mp);

			/* Release the reference on the fdb entry */
			VSW_FDBE_REFRELE(fp);
		} else {
			/*
			 * Destination not in FDB.
			 *
			 * If the destination is broadcast or
			 * multicast forward the packet to all
			 * (VNETPORTs, PHYSDEV, LOCALDEV),
			 * except the caller.
			 */
			if (IS_BROADCAST(ehp)) {
				D2(vswp, "%s: BROADCAST pkt", __func__);
				(void) vsw_forward_all(vswp, mp, caller, arg);
			} else if (IS_MULTICAST(ehp)) {
				D2(vswp, "%s: MULTICAST pkt", __func__);
				(void) vsw_forward_grp(vswp, mp, caller, arg);
			} else {
				/*
				 * If the destination is unicast, and came
				 * from either a logical network device or
				 * the switch itself when it is plumbed, then
				 * send it out on the physical device and also
				 * up the stack if the logical interface is
				 * in promiscuous mode.
				 *
				 * NOTE:  The assumption here is that if we
				 * cannot find the destination in our fdb, its
				 * a unicast address, and came from either a
				 * vnet or down the stack (when plumbed) it
				 * must be destined for an ethernet device
				 * outside our ldoms.
				 */
				if (caller == VSW_VNETPORT) {
					/* promisc check copy etc */
					vsw_mac_rx(vswp, mrh, mp,
					    VSW_MACRX_PROMISC |
					    VSW_MACRX_COPYMSG);

					/* anything handed back is dropped */
					if ((ret_m = vsw_tx_msg(vswp, mp,
					    caller, arg)) != NULL) {
						DERR(vswp, "%s: drop mblks to "
						    "phys dev", __func__);
						freemsgchain(ret_m);
					}

				} else if (caller == VSW_PHYSDEV) {
					/*
					 * Pkt seen because card in promisc
					 * mode. Send up stack if plumbed in
					 * promisc mode, else drop it.
					 */
					vsw_mac_rx(vswp, mrh, mp,
					    VSW_MACRX_PROMISC |
					    VSW_MACRX_FREEMSG);

				} else if (caller == VSW_LOCALDEV) {
					/*
					 * Pkt came down the stack, send out
					 * over physical device.
					 */
					if ((ret_m = vsw_tx_msg(vswp, mp,
					    caller, NULL)) != NULL) {
						DERR(vswp, "%s: drop mblks to "
						    "phys dev", __func__);
						freemsgchain(ret_m);
					}
				}
			}
		}
	}
	D1(vswp, "%s: exit\n", __func__);
}
592 
593 /*
594  * Switch ethernet frame when in layer 3 mode (i.e. using IP
595  * layer to do the routing).
596  *
597  * There is a large amount of overlap between this function and
598  * vsw_switch_l2_frame. At some stage we need to revisit and refactor
599  * both these functions.
600  */
601 void
602 vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller,
603 			vsw_port_t *arg, mac_resource_handle_t mrh)
604 {
605 	struct ether_header	*ehp;
606 	mblk_t			*bp = NULL;
607 	vsw_fdbe_t		*fp;
608 
609 	D1(vswp, "%s: enter (caller %d)", __func__, caller);
610 
611 	/*
612 	 * In layer 3 mode should only ever be switching packets
613 	 * between IP layer and vnet devices. So make sure thats
614 	 * who is invoking us.
615 	 */
616 	if ((caller != VSW_LOCALDEV) && (caller != VSW_VNETPORT)) {
617 		DERR(vswp, "%s: unexpected caller (%d)", __func__, caller);
618 		freemsgchain(mp);
619 		return;
620 	}
621 
622 	/* process the chain of packets */
623 	bp = mp;
624 	while (bp) {
625 		ehp = (struct ether_header *)bp->b_rptr;
626 		mp = vsw_get_same_dest_list(ehp, &bp);
627 		ASSERT(mp != NULL);
628 
629 		D2(vswp, "%s: mblk data buffer %lld : actual data size %lld",
630 		    __func__, MBLKSIZE(mp), MBLKL(mp));
631 
632 		/*
633 		 * Find fdb entry for the destination
634 		 * and hold a reference to it.
635 		 */
636 		fp = vsw_fdbe_find(vswp, &ehp->ether_dhost);
637 		if (fp != NULL) {
638 
639 			D2(vswp, "%s: sending to target port", __func__);
640 			(void) vsw_portsend(fp->portp, mp);
641 
642 			/* Release the reference on the fdb entry */
643 			VSW_FDBE_REFRELE(fp);
644 		} else {
645 			/*
646 			 * Destination not in FDB
647 			 *
648 			 * If the destination is broadcast or
649 			 * multicast forward the packet to all
650 			 * (VNETPORTs, PHYSDEV, LOCALDEV),
651 			 * except the caller.
652 			 */
653 			if (IS_BROADCAST(ehp)) {
654 				D2(vswp, "%s: BROADCAST pkt", __func__);
655 				(void) vsw_forward_all(vswp, mp, caller, arg);
656 			} else if (IS_MULTICAST(ehp)) {
657 				D2(vswp, "%s: MULTICAST pkt", __func__);
658 				(void) vsw_forward_grp(vswp, mp, caller, arg);
659 			} else {
660 				/*
661 				 * Unicast pkt from vnet that we don't have
662 				 * an FDB entry for, so must be destinded for
663 				 * the outside world. Attempt to send up to the
664 				 * IP layer to allow it to deal with it.
665 				 */
666 				if (caller == VSW_VNETPORT) {
667 					vsw_mac_rx(vswp, mrh,
668 					    mp, VSW_MACRX_FREEMSG);
669 				}
670 			}
671 		}
672 	}
673 
674 	D1(vswp, "%s: exit", __func__);
675 }
676 
677 /*
678  * Additional initializations that are needed for the specific switching mode.
679  */
680 void
681 vsw_setup_switching_post_process(vsw_t *vswp)
682 {
683 	link_state_t	link_state = LINK_STATE_UP;
684 
685 	if (vswp->smode & VSW_LAYER2) {
686 		/*
687 		 * Program unicst, mcst addrs of vsw
688 		 * interface and ports in the physdev.
689 		 */
690 		vsw_set_addrs(vswp);
691 
692 		/* Start HIO for ports that have already connected */
693 		vsw_hio_start_ports(vswp);
694 
695 		if (vswp->pls_update == B_TRUE) {
696 			link_state = vswp->phys_link_state;
697 		}
698 
699 		/* Update physical link info to any ports already connected */
700 		vsw_physlink_update_ports(vswp);
701 	}
702 
703 	vsw_mac_link_update(vswp, link_state);
704 }
705 
706 /*
707  * Forward the ethernet frame to all ports (VNETPORTs, PHYSDEV, LOCALDEV),
708  * except the caller (port on which frame arrived).
709  */
710 static int
711 vsw_forward_all(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg)
712 {
713 	vsw_port_list_t	*plist = &vswp->plist;
714 	vsw_port_t	*portp;
715 	mblk_t		*nmp = NULL;
716 	mblk_t		*ret_m = NULL;
717 	int		skip_port = 0;
718 
719 	D1(vswp, "vsw_forward_all: enter\n");
720 
721 	/*
722 	 * Broadcast message from inside ldoms so send to outside
723 	 * world if in either of layer 2 modes.
724 	 */
725 	if ((vswp->smode & VSW_LAYER2) &&
726 	    ((caller == VSW_LOCALDEV) || (caller == VSW_VNETPORT))) {
727 
728 		nmp = vsw_dupmsgchain(mp);
729 		if (nmp) {
730 			if ((ret_m = vsw_tx_msg(vswp, nmp, caller, arg))
731 			    != NULL) {
732 				DERR(vswp, "%s: dropping pkt(s) "
733 				    "consisting of %ld bytes of data for"
734 				    " physical device", __func__, MBLKL(ret_m));
735 				freemsgchain(ret_m);
736 			}
737 		}
738 	}
739 
740 	if (caller == VSW_VNETPORT)
741 		skip_port = 1;
742 
743 	/*
744 	 * Broadcast message from other vnet (layer 2 or 3) or outside
745 	 * world (layer 2 only), send up stack if plumbed.
746 	 */
747 	if ((caller == VSW_PHYSDEV) || (caller == VSW_VNETPORT)) {
748 		vsw_mac_rx(vswp, NULL, mp, VSW_MACRX_COPYMSG);
749 	}
750 
751 	/* send it to all VNETPORTs */
752 	READ_ENTER(&plist->lockrw);
753 	for (portp = plist->head; portp != NULL; portp = portp->p_next) {
754 		D2(vswp, "vsw_forward_all: port %d", portp->p_instance);
755 		/*
756 		 * Caution ! - don't reorder these two checks as arg
757 		 * will be NULL if the caller is PHYSDEV. skip_port is
758 		 * only set if caller is VNETPORT.
759 		 */
760 		if ((skip_port) && (portp == arg)) {
761 			continue;
762 		} else {
763 			nmp = vsw_dupmsgchain(mp);
764 			if (nmp) {
765 				/*
766 				 * The plist->lockrw is protecting the
767 				 * portp from getting destroyed here.
768 				 * So, no ref_cnt is incremented here.
769 				 */
770 				(void) vsw_portsend(portp, nmp);
771 			} else {
772 				DERR(vswp, "vsw_forward_all: nmp NULL");
773 			}
774 		}
775 	}
776 	RW_EXIT(&plist->lockrw);
777 
778 	freemsgchain(mp);
779 
780 	D1(vswp, "vsw_forward_all: exit\n");
781 	return (0);
782 }
783 
/*
 * Forward pkts to any devices or interfaces which have registered
 * an interest in them (i.e. multicast groups).
 *
 * vswp: pointer to the vsw instance
 * mp: chain of frame(s) to forward; the destination address is taken from
 *	the ethernet header of the first message. Each destination gets a
 *	duplicate and the original chain is freed before returning.
 * caller: source of the frames (VSW_VNETPORT, VSW_PHYSDEV or VSW_LOCALDEV).
 * arg: for VSW_VNETPORT the originating port, which is skipped.
 *
 * Returns: 0 always.
 */
static int
vsw_forward_grp(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg)
{
	struct ether_header	*ehp = (struct ether_header *)mp->b_rptr;
	mfdb_ent_t		*entp = NULL;
	mfdb_ent_t		*tpp = NULL;
	vsw_port_t 		*port;
	uint64_t		key = 0;
	mblk_t			*nmp = NULL;
	mblk_t			*ret_m = NULL;
	boolean_t		check_if = B_TRUE;

	/*
	 * Convert address to hash table key
	 */
	KEY_HASH(key, &ehp->ether_dhost);

	D1(vswp, "%s: key 0x%llx", __func__, key);

	/*
	 * If pkt came from either a vnet or down the stack (if we are
	 * plumbed) and we are in layer 2 mode, then we send the pkt out
	 * over the physical adapter, and then check to see if any other
	 * vnets are interested in it.
	 */
	if ((vswp->smode & VSW_LAYER2) &&
	    ((caller == VSW_VNETPORT) || (caller == VSW_LOCALDEV))) {
		nmp = vsw_dupmsgchain(mp);
		if (nmp) {
			/* anything handed back by vsw_tx_msg() is dropped */
			if ((ret_m = vsw_tx_msg(vswp, nmp, caller, arg))
			    != NULL) {
				DERR(vswp, "%s: dropping pkt(s) consisting of "
				    "%ld bytes of data for physical device",
				    __func__, MBLKL(ret_m));
				freemsgchain(ret_m);
			}
		}
	}

	/* the multicast table is walked with mfdbrw held as reader */
	READ_ENTER(&vswp->mfdbrw);
	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)key,
	    (mod_hash_val_t *)&entp) != 0) {
		D3(vswp, "%s: no table entry found for addr 0x%llx",
		    __func__, key);
	} else {
		/*
		 * Send to list of devices associated with this address...
		 */
		for (tpp = entp; tpp != NULL; tpp = tpp->nextp) {

			/* dont send to ourselves */
			if ((caller == VSW_VNETPORT) &&
			    (tpp->d_addr == (void *)arg)) {
				port = (vsw_port_t *)tpp->d_addr;
				D3(vswp, "%s: not sending to ourselves"
				    " : port %d", __func__, port->p_instance);
				continue;

			} else if ((caller == VSW_LOCALDEV) &&
			    (tpp->d_type == VSW_LOCALDEV)) {
				D2(vswp, "%s: not sending back up stack",
				    __func__);
				continue;
			}

			if (tpp->d_type == VSW_VNETPORT) {
				port = (vsw_port_t *)tpp->d_addr;
				D3(vswp, "%s: sending to port %ld for addr "
				    "0x%llx", __func__, port->p_instance, key);

				nmp = vsw_dupmsgchain(mp);
				if (nmp) {
					/*
					 * The vswp->mfdbrw is protecting the
					 * portp from getting destroyed here.
					 * So, no ref_cnt is incremented here.
					 */
					(void) vsw_portsend(port, nmp);
				}
			} else {
				/*
				 * Entry is not a vnet port: send a copy up
				 * the stack and remember that this has been
				 * done, so the check after the loop is
				 * skipped.
				 */
				vsw_mac_rx(vswp, NULL,
				    mp, VSW_MACRX_COPYMSG);
				D2(vswp, "%s: sending up stack"
				    " for addr 0x%llx", __func__, key);
				check_if = B_FALSE;
			}
		}
	}

	RW_EXIT(&vswp->mfdbrw);

	/*
	 * If the pkt came from either a vnet or from physical device,
	 * and if we havent already sent the pkt up the stack then we
	 * check now if we can/should (i.e. the interface is plumbed
	 * and in promisc mode).
	 */
	if ((check_if) &&
	    ((caller == VSW_VNETPORT) || (caller == VSW_PHYSDEV))) {
		vsw_mac_rx(vswp, NULL, mp,
		    VSW_MACRX_PROMISC | VSW_MACRX_COPYMSG);
	}

	/* all consumers above worked on copies; release the original */
	freemsgchain(mp);

	D1(vswp, "%s: exit", __func__);

	return (0);
}
897 
/*
 * This function creates the vlan id hash table for the given vsw device or
 * port. It then adds each vlan that the device or port has been assigned,
 * into this hash table.
 * Arguments:
 *   arg:  vsw device or port.
 *   type: type of arg; VSW_LOCALDEV(vsw device) or VSW_VNETPORT(port).
 */
void
vsw_create_vlans(void *arg, int type)
{
	/* create vlan hash table */
	vsw_vlan_create_hash(arg, type);

	/* add vlan ids of the vsw device or port into its hash table */
	vsw_vlan_add_ids(arg, type);
}
915 
/*
 * This function removes the vlan ids of the vsw device or port from its hash
 * table. It then destroys the vlan hash table.
 * Arguments:
 *   arg:  vsw device or port.
 *   type: type of arg; VSW_LOCALDEV(vsw device) or VSW_VNETPORT(port).
 */
void
vsw_destroy_vlans(void *arg, int type)
{
	/* remove vlan ids of the vsw device or port from the hash table */
	vsw_vlan_remove_ids(arg, type);

	/* destroy vlan-hash-table */
	vsw_vlan_destroy_hash(arg, type);
}
932 
933 /*
934  * Create a vlan-id hash table for the given vsw device or port.
935  */
936 static void
937 vsw_vlan_create_hash(void *arg, int type)
938 {
939 	char		hashname[MAXNAMELEN];
940 
941 	if (type == VSW_LOCALDEV) {
942 		vsw_t		*vswp = (vsw_t *)arg;
943 
944 		(void) snprintf(hashname, MAXNAMELEN, "vsw%d-vlan-hash",
945 		    vswp->instance);
946 
947 		vswp->vlan_nchains = vsw_vlan_nchains;
948 		vswp->vlan_hashp = mod_hash_create_idhash(hashname,
949 		    vswp->vlan_nchains, mod_hash_null_valdtor);
950 
951 	} else if (type == VSW_VNETPORT) {
952 		vsw_port_t	*portp = (vsw_port_t *)arg;
953 
954 		(void) snprintf(hashname, MAXNAMELEN, "port%d-vlan-hash",
955 		    portp->p_instance);
956 
957 		portp->vlan_nchains = vsw_vlan_nchains;
958 		portp->vlan_hashp = mod_hash_create_idhash(hashname,
959 		    portp->vlan_nchains, mod_hash_null_valdtor);
960 
961 	} else {
962 		return;
963 	}
964 }
965 
966 /*
967  * Destroy the vlan-id hash table for the given vsw device or port.
968  */
969 static void
970 vsw_vlan_destroy_hash(void *arg, int type)
971 {
972 	if (type == VSW_LOCALDEV) {
973 		vsw_t		*vswp = (vsw_t *)arg;
974 
975 		mod_hash_destroy_hash(vswp->vlan_hashp);
976 		vswp->vlan_nchains = 0;
977 	} else if (type == VSW_VNETPORT) {
978 		vsw_port_t	*portp = (vsw_port_t *)arg;
979 
980 		mod_hash_destroy_hash(portp->vlan_hashp);
981 		portp->vlan_nchains = 0;
982 	} else {
983 		return;
984 	}
985 }
986 
987 /*
988  * Add vlan ids of the given vsw device or port into its hash table.
989  */
990 void
991 vsw_vlan_add_ids(void *arg, int type)
992 {
993 	int	rv;
994 	int	i;
995 
996 	if (type == VSW_LOCALDEV) {
997 		vsw_t		*vswp = (vsw_t *)arg;
998 
999 		rv = mod_hash_insert(vswp->vlan_hashp,
1000 		    (mod_hash_key_t)VLAN_ID_KEY(vswp->pvid),
1001 		    (mod_hash_val_t)B_TRUE);
1002 		if (rv != 0) {
1003 			cmn_err(CE_WARN, "vsw%d: Duplicate vlan-id(%d) for "
1004 			    "the interface", vswp->instance, vswp->pvid);
1005 		}
1006 
1007 		for (i = 0; i < vswp->nvids; i++) {
1008 			rv = mod_hash_insert(vswp->vlan_hashp,
1009 			    (mod_hash_key_t)VLAN_ID_KEY(vswp->vids[i].vl_vid),
1010 			    (mod_hash_val_t)B_TRUE);
1011 			if (rv != 0) {
1012 				cmn_err(CE_WARN, "vsw%d: Duplicate vlan-id(%d)"
1013 				    " for the interface", vswp->instance,
1014 				    vswp->pvid);
1015 			}
1016 		}
1017 
1018 	} else if (type == VSW_VNETPORT) {
1019 		vsw_port_t	*portp = (vsw_port_t *)arg;
1020 		vsw_t		*vswp = portp->p_vswp;
1021 
1022 		rv = mod_hash_insert(portp->vlan_hashp,
1023 		    (mod_hash_key_t)VLAN_ID_KEY(portp->pvid),
1024 		    (mod_hash_val_t)B_TRUE);
1025 		if (rv != 0) {
1026 			cmn_err(CE_WARN, "vsw%d: Duplicate vlan-id(%d) for "
1027 			    "the port(%d)", vswp->instance, vswp->pvid,
1028 			    portp->p_instance);
1029 		}
1030 
1031 		for (i = 0; i < portp->nvids; i++) {
1032 			rv = mod_hash_insert(portp->vlan_hashp,
1033 			    (mod_hash_key_t)VLAN_ID_KEY(portp->vids[i].vl_vid),
1034 			    (mod_hash_val_t)B_TRUE);
1035 			if (rv != 0) {
1036 				cmn_err(CE_WARN, "vsw%d: Duplicate vlan-id(%d)"
1037 				    " for the port(%d)", vswp->instance,
1038 				    vswp->pvid, portp->p_instance);
1039 			}
1040 		}
1041 
1042 	}
1043 }
1044 
1045 /*
1046  * Remove vlan ids of the given vsw device or port from its hash table.
1047  */
1048 void
1049 vsw_vlan_remove_ids(void *arg, int type)
1050 {
1051 	mod_hash_val_t	vp;
1052 	int		rv;
1053 	int		i;
1054 
1055 	if (type == VSW_LOCALDEV) {
1056 		vsw_t		*vswp = (vsw_t *)arg;
1057 
1058 		rv = vsw_vlan_lookup(vswp->vlan_hashp, vswp->pvid);
1059 		if (rv == B_TRUE) {
1060 			rv = mod_hash_remove(vswp->vlan_hashp,
1061 			    (mod_hash_key_t)VLAN_ID_KEY(vswp->pvid),
1062 			    (mod_hash_val_t *)&vp);
1063 			ASSERT(rv == 0);
1064 		}
1065 
1066 		for (i = 0; i < vswp->nvids; i++) {
1067 			rv = vsw_vlan_lookup(vswp->vlan_hashp,
1068 			    vswp->vids[i].vl_vid);
1069 			if (rv == B_TRUE) {
1070 				rv = mod_hash_remove(vswp->vlan_hashp,
1071 				    (mod_hash_key_t)VLAN_ID_KEY(
1072 				    vswp->vids[i].vl_vid),
1073 				    (mod_hash_val_t *)&vp);
1074 				ASSERT(rv == 0);
1075 			}
1076 		}
1077 
1078 	} else if (type == VSW_VNETPORT) {
1079 		vsw_port_t	*portp = (vsw_port_t *)arg;
1080 
1081 		portp = (vsw_port_t *)arg;
1082 		rv = vsw_vlan_lookup(portp->vlan_hashp, portp->pvid);
1083 		if (rv == B_TRUE) {
1084 			rv = mod_hash_remove(portp->vlan_hashp,
1085 			    (mod_hash_key_t)VLAN_ID_KEY(portp->pvid),
1086 			    (mod_hash_val_t *)&vp);
1087 			ASSERT(rv == 0);
1088 		}
1089 
1090 		for (i = 0; i < portp->nvids; i++) {
1091 			rv = vsw_vlan_lookup(portp->vlan_hashp,
1092 			    portp->vids[i].vl_vid);
1093 			if (rv == B_TRUE) {
1094 				rv = mod_hash_remove(portp->vlan_hashp,
1095 				    (mod_hash_key_t)VLAN_ID_KEY(
1096 				    portp->vids[i].vl_vid),
1097 				    (mod_hash_val_t *)&vp);
1098 				ASSERT(rv == 0);
1099 			}
1100 		}
1101 
1102 	} else {
1103 		return;
1104 	}
1105 }
1106 
1107 /*
1108  * Find the given vlan id in the hash table.
1109  * Return: B_TRUE if the id is found; B_FALSE if not found.
1110  */
1111 boolean_t
1112 vsw_vlan_lookup(mod_hash_t *vlan_hashp, uint16_t vid)
1113 {
1114 	int		rv;
1115 	mod_hash_val_t	vp;
1116 
1117 	rv = mod_hash_find(vlan_hashp, VLAN_ID_KEY(vid), (mod_hash_val_t *)&vp);
1118 
1119 	if (rv != 0)
1120 		return (B_FALSE);
1121 
1122 	return (B_TRUE);
1123 }
1124 
1125 /*
1126  * Add an entry into FDB for the given vsw.
1127  */
1128 void
1129 vsw_fdbe_add(vsw_t *vswp, void *port)
1130 {
1131 	uint64_t	addr = 0;
1132 	vsw_port_t	*portp;
1133 	vsw_fdbe_t	*fp;
1134 	int		rv;
1135 
1136 	portp = (vsw_port_t *)port;
1137 	KEY_HASH(addr, &portp->p_macaddr);
1138 
1139 	fp = kmem_zalloc(sizeof (vsw_fdbe_t), KM_SLEEP);
1140 	fp->portp = port;
1141 
1142 	/*
1143 	 * Note: duplicate keys will be rejected by mod_hash.
1144 	 */
1145 	rv = mod_hash_insert(vswp->fdb_hashp, (mod_hash_key_t)addr,
1146 	    (mod_hash_val_t)fp);
1147 	if (rv != 0) {
1148 		cmn_err(CE_WARN, "vsw%d: Duplicate mac-address(%s) for "
1149 		    "the port(%d)", vswp->instance,
1150 		    ether_sprintf(&portp->p_macaddr), portp->p_instance);
1151 	}
1152 }
1153 
1154 /*
1155  * Remove an entry from FDB.
1156  */
1157 void
1158 vsw_fdbe_del(vsw_t *vswp, struct ether_addr *eaddr)
1159 {
1160 	uint64_t	addr = 0;
1161 	vsw_fdbe_t	*fp;
1162 	int		rv;
1163 
1164 	KEY_HASH(addr, eaddr);
1165 
1166 	/*
1167 	 * Remove the entry from fdb hash table.
1168 	 * This prevents further references to this fdb entry.
1169 	 */
1170 	rv = mod_hash_remove(vswp->fdb_hashp, (mod_hash_key_t)addr,
1171 	    (mod_hash_val_t *)&fp);
1172 	if (rv != 0) {
1173 		/* invalid key? */
1174 		return;
1175 	}
1176 
1177 	/*
1178 	 * If there are threads already ref holding before the entry was
1179 	 * removed from hash table, then wait for ref count to drop to zero.
1180 	 */
1181 	while (fp->refcnt != 0) {
1182 		delay(drv_usectohz(vsw_fdbe_refcnt_delay));
1183 	}
1184 
1185 	kmem_free(fp, sizeof (*fp));
1186 }
1187 
1188 /*
1189  * Search fdb for a given mac address. If an entry is found, hold
1190  * a reference to it and return the entry, else returns NULL.
1191  */
1192 static vsw_fdbe_t *
1193 vsw_fdbe_find(vsw_t *vswp, struct ether_addr *addrp)
1194 {
1195 	uint64_t	key = 0;
1196 	vsw_fdbe_t	*fp;
1197 	int		rv;
1198 
1199 	KEY_HASH(key, addrp);
1200 
1201 	rv = mod_hash_find_cb(vswp->fdb_hashp, (mod_hash_key_t)key,
1202 	    (mod_hash_val_t *)&fp, vsw_fdbe_find_cb);
1203 
1204 	if (rv != 0)
1205 		return (NULL);
1206 
1207 	return (fp);
1208 }
1209 
1210 /*
1211  * Callback function provided to mod_hash_find_cb(). After finding the fdb
1212  * entry corresponding to the key (macaddr), this callback will be invoked by
1213  * mod_hash_find_cb() to atomically increment the reference count on the fdb
1214  * entry before returning the found entry.
1215  */
1216 static void
1217 vsw_fdbe_find_cb(mod_hash_key_t key, mod_hash_val_t val)
1218 {
1219 	_NOTE(ARGUNUSED(key))
1220 	VSW_FDBE_REFHOLD((vsw_fdbe_t *)val);
1221 }
1222 
1223 /*
1224  * A given frame must be always tagged with the appropriate vlan id (unless it
1225  * is in the default-vlan) before the mac address switching function is called.
1226  * Otherwise, after switching function determines the destination, we cannot
1227  * figure out if the destination belongs to the the same vlan that the frame
1228  * originated from and if it needs tag/untag. Frames which are inbound from
1229  * the external(physical) network over a vlan trunk link are always tagged.
1230  * However frames which are received from a vnet-port over ldc or frames which
1231  * are coming down the stack on the service domain over vsw interface may be
1232  * untagged. These frames must be tagged with the appropriate pvid of the
1233  * sender (vnet-port or vsw device), before invoking the switching function.
1234  *
1235  * Arguments:
1236  *   arg:    caller of the function.
1237  *   type:   type of arg(caller): VSW_LOCALDEV(vsw) or VSW_VNETPORT(port)
1238  *   mp:     frame(s) to be tagged.
1239  */
1240 mblk_t *
1241 vsw_vlan_frame_pretag(void *arg, int type, mblk_t *mp)
1242 {
1243 	vsw_t			*vswp;
1244 	vsw_port_t		*portp;
1245 	struct ether_header	*ehp;
1246 	mblk_t			*bp;
1247 	mblk_t			*bpt;
1248 	mblk_t			*bph;
1249 	mblk_t			*bpn;
1250 	uint16_t		pvid;
1251 
1252 	ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));
1253 
1254 	if (type == VSW_LOCALDEV) {
1255 		vswp = (vsw_t *)arg;
1256 		pvid = vswp->pvid;
1257 		portp = NULL;
1258 	} else {
1259 		/* VSW_VNETPORT */
1260 		portp = (vsw_port_t *)arg;
1261 		pvid = portp->pvid;
1262 		vswp = portp->p_vswp;
1263 	}
1264 
1265 	bpn = bph = bpt = NULL;
1266 
1267 	for (bp = mp; bp != NULL; bp = bpn) {
1268 
1269 		bpn = bp->b_next;
1270 		bp->b_next = bp->b_prev = NULL;
1271 
1272 		/* Determine if it is an untagged frame */
1273 		ehp = (struct ether_header *)bp->b_rptr;
1274 
1275 		if (ehp->ether_type != ETHERTYPE_VLAN) {	/* untagged */
1276 
1277 			/* no need to tag if the frame is in default vlan */
1278 			if (pvid != vswp->default_vlan_id) {
1279 				bp = vnet_vlan_insert_tag(bp, pvid);
1280 				if (bp == NULL) {
1281 					continue;
1282 				}
1283 			}
1284 		}
1285 
1286 		/* build a chain of processed packets */
1287 		if (bph == NULL) {
1288 			bph = bpt = bp;
1289 		} else {
1290 			bpt->b_next = bp;
1291 			bpt = bp;
1292 		}
1293 
1294 	}
1295 
1296 	return (bph);
1297 }
1298 
1299 /*
1300  * Frames destined to a vnet-port or to the local vsw interface, must be
1301  * untagged if necessary before sending. This function first checks that the
1302  * frame can be sent to the destination in the vlan identified by the frame
1303  * tag. Note that when this function is invoked the frame must have been
1304  * already tagged (unless it is in the default-vlan). Because, this function is
1305  * called when the switching function determines the destination and invokes
1306  * its send function (vnet-port or vsw interface) and all frames would have
1307  * been tagged by this time (see comments in vsw_vlan_frame_pretag()).
1308  *
1309  * Arguments:
1310  *   arg:    destination device.
1311  *   type:   type of arg(destination): VSW_LOCALDEV(vsw) or VSW_VNETPORT(port)
1312  *   np:     head of pkt chain to be validated and untagged.
1313  *   npt:    tail of pkt chain to be validated and untagged.
1314  *
1315  * Returns:
1316  *   np:     head of updated chain of packets
1317  *   npt:    tail of updated chain of packets
1318  *   rv:     count of the packets in the returned list
1319  */
1320 uint32_t
1321 vsw_vlan_frame_untag(void *arg, int type, mblk_t **np, mblk_t **npt)
1322 {
1323 	mblk_t			*bp;
1324 	mblk_t			*bpt;
1325 	mblk_t			*bph;
1326 	mblk_t			*bpn;
1327 	vsw_port_t		*portp;
1328 	vsw_t			*vswp;
1329 	uint32_t		count;
1330 	struct ether_header	*ehp;
1331 	boolean_t		is_tagged;
1332 	boolean_t		rv;
1333 	uint16_t		vlan_id;
1334 	uint16_t		pvid;
1335 	mod_hash_t		*vlan_hashp;
1336 
1337 	ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));
1338 
1339 
1340 	if (type == VSW_LOCALDEV) {
1341 		vswp = (vsw_t *)arg;
1342 		pvid = vswp->pvid;
1343 		vlan_hashp = vswp->vlan_hashp;
1344 		portp = NULL;
1345 	} else {
1346 		/* type == VSW_VNETPORT */
1347 		portp = (vsw_port_t *)arg;
1348 		vswp = portp->p_vswp;
1349 		vlan_hashp = portp->vlan_hashp;
1350 		pvid = portp->pvid;
1351 	}
1352 
1353 	/*
1354 	 * If the MAC layer switching in place, then
1355 	 * untagging required only if the pvid is not
1356 	 * the same as default_vlan_id. This is because,
1357 	 * the MAC layer will send packets for the
1358 	 * registered vlans only.
1359 	 */
1360 	if ((vswp->mac_cl_switching == B_TRUE) &&
1361 	    (pvid == vswp->default_vlan_id)) {
1362 		/* simply count and set the tail */
1363 		count = 1;
1364 		bp = *np;
1365 		ASSERT(bp != NULL);
1366 		while (bp->b_next != NULL) {
1367 			bp = bp->b_next;
1368 			count++;
1369 		}
1370 		*npt = bp;
1371 		return (count);
1372 	}
1373 
1374 	bpn = bph = bpt = NULL;
1375 	count = 0;
1376 
1377 	for (bp = *np; bp != NULL; bp = bpn) {
1378 
1379 		bpn = bp->b_next;
1380 		bp->b_next = bp->b_prev = NULL;
1381 
1382 		/*
1383 		 * Determine the vlan id that the frame belongs to.
1384 		 */
1385 		ehp = (struct ether_header *)bp->b_rptr;
1386 		is_tagged = vsw_frame_lookup_vid(arg, type, ehp, &vlan_id);
1387 
1388 		/*
1389 		 * If MAC layer switching in place, then we
1390 		 * need to untag only if the tagged packet has
1391 		 * vlan-id same as the pvid.
1392 		 */
1393 		if (vswp->mac_cl_switching == B_TRUE) {
1394 
1395 			/* only tagged packets expected here */
1396 			ASSERT(is_tagged == B_TRUE);
1397 			if (vlan_id == pvid) {
1398 				bp = vnet_vlan_remove_tag(bp);
1399 				if (bp == NULL) {
1400 					/* packet dropped */
1401 					continue;
1402 				}
1403 			}
1404 		} else { /* No MAC layer switching */
1405 
1406 			/*
1407 			 * Check the frame header if tag/untag is  needed.
1408 			 */
1409 			if (is_tagged == B_FALSE) {
1410 				/*
1411 				 * Untagged frame. We shouldn't have an
1412 				 * untagged packet at this point, unless
1413 				 * the destination's  vlan id is
1414 				 * default-vlan-id; if it is not the
1415 				 * default-vlan-id, we drop the packet.
1416 				 */
1417 				if (vlan_id != vswp->default_vlan_id) {
1418 					/* drop the packet */
1419 					freemsg(bp);
1420 					continue;
1421 				}
1422 			} else {	/* Tagged */
1423 				/*
1424 				 * Tagged frame, untag if it's the
1425 				 * destination's pvid.
1426 				 */
1427 				if (vlan_id == pvid) {
1428 
1429 					bp = vnet_vlan_remove_tag(bp);
1430 					if (bp == NULL) {
1431 						/* packet dropped */
1432 						continue;
1433 					}
1434 				} else {
1435 
1436 					/*
1437 					 * Check if the destination is in the
1438 					 * same vlan.
1439 					 */
1440 					rv = vsw_vlan_lookup(vlan_hashp,
1441 					    vlan_id);
1442 					if (rv == B_FALSE) {
1443 						/* drop the packet */
1444 						freemsg(bp);
1445 						continue;
1446 					}
1447 				}
1448 
1449 			}
1450 		}
1451 
1452 		/* build a chain of processed packets */
1453 		if (bph == NULL) {
1454 			bph = bpt = bp;
1455 		} else {
1456 			bpt->b_next = bp;
1457 			bpt = bp;
1458 		}
1459 		count++;
1460 	}
1461 
1462 	*np = bph;
1463 	*npt = bpt;
1464 	return (count);
1465 }
1466 
1467 /*
1468  * Lookup the vlan id of the given frame. If it is a vlan-tagged frame,
1469  * then the vlan-id is available in the tag; otherwise, its vlan id is
1470  * implicitly obtained based on the caller (destination of the frame:
1471  * VSW_VNETPORT or VSW_LOCALDEV).
1472  * The vlan id determined is returned in vidp.
1473  * Returns: B_TRUE if it is a tagged frame; B_FALSE if it is untagged.
1474  */
1475 boolean_t
1476 vsw_frame_lookup_vid(void *arg, int caller, struct ether_header *ehp,
1477 	uint16_t *vidp)
1478 {
1479 	struct ether_vlan_header	*evhp;
1480 	vsw_t				*vswp;
1481 	vsw_port_t			*portp;
1482 
1483 	/* If it's a tagged frame, get the vid from vlan header */
1484 	if (ehp->ether_type == ETHERTYPE_VLAN) {
1485 
1486 		evhp = (struct ether_vlan_header *)ehp;
1487 		*vidp = VLAN_ID(ntohs(evhp->ether_tci));
1488 		return (B_TRUE);
1489 	}
1490 
1491 	/* Untagged frame; determine vlan id based on caller */
1492 	switch (caller) {
1493 
1494 	case VSW_VNETPORT:
1495 		/*
1496 		 * packet destined to a vnet; vlan-id is pvid of vnet-port.
1497 		 */
1498 		portp = (vsw_port_t *)arg;
1499 		*vidp = portp->pvid;
1500 		break;
1501 
1502 	case VSW_LOCALDEV:
1503 
1504 		/*
1505 		 * packet destined to vsw interface;
1506 		 * vlan-id is port-vlan-id of vsw device.
1507 		 */
1508 		vswp = (vsw_t *)arg;
1509 		*vidp = vswp->pvid;
1510 		break;
1511 	}
1512 
1513 	return (B_FALSE);
1514 }
1515 
1516 /*
1517  * Add or remove multicast address(es).
1518  *
1519  * Returns 0 on success, 1 on failure.
1520  */
1521 int
1522 vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port)
1523 {
1524 	mcst_addr_t		*mcst_p = NULL;
1525 	vsw_t			*vswp = port->p_vswp;
1526 	uint64_t		addr = 0x0;
1527 	int			i;
1528 
1529 	D1(vswp, "%s: enter", __func__);
1530 
1531 	D2(vswp, "%s: %d addresses", __func__, mcst_pkt->count);
1532 
1533 	for (i = 0; i < mcst_pkt->count; i++) {
1534 		/*
1535 		 * Convert address into form that can be used
1536 		 * as hash table key.
1537 		 */
1538 		KEY_HASH(addr, &(mcst_pkt->mca[i]));
1539 
1540 		/*
1541 		 * Add or delete the specified address/port combination.
1542 		 */
1543 		if (mcst_pkt->set == 0x1) {
1544 			D3(vswp, "%s: adding multicast address 0x%llx for "
1545 			    "port %ld", __func__, addr, port->p_instance);
1546 			if (vsw_add_mcst(vswp, VSW_VNETPORT, addr, port) == 0) {
1547 				/*
1548 				 * Update the list of multicast
1549 				 * addresses contained within the
1550 				 * port structure to include this new
1551 				 * one.
1552 				 */
1553 				mcst_p = kmem_zalloc(sizeof (mcst_addr_t),
1554 				    KM_NOSLEEP);
1555 				if (mcst_p == NULL) {
1556 					DERR(vswp, "%s: unable to alloc mem",
1557 					    __func__);
1558 					(void) vsw_del_mcst(vswp,
1559 					    VSW_VNETPORT, addr, port);
1560 					return (1);
1561 				}
1562 
1563 				mcst_p->nextp = NULL;
1564 				mcst_p->addr = addr;
1565 				ether_copy(&mcst_pkt->mca[i], &mcst_p->mca);
1566 
1567 				/*
1568 				 * Program the address into HW. If the addr
1569 				 * has already been programmed then the MAC
1570 				 * just increments a ref counter (which is
1571 				 * used when the address is being deleted)
1572 				 */
1573 				if (vsw_mac_multicast_add(vswp, port, mcst_p,
1574 				    VSW_VNETPORT)) {
1575 					(void) vsw_del_mcst(vswp,
1576 					    VSW_VNETPORT, addr, port);
1577 					kmem_free(mcst_p, sizeof (*mcst_p));
1578 					return (1);
1579 				}
1580 
1581 				mutex_enter(&port->mca_lock);
1582 				mcst_p->nextp = port->mcap;
1583 				port->mcap = mcst_p;
1584 				mutex_exit(&port->mca_lock);
1585 
1586 			} else {
1587 				DERR(vswp, "%s: error adding multicast "
1588 				    "address 0x%llx for port %ld",
1589 				    __func__, addr, port->p_instance);
1590 				return (1);
1591 			}
1592 		} else {
1593 			/*
1594 			 * Delete an entry from the multicast hash
1595 			 * table and update the address list
1596 			 * appropriately.
1597 			 */
1598 			if (vsw_del_mcst(vswp, VSW_VNETPORT, addr, port) == 0) {
1599 				D3(vswp, "%s: deleting multicast address "
1600 				    "0x%llx for port %ld", __func__, addr,
1601 				    port->p_instance);
1602 
1603 				mcst_p = vsw_del_addr(VSW_VNETPORT, port, addr);
1604 				ASSERT(mcst_p != NULL);
1605 
1606 				/*
1607 				 * Remove the address from HW. The address
1608 				 * will actually only be removed once the ref
1609 				 * count within the MAC layer has dropped to
1610 				 * zero. I.e. we can safely call this fn even
1611 				 * if other ports are interested in this
1612 				 * address.
1613 				 */
1614 				vsw_mac_multicast_remove(vswp, port, mcst_p,
1615 				    VSW_VNETPORT);
1616 				kmem_free(mcst_p, sizeof (*mcst_p));
1617 
1618 			} else {
1619 				DERR(vswp, "%s: error deleting multicast "
1620 				    "addr 0x%llx for port %ld",
1621 				    __func__, addr, port->p_instance);
1622 				return (1);
1623 			}
1624 		}
1625 	}
1626 	D1(vswp, "%s: exit", __func__);
1627 	return (0);
1628 }
1629 
1630 /*
1631  * Add a new multicast entry.
1632  *
1633  * Search hash table based on address. If match found then
1634  * update associated val (which is chain of ports), otherwise
1635  * create new key/val (addr/port) pair and insert into table.
1636  */
1637 int
1638 vsw_add_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg)
1639 {
1640 	int		dup = 0;
1641 	int		rv = 0;
1642 	mfdb_ent_t	*ment = NULL;
1643 	mfdb_ent_t	*tmp_ent = NULL;
1644 	mfdb_ent_t	*new_ent = NULL;
1645 	void		*tgt = NULL;
1646 
1647 	if (devtype == VSW_VNETPORT) {
1648 		/*
1649 		 * Being invoked from a vnet.
1650 		 */
1651 		ASSERT(arg != NULL);
1652 		tgt = arg;
1653 		D2(NULL, "%s: port %d : address 0x%llx", __func__,
1654 		    ((vsw_port_t *)arg)->p_instance, addr);
1655 	} else {
1656 		/*
1657 		 * We are being invoked via the m_multicst mac entry
1658 		 * point.
1659 		 */
1660 		D2(NULL, "%s: address 0x%llx", __func__, addr);
1661 		tgt = (void *)vswp;
1662 	}
1663 
1664 	WRITE_ENTER(&vswp->mfdbrw);
1665 	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr,
1666 	    (mod_hash_val_t *)&ment) != 0) {
1667 
1668 		/* address not currently in table */
1669 		ment = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP);
1670 		ment->d_addr = (void *)tgt;
1671 		ment->d_type = devtype;
1672 		ment->nextp = NULL;
1673 
1674 		if (mod_hash_insert(vswp->mfdb, (mod_hash_key_t)addr,
1675 		    (mod_hash_val_t)ment) != 0) {
1676 			DERR(vswp, "%s: hash table insertion failed", __func__);
1677 			kmem_free(ment, sizeof (mfdb_ent_t));
1678 			rv = 1;
1679 		} else {
1680 			D2(vswp, "%s: added initial entry for 0x%llx to "
1681 			    "table", __func__, addr);
1682 		}
1683 	} else {
1684 		/*
1685 		 * Address in table. Check to see if specified port
1686 		 * is already associated with the address. If not add
1687 		 * it now.
1688 		 */
1689 		tmp_ent = ment;
1690 		while (tmp_ent != NULL) {
1691 			if (tmp_ent->d_addr == (void *)tgt) {
1692 				if (devtype == VSW_VNETPORT) {
1693 					DERR(vswp, "%s: duplicate port entry "
1694 					    "found for portid %ld and key "
1695 					    "0x%llx", __func__,
1696 					    ((vsw_port_t *)arg)->p_instance,
1697 					    addr);
1698 				} else {
1699 					DERR(vswp, "%s: duplicate entry found"
1700 					    "for key 0x%llx", __func__, addr);
1701 				}
1702 				rv = 1;
1703 				dup = 1;
1704 				break;
1705 			}
1706 			tmp_ent = tmp_ent->nextp;
1707 		}
1708 
1709 		/*
1710 		 * Port not on list so add it to end now.
1711 		 */
1712 		if (0 == dup) {
1713 			D2(vswp, "%s: added entry for 0x%llx to table",
1714 			    __func__, addr);
1715 			new_ent = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP);
1716 			new_ent->d_addr = (void *)tgt;
1717 			new_ent->d_type = devtype;
1718 			new_ent->nextp = NULL;
1719 
1720 			tmp_ent = ment;
1721 			while (tmp_ent->nextp != NULL)
1722 				tmp_ent = tmp_ent->nextp;
1723 
1724 			tmp_ent->nextp = new_ent;
1725 		}
1726 	}
1727 
1728 	RW_EXIT(&vswp->mfdbrw);
1729 	return (rv);
1730 }
1731 
1732 /*
1733  * Remove a multicast entry from the hashtable.
1734  *
1735  * Search hash table based on address. If match found, scan
1736  * list of ports associated with address. If specified port
1737  * found remove it from list.
1738  */
1739 int
1740 vsw_del_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg)
1741 {
1742 	mfdb_ent_t	*ment = NULL;
1743 	mfdb_ent_t	*curr_p, *prev_p;
1744 	void		*tgt = NULL;
1745 
1746 	D1(vswp, "%s: enter", __func__);
1747 
1748 	if (devtype == VSW_VNETPORT) {
1749 		tgt = (vsw_port_t *)arg;
1750 		D2(vswp, "%s: removing port %d from mFDB for address"
1751 		    " 0x%llx", __func__, ((vsw_port_t *)tgt)->p_instance, addr);
1752 	} else {
1753 		D2(vswp, "%s: removing entry", __func__);
1754 		tgt = (void *)vswp;
1755 	}
1756 
1757 	WRITE_ENTER(&vswp->mfdbrw);
1758 	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr,
1759 	    (mod_hash_val_t *)&ment) != 0) {
1760 		D2(vswp, "%s: address 0x%llx not in table", __func__, addr);
1761 		RW_EXIT(&vswp->mfdbrw);
1762 		return (1);
1763 	}
1764 
1765 	prev_p = curr_p = ment;
1766 
1767 	while (curr_p != NULL) {
1768 		if (curr_p->d_addr == (void *)tgt) {
1769 			if (devtype == VSW_VNETPORT) {
1770 				D2(vswp, "%s: port %d found", __func__,
1771 				    ((vsw_port_t *)tgt)->p_instance);
1772 			} else {
1773 				D2(vswp, "%s: instance found", __func__);
1774 			}
1775 
1776 			if (prev_p == curr_p) {
1777 				/*
1778 				 * head of list, if no other element is in
1779 				 * list then destroy this entry, otherwise
1780 				 * just replace it with updated value.
1781 				 */
1782 				ment = curr_p->nextp;
1783 				if (ment == NULL) {
1784 					(void) mod_hash_destroy(vswp->mfdb,
1785 					    (mod_hash_val_t)addr);
1786 				} else {
1787 					(void) mod_hash_replace(vswp->mfdb,
1788 					    (mod_hash_key_t)addr,
1789 					    (mod_hash_val_t)ment);
1790 				}
1791 			} else {
1792 				/*
1793 				 * Not head of list, no need to do
1794 				 * replacement, just adjust list pointers.
1795 				 */
1796 				prev_p->nextp = curr_p->nextp;
1797 			}
1798 			break;
1799 		}
1800 
1801 		prev_p = curr_p;
1802 		curr_p = curr_p->nextp;
1803 	}
1804 
1805 	RW_EXIT(&vswp->mfdbrw);
1806 
1807 	D1(vswp, "%s: exit", __func__);
1808 
1809 	if (curr_p == NULL)
1810 		return (1);
1811 	kmem_free(curr_p, sizeof (mfdb_ent_t));
1812 	return (0);
1813 }
1814 
1815 /*
1816  * Port is being deleted, but has registered an interest in one
1817  * or more multicast groups. Using the list of addresses maintained
1818  * within the port structure find the appropriate entry in the hash
1819  * table and remove this port from the list of interested ports.
1820  */
1821 void
1822 vsw_del_mcst_port(vsw_port_t *port)
1823 {
1824 	mcst_addr_t	*mcap = NULL;
1825 	vsw_t		*vswp = port->p_vswp;
1826 
1827 	D1(vswp, "%s: enter", __func__);
1828 
1829 	mutex_enter(&port->mca_lock);
1830 
1831 	while ((mcap = port->mcap) != NULL) {
1832 
1833 		port->mcap = mcap->nextp;
1834 
1835 		mutex_exit(&port->mca_lock);
1836 
1837 		(void) vsw_del_mcst(vswp, VSW_VNETPORT,
1838 		    mcap->addr, port);
1839 
1840 		/*
1841 		 * Remove the address from HW. The address
1842 		 * will actually only be removed once the ref
1843 		 * count within the MAC layer has dropped to
1844 		 * zero. I.e. we can safely call this fn even
1845 		 * if other ports are interested in this
1846 		 * address.
1847 		 */
1848 		vsw_mac_multicast_remove(vswp, port, mcap, VSW_VNETPORT);
1849 		kmem_free(mcap, sizeof (*mcap));
1850 
1851 		mutex_enter(&port->mca_lock);
1852 
1853 	}
1854 
1855 	mutex_exit(&port->mca_lock);
1856 
1857 	D1(vswp, "%s: exit", __func__);
1858 }
1859 
1860 /*
1861  * This vsw instance is detaching, but has registered an interest in one
1862  * or more multicast groups. Using the list of addresses maintained
1863  * within the vsw structure find the appropriate entry in the hash
1864  * table and remove this instance from the list of interested ports.
1865  */
1866 void
1867 vsw_del_mcst_vsw(vsw_t *vswp)
1868 {
1869 	mcst_addr_t	*next_p = NULL;
1870 
1871 	D1(vswp, "%s: enter", __func__);
1872 
1873 	mutex_enter(&vswp->mca_lock);
1874 
1875 	while (vswp->mcap != NULL) {
1876 		DERR(vswp, "%s: deleting addr 0x%llx",
1877 		    __func__, vswp->mcap->addr);
1878 		(void) vsw_del_mcst(vswp, VSW_LOCALDEV, vswp->mcap->addr, NULL);
1879 
1880 		next_p = vswp->mcap->nextp;
1881 		kmem_free(vswp->mcap, sizeof (mcst_addr_t));
1882 		vswp->mcap = next_p;
1883 	}
1884 
1885 	vswp->mcap = NULL;
1886 	mutex_exit(&vswp->mca_lock);
1887 
1888 	D1(vswp, "%s: exit", __func__);
1889 }
1890 
1891 mblk_t *
1892 vsw_get_same_dest_list(struct ether_header *ehp, mblk_t **mpp)
1893 {
1894 	mblk_t			*bp;
1895 	mblk_t			*nbp;
1896 	mblk_t			*head = NULL;
1897 	mblk_t			*tail = NULL;
1898 	mblk_t			*prev = NULL;
1899 	struct ether_header	*behp;
1900 
1901 	/* process the chain of packets */
1902 	bp = *mpp;
1903 	while (bp) {
1904 		nbp = bp->b_next;
1905 		behp = (struct ether_header *)bp->b_rptr;
1906 		bp->b_prev = NULL;
1907 		if (ether_cmp(&ehp->ether_dhost, &behp->ether_dhost) == 0) {
1908 			if (prev == NULL) {
1909 				*mpp = nbp;
1910 			} else {
1911 				prev->b_next = nbp;
1912 			}
1913 			bp->b_next =  NULL;
1914 			if (head == NULL) {
1915 				head = tail = bp;
1916 			} else {
1917 				tail->b_next = bp;
1918 				tail = bp;
1919 			}
1920 		} else {
1921 			prev = bp;
1922 		}
1923 		bp = nbp;
1924 	}
1925 	return (head);
1926 }
1927 
1928 static mblk_t *
1929 vsw_dupmsgchain(mblk_t *mp)
1930 {
1931 	mblk_t	*nmp = NULL;
1932 	mblk_t	**nmpp = &nmp;
1933 
1934 	for (; mp != NULL; mp = mp->b_next) {
1935 		if ((*nmpp = dupmsg(mp)) == NULL) {
1936 			freemsgchain(nmp);
1937 			return (NULL);
1938 		}
1939 
1940 		nmpp = &((*nmpp)->b_next);
1941 	}
1942 
1943 	return (nmp);
1944 }
1945