xref: /illumos-gate/usr/src/uts/sun4v/io/vsw_switching.c (revision d5ace9454616652a717c9831d949dffa319381f9)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/types.h>
28 #include <sys/errno.h>
29 #include <sys/debug.h>
30 #include <sys/time.h>
31 #include <sys/sysmacros.h>
32 #include <sys/systm.h>
33 #include <sys/user.h>
34 #include <sys/stropts.h>
35 #include <sys/stream.h>
36 #include <sys/strlog.h>
37 #include <sys/strsubr.h>
38 #include <sys/cmn_err.h>
39 #include <sys/cpu.h>
40 #include <sys/kmem.h>
41 #include <sys/conf.h>
42 #include <sys/ddi.h>
43 #include <sys/sunddi.h>
44 #include <sys/ksynch.h>
45 #include <sys/stat.h>
46 #include <sys/kstat.h>
47 #include <sys/vtrace.h>
48 #include <sys/strsun.h>
49 #include <sys/dlpi.h>
50 #include <sys/ethernet.h>
51 #include <net/if.h>
52 #include <sys/varargs.h>
53 #include <sys/machsystm.h>
54 #include <sys/modctl.h>
55 #include <sys/modhash.h>
56 #include <sys/mac.h>
57 #include <sys/mac_ether.h>
58 #include <sys/taskq.h>
59 #include <sys/note.h>
60 #include <sys/mach_descrip.h>
61 #include <sys/mdeg.h>
62 #include <sys/ldc.h>
63 #include <sys/vsw_fdb.h>
64 #include <sys/vsw.h>
65 #include <sys/vio_mailbox.h>
66 #include <sys/vnet_mailbox.h>
67 #include <sys/vnet_common.h>
68 #include <sys/vio_util.h>
69 #include <sys/sdt.h>
70 #include <sys/atomic.h>
71 #include <sys/vlan.h>
72 
73 /* Switching setup routines */
74 void vsw_setup_switching_thread(void *arg);
75 int vsw_setup_switching_start(vsw_t *vswp);
76 void vsw_setup_switching_stop(vsw_t *vswp);
77 int vsw_setup_switching(vsw_t *);
78 void vsw_setup_layer2_post_process(vsw_t *vswp);
79 void vsw_switch_frame_nop(vsw_t *vswp, mblk_t *mp, int caller,
80     vsw_port_t *port, mac_resource_handle_t mrh);
81 static	int vsw_setup_layer2(vsw_t *);
82 static	int vsw_setup_layer3(vsw_t *);
83 
84 /* Switching/data transmit routines */
85 static	void vsw_switch_l2_frame_mac_client(vsw_t *vswp, mblk_t *mp, int caller,
86     vsw_port_t *port, mac_resource_handle_t);
87 static	void vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller,
88 	vsw_port_t *port, mac_resource_handle_t);
89 static	void vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller,
90 	vsw_port_t *port, mac_resource_handle_t);
91 static	int vsw_forward_all(vsw_t *vswp, mblk_t *mp,
92 	int caller, vsw_port_t *port);
93 static	int vsw_forward_grp(vsw_t *vswp, mblk_t *mp,
94     int caller, vsw_port_t *port);
95 
96 /* VLAN routines */
97 void vsw_create_vlans(void *arg, int type);
98 void vsw_destroy_vlans(void *arg, int type);
99 void vsw_vlan_add_ids(void *arg, int type);
100 void vsw_vlan_remove_ids(void *arg, int type);
101 static	void vsw_vlan_create_hash(void *arg, int type);
102 static	void vsw_vlan_destroy_hash(void *arg, int type);
103 boolean_t vsw_frame_lookup_vid(void *arg, int caller, struct ether_header *ehp,
104 	uint16_t *vidp);
105 mblk_t *vsw_vlan_frame_pretag(void *arg, int type, mblk_t *mp);
106 uint32_t vsw_vlan_frames_untag(void *arg, int type, mblk_t **np, mblk_t **npt);
107 boolean_t vsw_vlan_lookup(mod_hash_t *vlan_hashp, uint16_t vid);
108 
109 /* Forwarding database (FDB) routines */
110 void vsw_fdbe_add(vsw_t *vswp, void *port);
111 void vsw_fdbe_del(vsw_t *vswp, struct ether_addr *eaddr);
112 static	vsw_fdbe_t *vsw_fdbe_find(vsw_t *vswp, struct ether_addr *);
113 static void vsw_fdbe_find_cb(mod_hash_key_t key, mod_hash_val_t val);
114 
115 int vsw_add_rem_mcst(vnet_mcast_msg_t *, vsw_port_t *);
116 int vsw_add_mcst(vsw_t *, uint8_t, uint64_t, void *);
117 int vsw_del_mcst(vsw_t *, uint8_t, uint64_t, void *);
118 void vsw_del_mcst_vsw(vsw_t *);
119 
120 /* Support functions */
121 static mblk_t *vsw_dupmsgchain(mblk_t *mp);
122 static mblk_t *vsw_get_same_dest_list(struct ether_header *ehp, mblk_t **mpp);
123 
124 
125 /*
126  * Functions imported from other files.
127  */
128 extern mblk_t *vsw_tx_msg(vsw_t *, mblk_t *, int, vsw_port_t *);
129 extern mcst_addr_t *vsw_del_addr(uint8_t, void *, uint64_t);
130 extern int vsw_mac_open(vsw_t *vswp);
131 extern void vsw_mac_close(vsw_t *vswp);
132 extern void vsw_mac_rx(vsw_t *vswp, mac_resource_handle_t mrh,
133     mblk_t *mp, vsw_macrx_flags_t flags);
134 extern void vsw_set_addrs(vsw_t *vswp);
135 extern int vsw_portsend(vsw_port_t *port, mblk_t *mp);
136 extern void vsw_hio_init(vsw_t *vswp);
137 extern void vsw_hio_start_ports(vsw_t *vswp);
138 extern int vsw_mac_multicast_add(vsw_t *vswp, vsw_port_t *port,
139     mcst_addr_t *mcst_p, int type);
140 extern void vsw_mac_multicast_remove(vsw_t *vswp, vsw_port_t *port,
141     mcst_addr_t *mcst_p, int type);
142 extern void vsw_physlink_state_update(vsw_t *vswp);
143 
144 /*
145  * Tunables used in this file.
146  */
147 extern	int vsw_setup_switching_delay;
148 extern	uint32_t vsw_vlan_nchains;
149 extern	uint32_t vsw_fdbe_refcnt_delay;
150 
/*
 * Take a reference on an FDB entry: atomically increment (p)->refcnt.
 * The ASSERT catches the counter wrapping around to zero.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement and can be used safely in an unbraced if/else.
 */
#define	VSW_FDBE_REFHOLD(p)						\
do {									\
	atomic_inc_32(&(p)->refcnt);					\
	ASSERT((p)->refcnt != 0);					\
	_NOTE(CONSTCOND)						\
} while (0)

/*
 * Drop a reference on an FDB entry: atomically decrement (p)->refcnt.
 * The ASSERT catches a release without a matching hold.
 */
#define	VSW_FDBE_REFRELE(p)						\
do {									\
	ASSERT((p)->refcnt != 0);					\
	atomic_dec_32(&(p)->refcnt);					\
	_NOTE(CONSTCOND)						\
} while (0)
162 
163 /*
164  * Thread to setup switching mode. This thread is created during vsw_attach()
165  * initially. It invokes vsw_setup_switching() and keeps retrying while the
166  * returned value is EAGAIN. The thread exits when the switching mode setup is
167  * done successfully or when the error returned is not EAGAIN. This thread may
168  * also get created from vsw_update_md_prop() if the switching mode needs to be
169  * updated.
170  */
171 void
172 vsw_setup_switching_thread(void *arg)
173 {
174 	callb_cpr_t	cprinfo;
175 	vsw_t		*vswp =  (vsw_t *)arg;
176 	clock_t		wait_time;
177 	clock_t		xwait;
178 	clock_t		wait_rv;
179 	int		rv;
180 
181 	/* wait time used on successive retries */
182 	xwait = drv_usectohz(vsw_setup_switching_delay * MICROSEC);
183 
184 	CALLB_CPR_INIT(&cprinfo, &vswp->sw_thr_lock, callb_generic_cpr,
185 	    "vsw_setup_sw_thread");
186 
187 	mutex_enter(&vswp->sw_thr_lock);
188 
189 	while ((vswp->sw_thr_flags & VSW_SWTHR_STOP) == 0) {
190 
191 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
192 
193 		/* Wait for sometime before (re)trying setup_switching() */
194 		wait_time = ddi_get_lbolt() + xwait;
195 		while ((vswp->sw_thr_flags & VSW_SWTHR_STOP) == 0) {
196 			wait_rv = cv_timedwait(&vswp->sw_thr_cv,
197 			    &vswp->sw_thr_lock, wait_time);
198 			if (wait_rv == -1) {	/* timed out */
199 				break;
200 			}
201 		}
202 
203 		CALLB_CPR_SAFE_END(&cprinfo, &vswp->sw_thr_lock)
204 
205 		if ((vswp->sw_thr_flags & VSW_SWTHR_STOP) != 0) {
206 			/*
207 			 * If there is a stop request, process that first and
208 			 * exit the loop. Continue to hold the mutex which gets
209 			 * released in CALLB_CPR_EXIT().
210 			 */
211 			break;
212 		}
213 
214 		mutex_exit(&vswp->sw_thr_lock);
215 		rv = vsw_setup_switching(vswp);
216 		if (rv == 0) {
217 			vsw_setup_layer2_post_process(vswp);
218 		}
219 		mutex_enter(&vswp->sw_thr_lock);
220 		if (rv != EAGAIN) {
221 			break;
222 		}
223 
224 	}
225 
226 	vswp->sw_thr_flags &= ~VSW_SWTHR_STOP;
227 	vswp->sw_thread = NULL;
228 	CALLB_CPR_EXIT(&cprinfo);
229 	thread_exit();
230 }
231 
232 /*
233  * Create a thread to setup the switching mode.
234  * Returns 0 on success; 1 on failure.
235  */
236 int
237 vsw_setup_switching_start(vsw_t *vswp)
238 {
239 	mutex_enter(&vswp->sw_thr_lock);
240 
241 	vswp->sw_thread = thread_create(NULL, 2 * DEFAULTSTKSZ,
242 	    vsw_setup_switching_thread, vswp, 0, &p0, TS_RUN, minclsyspri);
243 
244 	if (vswp->sw_thread == NULL) {
245 		mutex_exit(&vswp->sw_thr_lock);
246 		return (1);
247 	}
248 
249 	mutex_exit(&vswp->sw_thr_lock);
250 	return (0);
251 }
252 
253 /*
254  * Stop the thread to setup switching mode.
255  */
256 void
257 vsw_setup_switching_stop(vsw_t *vswp)
258 {
259 	kt_did_t	tid = 0;
260 
261 	/*
262 	 * Signal the setup_switching thread to stop and wait until it stops.
263 	 */
264 	mutex_enter(&vswp->sw_thr_lock);
265 
266 	if (vswp->sw_thread != NULL) {
267 		tid = vswp->sw_thread->t_did;
268 		vswp->sw_thr_flags |= VSW_SWTHR_STOP;
269 		cv_signal(&vswp->sw_thr_cv);
270 	}
271 
272 	mutex_exit(&vswp->sw_thr_lock);
273 
274 	if (tid != 0)
275 		thread_join(tid);
276 
277 	(void) atomic_swap_32(&vswp->switching_setup_done, B_FALSE);
278 
279 	vswp->mac_open_retries = 0;
280 }
281 
282 /*
283  * Setup the required switching mode.
284  * Returns:
285  *  0 on success.
286  *  EAGAIN if retry is needed.
287  *  1 on all other failures.
288  */
289 int
290 vsw_setup_switching(vsw_t *vswp)
291 {
292 	int	rv = 1;
293 
294 	D1(vswp, "%s: enter", __func__);
295 
296 	/*
297 	 * Select best switching mode.
298 	 * This is done as this routine can be called from the timeout
299 	 * handler to retry setting up a specific mode. Currently only
300 	 * the function which sets up layer2/promisc mode returns EAGAIN
301 	 * if the underlying network device is not available yet, causing
302 	 * retries.
303 	 */
304 	if (vswp->smode & VSW_LAYER2) {
305 		rv = vsw_setup_layer2(vswp);
306 	} else if (vswp->smode & VSW_LAYER3) {
307 		rv = vsw_setup_layer3(vswp);
308 	} else {
309 		DERR(vswp, "unknown switch mode");
310 		rv = 1;
311 	}
312 
313 	if (rv && (rv != EAGAIN)) {
314 		cmn_err(CE_WARN, "!vsw%d: Unable to setup specified "
315 		    "switching mode", vswp->instance);
316 	} else if (rv == 0) {
317 		(void) atomic_swap_32(&vswp->switching_setup_done, B_TRUE);
318 	}
319 
320 	D2(vswp, "%s: Operating in mode %d", __func__,
321 	    vswp->smode);
322 
323 	D1(vswp, "%s: exit", __func__);
324 
325 	return (rv);
326 }
327 
328 /*
329  * Setup for layer 2 switching.
330  *
331  * Returns:
332  *  0 on success.
333  *  EAGAIN if retry is needed.
334  *  EIO on all other failures.
335  */
336 static int
337 vsw_setup_layer2(vsw_t *vswp)
338 {
339 	int	rv;
340 
341 	D1(vswp, "%s: enter", __func__);
342 
343 	/*
344 	 * Until the network device is successfully opened,
345 	 * set the switching to use vsw_switch_l2_frame.
346 	 */
347 	vswp->vsw_switch_frame = vsw_switch_l2_frame;
348 	vswp->mac_cl_switching = B_FALSE;
349 
350 	rv = strlen(vswp->physname);
351 	if (rv == 0) {
352 		/*
353 		 * Physical device name is NULL, which is
354 		 * required for layer 2.
355 		 */
356 		cmn_err(CE_WARN, "!vsw%d: no network device name specified",
357 		    vswp->instance);
358 		return (EIO);
359 	}
360 
361 	mutex_enter(&vswp->mac_lock);
362 
363 	rv = vsw_mac_open(vswp);
364 	if (rv != 0) {
365 		if (rv != EAGAIN) {
366 			cmn_err(CE_WARN, "!vsw%d: Unable to open network "
367 			    "device: %s\n", vswp->instance, vswp->physname);
368 		}
369 		mutex_exit(&vswp->mac_lock);
370 		return (rv);
371 	}
372 
373 	/*
374 	 * Now we can use the mac client switching, so set the switching
375 	 * function to use vsw_switch_l2_frame_mac_client(), which simply
376 	 * sends the packets to MAC layer for switching.
377 	 */
378 	vswp->vsw_switch_frame = vsw_switch_l2_frame_mac_client;
379 	vswp->mac_cl_switching = B_TRUE;
380 
381 	D1(vswp, "%s: exit", __func__);
382 
383 	/* Initialize HybridIO related stuff */
384 	vsw_hio_init(vswp);
385 
386 	mutex_exit(&vswp->mac_lock);
387 	return (0);
388 
389 exit_error:
390 	vsw_mac_close(vswp);
391 	mutex_exit(&vswp->mac_lock);
392 	return (EIO);
393 }
394 
395 static int
396 vsw_setup_layer3(vsw_t *vswp)
397 {
398 	D1(vswp, "%s: enter", __func__);
399 
400 	D2(vswp, "%s: operating in layer 3 mode", __func__);
401 	vswp->vsw_switch_frame = vsw_switch_l3_frame;
402 
403 	D1(vswp, "%s: exit", __func__);
404 
405 	return (0);
406 }
407 
408 /* ARGSUSED */
409 void
410 vsw_switch_frame_nop(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *port,
411 			mac_resource_handle_t mrh)
412 {
413 	freemsgchain(mp);
414 }
415 
416 /*
417  * Use mac client for layer 2 switching .
418  */
419 static void
420 vsw_switch_l2_frame_mac_client(vsw_t *vswp, mblk_t *mp, int caller,
421     vsw_port_t *port, mac_resource_handle_t mrh)
422 {
423 	_NOTE(ARGUNUSED(mrh))
424 
425 	mblk_t		*ret_m;
426 
427 	/*
428 	 * This switching function is expected to be called by
429 	 * the ports or the interface only. The packets from
430 	 * physical interface already switched.
431 	 */
432 	ASSERT((caller == VSW_VNETPORT) || (caller == VSW_LOCALDEV));
433 
434 	if ((ret_m = vsw_tx_msg(vswp, mp, caller, port)) != NULL) {
435 		DERR(vswp, "%s: drop mblks to "
436 		    "phys dev", __func__);
437 		freemsgchain(ret_m);
438 	}
439 }
440 
441 /*
442  * Switch the given ethernet frame when operating in layer 2 mode.
443  *
444  * vswp: pointer to the vsw instance
445  * mp: pointer to chain of ethernet frame(s) to be switched
446  * caller: identifies the source of this frame as:
447  * 		1. VSW_VNETPORT - a vsw port (connected to a vnet).
448  *		2. VSW_PHYSDEV - the physical ethernet device
449  *		3. VSW_LOCALDEV - vsw configured as a virtual interface
450  * arg: argument provided by the caller.
451  *		1. for VNETPORT - pointer to the corresponding vsw_port_t.
452  *		2. for PHYSDEV - NULL
453  *		3. for LOCALDEV - pointer to to this vsw_t(self)
454  */
455 void
456 vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller,
457 			vsw_port_t *arg, mac_resource_handle_t mrh)
458 {
459 	struct ether_header	*ehp;
460 	mblk_t			*bp, *ret_m;
461 	vsw_fdbe_t		*fp;
462 
463 	D1(vswp, "%s: enter (caller %d)", __func__, caller);
464 
465 	/*
466 	 * PERF: rather than breaking up the chain here, scan it
467 	 * to find all mblks heading to same destination and then
468 	 * pass that sub-chain to the lower transmit functions.
469 	 */
470 
471 	/* process the chain of packets */
472 	bp = mp;
473 	while (bp) {
474 		ehp = (struct ether_header *)bp->b_rptr;
475 		mp = vsw_get_same_dest_list(ehp, &bp);
476 		ASSERT(mp != NULL);
477 
478 		D2(vswp, "%s: mblk data buffer %lld : actual data size %lld",
479 		    __func__, MBLKSIZE(mp), MBLKL(mp));
480 
481 		if (ether_cmp(&ehp->ether_dhost, &vswp->if_addr) == 0) {
482 			/*
483 			 * If destination is VSW_LOCALDEV (vsw as an eth
484 			 * interface) and if the device is up & running,
485 			 * send the packet up the stack on this host.
486 			 * If the virtual interface is down, drop the packet.
487 			 */
488 			if (caller != VSW_LOCALDEV) {
489 				vsw_mac_rx(vswp, mrh, mp, VSW_MACRX_FREEMSG);
490 			} else {
491 				freemsgchain(mp);
492 			}
493 			continue;
494 		}
495 
496 		/*
497 		 * Find fdb entry for the destination
498 		 * and hold a reference to it.
499 		 */
500 		fp = vsw_fdbe_find(vswp, &ehp->ether_dhost);
501 		if (fp != NULL) {
502 
503 			/*
504 			 * If plumbed and in promisc mode then copy msg
505 			 * and send up the stack.
506 			 */
507 			vsw_mac_rx(vswp, mrh, mp,
508 			    VSW_MACRX_PROMISC | VSW_MACRX_COPYMSG);
509 
510 			/*
511 			 * If the destination is in FDB, the packet
512 			 * should be forwarded to the correponding
513 			 * vsw_port (connected to a vnet device -
514 			 * VSW_VNETPORT)
515 			 */
516 			(void) vsw_portsend(fp->portp, mp);
517 
518 			/* Release the reference on the fdb entry */
519 			VSW_FDBE_REFRELE(fp);
520 		} else {
521 			/*
522 			 * Destination not in FDB.
523 			 *
524 			 * If the destination is broadcast or
525 			 * multicast forward the packet to all
526 			 * (VNETPORTs, PHYSDEV, LOCALDEV),
527 			 * except the caller.
528 			 */
529 			if (IS_BROADCAST(ehp)) {
530 				D2(vswp, "%s: BROADCAST pkt", __func__);
531 				(void) vsw_forward_all(vswp, mp, caller, arg);
532 			} else if (IS_MULTICAST(ehp)) {
533 				D2(vswp, "%s: MULTICAST pkt", __func__);
534 				(void) vsw_forward_grp(vswp, mp, caller, arg);
535 			} else {
536 				/*
537 				 * If the destination is unicast, and came
538 				 * from either a logical network device or
539 				 * the switch itself when it is plumbed, then
540 				 * send it out on the physical device and also
541 				 * up the stack if the logical interface is
542 				 * in promiscious mode.
543 				 *
544 				 * NOTE:  The assumption here is that if we
545 				 * cannot find the destination in our fdb, its
546 				 * a unicast address, and came from either a
547 				 * vnet or down the stack (when plumbed) it
548 				 * must be destinded for an ethernet device
549 				 * outside our ldoms.
550 				 */
551 				if (caller == VSW_VNETPORT) {
552 					/* promisc check copy etc */
553 					vsw_mac_rx(vswp, mrh, mp,
554 					    VSW_MACRX_PROMISC |
555 					    VSW_MACRX_COPYMSG);
556 
557 					if ((ret_m = vsw_tx_msg(vswp, mp,
558 					    caller, arg)) != NULL) {
559 						DERR(vswp, "%s: drop mblks to "
560 						    "phys dev", __func__);
561 						freemsgchain(ret_m);
562 					}
563 
564 				} else if (caller == VSW_PHYSDEV) {
565 					/*
566 					 * Pkt seen because card in promisc
567 					 * mode. Send up stack if plumbed in
568 					 * promisc mode, else drop it.
569 					 */
570 					vsw_mac_rx(vswp, mrh, mp,
571 					    VSW_MACRX_PROMISC |
572 					    VSW_MACRX_FREEMSG);
573 
574 				} else if (caller == VSW_LOCALDEV) {
575 					/*
576 					 * Pkt came down the stack, send out
577 					 * over physical device.
578 					 */
579 					if ((ret_m = vsw_tx_msg(vswp, mp,
580 					    caller, NULL)) != NULL) {
581 						DERR(vswp, "%s: drop mblks to "
582 						    "phys dev", __func__);
583 						freemsgchain(ret_m);
584 					}
585 				}
586 			}
587 		}
588 	}
589 	D1(vswp, "%s: exit\n", __func__);
590 }
591 
592 /*
593  * Switch ethernet frame when in layer 3 mode (i.e. using IP
594  * layer to do the routing).
595  *
596  * There is a large amount of overlap between this function and
597  * vsw_switch_l2_frame. At some stage we need to revisit and refactor
598  * both these functions.
599  */
600 void
601 vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller,
602 			vsw_port_t *arg, mac_resource_handle_t mrh)
603 {
604 	struct ether_header	*ehp;
605 	mblk_t			*bp = NULL;
606 	vsw_fdbe_t		*fp;
607 
608 	D1(vswp, "%s: enter (caller %d)", __func__, caller);
609 
610 	/*
611 	 * In layer 3 mode should only ever be switching packets
612 	 * between IP layer and vnet devices. So make sure thats
613 	 * who is invoking us.
614 	 */
615 	if ((caller != VSW_LOCALDEV) && (caller != VSW_VNETPORT)) {
616 		DERR(vswp, "%s: unexpected caller (%d)", __func__, caller);
617 		freemsgchain(mp);
618 		return;
619 	}
620 
621 	/* process the chain of packets */
622 	bp = mp;
623 	while (bp) {
624 		ehp = (struct ether_header *)bp->b_rptr;
625 		mp = vsw_get_same_dest_list(ehp, &bp);
626 		ASSERT(mp != NULL);
627 
628 		D2(vswp, "%s: mblk data buffer %lld : actual data size %lld",
629 		    __func__, MBLKSIZE(mp), MBLKL(mp));
630 
631 		/*
632 		 * Find fdb entry for the destination
633 		 * and hold a reference to it.
634 		 */
635 		fp = vsw_fdbe_find(vswp, &ehp->ether_dhost);
636 		if (fp != NULL) {
637 
638 			D2(vswp, "%s: sending to target port", __func__);
639 			(void) vsw_portsend(fp->portp, mp);
640 
641 			/* Release the reference on the fdb entry */
642 			VSW_FDBE_REFRELE(fp);
643 		} else {
644 			/*
645 			 * Destination not in FDB
646 			 *
647 			 * If the destination is broadcast or
648 			 * multicast forward the packet to all
649 			 * (VNETPORTs, PHYSDEV, LOCALDEV),
650 			 * except the caller.
651 			 */
652 			if (IS_BROADCAST(ehp)) {
653 				D2(vswp, "%s: BROADCAST pkt", __func__);
654 				(void) vsw_forward_all(vswp, mp, caller, arg);
655 			} else if (IS_MULTICAST(ehp)) {
656 				D2(vswp, "%s: MULTICAST pkt", __func__);
657 				(void) vsw_forward_grp(vswp, mp, caller, arg);
658 			} else {
659 				/*
660 				 * Unicast pkt from vnet that we don't have
661 				 * an FDB entry for, so must be destinded for
662 				 * the outside world. Attempt to send up to the
663 				 * IP layer to allow it to deal with it.
664 				 */
665 				if (caller == VSW_VNETPORT) {
666 					vsw_mac_rx(vswp, mrh,
667 					    mp, VSW_MACRX_FREEMSG);
668 				}
669 			}
670 		}
671 	}
672 
673 	D1(vswp, "%s: exit", __func__);
674 }
675 
676 /*
677  * Setup mac addrs and hio resources for layer 2 switching only.
678  */
679 void
680 vsw_setup_layer2_post_process(vsw_t *vswp)
681 {
682 	if (vswp->smode & VSW_LAYER2) {
683 		/*
684 		 * Program unicst, mcst addrs of vsw
685 		 * interface and ports in the physdev.
686 		 */
687 		vsw_set_addrs(vswp);
688 
689 		/* Start HIO for ports that have already connected */
690 		vsw_hio_start_ports(vswp);
691 
692 		/* Update physical link info to any ports already connected */
693 		vsw_physlink_state_update(vswp);
694 	}
695 }
696 
697 /*
698  * Forward the ethernet frame to all ports (VNETPORTs, PHYSDEV, LOCALDEV),
699  * except the caller (port on which frame arrived).
700  */
701 static int
702 vsw_forward_all(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg)
703 {
704 	vsw_port_list_t	*plist = &vswp->plist;
705 	vsw_port_t	*portp;
706 	mblk_t		*nmp = NULL;
707 	mblk_t		*ret_m = NULL;
708 	int		skip_port = 0;
709 
710 	D1(vswp, "vsw_forward_all: enter\n");
711 
712 	/*
713 	 * Broadcast message from inside ldoms so send to outside
714 	 * world if in either of layer 2 modes.
715 	 */
716 	if ((vswp->smode & VSW_LAYER2) &&
717 	    ((caller == VSW_LOCALDEV) || (caller == VSW_VNETPORT))) {
718 
719 		nmp = vsw_dupmsgchain(mp);
720 		if (nmp) {
721 			if ((ret_m = vsw_tx_msg(vswp, nmp, caller, arg))
722 			    != NULL) {
723 				DERR(vswp, "%s: dropping pkt(s) "
724 				    "consisting of %ld bytes of data for"
725 				    " physical device", __func__, MBLKL(ret_m));
726 				freemsgchain(ret_m);
727 			}
728 		}
729 	}
730 
731 	if (caller == VSW_VNETPORT)
732 		skip_port = 1;
733 
734 	/*
735 	 * Broadcast message from other vnet (layer 2 or 3) or outside
736 	 * world (layer 2 only), send up stack if plumbed.
737 	 */
738 	if ((caller == VSW_PHYSDEV) || (caller == VSW_VNETPORT)) {
739 		vsw_mac_rx(vswp, NULL, mp, VSW_MACRX_COPYMSG);
740 	}
741 
742 	/* send it to all VNETPORTs */
743 	READ_ENTER(&plist->lockrw);
744 	for (portp = plist->head; portp != NULL; portp = portp->p_next) {
745 		D2(vswp, "vsw_forward_all: port %d", portp->p_instance);
746 		/*
747 		 * Caution ! - don't reorder these two checks as arg
748 		 * will be NULL if the caller is PHYSDEV. skip_port is
749 		 * only set if caller is VNETPORT.
750 		 */
751 		if ((skip_port) && (portp == arg)) {
752 			continue;
753 		} else {
754 			nmp = vsw_dupmsgchain(mp);
755 			if (nmp) {
756 				/*
757 				 * The plist->lockrw is protecting the
758 				 * portp from getting destroyed here.
759 				 * So, no ref_cnt is incremented here.
760 				 */
761 				(void) vsw_portsend(portp, nmp);
762 			} else {
763 				DERR(vswp, "vsw_forward_all: nmp NULL");
764 			}
765 		}
766 	}
767 	RW_EXIT(&plist->lockrw);
768 
769 	freemsgchain(mp);
770 
771 	D1(vswp, "vsw_forward_all: exit\n");
772 	return (0);
773 }
774 
775 /*
776  * Forward pkts to any devices or interfaces which have registered
777  * an interest in them (i.e. multicast groups).
778  */
779 static int
780 vsw_forward_grp(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg)
781 {
782 	struct ether_header	*ehp = (struct ether_header *)mp->b_rptr;
783 	mfdb_ent_t		*entp = NULL;
784 	mfdb_ent_t		*tpp = NULL;
785 	vsw_port_t 		*port;
786 	uint64_t		key = 0;
787 	mblk_t			*nmp = NULL;
788 	mblk_t			*ret_m = NULL;
789 	boolean_t		check_if = B_TRUE;
790 
791 	/*
792 	 * Convert address to hash table key
793 	 */
794 	KEY_HASH(key, &ehp->ether_dhost);
795 
796 	D1(vswp, "%s: key 0x%llx", __func__, key);
797 
798 	/*
799 	 * If pkt came from either a vnet or down the stack (if we are
800 	 * plumbed) and we are in layer 2 mode, then we send the pkt out
801 	 * over the physical adapter, and then check to see if any other
802 	 * vnets are interested in it.
803 	 */
804 	if ((vswp->smode & VSW_LAYER2) &&
805 	    ((caller == VSW_VNETPORT) || (caller == VSW_LOCALDEV))) {
806 		nmp = vsw_dupmsgchain(mp);
807 		if (nmp) {
808 			if ((ret_m = vsw_tx_msg(vswp, nmp, caller, arg))
809 			    != NULL) {
810 				DERR(vswp, "%s: dropping pkt(s) consisting of "
811 				    "%ld bytes of data for physical device",
812 				    __func__, MBLKL(ret_m));
813 				freemsgchain(ret_m);
814 			}
815 		}
816 	}
817 
818 	READ_ENTER(&vswp->mfdbrw);
819 	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)key,
820 	    (mod_hash_val_t *)&entp) != 0) {
821 		D3(vswp, "%s: no table entry found for addr 0x%llx",
822 		    __func__, key);
823 	} else {
824 		/*
825 		 * Send to list of devices associated with this address...
826 		 */
827 		for (tpp = entp; tpp != NULL; tpp = tpp->nextp) {
828 
829 			/* dont send to ourselves */
830 			if ((caller == VSW_VNETPORT) &&
831 			    (tpp->d_addr == (void *)arg)) {
832 				port = (vsw_port_t *)tpp->d_addr;
833 				D3(vswp, "%s: not sending to ourselves"
834 				    " : port %d", __func__, port->p_instance);
835 				continue;
836 
837 			} else if ((caller == VSW_LOCALDEV) &&
838 			    (tpp->d_type == VSW_LOCALDEV)) {
839 				D2(vswp, "%s: not sending back up stack",
840 				    __func__);
841 				continue;
842 			}
843 
844 			if (tpp->d_type == VSW_VNETPORT) {
845 				port = (vsw_port_t *)tpp->d_addr;
846 				D3(vswp, "%s: sending to port %ld for addr "
847 				    "0x%llx", __func__, port->p_instance, key);
848 
849 				nmp = vsw_dupmsgchain(mp);
850 				if (nmp) {
851 					/*
852 					 * The vswp->mfdbrw is protecting the
853 					 * portp from getting destroyed here.
854 					 * So, no ref_cnt is incremented here.
855 					 */
856 					(void) vsw_portsend(port, nmp);
857 				}
858 			} else {
859 				vsw_mac_rx(vswp, NULL,
860 				    mp, VSW_MACRX_COPYMSG);
861 				D2(vswp, "%s: sending up stack"
862 				    " for addr 0x%llx", __func__, key);
863 				check_if = B_FALSE;
864 			}
865 		}
866 	}
867 
868 	RW_EXIT(&vswp->mfdbrw);
869 
870 	/*
871 	 * If the pkt came from either a vnet or from physical device,
872 	 * and if we havent already sent the pkt up the stack then we
873 	 * check now if we can/should (i.e. the interface is plumbed
874 	 * and in promisc mode).
875 	 */
876 	if ((check_if) &&
877 	    ((caller == VSW_VNETPORT) || (caller == VSW_PHYSDEV))) {
878 		vsw_mac_rx(vswp, NULL, mp,
879 		    VSW_MACRX_PROMISC | VSW_MACRX_COPYMSG);
880 	}
881 
882 	freemsgchain(mp);
883 
884 	D1(vswp, "%s: exit", __func__);
885 
886 	return (0);
887 }
888 
/*
 * Create the vlan id hash table of the given vsw device or port and
 * populate it with the vlan ids that the device or port has been assigned.
 *
 * Arguments:
 *   arg:  vsw device or port.
 *   type: type of arg; VSW_LOCALDEV(vsw device) or VSW_VNETPORT(port).
 */
void
vsw_create_vlans(void *arg, int type)
{
	/* the table has to exist before ids can be added to it */
	vsw_vlan_create_hash(arg, type);
	vsw_vlan_add_ids(arg, type);
}
906 
/*
 * Empty the vlan id hash table of the given vsw device or port and then
 * destroy the table itself.
 *
 * Arguments:
 *   arg:  vsw device or port.
 *   type: type of arg; VSW_LOCALDEV(vsw device) or VSW_VNETPORT(port).
 */
void
vsw_destroy_vlans(void *arg, int type)
{
	/* the ids are removed first, then the table is torn down */
	vsw_vlan_remove_ids(arg, type);
	vsw_vlan_destroy_hash(arg, type);
}
923 
924 /*
925  * Create a vlan-id hash table for the given vsw device or port.
926  */
927 static void
928 vsw_vlan_create_hash(void *arg, int type)
929 {
930 	char		hashname[MAXNAMELEN];
931 
932 	if (type == VSW_LOCALDEV) {
933 		vsw_t		*vswp = (vsw_t *)arg;
934 
935 		(void) snprintf(hashname, MAXNAMELEN, "vsw%d-vlan-hash",
936 		    vswp->instance);
937 
938 		vswp->vlan_nchains = vsw_vlan_nchains;
939 		vswp->vlan_hashp = mod_hash_create_idhash(hashname,
940 		    vswp->vlan_nchains, mod_hash_null_valdtor);
941 
942 	} else if (type == VSW_VNETPORT) {
943 		vsw_port_t	*portp = (vsw_port_t *)arg;
944 
945 		(void) snprintf(hashname, MAXNAMELEN, "port%d-vlan-hash",
946 		    portp->p_instance);
947 
948 		portp->vlan_nchains = vsw_vlan_nchains;
949 		portp->vlan_hashp = mod_hash_create_idhash(hashname,
950 		    portp->vlan_nchains, mod_hash_null_valdtor);
951 
952 	} else {
953 		return;
954 	}
955 }
956 
957 /*
958  * Destroy the vlan-id hash table for the given vsw device or port.
959  */
960 static void
961 vsw_vlan_destroy_hash(void *arg, int type)
962 {
963 	if (type == VSW_LOCALDEV) {
964 		vsw_t		*vswp = (vsw_t *)arg;
965 
966 		mod_hash_destroy_hash(vswp->vlan_hashp);
967 		vswp->vlan_nchains = 0;
968 	} else if (type == VSW_VNETPORT) {
969 		vsw_port_t	*portp = (vsw_port_t *)arg;
970 
971 		mod_hash_destroy_hash(portp->vlan_hashp);
972 		portp->vlan_nchains = 0;
973 	} else {
974 		return;
975 	}
976 }
977 
978 /*
979  * Add vlan ids of the given vsw device or port into its hash table.
980  */
981 void
982 vsw_vlan_add_ids(void *arg, int type)
983 {
984 	int	rv;
985 	int	i;
986 
987 	if (type == VSW_LOCALDEV) {
988 		vsw_t		*vswp = (vsw_t *)arg;
989 
990 		rv = mod_hash_insert(vswp->vlan_hashp,
991 		    (mod_hash_key_t)VLAN_ID_KEY(vswp->pvid),
992 		    (mod_hash_val_t)B_TRUE);
993 		if (rv != 0) {
994 			cmn_err(CE_WARN, "vsw%d: Duplicate vlan-id(%d) for "
995 			    "the interface", vswp->instance, vswp->pvid);
996 		}
997 
998 		for (i = 0; i < vswp->nvids; i++) {
999 			rv = mod_hash_insert(vswp->vlan_hashp,
1000 			    (mod_hash_key_t)VLAN_ID_KEY(vswp->vids[i].vl_vid),
1001 			    (mod_hash_val_t)B_TRUE);
1002 			if (rv != 0) {
1003 				cmn_err(CE_WARN, "vsw%d: Duplicate vlan-id(%d)"
1004 				    " for the interface", vswp->instance,
1005 				    vswp->pvid);
1006 			}
1007 		}
1008 
1009 	} else if (type == VSW_VNETPORT) {
1010 		vsw_port_t	*portp = (vsw_port_t *)arg;
1011 		vsw_t		*vswp = portp->p_vswp;
1012 
1013 		rv = mod_hash_insert(portp->vlan_hashp,
1014 		    (mod_hash_key_t)VLAN_ID_KEY(portp->pvid),
1015 		    (mod_hash_val_t)B_TRUE);
1016 		if (rv != 0) {
1017 			cmn_err(CE_WARN, "vsw%d: Duplicate vlan-id(%d) for "
1018 			    "the port(%d)", vswp->instance, vswp->pvid,
1019 			    portp->p_instance);
1020 		}
1021 
1022 		for (i = 0; i < portp->nvids; i++) {
1023 			rv = mod_hash_insert(portp->vlan_hashp,
1024 			    (mod_hash_key_t)VLAN_ID_KEY(portp->vids[i].vl_vid),
1025 			    (mod_hash_val_t)B_TRUE);
1026 			if (rv != 0) {
1027 				cmn_err(CE_WARN, "vsw%d: Duplicate vlan-id(%d)"
1028 				    " for the port(%d)", vswp->instance,
1029 				    vswp->pvid, portp->p_instance);
1030 			}
1031 		}
1032 
1033 	}
1034 }
1035 
1036 /*
1037  * Remove vlan ids of the given vsw device or port from its hash table.
1038  */
1039 void
1040 vsw_vlan_remove_ids(void *arg, int type)
1041 {
1042 	mod_hash_val_t	vp;
1043 	int		rv;
1044 	int		i;
1045 
1046 	if (type == VSW_LOCALDEV) {
1047 		vsw_t		*vswp = (vsw_t *)arg;
1048 
1049 		rv = vsw_vlan_lookup(vswp->vlan_hashp, vswp->pvid);
1050 		if (rv == B_TRUE) {
1051 			rv = mod_hash_remove(vswp->vlan_hashp,
1052 			    (mod_hash_key_t)VLAN_ID_KEY(vswp->pvid),
1053 			    (mod_hash_val_t *)&vp);
1054 			ASSERT(rv == 0);
1055 		}
1056 
1057 		for (i = 0; i < vswp->nvids; i++) {
1058 			rv = vsw_vlan_lookup(vswp->vlan_hashp,
1059 			    vswp->vids[i].vl_vid);
1060 			if (rv == B_TRUE) {
1061 				rv = mod_hash_remove(vswp->vlan_hashp,
1062 				    (mod_hash_key_t)VLAN_ID_KEY(
1063 				    vswp->vids[i].vl_vid),
1064 				    (mod_hash_val_t *)&vp);
1065 				ASSERT(rv == 0);
1066 			}
1067 		}
1068 
1069 	} else if (type == VSW_VNETPORT) {
1070 		vsw_port_t	*portp = (vsw_port_t *)arg;
1071 
1072 		portp = (vsw_port_t *)arg;
1073 		rv = vsw_vlan_lookup(portp->vlan_hashp, portp->pvid);
1074 		if (rv == B_TRUE) {
1075 			rv = mod_hash_remove(portp->vlan_hashp,
1076 			    (mod_hash_key_t)VLAN_ID_KEY(portp->pvid),
1077 			    (mod_hash_val_t *)&vp);
1078 			ASSERT(rv == 0);
1079 		}
1080 
1081 		for (i = 0; i < portp->nvids; i++) {
1082 			rv = vsw_vlan_lookup(portp->vlan_hashp,
1083 			    portp->vids[i].vl_vid);
1084 			if (rv == B_TRUE) {
1085 				rv = mod_hash_remove(portp->vlan_hashp,
1086 				    (mod_hash_key_t)VLAN_ID_KEY(
1087 				    portp->vids[i].vl_vid),
1088 				    (mod_hash_val_t *)&vp);
1089 				ASSERT(rv == 0);
1090 			}
1091 		}
1092 
1093 	} else {
1094 		return;
1095 	}
1096 }
1097 
1098 /*
1099  * Find the given vlan id in the hash table.
1100  * Return: B_TRUE if the id is found; B_FALSE if not found.
1101  */
1102 boolean_t
1103 vsw_vlan_lookup(mod_hash_t *vlan_hashp, uint16_t vid)
1104 {
1105 	int		rv;
1106 	mod_hash_val_t	vp;
1107 
1108 	rv = mod_hash_find(vlan_hashp, VLAN_ID_KEY(vid), (mod_hash_val_t *)&vp);
1109 
1110 	if (rv != 0)
1111 		return (B_FALSE);
1112 
1113 	return (B_TRUE);
1114 }
1115 
1116 /*
1117  * Add an entry into FDB for the given vsw.
1118  */
1119 void
1120 vsw_fdbe_add(vsw_t *vswp, void *port)
1121 {
1122 	uint64_t	addr = 0;
1123 	vsw_port_t	*portp;
1124 	vsw_fdbe_t	*fp;
1125 	int		rv;
1126 
1127 	portp = (vsw_port_t *)port;
1128 	KEY_HASH(addr, &portp->p_macaddr);
1129 
1130 	fp = kmem_zalloc(sizeof (vsw_fdbe_t), KM_SLEEP);
1131 	fp->portp = port;
1132 
1133 	/*
1134 	 * Note: duplicate keys will be rejected by mod_hash.
1135 	 */
1136 	rv = mod_hash_insert(vswp->fdb_hashp, (mod_hash_key_t)addr,
1137 	    (mod_hash_val_t)fp);
1138 	if (rv != 0) {
1139 		cmn_err(CE_WARN, "vsw%d: Duplicate mac-address(%s) for "
1140 		    "the port(%d)", vswp->instance,
1141 		    ether_sprintf(&portp->p_macaddr), portp->p_instance);
1142 	}
1143 }
1144 
1145 /*
1146  * Remove an entry from FDB.
1147  */
1148 void
1149 vsw_fdbe_del(vsw_t *vswp, struct ether_addr *eaddr)
1150 {
1151 	uint64_t	addr = 0;
1152 	vsw_fdbe_t	*fp;
1153 	int		rv;
1154 
1155 	KEY_HASH(addr, eaddr);
1156 
1157 	/*
1158 	 * Remove the entry from fdb hash table.
1159 	 * This prevents further references to this fdb entry.
1160 	 */
1161 	rv = mod_hash_remove(vswp->fdb_hashp, (mod_hash_key_t)addr,
1162 	    (mod_hash_val_t *)&fp);
1163 	if (rv != 0) {
1164 		/* invalid key? */
1165 		return;
1166 	}
1167 
1168 	/*
1169 	 * If there are threads already ref holding before the entry was
1170 	 * removed from hash table, then wait for ref count to drop to zero.
1171 	 */
1172 	while (fp->refcnt != 0) {
1173 		delay(drv_usectohz(vsw_fdbe_refcnt_delay));
1174 	}
1175 
1176 	kmem_free(fp, sizeof (*fp));
1177 }
1178 
1179 /*
1180  * Search fdb for a given mac address. If an entry is found, hold
1181  * a reference to it and return the entry, else returns NULL.
1182  */
1183 static vsw_fdbe_t *
1184 vsw_fdbe_find(vsw_t *vswp, struct ether_addr *addrp)
1185 {
1186 	uint64_t	key = 0;
1187 	vsw_fdbe_t	*fp;
1188 	int		rv;
1189 
1190 	KEY_HASH(key, addrp);
1191 
1192 	rv = mod_hash_find_cb(vswp->fdb_hashp, (mod_hash_key_t)key,
1193 	    (mod_hash_val_t *)&fp, vsw_fdbe_find_cb);
1194 
1195 	if (rv != 0)
1196 		return (NULL);
1197 
1198 	return (fp);
1199 }
1200 
1201 /*
1202  * Callback function provided to mod_hash_find_cb(). After finding the fdb
1203  * entry corresponding to the key (macaddr), this callback will be invoked by
1204  * mod_hash_find_cb() to atomically increment the reference count on the fdb
1205  * entry before returning the found entry.
1206  */
1207 static void
1208 vsw_fdbe_find_cb(mod_hash_key_t key, mod_hash_val_t val)
1209 {
1210 	_NOTE(ARGUNUSED(key))
1211 	VSW_FDBE_REFHOLD((vsw_fdbe_t *)val);
1212 }
1213 
1214 /*
1215  * A given frame must be always tagged with the appropriate vlan id (unless it
1216  * is in the default-vlan) before the mac address switching function is called.
1217  * Otherwise, after switching function determines the destination, we cannot
1218  * figure out if the destination belongs to the the same vlan that the frame
1219  * originated from and if it needs tag/untag. Frames which are inbound from
1220  * the external(physical) network over a vlan trunk link are always tagged.
1221  * However frames which are received from a vnet-port over ldc or frames which
1222  * are coming down the stack on the service domain over vsw interface may be
1223  * untagged. These frames must be tagged with the appropriate pvid of the
1224  * sender (vnet-port or vsw device), before invoking the switching function.
1225  *
1226  * Arguments:
1227  *   arg:    caller of the function.
1228  *   type:   type of arg(caller): VSW_LOCALDEV(vsw) or VSW_VNETPORT(port)
1229  *   mp:     frame(s) to be tagged.
1230  */
1231 mblk_t *
1232 vsw_vlan_frame_pretag(void *arg, int type, mblk_t *mp)
1233 {
1234 	vsw_t			*vswp;
1235 	vsw_port_t		*portp;
1236 	struct ether_header	*ehp;
1237 	mblk_t			*bp;
1238 	mblk_t			*bpt;
1239 	mblk_t			*bph;
1240 	mblk_t			*bpn;
1241 	uint16_t		pvid;
1242 
1243 	ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));
1244 
1245 	if (type == VSW_LOCALDEV) {
1246 		vswp = (vsw_t *)arg;
1247 		pvid = vswp->pvid;
1248 		portp = NULL;
1249 	} else {
1250 		/* VSW_VNETPORT */
1251 		portp = (vsw_port_t *)arg;
1252 		pvid = portp->pvid;
1253 		vswp = portp->p_vswp;
1254 	}
1255 
1256 	bpn = bph = bpt = NULL;
1257 
1258 	for (bp = mp; bp != NULL; bp = bpn) {
1259 
1260 		bpn = bp->b_next;
1261 		bp->b_next = bp->b_prev = NULL;
1262 
1263 		/* Determine if it is an untagged frame */
1264 		ehp = (struct ether_header *)bp->b_rptr;
1265 
1266 		if (ehp->ether_type != ETHERTYPE_VLAN) {	/* untagged */
1267 
1268 			/* no need to tag if the frame is in default vlan */
1269 			if (pvid != vswp->default_vlan_id) {
1270 				bp = vnet_vlan_insert_tag(bp, pvid);
1271 				if (bp == NULL) {
1272 					continue;
1273 				}
1274 			}
1275 		}
1276 
1277 		/* build a chain of processed packets */
1278 		if (bph == NULL) {
1279 			bph = bpt = bp;
1280 		} else {
1281 			bpt->b_next = bp;
1282 			bpt = bp;
1283 		}
1284 
1285 	}
1286 
1287 	return (bph);
1288 }
1289 
1290 /*
1291  * Frames destined to a vnet-port or to the local vsw interface, must be
1292  * untagged if necessary before sending. This function first checks that the
1293  * frame can be sent to the destination in the vlan identified by the frame
1294  * tag. Note that when this function is invoked the frame must have been
1295  * already tagged (unless it is in the default-vlan). Because, this function is
1296  * called when the switching function determines the destination and invokes
1297  * its send function (vnet-port or vsw interface) and all frames would have
1298  * been tagged by this time (see comments in vsw_vlan_frame_pretag()).
1299  *
1300  * Arguments:
1301  *   arg:    destination device.
1302  *   type:   type of arg(destination): VSW_LOCALDEV(vsw) or VSW_VNETPORT(port)
1303  *   np:     head of pkt chain to be validated and untagged.
1304  *   npt:    tail of pkt chain to be validated and untagged.
1305  *
1306  * Returns:
1307  *   np:     head of updated chain of packets
1308  *   npt:    tail of updated chain of packets
1309  *   rv:     count of the packets in the returned list
1310  */
1311 uint32_t
1312 vsw_vlan_frame_untag(void *arg, int type, mblk_t **np, mblk_t **npt)
1313 {
1314 	mblk_t			*bp;
1315 	mblk_t			*bpt;
1316 	mblk_t			*bph;
1317 	mblk_t			*bpn;
1318 	vsw_port_t		*portp;
1319 	vsw_t			*vswp;
1320 	uint32_t		count;
1321 	struct ether_header	*ehp;
1322 	boolean_t		is_tagged;
1323 	boolean_t		rv;
1324 	uint16_t		vlan_id;
1325 	uint16_t		pvid;
1326 	mod_hash_t		*vlan_hashp;
1327 
1328 	ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));
1329 
1330 
1331 	if (type == VSW_LOCALDEV) {
1332 		vswp = (vsw_t *)arg;
1333 		pvid = vswp->pvid;
1334 		vlan_hashp = vswp->vlan_hashp;
1335 		portp = NULL;
1336 	} else {
1337 		/* type == VSW_VNETPORT */
1338 		portp = (vsw_port_t *)arg;
1339 		vswp = portp->p_vswp;
1340 		vlan_hashp = portp->vlan_hashp;
1341 		pvid = portp->pvid;
1342 	}
1343 
1344 	/*
1345 	 * If the MAC layer switching in place, then
1346 	 * untagging required only if the pvid is not
1347 	 * the same as default_vlan_id. This is because,
1348 	 * the MAC layer will send packets for the
1349 	 * registered vlans only.
1350 	 */
1351 	if ((vswp->mac_cl_switching == B_TRUE) &&
1352 	    (pvid == vswp->default_vlan_id)) {
1353 		/* simply count and set the tail */
1354 		count = 1;
1355 		bp = *np;
1356 		ASSERT(bp != NULL);
1357 		while (bp->b_next != NULL) {
1358 			bp = bp->b_next;
1359 			count++;
1360 		}
1361 		*npt = bp;
1362 		return (count);
1363 	}
1364 
1365 	bpn = bph = bpt = NULL;
1366 	count = 0;
1367 
1368 	for (bp = *np; bp != NULL; bp = bpn) {
1369 
1370 		bpn = bp->b_next;
1371 		bp->b_next = bp->b_prev = NULL;
1372 
1373 		/*
1374 		 * Determine the vlan id that the frame belongs to.
1375 		 */
1376 		ehp = (struct ether_header *)bp->b_rptr;
1377 		is_tagged = vsw_frame_lookup_vid(arg, type, ehp, &vlan_id);
1378 
1379 		/*
1380 		 * If MAC layer switching in place, then we
1381 		 * need to untag only if the tagged packet has
1382 		 * vlan-id same as the pvid.
1383 		 */
1384 		if (vswp->mac_cl_switching == B_TRUE) {
1385 
1386 			/* only tagged packets expected here */
1387 			ASSERT(is_tagged == B_TRUE);
1388 			if (vlan_id == pvid) {
1389 				bp = vnet_vlan_remove_tag(bp);
1390 				if (bp == NULL) {
1391 					/* packet dropped */
1392 					continue;
1393 				}
1394 			}
1395 		} else { /* No MAC layer switching */
1396 
1397 			/*
1398 			 * Check the frame header if tag/untag is  needed.
1399 			 */
1400 			if (is_tagged == B_FALSE) {
1401 				/*
1402 				 * Untagged frame. We shouldn't have an
1403 				 * untagged packet at this point, unless
1404 				 * the destination's  vlan id is
1405 				 * default-vlan-id; if it is not the
1406 				 * default-vlan-id, we drop the packet.
1407 				 */
1408 				if (vlan_id != vswp->default_vlan_id) {
1409 					/* drop the packet */
1410 					freemsg(bp);
1411 					continue;
1412 				}
1413 			} else {	/* Tagged */
1414 				/*
1415 				 * Tagged frame, untag if it's the
1416 				 * destination's pvid.
1417 				 */
1418 				if (vlan_id == pvid) {
1419 
1420 					bp = vnet_vlan_remove_tag(bp);
1421 					if (bp == NULL) {
1422 						/* packet dropped */
1423 						continue;
1424 					}
1425 				} else {
1426 
1427 					/*
1428 					 * Check if the destination is in the
1429 					 * same vlan.
1430 					 */
1431 					rv = vsw_vlan_lookup(vlan_hashp,
1432 					    vlan_id);
1433 					if (rv == B_FALSE) {
1434 						/* drop the packet */
1435 						freemsg(bp);
1436 						continue;
1437 					}
1438 				}
1439 
1440 			}
1441 		}
1442 
1443 		/* build a chain of processed packets */
1444 		if (bph == NULL) {
1445 			bph = bpt = bp;
1446 		} else {
1447 			bpt->b_next = bp;
1448 			bpt = bp;
1449 		}
1450 		count++;
1451 	}
1452 
1453 	*np = bph;
1454 	*npt = bpt;
1455 	return (count);
1456 }
1457 
1458 /*
1459  * Lookup the vlan id of the given frame. If it is a vlan-tagged frame,
1460  * then the vlan-id is available in the tag; otherwise, its vlan id is
1461  * implicitly obtained based on the caller (destination of the frame:
1462  * VSW_VNETPORT or VSW_LOCALDEV).
1463  * The vlan id determined is returned in vidp.
1464  * Returns: B_TRUE if it is a tagged frame; B_FALSE if it is untagged.
1465  */
1466 boolean_t
1467 vsw_frame_lookup_vid(void *arg, int caller, struct ether_header *ehp,
1468 	uint16_t *vidp)
1469 {
1470 	struct ether_vlan_header	*evhp;
1471 	vsw_t				*vswp;
1472 	vsw_port_t			*portp;
1473 
1474 	/* If it's a tagged frame, get the vid from vlan header */
1475 	if (ehp->ether_type == ETHERTYPE_VLAN) {
1476 
1477 		evhp = (struct ether_vlan_header *)ehp;
1478 		*vidp = VLAN_ID(ntohs(evhp->ether_tci));
1479 		return (B_TRUE);
1480 	}
1481 
1482 	/* Untagged frame; determine vlan id based on caller */
1483 	switch (caller) {
1484 
1485 	case VSW_VNETPORT:
1486 		/*
1487 		 * packet destined to a vnet; vlan-id is pvid of vnet-port.
1488 		 */
1489 		portp = (vsw_port_t *)arg;
1490 		*vidp = portp->pvid;
1491 		break;
1492 
1493 	case VSW_LOCALDEV:
1494 
1495 		/*
1496 		 * packet destined to vsw interface;
1497 		 * vlan-id is port-vlan-id of vsw device.
1498 		 */
1499 		vswp = (vsw_t *)arg;
1500 		*vidp = vswp->pvid;
1501 		break;
1502 	}
1503 
1504 	return (B_FALSE);
1505 }
1506 
1507 /*
1508  * Add or remove multicast address(es).
1509  *
1510  * Returns 0 on success, 1 on failure.
1511  */
1512 int
1513 vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port)
1514 {
1515 	mcst_addr_t		*mcst_p = NULL;
1516 	vsw_t			*vswp = port->p_vswp;
1517 	uint64_t		addr = 0x0;
1518 	int			i;
1519 
1520 	D1(vswp, "%s: enter", __func__);
1521 
1522 	D2(vswp, "%s: %d addresses", __func__, mcst_pkt->count);
1523 
1524 	for (i = 0; i < mcst_pkt->count; i++) {
1525 		/*
1526 		 * Convert address into form that can be used
1527 		 * as hash table key.
1528 		 */
1529 		KEY_HASH(addr, &(mcst_pkt->mca[i]));
1530 
1531 		/*
1532 		 * Add or delete the specified address/port combination.
1533 		 */
1534 		if (mcst_pkt->set == 0x1) {
1535 			D3(vswp, "%s: adding multicast address 0x%llx for "
1536 			    "port %ld", __func__, addr, port->p_instance);
1537 			if (vsw_add_mcst(vswp, VSW_VNETPORT, addr, port) == 0) {
1538 				/*
1539 				 * Update the list of multicast
1540 				 * addresses contained within the
1541 				 * port structure to include this new
1542 				 * one.
1543 				 */
1544 				mcst_p = kmem_zalloc(sizeof (mcst_addr_t),
1545 				    KM_NOSLEEP);
1546 				if (mcst_p == NULL) {
1547 					DERR(vswp, "%s: unable to alloc mem",
1548 					    __func__);
1549 					(void) vsw_del_mcst(vswp,
1550 					    VSW_VNETPORT, addr, port);
1551 					return (1);
1552 				}
1553 
1554 				mcst_p->nextp = NULL;
1555 				mcst_p->addr = addr;
1556 				ether_copy(&mcst_pkt->mca[i], &mcst_p->mca);
1557 
1558 				/*
1559 				 * Program the address into HW. If the addr
1560 				 * has already been programmed then the MAC
1561 				 * just increments a ref counter (which is
1562 				 * used when the address is being deleted)
1563 				 */
1564 				if (vsw_mac_multicast_add(vswp, port, mcst_p,
1565 				    VSW_VNETPORT)) {
1566 					(void) vsw_del_mcst(vswp,
1567 					    VSW_VNETPORT, addr, port);
1568 					kmem_free(mcst_p, sizeof (*mcst_p));
1569 					return (1);
1570 				}
1571 
1572 				mutex_enter(&port->mca_lock);
1573 				mcst_p->nextp = port->mcap;
1574 				port->mcap = mcst_p;
1575 				mutex_exit(&port->mca_lock);
1576 
1577 			} else {
1578 				DERR(vswp, "%s: error adding multicast "
1579 				    "address 0x%llx for port %ld",
1580 				    __func__, addr, port->p_instance);
1581 				return (1);
1582 			}
1583 		} else {
1584 			/*
1585 			 * Delete an entry from the multicast hash
1586 			 * table and update the address list
1587 			 * appropriately.
1588 			 */
1589 			if (vsw_del_mcst(vswp, VSW_VNETPORT, addr, port) == 0) {
1590 				D3(vswp, "%s: deleting multicast address "
1591 				    "0x%llx for port %ld", __func__, addr,
1592 				    port->p_instance);
1593 
1594 				mcst_p = vsw_del_addr(VSW_VNETPORT, port, addr);
1595 				ASSERT(mcst_p != NULL);
1596 
1597 				/*
1598 				 * Remove the address from HW. The address
1599 				 * will actually only be removed once the ref
1600 				 * count within the MAC layer has dropped to
1601 				 * zero. I.e. we can safely call this fn even
1602 				 * if other ports are interested in this
1603 				 * address.
1604 				 */
1605 				vsw_mac_multicast_remove(vswp, port, mcst_p,
1606 				    VSW_VNETPORT);
1607 				kmem_free(mcst_p, sizeof (*mcst_p));
1608 
1609 			} else {
1610 				DERR(vswp, "%s: error deleting multicast "
1611 				    "addr 0x%llx for port %ld",
1612 				    __func__, addr, port->p_instance);
1613 				return (1);
1614 			}
1615 		}
1616 	}
1617 	D1(vswp, "%s: exit", __func__);
1618 	return (0);
1619 }
1620 
1621 /*
1622  * Add a new multicast entry.
1623  *
1624  * Search hash table based on address. If match found then
1625  * update associated val (which is chain of ports), otherwise
1626  * create new key/val (addr/port) pair and insert into table.
1627  */
1628 int
1629 vsw_add_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg)
1630 {
1631 	int		dup = 0;
1632 	int		rv = 0;
1633 	mfdb_ent_t	*ment = NULL;
1634 	mfdb_ent_t	*tmp_ent = NULL;
1635 	mfdb_ent_t	*new_ent = NULL;
1636 	void		*tgt = NULL;
1637 
1638 	if (devtype == VSW_VNETPORT) {
1639 		/*
1640 		 * Being invoked from a vnet.
1641 		 */
1642 		ASSERT(arg != NULL);
1643 		tgt = arg;
1644 		D2(NULL, "%s: port %d : address 0x%llx", __func__,
1645 		    ((vsw_port_t *)arg)->p_instance, addr);
1646 	} else {
1647 		/*
1648 		 * We are being invoked via the m_multicst mac entry
1649 		 * point.
1650 		 */
1651 		D2(NULL, "%s: address 0x%llx", __func__, addr);
1652 		tgt = (void *)vswp;
1653 	}
1654 
1655 	WRITE_ENTER(&vswp->mfdbrw);
1656 	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr,
1657 	    (mod_hash_val_t *)&ment) != 0) {
1658 
1659 		/* address not currently in table */
1660 		ment = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP);
1661 		ment->d_addr = (void *)tgt;
1662 		ment->d_type = devtype;
1663 		ment->nextp = NULL;
1664 
1665 		if (mod_hash_insert(vswp->mfdb, (mod_hash_key_t)addr,
1666 		    (mod_hash_val_t)ment) != 0) {
1667 			DERR(vswp, "%s: hash table insertion failed", __func__);
1668 			kmem_free(ment, sizeof (mfdb_ent_t));
1669 			rv = 1;
1670 		} else {
1671 			D2(vswp, "%s: added initial entry for 0x%llx to "
1672 			    "table", __func__, addr);
1673 		}
1674 	} else {
1675 		/*
1676 		 * Address in table. Check to see if specified port
1677 		 * is already associated with the address. If not add
1678 		 * it now.
1679 		 */
1680 		tmp_ent = ment;
1681 		while (tmp_ent != NULL) {
1682 			if (tmp_ent->d_addr == (void *)tgt) {
1683 				if (devtype == VSW_VNETPORT) {
1684 					DERR(vswp, "%s: duplicate port entry "
1685 					    "found for portid %ld and key "
1686 					    "0x%llx", __func__,
1687 					    ((vsw_port_t *)arg)->p_instance,
1688 					    addr);
1689 				} else {
1690 					DERR(vswp, "%s: duplicate entry found"
1691 					    "for key 0x%llx", __func__, addr);
1692 				}
1693 				rv = 1;
1694 				dup = 1;
1695 				break;
1696 			}
1697 			tmp_ent = tmp_ent->nextp;
1698 		}
1699 
1700 		/*
1701 		 * Port not on list so add it to end now.
1702 		 */
1703 		if (0 == dup) {
1704 			D2(vswp, "%s: added entry for 0x%llx to table",
1705 			    __func__, addr);
1706 			new_ent = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP);
1707 			new_ent->d_addr = (void *)tgt;
1708 			new_ent->d_type = devtype;
1709 			new_ent->nextp = NULL;
1710 
1711 			tmp_ent = ment;
1712 			while (tmp_ent->nextp != NULL)
1713 				tmp_ent = tmp_ent->nextp;
1714 
1715 			tmp_ent->nextp = new_ent;
1716 		}
1717 	}
1718 
1719 	RW_EXIT(&vswp->mfdbrw);
1720 	return (rv);
1721 }
1722 
1723 /*
1724  * Remove a multicast entry from the hashtable.
1725  *
1726  * Search hash table based on address. If match found, scan
1727  * list of ports associated with address. If specified port
1728  * found remove it from list.
1729  */
1730 int
1731 vsw_del_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg)
1732 {
1733 	mfdb_ent_t	*ment = NULL;
1734 	mfdb_ent_t	*curr_p, *prev_p;
1735 	void		*tgt = NULL;
1736 
1737 	D1(vswp, "%s: enter", __func__);
1738 
1739 	if (devtype == VSW_VNETPORT) {
1740 		tgt = (vsw_port_t *)arg;
1741 		D2(vswp, "%s: removing port %d from mFDB for address"
1742 		    " 0x%llx", __func__, ((vsw_port_t *)tgt)->p_instance, addr);
1743 	} else {
1744 		D2(vswp, "%s: removing entry", __func__);
1745 		tgt = (void *)vswp;
1746 	}
1747 
1748 	WRITE_ENTER(&vswp->mfdbrw);
1749 	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr,
1750 	    (mod_hash_val_t *)&ment) != 0) {
1751 		D2(vswp, "%s: address 0x%llx not in table", __func__, addr);
1752 		RW_EXIT(&vswp->mfdbrw);
1753 		return (1);
1754 	}
1755 
1756 	prev_p = curr_p = ment;
1757 
1758 	while (curr_p != NULL) {
1759 		if (curr_p->d_addr == (void *)tgt) {
1760 			if (devtype == VSW_VNETPORT) {
1761 				D2(vswp, "%s: port %d found", __func__,
1762 				    ((vsw_port_t *)tgt)->p_instance);
1763 			} else {
1764 				D2(vswp, "%s: instance found", __func__);
1765 			}
1766 
1767 			if (prev_p == curr_p) {
1768 				/*
1769 				 * head of list, if no other element is in
1770 				 * list then destroy this entry, otherwise
1771 				 * just replace it with updated value.
1772 				 */
1773 				ment = curr_p->nextp;
1774 				if (ment == NULL) {
1775 					(void) mod_hash_destroy(vswp->mfdb,
1776 					    (mod_hash_val_t)addr);
1777 				} else {
1778 					(void) mod_hash_replace(vswp->mfdb,
1779 					    (mod_hash_key_t)addr,
1780 					    (mod_hash_val_t)ment);
1781 				}
1782 			} else {
1783 				/*
1784 				 * Not head of list, no need to do
1785 				 * replacement, just adjust list pointers.
1786 				 */
1787 				prev_p->nextp = curr_p->nextp;
1788 			}
1789 			break;
1790 		}
1791 
1792 		prev_p = curr_p;
1793 		curr_p = curr_p->nextp;
1794 	}
1795 
1796 	RW_EXIT(&vswp->mfdbrw);
1797 
1798 	D1(vswp, "%s: exit", __func__);
1799 
1800 	if (curr_p == NULL)
1801 		return (1);
1802 	kmem_free(curr_p, sizeof (mfdb_ent_t));
1803 	return (0);
1804 }
1805 
1806 /*
1807  * Port is being deleted, but has registered an interest in one
1808  * or more multicast groups. Using the list of addresses maintained
1809  * within the port structure find the appropriate entry in the hash
1810  * table and remove this port from the list of interested ports.
1811  */
1812 void
1813 vsw_del_mcst_port(vsw_port_t *port)
1814 {
1815 	mcst_addr_t	*mcap = NULL;
1816 	vsw_t		*vswp = port->p_vswp;
1817 
1818 	D1(vswp, "%s: enter", __func__);
1819 
1820 	mutex_enter(&port->mca_lock);
1821 
1822 	while ((mcap = port->mcap) != NULL) {
1823 
1824 		port->mcap = mcap->nextp;
1825 
1826 		mutex_exit(&port->mca_lock);
1827 
1828 		(void) vsw_del_mcst(vswp, VSW_VNETPORT,
1829 		    mcap->addr, port);
1830 
1831 		/*
1832 		 * Remove the address from HW. The address
1833 		 * will actually only be removed once the ref
1834 		 * count within the MAC layer has dropped to
1835 		 * zero. I.e. we can safely call this fn even
1836 		 * if other ports are interested in this
1837 		 * address.
1838 		 */
1839 		vsw_mac_multicast_remove(vswp, port, mcap, VSW_VNETPORT);
1840 		kmem_free(mcap, sizeof (*mcap));
1841 
1842 		mutex_enter(&port->mca_lock);
1843 
1844 	}
1845 
1846 	mutex_exit(&port->mca_lock);
1847 
1848 	D1(vswp, "%s: exit", __func__);
1849 }
1850 
1851 /*
1852  * This vsw instance is detaching, but has registered an interest in one
1853  * or more multicast groups. Using the list of addresses maintained
1854  * within the vsw structure find the appropriate entry in the hash
1855  * table and remove this instance from the list of interested ports.
1856  */
1857 void
1858 vsw_del_mcst_vsw(vsw_t *vswp)
1859 {
1860 	mcst_addr_t	*next_p = NULL;
1861 
1862 	D1(vswp, "%s: enter", __func__);
1863 
1864 	mutex_enter(&vswp->mca_lock);
1865 
1866 	while (vswp->mcap != NULL) {
1867 		DERR(vswp, "%s: deleting addr 0x%llx",
1868 		    __func__, vswp->mcap->addr);
1869 		(void) vsw_del_mcst(vswp, VSW_LOCALDEV, vswp->mcap->addr, NULL);
1870 
1871 		next_p = vswp->mcap->nextp;
1872 		kmem_free(vswp->mcap, sizeof (mcst_addr_t));
1873 		vswp->mcap = next_p;
1874 	}
1875 
1876 	vswp->mcap = NULL;
1877 	mutex_exit(&vswp->mca_lock);
1878 
1879 	D1(vswp, "%s: exit", __func__);
1880 }
1881 
1882 mblk_t *
1883 vsw_get_same_dest_list(struct ether_header *ehp, mblk_t **mpp)
1884 {
1885 	mblk_t			*bp;
1886 	mblk_t			*nbp;
1887 	mblk_t			*head = NULL;
1888 	mblk_t			*tail = NULL;
1889 	mblk_t			*prev = NULL;
1890 	struct ether_header	*behp;
1891 
1892 	/* process the chain of packets */
1893 	bp = *mpp;
1894 	while (bp) {
1895 		nbp = bp->b_next;
1896 		behp = (struct ether_header *)bp->b_rptr;
1897 		bp->b_prev = NULL;
1898 		if (ether_cmp(&ehp->ether_dhost, &behp->ether_dhost) == 0) {
1899 			if (prev == NULL) {
1900 				*mpp = nbp;
1901 			} else {
1902 				prev->b_next = nbp;
1903 			}
1904 			bp->b_next =  NULL;
1905 			if (head == NULL) {
1906 				head = tail = bp;
1907 			} else {
1908 				tail->b_next = bp;
1909 				tail = bp;
1910 			}
1911 		} else {
1912 			prev = bp;
1913 		}
1914 		bp = nbp;
1915 	}
1916 	return (head);
1917 }
1918 
1919 static mblk_t *
1920 vsw_dupmsgchain(mblk_t *mp)
1921 {
1922 	mblk_t	*nmp = NULL;
1923 	mblk_t	**nmpp = &nmp;
1924 
1925 	for (; mp != NULL; mp = mp->b_next) {
1926 		if ((*nmpp = dupmsg(mp)) == NULL) {
1927 			freemsgchain(nmp);
1928 			return (NULL);
1929 		}
1930 
1931 		nmpp = &((*nmpp)->b_next);
1932 	}
1933 
1934 	return (nmp);
1935 }
1936