xref: /illumos-gate/usr/src/uts/sun4v/io/vsw_switching.c (revision 9b4e3ac25d882519cad3fc11f0c53b07f4e60536)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/types.h>
28 #include <sys/errno.h>
29 #include <sys/debug.h>
30 #include <sys/time.h>
31 #include <sys/sysmacros.h>
32 #include <sys/systm.h>
33 #include <sys/user.h>
34 #include <sys/stropts.h>
35 #include <sys/stream.h>
36 #include <sys/strlog.h>
37 #include <sys/strsubr.h>
38 #include <sys/cmn_err.h>
39 #include <sys/cpu.h>
40 #include <sys/kmem.h>
41 #include <sys/conf.h>
42 #include <sys/ddi.h>
43 #include <sys/sunddi.h>
44 #include <sys/ksynch.h>
45 #include <sys/stat.h>
46 #include <sys/kstat.h>
47 #include <sys/vtrace.h>
48 #include <sys/strsun.h>
49 #include <sys/dlpi.h>
50 #include <sys/ethernet.h>
51 #include <net/if.h>
52 #include <sys/varargs.h>
53 #include <sys/machsystm.h>
54 #include <sys/modctl.h>
55 #include <sys/modhash.h>
56 #include <sys/mac.h>
57 #include <sys/mac_ether.h>
58 #include <sys/taskq.h>
59 #include <sys/note.h>
60 #include <sys/mach_descrip.h>
61 #include <sys/mdeg.h>
62 #include <sys/ldc.h>
63 #include <sys/vsw_fdb.h>
64 #include <sys/vsw.h>
65 #include <sys/vio_mailbox.h>
66 #include <sys/vnet_mailbox.h>
67 #include <sys/vnet_common.h>
68 #include <sys/vio_util.h>
69 #include <sys/sdt.h>
70 #include <sys/atomic.h>
71 #include <sys/vlan.h>
72 
73 /* Switching setup routines */
74 void vsw_setup_switching_timeout(void *arg);
75 void vsw_stop_switching_timeout(vsw_t *vswp);
76 int vsw_setup_switching(vsw_t *);
77 void vsw_setup_layer2_post_process(vsw_t *vswp);
78 void vsw_switch_frame_nop(vsw_t *vswp, mblk_t *mp, int caller,
79     vsw_port_t *port, mac_resource_handle_t mrh);
80 static	int vsw_setup_layer2(vsw_t *);
81 static	int vsw_setup_layer3(vsw_t *);
82 
83 /* Switching/data transmit routines */
84 static	void vsw_switch_l2_frame_mac_client(vsw_t *vswp, mblk_t *mp, int caller,
85     vsw_port_t *port, mac_resource_handle_t);
86 static	void vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller,
87 	vsw_port_t *port, mac_resource_handle_t);
88 static	void vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller,
89 	vsw_port_t *port, mac_resource_handle_t);
90 static	int vsw_forward_all(vsw_t *vswp, mblk_t *mp,
91 	int caller, vsw_port_t *port);
92 static	int vsw_forward_grp(vsw_t *vswp, mblk_t *mp,
93     int caller, vsw_port_t *port);
94 
95 /* VLAN routines */
96 void vsw_create_vlans(void *arg, int type);
97 void vsw_destroy_vlans(void *arg, int type);
98 void vsw_vlan_add_ids(void *arg, int type);
99 void vsw_vlan_remove_ids(void *arg, int type);
100 static	void vsw_vlan_create_hash(void *arg, int type);
101 static	void vsw_vlan_destroy_hash(void *arg, int type);
102 boolean_t vsw_frame_lookup_vid(void *arg, int caller, struct ether_header *ehp,
103 	uint16_t *vidp);
104 mblk_t *vsw_vlan_frame_pretag(void *arg, int type, mblk_t *mp);
105 uint32_t vsw_vlan_frames_untag(void *arg, int type, mblk_t **np, mblk_t **npt);
106 boolean_t vsw_vlan_lookup(mod_hash_t *vlan_hashp, uint16_t vid);
107 
108 /* Forwarding database (FDB) routines */
109 void vsw_fdbe_add(vsw_t *vswp, void *port);
110 void vsw_fdbe_del(vsw_t *vswp, struct ether_addr *eaddr);
111 static	vsw_fdbe_t *vsw_fdbe_find(vsw_t *vswp, struct ether_addr *);
112 static void vsw_fdbe_find_cb(mod_hash_key_t key, mod_hash_val_t val);
113 
114 int vsw_add_rem_mcst(vnet_mcast_msg_t *, vsw_port_t *);
115 int vsw_add_mcst(vsw_t *, uint8_t, uint64_t, void *);
116 int vsw_del_mcst(vsw_t *, uint8_t, uint64_t, void *);
117 void vsw_del_mcst_vsw(vsw_t *);
118 
119 /* Support functions */
120 static mblk_t *vsw_dupmsgchain(mblk_t *mp);
121 static mblk_t *vsw_get_same_dest_list(struct ether_header *ehp, mblk_t **mpp);
122 
123 
124 /*
125  * Functions imported from other files.
126  */
127 extern mblk_t *vsw_tx_msg(vsw_t *, mblk_t *, int, vsw_port_t *);
128 extern mcst_addr_t *vsw_del_addr(uint8_t, void *, uint64_t);
129 extern int vsw_mac_open(vsw_t *vswp);
130 extern void vsw_mac_close(vsw_t *vswp);
131 extern void vsw_mac_rx(vsw_t *vswp, mac_resource_handle_t mrh,
132     mblk_t *mp, vsw_macrx_flags_t flags);
133 extern void vsw_set_addrs(vsw_t *vswp);
134 extern int vsw_portsend(vsw_port_t *port, mblk_t *mp);
135 extern void vsw_hio_init(vsw_t *vswp);
136 extern void vsw_hio_start_ports(vsw_t *vswp);
137 extern int vsw_mac_multicast_add(vsw_t *vswp, vsw_port_t *port,
138     mcst_addr_t *mcst_p, int type);
139 extern void vsw_mac_multicast_remove(vsw_t *vswp, vsw_port_t *port,
140     mcst_addr_t *mcst_p, int type);
141 
142 /*
143  * Tunables used in this file.
144  */
145 extern	int vsw_setup_switching_delay;
146 extern	uint32_t vsw_vlan_nchains;
147 extern	uint32_t vsw_fdbe_refcnt_delay;
148 
/*
 * Take a reference on the FDB entry 'p' by atomically incrementing its
 * refcnt; the ASSERT catches a wrap of the counter back to zero.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement and can safely be used as the unbraced body of an
 * if/else (a bare { } block followed by ';' would orphan the 'else').
 */
#define	VSW_FDBE_REFHOLD(p)						\
do {									\
	atomic_inc_32(&(p)->refcnt);					\
	ASSERT((p)->refcnt != 0);					\
} while (0)
154 
/*
 * Drop a reference on the FDB entry 'p' by atomically decrementing its
 * refcnt; the ASSERT catches a release without a matching hold.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement and can safely be used as the unbraced body of an
 * if/else (a bare { } block followed by ';' would orphan the 'else').
 */
#define	VSW_FDBE_REFRELE(p)						\
do {									\
	ASSERT((p)->refcnt != 0);					\
	atomic_dec_32(&(p)->refcnt);					\
} while (0)
160 
161 /*
162  * Timeout routine to setup switching mode:
163  * vsw_setup_switching() is invoked from vsw_attach() or vsw_update_md_prop()
164  * initially. If it fails and the error is EAGAIN, then this timeout handler
165  * is started to retry vsw_setup_switching(). vsw_setup_switching() is retried
166  * until we successfully finish it; or the returned error is not EAGAIN.
167  */
168 void
169 vsw_setup_switching_timeout(void *arg)
170 {
171 	vsw_t		*vswp = (vsw_t *)arg;
172 	int		rv;
173 
174 	if (vswp->swtmout_enabled == B_FALSE)
175 		return;
176 
177 	rv = vsw_setup_switching(vswp);
178 
179 	if (rv == 0) {
180 		vsw_setup_layer2_post_process(vswp);
181 	}
182 
183 	mutex_enter(&vswp->swtmout_lock);
184 
185 	if (rv == EAGAIN && vswp->swtmout_enabled == B_TRUE) {
186 		/*
187 		 * Reschedule timeout() if the error is EAGAIN and the
188 		 * timeout is still enabled. For errors other than EAGAIN,
189 		 * we simply return without rescheduling timeout().
190 		 */
191 		vswp->swtmout_id =
192 		    timeout(vsw_setup_switching_timeout, vswp,
193 		    (vsw_setup_switching_delay * drv_usectohz(MICROSEC)));
194 		goto exit;
195 	}
196 
197 	/* timeout handler completed */
198 	vswp->swtmout_enabled = B_FALSE;
199 	vswp->swtmout_id = 0;
200 
201 exit:
202 	mutex_exit(&vswp->swtmout_lock);
203 }
204 
205 /*
206  * Cancel the timeout handler to setup switching mode.
207  */
208 void
209 vsw_stop_switching_timeout(vsw_t *vswp)
210 {
211 	timeout_id_t tid;
212 
213 	mutex_enter(&vswp->swtmout_lock);
214 
215 	tid = vswp->swtmout_id;
216 
217 	if (tid != 0) {
218 		/* signal timeout handler to stop */
219 		vswp->swtmout_enabled = B_FALSE;
220 		vswp->swtmout_id = 0;
221 		mutex_exit(&vswp->swtmout_lock);
222 
223 		(void) untimeout(tid);
224 	} else {
225 		mutex_exit(&vswp->swtmout_lock);
226 	}
227 
228 	(void) atomic_swap_32(&vswp->switching_setup_done, B_FALSE);
229 
230 	mutex_enter(&vswp->mac_lock);
231 	vswp->mac_open_retries = 0;
232 	mutex_exit(&vswp->mac_lock);
233 }
234 
235 /*
236  * Setup the required switching mode.
237  * This routine is invoked from vsw_attach() or vsw_update_md_prop()
238  * initially. If it fails and the error is EAGAIN, then a timeout handler
239  * is started to retry vsw_setup_switching(), until it successfully finishes;
240  * or the returned error is not EAGAIN.
241  *
242  * Returns:
243  *  0 on success.
244  *  EAGAIN if retry is needed.
245  *  1 on all other failures.
246  */
247 int
248 vsw_setup_switching(vsw_t *vswp)
249 {
250 	int	rv = 1;
251 
252 	D1(vswp, "%s: enter", __func__);
253 
254 	/*
255 	 * Select best switching mode.
256 	 * This is done as this routine can be called from the timeout
257 	 * handler to retry setting up a specific mode. Currently only
258 	 * the function which sets up layer2/promisc mode returns EAGAIN
259 	 * if the underlying network device is not available yet, causing
260 	 * retries.
261 	 */
262 	if (vswp->smode & VSW_LAYER2) {
263 		rv = vsw_setup_layer2(vswp);
264 	} else if (vswp->smode & VSW_LAYER3) {
265 		rv = vsw_setup_layer3(vswp);
266 	} else {
267 		DERR(vswp, "unknown switch mode");
268 		rv = 1;
269 	}
270 
271 	if (rv && (rv != EAGAIN)) {
272 		cmn_err(CE_WARN, "!vsw%d: Unable to setup specified "
273 		    "switching mode", vswp->instance);
274 	} else if (rv == 0) {
275 		(void) atomic_swap_32(&vswp->switching_setup_done, B_TRUE);
276 	}
277 
278 	D2(vswp, "%s: Operating in mode %d", __func__,
279 	    vswp->smode);
280 
281 	D1(vswp, "%s: exit", __func__);
282 
283 	return (rv);
284 }
285 
286 /*
287  * Setup for layer 2 switching.
288  *
289  * Returns:
290  *  0 on success.
291  *  EAGAIN if retry is needed.
292  *  EIO on all other failures.
293  */
294 static int
295 vsw_setup_layer2(vsw_t *vswp)
296 {
297 	int	rv;
298 
299 	D1(vswp, "%s: enter", __func__);
300 
301 	/*
302 	 * Until the network device is successfully opened,
303 	 * set the switching to use vsw_switch_l2_frame.
304 	 */
305 	vswp->vsw_switch_frame = vsw_switch_l2_frame;
306 	vswp->mac_cl_switching = B_FALSE;
307 
308 	rv = strlen(vswp->physname);
309 	if (rv == 0) {
310 		/*
311 		 * Physical device name is NULL, which is
312 		 * required for layer 2.
313 		 */
314 		cmn_err(CE_WARN, "!vsw%d: no network device name specified",
315 		    vswp->instance);
316 		return (EIO);
317 	}
318 
319 	mutex_enter(&vswp->mac_lock);
320 
321 	rv = vsw_mac_open(vswp);
322 	if (rv != 0) {
323 		if (rv != EAGAIN) {
324 			cmn_err(CE_WARN, "!vsw%d: Unable to open network "
325 			    "device: %s\n", vswp->instance, vswp->physname);
326 		}
327 		mutex_exit(&vswp->mac_lock);
328 		return (rv);
329 	}
330 
331 	/*
332 	 * Now we can use the mac client switching, so set the switching
333 	 * function to use vsw_switch_l2_frame_mac_client(), which simply
334 	 * sends the packets to MAC layer for switching.
335 	 */
336 	vswp->vsw_switch_frame = vsw_switch_l2_frame_mac_client;
337 	vswp->mac_cl_switching = B_TRUE;
338 
339 	D1(vswp, "%s: exit", __func__);
340 
341 	/* Initialize HybridIO related stuff */
342 	vsw_hio_init(vswp);
343 
344 	mutex_exit(&vswp->mac_lock);
345 	return (0);
346 
347 exit_error:
348 	vsw_mac_close(vswp);
349 	mutex_exit(&vswp->mac_lock);
350 	return (EIO);
351 }
352 
353 static int
354 vsw_setup_layer3(vsw_t *vswp)
355 {
356 	D1(vswp, "%s: enter", __func__);
357 
358 	D2(vswp, "%s: operating in layer 3 mode", __func__);
359 	vswp->vsw_switch_frame = vsw_switch_l3_frame;
360 
361 	D1(vswp, "%s: exit", __func__);
362 
363 	return (0);
364 }
365 
366 /* ARGSUSED */
367 void
368 vsw_switch_frame_nop(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *port,
369 			mac_resource_handle_t mrh)
370 {
371 	freemsgchain(mp);
372 }
373 
374 /*
375  * Use mac client for layer 2 switching .
376  */
377 static void
378 vsw_switch_l2_frame_mac_client(vsw_t *vswp, mblk_t *mp, int caller,
379     vsw_port_t *port, mac_resource_handle_t mrh)
380 {
381 	_NOTE(ARGUNUSED(mrh))
382 
383 	mblk_t		*ret_m;
384 
385 	/*
386 	 * This switching function is expected to be called by
387 	 * the ports or the interface only. The packets from
388 	 * physical interface already switched.
389 	 */
390 	ASSERT((caller == VSW_VNETPORT) || (caller == VSW_LOCALDEV));
391 
392 	if ((ret_m = vsw_tx_msg(vswp, mp, caller, port)) != NULL) {
393 		DERR(vswp, "%s: drop mblks to "
394 		    "phys dev", __func__);
395 		freemsgchain(ret_m);
396 	}
397 }
398 
399 /*
400  * Switch the given ethernet frame when operating in layer 2 mode.
401  *
402  * vswp: pointer to the vsw instance
403  * mp: pointer to chain of ethernet frame(s) to be switched
404  * caller: identifies the source of this frame as:
405  * 		1. VSW_VNETPORT - a vsw port (connected to a vnet).
406  *		2. VSW_PHYSDEV - the physical ethernet device
407  *		3. VSW_LOCALDEV - vsw configured as a virtual interface
408  * arg: argument provided by the caller.
409  *		1. for VNETPORT - pointer to the corresponding vsw_port_t.
410  *		2. for PHYSDEV - NULL
411  *		3. for LOCALDEV - pointer to to this vsw_t(self)
412  */
413 void
414 vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller,
415 			vsw_port_t *arg, mac_resource_handle_t mrh)
416 {
417 	struct ether_header	*ehp;
418 	mblk_t			*bp, *ret_m;
419 	vsw_fdbe_t		*fp;
420 
421 	D1(vswp, "%s: enter (caller %d)", __func__, caller);
422 
423 	/*
424 	 * PERF: rather than breaking up the chain here, scan it
425 	 * to find all mblks heading to same destination and then
426 	 * pass that sub-chain to the lower transmit functions.
427 	 */
428 
429 	/* process the chain of packets */
430 	bp = mp;
431 	while (bp) {
432 		ehp = (struct ether_header *)bp->b_rptr;
433 		mp = vsw_get_same_dest_list(ehp, &bp);
434 		ASSERT(mp != NULL);
435 
436 		D2(vswp, "%s: mblk data buffer %lld : actual data size %lld",
437 		    __func__, MBLKSIZE(mp), MBLKL(mp));
438 
439 		if (ether_cmp(&ehp->ether_dhost, &vswp->if_addr) == 0) {
440 			/*
441 			 * If destination is VSW_LOCALDEV (vsw as an eth
442 			 * interface) and if the device is up & running,
443 			 * send the packet up the stack on this host.
444 			 * If the virtual interface is down, drop the packet.
445 			 */
446 			if (caller != VSW_LOCALDEV) {
447 				vsw_mac_rx(vswp, mrh, mp, VSW_MACRX_FREEMSG);
448 			} else {
449 				freemsgchain(mp);
450 			}
451 			continue;
452 		}
453 
454 		/*
455 		 * Find fdb entry for the destination
456 		 * and hold a reference to it.
457 		 */
458 		fp = vsw_fdbe_find(vswp, &ehp->ether_dhost);
459 		if (fp != NULL) {
460 
461 			/*
462 			 * If plumbed and in promisc mode then copy msg
463 			 * and send up the stack.
464 			 */
465 			vsw_mac_rx(vswp, mrh, mp,
466 			    VSW_MACRX_PROMISC | VSW_MACRX_COPYMSG);
467 
468 			/*
469 			 * If the destination is in FDB, the packet
470 			 * should be forwarded to the correponding
471 			 * vsw_port (connected to a vnet device -
472 			 * VSW_VNETPORT)
473 			 */
474 			(void) vsw_portsend(fp->portp, mp);
475 
476 			/* Release the reference on the fdb entry */
477 			VSW_FDBE_REFRELE(fp);
478 		} else {
479 			/*
480 			 * Destination not in FDB.
481 			 *
482 			 * If the destination is broadcast or
483 			 * multicast forward the packet to all
484 			 * (VNETPORTs, PHYSDEV, LOCALDEV),
485 			 * except the caller.
486 			 */
487 			if (IS_BROADCAST(ehp)) {
488 				D2(vswp, "%s: BROADCAST pkt", __func__);
489 				(void) vsw_forward_all(vswp, mp, caller, arg);
490 			} else if (IS_MULTICAST(ehp)) {
491 				D2(vswp, "%s: MULTICAST pkt", __func__);
492 				(void) vsw_forward_grp(vswp, mp, caller, arg);
493 			} else {
494 				/*
495 				 * If the destination is unicast, and came
496 				 * from either a logical network device or
497 				 * the switch itself when it is plumbed, then
498 				 * send it out on the physical device and also
499 				 * up the stack if the logical interface is
500 				 * in promiscious mode.
501 				 *
502 				 * NOTE:  The assumption here is that if we
503 				 * cannot find the destination in our fdb, its
504 				 * a unicast address, and came from either a
505 				 * vnet or down the stack (when plumbed) it
506 				 * must be destinded for an ethernet device
507 				 * outside our ldoms.
508 				 */
509 				if (caller == VSW_VNETPORT) {
510 					/* promisc check copy etc */
511 					vsw_mac_rx(vswp, mrh, mp,
512 					    VSW_MACRX_PROMISC |
513 					    VSW_MACRX_COPYMSG);
514 
515 					if ((ret_m = vsw_tx_msg(vswp, mp,
516 					    caller, arg)) != NULL) {
517 						DERR(vswp, "%s: drop mblks to "
518 						    "phys dev", __func__);
519 						freemsgchain(ret_m);
520 					}
521 
522 				} else if (caller == VSW_PHYSDEV) {
523 					/*
524 					 * Pkt seen because card in promisc
525 					 * mode. Send up stack if plumbed in
526 					 * promisc mode, else drop it.
527 					 */
528 					vsw_mac_rx(vswp, mrh, mp,
529 					    VSW_MACRX_PROMISC |
530 					    VSW_MACRX_FREEMSG);
531 
532 				} else if (caller == VSW_LOCALDEV) {
533 					/*
534 					 * Pkt came down the stack, send out
535 					 * over physical device.
536 					 */
537 					if ((ret_m = vsw_tx_msg(vswp, mp,
538 					    caller, NULL)) != NULL) {
539 						DERR(vswp, "%s: drop mblks to "
540 						    "phys dev", __func__);
541 						freemsgchain(ret_m);
542 					}
543 				}
544 			}
545 		}
546 	}
547 	D1(vswp, "%s: exit\n", __func__);
548 }
549 
550 /*
551  * Switch ethernet frame when in layer 3 mode (i.e. using IP
552  * layer to do the routing).
553  *
554  * There is a large amount of overlap between this function and
555  * vsw_switch_l2_frame. At some stage we need to revisit and refactor
556  * both these functions.
557  */
558 void
559 vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller,
560 			vsw_port_t *arg, mac_resource_handle_t mrh)
561 {
562 	struct ether_header	*ehp;
563 	mblk_t			*bp = NULL;
564 	vsw_fdbe_t		*fp;
565 
566 	D1(vswp, "%s: enter (caller %d)", __func__, caller);
567 
568 	/*
569 	 * In layer 3 mode should only ever be switching packets
570 	 * between IP layer and vnet devices. So make sure thats
571 	 * who is invoking us.
572 	 */
573 	if ((caller != VSW_LOCALDEV) && (caller != VSW_VNETPORT)) {
574 		DERR(vswp, "%s: unexpected caller (%d)", __func__, caller);
575 		freemsgchain(mp);
576 		return;
577 	}
578 
579 	/* process the chain of packets */
580 	bp = mp;
581 	while (bp) {
582 		ehp = (struct ether_header *)bp->b_rptr;
583 		mp = vsw_get_same_dest_list(ehp, &bp);
584 		ASSERT(mp != NULL);
585 
586 		D2(vswp, "%s: mblk data buffer %lld : actual data size %lld",
587 		    __func__, MBLKSIZE(mp), MBLKL(mp));
588 
589 		/*
590 		 * Find fdb entry for the destination
591 		 * and hold a reference to it.
592 		 */
593 		fp = vsw_fdbe_find(vswp, &ehp->ether_dhost);
594 		if (fp != NULL) {
595 
596 			D2(vswp, "%s: sending to target port", __func__);
597 			(void) vsw_portsend(fp->portp, mp);
598 
599 			/* Release the reference on the fdb entry */
600 			VSW_FDBE_REFRELE(fp);
601 		} else {
602 			/*
603 			 * Destination not in FDB
604 			 *
605 			 * If the destination is broadcast or
606 			 * multicast forward the packet to all
607 			 * (VNETPORTs, PHYSDEV, LOCALDEV),
608 			 * except the caller.
609 			 */
610 			if (IS_BROADCAST(ehp)) {
611 				D2(vswp, "%s: BROADCAST pkt", __func__);
612 				(void) vsw_forward_all(vswp, mp, caller, arg);
613 			} else if (IS_MULTICAST(ehp)) {
614 				D2(vswp, "%s: MULTICAST pkt", __func__);
615 				(void) vsw_forward_grp(vswp, mp, caller, arg);
616 			} else {
617 				/*
618 				 * Unicast pkt from vnet that we don't have
619 				 * an FDB entry for, so must be destinded for
620 				 * the outside world. Attempt to send up to the
621 				 * IP layer to allow it to deal with it.
622 				 */
623 				if (caller == VSW_VNETPORT) {
624 					vsw_mac_rx(vswp, mrh,
625 					    mp, VSW_MACRX_FREEMSG);
626 				}
627 			}
628 		}
629 	}
630 
631 	D1(vswp, "%s: exit", __func__);
632 }
633 
634 /*
635  * Setup mac addrs and hio resources for layer 2 switching only.
636  */
637 void
638 vsw_setup_layer2_post_process(vsw_t *vswp)
639 {
640 	if (vswp->smode & VSW_LAYER2) {
641 		/*
642 		 * Program unicst, mcst addrs of vsw
643 		 * interface and ports in the physdev.
644 		 */
645 		vsw_set_addrs(vswp);
646 
647 		/* Start HIO for ports that have already connected */
648 		vsw_hio_start_ports(vswp);
649 	}
650 }
651 
652 /*
653  * Forward the ethernet frame to all ports (VNETPORTs, PHYSDEV, LOCALDEV),
654  * except the caller (port on which frame arrived).
655  */
656 static int
657 vsw_forward_all(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg)
658 {
659 	vsw_port_list_t	*plist = &vswp->plist;
660 	vsw_port_t	*portp;
661 	mblk_t		*nmp = NULL;
662 	mblk_t		*ret_m = NULL;
663 	int		skip_port = 0;
664 
665 	D1(vswp, "vsw_forward_all: enter\n");
666 
667 	/*
668 	 * Broadcast message from inside ldoms so send to outside
669 	 * world if in either of layer 2 modes.
670 	 */
671 	if ((vswp->smode & VSW_LAYER2) &&
672 	    ((caller == VSW_LOCALDEV) || (caller == VSW_VNETPORT))) {
673 
674 		nmp = vsw_dupmsgchain(mp);
675 		if (nmp) {
676 			if ((ret_m = vsw_tx_msg(vswp, nmp, caller, arg))
677 			    != NULL) {
678 				DERR(vswp, "%s: dropping pkt(s) "
679 				    "consisting of %ld bytes of data for"
680 				    " physical device", __func__, MBLKL(ret_m));
681 				freemsgchain(ret_m);
682 			}
683 		}
684 	}
685 
686 	if (caller == VSW_VNETPORT)
687 		skip_port = 1;
688 
689 	/*
690 	 * Broadcast message from other vnet (layer 2 or 3) or outside
691 	 * world (layer 2 only), send up stack if plumbed.
692 	 */
693 	if ((caller == VSW_PHYSDEV) || (caller == VSW_VNETPORT)) {
694 		vsw_mac_rx(vswp, NULL, mp, VSW_MACRX_COPYMSG);
695 	}
696 
697 	/* send it to all VNETPORTs */
698 	READ_ENTER(&plist->lockrw);
699 	for (portp = plist->head; portp != NULL; portp = portp->p_next) {
700 		D2(vswp, "vsw_forward_all: port %d", portp->p_instance);
701 		/*
702 		 * Caution ! - don't reorder these two checks as arg
703 		 * will be NULL if the caller is PHYSDEV. skip_port is
704 		 * only set if caller is VNETPORT.
705 		 */
706 		if ((skip_port) && (portp == arg)) {
707 			continue;
708 		} else {
709 			nmp = vsw_dupmsgchain(mp);
710 			if (nmp) {
711 				/*
712 				 * The plist->lockrw is protecting the
713 				 * portp from getting destroyed here.
714 				 * So, no ref_cnt is incremented here.
715 				 */
716 				(void) vsw_portsend(portp, nmp);
717 			} else {
718 				DERR(vswp, "vsw_forward_all: nmp NULL");
719 			}
720 		}
721 	}
722 	RW_EXIT(&plist->lockrw);
723 
724 	freemsgchain(mp);
725 
726 	D1(vswp, "vsw_forward_all: exit\n");
727 	return (0);
728 }
729 
730 /*
731  * Forward pkts to any devices or interfaces which have registered
732  * an interest in them (i.e. multicast groups).
733  */
734 static int
735 vsw_forward_grp(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg)
736 {
737 	struct ether_header	*ehp = (struct ether_header *)mp->b_rptr;
738 	mfdb_ent_t		*entp = NULL;
739 	mfdb_ent_t		*tpp = NULL;
740 	vsw_port_t 		*port;
741 	uint64_t		key = 0;
742 	mblk_t			*nmp = NULL;
743 	mblk_t			*ret_m = NULL;
744 	boolean_t		check_if = B_TRUE;
745 
746 	/*
747 	 * Convert address to hash table key
748 	 */
749 	KEY_HASH(key, &ehp->ether_dhost);
750 
751 	D1(vswp, "%s: key 0x%llx", __func__, key);
752 
753 	/*
754 	 * If pkt came from either a vnet or down the stack (if we are
755 	 * plumbed) and we are in layer 2 mode, then we send the pkt out
756 	 * over the physical adapter, and then check to see if any other
757 	 * vnets are interested in it.
758 	 */
759 	if ((vswp->smode & VSW_LAYER2) &&
760 	    ((caller == VSW_VNETPORT) || (caller == VSW_LOCALDEV))) {
761 		nmp = vsw_dupmsgchain(mp);
762 		if (nmp) {
763 			if ((ret_m = vsw_tx_msg(vswp, nmp, caller, arg))
764 			    != NULL) {
765 				DERR(vswp, "%s: dropping pkt(s) consisting of "
766 				    "%ld bytes of data for physical device",
767 				    __func__, MBLKL(ret_m));
768 				freemsgchain(ret_m);
769 			}
770 		}
771 	}
772 
773 	READ_ENTER(&vswp->mfdbrw);
774 	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)key,
775 	    (mod_hash_val_t *)&entp) != 0) {
776 		D3(vswp, "%s: no table entry found for addr 0x%llx",
777 		    __func__, key);
778 	} else {
779 		/*
780 		 * Send to list of devices associated with this address...
781 		 */
782 		for (tpp = entp; tpp != NULL; tpp = tpp->nextp) {
783 
784 			/* dont send to ourselves */
785 			if ((caller == VSW_VNETPORT) &&
786 			    (tpp->d_addr == (void *)arg)) {
787 				port = (vsw_port_t *)tpp->d_addr;
788 				D3(vswp, "%s: not sending to ourselves"
789 				    " : port %d", __func__, port->p_instance);
790 				continue;
791 
792 			} else if ((caller == VSW_LOCALDEV) &&
793 			    (tpp->d_type == VSW_LOCALDEV)) {
794 				D2(vswp, "%s: not sending back up stack",
795 				    __func__);
796 				continue;
797 			}
798 
799 			if (tpp->d_type == VSW_VNETPORT) {
800 				port = (vsw_port_t *)tpp->d_addr;
801 				D3(vswp, "%s: sending to port %ld for addr "
802 				    "0x%llx", __func__, port->p_instance, key);
803 
804 				nmp = vsw_dupmsgchain(mp);
805 				if (nmp) {
806 					/*
807 					 * The vswp->mfdbrw is protecting the
808 					 * portp from getting destroyed here.
809 					 * So, no ref_cnt is incremented here.
810 					 */
811 					(void) vsw_portsend(port, nmp);
812 				}
813 			} else {
814 				vsw_mac_rx(vswp, NULL,
815 				    mp, VSW_MACRX_COPYMSG);
816 				D2(vswp, "%s: sending up stack"
817 				    " for addr 0x%llx", __func__, key);
818 				check_if = B_FALSE;
819 			}
820 		}
821 	}
822 
823 	RW_EXIT(&vswp->mfdbrw);
824 
825 	/*
826 	 * If the pkt came from either a vnet or from physical device,
827 	 * and if we havent already sent the pkt up the stack then we
828 	 * check now if we can/should (i.e. the interface is plumbed
829 	 * and in promisc mode).
830 	 */
831 	if ((check_if) &&
832 	    ((caller == VSW_VNETPORT) || (caller == VSW_PHYSDEV))) {
833 		vsw_mac_rx(vswp, NULL, mp,
834 		    VSW_MACRX_PROMISC | VSW_MACRX_COPYMSG);
835 	}
836 
837 	freemsgchain(mp);
838 
839 	D1(vswp, "%s: exit", __func__);
840 
841 	return (0);
842 }
843 
/*
 * Build the vlan id hash table of a vsw device or port and populate it
 * with every vlan id assigned to that device or port.
 *
 * Arguments:
 *   arg:  vsw device (vsw_t) or port (vsw_port_t).
 *   type: what arg is; VSW_LOCALDEV (vsw device) or VSW_VNETPORT (port).
 */
void
vsw_create_vlans(void *arg, int type)
{
	/* step 1: the (empty) hash table itself */
	vsw_vlan_create_hash(arg, type);

	/* step 2: the vlan ids of the device/port go into the table */
	vsw_vlan_add_ids(arg, type);
}
861 
/*
 * Tear down the vlan id hash table of a vsw device or port: its vlan ids
 * are removed from the table first, then the table is destroyed.
 *
 * Arguments:
 *   arg:  vsw device (vsw_t) or port (vsw_port_t).
 *   type: what arg is; VSW_LOCALDEV (vsw device) or VSW_VNETPORT (port).
 */
void
vsw_destroy_vlans(void *arg, int type)
{
	/* step 1: empty the table */
	vsw_vlan_remove_ids(arg, type);

	/* step 2: destroy the table itself */
	vsw_vlan_destroy_hash(arg, type);
}
878 
879 /*
880  * Create a vlan-id hash table for the given vsw device or port.
881  */
882 static void
883 vsw_vlan_create_hash(void *arg, int type)
884 {
885 	char		hashname[MAXNAMELEN];
886 
887 	if (type == VSW_LOCALDEV) {
888 		vsw_t		*vswp = (vsw_t *)arg;
889 
890 		(void) snprintf(hashname, MAXNAMELEN, "vsw%d-vlan-hash",
891 		    vswp->instance);
892 
893 		vswp->vlan_nchains = vsw_vlan_nchains;
894 		vswp->vlan_hashp = mod_hash_create_idhash(hashname,
895 		    vswp->vlan_nchains, mod_hash_null_valdtor);
896 
897 	} else if (type == VSW_VNETPORT) {
898 		vsw_port_t	*portp = (vsw_port_t *)arg;
899 
900 		(void) snprintf(hashname, MAXNAMELEN, "port%d-vlan-hash",
901 		    portp->p_instance);
902 
903 		portp->vlan_nchains = vsw_vlan_nchains;
904 		portp->vlan_hashp = mod_hash_create_idhash(hashname,
905 		    portp->vlan_nchains, mod_hash_null_valdtor);
906 
907 	} else {
908 		return;
909 	}
910 }
911 
912 /*
913  * Destroy the vlan-id hash table for the given vsw device or port.
914  */
915 static void
916 vsw_vlan_destroy_hash(void *arg, int type)
917 {
918 	if (type == VSW_LOCALDEV) {
919 		vsw_t		*vswp = (vsw_t *)arg;
920 
921 		mod_hash_destroy_hash(vswp->vlan_hashp);
922 		vswp->vlan_nchains = 0;
923 	} else if (type == VSW_VNETPORT) {
924 		vsw_port_t	*portp = (vsw_port_t *)arg;
925 
926 		mod_hash_destroy_hash(portp->vlan_hashp);
927 		portp->vlan_nchains = 0;
928 	} else {
929 		return;
930 	}
931 }
932 
933 /*
934  * Add vlan ids of the given vsw device or port into its hash table.
935  */
936 void
937 vsw_vlan_add_ids(void *arg, int type)
938 {
939 	int	rv;
940 	int	i;
941 
942 	if (type == VSW_LOCALDEV) {
943 		vsw_t		*vswp = (vsw_t *)arg;
944 
945 		rv = mod_hash_insert(vswp->vlan_hashp,
946 		    (mod_hash_key_t)VLAN_ID_KEY(vswp->pvid),
947 		    (mod_hash_val_t)B_TRUE);
948 		if (rv != 0) {
949 			cmn_err(CE_WARN, "vsw%d: Duplicate vlan-id(%d) for "
950 			    "the interface", vswp->instance, vswp->pvid);
951 		}
952 
953 		for (i = 0; i < vswp->nvids; i++) {
954 			rv = mod_hash_insert(vswp->vlan_hashp,
955 			    (mod_hash_key_t)VLAN_ID_KEY(vswp->vids[i].vl_vid),
956 			    (mod_hash_val_t)B_TRUE);
957 			if (rv != 0) {
958 				cmn_err(CE_WARN, "vsw%d: Duplicate vlan-id(%d)"
959 				    " for the interface", vswp->instance,
960 				    vswp->pvid);
961 			}
962 		}
963 
964 	} else if (type == VSW_VNETPORT) {
965 		vsw_port_t	*portp = (vsw_port_t *)arg;
966 		vsw_t		*vswp = portp->p_vswp;
967 
968 		rv = mod_hash_insert(portp->vlan_hashp,
969 		    (mod_hash_key_t)VLAN_ID_KEY(portp->pvid),
970 		    (mod_hash_val_t)B_TRUE);
971 		if (rv != 0) {
972 			cmn_err(CE_WARN, "vsw%d: Duplicate vlan-id(%d) for "
973 			    "the port(%d)", vswp->instance, vswp->pvid,
974 			    portp->p_instance);
975 		}
976 
977 		for (i = 0; i < portp->nvids; i++) {
978 			rv = mod_hash_insert(portp->vlan_hashp,
979 			    (mod_hash_key_t)VLAN_ID_KEY(portp->vids[i].vl_vid),
980 			    (mod_hash_val_t)B_TRUE);
981 			if (rv != 0) {
982 				cmn_err(CE_WARN, "vsw%d: Duplicate vlan-id(%d)"
983 				    " for the port(%d)", vswp->instance,
984 				    vswp->pvid, portp->p_instance);
985 			}
986 		}
987 
988 	}
989 }
990 
991 /*
992  * Remove vlan ids of the given vsw device or port from its hash table.
993  */
994 void
995 vsw_vlan_remove_ids(void *arg, int type)
996 {
997 	mod_hash_val_t	vp;
998 	int		rv;
999 	int		i;
1000 
1001 	if (type == VSW_LOCALDEV) {
1002 		vsw_t		*vswp = (vsw_t *)arg;
1003 
1004 		rv = vsw_vlan_lookup(vswp->vlan_hashp, vswp->pvid);
1005 		if (rv == B_TRUE) {
1006 			rv = mod_hash_remove(vswp->vlan_hashp,
1007 			    (mod_hash_key_t)VLAN_ID_KEY(vswp->pvid),
1008 			    (mod_hash_val_t *)&vp);
1009 			ASSERT(rv == 0);
1010 		}
1011 
1012 		for (i = 0; i < vswp->nvids; i++) {
1013 			rv = vsw_vlan_lookup(vswp->vlan_hashp,
1014 			    vswp->vids[i].vl_vid);
1015 			if (rv == B_TRUE) {
1016 				rv = mod_hash_remove(vswp->vlan_hashp,
1017 				    (mod_hash_key_t)VLAN_ID_KEY(
1018 				    vswp->vids[i].vl_vid),
1019 				    (mod_hash_val_t *)&vp);
1020 				ASSERT(rv == 0);
1021 			}
1022 		}
1023 
1024 	} else if (type == VSW_VNETPORT) {
1025 		vsw_port_t	*portp = (vsw_port_t *)arg;
1026 
1027 		portp = (vsw_port_t *)arg;
1028 		rv = vsw_vlan_lookup(portp->vlan_hashp, portp->pvid);
1029 		if (rv == B_TRUE) {
1030 			rv = mod_hash_remove(portp->vlan_hashp,
1031 			    (mod_hash_key_t)VLAN_ID_KEY(portp->pvid),
1032 			    (mod_hash_val_t *)&vp);
1033 			ASSERT(rv == 0);
1034 		}
1035 
1036 		for (i = 0; i < portp->nvids; i++) {
1037 			rv = vsw_vlan_lookup(portp->vlan_hashp,
1038 			    portp->vids[i].vl_vid);
1039 			if (rv == B_TRUE) {
1040 				rv = mod_hash_remove(portp->vlan_hashp,
1041 				    (mod_hash_key_t)VLAN_ID_KEY(
1042 				    portp->vids[i].vl_vid),
1043 				    (mod_hash_val_t *)&vp);
1044 				ASSERT(rv == 0);
1045 			}
1046 		}
1047 
1048 	} else {
1049 		return;
1050 	}
1051 }
1052 
1053 /*
1054  * Find the given vlan id in the hash table.
1055  * Return: B_TRUE if the id is found; B_FALSE if not found.
1056  */
1057 boolean_t
1058 vsw_vlan_lookup(mod_hash_t *vlan_hashp, uint16_t vid)
1059 {
1060 	int		rv;
1061 	mod_hash_val_t	vp;
1062 
1063 	rv = mod_hash_find(vlan_hashp, VLAN_ID_KEY(vid), (mod_hash_val_t *)&vp);
1064 
1065 	if (rv != 0)
1066 		return (B_FALSE);
1067 
1068 	return (B_TRUE);
1069 }
1070 
1071 /*
1072  * Add an entry into FDB for the given vsw.
1073  */
1074 void
1075 vsw_fdbe_add(vsw_t *vswp, void *port)
1076 {
1077 	uint64_t	addr = 0;
1078 	vsw_port_t	*portp;
1079 	vsw_fdbe_t	*fp;
1080 	int		rv;
1081 
1082 	portp = (vsw_port_t *)port;
1083 	KEY_HASH(addr, &portp->p_macaddr);
1084 
1085 	fp = kmem_zalloc(sizeof (vsw_fdbe_t), KM_SLEEP);
1086 	fp->portp = port;
1087 
1088 	/*
1089 	 * Note: duplicate keys will be rejected by mod_hash.
1090 	 */
1091 	rv = mod_hash_insert(vswp->fdb_hashp, (mod_hash_key_t)addr,
1092 	    (mod_hash_val_t)fp);
1093 	if (rv != 0) {
1094 		cmn_err(CE_WARN, "vsw%d: Duplicate mac-address(%s) for "
1095 		    "the port(%d)", vswp->instance,
1096 		    ether_sprintf(&portp->p_macaddr), portp->p_instance);
1097 	}
1098 }
1099 
1100 /*
1101  * Remove an entry from FDB.
1102  */
1103 void
1104 vsw_fdbe_del(vsw_t *vswp, struct ether_addr *eaddr)
1105 {
1106 	uint64_t	addr = 0;
1107 	vsw_fdbe_t	*fp;
1108 	int		rv;
1109 
1110 	KEY_HASH(addr, eaddr);
1111 
1112 	/*
1113 	 * Remove the entry from fdb hash table.
1114 	 * This prevents further references to this fdb entry.
1115 	 */
1116 	rv = mod_hash_remove(vswp->fdb_hashp, (mod_hash_key_t)addr,
1117 	    (mod_hash_val_t *)&fp);
1118 	if (rv != 0) {
1119 		/* invalid key? */
1120 		return;
1121 	}
1122 
1123 	/*
1124 	 * If there are threads already ref holding before the entry was
1125 	 * removed from hash table, then wait for ref count to drop to zero.
1126 	 */
1127 	while (fp->refcnt != 0) {
1128 		delay(drv_usectohz(vsw_fdbe_refcnt_delay));
1129 	}
1130 
1131 	kmem_free(fp, sizeof (*fp));
1132 }
1133 
1134 /*
1135  * Search fdb for a given mac address. If an entry is found, hold
1136  * a reference to it and return the entry, else returns NULL.
1137  */
1138 static vsw_fdbe_t *
1139 vsw_fdbe_find(vsw_t *vswp, struct ether_addr *addrp)
1140 {
1141 	uint64_t	key = 0;
1142 	vsw_fdbe_t	*fp;
1143 	int		rv;
1144 
1145 	KEY_HASH(key, addrp);
1146 
1147 	rv = mod_hash_find_cb(vswp->fdb_hashp, (mod_hash_key_t)key,
1148 	    (mod_hash_val_t *)&fp, vsw_fdbe_find_cb);
1149 
1150 	if (rv != 0)
1151 		return (NULL);
1152 
1153 	return (fp);
1154 }
1155 
1156 /*
1157  * Callback function provided to mod_hash_find_cb(). After finding the fdb
1158  * entry corresponding to the key (macaddr), this callback will be invoked by
1159  * mod_hash_find_cb() to atomically increment the reference count on the fdb
1160  * entry before returning the found entry.
1161  */
1162 static void
1163 vsw_fdbe_find_cb(mod_hash_key_t key, mod_hash_val_t val)
1164 {
1165 	_NOTE(ARGUNUSED(key))
1166 	VSW_FDBE_REFHOLD((vsw_fdbe_t *)val);
1167 }
1168 
1169 /*
1170  * A given frame must be always tagged with the appropriate vlan id (unless it
1171  * is in the default-vlan) before the mac address switching function is called.
1172  * Otherwise, after switching function determines the destination, we cannot
1173  * figure out if the destination belongs to the the same vlan that the frame
1174  * originated from and if it needs tag/untag. Frames which are inbound from
1175  * the external(physical) network over a vlan trunk link are always tagged.
1176  * However frames which are received from a vnet-port over ldc or frames which
1177  * are coming down the stack on the service domain over vsw interface may be
1178  * untagged. These frames must be tagged with the appropriate pvid of the
1179  * sender (vnet-port or vsw device), before invoking the switching function.
1180  *
1181  * Arguments:
1182  *   arg:    caller of the function.
1183  *   type:   type of arg(caller): VSW_LOCALDEV(vsw) or VSW_VNETPORT(port)
1184  *   mp:     frame(s) to be tagged.
1185  */
1186 mblk_t *
1187 vsw_vlan_frame_pretag(void *arg, int type, mblk_t *mp)
1188 {
1189 	vsw_t			*vswp;
1190 	vsw_port_t		*portp;
1191 	struct ether_header	*ehp;
1192 	mblk_t			*bp;
1193 	mblk_t			*bpt;
1194 	mblk_t			*bph;
1195 	mblk_t			*bpn;
1196 	uint16_t		pvid;
1197 
1198 	ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));
1199 
1200 	if (type == VSW_LOCALDEV) {
1201 		vswp = (vsw_t *)arg;
1202 		pvid = vswp->pvid;
1203 		portp = NULL;
1204 	} else {
1205 		/* VSW_VNETPORT */
1206 		portp = (vsw_port_t *)arg;
1207 		pvid = portp->pvid;
1208 		vswp = portp->p_vswp;
1209 	}
1210 
1211 	bpn = bph = bpt = NULL;
1212 
1213 	for (bp = mp; bp != NULL; bp = bpn) {
1214 
1215 		bpn = bp->b_next;
1216 		bp->b_next = bp->b_prev = NULL;
1217 
1218 		/* Determine if it is an untagged frame */
1219 		ehp = (struct ether_header *)bp->b_rptr;
1220 
1221 		if (ehp->ether_type != ETHERTYPE_VLAN) {	/* untagged */
1222 
1223 			/* no need to tag if the frame is in default vlan */
1224 			if (pvid != vswp->default_vlan_id) {
1225 				bp = vnet_vlan_insert_tag(bp, pvid);
1226 				if (bp == NULL) {
1227 					continue;
1228 				}
1229 			}
1230 		}
1231 
1232 		/* build a chain of processed packets */
1233 		if (bph == NULL) {
1234 			bph = bpt = bp;
1235 		} else {
1236 			bpt->b_next = bp;
1237 			bpt = bp;
1238 		}
1239 
1240 	}
1241 
1242 	return (bph);
1243 }
1244 
1245 /*
1246  * Frames destined to a vnet-port or to the local vsw interface, must be
1247  * untagged if necessary before sending. This function first checks that the
1248  * frame can be sent to the destination in the vlan identified by the frame
1249  * tag. Note that when this function is invoked the frame must have been
1250  * already tagged (unless it is in the default-vlan). Because, this function is
1251  * called when the switching function determines the destination and invokes
1252  * its send function (vnet-port or vsw interface) and all frames would have
1253  * been tagged by this time (see comments in vsw_vlan_frame_pretag()).
1254  *
1255  * Arguments:
1256  *   arg:    destination device.
1257  *   type:   type of arg(destination): VSW_LOCALDEV(vsw) or VSW_VNETPORT(port)
1258  *   np:     head of pkt chain to be validated and untagged.
1259  *   npt:    tail of pkt chain to be validated and untagged.
1260  *
1261  * Returns:
1262  *   np:     head of updated chain of packets
1263  *   npt:    tail of updated chain of packets
1264  *   rv:     count of the packets in the returned list
1265  */
1266 uint32_t
1267 vsw_vlan_frame_untag(void *arg, int type, mblk_t **np, mblk_t **npt)
1268 {
1269 	mblk_t			*bp;
1270 	mblk_t			*bpt;
1271 	mblk_t			*bph;
1272 	mblk_t			*bpn;
1273 	vsw_port_t		*portp;
1274 	vsw_t			*vswp;
1275 	uint32_t		count;
1276 	struct ether_header	*ehp;
1277 	boolean_t		is_tagged;
1278 	boolean_t		rv;
1279 	uint16_t		vlan_id;
1280 	uint16_t		pvid;
1281 	mod_hash_t		*vlan_hashp;
1282 
1283 	ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));
1284 
1285 
1286 	if (type == VSW_LOCALDEV) {
1287 		vswp = (vsw_t *)arg;
1288 		pvid = vswp->pvid;
1289 		vlan_hashp = vswp->vlan_hashp;
1290 		portp = NULL;
1291 	} else {
1292 		/* type == VSW_VNETPORT */
1293 		portp = (vsw_port_t *)arg;
1294 		vswp = portp->p_vswp;
1295 		vlan_hashp = portp->vlan_hashp;
1296 		pvid = portp->pvid;
1297 	}
1298 
1299 	/*
1300 	 * If the MAC layer switching in place, then
1301 	 * untagging required only if the pvid is not
1302 	 * the same as default_vlan_id. This is because,
1303 	 * the MAC layer will send packets for the
1304 	 * registered vlans only.
1305 	 */
1306 	if ((vswp->mac_cl_switching == B_TRUE) &&
1307 	    (pvid == vswp->default_vlan_id)) {
1308 		/* simply count and set the tail */
1309 		count = 1;
1310 		bp = *np;
1311 		ASSERT(bp != NULL);
1312 		while (bp->b_next != NULL) {
1313 			bp = bp->b_next;
1314 			count++;
1315 		}
1316 		*npt = bp;
1317 		return (count);
1318 	}
1319 
1320 	bpn = bph = bpt = NULL;
1321 	count = 0;
1322 
1323 	for (bp = *np; bp != NULL; bp = bpn) {
1324 
1325 		bpn = bp->b_next;
1326 		bp->b_next = bp->b_prev = NULL;
1327 
1328 		/*
1329 		 * Determine the vlan id that the frame belongs to.
1330 		 */
1331 		ehp = (struct ether_header *)bp->b_rptr;
1332 		is_tagged = vsw_frame_lookup_vid(arg, type, ehp, &vlan_id);
1333 
1334 		/*
1335 		 * If MAC layer switching in place, then we
1336 		 * need to untag only if the tagged packet has
1337 		 * vlan-id same as the pvid.
1338 		 */
1339 		if (vswp->mac_cl_switching == B_TRUE) {
1340 
1341 			/* only tagged packets expected here */
1342 			ASSERT(is_tagged == B_TRUE);
1343 			if (vlan_id == pvid) {
1344 				bp = vnet_vlan_remove_tag(bp);
1345 				if (bp == NULL) {
1346 					/* packet dropped */
1347 					continue;
1348 				}
1349 			}
1350 		} else { /* No MAC layer switching */
1351 
1352 			/*
1353 			 * Check the frame header if tag/untag is  needed.
1354 			 */
1355 			if (is_tagged == B_FALSE) {
1356 				/*
1357 				 * Untagged frame. We shouldn't have an
1358 				 * untagged packet at this point, unless
1359 				 * the destination's  vlan id is
1360 				 * default-vlan-id; if it is not the
1361 				 * default-vlan-id, we drop the packet.
1362 				 */
1363 				if (vlan_id != vswp->default_vlan_id) {
1364 					/* drop the packet */
1365 					freemsg(bp);
1366 					continue;
1367 				}
1368 			} else {	/* Tagged */
1369 				/*
1370 				 * Tagged frame, untag if it's the
1371 				 * destination's pvid.
1372 				 */
1373 				if (vlan_id == pvid) {
1374 
1375 					bp = vnet_vlan_remove_tag(bp);
1376 					if (bp == NULL) {
1377 						/* packet dropped */
1378 						continue;
1379 					}
1380 				} else {
1381 
1382 					/*
1383 					 * Check if the destination is in the
1384 					 * same vlan.
1385 					 */
1386 					rv = vsw_vlan_lookup(vlan_hashp,
1387 					    vlan_id);
1388 					if (rv == B_FALSE) {
1389 						/* drop the packet */
1390 						freemsg(bp);
1391 						continue;
1392 					}
1393 				}
1394 
1395 			}
1396 		}
1397 
1398 		/* build a chain of processed packets */
1399 		if (bph == NULL) {
1400 			bph = bpt = bp;
1401 		} else {
1402 			bpt->b_next = bp;
1403 			bpt = bp;
1404 		}
1405 		count++;
1406 	}
1407 
1408 	*np = bph;
1409 	*npt = bpt;
1410 	return (count);
1411 }
1412 
1413 /*
1414  * Lookup the vlan id of the given frame. If it is a vlan-tagged frame,
1415  * then the vlan-id is available in the tag; otherwise, its vlan id is
1416  * implicitly obtained based on the caller (destination of the frame:
1417  * VSW_VNETPORT or VSW_LOCALDEV).
1418  * The vlan id determined is returned in vidp.
1419  * Returns: B_TRUE if it is a tagged frame; B_FALSE if it is untagged.
1420  */
1421 boolean_t
1422 vsw_frame_lookup_vid(void *arg, int caller, struct ether_header *ehp,
1423 	uint16_t *vidp)
1424 {
1425 	struct ether_vlan_header	*evhp;
1426 	vsw_t				*vswp;
1427 	vsw_port_t			*portp;
1428 
1429 	/* If it's a tagged frame, get the vid from vlan header */
1430 	if (ehp->ether_type == ETHERTYPE_VLAN) {
1431 
1432 		evhp = (struct ether_vlan_header *)ehp;
1433 		*vidp = VLAN_ID(ntohs(evhp->ether_tci));
1434 		return (B_TRUE);
1435 	}
1436 
1437 	/* Untagged frame; determine vlan id based on caller */
1438 	switch (caller) {
1439 
1440 	case VSW_VNETPORT:
1441 		/*
1442 		 * packet destined to a vnet; vlan-id is pvid of vnet-port.
1443 		 */
1444 		portp = (vsw_port_t *)arg;
1445 		*vidp = portp->pvid;
1446 		break;
1447 
1448 	case VSW_LOCALDEV:
1449 
1450 		/*
1451 		 * packet destined to vsw interface;
1452 		 * vlan-id is port-vlan-id of vsw device.
1453 		 */
1454 		vswp = (vsw_t *)arg;
1455 		*vidp = vswp->pvid;
1456 		break;
1457 	}
1458 
1459 	return (B_FALSE);
1460 }
1461 
1462 /*
1463  * Add or remove multicast address(es).
1464  *
1465  * Returns 0 on success, 1 on failure.
1466  */
1467 int
1468 vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port)
1469 {
1470 	mcst_addr_t		*mcst_p = NULL;
1471 	vsw_t			*vswp = port->p_vswp;
1472 	uint64_t		addr = 0x0;
1473 	int			i;
1474 
1475 	D1(vswp, "%s: enter", __func__);
1476 
1477 	D2(vswp, "%s: %d addresses", __func__, mcst_pkt->count);
1478 
1479 	for (i = 0; i < mcst_pkt->count; i++) {
1480 		/*
1481 		 * Convert address into form that can be used
1482 		 * as hash table key.
1483 		 */
1484 		KEY_HASH(addr, &(mcst_pkt->mca[i]));
1485 
1486 		/*
1487 		 * Add or delete the specified address/port combination.
1488 		 */
1489 		if (mcst_pkt->set == 0x1) {
1490 			D3(vswp, "%s: adding multicast address 0x%llx for "
1491 			    "port %ld", __func__, addr, port->p_instance);
1492 			if (vsw_add_mcst(vswp, VSW_VNETPORT, addr, port) == 0) {
1493 				/*
1494 				 * Update the list of multicast
1495 				 * addresses contained within the
1496 				 * port structure to include this new
1497 				 * one.
1498 				 */
1499 				mcst_p = kmem_zalloc(sizeof (mcst_addr_t),
1500 				    KM_NOSLEEP);
1501 				if (mcst_p == NULL) {
1502 					DERR(vswp, "%s: unable to alloc mem",
1503 					    __func__);
1504 					(void) vsw_del_mcst(vswp,
1505 					    VSW_VNETPORT, addr, port);
1506 					return (1);
1507 				}
1508 
1509 				mcst_p->nextp = NULL;
1510 				mcst_p->addr = addr;
1511 				ether_copy(&mcst_pkt->mca[i], &mcst_p->mca);
1512 
1513 				/*
1514 				 * Program the address into HW. If the addr
1515 				 * has already been programmed then the MAC
1516 				 * just increments a ref counter (which is
1517 				 * used when the address is being deleted)
1518 				 */
1519 				if (vsw_mac_multicast_add(vswp, port, mcst_p,
1520 				    VSW_VNETPORT)) {
1521 					(void) vsw_del_mcst(vswp,
1522 					    VSW_VNETPORT, addr, port);
1523 					kmem_free(mcst_p, sizeof (*mcst_p));
1524 					return (1);
1525 				}
1526 
1527 				mutex_enter(&port->mca_lock);
1528 				mcst_p->nextp = port->mcap;
1529 				port->mcap = mcst_p;
1530 				mutex_exit(&port->mca_lock);
1531 
1532 			} else {
1533 				DERR(vswp, "%s: error adding multicast "
1534 				    "address 0x%llx for port %ld",
1535 				    __func__, addr, port->p_instance);
1536 				return (1);
1537 			}
1538 		} else {
1539 			/*
1540 			 * Delete an entry from the multicast hash
1541 			 * table and update the address list
1542 			 * appropriately.
1543 			 */
1544 			if (vsw_del_mcst(vswp, VSW_VNETPORT, addr, port) == 0) {
1545 				D3(vswp, "%s: deleting multicast address "
1546 				    "0x%llx for port %ld", __func__, addr,
1547 				    port->p_instance);
1548 
1549 				mcst_p = vsw_del_addr(VSW_VNETPORT, port, addr);
1550 				ASSERT(mcst_p != NULL);
1551 
1552 				/*
1553 				 * Remove the address from HW. The address
1554 				 * will actually only be removed once the ref
1555 				 * count within the MAC layer has dropped to
1556 				 * zero. I.e. we can safely call this fn even
1557 				 * if other ports are interested in this
1558 				 * address.
1559 				 */
1560 				vsw_mac_multicast_remove(vswp, port, mcst_p,
1561 				    VSW_VNETPORT);
1562 				kmem_free(mcst_p, sizeof (*mcst_p));
1563 
1564 			} else {
1565 				DERR(vswp, "%s: error deleting multicast "
1566 				    "addr 0x%llx for port %ld",
1567 				    __func__, addr, port->p_instance);
1568 				return (1);
1569 			}
1570 		}
1571 	}
1572 	D1(vswp, "%s: exit", __func__);
1573 	return (0);
1574 }
1575 
1576 /*
1577  * Add a new multicast entry.
1578  *
1579  * Search hash table based on address. If match found then
1580  * update associated val (which is chain of ports), otherwise
1581  * create new key/val (addr/port) pair and insert into table.
1582  */
1583 int
1584 vsw_add_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg)
1585 {
1586 	int		dup = 0;
1587 	int		rv = 0;
1588 	mfdb_ent_t	*ment = NULL;
1589 	mfdb_ent_t	*tmp_ent = NULL;
1590 	mfdb_ent_t	*new_ent = NULL;
1591 	void		*tgt = NULL;
1592 
1593 	if (devtype == VSW_VNETPORT) {
1594 		/*
1595 		 * Being invoked from a vnet.
1596 		 */
1597 		ASSERT(arg != NULL);
1598 		tgt = arg;
1599 		D2(NULL, "%s: port %d : address 0x%llx", __func__,
1600 		    ((vsw_port_t *)arg)->p_instance, addr);
1601 	} else {
1602 		/*
1603 		 * We are being invoked via the m_multicst mac entry
1604 		 * point.
1605 		 */
1606 		D2(NULL, "%s: address 0x%llx", __func__, addr);
1607 		tgt = (void *)vswp;
1608 	}
1609 
1610 	WRITE_ENTER(&vswp->mfdbrw);
1611 	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr,
1612 	    (mod_hash_val_t *)&ment) != 0) {
1613 
1614 		/* address not currently in table */
1615 		ment = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP);
1616 		ment->d_addr = (void *)tgt;
1617 		ment->d_type = devtype;
1618 		ment->nextp = NULL;
1619 
1620 		if (mod_hash_insert(vswp->mfdb, (mod_hash_key_t)addr,
1621 		    (mod_hash_val_t)ment) != 0) {
1622 			DERR(vswp, "%s: hash table insertion failed", __func__);
1623 			kmem_free(ment, sizeof (mfdb_ent_t));
1624 			rv = 1;
1625 		} else {
1626 			D2(vswp, "%s: added initial entry for 0x%llx to "
1627 			    "table", __func__, addr);
1628 		}
1629 	} else {
1630 		/*
1631 		 * Address in table. Check to see if specified port
1632 		 * is already associated with the address. If not add
1633 		 * it now.
1634 		 */
1635 		tmp_ent = ment;
1636 		while (tmp_ent != NULL) {
1637 			if (tmp_ent->d_addr == (void *)tgt) {
1638 				if (devtype == VSW_VNETPORT) {
1639 					DERR(vswp, "%s: duplicate port entry "
1640 					    "found for portid %ld and key "
1641 					    "0x%llx", __func__,
1642 					    ((vsw_port_t *)arg)->p_instance,
1643 					    addr);
1644 				} else {
1645 					DERR(vswp, "%s: duplicate entry found"
1646 					    "for key 0x%llx", __func__, addr);
1647 				}
1648 				rv = 1;
1649 				dup = 1;
1650 				break;
1651 			}
1652 			tmp_ent = tmp_ent->nextp;
1653 		}
1654 
1655 		/*
1656 		 * Port not on list so add it to end now.
1657 		 */
1658 		if (0 == dup) {
1659 			D2(vswp, "%s: added entry for 0x%llx to table",
1660 			    __func__, addr);
1661 			new_ent = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP);
1662 			new_ent->d_addr = (void *)tgt;
1663 			new_ent->d_type = devtype;
1664 			new_ent->nextp = NULL;
1665 
1666 			tmp_ent = ment;
1667 			while (tmp_ent->nextp != NULL)
1668 				tmp_ent = tmp_ent->nextp;
1669 
1670 			tmp_ent->nextp = new_ent;
1671 		}
1672 	}
1673 
1674 	RW_EXIT(&vswp->mfdbrw);
1675 	return (rv);
1676 }
1677 
1678 /*
1679  * Remove a multicast entry from the hashtable.
1680  *
1681  * Search hash table based on address. If match found, scan
1682  * list of ports associated with address. If specified port
1683  * found remove it from list.
1684  */
1685 int
1686 vsw_del_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg)
1687 {
1688 	mfdb_ent_t	*ment = NULL;
1689 	mfdb_ent_t	*curr_p, *prev_p;
1690 	void		*tgt = NULL;
1691 
1692 	D1(vswp, "%s: enter", __func__);
1693 
1694 	if (devtype == VSW_VNETPORT) {
1695 		tgt = (vsw_port_t *)arg;
1696 		D2(vswp, "%s: removing port %d from mFDB for address"
1697 		    " 0x%llx", __func__, ((vsw_port_t *)tgt)->p_instance, addr);
1698 	} else {
1699 		D2(vswp, "%s: removing entry", __func__);
1700 		tgt = (void *)vswp;
1701 	}
1702 
1703 	WRITE_ENTER(&vswp->mfdbrw);
1704 	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr,
1705 	    (mod_hash_val_t *)&ment) != 0) {
1706 		D2(vswp, "%s: address 0x%llx not in table", __func__, addr);
1707 		RW_EXIT(&vswp->mfdbrw);
1708 		return (1);
1709 	}
1710 
1711 	prev_p = curr_p = ment;
1712 
1713 	while (curr_p != NULL) {
1714 		if (curr_p->d_addr == (void *)tgt) {
1715 			if (devtype == VSW_VNETPORT) {
1716 				D2(vswp, "%s: port %d found", __func__,
1717 				    ((vsw_port_t *)tgt)->p_instance);
1718 			} else {
1719 				D2(vswp, "%s: instance found", __func__);
1720 			}
1721 
1722 			if (prev_p == curr_p) {
1723 				/*
1724 				 * head of list, if no other element is in
1725 				 * list then destroy this entry, otherwise
1726 				 * just replace it with updated value.
1727 				 */
1728 				ment = curr_p->nextp;
1729 				if (ment == NULL) {
1730 					(void) mod_hash_destroy(vswp->mfdb,
1731 					    (mod_hash_val_t)addr);
1732 				} else {
1733 					(void) mod_hash_replace(vswp->mfdb,
1734 					    (mod_hash_key_t)addr,
1735 					    (mod_hash_val_t)ment);
1736 				}
1737 			} else {
1738 				/*
1739 				 * Not head of list, no need to do
1740 				 * replacement, just adjust list pointers.
1741 				 */
1742 				prev_p->nextp = curr_p->nextp;
1743 			}
1744 			break;
1745 		}
1746 
1747 		prev_p = curr_p;
1748 		curr_p = curr_p->nextp;
1749 	}
1750 
1751 	RW_EXIT(&vswp->mfdbrw);
1752 
1753 	D1(vswp, "%s: exit", __func__);
1754 
1755 	if (curr_p == NULL)
1756 		return (1);
1757 	kmem_free(curr_p, sizeof (mfdb_ent_t));
1758 	return (0);
1759 }
1760 
1761 /*
1762  * Port is being deleted, but has registered an interest in one
1763  * or more multicast groups. Using the list of addresses maintained
1764  * within the port structure find the appropriate entry in the hash
1765  * table and remove this port from the list of interested ports.
1766  */
1767 void
1768 vsw_del_mcst_port(vsw_port_t *port)
1769 {
1770 	mcst_addr_t	*mcap = NULL;
1771 	vsw_t		*vswp = port->p_vswp;
1772 
1773 	D1(vswp, "%s: enter", __func__);
1774 
1775 	mutex_enter(&port->mca_lock);
1776 
1777 	while ((mcap = port->mcap) != NULL) {
1778 
1779 		port->mcap = mcap->nextp;
1780 
1781 		mutex_exit(&port->mca_lock);
1782 
1783 		(void) vsw_del_mcst(vswp, VSW_VNETPORT,
1784 		    mcap->addr, port);
1785 
1786 		/*
1787 		 * Remove the address from HW. The address
1788 		 * will actually only be removed once the ref
1789 		 * count within the MAC layer has dropped to
1790 		 * zero. I.e. we can safely call this fn even
1791 		 * if other ports are interested in this
1792 		 * address.
1793 		 */
1794 		vsw_mac_multicast_remove(vswp, port, mcap, VSW_VNETPORT);
1795 		kmem_free(mcap, sizeof (*mcap));
1796 
1797 		mutex_enter(&port->mca_lock);
1798 
1799 	}
1800 
1801 	mutex_exit(&port->mca_lock);
1802 
1803 	D1(vswp, "%s: exit", __func__);
1804 }
1805 
1806 /*
1807  * This vsw instance is detaching, but has registered an interest in one
1808  * or more multicast groups. Using the list of addresses maintained
1809  * within the vsw structure find the appropriate entry in the hash
1810  * table and remove this instance from the list of interested ports.
1811  */
1812 void
1813 vsw_del_mcst_vsw(vsw_t *vswp)
1814 {
1815 	mcst_addr_t	*next_p = NULL;
1816 
1817 	D1(vswp, "%s: enter", __func__);
1818 
1819 	mutex_enter(&vswp->mca_lock);
1820 
1821 	while (vswp->mcap != NULL) {
1822 		DERR(vswp, "%s: deleting addr 0x%llx",
1823 		    __func__, vswp->mcap->addr);
1824 		(void) vsw_del_mcst(vswp, VSW_LOCALDEV, vswp->mcap->addr, NULL);
1825 
1826 		next_p = vswp->mcap->nextp;
1827 		kmem_free(vswp->mcap, sizeof (mcst_addr_t));
1828 		vswp->mcap = next_p;
1829 	}
1830 
1831 	vswp->mcap = NULL;
1832 	mutex_exit(&vswp->mca_lock);
1833 
1834 	D1(vswp, "%s: exit", __func__);
1835 }
1836 
1837 mblk_t *
1838 vsw_get_same_dest_list(struct ether_header *ehp, mblk_t **mpp)
1839 {
1840 	mblk_t			*bp;
1841 	mblk_t			*nbp;
1842 	mblk_t			*head = NULL;
1843 	mblk_t			*tail = NULL;
1844 	mblk_t			*prev = NULL;
1845 	struct ether_header	*behp;
1846 
1847 	/* process the chain of packets */
1848 	bp = *mpp;
1849 	while (bp) {
1850 		nbp = bp->b_next;
1851 		behp = (struct ether_header *)bp->b_rptr;
1852 		bp->b_prev = NULL;
1853 		if (ether_cmp(&ehp->ether_dhost, &behp->ether_dhost) == 0) {
1854 			if (prev == NULL) {
1855 				*mpp = nbp;
1856 			} else {
1857 				prev->b_next = nbp;
1858 			}
1859 			bp->b_next =  NULL;
1860 			if (head == NULL) {
1861 				head = tail = bp;
1862 			} else {
1863 				tail->b_next = bp;
1864 				tail = bp;
1865 			}
1866 		} else {
1867 			prev = bp;
1868 		}
1869 		bp = nbp;
1870 	}
1871 	return (head);
1872 }
1873 
1874 static mblk_t *
1875 vsw_dupmsgchain(mblk_t *mp)
1876 {
1877 	mblk_t	*nmp = NULL;
1878 	mblk_t	**nmpp = &nmp;
1879 
1880 	for (; mp != NULL; mp = mp->b_next) {
1881 		if ((*nmpp = dupmsg(mp)) == NULL) {
1882 			freemsgchain(nmp);
1883 			return (NULL);
1884 		}
1885 
1886 		nmpp = &((*nmpp)->b_next);
1887 	}
1888 
1889 	return (nmp);
1890 }
1891