xref: /illumos-gate/usr/src/uts/common/io/bridge.c (revision 7a6d80f1660abd4755c68cbd094d4a914681d26e)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  * Copyright (c) 2016 by Delphix. All rights reserved.
26  * Copyright 2019 Joyent, Inc.
27  */
28 
29 /*
30  * This module implements a STREAMS driver that provides layer-two (Ethernet)
31  * bridging functionality.  The STREAMS interface is used to provide
32  * observability (snoop/wireshark) and control, but not for interface plumbing.
33  */
34 
35 #include <sys/types.h>
36 #include <sys/bitmap.h>
37 #include <sys/cmn_err.h>
38 #include <sys/conf.h>
39 #include <sys/ddi.h>
40 #include <sys/errno.h>
41 #include <sys/kstat.h>
42 #include <sys/modctl.h>
43 #include <sys/note.h>
44 #include <sys/param.h>
45 #include <sys/pattr.h>
46 #include <sys/policy.h>
47 #include <sys/sdt.h>
48 #include <sys/stat.h>
49 #include <sys/stream.h>
50 #include <sys/stropts.h>
51 #include <sys/strsun.h>
52 #include <sys/sunddi.h>
53 #include <sys/sysmacros.h>
54 #include <sys/systm.h>
55 #include <sys/time.h>
56 #include <sys/dlpi.h>
57 #include <sys/dls.h>
58 #include <sys/mac_ether.h>
59 #include <sys/mac_provider.h>
60 #include <sys/mac_client_priv.h>
61 #include <sys/mac_impl.h>
62 #include <sys/vlan.h>
63 #include <net/bridge.h>
64 #include <net/bridge_impl.h>
65 #include <net/trill.h>
66 #include <sys/dld_ioc.h>
67 
68 /*
69  * Locks and reference counts: object lifetime and design.
70  *
71  * bridge_mac_t
72  *   Bridge mac (snoop) instances are in bmac_list, which is protected by
73  *   bmac_rwlock.  They're allocated by bmac_alloc and freed by bridge_timer().
74  *   Every bridge_inst_t has a single bridge_mac_t, but when bridge_inst_t goes
75  *   away, the bridge_mac_t remains until either all of the users go away
76  *   (detected by a timer) or until the instance is picked up again by the same
77  *   bridge starting back up.
78  *
79  * bridge_inst_t
80  *   Bridge instances are in inst_list, which is protected by inst_lock.
81  *   They're allocated by inst_alloc() and freed by inst_free().  After
82  *   allocation, an instance is placed in inst_list, and the reference count is
83  *   incremented to represent this.  That reference is decremented when the
84  *   BIF_SHUTDOWN flag is set, and no new increments may occur.  When the last
85  *   reference is freed, the instance is removed from the list.
86  *
87  *   Bridge instances have lists of links and an AVL tree of forwarding
88  *   entries.  Each of these structures holds one reference on the bridge
89  *   instance.  These lists and tree are protected by bi_rwlock.
90  *
91  * bridge_stream_t
92  *   Bridge streams are allocated by stream_alloc() and freed by stream_free().
93  *   These streams are created when "bridged" opens /dev/bridgectl, and are
94  *   used to create new bridge instances (via BRIOC_NEWBRIDGE) and control the
95  *   links on the bridge.  When a stream closes, the bridge instance created is
96  *   destroyed.  There's at most one bridge instance for a given control
97  *   stream.
98  *
99  * bridge_link_t
100  *   Links are allocated by bridge_add_link() and freed by link_free().  The
101  *   bi_links list holds a reference to the link.  When the BLF_DELETED flag is
102  *   set, that reference is dropped.  The link isn't removed from the list
103  *   until the last reference drops.  Each forwarding entry that uses a given
104  *   link holds a reference, as does each thread transmitting a packet via the
105  *   link.  The MAC layer calls in via bridge_ref_cb() to hold a reference on
106  *   a link when transmitting.
107  *
108  *   It's important that once BLF_DELETED is set, there's no way for the
109  *   reference count to increase again.  If it can, then the link may be
110  *   double-freed.  The BLF_FREED flag is intended for use with assertions to
111  *   guard against this in testing.
112  *
113  * bridge_fwd_t
114  *   Bridge forwarding entries are allocated by bridge_recv_cb() and freed by
115  *   fwd_free().  The bi_fwd AVL tree holds one reference to the entry.  Unlike
116  *   other data structures, the reference is dropped when the entry is removed
117  *   from the tree by fwd_delete(), and the BFF_INTREE flag is removed.  Each
118  *   thread that's forwarding a packet to a known destination holds a reference
119  *   to a forwarding entry.
120  *
121  * TRILL notes:
122  *
123  *   The TRILL module does all of its I/O through bridging.  It uses references
124  *   on the bridge_inst_t and bridge_link_t structures, and has seven entry
125  *   points and four callbacks.  One entry point is for setting the callbacks
126  *   (bridge_trill_register_cb).  There are four entry points for taking bridge
127  *   and link references (bridge_trill_{br,ln}{ref,unref}).  The final two
128  *   entry points are for decapsulated packets from TRILL (bridge_trill_decaps)
129  *   that need to be bridged locally, and for TRILL-encapsulated output packets
130  *   (bridge_trill_output).
131  *
132  *   The four callbacks comprise two notification functions for bridges and
133  *   links being deleted, one function for raw received TRILL packets, and one
134  *   for bridge output to non-local TRILL destinations (tunnel entry).
135  */
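/*
 * As an illustrative sketch of the reference scheme above (this is only the
 * usage pattern assumed throughout the file, and the bridge name shown is
 * hypothetical): code that needs to act on a bridge instance looks it up by
 * name, which returns the instance with a reference held, and drops that
 * reference when done.
 *
 *	bridge_inst_t *bip;
 *
 *	if ((bip = bridge_find_name("mybridge0")) != NULL) {
 *		... use bip; the held reference keeps it from being freed ...
 *		bridge_unref(bip);
 *	}
 */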
136 
137 /*
138  * Ethernet reserved multicast addresses for TRILL; used also in TRILL module.
139  */
140 const uint8_t all_isis_rbridges[] = ALL_ISIS_RBRIDGES;
141 static const uint8_t all_esadi_rbridges[] = ALL_ESADI_RBRIDGES;
142 const uint8_t bridge_group_address[] = BRIDGE_GROUP_ADDRESS;
143 
144 static const char *inst_kstats_list[] = { KSINST_NAMES };
145 static const char *link_kstats_list[] = { KSLINK_NAMES };
146 
147 #define	KREF(p, m, vn)	p->m.vn.value.ui64
148 #define	KINCR(p, m, vn)	++KREF(p, m, vn)
149 #define	KDECR(p, m, vn)	--KREF(p, m, vn)
150 
151 #define	KIPINCR(p, vn)	KINCR(p, bi_kstats, vn)
152 #define	KIPDECR(p, vn)	KDECR(p, bi_kstats, vn)
153 #define	KLPINCR(p, vn)	KINCR(p, bl_kstats, vn)
154 
155 #define	KIINCR(vn)	KIPINCR(bip, vn)
156 #define	KIDECR(vn)	KIPDECR(bip, vn)
157 #define	KLINCR(vn)	KLPINCR(blp, vn)
158 
159 #define	Dim(x)		(sizeof (x) / sizeof (*(x)))
160 
161 /* Amount of overhead added when encapsulating with VLAN headers */
162 #define	VLAN_INCR	(sizeof (struct ether_vlan_header) -	\
163 			sizeof (struct ether_header))
164 
165 static dev_info_t *bridge_dev_info;
166 static major_t bridge_major;
167 static ddi_taskq_t *bridge_taskq;
168 
169 /*
170  * These are the bridge instance management data structures.  The mutex lock
171  * protects the list of bridge instances.  A reference count is then used on
172  * each instance to determine when to free it.  We use mac_minor_hold() to
173  * allocate minor_t values, which are used both for self-cloning /dev/net/
174  * device nodes as well as client streams.  Minor node 0 is reserved for the
175  * allocation control node.
176  */
177 static list_t inst_list;
178 static kcondvar_t inst_cv;		/* Allows us to wait for shutdown */
179 static kmutex_t inst_lock;
180 
181 static krwlock_t bmac_rwlock;
182 static list_t bmac_list;
183 
184 /* Wait for taskq entries that use STREAMS */
185 static kcondvar_t stream_ref_cv;
186 static kmutex_t stream_ref_lock;
187 
188 static timeout_id_t bridge_timerid;
189 static clock_t bridge_scan_interval;
190 static clock_t bridge_fwd_age;
191 
192 static bridge_inst_t *bridge_find_name(const char *);
193 static void bridge_timer(void *);
194 static void bridge_unref(bridge_inst_t *);
195 
196 static const uint8_t zero_addr[ETHERADDRL] = { 0 };
197 
198 /* Global TRILL linkage */
199 static trill_recv_pkt_t trill_recv_fn;
200 static trill_encap_pkt_t trill_encap_fn;
201 static trill_br_dstr_t trill_brdstr_fn;
202 static trill_ln_dstr_t trill_lndstr_fn;
203 
204 /* special settings to accommodate DLD flow control; see dld_str.c */
205 static struct module_info bridge_dld_modinfo = {
206 	0,			/* mi_idnum */
207 	BRIDGE_DEV_NAME,	/* mi_idname */
208 	0,			/* mi_minpsz */
209 	INFPSZ,			/* mi_maxpsz */
210 	1,			/* mi_hiwat */
211 	0			/* mi_lowat */
212 };
213 
214 static struct qinit bridge_dld_rinit = {
215 	NULL,			/* qi_putp */
216 	NULL,			/* qi_srvp */
217 	dld_open,		/* qi_qopen */
218 	dld_close,		/* qi_qclose */
219 	NULL,			/* qi_qadmin */
220 	&bridge_dld_modinfo,	/* qi_minfo */
221 	NULL			/* qi_mstat */
222 };
223 
224 static struct qinit bridge_dld_winit = {
225 	dld_wput,		/* qi_putp */
226 	dld_wsrv,		/* qi_srvp */
227 	NULL,			/* qi_qopen */
228 	NULL,			/* qi_qclose */
229 	NULL,			/* qi_qadmin */
230 	&bridge_dld_modinfo,	/* qi_minfo */
231 	NULL			/* qi_mstat */
232 };
233 
234 static int bridge_ioc_listfwd(void *, intptr_t, int, cred_t *, int *);
235 
236 /* GLDv3 control ioctls used by Bridging */
237 static dld_ioc_info_t bridge_ioc_list[] = {
238 	{BRIDGE_IOC_LISTFWD, DLDCOPYINOUT, sizeof (bridge_listfwd_t),
239 	    bridge_ioc_listfwd, NULL},
240 };
241 
242 /*
243  * Given a bridge mac pointer, get a ref-held pointer to the corresponding
244  * bridge instance, if any.  We must hold the global bmac_rwlock so that
245  * bm_inst doesn't slide out from under us.
246  */
247 static bridge_inst_t *
248 mac_to_inst(const bridge_mac_t *bmp)
249 {
250 	bridge_inst_t *bip;
251 
252 	rw_enter(&bmac_rwlock, RW_READER);
253 	if ((bip = bmp->bm_inst) != NULL)
254 		atomic_inc_uint(&bip->bi_refs);
255 	rw_exit(&bmac_rwlock);
256 	return (bip);
257 }
258 
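/*
 * Mark a link as failed (or no longer failed) because its SDU (MTU) doesn't
 * match the bridge's.  This updates the bridge-wide link state when needed,
 * reflects the proper state up to the link's clients, and queues a control
 * message on 'mlist' for later delivery to the bridge daemon.
 */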
259 static void
260 link_sdu_fail(bridge_link_t *blp, boolean_t failed, mblk_t **mlist)
261 {
262 	mblk_t *mp;
263 	bridge_ctl_t *bcp;
264 	bridge_link_t *blcmp;
265 	bridge_inst_t *bip;
266 	bridge_mac_t *bmp;
267 
268 	if (failed) {
269 		if (blp->bl_flags & BLF_SDUFAIL)
270 			return;
271 		blp->bl_flags |= BLF_SDUFAIL;
272 	} else {
273 		if (!(blp->bl_flags & BLF_SDUFAIL))
274 			return;
275 		blp->bl_flags &= ~BLF_SDUFAIL;
276 	}
277 
278 	/*
279 	 * If this link is otherwise up, then check if there are any other
280 	 * non-failed non-down links.  If not, then we control the state of the
281 	 * whole bridge.
282 	 */
283 	bip = blp->bl_inst;
284 	bmp = bip->bi_mac;
285 	if (blp->bl_linkstate != LINK_STATE_DOWN) {
286 		for (blcmp = list_head(&bip->bi_links); blcmp != NULL;
287 		    blcmp = list_next(&bip->bi_links, blcmp)) {
288 			if (blp != blcmp &&
289 			    !(blcmp->bl_flags & (BLF_DELETED|BLF_SDUFAIL)) &&
290 			    blcmp->bl_linkstate != LINK_STATE_DOWN)
291 				break;
292 		}
293 		if (blcmp == NULL) {
294 			bmp->bm_linkstate = failed ? LINK_STATE_DOWN :
295 			    LINK_STATE_UP;
296 			mac_link_redo(bmp->bm_mh, bmp->bm_linkstate);
297 		}
298 	}
299 
300 	/*
301 	 * If we're becoming failed, then the link's current true state needs
302 	 * to be reflected upwards to this link's clients.  If we're becoming
303 	 * unfailed, then we get the state of the bridge instead on all
304 	 * unfailed, then this link's clients get the state of the bridge
305 	 * instead.
306 	if (failed) {
307 		if (bmp->bm_linkstate != blp->bl_linkstate)
308 			mac_link_redo(blp->bl_mh, blp->bl_linkstate);
309 	} else {
310 		mac_link_redo(blp->bl_mh, bmp->bm_linkstate);
311 	}
312 
313 	/* get the current mblk we're going to send up */
314 	if ((mp = blp->bl_lfailmp) == NULL &&
315 	    (mp = allocb(sizeof (bridge_ctl_t), BPRI_MED)) == NULL)
316 		return;
317 
318 	/* get a new one for next time */
319 	blp->bl_lfailmp = allocb(sizeof (bridge_ctl_t), BPRI_MED);
320 
321 	/* if none for next time, then report only failures */
322 	if (blp->bl_lfailmp == NULL && !failed) {
323 		blp->bl_lfailmp = mp;
324 		return;
325 	}
326 
327 	/* LINTED: alignment */
328 	bcp = (bridge_ctl_t *)mp->b_rptr;
329 	bcp->bc_linkid = blp->bl_linkid;
330 	bcp->bc_failed = failed;
331 	mp->b_wptr = (uchar_t *)(bcp + 1);
332 	mp->b_next = *mlist;
333 	*mlist = mp;
334 }
335 
336 /*
337  * Send control messages (link SDU changes) using the stream to the
338  * bridge instance daemon.
339  */
340 static void
341 send_up_messages(bridge_inst_t *bip, mblk_t *mp)
342 {
343 	mblk_t *mnext;
344 	queue_t *rq;
345 
346 	rq = bip->bi_control->bs_wq;
347 	rq = OTHERQ(rq);
348 	while (mp != NULL) {
349 		mnext = mp->b_next;
350 		mp->b_next = NULL;
351 		putnext(rq, mp);
352 		mp = mnext;
353 	}
354 }
355 
356 /* ARGSUSED */
357 static int
358 bridge_m_getstat(void *arg, uint_t stat, uint64_t *val)
359 {
360 	return (ENOTSUP);
361 }
362 
363 static int
364 bridge_m_start(void *arg)
365 {
366 	bridge_mac_t *bmp = arg;
367 
368 	bmp->bm_flags |= BMF_STARTED;
369 	return (0);
370 }
371 
372 static void
373 bridge_m_stop(void *arg)
374 {
375 	bridge_mac_t *bmp = arg;
376 
377 	bmp->bm_flags &= ~BMF_STARTED;
378 }
379 
380 /* ARGSUSED */
381 static int
382 bridge_m_setpromisc(void *arg, boolean_t on)
383 {
384 	return (0);
385 }
386 
387 /* ARGSUSED */
388 static int
389 bridge_m_multicst(void *arg, boolean_t add, const uint8_t *mca)
390 {
391 	return (0);
392 }
393 
394 /* ARGSUSED */
395 static int
396 bridge_m_unicst(void *arg, const uint8_t *macaddr)
397 {
398 	return (ENOTSUP);
399 }
400 
401 static mblk_t *
402 bridge_m_tx(void *arg, mblk_t *mp)
403 {
404 	_NOTE(ARGUNUSED(arg));
405 	freemsgchain(mp);
406 	return (NULL);
407 }
408 
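/*
 * Implements the BRIDGE_IOC_LISTFWD ioctl: look up the bridge named in
 * blf_name and return the forwarding entry that follows blf_dest, so that
 * user space can walk the forwarding table one entry at a time.  An all-zero
 * result marks the end of the table.
 */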
409 /* ARGSUSED */
410 static int
411 bridge_ioc_listfwd(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
412 {
413 	bridge_listfwd_t *blf = karg;
414 	bridge_inst_t *bip;
415 	bridge_fwd_t *bfp, match;
416 	avl_index_t where;
417 
418 	bip = bridge_find_name(blf->blf_name);
419 	if (bip == NULL)
420 		return (ENOENT);
421 
422 	bcopy(blf->blf_dest, match.bf_dest, ETHERADDRL);
423 	match.bf_flags |= BFF_VLANLOCAL;
424 	rw_enter(&bip->bi_rwlock, RW_READER);
425 	if ((bfp = avl_find(&bip->bi_fwd, &match, &where)) == NULL)
426 		bfp = avl_nearest(&bip->bi_fwd, where, AVL_AFTER);
427 	else
428 		bfp = AVL_NEXT(&bip->bi_fwd, bfp);
429 	if (bfp == NULL) {
430 		bzero(blf, sizeof (*blf));
431 	} else {
432 		bcopy(bfp->bf_dest, blf->blf_dest, ETHERADDRL);
433 		blf->blf_trill_nick = bfp->bf_trill_nick;
434 		blf->blf_ms_age =
435 		    drv_hztousec(ddi_get_lbolt() - bfp->bf_lastheard) / 1000;
436 		blf->blf_is_local =
437 		    (bfp->bf_flags & BFF_LOCALADDR) != 0;
438 		blf->blf_linkid = bfp->bf_links[0]->bl_linkid;
439 	}
440 	rw_exit(&bip->bi_rwlock);
441 	bridge_unref(bip);
442 	return (0);
443 }
444 
445 static int
446 bridge_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
447     uint_t pr_valsize, const void *pr_val)
448 {
449 	bridge_mac_t *bmp = arg;
450 	bridge_inst_t *bip;
451 	bridge_link_t *blp;
452 	int err;
453 	uint_t maxsdu;
454 	mblk_t *mlist;
455 
456 	_NOTE(ARGUNUSED(pr_name));
457 	switch (pr_num) {
458 	case MAC_PROP_MTU:
459 		if (pr_valsize < sizeof (bmp->bm_maxsdu)) {
460 			err = EINVAL;
461 			break;
462 		}
463 		(void) bcopy(pr_val, &maxsdu, sizeof (maxsdu));
464 		if (maxsdu == bmp->bm_maxsdu) {
465 			err = 0;
466 		} else if ((bip = mac_to_inst(bmp)) == NULL) {
467 			err = ENXIO;
468 		} else {
469 			rw_enter(&bip->bi_rwlock, RW_WRITER);
470 			mlist = NULL;
471 			for (blp = list_head(&bip->bi_links); blp != NULL;
472 			    blp = list_next(&bip->bi_links, blp)) {
473 				if (blp->bl_flags & BLF_DELETED)
474 					continue;
475 				if (blp->bl_maxsdu == maxsdu)
476 					link_sdu_fail(blp, B_FALSE, &mlist);
477 				else if (blp->bl_maxsdu == bmp->bm_maxsdu)
478 					link_sdu_fail(blp, B_TRUE, &mlist);
479 			}
480 			rw_exit(&bip->bi_rwlock);
481 			bmp->bm_maxsdu = maxsdu;
482 			(void) mac_maxsdu_update(bmp->bm_mh, maxsdu);
483 			send_up_messages(bip, mlist);
484 			bridge_unref(bip);
485 			err = 0;
486 		}
487 		break;
488 
489 	default:
490 		err = ENOTSUP;
491 		break;
492 	}
493 	return (err);
494 }
495 
496 static int
497 bridge_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
498     uint_t pr_valsize, void *pr_val)
499 {
500 	bridge_mac_t *bmp = arg;
501 	int err = 0;
502 
503 	_NOTE(ARGUNUSED(pr_name));
504 	switch (pr_num) {
505 	case MAC_PROP_STATUS:
506 		ASSERT(pr_valsize >= sizeof (bmp->bm_linkstate));
507 		bcopy(&bmp->bm_linkstate, pr_val, sizeof (bmp->bm_linkstate));
508 		break;
509 
510 	default:
511 		err = ENOTSUP;
512 		break;
513 	}
514 	return (err);
515 }
516 
517 static void
518 bridge_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num,
519     mac_prop_info_handle_t prh)
520 {
521 	bridge_mac_t *bmp = arg;
522 
523 	_NOTE(ARGUNUSED(pr_name));
524 
525 	switch (pr_num) {
526 	case MAC_PROP_MTU:
527 		mac_prop_info_set_range_uint32(prh, bmp->bm_maxsdu,
528 		    bmp->bm_maxsdu);
529 		break;
530 	case MAC_PROP_STATUS:
531 		mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
532 		break;
533 	}
534 }
535 
536 static mac_callbacks_t bridge_m_callbacks = {
537 	MC_SETPROP | MC_GETPROP | MC_PROPINFO,
538 	bridge_m_getstat,
539 	bridge_m_start,
540 	bridge_m_stop,
541 	bridge_m_setpromisc,
542 	bridge_m_multicst,
543 	bridge_m_unicst,
544 	bridge_m_tx,
545 	NULL,	/* reserved */
546 	NULL,	/* ioctl */
547 	NULL,	/* getcapab */
548 	NULL,	/* open */
549 	NULL,	/* close */
550 	bridge_m_setprop,
551 	bridge_m_getprop,
552 	bridge_m_propinfo
553 };
554 
555 /*
556  * Create kstats from a list.
557  */
558 static kstat_t *
559 kstat_setup(kstat_named_t *knt, const char **names, int nstat,
560     const char *unitname)
561 {
562 	kstat_t *ksp;
563 	int i;
564 
565 	for (i = 0; i < nstat; i++)
566 		kstat_named_init(&knt[i], names[i], KSTAT_DATA_UINT64);
567 
568 	ksp = kstat_create_zone(BRIDGE_DEV_NAME, 0, unitname, "net",
569 	    KSTAT_TYPE_NAMED, nstat, KSTAT_FLAG_VIRTUAL, GLOBAL_ZONEID);
570 	if (ksp != NULL) {
571 		ksp->ks_data = knt;
572 		kstat_install(ksp);
573 	}
574 	return (ksp);
575 }
576 
577 /*
578  * Find an existing bridge_mac_t structure or allocate a new one for the given
579  * bridge instance.  This creates the mac driver instance that snoop can use.
580  */
581 static int
582 bmac_alloc(bridge_inst_t *bip, bridge_mac_t **bmacp)
583 {
584 	bridge_mac_t *bmp, *bnew;
585 	mac_register_t *mac;
586 	int err;
587 
588 	*bmacp = NULL;
589 	if ((mac = mac_alloc(MAC_VERSION)) == NULL)
590 		return (EINVAL);
591 
592 	bnew = kmem_zalloc(sizeof (*bnew), KM_SLEEP);
593 
594 	rw_enter(&bmac_rwlock, RW_WRITER);
595 	for (bmp = list_head(&bmac_list); bmp != NULL;
596 	    bmp = list_next(&bmac_list, bmp)) {
597 		if (strcmp(bip->bi_name, bmp->bm_name) == 0) {
598 			ASSERT(bmp->bm_inst == NULL);
599 			bmp->bm_inst = bip;
600 			rw_exit(&bmac_rwlock);
601 			kmem_free(bnew, sizeof (*bnew));
602 			mac_free(mac);
603 			*bmacp = bmp;
604 			return (0);
605 		}
606 	}
607 
608 	mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
609 	mac->m_driver = bnew;
610 	mac->m_dip = bridge_dev_info;
611 	mac->m_instance = (uint_t)-1;
612 	mac->m_src_addr = (uint8_t *)zero_addr;
613 	mac->m_callbacks = &bridge_m_callbacks;
614 
615 	/*
616 	 * Note that the SDU limits are irrelevant, as nobody transmits on the
617 	 * bridge node itself.  The node exists mainly for monitoring, but we
618 	 * allow setting the bridge MTU so that all links that are part of the
619 	 * bridge can be moved to a new MTU quickly.
620 	 */
621 	mac->m_min_sdu = 1;
622 	mac->m_max_sdu = 1500;
623 	err = mac_register(mac, &bnew->bm_mh);
624 	mac_free(mac);
625 	if (err != 0) {
626 		rw_exit(&bmac_rwlock);
627 		kmem_free(bnew, sizeof (*bnew));
628 		return (err);
629 	}
630 
631 	bnew->bm_inst = bip;
632 	(void) strcpy(bnew->bm_name, bip->bi_name);
633 	if (list_is_empty(&bmac_list)) {
634 		bridge_timerid = timeout(bridge_timer, NULL,
635 		    bridge_scan_interval);
636 	}
637 	list_insert_tail(&bmac_list, bnew);
638 	rw_exit(&bmac_rwlock);
639 
640 	/*
641 	 * Mark the MAC as unable to go "active" so that only passive clients
642 	 * (such as snoop) can bind to it.
643 	 */
644 	mac_no_active(bnew->bm_mh);
645 	*bmacp = bnew;
646 	return (0);
647 }
648 
649 /*
650  * Disconnect the given bridge_mac_t from its bridge instance.  The bridge
651  * instance is going away.  The mac instance can't go away until the clients
652  * are gone (see bridge_timer).
653  */
654 static void
655 bmac_disconnect(bridge_mac_t *bmp)
656 {
657 	bridge_inst_t *bip;
658 
659 	bmp->bm_linkstate = LINK_STATE_DOWN;
660 	mac_link_redo(bmp->bm_mh, LINK_STATE_DOWN);
661 
662 	rw_enter(&bmac_rwlock, RW_READER);
663 	bip = bmp->bm_inst;
664 	bip->bi_mac = NULL;
665 	bmp->bm_inst = NULL;
666 	rw_exit(&bmac_rwlock);
667 }
668 
669 /* This is used by the avl trees to sort forwarding table entries */
670 static int
671 fwd_compare(const void *addr1, const void *addr2)
672 {
673 	const bridge_fwd_t *fwd1 = addr1;
674 	const bridge_fwd_t *fwd2 = addr2;
675 	int diff = memcmp(fwd1->bf_dest, fwd2->bf_dest, ETHERADDRL);
676 
677 	if (diff != 0)
678 		return (diff > 0 ? 1 : -1);
679 
680 	if ((fwd1->bf_flags ^ fwd2->bf_flags) & BFF_VLANLOCAL) {
681 		if (fwd1->bf_vlanid > fwd2->bf_vlanid)
682 			return (1);
683 		else if (fwd1->bf_vlanid < fwd2->bf_vlanid)
684 			return (-1);
685 	}
686 	return (0);
687 }
688 
689 static void
690 inst_free(bridge_inst_t *bip)
691 {
692 	ASSERT(bip->bi_mac == NULL);
693 	rw_destroy(&bip->bi_rwlock);
694 	list_destroy(&bip->bi_links);
695 	cv_destroy(&bip->bi_linkwait);
696 	avl_destroy(&bip->bi_fwd);
697 	if (bip->bi_ksp != NULL)
698 		kstat_delete(bip->bi_ksp);
699 	kmem_free(bip, sizeof (*bip));
700 }
701 
702 static bridge_inst_t *
703 inst_alloc(const char *bridge)
704 {
705 	bridge_inst_t *bip;
706 
707 	bip = kmem_zalloc(sizeof (*bip), KM_SLEEP);
708 	bip->bi_refs = 1;
709 	(void) strcpy(bip->bi_name, bridge);
710 	rw_init(&bip->bi_rwlock, NULL, RW_DRIVER, NULL);
711 	list_create(&bip->bi_links, sizeof (bridge_link_t),
712 	    offsetof(bridge_link_t, bl_node));
713 	cv_init(&bip->bi_linkwait, NULL, CV_DRIVER, NULL);
714 	avl_create(&bip->bi_fwd, fwd_compare, sizeof (bridge_fwd_t),
715 	    offsetof(bridge_fwd_t, bf_node));
716 	return (bip);
717 }
718 
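/*
 * Find a bridge instance by name, ignoring instances that are shutting down.
 * If found, the instance is returned with a reference held; otherwise NULL is
 * returned.
 */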
719 static bridge_inst_t *
720 bridge_find_name(const char *bridge)
721 {
722 	bridge_inst_t *bip;
723 
724 	mutex_enter(&inst_lock);
725 	for (bip = list_head(&inst_list); bip != NULL;
726 	    bip = list_next(&inst_list, bip)) {
727 		if (!(bip->bi_flags & BIF_SHUTDOWN) &&
728 		    strcmp(bridge, bip->bi_name) == 0) {
729 			atomic_inc_uint(&bip->bi_refs);
730 			break;
731 		}
732 	}
733 	mutex_exit(&inst_lock);
734 
735 	return (bip);
736 }
737 
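/*
 * Create a new bridge instance named 'bridge' together with its kstats and
 * observability MAC node, and expose the latter through DLS as datalink
 * 'linkid'.  The new instance is returned through 'bipc'; EEXIST is returned
 * if an instance with that name already exists.
 */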
738 static int
739 bridge_create(datalink_id_t linkid, const char *bridge, bridge_inst_t **bipc,
740     cred_t *cred)
741 {
742 	bridge_inst_t *bip, *bipnew;
743 	bridge_mac_t *bmp = NULL;
744 	int err;
745 
746 	*bipc = NULL;
747 	bipnew = inst_alloc(bridge);
748 
749 	mutex_enter(&inst_lock);
750 lookup_retry:
751 	for (bip = list_head(&inst_list); bip != NULL;
752 	    bip = list_next(&inst_list, bip)) {
753 		if (strcmp(bridge, bip->bi_name) == 0)
754 			break;
755 	}
756 
757 	/* This should not take long; if it does, we've got a design problem */
758 	if (bip != NULL && (bip->bi_flags & BIF_SHUTDOWN)) {
759 		cv_wait(&inst_cv, &inst_lock);
760 		goto lookup_retry;
761 	}
762 
763 	if (bip == NULL) {
764 		bip = bipnew;
765 		bipnew = NULL;
766 		list_insert_tail(&inst_list, bip);
767 	}
768 
769 	mutex_exit(&inst_lock);
770 	if (bipnew != NULL) {
771 		inst_free(bipnew);
772 		return (EEXIST);
773 	}
774 
775 	bip->bi_ksp = kstat_setup((kstat_named_t *)&bip->bi_kstats,
776 	    inst_kstats_list, Dim(inst_kstats_list), bip->bi_name);
777 
778 	err = bmac_alloc(bip, &bmp);
779 	if ((bip->bi_mac = bmp) == NULL)
780 		goto fail_create;
781 
782 	/*
783 	 * bm_inst is set, so the timer cannot yank the DLS rug from under us.
784 	 * No extra locking is needed here.
785 	 */
786 	if (!(bmp->bm_flags & BMF_DLS)) {
787 		err = dls_devnet_create(bmp->bm_mh, linkid, crgetzoneid(cred));
788 		if (err != 0)
789 			goto fail_create;
790 		bmp->bm_flags |= BMF_DLS;
791 	}
792 
793 	bip->bi_dev = makedevice(bridge_major, mac_minor(bmp->bm_mh));
794 	*bipc = bip;
795 	return (0);
796 
797 fail_create:
798 	ASSERT(bip->bi_trilldata == NULL);
799 	bip->bi_flags |= BIF_SHUTDOWN;
800 	bridge_unref(bip);
801 	return (err);
802 }
803 
804 static void
805 bridge_unref(bridge_inst_t *bip)
806 {
807 	if (atomic_dec_uint_nv(&bip->bi_refs) == 0) {
808 		ASSERT(bip->bi_flags & BIF_SHUTDOWN);
809 		/* free up mac for reuse before leaving global list */
810 		if (bip->bi_mac != NULL)
811 			bmac_disconnect(bip->bi_mac);
812 		mutex_enter(&inst_lock);
813 		list_remove(&inst_list, bip);
814 		cv_broadcast(&inst_cv);
815 		mutex_exit(&inst_lock);
816 		inst_free(bip);
817 	}
818 }
819 
820 /*
821  * Stream instances are used only for allocating bridges and serving as a
822  * control node.  They serve no data-handling function.
823  */
824 static bridge_stream_t *
825 stream_alloc(void)
826 {
827 	bridge_stream_t *bsp;
828 	minor_t mn;
829 
830 	if ((mn = mac_minor_hold(B_FALSE)) == 0)
831 		return (NULL);
832 	bsp = kmem_zalloc(sizeof (*bsp), KM_SLEEP);
833 	bsp->bs_minor = mn;
834 	return (bsp);
835 }
836 
837 static void
838 stream_free(bridge_stream_t *bsp)
839 {
840 	mac_minor_rele(bsp->bs_minor);
841 	kmem_free(bsp, sizeof (*bsp));
842 }
843 
844 /* Reference hold/release functions for STREAMS-related taskq */
845 static void
846 stream_ref(bridge_stream_t *bsp)
847 {
848 	mutex_enter(&stream_ref_lock);
849 	bsp->bs_taskq_cnt++;
850 	mutex_exit(&stream_ref_lock);
851 }
852 
853 static void
854 stream_unref(bridge_stream_t *bsp)
855 {
856 	mutex_enter(&stream_ref_lock);
857 	if (--bsp->bs_taskq_cnt == 0)
858 		cv_broadcast(&stream_ref_cv);
859 	mutex_exit(&stream_ref_lock);
860 }
861 
862 static void
863 link_free(bridge_link_t *blp)
864 {
865 	bridge_inst_t *bip = blp->bl_inst;
866 
867 	ASSERT(!(blp->bl_flags & BLF_FREED));
868 	blp->bl_flags |= BLF_FREED;
869 	if (blp->bl_ksp != NULL)
870 		kstat_delete(blp->bl_ksp);
871 	if (blp->bl_lfailmp != NULL)
872 		freeb(blp->bl_lfailmp);
873 	cv_destroy(&blp->bl_trillwait);
874 	mutex_destroy(&blp->bl_trilllock);
875 	kmem_free(blp, sizeof (*blp));
876 	/* Don't unreference the bridge until the MAC is closed */
877 	bridge_unref(bip);
878 }
879 
880 static void
881 link_unref(bridge_link_t *blp)
882 {
883 	if (atomic_dec_uint_nv(&blp->bl_refs) == 0) {
884 		bridge_inst_t *bip = blp->bl_inst;
885 
886 		ASSERT(blp->bl_flags & BLF_DELETED);
887 		rw_enter(&bip->bi_rwlock, RW_WRITER);
888 		if (blp->bl_flags & BLF_LINK_ADDED)
889 			list_remove(&bip->bi_links, blp);
890 		rw_exit(&bip->bi_rwlock);
891 		if (bip->bi_trilldata != NULL && list_is_empty(&bip->bi_links))
892 			cv_broadcast(&bip->bi_linkwait);
893 		link_free(blp);
894 	}
895 }
896 
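/*
 * Allocate a forwarding entry for 'addr' with room for 'nlinks' link pointers
 * and the given TRILL nickname.  Returns NULL if memory isn't available
 * (KM_NOSLEEP).
 */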
897 static bridge_fwd_t *
898 fwd_alloc(const uint8_t *addr, uint_t nlinks, uint16_t nick)
899 {
900 	bridge_fwd_t *bfp;
901 
902 	bfp = kmem_zalloc(sizeof (*bfp) + (nlinks * sizeof (bridge_link_t *)),
903 	    KM_NOSLEEP);
904 	if (bfp != NULL) {
905 		bcopy(addr, bfp->bf_dest, ETHERADDRL);
906 		bfp->bf_lastheard = ddi_get_lbolt();
907 		bfp->bf_maxlinks = nlinks;
908 		bfp->bf_links = (bridge_link_t **)(bfp + 1);
909 		bfp->bf_trill_nick = nick;
910 	}
911 	return (bfp);
912 }
913 
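/*
 * Find the forwarding entry for 'addr', preferring a VLAN-specific
 * (BFF_VLANLOCAL) entry for 'vlanid' when one exists.  The entry is returned
 * with a reference held, or NULL if there's no match.
 */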
914 static bridge_fwd_t *
915 fwd_find(bridge_inst_t *bip, const uint8_t *addr, uint16_t vlanid)
916 {
917 	bridge_fwd_t *bfp, *vbfp;
918 	bridge_fwd_t match;
919 
920 	bcopy(addr, match.bf_dest, ETHERADDRL);
921 	match.bf_flags = 0;
922 	rw_enter(&bip->bi_rwlock, RW_READER);
923 	if ((bfp = avl_find(&bip->bi_fwd, &match, NULL)) != NULL) {
924 		if (bfp->bf_vlanid != vlanid && bfp->bf_vcnt > 0) {
925 			match.bf_vlanid = vlanid;
926 			match.bf_flags = BFF_VLANLOCAL;
927 			vbfp = avl_find(&bip->bi_fwd, &match, NULL);
928 			if (vbfp != NULL)
929 				bfp = vbfp;
930 		}
931 		atomic_inc_uint(&bfp->bf_refs);
932 	}
933 	rw_exit(&bip->bi_rwlock);
934 	return (bfp);
935 }
936 
937 static void
938 fwd_free(bridge_fwd_t *bfp)
939 {
940 	uint_t i;
941 	bridge_inst_t *bip = bfp->bf_links[0]->bl_inst;
942 
943 	KIDECR(bki_count);
944 	for (i = 0; i < bfp->bf_nlinks; i++)
945 		link_unref(bfp->bf_links[i]);
946 	kmem_free(bfp,
947 	    sizeof (*bfp) + bfp->bf_maxlinks * sizeof (bridge_link_t *));
948 }
949 
950 static void
951 fwd_unref(bridge_fwd_t *bfp)
952 {
953 	if (atomic_dec_uint_nv(&bfp->bf_refs) == 0) {
954 		ASSERT(!(bfp->bf_flags & BFF_INTREE));
955 		fwd_free(bfp);
956 	}
957 }
958 
959 static void
960 fwd_delete(bridge_fwd_t *bfp)
961 {
962 	bridge_inst_t *bip;
963 	bridge_fwd_t *bfpzero;
964 
965 	if (bfp->bf_flags & BFF_INTREE) {
966 		ASSERT(bfp->bf_nlinks > 0);
967 		bip = bfp->bf_links[0]->bl_inst;
968 		rw_enter(&bip->bi_rwlock, RW_WRITER);
969 		/* Another thread could beat us to this */
970 		if (bfp->bf_flags & BFF_INTREE) {
971 			avl_remove(&bip->bi_fwd, bfp);
972 			bfp->bf_flags &= ~BFF_INTREE;
973 			if (bfp->bf_flags & BFF_VLANLOCAL) {
974 				bfp->bf_flags &= ~BFF_VLANLOCAL;
975 				bfpzero = avl_find(&bip->bi_fwd, bfp, NULL);
976 				if (bfpzero != NULL && bfpzero->bf_vcnt > 0)
977 					bfpzero->bf_vcnt--;
978 			}
979 			rw_exit(&bip->bi_rwlock);
980 			fwd_unref(bfp);		/* no longer in avl tree */
981 		} else {
982 			rw_exit(&bip->bi_rwlock);
983 		}
984 	}
985 }
986 
987 static boolean_t
988 fwd_insert(bridge_inst_t *bip, bridge_fwd_t *bfp)
989 {
990 	avl_index_t idx;
991 	boolean_t retv;
992 
993 	rw_enter(&bip->bi_rwlock, RW_WRITER);
994 	if (!(bip->bi_flags & BIF_SHUTDOWN) &&
995 	    avl_numnodes(&bip->bi_fwd) < bip->bi_tablemax &&
996 	    avl_find(&bip->bi_fwd, bfp, &idx) == NULL) {
997 		avl_insert(&bip->bi_fwd, bfp, idx);
998 		bfp->bf_flags |= BFF_INTREE;
999 		atomic_inc_uint(&bfp->bf_refs);	/* avl entry */
1000 		retv = B_TRUE;
1001 	} else {
1002 		retv = B_FALSE;
1003 	}
1004 	rw_exit(&bip->bi_rwlock);
1005 	return (retv);
1006 }
1007 
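/*
 * The primary unicast address of 'blp' has changed from 'oldaddr' to
 * 'newaddr'.  Remove the link from the forwarding entry for the old address
 * (discarding the entry if this was its last link) and add it to a
 * local-address entry for the new one.
 */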
1008 static void
1009 fwd_update_local(bridge_link_t *blp, const uint8_t *oldaddr,
1010     const uint8_t *newaddr)
1011 {
1012 	bridge_inst_t *bip = blp->bl_inst;
1013 	bridge_fwd_t *bfp, *bfnew;
1014 	bridge_fwd_t match;
1015 	avl_index_t idx;
1016 	boolean_t drop_ref = B_FALSE;
1017 
1018 	if (bcmp(oldaddr, newaddr, ETHERADDRL) == 0)
1019 		return;
1020 
1021 	if (bcmp(oldaddr, zero_addr, ETHERADDRL) == 0)
1022 		goto no_old_addr;
1023 
1024 	/*
1025 	 * Find the previous entry, and remove our link from it.
1026 	 */
1027 	bcopy(oldaddr, match.bf_dest, ETHERADDRL);
1028 	rw_enter(&bip->bi_rwlock, RW_WRITER);
1029 	if ((bfp = avl_find(&bip->bi_fwd, &match, NULL)) != NULL) {
1030 		int i;
1031 
1032 		/*
1033 		 * See if we're in the list, and remove if so.
1034 		 */
1035 		for (i = 0; i < bfp->bf_nlinks; i++) {
1036 			if (bfp->bf_links[i] == blp) {
1037 				/*
1038 				 * We assume writes are atomic, so no special
1039 				 * MT handling is needed.  The list length is
1040 				 * decremented first, and then we remove
1041 				 * entries.
1042 				 */
1043 				bfp->bf_nlinks--;
1044 				for (; i < bfp->bf_nlinks; i++)
1045 					bfp->bf_links[i] = bfp->bf_links[i + 1];
1046 				drop_ref = B_TRUE;
1047 				break;
1048 			}
1049 		}
1050 		/* If no more links, then remove and free up */
1051 		if (bfp->bf_nlinks == 0) {
1052 			avl_remove(&bip->bi_fwd, bfp);
1053 			bfp->bf_flags &= ~BFF_INTREE;
1054 		} else {
1055 			bfp = NULL;
1056 		}
1057 	}
1058 	rw_exit(&bip->bi_rwlock);
1059 	if (bfp != NULL)
1060 		fwd_unref(bfp);		/* no longer in avl tree */
1061 
1062 	/*
1063 	 * Now get the new link address and add this link to the list.  The
1064 	 * list should be of length 1 unless the user has configured multiple
1065 	 * NICs with the same address.  (That's an incorrect configuration, but
1066 	 * we support it anyway.)
1067 	 */
1068 no_old_addr:
1069 	bfp = NULL;
1070 	if ((bip->bi_flags & BIF_SHUTDOWN) ||
1071 	    bcmp(newaddr, zero_addr, ETHERADDRL) == 0)
1072 		goto no_new_addr;
1073 
1074 	bcopy(newaddr, match.bf_dest, ETHERADDRL);
1075 	rw_enter(&bip->bi_rwlock, RW_WRITER);
1076 	if ((bfp = avl_find(&bip->bi_fwd, &match, &idx)) == NULL) {
1077 		bfnew = fwd_alloc(newaddr, 1, RBRIDGE_NICKNAME_NONE);
1078 		if (bfnew != NULL)
1079 			KIINCR(bki_count);
1080 	} else if (bfp->bf_nlinks < bfp->bf_maxlinks) {
1081 		/* special case: link fits in existing entry */
1082 		bfnew = bfp;
1083 	} else {
1084 		bfnew = fwd_alloc(newaddr, bfp->bf_nlinks + 1,
1085 		    RBRIDGE_NICKNAME_NONE);
1086 		if (bfnew != NULL) {
1087 			KIINCR(bki_count);
1088 			avl_remove(&bip->bi_fwd, bfp);
1089 			bfp->bf_flags &= ~BFF_INTREE;
1090 			bfnew->bf_nlinks = bfp->bf_nlinks;
1091 			bcopy(bfp->bf_links, bfnew->bf_links,
1092 			    bfp->bf_nlinks * sizeof (bfp));
1093 			/* reset the idx value due to removal above */
1094 			(void) avl_find(&bip->bi_fwd, &match, &idx);
1095 		}
1096 	}
1097 
1098 	if (bfnew != NULL) {
1099 		bfnew->bf_links[bfnew->bf_nlinks++] = blp;
1100 		if (drop_ref)
1101 			drop_ref = B_FALSE;
1102 		else
1103 			atomic_inc_uint(&blp->bl_refs);	/* bf_links entry */
1104 
1105 		if (bfnew != bfp) {
1106 			/* local addresses are not subject to table limits */
1107 			avl_insert(&bip->bi_fwd, bfnew, idx);
1108 			bfnew->bf_flags |= (BFF_INTREE | BFF_LOCALADDR);
1109 			atomic_inc_uint(&bfnew->bf_refs);	/* avl entry */
1110 		}
1111 	}
1112 	rw_exit(&bip->bi_rwlock);
1113 
1114 no_new_addr:
1115 	/*
1116 	 * If we found an existing entry and we replaced it with a new one,
1117 	 * then drop the table reference from the old one.  We removed it from
1118 	 * the AVL tree above.
1119 	 */
1120 	if (bfnew != NULL && bfp != NULL && bfnew != bfp)
1121 		fwd_unref(bfp);
1122 
1123 	/* Account for removed entry. */
1124 	if (drop_ref)
1125 		link_unref(blp);
1126 }
1127 
1128 static void
1129 bridge_new_unicst(bridge_link_t *blp)
1130 {
1131 	uint8_t new_mac[ETHERADDRL];
1132 
1133 	mac_unicast_primary_get(blp->bl_mh, new_mac);
1134 	fwd_update_local(blp, blp->bl_local_mac, new_mac);
1135 	bcopy(new_mac, blp->bl_local_mac, ETHERADDRL);
1136 }
1137 
1138 /*
1139  * We must shut down a link prior to freeing it, and doing that requires
1140  * blocking to wait for running MAC threads while holding a reference.  This is
1141  * run from a taskq to accomplish proper link shutdown followed by reference
1142  * drop.
1143  */
1144 static void
1145 link_shutdown(void *arg)
1146 {
1147 	bridge_link_t *blp = arg;
1148 	mac_handle_t mh = blp->bl_mh;
1149 	bridge_inst_t *bip;
1150 	bridge_fwd_t *bfp, *bfnext;
1151 	avl_tree_t fwd_scavenge;
1152 	int i;
1153 
1154 	/*
1155 	 * This link is being destroyed.  Notify TRILL now that it's no longer
1156 	 * possible to send packets.  Data packets may still arrive until TRILL
1157 	 * calls bridge_trill_lnunref.
1158 	 */
1159 	if (blp->bl_trilldata != NULL)
1160 		trill_lndstr_fn(blp->bl_trilldata, blp);
1161 
1162 	if (blp->bl_flags & BLF_PROM_ADDED)
1163 		(void) mac_promisc_remove(blp->bl_mphp);
1164 
1165 	if (blp->bl_flags & BLF_SET_BRIDGE)
1166 		mac_bridge_clear(mh, (mac_handle_t)blp);
1167 
1168 	if (blp->bl_flags & BLF_MARGIN_ADDED) {
1169 		(void) mac_notify_remove(blp->bl_mnh, B_TRUE);
1170 		(void) mac_margin_remove(mh, blp->bl_margin);
1171 	}
1172 
1173 	/* Tell the clients the real link state when we leave */
1174 	mac_link_redo(blp->bl_mh,
1175 	    mac_stat_get(blp->bl_mh, MAC_STAT_LOWLINK_STATE));
1176 
1177 	/* Destroy all of the forwarding entries related to this link */
1178 	avl_create(&fwd_scavenge, fwd_compare, sizeof (bridge_fwd_t),
1179 	    offsetof(bridge_fwd_t, bf_node));
1180 	bip = blp->bl_inst;
1181 	rw_enter(&bip->bi_rwlock, RW_WRITER);
1182 	bfnext = avl_first(&bip->bi_fwd);
1183 	while ((bfp = bfnext) != NULL) {
1184 		bfnext = AVL_NEXT(&bip->bi_fwd, bfp);
1185 		for (i = 0; i < bfp->bf_nlinks; i++) {
1186 			if (bfp->bf_links[i] == blp)
1187 				break;
1188 		}
1189 		if (i >= bfp->bf_nlinks)
1190 			continue;
1191 		if (bfp->bf_nlinks > 1) {
1192 			/* note that this can't be the last reference */
1193 			link_unref(blp);
1194 			bfp->bf_nlinks--;
1195 			for (; i < bfp->bf_nlinks; i++)
1196 				bfp->bf_links[i] = bfp->bf_links[i + 1];
1197 		} else {
1198 			ASSERT(bfp->bf_flags & BFF_INTREE);
1199 			avl_remove(&bip->bi_fwd, bfp);
1200 			bfp->bf_flags &= ~BFF_INTREE;
1201 			avl_add(&fwd_scavenge, bfp);
1202 		}
1203 	}
1204 	rw_exit(&bip->bi_rwlock);
1205 	bfnext = avl_first(&fwd_scavenge);
1206 	while ((bfp = bfnext) != NULL) {
1207 		bfnext = AVL_NEXT(&fwd_scavenge, bfp);
1208 		avl_remove(&fwd_scavenge, bfp);
1209 		fwd_unref(bfp);
1210 	}
1211 	avl_destroy(&fwd_scavenge);
1212 
1213 	if (blp->bl_flags & BLF_CLIENT_OPEN)
1214 		mac_client_close(blp->bl_mch, 0);
1215 
1216 	mac_close(mh);
1217 
1218 	/*
1219 	 * We are now completely removed from the active list, so drop the
1220 	 * reference (see bridge_add_link).
1221 	 */
1222 	link_unref(blp);
1223 }
1224 
1225 static void
1226 shutdown_inst(bridge_inst_t *bip)
1227 {
1228 	bridge_link_t *blp, *blnext;
1229 	bridge_fwd_t *bfp;
1230 
1231 	mutex_enter(&inst_lock);
1232 	if (bip->bi_flags & BIF_SHUTDOWN) {
1233 		mutex_exit(&inst_lock);
1234 		return;
1235 	}
1236 
1237 	/*
1238 	 * Once on the inst_list, the bridge instance must not leave that list
1239 	 * without having the shutdown flag set first.  When the shutdown flag
1240 	 * is set, we own the list reference, so we must drop it before
1241 	 * returning.
1242 	 */
1243 	bip->bi_flags |= BIF_SHUTDOWN;
1244 	mutex_exit(&inst_lock);
1245 
1246 	bip->bi_control = NULL;
1247 
1248 	rw_enter(&bip->bi_rwlock, RW_READER);
1249 	blnext = list_head(&bip->bi_links);
1250 	while ((blp = blnext) != NULL) {
1251 		blnext = list_next(&bip->bi_links, blp);
1252 		if (!(blp->bl_flags & BLF_DELETED)) {
1253 			blp->bl_flags |= BLF_DELETED;
1254 			(void) ddi_taskq_dispatch(bridge_taskq, link_shutdown,
1255 			    blp, DDI_SLEEP);
1256 		}
1257 	}
1258 	while ((bfp = avl_first(&bip->bi_fwd)) != NULL) {
1259 		atomic_inc_uint(&bfp->bf_refs);
1260 		rw_exit(&bip->bi_rwlock);
1261 		fwd_delete(bfp);
1262 		fwd_unref(bfp);
1263 		rw_enter(&bip->bi_rwlock, RW_READER);
1264 	}
1265 	rw_exit(&bip->bi_rwlock);
1266 
1267 	/*
1268 	 * This bridge is being destroyed.  Notify TRILL once all of the
1269 	 * links are gone.
1270 	 */
1271 	mutex_enter(&inst_lock);
1272 	while (bip->bi_trilldata != NULL && !list_is_empty(&bip->bi_links))
1273 		cv_wait(&bip->bi_linkwait, &inst_lock);
1274 	mutex_exit(&inst_lock);
1275 	if (bip->bi_trilldata != NULL)
1276 		trill_brdstr_fn(bip->bi_trilldata, bip);
1277 
1278 	bridge_unref(bip);
1279 }
1280 
1281 /*
1282  * This is called once by the TRILL module when it starts up.  It just sets the
1283  * global TRILL callback function pointers -- data transmit/receive and bridge
1284  * and link destroy notification.  There's only one TRILL module, so only one
1285  * registration is needed.
1286  *
1287  * TRILL should call this function with NULL pointers before unloading.  It
1288  * must not do so before dropping all references to bridges and links.  We
1289  * assert that this is true on debug builds.
1290  */
1291 void
1292 bridge_trill_register_cb(trill_recv_pkt_t recv_fn, trill_encap_pkt_t encap_fn,
1293     trill_br_dstr_t brdstr_fn, trill_ln_dstr_t lndstr_fn)
1294 {
1295 #ifdef DEBUG
1296 	if (recv_fn == NULL && trill_recv_fn != NULL) {
1297 		bridge_inst_t *bip;
1298 		bridge_link_t *blp;
1299 
1300 		mutex_enter(&inst_lock);
1301 		for (bip = list_head(&inst_list); bip != NULL;
1302 		    bip = list_next(&inst_list, bip)) {
1303 			ASSERT(bip->bi_trilldata == NULL);
1304 			rw_enter(&bip->bi_rwlock, RW_READER);
1305 			for (blp = list_head(&bip->bi_links); blp != NULL;
1306 			    blp = list_next(&bip->bi_links, blp)) {
1307 				ASSERT(blp->bl_trilldata == NULL);
1308 			}
1309 			rw_exit(&bip->bi_rwlock);
1310 		}
1311 		mutex_exit(&inst_lock);
1312 	}
1313 #endif
1314 	trill_recv_fn = recv_fn;
1315 	trill_encap_fn = encap_fn;
1316 	trill_brdstr_fn = brdstr_fn;
1317 	trill_lndstr_fn = lndstr_fn;
1318 }
1319 
1320 /*
1321  * This registers the TRILL instance pointer with a bridge.  Before this
1322  * pointer is set, the forwarding, TRILL receive, and bridge destructor
1323  * functions won't be called.
1324  *
1325  * TRILL holds a reference on a bridge with this call.  It must free the
1326  * reference by calling the unregister function below.
1327  */
1328 bridge_inst_t *
1329 bridge_trill_brref(const char *bname, void *ptr)
1330 {
1331 	char bridge[MAXLINKNAMELEN];
1332 	bridge_inst_t *bip;
1333 
1334 	(void) snprintf(bridge, MAXLINKNAMELEN, "%s0", bname);
1335 	bip = bridge_find_name(bridge);
1336 	if (bip != NULL) {
1337 		ASSERT(bip->bi_trilldata == NULL && ptr != NULL);
1338 		bip->bi_trilldata = ptr;
1339 	}
1340 	return (bip);
1341 }
1342 
1343 void
1344 bridge_trill_brunref(bridge_inst_t *bip)
1345 {
1346 	ASSERT(bip->bi_trilldata != NULL);
1347 	bip->bi_trilldata = NULL;
1348 	bridge_unref(bip);
1349 }
1350 
1351 /*
1352  * TRILL calls this function when referencing a particular link on a bridge.
1353  *
1354  * It holds a reference on the link, so TRILL must clear out the reference when
1355  * it's done with the link (on unbinding).
1356  */
1357 bridge_link_t *
1358 bridge_trill_lnref(bridge_inst_t *bip, datalink_id_t linkid, void *ptr)
1359 {
1360 	bridge_link_t *blp;
1361 
1362 	ASSERT(ptr != NULL);
1363 	rw_enter(&bip->bi_rwlock, RW_READER);
1364 	for (blp = list_head(&bip->bi_links); blp != NULL;
1365 	    blp = list_next(&bip->bi_links, blp)) {
1366 		if (!(blp->bl_flags & BLF_DELETED) &&
1367 		    blp->bl_linkid == linkid && blp->bl_trilldata == NULL) {
1368 			blp->bl_trilldata = ptr;
1369 			blp->bl_flags &= ~BLF_TRILLACTIVE;
1370 			(void) memset(blp->bl_afs, 0, sizeof (blp->bl_afs));
1371 			atomic_inc_uint(&blp->bl_refs);
1372 			break;
1373 		}
1374 	}
1375 	rw_exit(&bip->bi_rwlock);
1376 	return (blp);
1377 }
1378 
1379 void
1380 bridge_trill_lnunref(bridge_link_t *blp)
1381 {
1382 	mutex_enter(&blp->bl_trilllock);
1383 	ASSERT(blp->bl_trilldata != NULL);
1384 	blp->bl_trilldata = NULL;
1385 	blp->bl_flags &= ~BLF_TRILLACTIVE;
1386 	while (blp->bl_trillthreads > 0)
1387 		cv_wait(&blp->bl_trillwait, &blp->bl_trilllock);
1388 	mutex_exit(&blp->bl_trilllock);
1389 	(void) memset(blp->bl_afs, 0xff, sizeof (blp->bl_afs));
1390 	link_unref(blp);
1391 }
1392 
1393 /*
1394  * This periodic timer performs three functions:
1395  *  1. It scans the list of learned forwarding entries, and removes ones that
1396  *     haven't been heard from in a while.  The time limit is backed down if
1397  *     we're above the configured table limit.
1398  *  2. It walks the links and decays away the bl_learns counter.
1399  *  3. It scans the observability node entries looking for ones that can be
1400  *     freed up.
1401  */
1402 /* ARGSUSED */
1403 static void
1404 bridge_timer(void *arg)
1405 {
1406 	bridge_inst_t *bip;
1407 	bridge_fwd_t *bfp, *bfnext;
1408 	bridge_mac_t *bmp, *bmnext;
1409 	bridge_link_t *blp;
1410 	int err;
1411 	datalink_id_t tmpid;
1412 	avl_tree_t fwd_scavenge;
1413 	clock_t age_limit;
1414 	uint32_t ldecay;
1415 
1416 	avl_create(&fwd_scavenge, fwd_compare, sizeof (bridge_fwd_t),
1417 	    offsetof(bridge_fwd_t, bf_node));
1418 	mutex_enter(&inst_lock);
1419 	for (bip = list_head(&inst_list); bip != NULL;
1420 	    bip = list_next(&inst_list, bip)) {
1421 		if (bip->bi_flags & BIF_SHUTDOWN)
1422 			continue;
1423 		rw_enter(&bip->bi_rwlock, RW_WRITER);
1424 		/* compute scaled maximum age based on table limit */
1425 		if (avl_numnodes(&bip->bi_fwd) > bip->bi_tablemax)
1426 			bip->bi_tshift++;
1427 		else
1428 			bip->bi_tshift = 0;
1429 		if ((age_limit = bridge_fwd_age >> bip->bi_tshift) == 0) {
1430 			if (bip->bi_tshift != 0)
1431 				bip->bi_tshift--;
1432 			age_limit = 1;
1433 		}
1434 		bfnext = avl_first(&bip->bi_fwd);
1435 		while ((bfp = bfnext) != NULL) {
1436 			bfnext = AVL_NEXT(&bip->bi_fwd, bfp);
1437 			if (!(bfp->bf_flags & BFF_LOCALADDR) &&
1438 			    (ddi_get_lbolt() - bfp->bf_lastheard) > age_limit) {
1439 				ASSERT(bfp->bf_flags & BFF_INTREE);
1440 				avl_remove(&bip->bi_fwd, bfp);
1441 				bfp->bf_flags &= ~BFF_INTREE;
1442 				avl_add(&fwd_scavenge, bfp);
1443 			}
1444 		}
1445 		for (blp = list_head(&bip->bi_links); blp != NULL;
1446 		    blp = list_next(&bip->bi_links, blp)) {
1447 			ldecay = mac_get_ldecay(blp->bl_mh);
1448 			if (ldecay >= blp->bl_learns)
1449 				blp->bl_learns = 0;
1450 			else
1451 				atomic_add_int(&blp->bl_learns, -(int)ldecay);
1452 		}
1453 		rw_exit(&bip->bi_rwlock);
1454 		bfnext = avl_first(&fwd_scavenge);
1455 		while ((bfp = bfnext) != NULL) {
1456 			bfnext = AVL_NEXT(&fwd_scavenge, bfp);
1457 			avl_remove(&fwd_scavenge, bfp);
1458 			KIINCR(bki_expire);
1459 			fwd_unref(bfp);	/* drop tree reference */
1460 		}
1461 	}
1462 	mutex_exit(&inst_lock);
1463 	avl_destroy(&fwd_scavenge);
1464 
1465 	/*
1466 	 * Scan the bridge_mac_t entries and try to free up the ones that are
1467 	 * no longer active.  This must be done by polling, as neither DLS nor
1468 	 * MAC provides a driver any sort of positive control over clients.
1469 	 */
1470 	rw_enter(&bmac_rwlock, RW_WRITER);
1471 	bmnext = list_head(&bmac_list);
1472 	while ((bmp = bmnext) != NULL) {
1473 		bmnext = list_next(&bmac_list, bmp);
1474 
1475 		/* ignore active bridges */
1476 		if (bmp->bm_inst != NULL)
1477 			continue;
1478 
1479 		if (bmp->bm_flags & BMF_DLS) {
1480 			err = dls_devnet_destroy(bmp->bm_mh, &tmpid, B_FALSE);
1481 			ASSERT(err == 0 || err == EBUSY);
1482 			if (err == 0)
1483 				bmp->bm_flags &= ~BMF_DLS;
1484 		}
1485 
1486 		if (!(bmp->bm_flags & BMF_DLS)) {
1487 			err = mac_unregister(bmp->bm_mh);
1488 			ASSERT(err == 0 || err == EBUSY);
1489 			if (err == 0) {
1490 				list_remove(&bmac_list, bmp);
1491 				kmem_free(bmp, sizeof (*bmp));
1492 			}
1493 		}
1494 	}
1495 	if (list_is_empty(&bmac_list)) {
1496 		bridge_timerid = 0;
1497 	} else {
1498 		bridge_timerid = timeout(bridge_timer, NULL,
1499 		    bridge_scan_interval);
1500 	}
1501 	rw_exit(&bmac_rwlock);
1502 }
1503 
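/*
 * STREAMS open entry point.  Minor node zero is the bridge control node used
 * by the bridge daemon; all other minors are ordinary DLPI streams (snoop and
 * the like) and are redirected through DLD.
 */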
1504 static int
1505 bridge_open(queue_t *rq, dev_t *devp, int oflag, int sflag, cred_t *credp)
1506 {
1507 	bridge_stream_t	*bsp;
1508 
1509 	if (rq->q_ptr != NULL)
1510 		return (0);
1511 
1512 	if (sflag & MODOPEN)
1513 		return (EINVAL);
1514 
1515 	/*
1516 	 * Check the minor node number being opened.  This tells us which
1517 	 * bridge instance the user wants.
1518 	 */
1519 	if (getminor(*devp) != 0) {
1520 		/*
1521 		 * This is a regular DLPI stream for snoop or the like.
1522 		 * Redirect it through DLD.
1523 		 */
1524 		rq->q_qinfo = &bridge_dld_rinit;
1525 		OTHERQ(rq)->q_qinfo = &bridge_dld_winit;
1526 		return (dld_open(rq, devp, oflag, sflag, credp));
1527 	} else {
1528 		/*
1529 		 * Allocate the bridge control stream structure.
1530 		 */
1531 		if ((bsp = stream_alloc()) == NULL)
1532 			return (ENOSR);
1533 		rq->q_ptr = WR(rq)->q_ptr = (caddr_t)bsp;
1534 		bsp->bs_wq = WR(rq);
1535 		*devp = makedevice(getmajor(*devp), bsp->bs_minor);
1536 		qprocson(rq);
1537 		return (0);
1538 	}
1539 }
1540 
1541 /*
1542  * This is used only for bridge control streams.  DLPI goes through dld
1543  * instead.
1544  */
1545 /* ARGSUSED */
1546 static int
1547 bridge_close(queue_t *rq, int flags __unused, cred_t *credp __unused)
1548 {
1549 	bridge_stream_t	*bsp = rq->q_ptr;
1550 	bridge_inst_t *bip;
1551 
1552 	/*
1553 	 * Wait for any stray taskq (add/delete link) entries related to this
1554 	 * stream to leave the system.
1555 	 */
1556 	mutex_enter(&stream_ref_lock);
1557 	while (bsp->bs_taskq_cnt != 0)
1558 		cv_wait(&stream_ref_cv, &stream_ref_lock);
1559 	mutex_exit(&stream_ref_lock);
1560 
1561 	qprocsoff(rq);
1562 	if ((bip = bsp->bs_inst) != NULL)
1563 		shutdown_inst(bip);
1564 	rq->q_ptr = WR(rq)->q_ptr = NULL;
1565 	stream_free(bsp);
1566 	if (bip != NULL)
1567 		bridge_unref(bip);
1568 
1569 	return (0);
1570 }
1571 
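/*
 * A packet with source address 'saddr' was seen on link 'blp' with the given
 * VLAN ID and TRILL ingress nickname.  Update the forwarding table: learn new
 * addresses, detect station moves, and split off IVL duplicates, all subject
 * to the per-link learning limit.
 */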
1572 static void
1573 bridge_learn(bridge_link_t *blp, const uint8_t *saddr, uint16_t ingress_nick,
1574     uint16_t vlanid)
1575 {
1576 	bridge_inst_t *bip = blp->bl_inst;
1577 	bridge_fwd_t *bfp, *bfpnew;
1578 	int i;
1579 	boolean_t replaced = B_FALSE;
1580 
1581 	/* Ignore multi-destination address used as source; it's nonsense. */
1582 	if (*saddr & 1)
1583 		return;
1584 
1585 	/*
1586 	 * If the source is known, then check whether it belongs on this link.
1587 	 * If not, and this isn't a fixed local address, then we've detected a
1588 	 * move.  If it's not known, learn it.
1589 	 */
1590 	if ((bfp = fwd_find(bip, saddr, vlanid)) != NULL) {
1591 		/*
1592 		 * If the packet has a fixed local source address, then there's
1593 		 * nothing we can learn.  We must quit.  If this was a received
1594 		 * packet, then the sender has stolen our address, but there's
1595 		 * nothing we can do.  If it's a transmitted packet, then
1596 		 * that's the normal case.
1597 		 */
1598 		if (bfp->bf_flags & BFF_LOCALADDR) {
1599 			fwd_unref(bfp);
1600 			return;
1601 		}
1602 
1603 		/*
1604 		 * Check if the link (and TRILL sender, if any) being used is
1605 		 * among the ones registered for this address.  If so, then
1606 		 * this is information that we already know.
1607 		 */
1608 		if (bfp->bf_trill_nick == ingress_nick) {
1609 			for (i = 0; i < bfp->bf_nlinks; i++) {
1610 				if (bfp->bf_links[i] == blp) {
1611 					bfp->bf_lastheard = ddi_get_lbolt();
1612 					fwd_unref(bfp);
1613 					return;
1614 				}
1615 			}
1616 		}
1617 	}
1618 
1619 	/*
1620 	 * Note that we intentionally "unlearn" things that appear to be under
1621 	 * attack on this link.  The forwarding cache is a negative thing for
1622 	 * security -- it disables reachability as a performance optimization
1623 	 * -- so leaving out entries optimizes for success and defends against
1624 	 * the attack.  Thus, the bare increment without a check in the delete
1625 	 * code above is right.  (And it's ok if we skid over the limit a
1626 	 * little, so there's no synchronization needed on the test.)
1627 	 */
1628 	if (blp->bl_learns >= mac_get_llimit(blp->bl_mh)) {
1629 		if (bfp != NULL) {
1630 			if (bfp->bf_vcnt == 0)
1631 				fwd_delete(bfp);
1632 			fwd_unref(bfp);
1633 		}
1634 		return;
1635 	}
1636 
1637 	atomic_inc_uint(&blp->bl_learns);
1638 
1639 	if ((bfpnew = fwd_alloc(saddr, 1, ingress_nick)) == NULL) {
1640 		if (bfp != NULL)
1641 			fwd_unref(bfp);
1642 		return;
1643 	}
1644 	KIINCR(bki_count);
1645 
1646 	if (bfp != NULL) {
1647 		/*
1648 		 * If this is a new destination for the same VLAN, then delete
1649 		 * so that we can update.  If it's a different VLAN, then we're
1650 		 * not going to delete the original.  Split off instead into an
1651 		 * IVL entry.
1652 		 */
1653 		if (bfp->bf_vlanid == vlanid) {
1654 			/* save the count of IVL duplicates */
1655 			bfpnew->bf_vcnt = bfp->bf_vcnt;
1656 
1657 			/* entry deletes count as learning events */
1658 			atomic_inc_uint(&blp->bl_learns);
1659 
1660 			/* destroy and create anew; node moved */
1661 			fwd_delete(bfp);
1662 			replaced = B_TRUE;
1663 			KIINCR(bki_moved);
1664 		} else {
1665 			bfp->bf_vcnt++;
1666 			bfpnew->bf_flags |= BFF_VLANLOCAL;
1667 		}
1668 		fwd_unref(bfp);
1669 	}
1670 	bfpnew->bf_links[0] = blp;
1671 	bfpnew->bf_nlinks = 1;
1672 	atomic_inc_uint(&blp->bl_refs);	/* bf_links entry */
1673 	if (!fwd_insert(bip, bfpnew))
1674 		fwd_free(bfpnew);
1675 	else if (!replaced)
1676 		KIINCR(bki_source);
1677 }
1678 
1679 /*
1680  * Process the VLAN headers for output on a given link.  There are several
1681  * cases (noting that we don't map VLANs):
1682  *   1. The input packet is good as it is; either
1683  *	a. It has no tag, and output has same PVID
1684  *	b. It has a non-zero priority-only tag for PVID, and b_band is same
1685  *	c. It has a tag with VLAN different from PVID, and b_band is same
1686  *   2. The tag must change: non-zero b_band is different from tag priority
1687  *   3. The packet has a tag and should not (VLAN same as PVID, b_band zero)
1688  *   4. The packet has no tag and needs one:
1689  *      a. VLAN ID same as PVID, but b_band is non-zero
1690  *      b. VLAN ID different from PVID
1691  * We exclude case 1 first, then modify the packet.  Note that output packets
1692  * get a priority set by the mblk, not by the header, because QoS in bridging
1693  * requires priority recalculation at each node.
1694  *
1695  * The passed-in tci is the "impossible" value 0xFFFF when no tag is present.
1696  */
1697 static mblk_t *
1698 reform_vlan_header(mblk_t *mp, uint16_t vlanid, uint16_t tci, uint16_t pvid)
1699 {
1700 	boolean_t source_has_tag = (tci != 0xFFFF);
1701 	mblk_t *mpcopy;
1702 	size_t mlen, minlen;
1703 	struct ether_vlan_header *evh;
1704 	int pri;
1705 
1706 	/* This helps centralize error handling in the caller. */
1707 	if (mp == NULL)
1708 		return (mp);
1709 
1710 	/*
1711 	 * A forwarded packet cannot have hardware offloads enabled
1712 	 * because we don't know if the destination can handle them.
1713 	 * By this point, any hardware offloads present should have
1714 	 * been emulated.
1715 	 */
1716 	DB_CKSUMFLAGS(mp) = 0;
1717 
1718 	/* Get the no-modification cases out of the way first */
1719 	if (!source_has_tag && vlanid == pvid)		/* 1a */
1720 		return (mp);
1721 
1722 	pri = VLAN_PRI(tci);
1723 	if (source_has_tag && mp->b_band == pri) {
1724 		if (vlanid != pvid)			/* 1c */
1725 			return (mp);
1726 		if (pri != 0 && VLAN_ID(tci) == 0)	/* 1b */
1727 			return (mp);
1728 	}
1729 
1730 	/*
1731 	 * We now know that we must modify the packet.  Prepare for that.  Note
1732 	 * that if a tag is present, the caller has already done a pullup for
1733 	 * the VLAN header, so we're good to go.
1734 	 */
1735 	if (MBLKL(mp) < sizeof (struct ether_header)) {
1736 		mpcopy = msgpullup(mp, sizeof (struct ether_header));
1737 		if (mpcopy == NULL) {
1738 			freemsg(mp);
1739 			return (NULL);
1740 		}
1741 		mp = mpcopy;
1742 	}
1743 	if (DB_REF(mp) > 1 || !IS_P2ALIGNED(mp->b_rptr, sizeof (uint16_t)) ||
1744 	    (!source_has_tag && MBLKTAIL(mp) < VLAN_INCR)) {
1745 		minlen = mlen = MBLKL(mp);
1746 		if (!source_has_tag)
1747 			minlen += VLAN_INCR;
1748 		ASSERT(minlen >= sizeof (struct ether_vlan_header));
1749 		/*
1750 		 * We're willing to copy some data to avoid fragmentation, but
1751 		 * not a lot.
1752 		 */
1753 		if (minlen > 256)
1754 			minlen = sizeof (struct ether_vlan_header);
1755 		mpcopy = allocb(minlen, BPRI_MED);
1756 		if (mpcopy == NULL) {
1757 			freemsg(mp);
1758 			return (NULL);
1759 		}
1760 		if (mlen <= minlen) {
1761 			/* We toss the first mblk when we can. */
1762 			bcopy(mp->b_rptr, mpcopy->b_rptr, mlen);
1763 			mpcopy->b_wptr += mlen;
1764 			mpcopy->b_cont = mp->b_cont;
1765 			freeb(mp);
1766 		} else {
1767 			/* If not, then just copy what we need */
1768 			if (!source_has_tag)
1769 				minlen = sizeof (struct ether_header);
1770 			bcopy(mp->b_rptr, mpcopy->b_rptr, minlen);
1771 			mpcopy->b_wptr += minlen;
1772 			mpcopy->b_cont = mp;
1773 			mp->b_rptr += minlen;
1774 		}
1775 		mp = mpcopy;
1776 	}
1777 
1778 	/* LINTED: pointer alignment */
1779 	evh = (struct ether_vlan_header *)mp->b_rptr;
1780 	if (source_has_tag) {
1781 		if (mp->b_band == 0 && vlanid == pvid) {	/* 3 */
1782 			evh->ether_tpid = evh->ether_type;
1783 			mlen = MBLKL(mp);
1784 			if (mlen > sizeof (struct ether_vlan_header))
1785 				ovbcopy(mp->b_rptr +
1786 				    sizeof (struct ether_vlan_header),
1787 				    mp->b_rptr + sizeof (struct ether_header),
1788 				    mlen - sizeof (struct ether_vlan_header));
1789 			mp->b_wptr -= VLAN_INCR;
1790 		} else {					/* 2 */
1791 			if (vlanid == pvid)
1792 				vlanid = VLAN_ID_NONE;
1793 			tci = VLAN_TCI(mp->b_band, ETHER_CFI, vlanid);
1794 			evh->ether_tci = htons(tci);
1795 		}
1796 	} else {
1797 		/* case 4: no header present, but one is needed */
1798 		mlen = MBLKL(mp);
1799 		if (mlen > sizeof (struct ether_header))
1800 			ovbcopy(mp->b_rptr + sizeof (struct ether_header),
1801 			    mp->b_rptr + sizeof (struct ether_vlan_header),
1802 			    mlen - sizeof (struct ether_header));
1803 		mp->b_wptr += VLAN_INCR;
1804 		ASSERT(mp->b_wptr <= DB_LIM(mp));
1805 		if (vlanid == pvid)
1806 			vlanid = VLAN_ID_NONE;
1807 		tci = VLAN_TCI(mp->b_band, ETHER_CFI, vlanid);
1808 		evh->ether_type = evh->ether_tpid;
1809 		evh->ether_tpid = htons(ETHERTYPE_VLAN);
1810 		evh->ether_tci = htons(tci);
1811 	}
1812 	return (mp);
1813 }
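
/*
 * Editorial note (illustrative, not part of the driver): a worked example of
 * the tag-insertion branch above ("case 4"), for a frame that arrives
 * untagged but must leave tagged:
 *
 *	before:	| DA(6) | SA(6) | type(2) | payload ...
 *	after:	| DA(6) | SA(6) | 0x8100(2) | TCI(2) | type(2) | payload ...
 *
 * The payload is shifted up by ovbcopy(), b_wptr grows by VLAN_INCR (the
 * 4-byte difference between ether_vlan_header and ether_header), and the
 * new TCI is built from the mblk's b_band priority via VLAN_TCI(b_band,
 * ETHER_CFI, vlanid), with vlanid replaced by VLAN_ID_NONE when it matches
 * the output link's PVID so that only the priority is carried.  The
 * tag-stripping branch ("case 3") is the mirror image: the payload moves
 * down and b_wptr shrinks by VLAN_INCR.
 */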
1814 
1815 /* Record VLAN information and strip the header if requested. */
1816 static void
1817 update_header(mblk_t *mp, mac_header_info_t *hdr_info, boolean_t striphdr)
1818 {
1819 	if (hdr_info->mhi_bindsap == ETHERTYPE_VLAN) {
1820 		struct ether_vlan_header *evhp;
1821 		uint16_t ether_type;
1822 
1823 		/* LINTED: alignment */
1824 		evhp = (struct ether_vlan_header *)mp->b_rptr;
1825 		hdr_info->mhi_istagged = B_TRUE;
1826 		hdr_info->mhi_tci = ntohs(evhp->ether_tci);
1827 		if (striphdr) {
1828 			/*
1829 			 * For VLAN tagged frames update the ether_type
1830 			 * in hdr_info before stripping the header.
1831 			 */
1832 			ether_type = ntohs(evhp->ether_type);
1833 			hdr_info->mhi_origsap = ether_type;
1834 			hdr_info->mhi_bindsap = (ether_type > ETHERMTU) ?
1835 			    ether_type : DLS_SAP_LLC;
1836 			mp->b_rptr = (uchar_t *)(evhp + 1);
1837 		}
1838 	} else {
1839 		hdr_info->mhi_istagged = B_FALSE;
1840 		hdr_info->mhi_tci = VLAN_ID_NONE;
1841 		if (striphdr)
1842 			mp->b_rptr += sizeof (struct ether_header);
1843 	}
1844 }
1845 
1846 /*
1847  * Return B_TRUE if we're allowed to send on this link with the given VLAN ID.
1848  */
1849 static boolean_t
1850 bridge_can_send(bridge_link_t *blp, uint16_t vlanid)
1851 {
1852 	ASSERT(vlanid != VLAN_ID_NONE);
1853 	if (blp->bl_flags & BLF_DELETED)
1854 		return (B_FALSE);
1855 	if (blp->bl_trilldata == NULL && blp->bl_state != BLS_FORWARDING)
1856 		return (B_FALSE);
1857 	return (BRIDGE_VLAN_ISSET(blp, vlanid) && BRIDGE_AF_ISSET(blp, vlanid));
1858 }
1859 
1860 /*
1861  * This function scans the bridge forwarding tables in order to forward a given
1862  * packet.  If the packet either doesn't need forwarding (the current link is
1863  * correct) or the current link needs a copy as well, then the packet is
1864  * returned to the caller.
1865  *
1866  * If a packet has been decapsulated from TRILL, then it must *NOT* reenter a
1867  * TRILL tunnel.  If the destination points there, then drop instead.
1868  */
1869 static mblk_t *
1870 bridge_forward(bridge_link_t *blp, mac_header_info_t *hdr_info, mblk_t *mp,
1871     uint16_t vlanid, uint16_t tci, boolean_t from_trill, boolean_t is_xmit)
1872 {
1873 	mblk_t *mpsend, *mpcopy;
1874 	bridge_inst_t *bip = blp->bl_inst;
1875 	bridge_link_t *blpsend, *blpnext;
1876 	bridge_fwd_t *bfp;
1877 	uint_t i;
1878 	boolean_t selfseen = B_FALSE;
1879 	void *tdp;
1880 	const uint8_t *daddr = hdr_info->mhi_daddr;
1881 
1882 	/*
1883 	 * Check for the IEEE "reserved" multicast addresses.  Messages sent to
1884 	 * these addresses are used for link-local control (STP and pause), and
1885 	 * are never forwarded or redirected.
1886 	 */
1887 	if (daddr[0] == 1 && daddr[1] == 0x80 && daddr[2] == 0xc2 &&
1888 	    daddr[3] == 0 && daddr[4] == 0 && (daddr[5] & 0xf0) == 0) {
1889 		if (from_trill) {
1890 			freemsg(mp);
1891 			mp = NULL;
1892 		}
1893 		return (mp);
1894 	}
1895 
1896 	if ((bfp = fwd_find(bip, daddr, vlanid)) != NULL) {
1897 
1898 		/*
1899 		 * If trill indicates a destination for this node, then it's
1900 		 * clearly not intended for local delivery.  We must tell TRILL
1901 		 * to encapsulate, as long as we didn't just decapsulate it.
1902 		 */
1903 		if (bfp->bf_trill_nick != RBRIDGE_NICKNAME_NONE) {
1904 			/*
1905 			 * Error case: can't reencapsulate if the protocols are
1906 			 * working correctly.
1907 			 */
1908 			if (from_trill) {
1909 				freemsg(mp);
1910 				return (NULL);
1911 			}
1912 			mutex_enter(&blp->bl_trilllock);
1913 			if ((tdp = blp->bl_trilldata) != NULL) {
1914 				blp->bl_trillthreads++;
1915 				mutex_exit(&blp->bl_trilllock);
1916 				update_header(mp, hdr_info, B_FALSE);
1917 
1918 				/*
1919 				 * All trill data frames have
1920 				 * Inner.VLAN.
1921 				 */
1922 				mp = reform_vlan_header(mp, vlanid, tci, 0);
1923 
1924 				if (mp == NULL) {
1925 					KIINCR(bki_drops);
1926 					goto done;
1927 				}
1928 
1929 				trill_encap_fn(tdp, blp, hdr_info, mp,
1930 				    bfp->bf_trill_nick);
1931 
1932 done:
1933 				mutex_enter(&blp->bl_trilllock);
1934 				if (--blp->bl_trillthreads == 0 &&
1935 				    blp->bl_trilldata == NULL)
1936 					cv_broadcast(&blp->bl_trillwait);
1937 			}
1938 			mutex_exit(&blp->bl_trilllock);
1939 
1940 			/* if TRILL has been disabled, then kill this stray */
1941 			if (tdp == NULL) {
1942 				freemsg(mp);
1943 				fwd_delete(bfp);
1944 			}
1945 			fwd_unref(bfp);
1946 			return (NULL);
1947 		}
1948 
1949 		/* find first link we can send on */
1950 		for (i = 0; i < bfp->bf_nlinks; i++) {
1951 			blpsend = bfp->bf_links[i];
1952 			if (blpsend == blp)
1953 				selfseen = B_TRUE;
1954 			else if (bridge_can_send(blpsend, vlanid))
1955 				break;
1956 		}
1957 
1958 		while (i < bfp->bf_nlinks) {
1959 			blpsend = bfp->bf_links[i];
1960 			for (i++; i < bfp->bf_nlinks; i++) {
1961 				blpnext = bfp->bf_links[i];
1962 				if (blpnext == blp)
1963 					selfseen = B_TRUE;
1964 				else if (bridge_can_send(blpnext, vlanid))
1965 					break;
1966 			}
1967 			if (i == bfp->bf_nlinks && !selfseen) {
1968 				mpsend = mp;
1969 				mp = NULL;
1970 			} else {
1971 				mpsend = copymsg(mp);
1972 			}
1973 
1974 			mpsend = reform_vlan_header(mpsend, vlanid, tci,
1975 			    blpsend->bl_pvid);
1976 
1977 			if (mpsend == NULL) {
1978 				KIINCR(bki_drops);
1979 				continue;
1980 			}
1981 
1982 			KIINCR(bki_forwards);
1983 
1984 			/*
1985 			 * No need to bump up the link reference count, as
1986 			 * the forwarding entry itself holds a reference to
1987 			 * the link.
1988 			 */
1989 			if (bfp->bf_flags & BFF_LOCALADDR) {
1990 				mac_rx_common(blpsend->bl_mh, NULL, mpsend);
1991 			} else {
1992 				KLPINCR(blpsend, bkl_xmit);
1993 				mpsend = mac_ring_tx(blpsend->bl_mh, NULL,
1994 				    mpsend);
1995 				freemsg(mpsend);
1996 			}
1997 		}
1998 
1999 		/*
2000 		 * Handle a special case: if we're transmitting to the original
2001 		 * link, then check whether the localaddr flag is set.  If it
2002 		 * is, then receive instead.  This doesn't happen with ordinary
2003 		 * bridging, but does happen often with TRILL decapsulation.
2004 		 */
2005 		if (mp != NULL && is_xmit && (bfp->bf_flags & BFF_LOCALADDR)) {
2006 			mac_rx_common(blp->bl_mh, NULL, mp);
2007 			mp = NULL;
2008 		}
2009 		fwd_unref(bfp);
2010 	} else {
2011 		/*
2012 		 * TRILL has two cases to handle.  If the packet is off the
2013 		 * wire (not from TRILL), then we need to send up into the
2014 		 * TRILL module to have the distribution tree computed.  If the
2015 		 * packet is from TRILL (decapsulated), then we're part of the
2016 		 * distribution tree, and we need to copy the packet on member
2017 		 * interfaces.
2018 		 *
2019 		 * Thus, the from-TRILL case is identical to the STP case.
2020 		 */
2021 		if (!from_trill && blp->bl_trilldata != NULL) {
2022 			mutex_enter(&blp->bl_trilllock);
2023 			if ((tdp = blp->bl_trilldata) != NULL) {
2024 				blp->bl_trillthreads++;
2025 				mutex_exit(&blp->bl_trilllock);
2026 				if ((mpsend = copymsg(mp)) != NULL) {
2027 					update_header(mpsend,
2028 					    hdr_info, B_FALSE);
2029 					/*
2030 					 * all trill data frames have
2031 					 * Inner.VLAN
2032 					 */
2033 					mpsend = reform_vlan_header(mpsend,
2034 					    vlanid, tci, 0);
2035 					if (mpsend == NULL) {
2036 						KIINCR(bki_drops);
2037 					} else {
2038 						trill_encap_fn(tdp, blp,
2039 						    hdr_info, mpsend,
2040 						    RBRIDGE_NICKNAME_NONE);
2041 					}
2042 				}
2043 				mutex_enter(&blp->bl_trilllock);
2044 				if (--blp->bl_trillthreads == 0 &&
2045 				    blp->bl_trilldata == NULL)
2046 					cv_broadcast(&blp->bl_trillwait);
2047 			}
2048 			mutex_exit(&blp->bl_trilllock);
2049 		}
2050 
2051 		/*
2052 		 * This is an unknown destination, so flood.
2053 		 */
2054 		rw_enter(&bip->bi_rwlock, RW_READER);
2055 		for (blpnext = list_head(&bip->bi_links); blpnext != NULL;
2056 		    blpnext = list_next(&bip->bi_links, blpnext)) {
2057 			if (blpnext == blp)
2058 				selfseen = B_TRUE;
2059 			else if (bridge_can_send(blpnext, vlanid))
2060 				break;
2061 		}
2062 		if (blpnext != NULL)
2063 			atomic_inc_uint(&blpnext->bl_refs);
2064 		rw_exit(&bip->bi_rwlock);
2065 		while ((blpsend = blpnext) != NULL) {
2066 			rw_enter(&bip->bi_rwlock, RW_READER);
2067 			for (blpnext = list_next(&bip->bi_links, blpsend);
2068 			    blpnext != NULL;
2069 			    blpnext = list_next(&bip->bi_links, blpnext)) {
2070 				if (blpnext == blp)
2071 					selfseen = B_TRUE;
2072 				else if (bridge_can_send(blpnext, vlanid))
2073 					break;
2074 			}
2075 			if (blpnext != NULL)
2076 				atomic_inc_uint(&blpnext->bl_refs);
2077 			rw_exit(&bip->bi_rwlock);
2078 			if (blpnext == NULL && !selfseen) {
2079 				mpsend = mp;
2080 				mp = NULL;
2081 			} else {
2082 				mpsend = copymsg(mp);
2083 			}
2084 
2085 			mpsend = reform_vlan_header(mpsend, vlanid, tci,
2086 			    blpsend->bl_pvid);
2087 
2088 			if (mpsend == NULL) {
2089 				KIINCR(bki_drops);
2090 				continue;
2091 			}
2092 
2093 			if (hdr_info->mhi_dsttype == MAC_ADDRTYPE_UNICAST)
2094 				KIINCR(bki_unknown);
2095 			else
2096 				KIINCR(bki_mbcast);
2097 
2098 			KLPINCR(blpsend, bkl_xmit);
2099 			if ((mpcopy = copymsg(mpsend)) != NULL) {
2100 				mac_rx_common(blpsend->bl_mh, NULL, mpcopy);
2101 			}
2102 
2103 			mpsend = mac_ring_tx(blpsend->bl_mh, NULL, mpsend);
2104 			freemsg(mpsend);
2105 			link_unref(blpsend);
2106 		}
2107 	}
2108 
2109 	/*
2110 	 * At this point, if mp is non-NULL, it means that the caller needs to
2111 	 * continue on the selected link.
2112 	 */
2113 	return (mp);
2114 }
2115 
2116 /*
2117  * Extract and validate the VLAN information for a given packet.  This checks
2118  * conformance with the rules for use of the PVID on the link, and for the
2119  * allowed (configured) VLAN set.
2120  *
2121  * Returns B_TRUE if the packet passes, B_FALSE if it fails.
2122  */
2123 static boolean_t
2124 bridge_get_vlan(bridge_link_t *blp, mac_header_info_t *hdr_info, mblk_t *mp,
2125     uint16_t *vlanidp, uint16_t *tcip)
2126 {
2127 	uint16_t tci, vlanid;
2128 
2129 	if (hdr_info->mhi_bindsap == ETHERTYPE_VLAN) {
2130 		ptrdiff_t tpos = offsetof(struct ether_vlan_header, ether_tci);
2131 		ptrdiff_t mlen;
2132 
2133 		/*
2134 		 * Extract the VLAN ID information, regardless of alignment,
2135 		 * and without a pullup.  This isn't attractive, but we do this
2136 		 * to avoid having to deal with the pointers stashed in
2137 		 * hdr_info moving around or having the caller deal with a new
2138 		 * mblk_t pointer.
2139 		 */
2140 		while (mp != NULL) {
2141 			mlen = MBLKL(mp);
2142 			if (mlen > tpos && mlen > 0)
2143 				break;
2144 			tpos -= mlen;
2145 			mp = mp->b_cont;
2146 		}
2147 		if (mp == NULL)
2148 			return (B_FALSE);
2149 		tci = mp->b_rptr[tpos] << 8;
2150 		if (++tpos >= mlen) {
2151 			do {
2152 				mp = mp->b_cont;
2153 			} while (mp != NULL && MBLKL(mp) == 0);
2154 			if (mp == NULL)
2155 				return (B_FALSE);
2156 			tpos = 0;
2157 		}
2158 		tci |= mp->b_rptr[tpos];
2159 
2160 		vlanid = VLAN_ID(tci);
2161 		if (VLAN_CFI(tci) != ETHER_CFI || vlanid > VLAN_ID_MAX)
2162 			return (B_FALSE);
2163 		if (vlanid == VLAN_ID_NONE || vlanid == blp->bl_pvid)
2164 			goto input_no_vlan;
2165 		if (!BRIDGE_VLAN_ISSET(blp, vlanid))
2166 			return (B_FALSE);
2167 	} else {
2168 		tci = 0xFFFF;
2169 input_no_vlan:
2170 		/*
2171 		 * If PVID is set to zero, then untagged traffic is not
2172 		 * supported here.  Do not learn or forward.
2173 		 */
2174 		if ((vlanid = blp->bl_pvid) == VLAN_ID_NONE)
2175 			return (B_FALSE);
2176 	}
2177 
2178 	*tcip = tci;
2179 	*vlanidp = vlanid;
2180 	return (B_TRUE);
2181 }
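
/*
 * Editorial note (illustrative, not part of the driver): the TCI assembled
 * above is the standard 802.1Q tag control field, which the VLAN_PRI,
 * VLAN_CFI, and VLAN_ID macros are assumed to decode as follows:
 *
 *	 15      13  12  11                    0
 *	+-----------+---+-----------------------+
 *	|  priority | C |        VLAN ID        |
 *	+-----------+---+-----------------------+
 *
 * For example, tci == 0x6123 gives VLAN_PRI(tci) == 3, VLAN_CFI(tci) == 0,
 * and VLAN_ID(tci) == 0x123.  The byte-at-a-time assembly above is just an
 * alignment- and mblk-boundary-safe equivalent of ntohs() on the two TCI
 * octets.
 */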
2182 
2183 /*
2184  * Handle MAC notifications.
2185  */
2186 static void
2187 bridge_notify_cb(void *arg, mac_notify_type_t note_type)
2188 {
2189 	bridge_link_t *blp = arg;
2190 
2191 	switch (note_type) {
2192 	case MAC_NOTE_UNICST:
2193 		bridge_new_unicst(blp);
2194 		break;
2195 
2196 	case MAC_NOTE_SDU_SIZE: {
2197 		uint_t maxsdu;
2198 		bridge_inst_t *bip = blp->bl_inst;
2199 		bridge_mac_t *bmp = bip->bi_mac;
2200 		boolean_t notify = B_FALSE;
2201 		mblk_t *mlist = NULL;
2202 
2203 		mac_sdu_get(blp->bl_mh, NULL, &maxsdu);
2204 		rw_enter(&bip->bi_rwlock, RW_READER);
2205 		if (list_prev(&bip->bi_links, blp) == NULL &&
2206 		    list_next(&bip->bi_links, blp) == NULL) {
2207 			notify = (maxsdu != bmp->bm_maxsdu);
2208 			bmp->bm_maxsdu = maxsdu;
2209 		}
2210 		blp->bl_maxsdu = maxsdu;
2211 		if (maxsdu != bmp->bm_maxsdu)
2212 			link_sdu_fail(blp, B_TRUE, &mlist);
2213 		else if (notify)
2214 			(void) mac_maxsdu_update(bmp->bm_mh, maxsdu);
2215 		rw_exit(&bip->bi_rwlock);
2216 		send_up_messages(bip, mlist);
2217 		break;
2218 	}
2219 	}
2220 }
2221 
2222 /*
2223  * This is called by the MAC layer.  As with the transmit side, we're right in
2224  * the data path for all I/O on this port, so if we don't need to forward this
2225  * packet anywhere, we have to send it upwards via mac_rx_common.
2226  */
2227 static void
2228 bridge_recv_cb(mac_handle_t mh, mac_resource_handle_t rsrc, mblk_t *mpnext)
2229 {
2230 	mblk_t *mp, *mpcopy;
2231 	bridge_link_t *blp = (bridge_link_t *)mh;
2232 	bridge_inst_t *bip = blp->bl_inst;
2233 	bridge_mac_t *bmp = bip->bi_mac;
2234 	mac_header_info_t hdr_info;
2235 	uint16_t vlanid, tci;
2236 	boolean_t trillmode = B_FALSE;
2237 
2238 	KIINCR(bki_recv);
2239 	KLINCR(bkl_recv);
2240 
2241 	/*
2242 	 * Regardless of state, check for inbound TRILL packets when TRILL is
2243 	 * active.  These are pulled out of band and sent for TRILL handling.
2244 	 */
2245 	if (blp->bl_trilldata != NULL) {
2246 		void *tdp;
2247 		mblk_t *newhead;
2248 		mblk_t *tail = NULL;
2249 
2250 		mutex_enter(&blp->bl_trilllock);
2251 		if ((tdp = blp->bl_trilldata) != NULL) {
2252 			blp->bl_trillthreads++;
2253 			mutex_exit(&blp->bl_trilllock);
2254 			trillmode = B_TRUE;
2255 			newhead = mpnext;
2256 			while ((mp = mpnext) != NULL) {
2257 				boolean_t raw_isis, bridge_group;
2258 
2259 				mpnext = mp->b_next;
2260 
2261 				/*
2262 				 * If the header isn't readable, then leave on
2263 				 * the list and continue.
2264 				 */
2265 				if (mac_header_info(blp->bl_mh, mp,
2266 				    &hdr_info) != 0) {
2267 					tail = mp;
2268 					continue;
2269 				}
2270 
2271 				/*
2272 				 * The TRILL document specifies that, on
2273 				 * Ethernet alone, IS-IS packets arrive with
2274 				 * LLC rather than Ethertype, and using a
2275 				 * specific destination address.  We must check
2276 				 * for that here.  Also, we need to give BPDUs
2277 				 * to TRILL for processing.
2278 				 */
2279 				raw_isis = bridge_group = B_FALSE;
2280 				if (hdr_info.mhi_dsttype ==
2281 				    MAC_ADDRTYPE_MULTICAST) {
2282 					if (memcmp(hdr_info.mhi_daddr,
2283 					    all_isis_rbridges, ETHERADDRL) == 0)
2284 						raw_isis = B_TRUE;
2285 					else if (memcmp(hdr_info.mhi_daddr,
2286 					    bridge_group_address, ETHERADDRL) ==
2287 					    0)
2288 						bridge_group = B_TRUE;
2289 				}
2290 				if (!raw_isis && !bridge_group &&
2291 				    hdr_info.mhi_bindsap != ETHERTYPE_TRILL &&
2292 				    (hdr_info.mhi_bindsap != ETHERTYPE_VLAN ||
2293 				    /* LINTED: alignment */
2294 				    ((struct ether_vlan_header *)mp->b_rptr)->
2295 				    ether_type != htons(ETHERTYPE_TRILL))) {
2296 					tail = mp;
2297 					continue;
2298 				}
2299 
2300 				/*
2301 				 * We've got TRILL input.  Remove from the list
2302 				 * and send up through the TRILL module.  (Send
2303 				 * a copy through promiscuous receive just to
2304 				 * support snooping on TRILL.  Order isn't
2305 				 * preserved strictly, but that doesn't matter
2306 				 * here.)
2307 				 */
2308 				if (tail != NULL)
2309 					tail->b_next = mpnext;
2310 				mp->b_next = NULL;
2311 				if (mp == newhead)
2312 					newhead = mpnext;
2313 				mac_trill_snoop(blp->bl_mh, mp);
2314 				update_header(mp, &hdr_info, B_TRUE);
2315 				/*
2316 				 * On raw IS-IS and BPDU frames, we have to
2317 				 * make sure that the length is trimmed
2318 				 * properly.  We use origsap in order to cope
2319 				 * with jumbograms for IS-IS.  (Regular mac
2320 				 * can't.)
2321 				 */
2322 				if (raw_isis || bridge_group) {
2323 					size_t msglen = msgdsize(mp);
2324 
2325 					if (msglen > hdr_info.mhi_origsap) {
2326 						(void) adjmsg(mp,
2327 						    hdr_info.mhi_origsap -
2328 						    msglen);
2329 					} else if (msglen <
2330 					    hdr_info.mhi_origsap) {
2331 						freemsg(mp);
2332 						continue;
2333 					}
2334 				}
2335 				trill_recv_fn(tdp, blp, rsrc, mp, &hdr_info);
2336 			}
2337 			mpnext = newhead;
2338 			mutex_enter(&blp->bl_trilllock);
2339 			if (--blp->bl_trillthreads == 0 &&
2340 			    blp->bl_trilldata == NULL)
2341 				cv_broadcast(&blp->bl_trillwait);
2342 		}
2343 		mutex_exit(&blp->bl_trilllock);
2344 		if (mpnext == NULL)
2345 			return;
2346 	}
2347 
2348 	/*
2349 	 * If this is a TRILL RBridge, then just check whether this link is
2350 	 * used at all for forwarding.  If not, then we're done.
2351 	 */
2352 	if (trillmode) {
2353 		if (!(blp->bl_flags & BLF_TRILLACTIVE) ||
2354 		    (blp->bl_flags & BLF_SDUFAIL)) {
2355 			mac_rx_common(blp->bl_mh, rsrc, mpnext);
2356 			return;
2357 		}
2358 	} else {
2359 		/*
2360 		 * For regular (STP) bridges, if we're in blocking or listening
2361 		 * state, then do nothing.  We don't learn or forward until
2362 		 * told to do so.
2363 		 */
2364 		if (blp->bl_state == BLS_BLOCKLISTEN) {
2365 			mac_rx_common(blp->bl_mh, rsrc, mpnext);
2366 			return;
2367 		}
2368 	}
2369 
2370 	/*
2371 	 * Send a copy of the message chain up to the observability node users.
2372 	 * For TRILL, we must obey the VLAN AF rules, so we go packet-by-
2373 	 * packet.
2374 	 */
2375 	if (!trillmode && blp->bl_state == BLS_FORWARDING &&
2376 	    (bmp->bm_flags & BMF_STARTED) &&
2377 	    (mp = copymsgchain(mpnext)) != NULL) {
2378 		mac_rx(bmp->bm_mh, NULL, mp);
2379 	}
2380 
2381 	/*
2382 	 * We must be in learning or forwarding state, or using TRILL on a link
2383 	 * with one or more VLANs active.  For each packet in the list, process
2384 	 * the source address, and then attempt to forward.
2385 	 */
2386 	while ((mp = mpnext) != NULL) {
2387 		mpnext = mp->b_next;
2388 		mp->b_next = NULL;
2389 
2390 		/*
2391 		 * If we can't decode the header or if the header specifies a
2392 		 * multicast source address (impossible!), then don't bother
2393 		 * learning or forwarding, but go ahead and forward up the
2394 		 * stack for subsequent processing.
2395 		 */
2396 		if (mac_header_info(blp->bl_mh, mp, &hdr_info) != 0 ||
2397 		    (hdr_info.mhi_saddr[0] & 1) != 0) {
2398 			KIINCR(bki_drops);
2399 			KLINCR(bkl_drops);
2400 			mac_rx_common(blp->bl_mh, rsrc, mp);
2401 			continue;
2402 		}
2403 
2404 		/*
2405 		 * Extract and validate the VLAN ID for this packet.
2406 		 */
2407 		if (!bridge_get_vlan(blp, &hdr_info, mp, &vlanid, &tci) ||
2408 		    !BRIDGE_AF_ISSET(blp, vlanid)) {
2409 			mac_rx_common(blp->bl_mh, rsrc, mp);
2410 			continue;
2411 		}
2412 
2413 		if (trillmode) {
2414 			/*
2415 			 * Special test required by TRILL document: must
2416 			 * discard frames with outer address set to ESADI.
2417 			 */
2418 			if (memcmp(hdr_info.mhi_daddr, all_esadi_rbridges,
2419 			    ETHERADDRL) == 0) {
2420 				mac_rx_common(blp->bl_mh, rsrc, mp);
2421 				continue;
2422 			}
2423 
2424 			/*
2425 			 * If we're in TRILL mode, then the call above to get
2426 			 * the VLAN ID has also checked that we're the
2427 			 * appointed forwarder, so report that we're handling
2428 			 * this packet to any observability node users.
2429 			 */
2430 			if ((bmp->bm_flags & BMF_STARTED) &&
2431 			    (mpcopy = copymsg(mp)) != NULL)
2432 				mac_rx(bmp->bm_mh, NULL, mpcopy);
2433 		}
2434 
2435 		/*
2436 		 * First process the source address and learn from it.  For
2437 		 * TRILL, we learn only if we're the appointed forwarder.
2438 		 */
2439 		bridge_learn(blp, hdr_info.mhi_saddr, RBRIDGE_NICKNAME_NONE,
2440 		    vlanid);
2441 
2442 		/*
2443 		 * Now check whether we're forwarding and look up the
2444 		 * destination.  If we can forward, do so.
2445 		 */
2446 		if (trillmode || blp->bl_state == BLS_FORWARDING) {
2447 			mp = bridge_forward(blp, &hdr_info, mp, vlanid, tci,
2448 			    B_FALSE, B_FALSE);
2449 		}
2450 		if (mp != NULL)
2451 			mac_rx_common(blp->bl_mh, rsrc, mp);
2452 	}
2453 }
2454 
2455 
2456 /* ARGSUSED */
2457 static mblk_t *
2458 bridge_xmit_cb(mac_handle_t mh, mac_ring_handle_t rh, mblk_t *mpnext)
2459 {
2460 	bridge_link_t *blp = (bridge_link_t *)mh;
2461 	bridge_inst_t *bip = blp->bl_inst;
2462 	bridge_mac_t *bmp = bip->bi_mac;
2463 	mac_header_info_t hdr_info;
2464 	uint16_t vlanid, tci;
2465 	mblk_t *mp, *mpcopy;
2466 	boolean_t trillmode;
2467 
2468 	trillmode = blp->bl_trilldata != NULL;
2469 
2470 	/*
2471 	 * If we're using STP and we're in blocking or listening state, or if
2472 	 * we're using TRILL and no VLANs are active, then behave as though the
2473 	 * bridge isn't here at all, and send on the local link alone.
2474 	 */
2475 	if ((!trillmode && blp->bl_state == BLS_BLOCKLISTEN) ||
2476 	    (trillmode &&
2477 	    (!(blp->bl_flags & BLF_TRILLACTIVE) ||
2478 	    (blp->bl_flags & BLF_SDUFAIL)))) {
2479 		KIINCR(bki_sent);
2480 		KLINCR(bkl_xmit);
2481 		mp = mac_ring_tx(blp->bl_mh, rh, mpnext);
2482 		return (mp);
2483 	}
2484 
2485 	/*
2486 	 * Send a copy of the message up to the observability node users.
2487 	 * TRILL needs to check on a packet-by-packet basis.
2488 	 */
2489 	if (!trillmode && blp->bl_state == BLS_FORWARDING &&
2490 	    (bmp->bm_flags & BMF_STARTED) &&
2491 	    (mp = copymsgchain(mpnext)) != NULL) {
2492 		mac_rx(bmp->bm_mh, NULL, mp);
2493 	}
2494 
2495 	while ((mp = mpnext) != NULL) {
2496 		mpnext = mp->b_next;
2497 		mp->b_next = NULL;
2498 
2499 		if (mac_header_info(blp->bl_mh, mp, &hdr_info) != 0) {
2500 			freemsg(mp);
2501 			continue;
2502 		}
2503 
2504 		/*
2505 		 * Extract and validate the VLAN ID for this packet.
2506 		 */
2507 		if (!bridge_get_vlan(blp, &hdr_info, mp, &vlanid, &tci) ||
2508 		    !BRIDGE_AF_ISSET(blp, vlanid)) {
2509 			freemsg(mp);
2510 			continue;
2511 		}
2512 
2513 		/*
2514 		 * If we're using TRILL, then we've now validated that we're
2515 		 * the forwarder for this VLAN, so go ahead and let
2516 		 * observability node users know about the packet.
2517 		 */
2518 		if (trillmode && (bmp->bm_flags & BMF_STARTED) &&
2519 		    (mpcopy = copymsg(mp)) != NULL) {
2520 			mac_rx(bmp->bm_mh, NULL, mpcopy);
2521 		}
2522 
2523 		/*
2524 		 * We have to learn from our own transmitted packets, because
2525 		 * there may be a Solaris DLPI raw sender (which can specify its
2526 		 * own source address) using promiscuous mode for receive.  The
2527 		 * mac layer information won't (and can't) tell us everything
2528 		 * we need to know.
2529 		 */
2530 		bridge_learn(blp, hdr_info.mhi_saddr, RBRIDGE_NICKNAME_NONE,
2531 		    vlanid);
2532 
2533 		/* attempt forwarding */
2534 		if (trillmode || blp->bl_state == BLS_FORWARDING) {
2535 			mp = bridge_forward(blp, &hdr_info, mp, vlanid, tci,
2536 			    B_FALSE, B_TRUE);
2537 		}
2538 		if (mp != NULL) {
2539 			mp = mac_ring_tx(blp->bl_mh, rh, mp);
2540 			if (mp == NULL) {
2541 				KIINCR(bki_sent);
2542 				KLINCR(bkl_xmit);
2543 			}
2544 		}
2545 		/*
2546 		 * If we get stuck, then stop.  Don't let the user's output
2547 		 * packets get out of order.  (More importantly: don't try to
2548 		 * bridge the same packet multiple times if flow control is
2549 		 * asserted.)
2550 		 */
2551 		if (mp != NULL) {
2552 			mp->b_next = mpnext;
2553 			break;
2554 		}
2555 	}
2556 	return (mp);
2557 }
2558 
2559 /*
2560  * This is called by TRILL when it decapsulates a packet that we must forward
2561  * locally.  On failure, we just drop.
2562  *
2563  * Note that the ingress_nick reported by TRILL must not represent this local
2564  * node.
2565  */
2566 void
2567 bridge_trill_decaps(bridge_link_t *blp, mblk_t *mp, uint16_t ingress_nick)
2568 {
2569 	mac_header_info_t hdr_info;
2570 	uint16_t vlanid, tci;
2571 	bridge_inst_t *bip = blp->bl_inst;	/* used by macros */
2572 	mblk_t *mpcopy;
2573 
2574 	if (mac_header_info(blp->bl_mh, mp, &hdr_info) != 0) {
2575 		freemsg(mp);
2576 		return;
2577 	}
2578 
2579 	/* Extract VLAN ID for this packet. */
2580 	if (hdr_info.mhi_bindsap == ETHERTYPE_VLAN) {
2581 		struct ether_vlan_header *evhp;
2582 
2583 		/* LINTED: alignment */
2584 		evhp = (struct ether_vlan_header *)mp->b_rptr;
2585 		tci = ntohs(evhp->ether_tci);
2586 		vlanid = VLAN_ID(tci);
2587 	} else {
2588 		/* Inner VLAN headers are required in TRILL data packets */
2589 		DTRACE_PROBE3(bridge__trill__decaps__novlan, bridge_link_t *,
2590 		    blp, mblk_t *, mp, uint16_t, ingress_nick);
2591 		freemsg(mp);
2592 		return;
2593 	}
2594 
2595 	/* Learn the location of this sender in the RBridge network */
2596 	bridge_learn(blp, hdr_info.mhi_saddr, ingress_nick, vlanid);
2597 
2598 	/* attempt forwarding */
2599 	mp = bridge_forward(blp, &hdr_info, mp, vlanid, tci, B_TRUE, B_TRUE);
2600 	if (mp != NULL) {
2601 		if (bridge_can_send(blp, vlanid)) {
2602 			/* Deliver a copy locally as well */
2603 			if ((mpcopy = copymsg(mp)) != NULL)
2604 				mac_rx_common(blp->bl_mh, NULL, mpcopy);
2605 			mp = mac_ring_tx(blp->bl_mh, NULL, mp);
2606 		}
2607 		if (mp == NULL) {
2608 			KIINCR(bki_sent);
2609 			KLINCR(bkl_xmit);
2610 		} else {
2611 			freemsg(mp);
2612 		}
2613 	}
2614 }
2615 
2616 /*
2617  * This function is used by TRILL _only_ to transmit TRILL-encapsulated
2618  * packets.  It sends on a single underlying link and does not bridge.
2619  */
2620 mblk_t *
2621 bridge_trill_output(bridge_link_t *blp, mblk_t *mp)
2622 {
2623 	bridge_inst_t *bip = blp->bl_inst;	/* used by macros */
2624 
2625 	mac_trill_snoop(blp->bl_mh, mp);
2626 	mp = mac_ring_tx(blp->bl_mh, NULL, mp);
2627 	if (mp == NULL) {
2628 		KIINCR(bki_sent);
2629 		KLINCR(bkl_xmit);
2630 	}
2631 	return (mp);
2632 }
2633 
2634 /*
2635  * Set the "appointed forwarder" flag array for this link.  TRILL controls
2636  * forwarding on a VLAN basis.  The "trillactive" flag is an optimization for
2637  * the forwarder.
2638  */
2639 void
2640 bridge_trill_setvlans(bridge_link_t *blp, const uint8_t *arr)
2641 {
2642 	int i;
2643 	uint_t newflags = 0;
2644 
2645 	for (i = 0; i < BRIDGE_VLAN_ARR_SIZE; i++) {
2646 		if ((blp->bl_afs[i] = arr[i]) != 0)
2647 			newflags = BLF_TRILLACTIVE;
2648 	}
2649 	blp->bl_flags = (blp->bl_flags & ~BLF_TRILLACTIVE) | newflags;
2650 }
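
/*
 * Illustrative sketch (editorial addition, not from the driver): how a
 * TRILL-side caller might build the appointed-forwarder array passed in
 * above.  The bit layout behind BRIDGE_AF_ISSET is not shown in this file;
 * the sketch assumes the conventional byte-array bitmap (bit vlan % NBBY of
 * byte vlan / NBBY), so treat it as a statement of intent rather than of
 * the macro's exact encoding.
 *
 *	uint8_t afs[BRIDGE_VLAN_ARR_SIZE];
 *
 *	(void) memset(afs, 0, sizeof (afs));
 *	afs[10 / NBBY] |= 1 << (10 % NBBY);	(appointed for VLAN 10 only)
 *	bridge_trill_setvlans(blp, afs);	(sets BLF_TRILLACTIVE)
 *
 * Passing an all-zero array clears BLF_TRILLACTIVE, which sends the receive
 * and transmit paths back to local-only handling for this link.
 */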
2651 
2652 void
2653 bridge_trill_flush(bridge_link_t *blp, uint16_t vlan, boolean_t dotrill)
2654 {
2655 	bridge_inst_t *bip = blp->bl_inst;
2656 	bridge_fwd_t *bfp, *bfnext;
2657 	avl_tree_t fwd_scavenge;
2658 	int i;
2659 
2660 	_NOTE(ARGUNUSED(vlan));
2661 
2662 	avl_create(&fwd_scavenge, fwd_compare, sizeof (bridge_fwd_t),
2663 	    offsetof(bridge_fwd_t, bf_node));
2664 	rw_enter(&bip->bi_rwlock, RW_WRITER);
2665 	bfnext = avl_first(&bip->bi_fwd);
2666 	while ((bfp = bfnext) != NULL) {
2667 		bfnext = AVL_NEXT(&bip->bi_fwd, bfp);
2668 		if (bfp->bf_flags & BFF_LOCALADDR)
2669 			continue;
2670 		if (dotrill) {
2671 			/* port doesn't matter if we're flushing TRILL */
2672 			if (bfp->bf_trill_nick == RBRIDGE_NICKNAME_NONE)
2673 				continue;
2674 		} else {
2675 			if (bfp->bf_trill_nick != RBRIDGE_NICKNAME_NONE)
2676 				continue;
2677 			for (i = 0; i < bfp->bf_nlinks; i++) {
2678 				if (bfp->bf_links[i] == blp)
2679 					break;
2680 			}
2681 			if (i >= bfp->bf_nlinks)
2682 				continue;
2683 		}
2684 		ASSERT(bfp->bf_flags & BFF_INTREE);
2685 		avl_remove(&bip->bi_fwd, bfp);
2686 		bfp->bf_flags &= ~BFF_INTREE;
2687 		avl_add(&fwd_scavenge, bfp);
2688 	}
2689 	rw_exit(&bip->bi_rwlock);
2690 	bfnext = avl_first(&fwd_scavenge);
2691 	while ((bfp = bfnext) != NULL) {
2692 		bfnext = AVL_NEXT(&fwd_scavenge, bfp);
2693 		avl_remove(&fwd_scavenge, bfp);
2694 		fwd_unref(bfp);
2695 	}
2696 	avl_destroy(&fwd_scavenge);
2697 }
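
/*
 * Editorial note: the private "scavenge" AVL tree above (and the identical
 * pattern in the BRIOC_FLUSHFWD ioctl handler later in this file) lets the
 * flush unlink entries from bi_fwd while bi_rwlock is held as writer, but
 * defer the fwd_unref() calls, which drop the tree's reference and may free
 * the entry, until after the lock has been released.
 */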
2698 
2699 /*
2700  * Let the mac module take or drop a reference to a bridge link.  When this is
2701  * called, the mac module is holding the mi_bridge_lock, so the link cannot be
2702  * in the process of entering or leaving a bridge.
2703  */
2704 static void
2705 bridge_ref_cb(mac_handle_t mh, boolean_t hold)
2706 {
2707 	bridge_link_t *blp = (bridge_link_t *)mh;
2708 
2709 	if (hold)
2710 		atomic_inc_uint(&blp->bl_refs);
2711 	else
2712 		link_unref(blp);
2713 }
2714 
2715 /*
2716  * Handle link state changes reported by the mac layer.  This acts as a filter
2717  * for link state changes: if a link is reporting down, but there are other
2718  * links still up on the bridge, then the state is changed to "up."  When the
2719  * last link goes down, all are marked down, and when the first link goes up,
2720  * all are marked up.  (Recursion is avoided by the use of the "redo" function.)
2721  *
2722  * We treat unknown as equivalent to "up."
2723  */
2724 static link_state_t
2725 bridge_ls_cb(mac_handle_t mh, link_state_t newls)
2726 {
2727 	bridge_link_t *blp = (bridge_link_t *)mh;
2728 	bridge_link_t *blcmp;
2729 	bridge_inst_t *bip;
2730 	bridge_mac_t *bmp;
2731 
2732 	if (newls != LINK_STATE_DOWN && blp->bl_linkstate != LINK_STATE_DOWN ||
2733 	    (blp->bl_flags & (BLF_DELETED|BLF_SDUFAIL))) {
2734 		blp->bl_linkstate = newls;
2735 		return (newls);
2736 	}
2737 
2738 	/*
2739 	 * Scan first to see if there are any other non-down links.  If there
2740 	 * are, then we're done.  Otherwise, if all others are down, then the
2741 	 * state of this link is the state of the bridge.
2742 	 */
2743 	bip = blp->bl_inst;
2744 	rw_enter(&bip->bi_rwlock, RW_WRITER);
2745 	for (blcmp = list_head(&bip->bi_links); blcmp != NULL;
2746 	    blcmp = list_next(&bip->bi_links, blcmp)) {
2747 		if (blcmp != blp &&
2748 		    !(blcmp->bl_flags & (BLF_DELETED|BLF_SDUFAIL)) &&
2749 		    blcmp->bl_linkstate != LINK_STATE_DOWN)
2750 			break;
2751 	}
2752 
2753 	if (blcmp != NULL) {
2754 		/*
2755 		 * If there are other links that are considered up, then tell
2756 		 * the caller that the link is actually still up, regardless of
2757 		 * this link's underlying state.
2758 		 */
2759 		blp->bl_linkstate = newls;
2760 		newls = LINK_STATE_UP;
2761 	} else if (blp->bl_linkstate != newls) {
2762 		/*
2763 		 * If we've found no other 'up' links, and this link has
2764 		 * changed state, then report the new state of the bridge to
2765 		 * all other clients.
2766 		 */
2767 		blp->bl_linkstate = newls;
2768 		for (blcmp = list_head(&bip->bi_links); blcmp != NULL;
2769 		    blcmp = list_next(&bip->bi_links, blcmp)) {
2770 			if (blcmp != blp && !(blcmp->bl_flags & BLF_DELETED))
2771 				mac_link_redo(blcmp->bl_mh, newls);
2772 		}
2773 		bmp = bip->bi_mac;
2774 		if ((bmp->bm_linkstate = newls) != LINK_STATE_DOWN)
2775 			bmp->bm_linkstate = LINK_STATE_UP;
2776 		mac_link_redo(bmp->bm_mh, bmp->bm_linkstate);
2777 	}
2778 	rw_exit(&bip->bi_rwlock);
2779 	return (newls);
2780 }
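
/*
 * Editorial example (not part of the driver): with two member links A and B
 * both up, a LINK_STATE_DOWN report from A records DOWN in A's bl_linkstate
 * but returns LINK_STATE_UP, because B is still non-down.  When B later
 * reports DOWN, no other non-down link is found, so every remaining member
 * and the bridge observability mac are redone to DOWN via mac_link_redo().
 * The first member to report non-down afterwards reverses this and marks
 * everything up again.
 */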
2781 
2782 static void
2783 bridge_add_link(void *arg)
2784 {
2785 	mblk_t *mp = arg;
2786 	bridge_stream_t *bsp;
2787 	bridge_inst_t *bip, *bipt;
2788 	bridge_mac_t *bmp;
2789 	datalink_id_t linkid;
2790 	int err;
2791 	mac_handle_t mh;
2792 	uint_t maxsdu;
2793 	bridge_link_t *blp = NULL, *blpt;
2794 	const mac_info_t *mip;
2795 	boolean_t macopen = B_FALSE;
2796 	char linkname[MAXLINKNAMELEN];
2797 	char kstatname[KSTAT_STRLEN];
2798 	int i;
2799 	link_state_t linkstate;
2800 	mblk_t *mlist;
2801 
2802 	bsp = (bridge_stream_t *)mp->b_next;
2803 	mp->b_next = NULL;
2804 	bip = bsp->bs_inst;
2805 	/* LINTED: alignment */
2806 	linkid = *(datalink_id_t *)mp->b_cont->b_rptr;
2807 
2808 	/*
2809 	 * First make sure that there is no other bridge that has this link.
2810 	 * We don't want to overlap operations from two bridges; the MAC layer
2811 	 * supports only one bridge on a given MAC at a time.
2812 	 *
2813 	 * We rely on the fact that there's just one taskq thread for the
2814 	 * bridging module: once we've checked for a duplicate, we can drop the
2815 	 * lock, because no other thread could possibly be adding another link
2816 	 * until we're done.
2817 	 */
2818 	mutex_enter(&inst_lock);
2819 	for (bipt = list_head(&inst_list); bipt != NULL;
2820 	    bipt = list_next(&inst_list, bipt)) {
2821 		rw_enter(&bipt->bi_rwlock, RW_READER);
2822 		for (blpt = list_head(&bipt->bi_links); blpt != NULL;
2823 		    blpt = list_next(&bipt->bi_links, blpt)) {
2824 			if (linkid == blpt->bl_linkid)
2825 				break;
2826 		}
2827 		rw_exit(&bipt->bi_rwlock);
2828 		if (blpt != NULL)
2829 			break;
2830 	}
2831 	mutex_exit(&inst_lock);
2832 	if (bipt != NULL) {
2833 		err = EBUSY;
2834 		goto fail;
2835 	}
2836 
2837 	if ((err = mac_open_by_linkid(linkid, &mh)) != 0)
2838 		goto fail;
2839 	macopen = B_TRUE;
2840 
2841 	/* we bridge only Ethernet */
2842 	mip = mac_info(mh);
2843 	if (mip->mi_media != DL_ETHER) {
2844 		err = ENOTSUP;
2845 		goto fail;
2846 	}
2847 
2848 	/*
2849 	 * Get the current maximum SDU on this interface.  If there are other
2850 	 * links on the bridge, this one must match or it is flagged as failed.
2851 	 * Otherwise, the first link becomes the standard for the new bridge.
2852 	 */
2853 	mac_sdu_get(mh, NULL, &maxsdu);
2854 	bmp = bip->bi_mac;
2855 	if (list_is_empty(&bip->bi_links)) {
2856 		bmp->bm_maxsdu = maxsdu;
2857 		(void) mac_maxsdu_update(bmp->bm_mh, maxsdu);
2858 	}
2859 
2860 	/* figure the kstat name; also used as the mac client name */
2861 	i = MBLKL(mp->b_cont) - sizeof (datalink_id_t);
2862 	if (i < 0 || i >= MAXLINKNAMELEN)
2863 		i = MAXLINKNAMELEN - 1;
2864 	bcopy(mp->b_cont->b_rptr + sizeof (datalink_id_t), linkname, i);
2865 	linkname[i] = '\0';
2866 	(void) snprintf(kstatname, sizeof (kstatname), "%s-%s", bip->bi_name,
2867 	    linkname);
2868 
2869 	if ((blp = kmem_zalloc(sizeof (*blp), KM_NOSLEEP)) == NULL) {
2870 		err = ENOMEM;
2871 		goto fail;
2872 	}
2873 	blp->bl_lfailmp = allocb(sizeof (bridge_ctl_t), BPRI_MED);
2874 	if (blp->bl_lfailmp == NULL) {
2875 		kmem_free(blp, sizeof (*blp));
2876 		blp = NULL;
2877 		err = ENOMEM;
2878 		goto fail;
2879 	}
2880 
2881 	blp->bl_refs = 1;
2882 	atomic_inc_uint(&bip->bi_refs);
2883 	blp->bl_inst = bip;
2884 	blp->bl_mh = mh;
2885 	blp->bl_linkid = linkid;
2886 	blp->bl_maxsdu = maxsdu;
2887 	cv_init(&blp->bl_trillwait, NULL, CV_DRIVER, NULL);
2888 	mutex_init(&blp->bl_trilllock, NULL, MUTEX_DRIVER, NULL);
2889 	(void) memset(blp->bl_afs, 0xff, sizeof (blp->bl_afs));
2890 
2891 	err = mac_client_open(mh, &blp->bl_mch, kstatname, 0);
2892 	if (err != 0)
2893 		goto fail;
2894 	blp->bl_flags |= BLF_CLIENT_OPEN;
2895 
2896 	err = mac_margin_add(mh, &blp->bl_margin, B_TRUE);
2897 	if (err != 0)
2898 		goto fail;
2899 	blp->bl_flags |= BLF_MARGIN_ADDED;
2900 
2901 	blp->bl_mnh = mac_notify_add(mh, bridge_notify_cb, blp);
2902 
2903 	/* Enable Bridging on the link */
2904 	err = mac_bridge_set(mh, (mac_handle_t)blp);
2905 	if (err != 0)
2906 		goto fail;
2907 	blp->bl_flags |= BLF_SET_BRIDGE;
2908 
2909 	err = mac_promisc_add(blp->bl_mch, MAC_CLIENT_PROMISC_ALL, NULL,
2910 	    blp, &blp->bl_mphp, MAC_PROMISC_FLAGS_NO_TX_LOOP);
2911 	if (err != 0)
2912 		goto fail;
2913 	blp->bl_flags |= BLF_PROM_ADDED;
2914 
2915 	bridge_new_unicst(blp);
2916 
2917 	blp->bl_ksp = kstat_setup((kstat_named_t *)&blp->bl_kstats,
2918 	    link_kstats_list, Dim(link_kstats_list), kstatname);
2919 
2920 	/*
2921 	 * The link holds a reference to the bridge instance, so that the
2922 	 * instance can't go away before the link is freed.  The insertion into
2923 	 * bi_links holds a reference on the link (reference set to 1 above).
2924 	 * When marking as removed from bi_links (BLF_DELETED), drop the
2925 	 * reference on the link. When freeing the link, drop the reference on
2926 	 * the instance. BLF_LINK_ADDED tracks link insertion in bi_links list.
2927 	 */
2928 	rw_enter(&bip->bi_rwlock, RW_WRITER);
2929 	list_insert_tail(&bip->bi_links, blp);
2930 	blp->bl_flags |= BLF_LINK_ADDED;
2931 
2932 	/*
2933 	 * If the new link is no good on this bridge, then let the daemon know
2934 	 * about the problem.
2935 	 */
2936 	mlist = NULL;
2937 	if (maxsdu != bmp->bm_maxsdu)
2938 		link_sdu_fail(blp, B_TRUE, &mlist);
2939 	rw_exit(&bip->bi_rwlock);
2940 	send_up_messages(bip, mlist);
2941 
2942 	/*
2943 	 * Trigger a link state update so that if this link is the first one
2944 	 * "up" in the bridge, then we notify everyone.  This triggers a trip
2945 	 * through bridge_ls_cb.
2946 	 */
2947 	linkstate = mac_stat_get(mh, MAC_STAT_LOWLINK_STATE);
2948 	blp->bl_linkstate = LINK_STATE_DOWN;
2949 	mac_link_update(mh, linkstate);
2950 
2951 	/*
2952 	 * We now need to report back to the stream that invoked us, and then
2953 	 * drop the reference on the stream that we're holding.
2954 	 */
2955 	miocack(bsp->bs_wq, mp, 0, 0);
2956 	stream_unref(bsp);
2957 	return;
2958 
2959 fail:
2960 	if (blp == NULL) {
2961 		if (macopen)
2962 			mac_close(mh);
2963 	} else {
2964 		link_shutdown(blp);
2965 	}
2966 	miocnak(bsp->bs_wq, mp, 0, err);
2967 	stream_unref(bsp);
2968 }
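
/*
 * Editorial note: each BLF_* flag set in bridge_add_link() records a setup
 * step that completed (client open, margin added, bridge set, promiscuous
 * mode, list insertion).  The single "fail:" path hands the partially
 * constructed link to link_shutdown(), which is presumably what consults
 * those flags to undo only the steps that were actually taken.
 */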
2969 
2970 static void
2971 bridge_rem_link(void *arg)
2972 {
2973 	mblk_t *mp = arg;
2974 	bridge_stream_t *bsp;
2975 	bridge_inst_t *bip;
2976 	bridge_mac_t *bmp;
2977 	datalink_id_t linkid;
2978 	bridge_link_t *blp, *blsave;
2979 	boolean_t found;
2980 	mblk_t *mlist;
2981 
2982 	bsp = (bridge_stream_t *)mp->b_next;
2983 	mp->b_next = NULL;
2984 	bip = bsp->bs_inst;
2985 	/* LINTED: alignment */
2986 	linkid = *(datalink_id_t *)mp->b_cont->b_rptr;
2987 
2988 	/*
2989 	 * We become reader here so that we can loop over the other links and
2990 	 * deliver link up/down notification.
2991 	 */
2992 	rw_enter(&bip->bi_rwlock, RW_READER);
2993 	found = B_FALSE;
2994 	for (blp = list_head(&bip->bi_links); blp != NULL;
2995 	    blp = list_next(&bip->bi_links, blp)) {
2996 		if (blp->bl_linkid == linkid &&
2997 		    !(blp->bl_flags & BLF_DELETED)) {
2998 			blp->bl_flags |= BLF_DELETED;
2999 			(void) ddi_taskq_dispatch(bridge_taskq, link_shutdown,
3000 			    blp, DDI_SLEEP);
3001 			found = B_TRUE;
3002 			break;
3003 		}
3004 	}
3005 
3006 	/*
3007 	 * Check if this link is up and the remainder of the links are all
3008 	 * down.
3009 	 */
3010 	if (blp != NULL && blp->bl_linkstate != LINK_STATE_DOWN) {
3011 		for (blp = list_head(&bip->bi_links); blp != NULL;
3012 		    blp = list_next(&bip->bi_links, blp)) {
3013 			if (blp->bl_linkstate != LINK_STATE_DOWN &&
3014 			    !(blp->bl_flags & (BLF_DELETED|BLF_SDUFAIL)))
3015 				break;
3016 		}
3017 		if (blp == NULL) {
3018 			for (blp = list_head(&bip->bi_links); blp != NULL;
3019 			    blp = list_next(&bip->bi_links, blp)) {
3020 				if (!(blp->bl_flags & BLF_DELETED))
3021 					mac_link_redo(blp->bl_mh,
3022 					    LINK_STATE_DOWN);
3023 			}
3024 			bmp = bip->bi_mac;
3025 			bmp->bm_linkstate = LINK_STATE_DOWN;
3026 			mac_link_redo(bmp->bm_mh, LINK_STATE_DOWN);
3027 		}
3028 	}
3029 
3030 	/*
3031 	 * Check if there's just one working link left on the bridge.  If so,
3032 	 * then that link is now authoritative for bridge MTU.
3033 	 */
3034 	blsave = NULL;
3035 	for (blp = list_head(&bip->bi_links); blp != NULL;
3036 	    blp = list_next(&bip->bi_links, blp)) {
3037 		if (!(blp->bl_flags & BLF_DELETED)) {
3038 			if (blsave == NULL)
3039 				blsave = blp;
3040 			else
3041 				break;
3042 		}
3043 	}
3044 	mlist = NULL;
3045 	bmp = bip->bi_mac;
3046 	if (blsave != NULL && blp == NULL &&
3047 	    blsave->bl_maxsdu != bmp->bm_maxsdu) {
3048 		bmp->bm_maxsdu = blsave->bl_maxsdu;
3049 		(void) mac_maxsdu_update(bmp->bm_mh, blsave->bl_maxsdu);
3050 		link_sdu_fail(blsave, B_FALSE, &mlist);
3051 	}
3052 	rw_exit(&bip->bi_rwlock);
3053 	send_up_messages(bip, mlist);
3054 
3055 	if (found)
3056 		miocack(bsp->bs_wq, mp, 0, 0);
3057 	else
3058 		miocnak(bsp->bs_wq, mp, 0, ENOENT);
3059 	stream_unref(bsp);
3060 }
3061 
3062 /*
3063  * This function intentionally returns with bi_rwlock held; it is intended for
3064  * quick checks and updates.
3065  */
3066 static bridge_link_t *
3067 enter_link(bridge_inst_t *bip, datalink_id_t linkid)
3068 {
3069 	bridge_link_t *blp;
3070 
3071 	rw_enter(&bip->bi_rwlock, RW_READER);
3072 	for (blp = list_head(&bip->bi_links); blp != NULL;
3073 	    blp = list_next(&bip->bi_links, blp)) {
3074 		if (blp->bl_linkid == linkid && !(blp->bl_flags & BLF_DELETED))
3075 			break;
3076 	}
3077 	return (blp);
3078 }
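
/*
 * Illustrative usage sketch (editorial addition), mirroring the ioctl
 * handlers below; the caller owns the rw_exit() regardless of whether a
 * link was found:
 *
 *	bridge_link_t *blp;
 *
 *	if ((blp = enter_link(bip, linkid)) == NULL) {
 *		rc = ENOENT;
 *	} else {
 *		rc = 0;
 *		blp->bl_state = new_state;
 *	}
 *	rw_exit(&bip->bi_rwlock);
 *
 * Here linkid, new_state, and rc stand in for whatever the caller has; the
 * shape follows the BRIOC_SETSTATE case below.
 */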
3079 
3080 static void
3081 bridge_ioctl(queue_t *wq, mblk_t *mp)
3082 {
3083 	bridge_stream_t *bsp = wq->q_ptr;
3084 	bridge_inst_t *bip;
3085 	struct iocblk *iop;
3086 	int rc = EINVAL;
3087 	int len = 0;
3088 	bridge_link_t *blp;
3089 	cred_t *cr;
3090 
3091 	/* LINTED: alignment */
3092 	iop = (struct iocblk *)mp->b_rptr;
3093 
3094 	/*
3095 	 * For now, all of the bridge ioctls are privileged.
3096 	 */
3097 	if ((cr = msg_getcred(mp, NULL)) == NULL)
3098 		cr = iop->ioc_cr;
3099 	if (cr != NULL && secpolicy_net_config(cr, B_FALSE) != 0) {
3100 		miocnak(wq, mp, 0, EPERM);
3101 		return;
3102 	}
3103 
3104 	switch (iop->ioc_cmd) {
3105 	case BRIOC_NEWBRIDGE: {
3106 		bridge_newbridge_t *bnb;
3107 
3108 		if (bsp->bs_inst != NULL ||
3109 		    (rc = miocpullup(mp, sizeof (bridge_newbridge_t))) != 0)
3110 			break;
3111 		/* LINTED: alignment */
3112 		bnb = (bridge_newbridge_t *)mp->b_cont->b_rptr;
3113 		bnb->bnb_name[MAXNAMELEN-1] = '\0';
3114 		rc = bridge_create(bnb->bnb_linkid, bnb->bnb_name, &bip, cr);
3115 		if (rc != 0)
3116 			break;
3117 
3118 		rw_enter(&bip->bi_rwlock, RW_WRITER);
3119 		if (bip->bi_control != NULL) {
3120 			rw_exit(&bip->bi_rwlock);
3121 			bridge_unref(bip);
3122 			rc = EBUSY;
3123 		} else {
3124 			atomic_inc_uint(&bip->bi_refs);
3125 			bsp->bs_inst = bip;	/* stream holds reference */
3126 			bip->bi_control = bsp;
3127 			rw_exit(&bip->bi_rwlock);
3128 			rc = 0;
3129 		}
3130 		break;
3131 	}
3132 
3133 	case BRIOC_ADDLINK:
3134 		if ((bip = bsp->bs_inst) == NULL ||
3135 		    (rc = miocpullup(mp, sizeof (datalink_id_t))) != 0)
3136 			break;
3137 		/*
3138 		 * We cannot perform the action in this thread, because we're
3139 		 * not in process context, and we may already be holding
3140 		 * MAC-related locks.  Place the request on taskq.
3141 		 */
3142 		mp->b_next = (mblk_t *)bsp;
3143 		stream_ref(bsp);
3144 		(void) ddi_taskq_dispatch(bridge_taskq, bridge_add_link, mp,
3145 		    DDI_SLEEP);
3146 		return;
3147 
3148 	case BRIOC_REMLINK:
3149 		if ((bip = bsp->bs_inst) == NULL ||
3150 		    (rc = miocpullup(mp, sizeof (datalink_id_t))) != 0)
3151 			break;
3152 		/*
3153 		 * We cannot perform the action in this thread, because we're
3154 		 * not in process context, and we may already be holding
3155 		 * MAC-related locks.  Place the request on taskq.
3156 		 */
3157 		mp->b_next = (mblk_t *)bsp;
3158 		stream_ref(bsp);
3159 		(void) ddi_taskq_dispatch(bridge_taskq, bridge_rem_link, mp,
3160 		    DDI_SLEEP);
3161 		return;
3162 
3163 	case BRIOC_SETSTATE: {
3164 		bridge_setstate_t *bss;
3165 
3166 		if ((bip = bsp->bs_inst) == NULL ||
3167 		    (rc = miocpullup(mp, sizeof (*bss))) != 0)
3168 			break;
3169 		/* LINTED: alignment */
3170 		bss = (bridge_setstate_t *)mp->b_cont->b_rptr;
3171 		if ((blp = enter_link(bip, bss->bss_linkid)) == NULL) {
3172 			rc = ENOENT;
3173 		} else {
3174 			rc = 0;
3175 			blp->bl_state = bss->bss_state;
3176 		}
3177 		rw_exit(&bip->bi_rwlock);
3178 		break;
3179 	}
3180 
3181 	case BRIOC_SETPVID: {
3182 		bridge_setpvid_t *bsv;
3183 
3184 		if ((bip = bsp->bs_inst) == NULL ||
3185 		    (rc = miocpullup(mp, sizeof (*bsv))) != 0)
3186 			break;
3187 		/* LINTED: alignment */
3188 		bsv = (bridge_setpvid_t *)mp->b_cont->b_rptr;
3189 		if (bsv->bsv_vlan > VLAN_ID_MAX)
3190 			break;
3191 		if ((blp = enter_link(bip, bsv->bsv_linkid)) == NULL) {
3192 			rc = ENOENT;
3193 		} else if (blp->bl_pvid == bsv->bsv_vlan) {
3194 			rc = 0;
3195 		} else {
3196 			rc = 0;
3197 			BRIDGE_VLAN_CLR(blp, blp->bl_pvid);
3198 			blp->bl_pvid = bsv->bsv_vlan;
3199 			if (blp->bl_pvid != 0)
3200 				BRIDGE_VLAN_SET(blp, blp->bl_pvid);
3201 		}
3202 		rw_exit(&bip->bi_rwlock);
3203 		break;
3204 	}
3205 
3206 	case BRIOC_VLANENAB: {
3207 		bridge_vlanenab_t *bve;
3208 
3209 		if ((bip = bsp->bs_inst) == NULL ||
3210 		    (rc = miocpullup(mp, sizeof (*bve))) != 0)
3211 			break;
3212 		/* LINTED: alignment */
3213 		bve = (bridge_vlanenab_t *)mp->b_cont->b_rptr;
3214 		if (bve->bve_vlan > VLAN_ID_MAX)
3215 			break;
3216 		if ((blp = enter_link(bip, bve->bve_linkid)) == NULL) {
3217 			rc = ENOENT;
3218 		} else {
3219 			rc = 0;
3220 			/* special case: vlan 0 means "all" */
3221 			if (bve->bve_vlan == 0) {
3222 				(void) memset(blp->bl_vlans,
3223 				    bve->bve_onoff ? ~0 : 0,
3224 				    sizeof (blp->bl_vlans));
3225 				BRIDGE_VLAN_CLR(blp, 0);
3226 				if (blp->bl_pvid != 0)
3227 					BRIDGE_VLAN_SET(blp, blp->bl_pvid);
3228 			} else if (bve->bve_vlan == blp->bl_pvid) {
3229 				rc = EINVAL;
3230 			} else if (bve->bve_onoff) {
3231 				BRIDGE_VLAN_SET(blp, bve->bve_vlan);
3232 			} else {
3233 				BRIDGE_VLAN_CLR(blp, bve->bve_vlan);
3234 			}
3235 		}
3236 		rw_exit(&bip->bi_rwlock);
3237 		break;
3238 	}
3239 
3240 	case BRIOC_FLUSHFWD: {
3241 		bridge_flushfwd_t *bff;
3242 		bridge_fwd_t *bfp, *bfnext;
3243 		avl_tree_t fwd_scavenge;
3244 		int i;
3245 
3246 		if ((bip = bsp->bs_inst) == NULL ||
3247 		    (rc = miocpullup(mp, sizeof (*bff))) != 0)
3248 			break;
3249 		/* LINTED: alignment */
3250 		bff = (bridge_flushfwd_t *)mp->b_cont->b_rptr;
3251 		rw_enter(&bip->bi_rwlock, RW_WRITER);
3252 		/* This case means "all" */
3253 		if (bff->bff_linkid == DATALINK_INVALID_LINKID) {
3254 			blp = NULL;
3255 		} else {
3256 			for (blp = list_head(&bip->bi_links); blp != NULL;
3257 			    blp = list_next(&bip->bi_links, blp)) {
3258 				if (blp->bl_linkid == bff->bff_linkid &&
3259 				    !(blp->bl_flags & BLF_DELETED))
3260 					break;
3261 			}
3262 			if (blp == NULL) {
3263 				rc = ENOENT;
3264 				rw_exit(&bip->bi_rwlock);
3265 				break;
3266 			}
3267 		}
3268 		avl_create(&fwd_scavenge, fwd_compare, sizeof (bridge_fwd_t),
3269 		    offsetof(bridge_fwd_t, bf_node));
3270 		bfnext = avl_first(&bip->bi_fwd);
3271 		while ((bfp = bfnext) != NULL) {
3272 			bfnext = AVL_NEXT(&bip->bi_fwd, bfp);
3273 			if (bfp->bf_flags & BFF_LOCALADDR)
3274 				continue;
3275 			if (blp != NULL) {
3276 				for (i = 0; i < bfp->bf_maxlinks; i++) {
3277 					if (bfp->bf_links[i] == blp)
3278 						break;
3279 				}
3280 				/*
3281 				 * If the link is there and we're excluding,
3282 				 * then skip.  If the link is not there and
3283 				 * we're doing only that link, then skip.
3284 				 */
3285 				if ((i < bfp->bf_maxlinks) == bff->bff_exclude)
3286 					continue;
3287 			}
3288 			ASSERT(bfp->bf_flags & BFF_INTREE);
3289 			avl_remove(&bip->bi_fwd, bfp);
3290 			bfp->bf_flags &= ~BFF_INTREE;
3291 			avl_add(&fwd_scavenge, bfp);
3292 		}
3293 		rw_exit(&bip->bi_rwlock);
3294 		bfnext = avl_first(&fwd_scavenge);
3295 		while ((bfp = bfnext) != NULL) {
3296 			bfnext = AVL_NEXT(&fwd_scavenge, bfp);
3297 			avl_remove(&fwd_scavenge, bfp);
3298 			fwd_unref(bfp);	/* drop tree reference */
3299 		}
3300 		avl_destroy(&fwd_scavenge);
3301 		break;
3302 	}
3303 
3304 	case BRIOC_TABLEMAX:
3305 		if ((bip = bsp->bs_inst) == NULL ||
3306 		    (rc = miocpullup(mp, sizeof (uint32_t))) != 0)
3307 			break;
3308 		/* LINTED: alignment */
3309 		bip->bi_tablemax = *(uint32_t *)mp->b_cont->b_rptr;
3310 		break;
3311 	}
3312 
3313 	if (rc == 0)
3314 		miocack(wq, mp, len, 0);
3315 	else
3316 		miocnak(wq, mp, 0, rc);
3317 }
3318 
3319 static int
3320 bridge_wput(queue_t *wq, mblk_t *mp)
3321 {
3322 	switch (DB_TYPE(mp)) {
3323 	case M_IOCTL:
3324 		bridge_ioctl(wq, mp);
3325 		break;
3326 	case M_FLUSH:
3327 		if (*mp->b_rptr & FLUSHW)
3328 			*mp->b_rptr &= ~FLUSHW;
3329 		if (*mp->b_rptr & FLUSHR)
3330 			qreply(wq, mp);
3331 		else
3332 			freemsg(mp);
3333 		break;
3334 	default:
3335 		freemsg(mp);
3336 		break;
3337 	}
3338 	return (0);
3339 }
3340 
3341 /*
3342  * This function initializes the main data structures for the bridge driver
3343  * and registers the bridge entry points with the MAC layer.
3344  */
3345 static void
3346 bridge_inst_init(void)
3347 {
3348 	bridge_scan_interval = 5 * drv_usectohz(1000000);
3349 	bridge_fwd_age = 25 * drv_usectohz(1000000);
3350 
3351 	rw_init(&bmac_rwlock, NULL, RW_DRIVER, NULL);
3352 	list_create(&bmac_list, sizeof (bridge_mac_t),
3353 	    offsetof(bridge_mac_t, bm_node));
3354 	list_create(&inst_list, sizeof (bridge_inst_t),
3355 	    offsetof(bridge_inst_t, bi_node));
3356 	cv_init(&inst_cv, NULL, CV_DRIVER, NULL);
3357 	mutex_init(&inst_lock, NULL, MUTEX_DRIVER, NULL);
3358 	cv_init(&stream_ref_cv, NULL, CV_DRIVER, NULL);
3359 	mutex_init(&stream_ref_lock, NULL, MUTEX_DRIVER, NULL);
3360 
3361 	mac_bridge_vectors(bridge_xmit_cb, bridge_recv_cb, bridge_ref_cb,
3362 	    bridge_ls_cb);
3363 }
3364 
3365 /*
3366  * This function unhooks the bridge from the MAC layer and destroys all data
3367  * structures in preparation for unload.  It's assumed that there are no
3368  * active bridge references left at this point.
3369  */
3370 static void
3371 bridge_inst_fini(void)
3372 {
3373 	mac_bridge_vectors(NULL, NULL, NULL, NULL);
3374 	if (bridge_timerid != 0)
3375 		(void) untimeout(bridge_timerid);
3376 	rw_destroy(&bmac_rwlock);
3377 	list_destroy(&bmac_list);
3378 	list_destroy(&inst_list);
3379 	cv_destroy(&inst_cv);
3380 	mutex_destroy(&inst_lock);
3381 	cv_destroy(&stream_ref_cv);
3382 	mutex_destroy(&stream_ref_lock);
3383 }
3384 
3385 /*
3386  * bridge_attach()
3387  *
3388  * Description:
3389  *    Attach the bridge driver to the system.
3390  */
3391 static int
3392 bridge_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
3393 {
3394 	if (cmd != DDI_ATTACH)
3395 		return (DDI_FAILURE);
3396 
3397 	if (ddi_create_minor_node(dip, BRIDGE_CTL, S_IFCHR, 0, DDI_PSEUDO,
3398 	    CLONE_DEV) == DDI_FAILURE) {
3399 		return (DDI_FAILURE);
3400 	}
3401 
3402 	if (dld_ioc_register(BRIDGE_IOC, bridge_ioc_list,
3403 	    DLDIOCCNT(bridge_ioc_list)) != 0) {
3404 		ddi_remove_minor_node(dip, BRIDGE_CTL);
3405 		return (DDI_FAILURE);
3406 	}
3407 
3408 	bridge_dev_info = dip;
3409 	bridge_major = ddi_driver_major(dip);
3410 	bridge_taskq = ddi_taskq_create(dip, BRIDGE_DEV_NAME, 1,
3411 	    TASKQ_DEFAULTPRI, 0);
3412 	return (DDI_SUCCESS);
3413 }
3414 
3415 /*
3416  * bridge_detach()
3417  *
3418  * Description:
3419  *    Detach the bridge driver from the system.
3420  */
3421 static int
3422 bridge_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
3423 {
3424 	if (cmd != DDI_DETACH)
3425 		return (DDI_FAILURE);
3426 
3427 	ddi_remove_minor_node(dip, NULL);
3428 	ddi_taskq_destroy(bridge_taskq);
3429 	bridge_dev_info = NULL;
3430 	return (DDI_SUCCESS);
3431 }
3432 
3433 /*
3434  * bridge_info()
3435  *
3436  * Description:
3437  *    Translate "dev_t" to a pointer to the associated "dev_info_t".
3438  */
3439 /* ARGSUSED */
3440 static int
3441 bridge_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg,
3442     void **result)
3443 {
3444 	int	rc;
3445 
3446 	switch (infocmd) {
3447 	case DDI_INFO_DEVT2DEVINFO:
3448 		if (bridge_dev_info == NULL) {
3449 			rc = DDI_FAILURE;
3450 		} else {
3451 			*result = (void *)bridge_dev_info;
3452 			rc = DDI_SUCCESS;
3453 		}
3454 		break;
3455 	case DDI_INFO_DEVT2INSTANCE:
3456 		*result = NULL;
3457 		rc = DDI_SUCCESS;
3458 		break;
3459 	default:
3460 		rc = DDI_FAILURE;
3461 		break;
3462 	}
3463 	return (rc);
3464 }
3465 
3466 static struct module_info bridge_modinfo = {
3467 	2105,			/* mi_idnum */
3468 	BRIDGE_DEV_NAME,	/* mi_idname */
3469 	0,			/* mi_minpsz */
3470 	16384,			/* mi_maxpsz */
3471 	65536,			/* mi_hiwat */
3472 	128			/* mi_lowat */
3473 };
3474 
3475 static struct qinit bridge_rinit = {
3476 	NULL,			/* qi_putp */
3477 	NULL,			/* qi_srvp */
3478 	bridge_open,		/* qi_qopen */
3479 	bridge_close,		/* qi_qclose */
3480 	NULL,			/* qi_qadmin */
3481 	&bridge_modinfo,	/* qi_minfo */
3482 	NULL			/* qi_mstat */
3483 };
3484 
3485 static struct qinit bridge_winit = {
3486 	(int (*)())bridge_wput, /* qi_putp */
3487 	NULL,			/* qi_srvp */
3488 	NULL,			/* qi_qopen */
3489 	NULL,			/* qi_qclose */
3490 	NULL,			/* qi_qadmin */
3491 	&bridge_modinfo,	/* qi_minfo */
3492 	NULL			/* qi_mstat */
3493 };
3494 
3495 static struct streamtab bridge_tab = {
3496 	&bridge_rinit,	/* st_rdinit */
3497 	&bridge_winit	/* st_wrinit */
3498 };
3499 
3500 /* No STREAMS perimeters; we do all our own locking */
3501 DDI_DEFINE_STREAM_OPS(bridge_ops, nulldev, nulldev, bridge_attach,
3502     bridge_detach, nodev, bridge_info, D_NEW | D_MP, &bridge_tab,
3503     ddi_quiesce_not_supported);
3504 
3505 static struct modldrv modldrv = {
3506 	&mod_driverops,
3507 	"bridging driver",
3508 	&bridge_ops
3509 };
3510 
3511 static struct modlinkage modlinkage = {
3512 	MODREV_1,
3513 	(void *)&modldrv,
3514 	NULL
3515 };
3516 
3517 int
3518 _init(void)
3519 {
3520 	int retv;
3521 
3522 	mac_init_ops(NULL, BRIDGE_DEV_NAME);
3523 	bridge_inst_init();
3524 	if ((retv = mod_install(&modlinkage)) != 0)
3525 		bridge_inst_fini();
3526 	return (retv);
3527 }
3528 
3529 int
3530 _fini(void)
3531 {
3532 	int retv;
3533 
3534 	rw_enter(&bmac_rwlock, RW_READER);
3535 	retv = list_is_empty(&bmac_list) ? 0 : EBUSY;
3536 	rw_exit(&bmac_rwlock);
3537 	if (retv == 0 &&
3538 	    (retv = mod_remove(&modlinkage)) == 0)
3539 		bridge_inst_fini();
3540 	return (retv);
3541 }
3542 
3543 int
3544 _info(struct modinfo *modinfop)
3545 {
3546 	return (mod_info(&modlinkage, modinfop));
3547 }
3548