xref: /titanic_41/usr/src/uts/common/io/bridge.c (revision 3a7bd03955840c70afc1457eb632dfcd13b91f03)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * This module implements a STREAMS driver that provides layer-two (Ethernet)
29  * bridging functionality.  The STREAMS interface is used to provide
30  * observability (snoop/wireshark) and control, but not for interface plumbing.
31  */
32 
33 #include <sys/types.h>
34 #include <sys/bitmap.h>
35 #include <sys/cmn_err.h>
36 #include <sys/conf.h>
37 #include <sys/ddi.h>
38 #include <sys/errno.h>
39 #include <sys/kstat.h>
40 #include <sys/modctl.h>
41 #include <sys/note.h>
42 #include <sys/param.h>
43 #include <sys/policy.h>
44 #include <sys/sdt.h>
45 #include <sys/stat.h>
46 #include <sys/stream.h>
47 #include <sys/stropts.h>
48 #include <sys/strsun.h>
49 #include <sys/sunddi.h>
50 #include <sys/sysmacros.h>
51 #include <sys/systm.h>
52 #include <sys/time.h>
53 #include <sys/dlpi.h>
54 #include <sys/dls.h>
55 #include <sys/mac_ether.h>
56 #include <sys/mac_provider.h>
57 #include <sys/mac_client_priv.h>
58 #include <sys/mac_impl.h>
59 #include <sys/vlan.h>
60 #include <net/bridge.h>
61 #include <net/bridge_impl.h>
62 #include <net/trill.h>
63 
64 /*
65  * Locks and reference counts: object lifetime and design.
66  *
67  * bridge_mac_t
68  *   Bridge mac (snoop) instances are in bmac_list, which is protected by
69  *   bmac_rwlock.  They're allocated by bmac_alloc and freed by bridge_timer().
70  *   Every bridge_inst_t has a single bridge_mac_t, but when bridge_inst_t goes
71  *   away, the bridge_mac_t remains until either all of the users go away
72  *   (detected by a timer) or until the instance is picked up again by the same
73  *   bridge starting back up.
74  *
75  * bridge_inst_t
76  *   Bridge instances are in inst_list, which is protected by inst_lock.
77  *   They're allocated by inst_alloc() and freed by inst_free().  After
78  *   allocation, an instance is placed in inst_list, and the reference count is
79  *   incremented to represent this.  That reference is decremented when the
80  *   BIF_SHUTDOWN flag is set, and no new increments may occur.  When the last
81  *   reference is freed, the instance is removed from the list.
82  *
83  *   Bridge instances have lists of links and an AVL tree of forwarding
84  *   entries.  Each of these structures holds one reference on the bridge
85  *   instance.  These lists and tree are protected by bi_rwlock.
86  *
87  * bridge_stream_t
88  *   Bridge streams are allocated by stream_alloc() and freed by stream_free().
89  *   These streams are created when "bridged" opens /dev/bridgectl, and are
90  *   used to create new bridge instances (via BRIOC_NEWBRIDGE) and control the
91  *   links on the bridge.  When a stream closes, the bridge instance created is
92  *   destroyed.  There's at most one bridge instance for a given control
93  *   stream.
94  *
95  * bridge_link_t
96  *   Links are allocated by bridge_add_link() and freed by link_free().  The
97  *   bi_links list holds a reference to the link.  When the BLF_DELETED flag is
98  *   set, that reference is dropped.  The link isn't removed from the list
99  *   until the last reference drops.  Each forwarding entry that uses a given
100  *   link holds a reference, as does each thread transmitting a packet via the
101  *   link.  The MAC layer calls in via bridge_ref_cb() to hold a reference on
102  *   a link when transmitting.
103  *
104  *   It's important that once BLF_DELETED is set, there's no way for the
105  *   reference count to increase again.  If it can, then the link may be
106  *   double-freed.  The BLF_FREED flag is intended for use with assertions to
107  *   guard against this in testing.
108  *
109  * bridge_fwd_t
110  *   Bridge forwarding entries are allocated by bridge_recv_cb() and freed by
111  *   fwd_free().  The bi_fwd AVL tree holds one reference to the entry.  Unlike
112  *   other data structures, the reference is dropped when the entry is removed
113  *   from the tree by fwd_delete(), and the BFF_INTREE flag is removed.  Each
114  *   thread that's forwarding a packet to a known destination holds a reference
115  *   to a forwarding entry.
116  *
117  * TRILL notes:
118  *
119  *   The TRILL module does all of its I/O through bridging.  It uses references
120  *   on the bridge_inst_t and bridge_link_t structures, and has seven entry
121  *   points and four callbacks.  One entry point is for setting the callbacks
122  *   (bridge_trill_register_cb).  There are four entry points for taking bridge
123  *   and link references (bridge_trill_{br,ln}{ref,unref}).  The final two
124  *   entry points are for decapsulated packets from TRILL (bridge_trill_decaps)
125  *   that need to be bridged locally, and for TRILL-encapsulated output packets
126  *   (bridge_trill_output).
127  *
128  *   The four callbacks comprise two notification functions for bridges and
129  *   links being deleted, one function for raw received TRILL packets, and one
130  *   for bridge output to non-local TRILL destinations (tunnel entry).
131  */
132 
/*
 * Ethernet reserved multicast addresses for TRILL; used also in TRILL module.
 * Note that all_esadi_rbridges is file-local (static), while the other two
 * arrays have external linkage.
 */
const uint8_t all_isis_rbridges[] = ALL_ISIS_RBRIDGES;
static const uint8_t all_esadi_rbridges[] = ALL_ESADI_RBRIDGES;
const uint8_t bridge_group_address[] = BRIDGE_GROUP_ADDRESS;

/*
 * Name tables for the named kstats.  inst_kstats_list is handed to
 * kstat_setup() by bridge_create(); link_kstats_list is presumably the
 * per-link equivalent (its user is not in this part of the file).
 */
static const char *inst_kstats_list[] = { KSINST_NAMES };
static const char *link_kstats_list[] = { KSLINK_NAMES };
142 
/*
 * Named-kstat counter helpers.
 *
 * KREF(p, m, vn) names the 64-bit counter "vn" inside the kstat structure
 * member "m" of the object pointed to by "p"; KINCR/KDECR pre-increment and
 * pre-decrement that counter.  The pointer argument and the complete
 * expansion are parenthesized so that any pointer expression may be passed
 * and so that the result can be used inside a larger expression.
 *
 * The KIP and KLP variants select the bridge instance (bi_kstats) or bridge
 * link (bl_kstats) structure.  The KI and KL variants additionally assume
 * that a local variable named "bip" or "blp" is in scope at the point of use.
 */
#define	KREF(p, m, vn)	((p)->m.vn.value.ui64)
#define	KINCR(p, m, vn)	(++KREF(p, m, vn))
#define	KDECR(p, m, vn)	(--KREF(p, m, vn))

#define	KIPINCR(p, vn)	KINCR(p, bi_kstats, vn)
#define	KIPDECR(p, vn)	KDECR(p, bi_kstats, vn)
#define	KLPINCR(p, vn)	KINCR(p, bl_kstats, vn)

#define	KIINCR(vn)	KIPINCR(bip, vn)
#define	KIDECR(vn)	KIPDECR(bip, vn)
#define	KLINCR(vn)	KLPINCR(blp, vn)

/* Number of elements in a true array (not valid on a pointer) */
#define	Dim(x)		(sizeof (x) / sizeof (*(x)))

/* Amount of overhead added when encapsulating with VLAN headers */
#define	VLAN_INCR	(sizeof (struct ether_vlan_header) -	\
			sizeof (struct ether_header))
160 
/* Driver-wide state; bridge_dev_info is supplied to mac_register() */
static dev_info_t *bridge_dev_info;
static major_t bridge_major;
static ddi_taskq_t *bridge_taskq;

/*
 * These are the bridge instance management data structures.  The mutex lock
 * protects the list of bridge instances.  A reference count is then used on
 * each instance to determine when to free it.  We use mac_minor_hold() to
 * allocate minor_t values, which are used both for self-cloning /dev/net/
 * device nodes as well as client streams.  Minor node 0 is reserved for the
 * allocation control node.
 */
static list_t inst_list;
static kcondvar_t inst_cv;		/* Allows us to wait for shutdown */
static kmutex_t inst_lock;

/* List of bridge_mac_t (snoop) instances; bmac_rwlock also guards bm_inst */
static krwlock_t bmac_rwlock;
static list_t bmac_list;

/* Wait for taskq entries that use STREAMS */
static kcondvar_t stream_ref_cv;
static kmutex_t stream_ref_lock;

/*
 * Periodic timer state.  bmac_alloc() arms bridge_timer() with
 * bridge_scan_interval when the first entry is added to bmac_list.
 */
static timeout_id_t bridge_timerid;
static clock_t bridge_scan_interval;
static clock_t bridge_fwd_age;

static bridge_inst_t *bridge_find_name(const char *);
static void bridge_timer(void *);
static void bridge_unref(bridge_inst_t *);

/* All-zero Ethernet address; treated as "no address" by this file */
static const uint8_t zero_addr[ETHERADDRL] = { 0 };

/* Global TRILL linkage (the four callbacks described in the TRILL notes) */
static trill_recv_pkt_t trill_recv_fn;
static trill_encap_pkt_t trill_encap_fn;
static trill_br_dstr_t trill_brdstr_fn;
static trill_ln_dstr_t trill_lndstr_fn;
199 
/* special settings to accommodate DLD flow control; see dld_str.c */
static struct module_info bridge_dld_modinfo = {
	0,			/* mi_idnum */
	"bridge",		/* mi_idname */
	0,			/* mi_minpsz */
	INFPSZ,			/* mi_maxpsz */
	1,			/* mi_hiwat */
	0			/* mi_lowat */
};

/*
 * Read-side queue initialization: no put or service procedure; open and
 * close are delegated to DLD.
 */
static struct qinit bridge_dld_rinit = {
	NULL,			/* qi_putp */
	NULL,			/* qi_srvp */
	dld_open,		/* qi_qopen */
	dld_close,		/* qi_qclose */
	NULL,			/* qi_qadmin */
	&bridge_dld_modinfo,	/* qi_minfo */
	NULL			/* qi_mstat */
};

/*
 * Write-side queue initialization: put and service processing are delegated
 * to DLD; open/close are handled on the read side only.
 */
static struct qinit bridge_dld_winit = {
	(int (*)())dld_wput,	/* qi_putp */
	(int (*)())dld_wsrv,	/* qi_srvp */
	NULL,			/* qi_qopen */
	NULL,			/* qi_qclose */
	NULL,			/* qi_qadmin */
	&bridge_dld_modinfo,	/* qi_minfo */
	NULL			/* qi_mstat */
};
229 
static int bridge_ioc_listfwd(void *, intptr_t, int, cred_t *, int *);

/*
 * GLDv3 control ioctls used by Bridging.  The only entry is
 * BRIDGE_IOC_LISTFWD, which copies a bridge_listfwd_t in and out
 * (DLDCOPYINOUT) and is serviced by bridge_ioc_listfwd().
 */
static dld_ioc_info_t bridge_ioc_list[] = {
	{BRIDGE_IOC_LISTFWD, DLDCOPYINOUT, sizeof (bridge_listfwd_t),
	    bridge_ioc_listfwd, NULL},
};
237 
238 /*
239  * Given a bridge mac pointer, get a ref-held pointer to the corresponding
240  * bridge instance, if any.  We must hold the global bmac_rwlock so that
241  * bm_inst doesn't slide out from under us.
242  */
243 static bridge_inst_t *
244 mac_to_inst(const bridge_mac_t *bmp)
245 {
246 	bridge_inst_t *bip;
247 
248 	rw_enter(&bmac_rwlock, RW_READER);
249 	if ((bip = bmp->bm_inst) != NULL)
250 		atomic_inc_uint(&bip->bi_refs);
251 	rw_exit(&bmac_rwlock);
252 	return (bip);
253 }
254 
255 static void
256 link_sdu_fail(bridge_link_t *blp, boolean_t failed, mblk_t **mlist)
257 {
258 	mblk_t *mp;
259 	bridge_ctl_t *bcp;
260 	bridge_link_t *blcmp;
261 	bridge_inst_t *bip;
262 	bridge_mac_t *bmp;
263 
264 	if (failed) {
265 		if (blp->bl_flags & BLF_SDUFAIL)
266 			return;
267 		blp->bl_flags |= BLF_SDUFAIL;
268 	} else {
269 		if (!(blp->bl_flags & BLF_SDUFAIL))
270 			return;
271 		blp->bl_flags &= ~BLF_SDUFAIL;
272 	}
273 
274 	/*
275 	 * If this link is otherwise up, then check if there are any other
276 	 * non-failed non-down links.  If not, then we control the state of the
277 	 * whole bridge.
278 	 */
279 	bip = blp->bl_inst;
280 	bmp = bip->bi_mac;
281 	if (blp->bl_linkstate != LINK_STATE_DOWN) {
282 		for (blcmp = list_head(&bip->bi_links); blcmp != NULL;
283 		    blcmp = list_next(&bip->bi_links, blcmp)) {
284 			if (blp != blcmp &&
285 			    !(blcmp->bl_flags & (BLF_DELETED|BLF_SDUFAIL)) &&
286 			    blcmp->bl_linkstate != LINK_STATE_DOWN)
287 				break;
288 		}
289 		if (blcmp == NULL) {
290 			bmp->bm_linkstate = failed ? LINK_STATE_DOWN :
291 			    LINK_STATE_UP;
292 			mac_link_redo(bmp->bm_mh, bmp->bm_linkstate);
293 		}
294 	}
295 
296 	/*
297 	 * If we're becoming failed, then the link's current true state needs
298 	 * to be reflected upwards to this link's clients.  If we're becoming
299 	 * unfailed, then we get the state of the bridge instead on all
300 	 * clients.
301 	 */
302 	if (failed) {
303 		if (bmp->bm_linkstate != blp->bl_linkstate)
304 			mac_link_redo(blp->bl_mh, blp->bl_linkstate);
305 	} else {
306 		mac_link_redo(blp->bl_mh, bmp->bm_linkstate);
307 	}
308 
309 	/* get the current mblk we're going to send up */
310 	if ((mp = blp->bl_lfailmp) == NULL &&
311 	    (mp = allocb(sizeof (bridge_ctl_t), BPRI_MED)) == NULL)
312 		return;
313 
314 	/* get a new one for next time */
315 	blp->bl_lfailmp = allocb(sizeof (bridge_ctl_t), BPRI_MED);
316 
317 	/* if none for next time, then report only failures */
318 	if (blp->bl_lfailmp == NULL && !failed) {
319 		blp->bl_lfailmp = mp;
320 		return;
321 	}
322 
323 	/* LINTED: alignment */
324 	bcp = (bridge_ctl_t *)mp->b_rptr;
325 	bcp->bc_linkid = blp->bl_linkid;
326 	bcp->bc_failed = failed;
327 	mp->b_wptr = (uchar_t *)(bcp + 1);
328 	mp->b_next = *mlist;
329 	*mlist = mp;
330 }
331 
332 /*
333  * Send control messages (link SDU changes) using the stream to the
334  * bridge instance daemon.
335  */
336 static void
337 send_up_messages(bridge_inst_t *bip, mblk_t *mp)
338 {
339 	mblk_t *mnext;
340 	queue_t *rq;
341 
342 	rq = bip->bi_control->bs_wq;
343 	rq = OTHERQ(rq);
344 	while (mp != NULL) {
345 		mnext = mp->b_next;
346 		mp->b_next = NULL;
347 		putnext(rq, mp);
348 		mp = mnext;
349 	}
350 }
351 
352 /* ARGSUSED */
353 static int
354 bridge_m_getstat(void *arg, uint_t stat, uint64_t *val)
355 {
356 	return (ENOTSUP);
357 }
358 
359 static int
360 bridge_m_start(void *arg)
361 {
362 	bridge_mac_t *bmp = arg;
363 
364 	bmp->bm_flags |= BMF_STARTED;
365 	return (0);
366 }
367 
368 static void
369 bridge_m_stop(void *arg)
370 {
371 	bridge_mac_t *bmp = arg;
372 
373 	bmp->bm_flags &= ~BMF_STARTED;
374 }
375 
376 /* ARGSUSED */
377 static int
378 bridge_m_setpromisc(void *arg, boolean_t on)
379 {
380 	return (0);
381 }
382 
383 /* ARGSUSED */
384 static int
385 bridge_m_multicst(void *arg, boolean_t add, const uint8_t *mca)
386 {
387 	return (0);
388 }
389 
390 /* ARGSUSED */
391 static int
392 bridge_m_unicst(void *arg, const uint8_t *macaddr)
393 {
394 	return (ENOTSUP);
395 }
396 
397 static mblk_t *
398 bridge_m_tx(void *arg, mblk_t *mp)
399 {
400 	_NOTE(ARGUNUSED(arg));
401 	freemsgchain(mp);
402 	return (NULL);
403 }
404 
405 /* ARGSUSED */
406 static int
407 bridge_ioc_listfwd(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
408 {
409 	bridge_listfwd_t *blf = karg;
410 	bridge_inst_t *bip;
411 	bridge_fwd_t *bfp, match;
412 	avl_index_t where;
413 
414 	bip = bridge_find_name(blf->blf_name);
415 	if (bip == NULL)
416 		return (ENOENT);
417 
418 	bcopy(blf->blf_dest, match.bf_dest, ETHERADDRL);
419 	match.bf_flags |= BFF_VLANLOCAL;
420 	rw_enter(&bip->bi_rwlock, RW_READER);
421 	if ((bfp = avl_find(&bip->bi_fwd, &match, &where)) == NULL)
422 		bfp = avl_nearest(&bip->bi_fwd, where, AVL_AFTER);
423 	else
424 		bfp = AVL_NEXT(&bip->bi_fwd, bfp);
425 	if (bfp == NULL) {
426 		bzero(blf, sizeof (*blf));
427 	} else {
428 		bcopy(bfp->bf_dest, blf->blf_dest, ETHERADDRL);
429 		blf->blf_trill_nick = bfp->bf_trill_nick;
430 		blf->blf_ms_age =
431 		    drv_hztousec(lbolt - bfp->bf_lastheard) / 1000;
432 		blf->blf_is_local =
433 		    (bfp->bf_flags & BFF_LOCALADDR) != 0;
434 		blf->blf_linkid = bfp->bf_links[0]->bl_linkid;
435 	}
436 	rw_exit(&bip->bi_rwlock);
437 	bridge_unref(bip);
438 	return (0);
439 }
440 
441 static int
442 bridge_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
443     uint_t pr_valsize, const void *pr_val)
444 {
445 	bridge_mac_t *bmp = arg;
446 	bridge_inst_t *bip;
447 	bridge_link_t *blp;
448 	int err;
449 	uint_t maxsdu;
450 	mblk_t *mlist;
451 
452 	_NOTE(ARGUNUSED(pr_name));
453 	switch (pr_num) {
454 	case MAC_PROP_MTU:
455 		if (pr_valsize < sizeof (bmp->bm_maxsdu)) {
456 			err = EINVAL;
457 			break;
458 		}
459 		(void) bcopy(pr_val, &maxsdu, sizeof (maxsdu));
460 		if (maxsdu == bmp->bm_maxsdu) {
461 			err = 0;
462 		} else if ((bip = mac_to_inst(bmp)) == NULL) {
463 			err = ENXIO;
464 		} else {
465 			rw_enter(&bip->bi_rwlock, RW_WRITER);
466 			mlist = NULL;
467 			for (blp = list_head(&bip->bi_links); blp != NULL;
468 			    blp = list_next(&bip->bi_links, blp)) {
469 				if (blp->bl_flags & BLF_DELETED)
470 					continue;
471 				if (blp->bl_maxsdu == maxsdu)
472 					link_sdu_fail(blp, B_FALSE, &mlist);
473 				else if (blp->bl_maxsdu == bmp->bm_maxsdu)
474 					link_sdu_fail(blp, B_TRUE, &mlist);
475 			}
476 			rw_exit(&bip->bi_rwlock);
477 			bmp->bm_maxsdu = maxsdu;
478 			(void) mac_maxsdu_update(bmp->bm_mh, maxsdu);
479 			send_up_messages(bip, mlist);
480 			bridge_unref(bip);
481 			err = 0;
482 		}
483 		break;
484 
485 	default:
486 		err = ENOTSUP;
487 		break;
488 	}
489 	return (err);
490 }
491 
492 static int
493 bridge_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
494     uint_t pr_flags, uint_t pr_valsize, void *pr_val, uint_t *perm)
495 {
496 	bridge_mac_t *bmp = arg;
497 	int err = 0;
498 
499 	_NOTE(ARGUNUSED(pr_name));
500 	switch (pr_num) {
501 	case MAC_PROP_MTU: {
502 		mac_propval_range_t range;
503 
504 		if (!(pr_flags & MAC_PROP_POSSIBLE))
505 			return (ENOTSUP);
506 		if (pr_valsize < sizeof (mac_propval_range_t))
507 			return (EINVAL);
508 		range.mpr_count = 1;
509 		range.mpr_type = MAC_PROPVAL_UINT32;
510 		range.range_uint32[0].mpur_min =
511 		    range.range_uint32[0].mpur_max = bmp->bm_maxsdu;
512 		bcopy(&range, pr_val, sizeof (range));
513 		*perm = MAC_PROP_PERM_RW;
514 		break;
515 	}
516 	case MAC_PROP_STATUS:
517 		if (pr_valsize < sizeof (bmp->bm_linkstate)) {
518 			err = EINVAL;
519 		} else {
520 			bcopy(&bmp->bm_linkstate, pr_val,
521 			    sizeof (&bmp->bm_linkstate));
522 			*perm = MAC_PROP_PERM_READ;
523 		}
524 		break;
525 
526 	default:
527 		err = ENOTSUP;
528 		break;
529 	}
530 	return (err);
531 }
532 
/*
 * MAC provider callback vector for the bridge (snoop) mac; bmac_alloc()
 * passes it to mac_register().  The first member lists which optional
 * callbacks are supplied -- here the set/get property handlers.
 */
static mac_callbacks_t bridge_m_callbacks = {
	MC_SETPROP | MC_GETPROP,
	bridge_m_getstat,
	bridge_m_start,
	bridge_m_stop,
	bridge_m_setpromisc,
	bridge_m_multicst,
	bridge_m_unicst,
	bridge_m_tx,
	NULL,	/* ioctl */
	NULL,	/* getcapab */
	NULL,	/* open */
	NULL,	/* close */
	bridge_m_setprop,
	bridge_m_getprop
};
549 
550 /*
551  * Create kstats from a list.
552  */
553 static kstat_t *
554 kstat_setup(kstat_named_t *knt, const char **names, int nstat,
555     const char *unitname)
556 {
557 	kstat_t *ksp;
558 	int i;
559 
560 	for (i = 0; i < nstat; i++)
561 		kstat_named_init(&knt[i], names[i], KSTAT_DATA_UINT64);
562 
563 	ksp = kstat_create_zone("bridge", 0, unitname, "net",
564 	    KSTAT_TYPE_NAMED, nstat, KSTAT_FLAG_VIRTUAL, GLOBAL_ZONEID);
565 	if (ksp != NULL) {
566 		ksp->ks_data = knt;
567 		kstat_install(ksp);
568 	}
569 	return (ksp);
570 }
571 
572 /*
573  * Find an existing bridge_mac_t structure or allocate a new one for the given
574  * bridge instance.  This creates the mac driver instance that snoop can use.
575  */
576 static int
577 bmac_alloc(bridge_inst_t *bip, bridge_mac_t **bmacp)
578 {
579 	bridge_mac_t *bmp, *bnew;
580 	mac_register_t *mac;
581 	int err;
582 
583 	*bmacp = NULL;
584 	if ((mac = mac_alloc(MAC_VERSION)) == NULL)
585 		return (EINVAL);
586 
587 	bnew = kmem_zalloc(sizeof (*bnew), KM_SLEEP);
588 
589 	rw_enter(&bmac_rwlock, RW_WRITER);
590 	for (bmp = list_head(&bmac_list); bmp != NULL;
591 	    bmp = list_next(&bmac_list, bmp)) {
592 		if (strcmp(bip->bi_name, bmp->bm_name) == 0) {
593 			ASSERT(bmp->bm_inst == NULL);
594 			bmp->bm_inst = bip;
595 			rw_exit(&bmac_rwlock);
596 			kmem_free(bnew, sizeof (*bnew));
597 			mac_free(mac);
598 			*bmacp = bmp;
599 			return (0);
600 		}
601 	}
602 
603 	mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
604 	mac->m_driver = bnew;
605 	mac->m_dip = bridge_dev_info;
606 	mac->m_instance = (uint_t)-1;
607 	mac->m_src_addr = (uint8_t *)zero_addr;
608 	mac->m_callbacks = &bridge_m_callbacks;
609 
610 	/*
611 	 * Note that the SDU limits are irrelevant, as nobody transmits on the
612 	 * bridge node itself.  It's mainly for monitoring but we allow
613 	 * setting the bridge MTU for quick transition of all links part of the
614 	 * bridge to a new MTU.
615 	 */
616 	mac->m_min_sdu = 1;
617 	mac->m_max_sdu = 1500;
618 	err = mac_register(mac, &bnew->bm_mh);
619 	mac_free(mac);
620 	if (err != 0) {
621 		rw_exit(&bmac_rwlock);
622 		kmem_free(bnew, sizeof (*bnew));
623 		return (err);
624 	}
625 
626 	bnew->bm_inst = bip;
627 	(void) strcpy(bnew->bm_name, bip->bi_name);
628 	if (list_is_empty(&bmac_list)) {
629 		bridge_timerid = timeout(bridge_timer, NULL,
630 		    bridge_scan_interval);
631 	}
632 	list_insert_tail(&bmac_list, bnew);
633 	rw_exit(&bmac_rwlock);
634 
635 	/*
636 	 * Mark the MAC as unable to go "active" so that only passive clients
637 	 * (such as snoop) can bind to it.
638 	 */
639 	mac_no_active(bnew->bm_mh);
640 	*bmacp = bnew;
641 	return (0);
642 }
643 
644 /*
645  * Disconnect the given bridge_mac_t from its bridge instance.  The bridge
646  * instance is going away.  The mac instance can't go away until the clients
647  * are gone (see bridge_timer).
648  */
649 static void
650 bmac_disconnect(bridge_mac_t *bmp)
651 {
652 	bridge_inst_t *bip;
653 
654 	bmp->bm_linkstate = LINK_STATE_DOWN;
655 	mac_link_redo(bmp->bm_mh, LINK_STATE_DOWN);
656 
657 	rw_enter(&bmac_rwlock, RW_READER);
658 	bip = bmp->bm_inst;
659 	bip->bi_mac = NULL;
660 	bmp->bm_inst = NULL;
661 	rw_exit(&bmac_rwlock);
662 }
663 
664 /* This is used by the avl trees to sort forwarding table entries */
665 static int
666 fwd_compare(const void *addr1, const void *addr2)
667 {
668 	const bridge_fwd_t *fwd1 = addr1;
669 	const bridge_fwd_t *fwd2 = addr2;
670 	int diff = memcmp(fwd1->bf_dest, fwd2->bf_dest, ETHERADDRL);
671 
672 	if (diff != 0)
673 		return (diff > 0 ? 1 : -1);
674 
675 	if ((fwd1->bf_flags ^ fwd2->bf_flags) & BFF_VLANLOCAL) {
676 		if (fwd1->bf_vlanid > fwd2->bf_vlanid)
677 			return (1);
678 		else if (fwd1->bf_vlanid < fwd2->bf_vlanid)
679 			return (-1);
680 	}
681 	return (0);
682 }
683 
684 static void
685 inst_free(bridge_inst_t *bip)
686 {
687 	ASSERT(bip->bi_mac == NULL);
688 	rw_destroy(&bip->bi_rwlock);
689 	list_destroy(&bip->bi_links);
690 	cv_destroy(&bip->bi_linkwait);
691 	avl_destroy(&bip->bi_fwd);
692 	if (bip->bi_ksp != NULL)
693 		kstat_delete(bip->bi_ksp);
694 	kmem_free(bip, sizeof (*bip));
695 }
696 
697 static bridge_inst_t *
698 inst_alloc(const char *bridge)
699 {
700 	bridge_inst_t *bip;
701 
702 	bip = kmem_zalloc(sizeof (*bip), KM_SLEEP);
703 	bip->bi_refs = 1;
704 	(void) strcpy(bip->bi_name, bridge);
705 	rw_init(&bip->bi_rwlock, NULL, RW_DRIVER, NULL);
706 	list_create(&bip->bi_links, sizeof (bridge_link_t),
707 	    offsetof(bridge_link_t, bl_node));
708 	cv_init(&bip->bi_linkwait, NULL, CV_DRIVER, NULL);
709 	avl_create(&bip->bi_fwd, fwd_compare, sizeof (bridge_fwd_t),
710 	    offsetof(bridge_fwd_t, bf_node));
711 	return (bip);
712 }
713 
714 static bridge_inst_t *
715 bridge_find_name(const char *bridge)
716 {
717 	bridge_inst_t *bip;
718 
719 	mutex_enter(&inst_lock);
720 	for (bip = list_head(&inst_list); bip != NULL;
721 	    bip = list_next(&inst_list, bip)) {
722 		if (!(bip->bi_flags & BIF_SHUTDOWN) &&
723 		    strcmp(bridge, bip->bi_name) == 0) {
724 			atomic_inc_uint(&bip->bi_refs);
725 			break;
726 		}
727 	}
728 	mutex_exit(&inst_lock);
729 
730 	return (bip);
731 }
732 
733 static int
734 bridge_create(datalink_id_t linkid, const char *bridge, bridge_inst_t **bipc)
735 {
736 	bridge_inst_t *bip, *bipnew;
737 	bridge_mac_t *bmp = NULL;
738 	int err;
739 
740 	*bipc = NULL;
741 	bipnew = inst_alloc(bridge);
742 
743 	mutex_enter(&inst_lock);
744 lookup_retry:
745 	for (bip = list_head(&inst_list); bip != NULL;
746 	    bip = list_next(&inst_list, bip)) {
747 		if (strcmp(bridge, bip->bi_name) == 0)
748 			break;
749 	}
750 
751 	/* This should not take long; if it does, we've got a design problem */
752 	if (bip != NULL && (bip->bi_flags & BIF_SHUTDOWN)) {
753 		cv_wait(&inst_cv, &inst_lock);
754 		goto lookup_retry;
755 	}
756 
757 	if (bip != NULL) {
758 		/* We weren't expecting to find anything */
759 		bip = NULL;
760 		err = EEXIST;
761 	} else {
762 		bip = bipnew;
763 		bipnew = NULL;
764 		list_insert_tail(&inst_list, bip);
765 	}
766 
767 	mutex_exit(&inst_lock);
768 	if (bip == NULL)
769 		goto fail;
770 
771 	bip->bi_ksp = kstat_setup((kstat_named_t *)&bip->bi_kstats,
772 	    inst_kstats_list, Dim(inst_kstats_list), bip->bi_name);
773 
774 	err = bmac_alloc(bip, &bmp);
775 	if ((bip->bi_mac = bmp) == NULL)
776 		goto fail_create;
777 
778 	/*
779 	 * bm_inst is set, so the timer cannot yank the DLS rug from under us.
780 	 * No extra locking is needed here.
781 	 */
782 	if (!(bmp->bm_flags & BMF_DLS)) {
783 		if ((err = dls_devnet_create(bmp->bm_mh, linkid)) != 0)
784 			goto fail_create;
785 		bmp->bm_flags |= BMF_DLS;
786 	}
787 
788 	bip->bi_dev = makedevice(bridge_major, mac_minor(bmp->bm_mh));
789 	*bipc = bip;
790 	return (0);
791 
792 fail_create:
793 	if (bmp != NULL)
794 		bmac_disconnect(bip->bi_mac);
795 	bipnew = bip;
796 fail:
797 	ASSERT(bipnew->bi_trilldata == NULL);
798 	bipnew->bi_flags |= BIF_SHUTDOWN;
799 	inst_free(bipnew);
800 	return (err);
801 }
802 
803 static void
804 bridge_unref(bridge_inst_t *bip)
805 {
806 	if (atomic_dec_uint_nv(&bip->bi_refs) == 0) {
807 		ASSERT(bip->bi_flags & BIF_SHUTDOWN);
808 		/* free up mac for reuse before leaving global list */
809 		if (bip->bi_mac != NULL)
810 			bmac_disconnect(bip->bi_mac);
811 		mutex_enter(&inst_lock);
812 		list_remove(&inst_list, bip);
813 		cv_broadcast(&inst_cv);
814 		mutex_exit(&inst_lock);
815 		inst_free(bip);
816 	}
817 }
818 
819 /*
820  * Stream instances are used only for allocating bridges and serving as a
821  * control node.  They serve no data-handling function.
822  */
823 static bridge_stream_t *
824 stream_alloc(void)
825 {
826 	bridge_stream_t *bsp;
827 	minor_t mn;
828 
829 	if ((mn = mac_minor_hold(B_FALSE)) == 0)
830 		return (NULL);
831 	bsp = kmem_zalloc(sizeof (*bsp), KM_SLEEP);
832 	bsp->bs_minor = mn;
833 	return (bsp);
834 }
835 
836 static void
837 stream_free(bridge_stream_t *bsp)
838 {
839 	mac_minor_rele(bsp->bs_minor);
840 	kmem_free(bsp, sizeof (*bsp));
841 }
842 
843 /* Reference hold/release functions for STREAMS-related taskq */
844 static void
845 stream_ref(bridge_stream_t *bsp)
846 {
847 	mutex_enter(&stream_ref_lock);
848 	bsp->bs_taskq_cnt++;
849 	mutex_exit(&stream_ref_lock);
850 }
851 
852 static void
853 stream_unref(bridge_stream_t *bsp)
854 {
855 	mutex_enter(&stream_ref_lock);
856 	if (--bsp->bs_taskq_cnt == 0)
857 		cv_broadcast(&stream_ref_cv);
858 	mutex_exit(&stream_ref_lock);
859 }
860 
861 static void
862 link_free(bridge_link_t *blp)
863 {
864 	bridge_inst_t *bip = blp->bl_inst;
865 
866 	ASSERT(!(blp->bl_flags & BLF_FREED));
867 	blp->bl_flags |= BLF_FREED;
868 	if (blp->bl_ksp != NULL)
869 		kstat_delete(blp->bl_ksp);
870 	if (blp->bl_lfailmp != NULL)
871 		freeb(blp->bl_lfailmp);
872 	cv_destroy(&blp->bl_trillwait);
873 	mutex_destroy(&blp->bl_trilllock);
874 	kmem_free(blp, sizeof (*blp));
875 	/* Don't unreference the bridge until the MAC is closed */
876 	bridge_unref(bip);
877 }
878 
879 static void
880 link_unref(bridge_link_t *blp)
881 {
882 	if (atomic_dec_uint_nv(&blp->bl_refs) == 0) {
883 		bridge_inst_t *bip = blp->bl_inst;
884 
885 		ASSERT(blp->bl_flags & BLF_DELETED);
886 		rw_enter(&bip->bi_rwlock, RW_WRITER);
887 		list_remove(&bip->bi_links, blp);
888 		rw_exit(&bip->bi_rwlock);
889 		if (bip->bi_trilldata != NULL && list_is_empty(&bip->bi_links))
890 			cv_broadcast(&bip->bi_linkwait);
891 		link_free(blp);
892 	}
893 }
894 
895 static bridge_fwd_t *
896 fwd_alloc(const uint8_t *addr, uint_t nlinks, uint16_t nick)
897 {
898 	bridge_fwd_t *bfp;
899 
900 	bfp = kmem_zalloc(sizeof (*bfp) + (nlinks * sizeof (bridge_link_t *)),
901 	    KM_NOSLEEP);
902 	if (bfp != NULL) {
903 		bcopy(addr, bfp->bf_dest, ETHERADDRL);
904 		bfp->bf_lastheard = lbolt;
905 		bfp->bf_maxlinks = nlinks;
906 		bfp->bf_links = (bridge_link_t **)(bfp + 1);
907 		bfp->bf_trill_nick = nick;
908 	}
909 	return (bfp);
910 }
911 
912 static bridge_fwd_t *
913 fwd_find(bridge_inst_t *bip, const uint8_t *addr, uint16_t vlanid)
914 {
915 	bridge_fwd_t *bfp, *vbfp;
916 	bridge_fwd_t match;
917 
918 	bcopy(addr, match.bf_dest, ETHERADDRL);
919 	match.bf_flags = 0;
920 	rw_enter(&bip->bi_rwlock, RW_READER);
921 	if ((bfp = avl_find(&bip->bi_fwd, &match, NULL)) != NULL) {
922 		if (bfp->bf_vlanid != vlanid && bfp->bf_vcnt > 0) {
923 			match.bf_vlanid = vlanid;
924 			match.bf_flags = BFF_VLANLOCAL;
925 			vbfp = avl_find(&bip->bi_fwd, &match, NULL);
926 			if (vbfp != NULL)
927 				bfp = vbfp;
928 		}
929 		atomic_inc_uint(&bfp->bf_refs);
930 	}
931 	rw_exit(&bip->bi_rwlock);
932 	return (bfp);
933 }
934 
935 static void
936 fwd_free(bridge_fwd_t *bfp)
937 {
938 	uint_t i;
939 	bridge_inst_t *bip = bfp->bf_links[0]->bl_inst;
940 
941 	KIDECR(bki_count);
942 	for (i = 0; i < bfp->bf_nlinks; i++)
943 		link_unref(bfp->bf_links[i]);
944 	kmem_free(bfp,
945 	    sizeof (*bfp) + bfp->bf_maxlinks * sizeof (bridge_link_t *));
946 }
947 
948 static void
949 fwd_unref(bridge_fwd_t *bfp)
950 {
951 	if (atomic_dec_uint_nv(&bfp->bf_refs) == 0) {
952 		ASSERT(!(bfp->bf_flags & BFF_INTREE));
953 		fwd_free(bfp);
954 	}
955 }
956 
957 static void
958 fwd_delete(bridge_fwd_t *bfp)
959 {
960 	bridge_inst_t *bip;
961 	bridge_fwd_t *bfpzero;
962 
963 	if (bfp->bf_flags & BFF_INTREE) {
964 		ASSERT(bfp->bf_nlinks > 0);
965 		bip = bfp->bf_links[0]->bl_inst;
966 		rw_enter(&bip->bi_rwlock, RW_WRITER);
967 		/* Another thread could beat us to this */
968 		if (bfp->bf_flags & BFF_INTREE) {
969 			avl_remove(&bip->bi_fwd, bfp);
970 			bfp->bf_flags &= ~BFF_INTREE;
971 			if (bfp->bf_flags & BFF_VLANLOCAL) {
972 				bfp->bf_flags &= ~BFF_VLANLOCAL;
973 				bfpzero = avl_find(&bip->bi_fwd, bfp, NULL);
974 				if (bfpzero != NULL && bfpzero->bf_vcnt > 0)
975 					bfpzero->bf_vcnt--;
976 			}
977 			rw_exit(&bip->bi_rwlock);
978 			fwd_unref(bfp);		/* no longer in avl tree */
979 		} else {
980 			rw_exit(&bip->bi_rwlock);
981 		}
982 	}
983 }
984 
985 static boolean_t
986 fwd_insert(bridge_inst_t *bip, bridge_fwd_t *bfp)
987 {
988 	avl_index_t idx;
989 	boolean_t retv;
990 
991 	rw_enter(&bip->bi_rwlock, RW_WRITER);
992 	if (!(bip->bi_flags & BIF_SHUTDOWN) &&
993 	    avl_numnodes(&bip->bi_fwd) < bip->bi_tablemax &&
994 	    avl_find(&bip->bi_fwd, bfp, &idx) == NULL) {
995 		avl_insert(&bip->bi_fwd, bfp, idx);
996 		bfp->bf_flags |= BFF_INTREE;
997 		atomic_inc_uint(&bfp->bf_refs);	/* avl entry */
998 		retv = B_TRUE;
999 	} else {
1000 		retv = B_FALSE;
1001 	}
1002 	rw_exit(&bip->bi_rwlock);
1003 	return (retv);
1004 }
1005 
1006 static void
1007 fwd_update_local(bridge_link_t *blp, const uint8_t *oldaddr,
1008     const uint8_t *newaddr)
1009 {
1010 	bridge_inst_t *bip = blp->bl_inst;
1011 	bridge_fwd_t *bfp, *bfnew;
1012 	bridge_fwd_t match;
1013 	avl_index_t idx;
1014 	boolean_t drop_ref = B_FALSE;
1015 
1016 	if (bcmp(oldaddr, newaddr, ETHERADDRL) == 0)
1017 		return;
1018 
1019 	if (bcmp(oldaddr, zero_addr, ETHERADDRL) == 0)
1020 		goto no_old_addr;
1021 
1022 	/*
1023 	 * Find the previous entry, and remove our link from it.
1024 	 */
1025 	bcopy(oldaddr, match.bf_dest, ETHERADDRL);
1026 	rw_enter(&bip->bi_rwlock, RW_WRITER);
1027 	if ((bfp = avl_find(&bip->bi_fwd, &match, NULL)) != NULL) {
1028 		int i;
1029 
1030 		/*
1031 		 * See if we're in the list, and remove if so.
1032 		 */
1033 		for (i = 0; i < bfp->bf_nlinks; i++) {
1034 			if (bfp->bf_links[i] == blp) {
1035 				/*
1036 				 * We assume writes are atomic, so no special
1037 				 * MT handling is needed.  The list length is
1038 				 * decremented first, and then we remove
1039 				 * entries.
1040 				 */
1041 				bfp->bf_nlinks--;
1042 				for (; i < bfp->bf_nlinks; i++)
1043 					bfp->bf_links[i] = bfp->bf_links[i + 1];
1044 				drop_ref = B_TRUE;
1045 				break;
1046 			}
1047 		}
1048 		/* If no more links, then remove and free up */
1049 		if (bfp->bf_nlinks == 0) {
1050 			avl_remove(&bip->bi_fwd, bfp);
1051 			bfp->bf_flags &= ~BFF_INTREE;
1052 		} else {
1053 			bfp = NULL;
1054 		}
1055 	}
1056 	rw_exit(&bip->bi_rwlock);
1057 	if (bfp != NULL)
1058 		fwd_unref(bfp);		/* no longer in avl tree */
1059 
1060 	/*
1061 	 * Now get the new link address and add this link to the list.  The
1062 	 * list should be of length 1 unless the user has configured multiple
1063 	 * NICs with the same address.  (That's an incorrect configuration, but
1064 	 * we support it anyway.)
1065 	 */
1066 no_old_addr:
1067 	bfp = NULL;
1068 	if ((bip->bi_flags & BIF_SHUTDOWN) ||
1069 	    bcmp(newaddr, zero_addr, ETHERADDRL) == 0)
1070 		goto no_new_addr;
1071 
1072 	bcopy(newaddr, match.bf_dest, ETHERADDRL);
1073 	rw_enter(&bip->bi_rwlock, RW_WRITER);
1074 	if ((bfp = avl_find(&bip->bi_fwd, &match, &idx)) == NULL) {
1075 		bfnew = fwd_alloc(newaddr, 1, RBRIDGE_NICKNAME_NONE);
1076 		if (bfnew != NULL)
1077 			KIINCR(bki_count);
1078 	} else if (bfp->bf_nlinks < bfp->bf_maxlinks) {
1079 		/* special case: link fits in existing entry */
1080 		bfnew = bfp;
1081 	} else {
1082 		bfnew = fwd_alloc(newaddr, bfp->bf_nlinks + 1,
1083 		    RBRIDGE_NICKNAME_NONE);
1084 		if (bfnew != NULL) {
1085 			KIINCR(bki_count);
1086 			avl_remove(&bip->bi_fwd, bfp);
1087 			bfp->bf_flags &= ~BFF_INTREE;
1088 			bfnew->bf_nlinks = bfp->bf_nlinks;
1089 			bcopy(bfp->bf_links, bfnew->bf_links,
1090 			    bfp->bf_nlinks * sizeof (bfp));
1091 			/* reset the idx value due to removal above */
1092 			(void) avl_find(&bip->bi_fwd, &match, &idx);
1093 		}
1094 	}
1095 
1096 	if (bfnew != NULL) {
1097 		bfnew->bf_links[bfnew->bf_nlinks++] = blp;
1098 		if (drop_ref)
1099 			drop_ref = B_FALSE;
1100 		else
1101 			atomic_inc_uint(&blp->bl_refs);	/* bf_links entry */
1102 
1103 		if (bfnew != bfp) {
1104 			/* local addresses are not subject to table limits */
1105 			avl_insert(&bip->bi_fwd, bfnew, idx);
1106 			bfnew->bf_flags |= (BFF_INTREE | BFF_LOCALADDR);
1107 			atomic_inc_uint(&bfnew->bf_refs);	/* avl entry */
1108 		}
1109 	}
1110 	rw_exit(&bip->bi_rwlock);
1111 
1112 no_new_addr:
1113 	/*
1114 	 * If we found an existing entry and we replaced it with a new one,
1115 	 * then drop the table reference from the old one.  We removed it from
1116 	 * the AVL tree above.
1117 	 */
1118 	if (bfnew != NULL && bfp != NULL && bfnew != bfp)
1119 		fwd_unref(bfp);
1120 
1121 	/* Account for removed entry. */
1122 	if (drop_ref)
1123 		link_unref(blp);
1124 }
1125 
1126 static void
1127 bridge_new_unicst(bridge_link_t *blp)
1128 {
1129 	uint8_t new_mac[ETHERADDRL];
1130 
1131 	mac_unicast_primary_get(blp->bl_mh, new_mac);
1132 	fwd_update_local(blp, blp->bl_local_mac, new_mac);
1133 	bcopy(new_mac, blp->bl_local_mac, ETHERADDRL);
1134 }
1135 
1136 /*
1137  * We must shut down a link prior to freeing it, and doing that requires
1138  * blocking to wait for running MAC threads while holding a reference.  This is
1139  * run from a taskq to accomplish proper link shutdown followed by reference
1140  * drop.
1141  */
1142 static void
1143 link_shutdown(void *arg)
1144 {
1145 	bridge_link_t *blp = arg;
1146 	mac_handle_t mh = blp->bl_mh;
1147 	bridge_inst_t *bip;
1148 	bridge_fwd_t *bfp, *bfnext;
1149 	avl_tree_t fwd_scavenge;
1150 	int i;
1151 
1152 	/*
1153 	 * This link is being destroyed.  Notify TRILL now that it's no longer
1154 	 * possible to send packets.  Data packets may still arrive until TRILL
1155 	 * calls bridge_trill_lnunref.
1156 	 */
1157 	if (blp->bl_trilldata != NULL)
1158 		trill_lndstr_fn(blp->bl_trilldata, blp);
1159 
1160 	if (blp->bl_flags & BLF_PROM_ADDED)
1161 		(void) mac_promisc_remove(blp->bl_mphp);
1162 
1163 	if (blp->bl_flags & BLF_SET_BRIDGE)
1164 		mac_bridge_clear(mh, (mac_handle_t)blp);
1165 
1166 	if (blp->bl_flags & BLF_MARGIN_ADDED) {
1167 		(void) mac_notify_remove(blp->bl_mnh, B_TRUE);
1168 		(void) mac_margin_remove(mh, blp->bl_margin);
1169 	}
1170 
1171 	/* Tell the clients the real link state when we leave */
1172 	mac_link_redo(blp->bl_mh,
1173 	    mac_stat_get(blp->bl_mh, MAC_STAT_LOWLINK_STATE));
1174 
1175 	/* Destroy all of the forwarding entries related to this link */
1176 	avl_create(&fwd_scavenge, fwd_compare, sizeof (bridge_fwd_t),
1177 	    offsetof(bridge_fwd_t, bf_node));
1178 	bip = blp->bl_inst;
1179 	rw_enter(&bip->bi_rwlock, RW_WRITER);
1180 	bfnext = avl_first(&bip->bi_fwd);
1181 	while ((bfp = bfnext) != NULL) {
1182 		bfnext = AVL_NEXT(&bip->bi_fwd, bfp);
1183 		for (i = 0; i < bfp->bf_nlinks; i++) {
1184 			if (bfp->bf_links[i] == blp)
1185 				break;
1186 		}
1187 		if (i >= bfp->bf_nlinks)
1188 			continue;
1189 		if (bfp->bf_nlinks > 1) {
1190 			/* note that this can't be the last reference */
1191 			link_unref(blp);
1192 			bfp->bf_nlinks--;
1193 			for (; i < bfp->bf_nlinks; i++)
1194 				bfp->bf_links[i] = bfp->bf_links[i + 1];
1195 		} else {
1196 			ASSERT(bfp->bf_flags & BFF_INTREE);
1197 			avl_remove(&bip->bi_fwd, bfp);
1198 			bfp->bf_flags &= ~BFF_INTREE;
1199 			avl_add(&fwd_scavenge, bfp);
1200 		}
1201 	}
1202 	rw_exit(&bip->bi_rwlock);
1203 	bfnext = avl_first(&fwd_scavenge);
1204 	while ((bfp = bfnext) != NULL) {
1205 		bfnext = AVL_NEXT(&fwd_scavenge, bfp);
1206 		avl_remove(&fwd_scavenge, bfp);
1207 		fwd_unref(bfp);
1208 	}
1209 	avl_destroy(&fwd_scavenge);
1210 
1211 	if (blp->bl_flags & BLF_CLIENT_OPEN)
1212 		mac_client_close(blp->bl_mch, 0);
1213 
1214 	mac_close(mh);
1215 
1216 	/*
1217 	 * We are now completely removed from the active list, so drop the
1218 	 * reference (see bridge_add_link).
1219 	 */
1220 	link_unref(blp);
1221 }
1222 
1223 static void
1224 shutdown_inst(bridge_inst_t *bip)
1225 {
1226 	bridge_link_t *blp, *blnext;
1227 	bridge_fwd_t *bfp;
1228 
1229 	mutex_enter(&inst_lock);
1230 	if (bip->bi_flags & BIF_SHUTDOWN) {
1231 		mutex_exit(&inst_lock);
1232 		return;
1233 	}
1234 
1235 	/*
1236 	 * Once on the inst_list, the bridge instance must not leave that list
1237 	 * without having the shutdown flag set first.  When the shutdown flag
1238 	 * is set, we own the list reference, so we must drop it before
1239 	 * returning.
1240 	 */
1241 	bip->bi_flags |= BIF_SHUTDOWN;
1242 	mutex_exit(&inst_lock);
1243 
1244 	bip->bi_control = NULL;
1245 
1246 	rw_enter(&bip->bi_rwlock, RW_READER);
1247 	blnext = list_head(&bip->bi_links);
1248 	while ((blp = blnext) != NULL) {
1249 		blnext = list_next(&bip->bi_links, blp);
1250 		if (!(blp->bl_flags & BLF_DELETED)) {
1251 			blp->bl_flags |= BLF_DELETED;
1252 			(void) ddi_taskq_dispatch(bridge_taskq, link_shutdown,
1253 			    blp, DDI_SLEEP);
1254 		}
1255 	}
1256 	while ((bfp = avl_first(&bip->bi_fwd)) != NULL) {
1257 		atomic_inc_uint(&bfp->bf_refs);
1258 		rw_exit(&bip->bi_rwlock);
1259 		fwd_delete(bfp);
1260 		fwd_unref(bfp);
1261 		rw_enter(&bip->bi_rwlock, RW_READER);
1262 	}
1263 	rw_exit(&bip->bi_rwlock);
1264 
1265 	/*
1266 	 * This bridge is being destroyed.  Notify TRILL once all of the
1267 	 * links are all gone.
1268 	 */
1269 	mutex_enter(&inst_lock);
1270 	while (bip->bi_trilldata != NULL && !list_is_empty(&bip->bi_links))
1271 		cv_wait(&bip->bi_linkwait, &inst_lock);
1272 	mutex_exit(&inst_lock);
1273 	if (bip->bi_trilldata != NULL)
1274 		trill_brdstr_fn(bip->bi_trilldata, bip);
1275 
1276 	bridge_unref(bip);
1277 }
1278 
1279 /*
1280  * This is called once by the TRILL module when it starts up.  It just sets the
1281  * global TRILL callback function pointers -- data transmit/receive and bridge
1282  * and link destroy notification.  There's only one TRILL module, so only one
1283  * registration is needed.
1284  *
1285  * TRILL should call this function with NULL pointers before unloading.  It
1286  * must not do so before dropping all references to bridges and links.  We
1287  * assert that this is true on debug builds.
1288  */
1289 void
1290 bridge_trill_register_cb(trill_recv_pkt_t recv_fn, trill_encap_pkt_t encap_fn,
1291     trill_br_dstr_t brdstr_fn, trill_ln_dstr_t lndstr_fn)
1292 {
1293 #ifdef DEBUG
1294 	if (recv_fn == NULL && trill_recv_fn != NULL) {
1295 		bridge_inst_t *bip;
1296 		bridge_link_t *blp;
1297 
1298 		mutex_enter(&inst_lock);
1299 		for (bip = list_head(&inst_list); bip != NULL;
1300 		    bip = list_next(&inst_list, bip)) {
1301 			ASSERT(bip->bi_trilldata == NULL);
1302 			rw_enter(&bip->bi_rwlock, RW_READER);
1303 			for (blp = list_head(&bip->bi_links); blp != NULL;
1304 			    blp = list_next(&bip->bi_links, blp)) {
1305 				ASSERT(blp->bl_trilldata == NULL);
1306 			}
1307 			rw_exit(&bip->bi_rwlock);
1308 		}
1309 		mutex_exit(&inst_lock);
1310 	}
1311 #endif
1312 	trill_recv_fn = recv_fn;
1313 	trill_encap_fn = encap_fn;
1314 	trill_brdstr_fn = brdstr_fn;
1315 	trill_lndstr_fn = lndstr_fn;
1316 }
1317 
1318 /*
1319  * This registers the TRILL instance pointer with a bridge.  Before this
1320  * pointer is set, the forwarding, TRILL receive, and bridge destructor
1321  * functions won't be called.
1322  *
1323  * TRILL holds a reference on a bridge with this call.  It must free the
1324  * reference by calling the unregister function below.
1325  */
1326 bridge_inst_t *
1327 bridge_trill_brref(const char *bname, void *ptr)
1328 {
1329 	char bridge[MAXLINKNAMELEN];
1330 	bridge_inst_t *bip;
1331 
1332 	(void) snprintf(bridge, MAXLINKNAMELEN, "%s0", bname);
1333 	bip = bridge_find_name(bridge);
1334 	if (bip != NULL) {
1335 		ASSERT(bip->bi_trilldata == NULL && ptr != NULL);
1336 		bip->bi_trilldata = ptr;
1337 	}
1338 	return (bip);
1339 }
1340 
1341 void
1342 bridge_trill_brunref(bridge_inst_t *bip)
1343 {
1344 	ASSERT(bip->bi_trilldata != NULL);
1345 	bip->bi_trilldata = NULL;
1346 	bridge_unref(bip);
1347 }
1348 
1349 /*
1350  * TRILL calls this function when referencing a particular link on a bridge.
1351  *
1352  * It holds a reference on the link, so TRILL must clear out the reference when
1353  * it's done with the link (on unbinding).
1354  */
1355 bridge_link_t *
1356 bridge_trill_lnref(bridge_inst_t *bip, datalink_id_t linkid, void *ptr)
1357 {
1358 	bridge_link_t *blp;
1359 
1360 	ASSERT(ptr != NULL);
1361 	rw_enter(&bip->bi_rwlock, RW_READER);
1362 	for (blp = list_head(&bip->bi_links); blp != NULL;
1363 	    blp = list_next(&bip->bi_links, blp)) {
1364 		if (!(blp->bl_flags & BLF_DELETED) &&
1365 		    blp->bl_linkid == linkid && blp->bl_trilldata == NULL) {
1366 			blp->bl_trilldata = ptr;
1367 			blp->bl_flags &= ~BLF_TRILLACTIVE;
1368 			(void) memset(blp->bl_afs, 0, sizeof (blp->bl_afs));
1369 			atomic_inc_uint(&blp->bl_refs);
1370 			break;
1371 		}
1372 	}
1373 	rw_exit(&bip->bi_rwlock);
1374 	return (blp);
1375 }
1376 
1377 void
1378 bridge_trill_lnunref(bridge_link_t *blp)
1379 {
1380 	mutex_enter(&blp->bl_trilllock);
1381 	ASSERT(blp->bl_trilldata != NULL);
1382 	blp->bl_trilldata = NULL;
1383 	blp->bl_flags &= ~BLF_TRILLACTIVE;
1384 	while (blp->bl_trillthreads > 0)
1385 		cv_wait(&blp->bl_trillwait, &blp->bl_trilllock);
1386 	mutex_exit(&blp->bl_trilllock);
1387 	(void) memset(blp->bl_afs, 0xff, sizeof (blp->bl_afs));
1388 	link_unref(blp);
1389 }
1390 
1391 /*
1392  * This periodic timer performs three functions:
1393  *  1. It scans the list of learned forwarding entries, and removes ones that
1394  *     haven't been heard from in a while.  The time limit is backed down if
1395  *     we're above the configured table limit.
1396  *  2. It walks the links and decays away the bl_learns counter.
1397  *  3. It scans the observability node entries looking for ones that can be
1398  *     freed up.
1399  */
1400 /* ARGSUSED */
1401 static void
1402 bridge_timer(void *arg)
1403 {
1404 	bridge_inst_t *bip;
1405 	bridge_fwd_t *bfp, *bfnext;
1406 	bridge_mac_t *bmp, *bmnext;
1407 	bridge_link_t *blp;
1408 	int err;
1409 	datalink_id_t tmpid;
1410 	avl_tree_t fwd_scavenge;
1411 	clock_t age_limit;
1412 	uint32_t ldecay;
1413 
1414 	avl_create(&fwd_scavenge, fwd_compare, sizeof (bridge_fwd_t),
1415 	    offsetof(bridge_fwd_t, bf_node));
1416 	mutex_enter(&inst_lock);
1417 	for (bip = list_head(&inst_list); bip != NULL;
1418 	    bip = list_next(&inst_list, bip)) {
1419 		if (bip->bi_flags & BIF_SHUTDOWN)
1420 			continue;
1421 		rw_enter(&bip->bi_rwlock, RW_WRITER);
1422 		/* compute scaled maximum age based on table limit */
1423 		if (avl_numnodes(&bip->bi_fwd) > bip->bi_tablemax)
1424 			bip->bi_tshift++;
1425 		else
1426 			bip->bi_tshift = 0;
1427 		if ((age_limit = bridge_fwd_age >> bip->bi_tshift) == 0) {
1428 			if (bip->bi_tshift != 0)
1429 				bip->bi_tshift--;
1430 			age_limit = 1;
1431 		}
1432 		bfnext = avl_first(&bip->bi_fwd);
1433 		while ((bfp = bfnext) != NULL) {
1434 			bfnext = AVL_NEXT(&bip->bi_fwd, bfp);
1435 			if (!(bfp->bf_flags & BFF_LOCALADDR) &&
1436 			    (lbolt - bfp->bf_lastheard) > age_limit) {
1437 				ASSERT(bfp->bf_flags & BFF_INTREE);
1438 				avl_remove(&bip->bi_fwd, bfp);
1439 				bfp->bf_flags &= ~BFF_INTREE;
1440 				avl_add(&fwd_scavenge, bfp);
1441 			}
1442 		}
1443 		for (blp = list_head(&bip->bi_links); blp != NULL;
1444 		    blp = list_next(&bip->bi_links, blp)) {
1445 			ldecay = mac_get_ldecay(blp->bl_mh);
1446 			if (ldecay >= blp->bl_learns)
1447 				blp->bl_learns = 0;
1448 			else
1449 				atomic_add_int(&blp->bl_learns, -(int)ldecay);
1450 		}
1451 		rw_exit(&bip->bi_rwlock);
1452 		bfnext = avl_first(&fwd_scavenge);
1453 		while ((bfp = bfnext) != NULL) {
1454 			bfnext = AVL_NEXT(&fwd_scavenge, bfp);
1455 			avl_remove(&fwd_scavenge, bfp);
1456 			KIINCR(bki_expire);
1457 			fwd_unref(bfp);	/* drop tree reference */
1458 		}
1459 	}
1460 	mutex_exit(&inst_lock);
1461 	avl_destroy(&fwd_scavenge);
1462 
1463 	/*
1464 	 * Scan the bridge_mac_t entries and try to free up the ones that are
1465 	 * no longer active.  This must be done by polling, as neither DLS nor
1466 	 * MAC provides a driver any sort of positive control over clients.
1467 	 */
1468 	rw_enter(&bmac_rwlock, RW_WRITER);
1469 	bmnext = list_head(&bmac_list);
1470 	while ((bmp = bmnext) != NULL) {
1471 		bmnext = list_next(&bmac_list, bmp);
1472 
1473 		/* ignore active bridges */
1474 		if (bmp->bm_inst != NULL)
1475 			continue;
1476 
1477 		if (bmp->bm_flags & BMF_DLS) {
1478 			err = dls_devnet_destroy(bmp->bm_mh, &tmpid, B_FALSE);
1479 			ASSERT(err == 0 || err == EBUSY);
1480 			if (err == 0)
1481 				bmp->bm_flags &= ~BMF_DLS;
1482 		}
1483 
1484 		if (!(bmp->bm_flags & BMF_DLS)) {
1485 			err = mac_unregister(bmp->bm_mh);
1486 			ASSERT(err == 0 || err == EBUSY);
1487 			if (err == 0) {
1488 				list_remove(&bmac_list, bmp);
1489 				kmem_free(bmp, sizeof (*bmp));
1490 			}
1491 		}
1492 	}
1493 	if (list_is_empty(&bmac_list)) {
1494 		bridge_timerid = 0;
1495 	} else {
1496 		bridge_timerid = timeout(bridge_timer, NULL,
1497 		    bridge_scan_interval);
1498 	}
1499 	rw_exit(&bmac_rwlock);
1500 }
1501 
1502 static int
1503 bridge_open(queue_t *rq, dev_t *devp, int oflag, int sflag, cred_t *credp)
1504 {
1505 	bridge_stream_t	*bsp;
1506 
1507 	if (rq->q_ptr != NULL)
1508 		return (0);
1509 
1510 	if (sflag & MODOPEN)
1511 		return (EINVAL);
1512 
1513 	/*
1514 	 * Check the minor node number being opened.  This tells us which
1515 	 * bridge instance the user wants.
1516 	 */
1517 	if (getminor(*devp) != 0) {
1518 		/*
1519 		 * This is a regular DLPI stream for snoop or the like.
1520 		 * Redirect it through DLD.
1521 		 */
1522 		rq->q_qinfo = &bridge_dld_rinit;
1523 		OTHERQ(rq)->q_qinfo = &bridge_dld_winit;
1524 		return (dld_open(rq, devp, oflag, sflag, credp));
1525 	} else {
1526 		/*
1527 		 * Allocate the bridge control stream structure.
1528 		 */
1529 		if ((bsp = stream_alloc()) == NULL)
1530 			return (ENOSR);
1531 		rq->q_ptr = WR(rq)->q_ptr = (caddr_t)bsp;
1532 		bsp->bs_wq = WR(rq);
1533 		*devp = makedevice(getmajor(*devp), bsp->bs_minor);
1534 		qprocson(rq);
1535 		return (0);
1536 	}
1537 }
1538 
1539 /*
1540  * This is used only for bridge control streams.  DLPI goes through dld
1541  * instead.
1542  */
1543 static int
1544 bridge_close(queue_t *rq)
1545 {
1546 	bridge_stream_t	*bsp = rq->q_ptr;
1547 	bridge_inst_t *bip;
1548 
1549 	/*
1550 	 * Wait for any stray taskq (add/delete link) entries related to this
1551 	 * stream to leave the system.
1552 	 */
1553 	mutex_enter(&stream_ref_lock);
1554 	while (bsp->bs_taskq_cnt != 0)
1555 		cv_wait(&stream_ref_cv, &stream_ref_lock);
1556 	mutex_exit(&stream_ref_lock);
1557 
1558 	qprocsoff(rq);
1559 	if ((bip = bsp->bs_inst) != NULL)
1560 		shutdown_inst(bip);
1561 	rq->q_ptr = WR(rq)->q_ptr = NULL;
1562 	stream_free(bsp);
1563 	if (bip != NULL)
1564 		bridge_unref(bip);
1565 
1566 	return (0);
1567 }
1568 
1569 static void
1570 bridge_learn(bridge_link_t *blp, const uint8_t *saddr, uint16_t ingress_nick,
1571     uint16_t vlanid)
1572 {
1573 	bridge_inst_t *bip = blp->bl_inst;
1574 	bridge_fwd_t *bfp, *bfpnew;
1575 	int i;
1576 	boolean_t replaced = B_FALSE;
1577 
1578 	/* Ignore multi-destination address used as source; it's nonsense. */
1579 	if (*saddr & 1)
1580 		return;
1581 
1582 	/*
1583 	 * If the source is known, then check whether it belongs on this link.
1584 	 * If not, and this isn't a fixed local address, then we've detected a
1585 	 * move.  If it's not known, learn it.
1586 	 */
1587 	if ((bfp = fwd_find(bip, saddr, vlanid)) != NULL) {
1588 		/*
1589 		 * If the packet has a fixed local source address, then there's
1590 		 * nothing we can learn.  We must quit.  If this was a received
1591 		 * packet, then the sender has stolen our address, but there's
1592 		 * nothing we can do.  If it's a transmitted packet, then
1593 		 * that's the normal case.
1594 		 */
1595 		if (bfp->bf_flags & BFF_LOCALADDR) {
1596 			fwd_unref(bfp);
1597 			return;
1598 		}
1599 
1600 		/*
1601 		 * Check if the link (and TRILL sender, if any) being used is
1602 		 * among the ones registered for this address.  If so, then
1603 		 * this is information that we already know.
1604 		 */
1605 		if (bfp->bf_trill_nick == ingress_nick) {
1606 			for (i = 0; i < bfp->bf_nlinks; i++) {
1607 				if (bfp->bf_links[i] == blp) {
1608 					bfp->bf_lastheard = lbolt;
1609 					fwd_unref(bfp);
1610 					return;
1611 				}
1612 			}
1613 		}
1614 	}
1615 
1616 	/*
1617 	 * Note that we intentionally "unlearn" things that appear to be under
1618 	 * attack on this link.  The forwarding cache is a negative thing for
1619 	 * security -- it disables reachability as a performance optimization
1620 	 * -- so leaving out entries optimizes for success and defends against
1621 	 * the attack.  Thus, the bare increment without a check in the delete
1622 	 * code above is right.  (And it's ok if we skid over the limit a
1623 	 * little, so there's no syncronization needed on the test.)
1624 	 */
1625 	if (blp->bl_learns >= mac_get_llimit(blp->bl_mh)) {
1626 		if (bfp != NULL) {
1627 			if (bfp->bf_vcnt == 0)
1628 				fwd_delete(bfp);
1629 			fwd_unref(bfp);
1630 		}
1631 		return;
1632 	}
1633 
1634 	atomic_inc_uint(&blp->bl_learns);
1635 
1636 	if ((bfpnew = fwd_alloc(saddr, 1, ingress_nick)) == NULL) {
1637 		if (bfp != NULL)
1638 			fwd_unref(bfp);
1639 		return;
1640 	}
1641 	KIINCR(bki_count);
1642 
1643 	if (bfp != NULL) {
1644 		/*
1645 		 * If this is a new destination for the same VLAN, then delete
1646 		 * so that we can update.  If it's a different VLAN, then we're
1647 		 * not going to delete the original.  Split off instead into an
1648 		 * IVL entry.
1649 		 */
1650 		if (bfp->bf_vlanid == vlanid) {
1651 			/* save the count of IVL duplicates */
1652 			bfpnew->bf_vcnt = bfp->bf_vcnt;
1653 
1654 			/* entry deletes count as learning events */
1655 			atomic_inc_uint(&blp->bl_learns);
1656 
1657 			/* destroy and create anew; node moved */
1658 			fwd_delete(bfp);
1659 			replaced = B_TRUE;
1660 			KIINCR(bki_moved);
1661 		} else {
1662 			bfp->bf_vcnt++;
1663 			bfpnew->bf_flags |= BFF_VLANLOCAL;
1664 		}
1665 		fwd_unref(bfp);
1666 	}
1667 	bfpnew->bf_links[0] = blp;
1668 	bfpnew->bf_nlinks = 1;
1669 	atomic_inc_uint(&blp->bl_refs);	/* bf_links entry */
1670 	if (!fwd_insert(bip, bfpnew))
1671 		fwd_free(bfpnew);
1672 	else if (!replaced)
1673 		KIINCR(bki_source);
1674 }
1675 
1676 /*
1677  * Process the VLAN headers for output on a given link.  There are several
1678  * cases (noting that we don't map VLANs):
1679  *   1. The input packet is good as it is; either
1680  *	a. It has no tag, and output has same PVID
1681  *	b. It has a non-zero priority-only tag for PVID, and b_band is same
1682  *	c. It has a tag with VLAN different from PVID, and b_band is same
1683  *   2. The tag must change: non-zero b_band is different from tag priority
1684  *   3. The packet has a tag and should not (VLAN same as PVID, b_band zero)
1685  *   4. The packet has no tag and needs one:
1686  *      a. VLAN ID same as PVID, but b_band is non-zero
1687  *      b. VLAN ID different from PVID
1688  * We exclude case 1 first, then modify the packet.  Note that output packets
1689  * get a priority set by the mblk, not by the header, because QoS in bridging
1690  * requires priority recalculation at each node.
1691  *
1692  * The passed-in tci is the "impossible" value 0xFFFF when no tag is present.
1693  */
1694 static mblk_t *
1695 reform_vlan_header(mblk_t *mp, uint16_t vlanid, uint16_t tci, uint16_t pvid)
1696 {
1697 	boolean_t source_has_tag = (tci != 0xFFFF);
1698 	mblk_t *mpcopy;
1699 	size_t mlen, minlen;
1700 	struct ether_vlan_header *evh;
1701 	int pri;
1702 
1703 	/* This helps centralize error handling in the caller. */
1704 	if (mp == NULL)
1705 		return (mp);
1706 
1707 	/* No forwarded packet can have hardware checksum enabled */
1708 	DB_CKSUMFLAGS(mp) = 0;
1709 
1710 	/* Get the no-modification cases out of the way first */
1711 	if (!source_has_tag && vlanid == pvid)		/* 1a */
1712 		return (mp);
1713 
1714 	pri = VLAN_PRI(tci);
1715 	if (source_has_tag && mp->b_band == pri) {
1716 		if (vlanid != pvid)			/* 1c */
1717 			return (mp);
1718 		if (pri != 0 && VLAN_ID(tci) == 0)	/* 1b */
1719 			return (mp);
1720 	}
1721 
1722 	/*
1723 	 * We now know that we must modify the packet.  Prepare for that.  Note
1724 	 * that if a tag is present, the caller has already done a pullup for
1725 	 * the VLAN header, so we're good to go.
1726 	 */
1727 	if (MBLKL(mp) < sizeof (struct ether_header)) {
1728 		mpcopy = msgpullup(mp, sizeof (struct ether_header));
1729 		if (mpcopy == NULL) {
1730 			freemsg(mp);
1731 			return (NULL);
1732 		}
1733 		mp = mpcopy;
1734 	}
1735 	if (DB_REF(mp) > 1 || !IS_P2ALIGNED(mp->b_rptr, sizeof (uint16_t)) ||
1736 	    (!source_has_tag && MBLKTAIL(mp) < VLAN_INCR)) {
1737 		minlen = mlen = MBLKL(mp);
1738 		if (!source_has_tag)
1739 			minlen += VLAN_INCR;
1740 		ASSERT(minlen >= sizeof (struct ether_vlan_header));
1741 		/*
1742 		 * We're willing to copy some data to avoid fragmentation, but
1743 		 * not a lot.
1744 		 */
1745 		if (minlen > 256)
1746 			minlen = sizeof (struct ether_vlan_header);
1747 		mpcopy = allocb(minlen, BPRI_MED);
1748 		if (mpcopy == NULL) {
1749 			freemsg(mp);
1750 			return (NULL);
1751 		}
1752 		if (mlen <= minlen) {
1753 			/* We toss the first mblk when we can. */
1754 			bcopy(mp->b_rptr, mpcopy->b_rptr, mlen);
1755 			mpcopy->b_wptr += mlen;
1756 			mpcopy->b_cont = mp->b_cont;
1757 			freeb(mp);
1758 		} else {
1759 			/* If not, then just copy what we need */
1760 			if (!source_has_tag)
1761 				minlen = sizeof (struct ether_header);
1762 			bcopy(mp->b_rptr, mpcopy->b_rptr, minlen);
1763 			mpcopy->b_wptr += minlen;
1764 			mpcopy->b_cont = mp;
1765 			mp->b_rptr += minlen;
1766 		}
1767 		mp = mpcopy;
1768 	}
1769 
1770 	/* LINTED: pointer alignment */
1771 	evh = (struct ether_vlan_header *)mp->b_rptr;
1772 	if (source_has_tag) {
1773 		if (mp->b_band == 0 && vlanid == pvid) {	/* 3 */
1774 			evh->ether_tpid = evh->ether_type;
1775 			mlen = MBLKL(mp);
1776 			if (mlen > sizeof (struct ether_vlan_header))
1777 				ovbcopy(mp->b_rptr +
1778 				    sizeof (struct ether_vlan_header),
1779 				    mp->b_rptr + sizeof (struct ether_header),
1780 				    mlen - sizeof (struct ether_vlan_header));
1781 			mp->b_wptr -= VLAN_INCR;
1782 		} else {					/* 2 */
1783 			if (vlanid == pvid)
1784 				vlanid = VLAN_ID_NONE;
1785 			tci = VLAN_TCI(mp->b_band, ETHER_CFI, vlanid);
1786 			evh->ether_tci = htons(tci);
1787 		}
1788 	} else {
1789 		/* case 4: no header present, but one is needed */
1790 		mlen = MBLKL(mp);
1791 		if (mlen > sizeof (struct ether_header))
1792 			ovbcopy(mp->b_rptr + sizeof (struct ether_header),
1793 			    mp->b_rptr + sizeof (struct ether_vlan_header),
1794 			    mlen - sizeof (struct ether_header));
1795 		mp->b_wptr += VLAN_INCR;
1796 		ASSERT(mp->b_wptr <= DB_LIM(mp));
1797 		if (vlanid == pvid)
1798 			vlanid = VLAN_ID_NONE;
1799 		tci = VLAN_TCI(mp->b_band, ETHER_CFI, vlanid);
1800 		evh->ether_type = evh->ether_tpid;
1801 		evh->ether_tpid = htons(ETHERTYPE_VLAN);
1802 		evh->ether_tci = htons(tci);
1803 	}
1804 	return (mp);
1805 }
1806 
1807 /* Record VLAN information and strip header if requested . */
1808 static void
1809 update_header(mblk_t *mp, mac_header_info_t *hdr_info, boolean_t striphdr)
1810 {
1811 	if (hdr_info->mhi_bindsap == ETHERTYPE_VLAN) {
1812 		struct ether_vlan_header *evhp;
1813 		uint16_t ether_type;
1814 
1815 		/* LINTED: alignment */
1816 		evhp = (struct ether_vlan_header *)mp->b_rptr;
1817 		hdr_info->mhi_istagged = B_TRUE;
1818 		hdr_info->mhi_tci = ntohs(evhp->ether_tci);
1819 		if (striphdr) {
1820 			/*
1821 			 * For VLAN tagged frames update the ether_type
1822 			 * in hdr_info before stripping the header.
1823 			 */
1824 			ether_type = ntohs(evhp->ether_type);
1825 			hdr_info->mhi_origsap = ether_type;
1826 			hdr_info->mhi_bindsap = (ether_type > ETHERMTU) ?
1827 			    ether_type : DLS_SAP_LLC;
1828 			mp->b_rptr = (uchar_t *)(evhp + 1);
1829 		}
1830 	} else {
1831 		hdr_info->mhi_istagged = B_FALSE;
1832 		hdr_info->mhi_tci = VLAN_ID_NONE;
1833 		if (striphdr)
1834 			mp->b_rptr += sizeof (struct ether_header);
1835 	}
1836 }
1837 
1838 /*
1839  * Return B_TRUE if we're allowed to send on this link with the given VLAN ID.
1840  */
1841 static boolean_t
1842 bridge_can_send(bridge_link_t *blp, uint16_t vlanid)
1843 {
1844 	ASSERT(vlanid != VLAN_ID_NONE);
1845 	if (blp->bl_flags & BLF_DELETED)
1846 		return (B_FALSE);
1847 	if (blp->bl_trilldata == NULL && blp->bl_state != BLS_FORWARDING)
1848 		return (B_FALSE);
1849 	return (BRIDGE_VLAN_ISSET(blp, vlanid) && BRIDGE_AF_ISSET(blp, vlanid));
1850 }
1851 
1852 /*
1853  * This function scans the bridge forwarding tables in order to forward a given
1854  * packet.  If the packet either doesn't need forwarding (the current link is
1855  * correct) or the current link needs a copy as well, then the packet is
1856  * returned to the caller.
1857  *
1858  * If a packet has been decapsulated from TRILL, then it must *NOT* reenter a
1859  * TRILL tunnel.  If the destination points there, then drop instead.
1860  */
1861 static mblk_t *
1862 bridge_forward(bridge_link_t *blp, mac_header_info_t *hdr_info, mblk_t *mp,
1863     uint16_t vlanid, uint16_t tci, boolean_t from_trill, boolean_t is_xmit)
1864 {
1865 	mblk_t *mpsend, *mpcopy;
1866 	bridge_inst_t *bip = blp->bl_inst;
1867 	bridge_link_t *blpsend, *blpnext;
1868 	bridge_fwd_t *bfp;
1869 	uint_t i;
1870 	boolean_t selfseen = B_FALSE;
1871 	void *tdp;
1872 	const uint8_t *daddr = hdr_info->mhi_daddr;
1873 
1874 	/*
1875 	 * Check for the IEEE "reserved" multicast addresses.  Messages sent to
1876 	 * these addresses are used for link-local control (STP and pause), and
1877 	 * are never forwarded or redirected.
1878 	 */
1879 	if (daddr[0] == 1 && daddr[1] == 0x80 && daddr[2] == 0xc2 &&
1880 	    daddr[3] == 0 && daddr[4] == 0 && (daddr[5] & 0xf0) == 0) {
1881 		if (from_trill) {
1882 			freemsg(mp);
1883 			mp = NULL;
1884 		}
1885 		return (mp);
1886 	}
1887 
1888 	if ((bfp = fwd_find(bip, daddr, vlanid)) != NULL) {
1889 
1890 		/*
1891 		 * If trill indicates a destination for this node, then it's
1892 		 * clearly not intended for local delivery.  We must tell TRILL
1893 		 * to encapsulate, as long as we didn't just decapsulate it.
1894 		 */
1895 		if (bfp->bf_trill_nick != RBRIDGE_NICKNAME_NONE) {
1896 			/*
1897 			 * Error case: can't reencapsulate if the protocols are
1898 			 * working correctly.
1899 			 */
1900 			if (from_trill) {
1901 				freemsg(mp);
1902 				return (NULL);
1903 			}
1904 			mutex_enter(&blp->bl_trilllock);
1905 			if ((tdp = blp->bl_trilldata) != NULL) {
1906 				blp->bl_trillthreads++;
1907 				mutex_exit(&blp->bl_trilllock);
1908 				update_header(mp, hdr_info, B_FALSE);
1909 				if (is_xmit)
1910 					mp = mac_fix_cksum(mp);
1911 				/* all trill data frames have Inner.VLAN */
1912 				mp = reform_vlan_header(mp, vlanid, tci, 0);
1913 				if (mp == NULL) {
1914 					KIINCR(bki_drops);
1915 					fwd_unref(bfp);
1916 					return (NULL);
1917 				}
1918 				trill_encap_fn(tdp, blp, hdr_info, mp,
1919 				    bfp->bf_trill_nick);
1920 				mutex_enter(&blp->bl_trilllock);
1921 				if (--blp->bl_trillthreads == 0 &&
1922 				    blp->bl_trilldata == NULL)
1923 					cv_broadcast(&blp->bl_trillwait);
1924 			}
1925 			mutex_exit(&blp->bl_trilllock);
1926 
1927 			/* if TRILL has been disabled, then kill this stray */
1928 			if (tdp == NULL) {
1929 				freemsg(mp);
1930 				fwd_delete(bfp);
1931 			}
1932 			fwd_unref(bfp);
1933 			return (NULL);
1934 		}
1935 
1936 		/* find first link we can send on */
1937 		for (i = 0; i < bfp->bf_nlinks; i++) {
1938 			blpsend = bfp->bf_links[i];
1939 			if (blpsend == blp)
1940 				selfseen = B_TRUE;
1941 			else if (bridge_can_send(blpsend, vlanid))
1942 				break;
1943 		}
1944 
1945 		while (i < bfp->bf_nlinks) {
1946 			blpsend = bfp->bf_links[i];
1947 			for (i++; i < bfp->bf_nlinks; i++) {
1948 				blpnext = bfp->bf_links[i];
1949 				if (blpnext == blp)
1950 					selfseen = B_TRUE;
1951 				else if (bridge_can_send(blpnext, vlanid))
1952 					break;
1953 			}
1954 			if (i == bfp->bf_nlinks && !selfseen) {
1955 				mpsend = mp;
1956 				mp = NULL;
1957 			} else {
1958 				mpsend = copymsg(mp);
1959 			}
1960 
1961 			if (!from_trill && is_xmit)
1962 				mpsend = mac_fix_cksum(mpsend);
1963 
1964 			mpsend = reform_vlan_header(mpsend, vlanid, tci,
1965 			    blpsend->bl_pvid);
1966 			if (mpsend == NULL) {
1967 				KIINCR(bki_drops);
1968 				continue;
1969 			}
1970 
1971 			KIINCR(bki_forwards);
1972 			/*
1973 			 * No need to bump up the link reference count, as
1974 			 * the forwarding entry itself holds a reference to
1975 			 * the link.
1976 			 */
1977 			if (bfp->bf_flags & BFF_LOCALADDR) {
1978 				mac_rx_common(blpsend->bl_mh, NULL, mpsend);
1979 			} else {
1980 				KLPINCR(blpsend, bkl_xmit);
1981 				MAC_RING_TX(blpsend->bl_mh, NULL, mpsend,
1982 				    mpsend);
1983 				freemsg(mpsend);
1984 			}
1985 		}
1986 		/*
1987 		 * Handle a special case: if we're transmitting to the original
1988 		 * link, then check whether the localaddr flag is set.  If it
1989 		 * is, then receive instead.  This doesn't happen with ordinary
1990 		 * bridging, but does happen often with TRILL decapsulation.
1991 		 */
1992 		if (mp != NULL && is_xmit && (bfp->bf_flags & BFF_LOCALADDR)) {
1993 			mac_rx_common(blp->bl_mh, NULL, mp);
1994 			mp = NULL;
1995 		}
1996 		fwd_unref(bfp);
1997 	} else {
1998 		/*
1999 		 * TRILL has two cases to handle.  If the packet is off the
2000 		 * wire (not from TRILL), then we need to send up into the
2001 		 * TRILL module to have the distribution tree computed.  If the
2002 		 * packet is from TRILL (decapsulated), then we're part of the
2003 		 * distribution tree, and we need to copy the packet on member
2004 		 * interfaces.
2005 		 *
2006 		 * Thus, the from TRILL case is identical to the STP case.
2007 		 */
2008 		if (!from_trill && blp->bl_trilldata != NULL) {
2009 			mutex_enter(&blp->bl_trilllock);
2010 			if ((tdp = blp->bl_trilldata) != NULL) {
2011 				blp->bl_trillthreads++;
2012 				mutex_exit(&blp->bl_trilllock);
2013 				if ((mpsend = copymsg(mp)) != NULL) {
2014 					update_header(mpsend,
2015 					    hdr_info, B_FALSE);
2016 					/*
2017 					 * all trill data frames have
2018 					 * Inner.VLAN
2019 					 */
2020 					mpsend = reform_vlan_header(mpsend,
2021 					    vlanid, tci, 0);
2022 					if (mpsend == NULL) {
2023 						KIINCR(bki_drops);
2024 					} else {
2025 						trill_encap_fn(tdp, blp,
2026 						    hdr_info, mpsend,
2027 						    RBRIDGE_NICKNAME_NONE);
2028 					}
2029 				}
2030 				mutex_enter(&blp->bl_trilllock);
2031 				if (--blp->bl_trillthreads == 0 &&
2032 				    blp->bl_trilldata == NULL)
2033 					cv_broadcast(&blp->bl_trillwait);
2034 			}
2035 			mutex_exit(&blp->bl_trilllock);
2036 		}
2037 
2038 		/*
2039 		 * This is an unknown destination, so flood.
2040 		 */
2041 		rw_enter(&bip->bi_rwlock, RW_READER);
2042 		for (blpnext = list_head(&bip->bi_links); blpnext != NULL;
2043 		    blpnext = list_next(&bip->bi_links, blpnext)) {
2044 			if (blpnext == blp)
2045 				selfseen = B_TRUE;
2046 			else if (bridge_can_send(blpnext, vlanid))
2047 				break;
2048 		}
2049 		if (blpnext != NULL)
2050 			atomic_inc_uint(&blpnext->bl_refs);
2051 		rw_exit(&bip->bi_rwlock);
2052 		while ((blpsend = blpnext) != NULL) {
2053 			rw_enter(&bip->bi_rwlock, RW_READER);
2054 			for (blpnext = list_next(&bip->bi_links, blpsend);
2055 			    blpnext != NULL;
2056 			    blpnext = list_next(&bip->bi_links, blpnext)) {
2057 				if (blpnext == blp)
2058 					selfseen = B_TRUE;
2059 				else if (bridge_can_send(blpnext, vlanid))
2060 					break;
2061 			}
2062 			if (blpnext != NULL)
2063 				atomic_inc_uint(&blpnext->bl_refs);
2064 			rw_exit(&bip->bi_rwlock);
2065 			if (blpnext == NULL && !selfseen) {
2066 				mpsend = mp;
2067 				mp = NULL;
2068 			} else {
2069 				mpsend = copymsg(mp);
2070 			}
2071 
2072 			if (!from_trill && is_xmit)
2073 				mpsend = mac_fix_cksum(mpsend);
2074 
2075 			mpsend = reform_vlan_header(mpsend, vlanid, tci,
2076 			    blpsend->bl_pvid);
2077 			if (mpsend == NULL) {
2078 				KIINCR(bki_drops);
2079 				continue;
2080 			}
2081 
2082 			if (hdr_info->mhi_dsttype == MAC_ADDRTYPE_UNICAST)
2083 				KIINCR(bki_unknown);
2084 			else
2085 				KIINCR(bki_mbcast);
2086 			KLPINCR(blpsend, bkl_xmit);
2087 			if ((mpcopy = copymsg(mpsend)) != NULL)
2088 				mac_rx_common(blpsend->bl_mh, NULL, mpcopy);
2089 			MAC_RING_TX(blpsend->bl_mh, NULL, mpsend, mpsend);
2090 			freemsg(mpsend);
2091 			link_unref(blpsend);
2092 		}
2093 	}
2094 
2095 	/*
	 * At this point, if mp is non-NULL, it means that the caller needs to
2097 	 * continue on the selected link.
2098 	 */
2099 	return (mp);
2100 }
2101 
2102 /*
2103  * Extract and validate the VLAN information for a given packet.  This checks
2104  * conformance with the rules for use of the PVID on the link, and for the
2105  * allowed (configured) VLAN set.
2106  *
2107  * Returns B_TRUE if the packet passes, B_FALSE if it fails.
2108  */
2109 static boolean_t
2110 bridge_get_vlan(bridge_link_t *blp, mac_header_info_t *hdr_info, mblk_t *mp,
2111     uint16_t *vlanidp, uint16_t *tcip)
2112 {
2113 	uint16_t tci, vlanid;
2114 
2115 	if (hdr_info->mhi_bindsap == ETHERTYPE_VLAN) {
2116 		ptrdiff_t tpos = offsetof(struct ether_vlan_header, ether_tci);
2117 		ptrdiff_t mlen;
2118 
2119 		/*
2120 		 * Extract the VLAN ID information, regardless of alignment,
2121 		 * and without a pullup.  This isn't attractive, but we do this
2122 		 * to avoid having to deal with the pointers stashed in
2123 		 * hdr_info moving around or having the caller deal with a new
2124 		 * mblk_t pointer.
2125 		 */
2126 		while (mp != NULL) {
2127 			mlen = MBLKL(mp);
2128 			if (mlen > tpos && mlen > 0)
2129 				break;
2130 			tpos -= mlen;
2131 			mp = mp->b_cont;
2132 		}
2133 		if (mp == NULL)
2134 			return (B_FALSE);
2135 		tci = mp->b_rptr[tpos] << 8;
2136 		if (++tpos >= mlen) {
2137 			do {
2138 				mp = mp->b_cont;
2139 			} while (mp != NULL && MBLKL(mp) == 0);
2140 			if (mp == NULL)
2141 				return (B_FALSE);
2142 			tpos = 0;
2143 		}
2144 		tci |= mp->b_rptr[tpos];
2145 
2146 		vlanid = VLAN_ID(tci);
2147 		if (VLAN_CFI(tci) != ETHER_CFI || vlanid > VLAN_ID_MAX)
2148 			return (B_FALSE);
2149 		if (vlanid == VLAN_ID_NONE || vlanid == blp->bl_pvid)
2150 			goto input_no_vlan;
2151 		if (!BRIDGE_VLAN_ISSET(blp, vlanid))
2152 			return (B_FALSE);
2153 	} else {
2154 		tci = 0xFFFF;
2155 input_no_vlan:
2156 		/*
2157 		 * If PVID is set to zero, then untagged traffic is not
2158 		 * supported here.  Do not learn or forward.
2159 		 */
2160 		if ((vlanid = blp->bl_pvid) == VLAN_ID_NONE)
2161 			return (B_FALSE);
2162 	}
2163 
2164 	*tcip = tci;
2165 	*vlanidp = vlanid;
2166 	return (B_TRUE);
2167 }
2168 
2169 /*
2170  * Handle MAC notifications.
2171  */
2172 static void
2173 bridge_notify_cb(void *arg, mac_notify_type_t note_type)
2174 {
2175 	bridge_link_t *blp = arg;
2176 
2177 	switch (note_type) {
2178 	case MAC_NOTE_UNICST:
2179 		bridge_new_unicst(blp);
2180 		break;
2181 
2182 	case MAC_NOTE_SDU_SIZE: {
2183 		uint_t maxsdu;
2184 		bridge_inst_t *bip = blp->bl_inst;
2185 		bridge_mac_t *bmp = bip->bi_mac;
2186 		boolean_t notify = B_FALSE;
2187 		mblk_t *mlist = NULL;
2188 
2189 		mac_sdu_get(blp->bl_mh, NULL, &maxsdu);
2190 		rw_enter(&bip->bi_rwlock, RW_READER);
2191 		if (list_prev(&bip->bi_links, blp) == NULL &&
2192 		    list_next(&bip->bi_links, blp) == NULL) {
2193 			notify = (maxsdu != bmp->bm_maxsdu);
2194 			bmp->bm_maxsdu = maxsdu;
2195 		}
2196 		blp->bl_maxsdu = maxsdu;
2197 		if (maxsdu != bmp->bm_maxsdu)
2198 			link_sdu_fail(blp, B_TRUE, &mlist);
2199 		else if (notify)
2200 			(void) mac_maxsdu_update(bmp->bm_mh, maxsdu);
2201 		rw_exit(&bip->bi_rwlock);
2202 		send_up_messages(bip, mlist);
2203 		break;
2204 	}
2205 	}
2206 }
2207 
/*
 * This is called by the MAC layer.  As with the transmit side, we're right in
 * the data path for all I/O on this port, so if we don't need to forward this
 * packet anywhere, we have to send it upwards via mac_rx_common.
 *
 * Arguments:
 *   mh      the bridge link; it is cast straight back to a bridge_link_t.
 *   rsrc    passed through untouched to mac_rx_common() / trill_recv_fn().
 *   mpnext  the received packets, chained through b_next.
 *
 * Processing happens in three stages:
 *   1. If TRILL is attached to the link, TRILL-bound frames (TRILL Ethertype,
 *      raw IS-IS, and bridge-group/BPDU frames) are unlinked from the chain
 *      and handed to trill_recv_fn().
 *   2. If the link is not currently eligible to learn/forward, the rest of the
 *      chain goes straight up via mac_rx_common().
 *   3. Otherwise each remaining packet is VLAN-checked, learned from, and
 *      offered to bridge_forward(); whatever bridge_forward() hands back is
 *      delivered locally via mac_rx_common().
 */
static void
bridge_recv_cb(mac_handle_t mh, mac_resource_handle_t rsrc, mblk_t *mpnext)
{
	mblk_t *mp, *mpcopy;
	bridge_link_t *blp = (bridge_link_t *)mh;
	bridge_inst_t *bip = blp->bl_inst;
	bridge_mac_t *bmp = bip->bi_mac;
	mac_header_info_t hdr_info;
	uint16_t vlanid, tci;
	boolean_t trillmode = B_FALSE;

	KIINCR(bki_recv);
	KLINCR(bkl_recv);

	/*
	 * Regardless of state, check for inbound TRILL packets when TRILL is
	 * active.  These are pulled out of band and sent for TRILL handling.
	 */
	if (blp->bl_trilldata != NULL) {
		void *tdp;
		mblk_t *newhead;
		mblk_t *tail = NULL;

		/*
		 * The unlocked test above is only a hint; re-check under
		 * bl_trilllock and count ourselves in bl_trillthreads so that
		 * whoever waits on bl_trillwait can tell when we are done
		 * with tdp.
		 */
		mutex_enter(&blp->bl_trilllock);
		if ((tdp = blp->bl_trilldata) != NULL) {
			blp->bl_trillthreads++;
			mutex_exit(&blp->bl_trilllock);
			trillmode = B_TRUE;
			/*
			 * newhead tracks the head of the chain of packets NOT
			 * taken by TRILL; tail is the last packet kept so far.
			 */
			newhead = mpnext;
			while ((mp = mpnext) != NULL) {
				boolean_t raw_isis, bridge_group;

				mpnext = mp->b_next;

				/*
				 * If the header isn't readable, then leave on
				 * the list and continue.
				 */
				if (mac_header_info(blp->bl_mh, mp,
				    &hdr_info) != 0) {
					tail = mp;
					continue;
				}

				/*
				 * The TRILL document specifies that, on
				 * Ethernet alone, IS-IS packets arrive with
				 * LLC rather than Ethertype, and using a
				 * specific destination address.  We must check
				 * for that here.  Also, we need to give BPDUs
				 * to TRILL for processing.
				 */
				raw_isis = bridge_group = B_FALSE;
				if (hdr_info.mhi_dsttype ==
				    MAC_ADDRTYPE_MULTICAST) {
					if (memcmp(hdr_info.mhi_daddr,
					    all_isis_rbridges, ETHERADDRL) == 0)
						raw_isis = B_TRUE;
					else if (memcmp(hdr_info.mhi_daddr,
					    bridge_group_address, ETHERADDRL) ==
					    0)
						bridge_group = B_TRUE;
				}
				/*
				 * Not for TRILL: neither a special multicast
				 * destination nor a TRILL Ethertype (bare or
				 * inside a VLAN tag).  Keep it on the list.
				 * NOTE(review): the VLAN case reads the header
				 * directly from b_rptr, which presumes the
				 * tagged header is contiguous in the first
				 * mblk -- confirm.
				 */
				if (!raw_isis && !bridge_group &&
				    hdr_info.mhi_bindsap != ETHERTYPE_TRILL &&
				    (hdr_info.mhi_bindsap != ETHERTYPE_VLAN ||
				    /* LINTED: alignment */
				    ((struct ether_vlan_header *)mp->b_rptr)->
				    ether_type != htons(ETHERTYPE_TRILL))) {
					tail = mp;
					continue;
				}

				/*
				 * We've got TRILL input.  Remove from the list
				 * and send up through the TRILL module.  (Send
				 * a copy through promiscuous receive just to
				 * support snooping on TRILL.  Order isn't
				 * preserved strictly, but that doesn't matter
				 * here.)
				 */
				if (tail != NULL)
					tail->b_next = mpnext;
				mp->b_next = NULL;
				if (mp == newhead)
					newhead = mpnext;
				mac_trill_snoop(blp->bl_mh, mp);
				update_header(mp, &hdr_info, B_TRUE);
				/*
				 * On raw IS-IS and BPDU frames, we have to
				 * make sure that the length is trimmed
				 * properly.  We use origsap in order to cope
				 * with jumbograms for IS-IS.  (Regular mac
				 * can't.)
				 */
				if (raw_isis || bridge_group) {
					size_t msglen = msgdsize(mp);

					if (msglen > hdr_info.mhi_origsap) {
						/*
						 * Negative length: trim the
						 * excess from the tail.
						 */
						(void) adjmsg(mp,
						    hdr_info.mhi_origsap -
						    msglen);
					} else if (msglen <
					    hdr_info.mhi_origsap) {
						/* Truncated frame: discard. */
						freemsg(mp);
						continue;
					}
				}
				trill_recv_fn(tdp, blp, rsrc, mp, &hdr_info);
			}
			/* Continue below with whatever TRILL did not take. */
			mpnext = newhead;
			mutex_enter(&blp->bl_trilllock);
			if (--blp->bl_trillthreads == 0 &&
			    blp->bl_trilldata == NULL)
				cv_broadcast(&blp->bl_trillwait);
		}
		mutex_exit(&blp->bl_trilllock);
		if (mpnext == NULL)
			return;
	}

	/*
	 * If this is a TRILL RBridge, then just check whether this link is
	 * used at all for forwarding.  If not, then we're done.
	 */
	if (trillmode) {
		if (!(blp->bl_flags & BLF_TRILLACTIVE) ||
		    (blp->bl_flags & BLF_SDUFAIL)) {
			mac_rx_common(blp->bl_mh, rsrc, mpnext);
			return;
		}
	} else {
		/*
		 * For regular (STP) bridges, if we're in blocking or listening
		 * state, then do nothing.  We don't learn or forward until
		 * told to do so.
		 */
		if (blp->bl_state == BLS_BLOCKLISTEN) {
			mac_rx_common(blp->bl_mh, rsrc, mpnext);
			return;
		}
	}

	/*
	 * Send a copy of the message chain up to the observability node users.
	 * For TRILL, we must obey the VLAN AF rules, so we go packet-by-
	 * packet.
	 */
	if (!trillmode && blp->bl_state == BLS_FORWARDING &&
	    (bmp->bm_flags & BMF_STARTED) &&
	    (mp = copymsgchain(mpnext)) != NULL) {
		mac_rx(bmp->bm_mh, NULL, mp);
	}

	/*
	 * We must be in learning or forwarding state, or using TRILL on a link
	 * with one or more VLANs active.  For each packet in the list, process
	 * the source address, and then attempt to forward.
	 */
	while ((mp = mpnext) != NULL) {
		/* Detach the packet; everything below handles one at a time. */
		mpnext = mp->b_next;
		mp->b_next = NULL;

		/*
		 * If we can't decode the header or if the header specifies a
		 * multicast source address (impossible!), then don't bother
		 * learning or forwarding, but go ahead and forward up the
		 * stack for subsequent processing.
		 */
		if (mac_header_info(blp->bl_mh, mp, &hdr_info) != 0 ||
		    (hdr_info.mhi_saddr[0] & 1) != 0) {
			KIINCR(bki_drops);
			KLINCR(bkl_drops);
			mac_rx_common(blp->bl_mh, rsrc, mp);
			continue;
		}

		/*
		 * Extract and validate the VLAN ID for this packet.
		 */
		if (!bridge_get_vlan(blp, &hdr_info, mp, &vlanid, &tci) ||
		    !BRIDGE_AF_ISSET(blp, vlanid)) {
			mac_rx_common(blp->bl_mh, rsrc, mp);
			continue;
		}

		if (trillmode) {
			/*
			 * Special test required by TRILL document: must
			 * discard frames with outer address set to ESADI.
			 */
			if (memcmp(hdr_info.mhi_daddr, all_esadi_rbridges,
			    ETHERADDRL) == 0) {
				mac_rx_common(blp->bl_mh, rsrc, mp);
				continue;
			}

			/*
			 * If we're in TRILL mode, then the call above to get
			 * the VLAN ID has also checked that we're the
			 * appointed forwarder, so report that we're handling
			 * this packet to any observability node users.
			 */
			if ((bmp->bm_flags & BMF_STARTED) &&
			    (mpcopy = copymsg(mp)) != NULL)
				mac_rx(bmp->bm_mh, NULL, mpcopy);
		}

		/*
		 * First process the source address and learn from it.  For
		 * TRILL, we learn only if we're the appointed forwarder.
		 */
		bridge_learn(blp, hdr_info.mhi_saddr, RBRIDGE_NICKNAME_NONE,
		    vlanid);

		/*
		 * Now check whether we're forwarding and look up the
		 * destination.  If we can forward, do so.
		 */
		if (trillmode || blp->bl_state == BLS_FORWARDING) {
			mp = bridge_forward(blp, &hdr_info, mp, vlanid, tci,
			    B_FALSE, B_FALSE);
		}
		/*
		 * A non-NULL mp here is either a packet we did not try to
		 * forward, or one that bridge_forward() returned to us; in
		 * both cases it is delivered on this link.
		 */
		if (mp != NULL)
			mac_rx_common(blp->bl_mh, rsrc, mp);
	}
}
2440 
2441 
/*
 * Transmit-side counterpart of bridge_recv_cb: handles a chain of outbound
 * packets (linked through b_next) on a bridged link.  mh is the bridge link,
 * cast back to a bridge_link_t; rh is passed through to MAC_RING_TX.
 *
 * Each packet is VLAN-checked, learned from, and offered to bridge_forward();
 * whatever bridge_forward() hands back is sent on this link.
 *
 * Returns NULL when the whole chain has been consumed.  Otherwise returns the
 * packet that could not be sent, with the unprocessed remainder re-attached
 * via b_next, so that ordering is preserved.
 *
 * NOTE(review): MAC_RING_TX is a macro defined elsewhere; from its use here
 * it evidently stores the untransmitted remainder (or NULL) into its last
 * argument -- confirm against the macro definition.
 */
/* ARGSUSED */
static mblk_t *
bridge_xmit_cb(mac_handle_t mh, mac_ring_handle_t rh, mblk_t *mpnext)
{
	bridge_link_t *blp = (bridge_link_t *)mh;
	bridge_inst_t *bip = blp->bl_inst;
	bridge_mac_t *bmp = bip->bi_mac;
	mac_header_info_t hdr_info;
	uint16_t vlanid, tci;
	mblk_t *mp, *mpcopy;
	boolean_t trillmode;

	/* Unlike the receive side, this is sampled without bl_trilllock. */
	trillmode = blp->bl_trilldata != NULL;

	/*
	 * If we're using STP and we're in blocking or listening state, or if
	 * we're using TRILL and no VLANs are active, then behave as though the
	 * bridge isn't here at all, and send on the local link alone.
	 */
	if ((!trillmode && blp->bl_state == BLS_BLOCKLISTEN) ||
	    (trillmode &&
	    (!(blp->bl_flags & BLF_TRILLACTIVE) ||
	    (blp->bl_flags & BLF_SDUFAIL)))) {
		/* Counters are bumped once per call here, not per packet. */
		KIINCR(bki_sent);
		KLINCR(bkl_xmit);
		MAC_RING_TX(blp->bl_mh, rh, mpnext, mp);
		return (mp);
	}

	/*
	 * Send a copy of the message up to the observability node users.
	 * TRILL needs to check on a packet-by-packet basis.
	 */
	if (!trillmode && blp->bl_state == BLS_FORWARDING &&
	    (bmp->bm_flags & BMF_STARTED) &&
	    (mp = copymsgchain(mpnext)) != NULL) {
		mac_rx(bmp->bm_mh, NULL, mp);
	}

	while ((mp = mpnext) != NULL) {
		/* Detach the packet; everything below handles one at a time. */
		mpnext = mp->b_next;
		mp->b_next = NULL;

		/* Undecodable outbound frames are dropped, not sent. */
		if (mac_header_info(blp->bl_mh, mp, &hdr_info) != 0) {
			freemsg(mp);
			continue;
		}

		/*
		 * Extract and validate the VLAN ID for this packet.
		 */
		if (!bridge_get_vlan(blp, &hdr_info, mp, &vlanid, &tci) ||
		    !BRIDGE_AF_ISSET(blp, vlanid)) {
			freemsg(mp);
			continue;
		}

		/*
		 * If we're using TRILL, then we've now validated that we're
		 * the forwarder for this VLAN, so go ahead and let
		 * observability node users know about the packet.
		 */
		if (trillmode && (bmp->bm_flags & BMF_STARTED) &&
		    (mpcopy = copymsg(mp)) != NULL) {
			mac_rx(bmp->bm_mh, NULL, mpcopy);
		}

		/*
		 * We have to learn from our own transmitted packets, because
		 * there may be a Solaris DLPI raw sender (who can specify his
		 * own source address) using promiscuous mode for receive.  The
		 * mac layer information won't (and can't) tell us everything
		 * we need to know.
		 */
		bridge_learn(blp, hdr_info.mhi_saddr, RBRIDGE_NICKNAME_NONE,
		    vlanid);

		/* attempt forwarding */
		if (trillmode || blp->bl_state == BLS_FORWARDING) {
			mp = bridge_forward(blp, &hdr_info, mp, vlanid, tci,
			    B_FALSE, B_TRUE);
		}
		/*
		 * A non-NULL mp is ours to send on this link.  The counters
		 * are bumped only if the send left nothing behind.
		 */
		if (mp != NULL) {
			MAC_RING_TX(blp->bl_mh, rh, mp, mp);
			if (mp == NULL) {
				KIINCR(bki_sent);
				KLINCR(bkl_xmit);
			}
		}
		/*
		 * If we get stuck, then stop.  Don't let the user's output
		 * packets get out of order.  (More importantly: don't try to
		 * bridge the same packet multiple times if flow control is
		 * asserted.)
		 */
		if (mp != NULL) {
			mp->b_next = mpnext;
			break;
		}
	}
	return (mp);
}
2544 
2545 /*
2546  * This is called by TRILL when it decapsulates an packet, and we must forward
2547  * locally.  On failure, we just drop.
2548  *
2549  * Note that the ingress_nick reported by TRILL must not represent this local
2550  * node.
2551  */
2552 void
2553 bridge_trill_decaps(bridge_link_t *blp, mblk_t *mp, uint16_t ingress_nick)
2554 {
2555 	mac_header_info_t hdr_info;
2556 	uint16_t vlanid, tci;
2557 	bridge_inst_t *bip = blp->bl_inst;	/* used by macros */
2558 	mblk_t *mpcopy;
2559 
2560 	if (mac_header_info(blp->bl_mh, mp, &hdr_info) != 0) {
2561 		freemsg(mp);
2562 		return;
2563 	}
2564 
2565 	/* Extract VLAN ID for this packet. */
2566 	if (hdr_info.mhi_bindsap == ETHERTYPE_VLAN) {
2567 		struct ether_vlan_header *evhp;
2568 
2569 		/* LINTED: alignment */
2570 		evhp = (struct ether_vlan_header *)mp->b_rptr;
2571 		tci = ntohs(evhp->ether_tci);
2572 		vlanid = VLAN_ID(tci);
2573 	} else {
2574 		/* Inner VLAN headers are required in TRILL data packets */
2575 		DTRACE_PROBE3(bridge__trill__decaps__novlan, bridge_link_t *,
2576 		    blp, mblk_t *, mp, uint16_t, ingress_nick);
2577 		freemsg(mp);
2578 		return;
2579 	}
2580 
2581 	/* Learn the location of this sender in the RBridge network */
2582 	bridge_learn(blp, hdr_info.mhi_saddr, ingress_nick, vlanid);
2583 
2584 	/* attempt forwarding */
2585 	mp = bridge_forward(blp, &hdr_info, mp, vlanid, tci, B_TRUE, B_TRUE);
2586 	if (mp != NULL) {
2587 		if (bridge_can_send(blp, vlanid)) {
2588 			/* Deliver a copy locally as well */
2589 			if ((mpcopy = copymsg(mp)) != NULL)
2590 				mac_rx_common(blp->bl_mh, NULL, mpcopy);
2591 			MAC_RING_TX(blp->bl_mh, NULL, mp, mp);
2592 		}
2593 		if (mp == NULL) {
2594 			KIINCR(bki_sent);
2595 			KLINCR(bkl_xmit);
2596 		} else {
2597 			freemsg(mp);
2598 		}
2599 	}
2600 }
2601 
2602 /*
2603  * This function is used by TRILL _only_ to transmit TRILL-encapsulated
2604  * packets.  It sends on a single underlying link and does not bridge.
2605  */
2606 mblk_t *
2607 bridge_trill_output(bridge_link_t *blp, mblk_t *mp)
2608 {
2609 	bridge_inst_t *bip = blp->bl_inst;	/* used by macros */
2610 
2611 	mac_trill_snoop(blp->bl_mh, mp);
2612 	MAC_RING_TX(blp->bl_mh, NULL, mp, mp);
2613 	if (mp == NULL) {
2614 		KIINCR(bki_sent);
2615 		KLINCR(bkl_xmit);
2616 	}
2617 	return (mp);
2618 }
2619 
2620 /*
2621  * Set the "appointed forwarder" flag array for this link.  TRILL controls
2622  * forwarding on a VLAN basis.  The "trillactive" flag is an optimization for
2623  * the forwarder.
2624  */
2625 void
2626 bridge_trill_setvlans(bridge_link_t *blp, const uint8_t *arr)
2627 {
2628 	int i;
2629 	uint_t newflags = 0;
2630 
2631 	for (i = 0; i < BRIDGE_VLAN_ARR_SIZE; i++) {
2632 		if ((blp->bl_afs[i] = arr[i]) != 0)
2633 			newflags = BLF_TRILLACTIVE;
2634 	}
2635 	blp->bl_flags = (blp->bl_flags & ~BLF_TRILLACTIVE) | newflags;
2636 }
2637 
2638 void
2639 bridge_trill_flush(bridge_link_t *blp, uint16_t vlan, boolean_t dotrill)
2640 {
2641 	bridge_inst_t *bip = blp->bl_inst;
2642 	bridge_fwd_t *bfp, *bfnext;
2643 	avl_tree_t fwd_scavenge;
2644 	int i;
2645 
2646 	_NOTE(ARGUNUSED(vlan));
2647 
2648 	avl_create(&fwd_scavenge, fwd_compare, sizeof (bridge_fwd_t),
2649 	    offsetof(bridge_fwd_t, bf_node));
2650 	rw_enter(&bip->bi_rwlock, RW_WRITER);
2651 	bfnext = avl_first(&bip->bi_fwd);
2652 	while ((bfp = bfnext) != NULL) {
2653 		bfnext = AVL_NEXT(&bip->bi_fwd, bfp);
2654 		if (bfp->bf_flags & BFF_LOCALADDR)
2655 			continue;
2656 		if (dotrill) {
2657 			/* port doesn't matter if we're flushing TRILL */
2658 			if (bfp->bf_trill_nick == RBRIDGE_NICKNAME_NONE)
2659 				continue;
2660 		} else {
2661 			if (bfp->bf_trill_nick != RBRIDGE_NICKNAME_NONE)
2662 				continue;
2663 			for (i = 0; i < bfp->bf_nlinks; i++) {
2664 				if (bfp->bf_links[i] == blp)
2665 					break;
2666 			}
2667 			if (i >= bfp->bf_nlinks)
2668 				continue;
2669 		}
2670 		ASSERT(bfp->bf_flags & BFF_INTREE);
2671 		avl_remove(&bip->bi_fwd, bfp);
2672 		bfp->bf_flags &= ~BFF_INTREE;
2673 		avl_add(&fwd_scavenge, bfp);
2674 	}
2675 	rw_exit(&bip->bi_rwlock);
2676 	bfnext = avl_first(&fwd_scavenge);
2677 	while ((bfp = bfnext) != NULL) {
2678 		bfnext = AVL_NEXT(&fwd_scavenge, bfp);
2679 		avl_remove(&fwd_scavenge, bfp);
2680 		fwd_unref(bfp);
2681 	}
2682 	avl_destroy(&fwd_scavenge);
2683 }
2684 
2685 /*
2686  * Let the mac module take or drop a reference to a bridge link.  When this is
2687  * called, the mac module is holding the mi_bridge_lock, so the link cannot be
2688  * in the process of entering or leaving a bridge.
2689  */
2690 static void
2691 bridge_ref_cb(mac_handle_t mh, boolean_t hold)
2692 {
2693 	bridge_link_t *blp = (bridge_link_t *)mh;
2694 
2695 	if (hold)
2696 		atomic_inc_uint(&blp->bl_refs);
2697 	else
2698 		link_unref(blp);
2699 }
2700 
2701 /*
2702  * Handle link state changes reported by the mac layer.  This acts as a filter
2703  * for link state changes: if a link is reporting down, but there are other
2704  * links still up on the bridge, then the state is changed to "up."  When the
2705  * last link goes down, all are marked down, and when the first link goes up,
2706  * all are marked up.  (Recursion is avoided by the use of the "redo" function.)
2707  *
2708  * We treat unknown as equivalent to "up."
2709  */
2710 static link_state_t
2711 bridge_ls_cb(mac_handle_t mh, link_state_t newls)
2712 {
2713 	bridge_link_t *blp = (bridge_link_t *)mh;
2714 	bridge_link_t *blcmp;
2715 	bridge_inst_t *bip;
2716 	bridge_mac_t *bmp;
2717 
2718 	if (newls != LINK_STATE_DOWN && blp->bl_linkstate != LINK_STATE_DOWN ||
2719 	    (blp->bl_flags & (BLF_DELETED|BLF_SDUFAIL))) {
2720 		blp->bl_linkstate = newls;
2721 		return (newls);
2722 	}
2723 
2724 	/*
2725 	 * Scan first to see if there are any other non-down links.  If there
2726 	 * are, then we're done.  Otherwise, if all others are down, then the
2727 	 * state of this link is the state of the bridge.
2728 	 */
2729 	bip = blp->bl_inst;
2730 	rw_enter(&bip->bi_rwlock, RW_WRITER);
2731 	for (blcmp = list_head(&bip->bi_links); blcmp != NULL;
2732 	    blcmp = list_next(&bip->bi_links, blcmp)) {
2733 		if (blcmp != blp &&
2734 		    !(blcmp->bl_flags & (BLF_DELETED|BLF_SDUFAIL)) &&
2735 		    blcmp->bl_linkstate != LINK_STATE_DOWN)
2736 			break;
2737 	}
2738 
2739 	if (blcmp != NULL) {
2740 		/*
2741 		 * If there are other links that are considered up, then tell
2742 		 * the caller that the link is actually still up, regardless of
2743 		 * this link's underlying state.
2744 		 */
2745 		blp->bl_linkstate = newls;
2746 		newls = LINK_STATE_UP;
2747 	} else if (blp->bl_linkstate != newls) {
2748 		/*
2749 		 * If we've found no other 'up' links, and this link has
2750 		 * changed state, then report the new state of the bridge to
2751 		 * all other clients.
2752 		 */
2753 		blp->bl_linkstate = newls;
2754 		for (blcmp = list_head(&bip->bi_links); blcmp != NULL;
2755 		    blcmp = list_next(&bip->bi_links, blcmp)) {
2756 			if (blcmp != blp && !(blcmp->bl_flags & BLF_DELETED))
2757 				mac_link_redo(blcmp->bl_mh, newls);
2758 		}
2759 		bmp = bip->bi_mac;
2760 		if ((bmp->bm_linkstate = newls) != LINK_STATE_DOWN)
2761 			bmp->bm_linkstate = LINK_STATE_UP;
2762 		mac_link_redo(bmp->bm_mh, bmp->bm_linkstate);
2763 	}
2764 	rw_exit(&bip->bi_rwlock);
2765 	return (newls);
2766 }
2767 
2768 static void
2769 bridge_add_link(void *arg)
2770 {
2771 	mblk_t *mp = arg;
2772 	bridge_stream_t *bsp;
2773 	bridge_inst_t *bip, *bipt;
2774 	bridge_mac_t *bmp;
2775 	datalink_id_t linkid;
2776 	int err;
2777 	mac_handle_t mh;
2778 	uint_t maxsdu;
2779 	bridge_link_t *blp = NULL, *blpt;
2780 	const mac_info_t *mip;
2781 	boolean_t macopen = B_FALSE;
2782 	char linkname[MAXLINKNAMELEN];
2783 	char kstatname[KSTAT_STRLEN];
2784 	int i;
2785 	link_state_t linkstate;
2786 	mblk_t *mlist;
2787 
2788 	bsp = (bridge_stream_t *)mp->b_next;
2789 	mp->b_next = NULL;
2790 	bip = bsp->bs_inst;
2791 	/* LINTED: alignment */
2792 	linkid = *(datalink_id_t *)mp->b_cont->b_rptr;
2793 
2794 	/*
2795 	 * First make sure that there is no other bridge that has this link.
2796 	 * We don't want to overlap operations from two bridges; the MAC layer
2797 	 * supports only one bridge on a given MAC at a time.
2798 	 *
2799 	 * We rely on the fact that there's just one taskq thread for the
2800 	 * bridging module: once we've checked for a duplicate, we can drop the
2801 	 * lock, because no other thread could possibly be adding another link
2802 	 * until we're done.
2803 	 */
2804 	mutex_enter(&inst_lock);
2805 	for (bipt = list_head(&inst_list); bipt != NULL;
2806 	    bipt = list_next(&inst_list, bipt)) {
2807 		rw_enter(&bipt->bi_rwlock, RW_READER);
2808 		for (blpt = list_head(&bipt->bi_links); blpt != NULL;
2809 		    blpt = list_next(&bipt->bi_links, blpt)) {
2810 			if (linkid == blpt->bl_linkid)
2811 				break;
2812 		}
2813 		rw_exit(&bipt->bi_rwlock);
2814 		if (blpt != NULL)
2815 			break;
2816 	}
2817 	mutex_exit(&inst_lock);
2818 	if (bipt != NULL) {
2819 		err = EBUSY;
2820 		goto fail;
2821 	}
2822 
2823 	if ((err = mac_open_by_linkid(linkid, &mh)) != 0)
2824 		goto fail;
2825 	macopen = B_TRUE;
2826 
2827 	/* we bridge only Ethernet */
2828 	mip = mac_info(mh);
2829 	if (mip->mi_media != DL_ETHER) {
2830 		err = ENOTSUP;
2831 		goto fail;
2832 	}
2833 
2834 	/*
2835 	 * Get the current maximum SDU on this interface.  If there are other
2836 	 * links on the bridge, then this one must match, or it errors out.
2837 	 * Otherwise, the first link becomes the standard for the new bridge.
2838 	 */
2839 	mac_sdu_get(mh, NULL, &maxsdu);
2840 	bmp = bip->bi_mac;
2841 	if (list_is_empty(&bip->bi_links)) {
2842 		bmp->bm_maxsdu = maxsdu;
2843 		(void) mac_maxsdu_update(bmp->bm_mh, maxsdu);
2844 	}
2845 
2846 	/* figure the kstat name; also used as the mac client name */
2847 	i = MBLKL(mp->b_cont) - sizeof (datalink_id_t);
2848 	if (i < 0 || i >= MAXLINKNAMELEN)
2849 		i = MAXLINKNAMELEN - 1;
2850 	bcopy(mp->b_cont->b_rptr + sizeof (datalink_id_t), linkname, i);
2851 	linkname[i] = '\0';
2852 	(void) snprintf(kstatname, sizeof (kstatname), "%s-%s", bip->bi_name,
2853 	    linkname);
2854 
2855 	if ((blp = kmem_zalloc(sizeof (*blp), KM_NOSLEEP)) == NULL) {
2856 		err = ENOMEM;
2857 		goto fail;
2858 	}
2859 	blp->bl_lfailmp = allocb(sizeof (bridge_ctl_t), BPRI_MED);
2860 	if (blp->bl_lfailmp == NULL) {
2861 		kmem_free(blp, sizeof (*blp));
2862 		err = ENOMEM;
2863 		goto fail;
2864 	}
2865 
2866 	atomic_inc_uint(&bip->bi_refs);
2867 	blp->bl_inst = bip;
2868 	blp->bl_mh = mh;
2869 	blp->bl_linkid = linkid;
2870 	blp->bl_maxsdu = maxsdu;
2871 	cv_init(&blp->bl_trillwait, NULL, CV_DRIVER, NULL);
2872 	mutex_init(&blp->bl_trilllock, NULL, MUTEX_DRIVER, NULL);
2873 	(void) memset(blp->bl_afs, 0xff, sizeof (blp->bl_afs));
2874 
2875 	err = mac_client_open(mh, &blp->bl_mch, kstatname, 0);
2876 	if (err != 0)
2877 		goto fail;
2878 	blp->bl_flags |= BLF_CLIENT_OPEN;
2879 
2880 	err = mac_margin_add(mh, &blp->bl_margin, B_TRUE);
2881 	if (err != 0)
2882 		goto fail;
2883 	blp->bl_flags |= BLF_MARGIN_ADDED;
2884 
2885 	blp->bl_mnh = mac_notify_add(mh, bridge_notify_cb, blp);
2886 
2887 	err = mac_bridge_set(mh, (mac_handle_t)blp);
2888 	if (err != 0)
2889 		goto fail;
2890 	blp->bl_flags |= BLF_SET_BRIDGE;
2891 
2892 	err = mac_promisc_add(blp->bl_mch, MAC_CLIENT_PROMISC_ALL, NULL,
2893 	    blp, &blp->bl_mphp, MAC_PROMISC_FLAGS_NO_TX_LOOP);
2894 	if (err != 0)
2895 		goto fail;
2896 	blp->bl_flags |= BLF_PROM_ADDED;
2897 
2898 	bridge_new_unicst(blp);
2899 
2900 	blp->bl_ksp = kstat_setup((kstat_named_t *)&blp->bl_kstats,
2901 	    link_kstats_list, Dim(link_kstats_list), kstatname);
2902 
2903 	/*
2904 	 * The link holds a reference to the bridge instance, so that the
2905 	 * instance can't go away before the link is freed.  The insertion into
2906 	 * bi_links holds a reference on the link.  When marking as removed
2907 	 * from bi_links (BLF_DELETED), drop the reference on the link.  When
2908 	 * freeing the link, drop the reference on the instance.
2909 	 */
2910 	rw_enter(&bip->bi_rwlock, RW_WRITER);
2911 	list_insert_tail(&bip->bi_links, blp);
2912 	atomic_inc_uint(&blp->bl_refs);
2913 
2914 	/*
2915 	 * If the new link is no good on this bridge, then let the daemon know
2916 	 * about the problem.
2917 	 */
2918 	mlist = NULL;
2919 	if (maxsdu != bmp->bm_maxsdu)
2920 		link_sdu_fail(blp, B_TRUE, &mlist);
2921 	rw_exit(&bip->bi_rwlock);
2922 	send_up_messages(bip, mlist);
2923 
2924 	/*
2925 	 * Trigger a link state update so that if this link is the first one
2926 	 * "up" in the bridge, then we notify everyone.  This triggers a trip
2927 	 * through bridge_ls_cb.
2928 	 */
2929 	linkstate = mac_stat_get(mh, MAC_STAT_LOWLINK_STATE);
2930 	blp->bl_linkstate = LINK_STATE_DOWN;
2931 	mac_link_update(mh, linkstate);
2932 
2933 	/*
2934 	 * We now need to report back to the stream that invoked us, and then
2935 	 * drop the reference on the stream that we're holding.
2936 	 */
2937 	miocack(bsp->bs_wq, mp, 0, 0);
2938 	stream_unref(bsp);
2939 	return;
2940 
2941 fail:
2942 	if (blp == NULL) {
2943 		if (macopen)
2944 			mac_close(mh);
2945 	} else {
2946 		link_shutdown(blp);
2947 		link_free(blp);
2948 	}
2949 	miocnak(bsp->bs_wq, mp, 0, err);
2950 	stream_unref(bsp);
2951 }
2952 
2953 static void
2954 bridge_rem_link(void *arg)
2955 {
2956 	mblk_t *mp = arg;
2957 	bridge_stream_t *bsp;
2958 	bridge_inst_t *bip;
2959 	bridge_mac_t *bmp;
2960 	datalink_id_t linkid;
2961 	bridge_link_t *blp, *blsave;
2962 	boolean_t found;
2963 	mblk_t *mlist;
2964 
2965 	bsp = (bridge_stream_t *)mp->b_next;
2966 	mp->b_next = NULL;
2967 	bip = bsp->bs_inst;
2968 	/* LINTED: alignment */
2969 	linkid = *(datalink_id_t *)mp->b_cont->b_rptr;
2970 
2971 	/*
2972 	 * We become reader here so that we can loop over the other links and
2973 	 * deliver link up/down notification.
2974 	 */
2975 	rw_enter(&bip->bi_rwlock, RW_READER);
2976 	found = B_FALSE;
2977 	for (blp = list_head(&bip->bi_links); blp != NULL;
2978 	    blp = list_next(&bip->bi_links, blp)) {
2979 		if (blp->bl_linkid == linkid &&
2980 		    !(blp->bl_flags & BLF_DELETED)) {
2981 			blp->bl_flags |= BLF_DELETED;
2982 			(void) ddi_taskq_dispatch(bridge_taskq, link_shutdown,
2983 			    blp, DDI_SLEEP);
2984 			found = B_TRUE;
2985 			break;
2986 		}
2987 	}
2988 
2989 	/*
2990 	 * Check if this link is up and the remainder of the links are all
2991 	 * down.
2992 	 */
2993 	if (blp != NULL && blp->bl_linkstate != LINK_STATE_DOWN) {
2994 		for (blp = list_head(&bip->bi_links); blp != NULL;
2995 		    blp = list_next(&bip->bi_links, blp)) {
2996 			if (blp->bl_linkstate != LINK_STATE_DOWN &&
2997 			    !(blp->bl_flags & (BLF_DELETED|BLF_SDUFAIL)))
2998 				break;
2999 		}
3000 		if (blp == NULL) {
3001 			for (blp = list_head(&bip->bi_links); blp != NULL;
3002 			    blp = list_next(&bip->bi_links, blp)) {
3003 				if (!(blp->bl_flags & BLF_DELETED))
3004 					mac_link_redo(blp->bl_mh,
3005 					    LINK_STATE_DOWN);
3006 			}
3007 			bmp = bip->bi_mac;
3008 			bmp->bm_linkstate = LINK_STATE_DOWN;
3009 			mac_link_redo(bmp->bm_mh, LINK_STATE_DOWN);
3010 		}
3011 	}
3012 
3013 	/*
3014 	 * Check if there's just one working link left on the bridge.  If so,
3015 	 * then that link is now authoritative for bridge MTU.
3016 	 */
3017 	blsave = NULL;
3018 	for (blp = list_head(&bip->bi_links); blp != NULL;
3019 	    blp = list_next(&bip->bi_links, blp)) {
3020 		if (!(blp->bl_flags & BLF_DELETED)) {
3021 			if (blsave == NULL)
3022 				blsave = blp;
3023 			else
3024 				break;
3025 		}
3026 	}
3027 	mlist = NULL;
3028 	bmp = bip->bi_mac;
3029 	if (blsave != NULL && blp == NULL &&
3030 	    blsave->bl_maxsdu != bmp->bm_maxsdu) {
3031 		bmp->bm_maxsdu = blsave->bl_maxsdu;
3032 		(void) mac_maxsdu_update(bmp->bm_mh, blsave->bl_maxsdu);
3033 		link_sdu_fail(blsave, B_FALSE, &mlist);
3034 	}
3035 	rw_exit(&bip->bi_rwlock);
3036 	send_up_messages(bip, mlist);
3037 
3038 	if (found)
3039 		miocack(bsp->bs_wq, mp, 0, 0);
3040 	else
3041 		miocnak(bsp->bs_wq, mp, 0, ENOENT);
3042 	stream_unref(bsp);
3043 }
3044 
3045 /*
3046  * This function intentionally returns with bi_rwlock held; it is intended for
3047  * quick checks and updates.
3048  */
3049 static bridge_link_t *
3050 enter_link(bridge_inst_t *bip, datalink_id_t linkid)
3051 {
3052 	bridge_link_t *blp;
3053 
3054 	rw_enter(&bip->bi_rwlock, RW_READER);
3055 	for (blp = list_head(&bip->bi_links); blp != NULL;
3056 	    blp = list_next(&bip->bi_links, blp)) {
3057 		if (blp->bl_linkid == linkid && !(blp->bl_flags & BLF_DELETED))
3058 			break;
3059 	}
3060 	return (blp);
3061 }
3062 
3063 static void
3064 bridge_ioctl(queue_t *wq, mblk_t *mp)
3065 {
3066 	bridge_stream_t *bsp = wq->q_ptr;
3067 	bridge_inst_t *bip;
3068 	struct iocblk *iop;
3069 	int rc = EINVAL;
3070 	int len = 0;
3071 	bridge_link_t *blp;
3072 	cred_t *cr;
3073 
3074 	/* LINTED: alignment */
3075 	iop = (struct iocblk *)mp->b_rptr;
3076 
3077 	/*
3078 	 * For now, all of the bridge ioctls are privileged.
3079 	 */
3080 	if ((cr = msg_getcred(mp, NULL)) == NULL)
3081 		cr = iop->ioc_cr;
3082 	if (cr != NULL && secpolicy_net_config(cr, B_FALSE) != 0) {
3083 		miocnak(wq, mp, 0, EPERM);
3084 		return;
3085 	}
3086 
3087 	switch (iop->ioc_cmd) {
3088 	case BRIOC_NEWBRIDGE: {
3089 		bridge_newbridge_t *bnb;
3090 
3091 		if (bsp->bs_inst != NULL ||
3092 		    (rc = miocpullup(mp, sizeof (bridge_newbridge_t))) != 0)
3093 			break;
3094 		/* LINTED: alignment */
3095 		bnb = (bridge_newbridge_t *)mp->b_cont->b_rptr;
3096 		bnb->bnb_name[MAXNAMELEN-1] = '\0';
3097 		if ((rc = bridge_create(bnb->bnb_linkid,
3098 		    bnb->bnb_name, &bip)) != 0)
3099 			break;
3100 
3101 		rw_enter(&bip->bi_rwlock, RW_WRITER);
3102 		if (bip->bi_control != NULL) {
3103 			rw_exit(&bip->bi_rwlock);
3104 			bridge_unref(bip);
3105 			rc = EBUSY;
3106 		} else {
3107 			atomic_inc_uint(&bip->bi_refs);
3108 			bsp->bs_inst = bip;	/* stream holds reference */
3109 			bip->bi_control = bsp;
3110 			rw_exit(&bip->bi_rwlock);
3111 			rc = 0;
3112 		}
3113 		break;
3114 	}
3115 
3116 	case BRIOC_ADDLINK:
3117 		if ((bip = bsp->bs_inst) == NULL ||
3118 		    (rc = miocpullup(mp, sizeof (datalink_id_t))) != 0)
3119 			break;
3120 		/*
3121 		 * We cannot perform the action in this thread, because we're
3122 		 * not in process context, and we may already be holding
3123 		 * MAC-related locks.  Place the request on taskq.
3124 		 */
3125 		mp->b_next = (mblk_t *)bsp;
3126 		stream_ref(bsp);
3127 		(void) ddi_taskq_dispatch(bridge_taskq, bridge_add_link, mp,
3128 		    DDI_SLEEP);
3129 		return;
3130 
3131 	case BRIOC_REMLINK:
3132 		if ((bip = bsp->bs_inst) == NULL ||
3133 		    (rc = miocpullup(mp, sizeof (datalink_id_t))) != 0)
3134 			break;
3135 		/*
3136 		 * We cannot perform the action in this thread, because we're
3137 		 * not in process context, and we may already be holding
3138 		 * MAC-related locks.  Place the request on taskq.
3139 		 */
3140 		mp->b_next = (mblk_t *)bsp;
3141 		stream_ref(bsp);
3142 		(void) ddi_taskq_dispatch(bridge_taskq, bridge_rem_link, mp,
3143 		    DDI_SLEEP);
3144 		return;
3145 
3146 	case BRIOC_SETSTATE: {
3147 		bridge_setstate_t *bss;
3148 
3149 		if ((bip = bsp->bs_inst) == NULL ||
3150 		    (rc = miocpullup(mp, sizeof (*bss))) != 0)
3151 			break;
3152 		/* LINTED: alignment */
3153 		bss = (bridge_setstate_t *)mp->b_cont->b_rptr;
3154 		if ((blp = enter_link(bip, bss->bss_linkid)) == NULL) {
3155 			rc = ENOENT;
3156 		} else {
3157 			rc = 0;
3158 			blp->bl_state = bss->bss_state;
3159 		}
3160 		rw_exit(&bip->bi_rwlock);
3161 		break;
3162 	}
3163 
3164 	case BRIOC_SETPVID: {
3165 		bridge_setpvid_t *bsv;
3166 
3167 		if ((bip = bsp->bs_inst) == NULL ||
3168 		    (rc = miocpullup(mp, sizeof (*bsv))) != 0)
3169 			break;
3170 		/* LINTED: alignment */
3171 		bsv = (bridge_setpvid_t *)mp->b_cont->b_rptr;
3172 		if (bsv->bsv_vlan > VLAN_ID_MAX)
3173 			break;
3174 		if ((blp = enter_link(bip, bsv->bsv_linkid)) == NULL) {
3175 			rc = ENOENT;
3176 		} else if (blp->bl_pvid == bsv->bsv_vlan) {
3177 			rc = 0;
3178 		} else {
3179 			rc = 0;
3180 			BRIDGE_VLAN_CLR(blp, blp->bl_pvid);
3181 			blp->bl_pvid = bsv->bsv_vlan;
3182 			if (blp->bl_pvid != 0)
3183 				BRIDGE_VLAN_SET(blp, blp->bl_pvid);
3184 		}
3185 		rw_exit(&bip->bi_rwlock);
3186 		break;
3187 	}
3188 
3189 	case BRIOC_VLANENAB: {
3190 		bridge_vlanenab_t *bve;
3191 
3192 		if ((bip = bsp->bs_inst) == NULL ||
3193 		    (rc = miocpullup(mp, sizeof (*bve))) != 0)
3194 			break;
3195 		/* LINTED: alignment */
3196 		bve = (bridge_vlanenab_t *)mp->b_cont->b_rptr;
3197 		if (bve->bve_vlan > VLAN_ID_MAX)
3198 			break;
3199 		if ((blp = enter_link(bip, bve->bve_linkid)) == NULL) {
3200 			rc = ENOENT;
3201 		} else {
3202 			rc = 0;
3203 			/* special case: vlan 0 means "all" */
3204 			if (bve->bve_vlan == 0) {
3205 				(void) memset(blp->bl_vlans,
3206 				    bve->bve_onoff ? ~0 : 0,
3207 				    sizeof (blp->bl_vlans));
3208 				BRIDGE_VLAN_CLR(blp, 0);
3209 				if (blp->bl_pvid != 0)
3210 					BRIDGE_VLAN_SET(blp, blp->bl_pvid);
3211 			} else if (bve->bve_vlan == blp->bl_pvid) {
3212 				rc = EINVAL;
3213 			} else if (bve->bve_onoff) {
3214 				BRIDGE_VLAN_SET(blp, bve->bve_vlan);
3215 			} else {
3216 				BRIDGE_VLAN_CLR(blp, bve->bve_vlan);
3217 			}
3218 		}
3219 		rw_exit(&bip->bi_rwlock);
3220 		break;
3221 	}
3222 
3223 	case BRIOC_FLUSHFWD: {
3224 		bridge_flushfwd_t *bff;
3225 		bridge_fwd_t *bfp, *bfnext;
3226 		avl_tree_t fwd_scavenge;
3227 		int i;
3228 
3229 		if ((bip = bsp->bs_inst) == NULL ||
3230 		    (rc = miocpullup(mp, sizeof (*bff))) != 0)
3231 			break;
3232 		/* LINTED: alignment */
3233 		bff = (bridge_flushfwd_t *)mp->b_cont->b_rptr;
3234 		rw_enter(&bip->bi_rwlock, RW_WRITER);
3235 		/* This case means "all" */
3236 		if (bff->bff_linkid == DATALINK_INVALID_LINKID) {
3237 			blp = NULL;
3238 		} else {
3239 			for (blp = list_head(&bip->bi_links); blp != NULL;
3240 			    blp = list_next(&bip->bi_links, blp)) {
3241 				if (blp->bl_linkid == bff->bff_linkid &&
3242 				    !(blp->bl_flags & BLF_DELETED))
3243 					break;
3244 			}
3245 			if (blp == NULL) {
3246 				rc = ENOENT;
3247 				rw_exit(&bip->bi_rwlock);
3248 				break;
3249 			}
3250 		}
3251 		avl_create(&fwd_scavenge, fwd_compare, sizeof (bridge_fwd_t),
3252 		    offsetof(bridge_fwd_t, bf_node));
3253 		bfnext = avl_first(&bip->bi_fwd);
3254 		while ((bfp = bfnext) != NULL) {
3255 			bfnext = AVL_NEXT(&bip->bi_fwd, bfp);
3256 			if (bfp->bf_flags & BFF_LOCALADDR)
3257 				continue;
3258 			if (blp != NULL) {
3259 				for (i = 0; i < bfp->bf_maxlinks; i++) {
3260 					if (bfp->bf_links[i] == blp)
3261 						break;
3262 				}
3263 				/*
3264 				 * If the link is there and we're excluding,
3265 				 * then skip.  If the link is not there and
3266 				 * we're doing only that link, then skip.
3267 				 */
3268 				if ((i < bfp->bf_maxlinks) == bff->bff_exclude)
3269 					continue;
3270 			}
3271 			ASSERT(bfp->bf_flags & BFF_INTREE);
3272 			avl_remove(&bip->bi_fwd, bfp);
3273 			bfp->bf_flags &= ~BFF_INTREE;
3274 			avl_add(&fwd_scavenge, bfp);
3275 		}
3276 		rw_exit(&bip->bi_rwlock);
3277 		bfnext = avl_first(&fwd_scavenge);
3278 		while ((bfp = bfnext) != NULL) {
3279 			bfnext = AVL_NEXT(&fwd_scavenge, bfp);
3280 			avl_remove(&fwd_scavenge, bfp);
3281 			fwd_unref(bfp);	/* drop tree reference */
3282 		}
3283 		avl_destroy(&fwd_scavenge);
3284 		break;
3285 	}
3286 
3287 	case BRIOC_TABLEMAX:
3288 		if ((bip = bsp->bs_inst) == NULL ||
3289 		    (rc = miocpullup(mp, sizeof (uint32_t))) != 0)
3290 			break;
3291 		/* LINTED: alignment */
3292 		bip->bi_tablemax = *(uint32_t *)mp->b_cont->b_rptr;
3293 		break;
3294 	}
3295 
3296 	if (rc == 0)
3297 		miocack(wq, mp, len, 0);
3298 	else
3299 		miocnak(wq, mp, 0, rc);
3300 }
3301 
3302 static void
3303 bridge_wput(queue_t *wq, mblk_t *mp)
3304 {
3305 	switch (DB_TYPE(mp)) {
3306 	case M_IOCTL:
3307 		bridge_ioctl(wq, mp);
3308 		break;
3309 	case M_FLUSH:
3310 		if (*mp->b_rptr & FLUSHW)
3311 			*mp->b_rptr &= ~FLUSHW;
3312 		if (*mp->b_rptr & FLUSHR)
3313 			qreply(wq, mp);
3314 		else
3315 			freemsg(mp);
3316 		break;
3317 	default:
3318 		freemsg(mp);
3319 		break;
3320 	}
3321 }
3322 
3323 /*
3324  * This function allocates the main data structures for the bridge driver and
3325  * connects us into devfs.
3326  */
3327 static void
3328 bridge_inst_init(void)
3329 {
3330 	bridge_scan_interval = 5 * drv_usectohz(1000000);
3331 	bridge_fwd_age = 25 * drv_usectohz(1000000);
3332 
3333 	rw_init(&bmac_rwlock, NULL, RW_DRIVER, NULL);
3334 	list_create(&bmac_list, sizeof (bridge_mac_t),
3335 	    offsetof(bridge_mac_t, bm_node));
3336 	list_create(&inst_list, sizeof (bridge_inst_t),
3337 	    offsetof(bridge_inst_t, bi_node));
3338 	cv_init(&inst_cv, NULL, CV_DRIVER, NULL);
3339 	mutex_init(&inst_lock, NULL, MUTEX_DRIVER, NULL);
3340 	cv_init(&stream_ref_cv, NULL, CV_DRIVER, NULL);
3341 	mutex_init(&stream_ref_lock, NULL, MUTEX_DRIVER, NULL);
3342 
3343 	mac_bridge_vectors(bridge_xmit_cb, bridge_recv_cb, bridge_ref_cb,
3344 	    bridge_ls_cb);
3345 }
3346 
3347 /*
3348  * This function disconnects from devfs and destroys all data structures in
3349  * preparation for unload.  It's assumed that there are no active bridge
3350  * references left at this point.
3351  */
3352 static void
3353 bridge_inst_fini(void)
3354 {
3355 	mac_bridge_vectors(NULL, NULL, NULL, NULL);
3356 	if (bridge_timerid != 0)
3357 		(void) untimeout(bridge_timerid);
3358 	rw_destroy(&bmac_rwlock);
3359 	list_destroy(&bmac_list);
3360 	list_destroy(&inst_list);
3361 	cv_destroy(&inst_cv);
3362 	mutex_destroy(&inst_lock);
3363 	cv_destroy(&stream_ref_cv);
3364 	mutex_destroy(&stream_ref_lock);
3365 }
3366 
3367 /*
3368  * bridge_attach()
3369  *
3370  * Description:
3371  *    Attach bridge driver to the system.
3372  */
3373 static int
3374 bridge_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
3375 {
3376 	if (cmd != DDI_ATTACH)
3377 		return (DDI_FAILURE);
3378 
3379 	if (ddi_create_minor_node(dip, BRIDGE_CTL, S_IFCHR, 0, DDI_PSEUDO,
3380 	    CLONE_DEV) == DDI_FAILURE) {
3381 		return (DDI_FAILURE);
3382 	}
3383 
3384 	if (dld_ioc_register(BRIDGE_IOC, bridge_ioc_list,
3385 	    DLDIOCCNT(bridge_ioc_list)) != 0) {
3386 		ddi_remove_minor_node(dip, BRIDGE_CTL);
3387 		return (DDI_FAILURE);
3388 	}
3389 
3390 	bridge_dev_info = dip;
3391 	bridge_major = ddi_driver_major(dip);
3392 	bridge_taskq = ddi_taskq_create(dip, "bridge", 1, TASKQ_DEFAULTPRI, 0);
3393 	return (DDI_SUCCESS);
3394 }
3395 
3396 /*
3397  * bridge_detach()
3398  *
3399  * Description:
3400  *    Detach an interface to the system.
3401  */
3402 static int
3403 bridge_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
3404 {
3405 	if (cmd != DDI_DETACH)
3406 		return (DDI_FAILURE);
3407 
3408 	ddi_remove_minor_node(dip, NULL);
3409 	ddi_taskq_destroy(bridge_taskq);
3410 	bridge_dev_info = NULL;
3411 	return (DDI_SUCCESS);
3412 }
3413 
3414 /*
3415  * bridge_info()
3416  *
3417  * Description:
3418  *    Translate "dev_t" to a pointer to the associated "dev_info_t".
3419  */
3420 /* ARGSUSED */
3421 static int
3422 bridge_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg,
3423 	void **result)
3424 {
3425 	int	rc;
3426 
3427 	switch (infocmd) {
3428 	case DDI_INFO_DEVT2DEVINFO:
3429 		if (bridge_dev_info == NULL) {
3430 			rc = DDI_FAILURE;
3431 		} else {
3432 			*result = (void *)bridge_dev_info;
3433 			rc = DDI_SUCCESS;
3434 		}
3435 		break;
3436 	case DDI_INFO_DEVT2INSTANCE:
3437 		*result = NULL;
3438 		rc = DDI_SUCCESS;
3439 		break;
3440 	default:
3441 		rc = DDI_FAILURE;
3442 		break;
3443 	}
3444 	return (rc);
3445 }
3446 
3447 static struct module_info bridge_modinfo = {
3448 	2105,			/* mi_idnum */
3449 	"bridge",		/* mi_idname */
3450 	0,			/* mi_minpsz */
3451 	16384,			/* mi_maxpsz */
3452 	65536,			/* mi_hiwat */
3453 	128			/* mi_lowat */
3454 };
3455 
3456 static struct qinit bridge_rinit = {
3457 	NULL,			/* qi_putp */
3458 	NULL,			/* qi_srvp */
3459 	bridge_open,		/* qi_qopen */
3460 	bridge_close,		/* qi_qclose */
3461 	NULL,			/* qi_qadmin */
3462 	&bridge_modinfo,	/* qi_minfo */
3463 	NULL			/* qi_mstat */
3464 };
3465 
3466 static struct qinit bridge_winit = {
3467 	(int (*)())bridge_wput, /* qi_putp */
3468 	NULL,			/* qi_srvp */
3469 	NULL,			/* qi_qopen */
3470 	NULL,			/* qi_qclose */
3471 	NULL,			/* qi_qadmin */
3472 	&bridge_modinfo,	/* qi_minfo */
3473 	NULL			/* qi_mstat */
3474 };
3475 
3476 static struct streamtab bridge_tab = {
3477 	&bridge_rinit,	/* st_rdinit */
3478 	&bridge_winit	/* st_wrinit */
3479 };
3480 
3481 /* No STREAMS perimeters; we do all our own locking */
3482 DDI_DEFINE_STREAM_OPS(bridge_ops, nulldev, nulldev, bridge_attach,
3483     bridge_detach, nodev, bridge_info, D_NEW | D_MP, &bridge_tab,
3484     ddi_quiesce_not_supported);
3485 
3486 static struct modldrv modldrv = {
3487 	&mod_driverops,
3488 	"bridging driver",
3489 	&bridge_ops
3490 };
3491 
3492 static struct modlinkage modlinkage = {
3493 	MODREV_1,
3494 	(void *)&modldrv,
3495 	NULL
3496 };
3497 
3498 int
3499 _init(void)
3500 {
3501 	int retv;
3502 
3503 	bridge_inst_init();
3504 	if ((retv = mod_install(&modlinkage)) != 0)
3505 		bridge_inst_fini();
3506 	return (retv);
3507 }
3508 
3509 int
3510 _fini(void)
3511 {
3512 	int retv;
3513 
3514 	rw_enter(&bmac_rwlock, RW_READER);
3515 	retv = list_is_empty(&bmac_list) ? 0 : EBUSY;
3516 	rw_exit(&bmac_rwlock);
3517 	if (retv == 0 &&
3518 	    (retv = mod_remove(&modlinkage)) == 0)
3519 		bridge_inst_fini();
3520 	return (retv);
3521 }
3522 
3523 int
3524 _info(struct modinfo *modinfop)
3525 {
3526 	return (mod_info(&modlinkage, modinfop));
3527 }
3528