xref: /titanic_44/usr/src/uts/common/io/bridge.c (revision fc256490629fe68815f7e0f23cf9b3545720cfac)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * This module implements a STREAMS driver that provides layer-two (Ethernet)
29  * bridging functionality.  The STREAMS interface is used to provide
30  * observability (snoop/wireshark) and control, but not for interface plumbing.
31  */
32 
33 #include <sys/types.h>
34 #include <sys/bitmap.h>
35 #include <sys/cmn_err.h>
36 #include <sys/conf.h>
37 #include <sys/ddi.h>
38 #include <sys/errno.h>
39 #include <sys/kstat.h>
40 #include <sys/modctl.h>
41 #include <sys/note.h>
42 #include <sys/param.h>
43 #include <sys/policy.h>
44 #include <sys/sdt.h>
45 #include <sys/stat.h>
46 #include <sys/stream.h>
47 #include <sys/stropts.h>
48 #include <sys/strsun.h>
49 #include <sys/sunddi.h>
50 #include <sys/sysmacros.h>
51 #include <sys/systm.h>
52 #include <sys/time.h>
53 #include <sys/dlpi.h>
54 #include <sys/dls.h>
55 #include <sys/mac_ether.h>
56 #include <sys/mac_provider.h>
57 #include <sys/mac_client_priv.h>
58 #include <sys/mac_impl.h>
59 #include <sys/vlan.h>
60 #include <net/bridge.h>
61 #include <net/bridge_impl.h>
62 #include <net/trill.h>
63 #include <sys/dld_ioc.h>
64 
65 /*
66  * Locks and reference counts: object lifetime and design.
67  *
68  * bridge_mac_t
69  *   Bridge mac (snoop) instances are in bmac_list, which is protected by
70  *   bmac_rwlock.  They're allocated by bmac_alloc and freed by bridge_timer().
71  *   Every bridge_inst_t has a single bridge_mac_t, but when bridge_inst_t goes
72  *   away, the bridge_mac_t remains until either all of the users go away
73  *   (detected by a timer) or until the instance is picked up again by the same
74  *   bridge starting back up.
75  *
76  * bridge_inst_t
77  *   Bridge instances are in inst_list, which is protected by inst_lock.
78  *   They're allocated by inst_alloc() and freed by inst_free().  After
79  *   allocation, an instance is placed in inst_list, and the reference count is
80  *   incremented to represent this.  That reference is decremented when the
81  *   BIF_SHUTDOWN flag is set, and no new increments may occur.  When the last
82  *   reference is freed, the instance is removed from the list.
83  *
84  *   Bridge instances have lists of links and an AVL tree of forwarding
85  *   entries.  Each of these structures holds one reference on the bridge
86  *   instance.  These lists and tree are protected by bi_rwlock.
87  *
88  * bridge_stream_t
89  *   Bridge streams are allocated by stream_alloc() and freed by stream_free().
90  *   These streams are created when "bridged" opens /dev/bridgectl, and are
91  *   used to create new bridge instances (via BRIOC_NEWBRIDGE) and control the
92  *   links on the bridge.  When a stream closes, the bridge instance created is
93  *   destroyed.  There's at most one bridge instance for a given control
94  *   stream.
95  *
96  * bridge_link_t
97  *   Links are allocated by bridge_add_link() and freed by link_free().  The
98  *   bi_links list holds a reference to the link.  When the BLF_DELETED flag is
99  *   set, that reference is dropped.  The link isn't removed from the list
100  *   until the last reference drops.  Each forwarding entry that uses a given
101  *   link holds a reference, as does each thread transmitting a packet via the
102  *   link.  The MAC layer calls in via bridge_ref_cb() to hold a reference on
103  *   a link when transmitting.
104  *
105  *   It's important that once BLF_DELETED is set, there's no way for the
106  *   reference count to increase again.  If it can, then the link may be
107  *   double-freed.  The BLF_FREED flag is intended for use with assertions to
108  *   guard against this in testing.
109  *
110  * bridge_fwd_t
111  *   Bridge forwarding entries are allocated by bridge_recv_cb() and freed by
112  *   fwd_free().  The bi_fwd AVL tree holds one reference to the entry.  Unlike
113  *   other data structures, the reference is dropped when the entry is removed
114  *   from the tree by fwd_delete(), and the BFF_INTREE flag is removed.  Each
115  *   thread that's forwarding a packet to a known destination holds a reference
116  *   to a forwarding entry.
117  *
118  * TRILL notes:
119  *
120  *   The TRILL module does all of its I/O through bridging.  It uses references
121  *   on the bridge_inst_t and bridge_link_t structures, and has seven entry
122  *   points and four callbacks.  One entry point is for setting the callbacks
123  *   (bridge_trill_register_cb).  There are four entry points for taking bridge
124  *   and link references (bridge_trill_{br,ln}{ref,unref}).  The final two
125  *   entry points are for decapsulated packets from TRILL (bridge_trill_decaps)
126  *   that need to be bridged locally, and for TRILL-encapsulated output packets
127  *   (bridge_trill_output).
128  *
129  *   The four callbacks comprise two notification functions for bridges and
130  *   links being deleted, one function for raw received TRILL packets, and one
131  *   for bridge output to non-local TRILL destinations (tunnel entry).
132  */
133 
/*
 * Ethernet reserved multicast addresses for TRILL; used also in TRILL module.
 * all_isis_rbridges and bridge_group_address have external linkage, while
 * all_esadi_rbridges is private to this file.
 */
const uint8_t all_isis_rbridges[] = ALL_ISIS_RBRIDGES;
static const uint8_t all_esadi_rbridges[] = ALL_ESADI_RBRIDGES;
const uint8_t bridge_group_address[] = BRIDGE_GROUP_ADDRESS;

/*
 * Named-kstat name tables.  inst_kstats_list is handed to kstat_setup() by
 * bridge_create(); link_kstats_list is presumably its per-link counterpart
 * (its consumer is not in this part of the file).
 */
static const char *inst_kstats_list[] = { KSINST_NAMES };
static const char *link_kstats_list[] = { KSLINK_NAMES };
143 
/*
 * Named-kstat accessors.
 *
 * KREF(p, m, vn) names the 64-bit counter "vn" inside the kstat block "m" of
 * the object pointed to by "p"; KINCR/KDECR pre-increment/pre-decrement it.
 * The pointer argument is parenthesized so that any pointer expression (not
 * just a plain identifier) can be passed safely.
 *
 * KIPINCR/KIPDECR operate on a bridge instance's bi_kstats, KLPINCR on a
 * link's bl_kstats.  KIINCR/KIDECR/KLINCR are shorthands that require a local
 * variable named "bip" (or "blp") to be in scope at the point of use.
 */
#define	KREF(p, m, vn)	((p)->m.vn.value.ui64)
#define	KINCR(p, m, vn)	(++KREF(p, m, vn))
#define	KDECR(p, m, vn)	(--KREF(p, m, vn))

#define	KIPINCR(p, vn)	KINCR(p, bi_kstats, vn)
#define	KIPDECR(p, vn)	KDECR(p, bi_kstats, vn)
#define	KLPINCR(p, vn)	KINCR(p, bl_kstats, vn)

#define	KIINCR(vn)	KIPINCR(bip, vn)
#define	KIDECR(vn)	KIPDECR(bip, vn)
#define	KLINCR(vn)	KLPINCR(blp, vn)

/* Number of elements in a true array (not valid on pointers) */
#define	Dim(x)		(sizeof (x) / sizeof (*(x)))

/* Amount of overhead added when encapsulating with VLAN headers */
#define	VLAN_INCR	(sizeof (struct ether_vlan_header) -	\
			sizeof (struct ether_header))
161 
/*
 * Driver-wide handles.  bridge_dev_info is supplied as m_dip when a bridge
 * mac is registered, and bridge_major is combined with the mac minor number
 * to form bi_dev.  bridge_taskq is presumably the taskq referred to by the
 * stream_ref()/stream_unref() comments -- its users are not in this part of
 * the file.
 */
static dev_info_t *bridge_dev_info;
static major_t bridge_major;
static ddi_taskq_t *bridge_taskq;

/*
 * These are the bridge instance management data structures.  The mutex lock
 * protects the list of bridge instances.  A reference count is then used on
 * each instance to determine when to free it.  We use mac_minor_hold() to
 * allocate minor_t values, which are used both for self-cloning /dev/net/
 * device nodes as well as client streams.  Minor node 0 is reserved for the
 * allocation control node.
 */
static list_t inst_list;
static kcondvar_t inst_cv;		/* Allows us to wait for shutdown */
static kmutex_t inst_lock;

/* List of bridge_mac_t structures and the rwlock that protects it */
static krwlock_t bmac_rwlock;
static list_t bmac_list;

/* Wait for taskq entries that use STREAMS */
static kcondvar_t stream_ref_cv;
static kmutex_t stream_ref_lock;

/*
 * bridge_timerid holds the id returned by timeout() when bridge_timer() is
 * scheduled; bridge_scan_interval is the delay passed to that timeout().
 * bridge_fwd_age is presumably the forwarding-entry aging limit -- TODO
 * confirm against bridge_timer().
 */
static timeout_id_t bridge_timerid;
static clock_t bridge_scan_interval;
static clock_t bridge_fwd_age;

/* Forward declarations for functions used before their definitions */
static bridge_inst_t *bridge_find_name(const char *);
static void bridge_timer(void *);
static void bridge_unref(bridge_inst_t *);

/* All-zeros Ethernet address; means "no address" where it is compared */
static const uint8_t zero_addr[ETHERADDRL] = { 0 };

/* Global TRILL linkage */
static trill_recv_pkt_t trill_recv_fn;
static trill_encap_pkt_t trill_encap_fn;
static trill_br_dstr_t trill_brdstr_fn;
static trill_ln_dstr_t trill_lndstr_fn;
200 
/*
 * special settings to accommodate DLD flow control; see dld_str.c
 *
 * The three tables below describe the STREAMS module information and the
 * read-side and write-side queue initialization.  Open and close are routed
 * to dld_open()/dld_close() on the read side, while put and service are
 * routed to dld_wput()/dld_wsrv() on the write side.
 */
static struct module_info bridge_dld_modinfo = {
	0,			/* mi_idnum */
	BRIDGE_DEV_NAME,	/* mi_idname */
	0,			/* mi_minpsz */
	INFPSZ,			/* mi_maxpsz */
	1,			/* mi_hiwat */
	0			/* mi_lowat */
};

/* Read side: no put or service procedure, only open and close */
static struct qinit bridge_dld_rinit = {
	NULL,			/* qi_putp */
	NULL,			/* qi_srvp */
	dld_open,		/* qi_qopen */
	dld_close,		/* qi_qclose */
	NULL,			/* qi_qadmin */
	&bridge_dld_modinfo,	/* qi_minfo */
	NULL			/* qi_mstat */
};

/* Write side: put and service only; the casts adapt the dld prototypes */
static struct qinit bridge_dld_winit = {
	(int (*)())dld_wput,	/* qi_putp */
	(int (*)())dld_wsrv,	/* qi_srvp */
	NULL,			/* qi_qopen */
	NULL,			/* qi_qclose */
	NULL,			/* qi_qadmin */
	&bridge_dld_modinfo,	/* qi_minfo */
	NULL			/* qi_mstat */
};
230 
/* Handler for BRIDGE_IOC_LISTFWD; defined below */
static int bridge_ioc_listfwd(void *, intptr_t, int, cred_t *, int *);

/*
 * GLDv3 control ioctls used by Bridging.  The single entry maps
 * BRIDGE_IOC_LISTFWD to bridge_ioc_listfwd(), with a bridge_listfwd_t
 * argument that is copied both in and out (DLDCOPYINOUT).
 */
static dld_ioc_info_t bridge_ioc_list[] = {
	{BRIDGE_IOC_LISTFWD, DLDCOPYINOUT, sizeof (bridge_listfwd_t),
	    bridge_ioc_listfwd, NULL},
};
238 
239 /*
240  * Given a bridge mac pointer, get a ref-held pointer to the corresponding
241  * bridge instance, if any.  We must hold the global bmac_rwlock so that
242  * bm_inst doesn't slide out from under us.
243  */
244 static bridge_inst_t *
245 mac_to_inst(const bridge_mac_t *bmp)
246 {
247 	bridge_inst_t *bip;
248 
249 	rw_enter(&bmac_rwlock, RW_READER);
250 	if ((bip = bmp->bm_inst) != NULL)
251 		atomic_inc_uint(&bip->bi_refs);
252 	rw_exit(&bmac_rwlock);
253 	return (bip);
254 }
255 
256 static void
257 link_sdu_fail(bridge_link_t *blp, boolean_t failed, mblk_t **mlist)
258 {
259 	mblk_t *mp;
260 	bridge_ctl_t *bcp;
261 	bridge_link_t *blcmp;
262 	bridge_inst_t *bip;
263 	bridge_mac_t *bmp;
264 
265 	if (failed) {
266 		if (blp->bl_flags & BLF_SDUFAIL)
267 			return;
268 		blp->bl_flags |= BLF_SDUFAIL;
269 	} else {
270 		if (!(blp->bl_flags & BLF_SDUFAIL))
271 			return;
272 		blp->bl_flags &= ~BLF_SDUFAIL;
273 	}
274 
275 	/*
276 	 * If this link is otherwise up, then check if there are any other
277 	 * non-failed non-down links.  If not, then we control the state of the
278 	 * whole bridge.
279 	 */
280 	bip = blp->bl_inst;
281 	bmp = bip->bi_mac;
282 	if (blp->bl_linkstate != LINK_STATE_DOWN) {
283 		for (blcmp = list_head(&bip->bi_links); blcmp != NULL;
284 		    blcmp = list_next(&bip->bi_links, blcmp)) {
285 			if (blp != blcmp &&
286 			    !(blcmp->bl_flags & (BLF_DELETED|BLF_SDUFAIL)) &&
287 			    blcmp->bl_linkstate != LINK_STATE_DOWN)
288 				break;
289 		}
290 		if (blcmp == NULL) {
291 			bmp->bm_linkstate = failed ? LINK_STATE_DOWN :
292 			    LINK_STATE_UP;
293 			mac_link_redo(bmp->bm_mh, bmp->bm_linkstate);
294 		}
295 	}
296 
297 	/*
298 	 * If we're becoming failed, then the link's current true state needs
299 	 * to be reflected upwards to this link's clients.  If we're becoming
300 	 * unfailed, then we get the state of the bridge instead on all
301 	 * clients.
302 	 */
303 	if (failed) {
304 		if (bmp->bm_linkstate != blp->bl_linkstate)
305 			mac_link_redo(blp->bl_mh, blp->bl_linkstate);
306 	} else {
307 		mac_link_redo(blp->bl_mh, bmp->bm_linkstate);
308 	}
309 
310 	/* get the current mblk we're going to send up */
311 	if ((mp = blp->bl_lfailmp) == NULL &&
312 	    (mp = allocb(sizeof (bridge_ctl_t), BPRI_MED)) == NULL)
313 		return;
314 
315 	/* get a new one for next time */
316 	blp->bl_lfailmp = allocb(sizeof (bridge_ctl_t), BPRI_MED);
317 
318 	/* if none for next time, then report only failures */
319 	if (blp->bl_lfailmp == NULL && !failed) {
320 		blp->bl_lfailmp = mp;
321 		return;
322 	}
323 
324 	/* LINTED: alignment */
325 	bcp = (bridge_ctl_t *)mp->b_rptr;
326 	bcp->bc_linkid = blp->bl_linkid;
327 	bcp->bc_failed = failed;
328 	mp->b_wptr = (uchar_t *)(bcp + 1);
329 	mp->b_next = *mlist;
330 	*mlist = mp;
331 }
332 
333 /*
334  * Send control messages (link SDU changes) using the stream to the
335  * bridge instance daemon.
336  */
337 static void
338 send_up_messages(bridge_inst_t *bip, mblk_t *mp)
339 {
340 	mblk_t *mnext;
341 	queue_t *rq;
342 
343 	rq = bip->bi_control->bs_wq;
344 	rq = OTHERQ(rq);
345 	while (mp != NULL) {
346 		mnext = mp->b_next;
347 		mp->b_next = NULL;
348 		putnext(rq, mp);
349 		mp = mnext;
350 	}
351 }
352 
/*
 * Statistics callback installed in bridge_m_callbacks.  No statistics are
 * provided through this interface: every request is rejected with ENOTSUP
 * and *val is left untouched.
 */
/* ARGSUSED */
static int
bridge_m_getstat(void *arg, uint_t stat, uint64_t *val)
{
	return (ENOTSUP);
}
359 
360 static int
361 bridge_m_start(void *arg)
362 {
363 	bridge_mac_t *bmp = arg;
364 
365 	bmp->bm_flags |= BMF_STARTED;
366 	return (0);
367 }
368 
369 static void
370 bridge_m_stop(void *arg)
371 {
372 	bridge_mac_t *bmp = arg;
373 
374 	bmp->bm_flags &= ~BMF_STARTED;
375 }
376 
/*
 * Promiscuous-mode callback.  The request is accepted and ignored: success is
 * reported whether "on" is true or false, and no state is changed.
 */
/* ARGSUSED */
static int
bridge_m_setpromisc(void *arg, boolean_t on)
{
	return (0);
}
383 
/*
 * Multicast add/remove callback.  The request is accepted and ignored:
 * success is reported for any address, and no state is changed.
 */
/* ARGSUSED */
static int
bridge_m_multicst(void *arg, boolean_t add, const uint8_t *mca)
{
	return (0);
}
390 
/*
 * Unicast-address callback.  Setting an address on the bridge node is not
 * supported; every request is rejected with ENOTSUP.
 */
/* ARGSUSED */
static int
bridge_m_unicst(void *arg, const uint8_t *macaddr)
{
	return (ENOTSUP);
}
397 
/*
 * Transmit callback.  Nothing is ever sent on the bridge node itself, so the
 * whole message chain is freed.  Returns NULL, i.e. no part of the chain is
 * handed back to the caller.
 */
static mblk_t *
bridge_m_tx(void *arg, mblk_t *mp)
{
	_NOTE(ARGUNUSED(arg));
	freemsgchain(mp);
	return (NULL);
}
405 
406 /* ARGSUSED */
407 static int
408 bridge_ioc_listfwd(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
409 {
410 	bridge_listfwd_t *blf = karg;
411 	bridge_inst_t *bip;
412 	bridge_fwd_t *bfp, match;
413 	avl_index_t where;
414 
415 	bip = bridge_find_name(blf->blf_name);
416 	if (bip == NULL)
417 		return (ENOENT);
418 
419 	bcopy(blf->blf_dest, match.bf_dest, ETHERADDRL);
420 	match.bf_flags |= BFF_VLANLOCAL;
421 	rw_enter(&bip->bi_rwlock, RW_READER);
422 	if ((bfp = avl_find(&bip->bi_fwd, &match, &where)) == NULL)
423 		bfp = avl_nearest(&bip->bi_fwd, where, AVL_AFTER);
424 	else
425 		bfp = AVL_NEXT(&bip->bi_fwd, bfp);
426 	if (bfp == NULL) {
427 		bzero(blf, sizeof (*blf));
428 	} else {
429 		bcopy(bfp->bf_dest, blf->blf_dest, ETHERADDRL);
430 		blf->blf_trill_nick = bfp->bf_trill_nick;
431 		blf->blf_ms_age =
432 		    drv_hztousec(ddi_get_lbolt() - bfp->bf_lastheard) / 1000;
433 		blf->blf_is_local =
434 		    (bfp->bf_flags & BFF_LOCALADDR) != 0;
435 		blf->blf_linkid = bfp->bf_links[0]->bl_linkid;
436 	}
437 	rw_exit(&bip->bi_rwlock);
438 	bridge_unref(bip);
439 	return (0);
440 }
441 
442 static int
443 bridge_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
444     uint_t pr_valsize, const void *pr_val)
445 {
446 	bridge_mac_t *bmp = arg;
447 	bridge_inst_t *bip;
448 	bridge_link_t *blp;
449 	int err;
450 	uint_t maxsdu;
451 	mblk_t *mlist;
452 
453 	_NOTE(ARGUNUSED(pr_name));
454 	switch (pr_num) {
455 	case MAC_PROP_MTU:
456 		if (pr_valsize < sizeof (bmp->bm_maxsdu)) {
457 			err = EINVAL;
458 			break;
459 		}
460 		(void) bcopy(pr_val, &maxsdu, sizeof (maxsdu));
461 		if (maxsdu == bmp->bm_maxsdu) {
462 			err = 0;
463 		} else if ((bip = mac_to_inst(bmp)) == NULL) {
464 			err = ENXIO;
465 		} else {
466 			rw_enter(&bip->bi_rwlock, RW_WRITER);
467 			mlist = NULL;
468 			for (blp = list_head(&bip->bi_links); blp != NULL;
469 			    blp = list_next(&bip->bi_links, blp)) {
470 				if (blp->bl_flags & BLF_DELETED)
471 					continue;
472 				if (blp->bl_maxsdu == maxsdu)
473 					link_sdu_fail(blp, B_FALSE, &mlist);
474 				else if (blp->bl_maxsdu == bmp->bm_maxsdu)
475 					link_sdu_fail(blp, B_TRUE, &mlist);
476 			}
477 			rw_exit(&bip->bi_rwlock);
478 			bmp->bm_maxsdu = maxsdu;
479 			(void) mac_maxsdu_update(bmp->bm_mh, maxsdu);
480 			send_up_messages(bip, mlist);
481 			bridge_unref(bip);
482 			err = 0;
483 		}
484 		break;
485 
486 	default:
487 		err = ENOTSUP;
488 		break;
489 	}
490 	return (err);
491 }
492 
493 static int
494 bridge_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
495     uint_t pr_flags, uint_t pr_valsize, void *pr_val, uint_t *perm)
496 {
497 	bridge_mac_t *bmp = arg;
498 	int err = 0;
499 
500 	_NOTE(ARGUNUSED(pr_name));
501 	switch (pr_num) {
502 	case MAC_PROP_MTU: {
503 		mac_propval_range_t range;
504 
505 		if (!(pr_flags & MAC_PROP_POSSIBLE))
506 			return (ENOTSUP);
507 		if (pr_valsize < sizeof (mac_propval_range_t))
508 			return (EINVAL);
509 		range.mpr_count = 1;
510 		range.mpr_type = MAC_PROPVAL_UINT32;
511 		range.range_uint32[0].mpur_min =
512 		    range.range_uint32[0].mpur_max = bmp->bm_maxsdu;
513 		bcopy(&range, pr_val, sizeof (range));
514 		*perm = MAC_PROP_PERM_RW;
515 		break;
516 	}
517 	case MAC_PROP_STATUS:
518 		if (pr_valsize < sizeof (bmp->bm_linkstate)) {
519 			err = EINVAL;
520 		} else {
521 			bcopy(&bmp->bm_linkstate, pr_val,
522 			    sizeof (&bmp->bm_linkstate));
523 			*perm = MAC_PROP_PERM_READ;
524 		}
525 		break;
526 
527 	default:
528 		err = ENOTSUP;
529 		break;
530 	}
531 	return (err);
532 }
533 
/*
 * Callback vector for the bridge observability mac; bmac_alloc() installs it
 * as m_callbacks before mac_register().  The first member advertises that the
 * optional setprop and getprop callbacks are provided; the remaining optional
 * slots are left NULL.
 */
static mac_callbacks_t bridge_m_callbacks = {
	MC_SETPROP | MC_GETPROP,
	bridge_m_getstat,
	bridge_m_start,
	bridge_m_stop,
	bridge_m_setpromisc,
	bridge_m_multicst,
	bridge_m_unicst,
	bridge_m_tx,
	NULL,	/* ioctl */
	NULL,	/* getcapab */
	NULL,	/* open */
	NULL,	/* close */
	bridge_m_setprop,
	bridge_m_getprop
};
550 
551 /*
552  * Create kstats from a list.
553  */
554 static kstat_t *
555 kstat_setup(kstat_named_t *knt, const char **names, int nstat,
556     const char *unitname)
557 {
558 	kstat_t *ksp;
559 	int i;
560 
561 	for (i = 0; i < nstat; i++)
562 		kstat_named_init(&knt[i], names[i], KSTAT_DATA_UINT64);
563 
564 	ksp = kstat_create_zone(BRIDGE_DEV_NAME, 0, unitname, "net",
565 	    KSTAT_TYPE_NAMED, nstat, KSTAT_FLAG_VIRTUAL, GLOBAL_ZONEID);
566 	if (ksp != NULL) {
567 		ksp->ks_data = knt;
568 		kstat_install(ksp);
569 	}
570 	return (ksp);
571 }
572 
573 /*
574  * Find an existing bridge_mac_t structure or allocate a new one for the given
575  * bridge instance.  This creates the mac driver instance that snoop can use.
576  */
577 static int
578 bmac_alloc(bridge_inst_t *bip, bridge_mac_t **bmacp)
579 {
580 	bridge_mac_t *bmp, *bnew;
581 	mac_register_t *mac;
582 	int err;
583 
584 	*bmacp = NULL;
585 	if ((mac = mac_alloc(MAC_VERSION)) == NULL)
586 		return (EINVAL);
587 
588 	bnew = kmem_zalloc(sizeof (*bnew), KM_SLEEP);
589 
590 	rw_enter(&bmac_rwlock, RW_WRITER);
591 	for (bmp = list_head(&bmac_list); bmp != NULL;
592 	    bmp = list_next(&bmac_list, bmp)) {
593 		if (strcmp(bip->bi_name, bmp->bm_name) == 0) {
594 			ASSERT(bmp->bm_inst == NULL);
595 			bmp->bm_inst = bip;
596 			rw_exit(&bmac_rwlock);
597 			kmem_free(bnew, sizeof (*bnew));
598 			mac_free(mac);
599 			*bmacp = bmp;
600 			return (0);
601 		}
602 	}
603 
604 	mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
605 	mac->m_driver = bnew;
606 	mac->m_dip = bridge_dev_info;
607 	mac->m_instance = (uint_t)-1;
608 	mac->m_src_addr = (uint8_t *)zero_addr;
609 	mac->m_callbacks = &bridge_m_callbacks;
610 
611 	/*
612 	 * Note that the SDU limits are irrelevant, as nobody transmits on the
613 	 * bridge node itself.  It's mainly for monitoring but we allow
614 	 * setting the bridge MTU for quick transition of all links part of the
615 	 * bridge to a new MTU.
616 	 */
617 	mac->m_min_sdu = 1;
618 	mac->m_max_sdu = 1500;
619 	err = mac_register(mac, &bnew->bm_mh);
620 	mac_free(mac);
621 	if (err != 0) {
622 		rw_exit(&bmac_rwlock);
623 		kmem_free(bnew, sizeof (*bnew));
624 		return (err);
625 	}
626 
627 	bnew->bm_inst = bip;
628 	(void) strcpy(bnew->bm_name, bip->bi_name);
629 	if (list_is_empty(&bmac_list)) {
630 		bridge_timerid = timeout(bridge_timer, NULL,
631 		    bridge_scan_interval);
632 	}
633 	list_insert_tail(&bmac_list, bnew);
634 	rw_exit(&bmac_rwlock);
635 
636 	/*
637 	 * Mark the MAC as unable to go "active" so that only passive clients
638 	 * (such as snoop) can bind to it.
639 	 */
640 	mac_no_active(bnew->bm_mh);
641 	*bmacp = bnew;
642 	return (0);
643 }
644 
645 /*
646  * Disconnect the given bridge_mac_t from its bridge instance.  The bridge
647  * instance is going away.  The mac instance can't go away until the clients
648  * are gone (see bridge_timer).
649  */
650 static void
651 bmac_disconnect(bridge_mac_t *bmp)
652 {
653 	bridge_inst_t *bip;
654 
655 	bmp->bm_linkstate = LINK_STATE_DOWN;
656 	mac_link_redo(bmp->bm_mh, LINK_STATE_DOWN);
657 
658 	rw_enter(&bmac_rwlock, RW_READER);
659 	bip = bmp->bm_inst;
660 	bip->bi_mac = NULL;
661 	bmp->bm_inst = NULL;
662 	rw_exit(&bmac_rwlock);
663 }
664 
665 /* This is used by the avl trees to sort forwarding table entries */
666 static int
667 fwd_compare(const void *addr1, const void *addr2)
668 {
669 	const bridge_fwd_t *fwd1 = addr1;
670 	const bridge_fwd_t *fwd2 = addr2;
671 	int diff = memcmp(fwd1->bf_dest, fwd2->bf_dest, ETHERADDRL);
672 
673 	if (diff != 0)
674 		return (diff > 0 ? 1 : -1);
675 
676 	if ((fwd1->bf_flags ^ fwd2->bf_flags) & BFF_VLANLOCAL) {
677 		if (fwd1->bf_vlanid > fwd2->bf_vlanid)
678 			return (1);
679 		else if (fwd1->bf_vlanid < fwd2->bf_vlanid)
680 			return (-1);
681 	}
682 	return (0);
683 }
684 
685 static void
686 inst_free(bridge_inst_t *bip)
687 {
688 	ASSERT(bip->bi_mac == NULL);
689 	rw_destroy(&bip->bi_rwlock);
690 	list_destroy(&bip->bi_links);
691 	cv_destroy(&bip->bi_linkwait);
692 	avl_destroy(&bip->bi_fwd);
693 	if (bip->bi_ksp != NULL)
694 		kstat_delete(bip->bi_ksp);
695 	kmem_free(bip, sizeof (*bip));
696 }
697 
698 static bridge_inst_t *
699 inst_alloc(const char *bridge)
700 {
701 	bridge_inst_t *bip;
702 
703 	bip = kmem_zalloc(sizeof (*bip), KM_SLEEP);
704 	bip->bi_refs = 1;
705 	(void) strcpy(bip->bi_name, bridge);
706 	rw_init(&bip->bi_rwlock, NULL, RW_DRIVER, NULL);
707 	list_create(&bip->bi_links, sizeof (bridge_link_t),
708 	    offsetof(bridge_link_t, bl_node));
709 	cv_init(&bip->bi_linkwait, NULL, CV_DRIVER, NULL);
710 	avl_create(&bip->bi_fwd, fwd_compare, sizeof (bridge_fwd_t),
711 	    offsetof(bridge_fwd_t, bf_node));
712 	return (bip);
713 }
714 
715 static bridge_inst_t *
716 bridge_find_name(const char *bridge)
717 {
718 	bridge_inst_t *bip;
719 
720 	mutex_enter(&inst_lock);
721 	for (bip = list_head(&inst_list); bip != NULL;
722 	    bip = list_next(&inst_list, bip)) {
723 		if (!(bip->bi_flags & BIF_SHUTDOWN) &&
724 		    strcmp(bridge, bip->bi_name) == 0) {
725 			atomic_inc_uint(&bip->bi_refs);
726 			break;
727 		}
728 	}
729 	mutex_exit(&inst_lock);
730 
731 	return (bip);
732 }
733 
734 static int
735 bridge_create(datalink_id_t linkid, const char *bridge, bridge_inst_t **bipc,
736     cred_t *cred)
737 {
738 	bridge_inst_t *bip, *bipnew;
739 	bridge_mac_t *bmp = NULL;
740 	int err;
741 
742 	*bipc = NULL;
743 	bipnew = inst_alloc(bridge);
744 
745 	mutex_enter(&inst_lock);
746 lookup_retry:
747 	for (bip = list_head(&inst_list); bip != NULL;
748 	    bip = list_next(&inst_list, bip)) {
749 		if (strcmp(bridge, bip->bi_name) == 0)
750 			break;
751 	}
752 
753 	/* This should not take long; if it does, we've got a design problem */
754 	if (bip != NULL && (bip->bi_flags & BIF_SHUTDOWN)) {
755 		cv_wait(&inst_cv, &inst_lock);
756 		goto lookup_retry;
757 	}
758 
759 	if (bip == NULL) {
760 		bip = bipnew;
761 		bipnew = NULL;
762 		list_insert_tail(&inst_list, bip);
763 	}
764 
765 	mutex_exit(&inst_lock);
766 	if (bipnew != NULL) {
767 		inst_free(bipnew);
768 		return (EEXIST);
769 	}
770 
771 	bip->bi_ksp = kstat_setup((kstat_named_t *)&bip->bi_kstats,
772 	    inst_kstats_list, Dim(inst_kstats_list), bip->bi_name);
773 
774 	err = bmac_alloc(bip, &bmp);
775 	if ((bip->bi_mac = bmp) == NULL)
776 		goto fail_create;
777 
778 	/*
779 	 * bm_inst is set, so the timer cannot yank the DLS rug from under us.
780 	 * No extra locking is needed here.
781 	 */
782 	if (!(bmp->bm_flags & BMF_DLS)) {
783 		err = dls_devnet_create(bmp->bm_mh, linkid, crgetzoneid(cred));
784 		if (err != 0)
785 			goto fail_create;
786 		bmp->bm_flags |= BMF_DLS;
787 	}
788 
789 	bip->bi_dev = makedevice(bridge_major, mac_minor(bmp->bm_mh));
790 	*bipc = bip;
791 	return (0);
792 
793 fail_create:
794 	ASSERT(bip->bi_trilldata == NULL);
795 	bip->bi_flags |= BIF_SHUTDOWN;
796 	bridge_unref(bip);
797 	return (err);
798 }
799 
800 static void
801 bridge_unref(bridge_inst_t *bip)
802 {
803 	if (atomic_dec_uint_nv(&bip->bi_refs) == 0) {
804 		ASSERT(bip->bi_flags & BIF_SHUTDOWN);
805 		/* free up mac for reuse before leaving global list */
806 		if (bip->bi_mac != NULL)
807 			bmac_disconnect(bip->bi_mac);
808 		mutex_enter(&inst_lock);
809 		list_remove(&inst_list, bip);
810 		cv_broadcast(&inst_cv);
811 		mutex_exit(&inst_lock);
812 		inst_free(bip);
813 	}
814 }
815 
816 /*
817  * Stream instances are used only for allocating bridges and serving as a
818  * control node.  They serve no data-handling function.
819  */
820 static bridge_stream_t *
821 stream_alloc(void)
822 {
823 	bridge_stream_t *bsp;
824 	minor_t mn;
825 
826 	if ((mn = mac_minor_hold(B_FALSE)) == 0)
827 		return (NULL);
828 	bsp = kmem_zalloc(sizeof (*bsp), KM_SLEEP);
829 	bsp->bs_minor = mn;
830 	return (bsp);
831 }
832 
/*
 * Release a stream instance created by stream_alloc(): give back its minor
 * number, then free the structure.
 */
static void
stream_free(bridge_stream_t *bsp)
{
	mac_minor_rele(bsp->bs_minor);
	kmem_free(bsp, sizeof (*bsp));
}
839 
/* Reference hold/release functions for STREAMS-related taskq */

/*
 * Take a taskq reference on a stream: bs_taskq_cnt is incremented under
 * stream_ref_lock.  Paired with stream_unref().
 */
static void
stream_ref(bridge_stream_t *bsp)
{
	mutex_enter(&stream_ref_lock);
	bsp->bs_taskq_cnt++;
	mutex_exit(&stream_ref_lock);
}
848 
849 static void
850 stream_unref(bridge_stream_t *bsp)
851 {
852 	mutex_enter(&stream_ref_lock);
853 	if (--bsp->bs_taskq_cnt == 0)
854 		cv_broadcast(&stream_ref_cv);
855 	mutex_exit(&stream_ref_lock);
856 }
857 
858 static void
859 link_free(bridge_link_t *blp)
860 {
861 	bridge_inst_t *bip = blp->bl_inst;
862 
863 	ASSERT(!(blp->bl_flags & BLF_FREED));
864 	blp->bl_flags |= BLF_FREED;
865 	if (blp->bl_ksp != NULL)
866 		kstat_delete(blp->bl_ksp);
867 	if (blp->bl_lfailmp != NULL)
868 		freeb(blp->bl_lfailmp);
869 	cv_destroy(&blp->bl_trillwait);
870 	mutex_destroy(&blp->bl_trilllock);
871 	kmem_free(blp, sizeof (*blp));
872 	/* Don't unreference the bridge until the MAC is closed */
873 	bridge_unref(bip);
874 }
875 
876 static void
877 link_unref(bridge_link_t *blp)
878 {
879 	if (atomic_dec_uint_nv(&blp->bl_refs) == 0) {
880 		bridge_inst_t *bip = blp->bl_inst;
881 
882 		ASSERT(blp->bl_flags & BLF_DELETED);
883 		rw_enter(&bip->bi_rwlock, RW_WRITER);
884 		if (blp->bl_flags & BLF_LINK_ADDED)
885 			list_remove(&bip->bi_links, blp);
886 		rw_exit(&bip->bi_rwlock);
887 		if (bip->bi_trilldata != NULL && list_is_empty(&bip->bi_links))
888 			cv_broadcast(&bip->bi_linkwait);
889 		link_free(blp);
890 	}
891 }
892 
893 static bridge_fwd_t *
894 fwd_alloc(const uint8_t *addr, uint_t nlinks, uint16_t nick)
895 {
896 	bridge_fwd_t *bfp;
897 
898 	bfp = kmem_zalloc(sizeof (*bfp) + (nlinks * sizeof (bridge_link_t *)),
899 	    KM_NOSLEEP);
900 	if (bfp != NULL) {
901 		bcopy(addr, bfp->bf_dest, ETHERADDRL);
902 		bfp->bf_lastheard = ddi_get_lbolt();
903 		bfp->bf_maxlinks = nlinks;
904 		bfp->bf_links = (bridge_link_t **)(bfp + 1);
905 		bfp->bf_trill_nick = nick;
906 	}
907 	return (bfp);
908 }
909 
910 static bridge_fwd_t *
911 fwd_find(bridge_inst_t *bip, const uint8_t *addr, uint16_t vlanid)
912 {
913 	bridge_fwd_t *bfp, *vbfp;
914 	bridge_fwd_t match;
915 
916 	bcopy(addr, match.bf_dest, ETHERADDRL);
917 	match.bf_flags = 0;
918 	rw_enter(&bip->bi_rwlock, RW_READER);
919 	if ((bfp = avl_find(&bip->bi_fwd, &match, NULL)) != NULL) {
920 		if (bfp->bf_vlanid != vlanid && bfp->bf_vcnt > 0) {
921 			match.bf_vlanid = vlanid;
922 			match.bf_flags = BFF_VLANLOCAL;
923 			vbfp = avl_find(&bip->bi_fwd, &match, NULL);
924 			if (vbfp != NULL)
925 				bfp = vbfp;
926 		}
927 		atomic_inc_uint(&bfp->bf_refs);
928 	}
929 	rw_exit(&bip->bi_rwlock);
930 	return (bfp);
931 }
932 
933 static void
934 fwd_free(bridge_fwd_t *bfp)
935 {
936 	uint_t i;
937 	bridge_inst_t *bip = bfp->bf_links[0]->bl_inst;
938 
939 	KIDECR(bki_count);
940 	for (i = 0; i < bfp->bf_nlinks; i++)
941 		link_unref(bfp->bf_links[i]);
942 	kmem_free(bfp,
943 	    sizeof (*bfp) + bfp->bf_maxlinks * sizeof (bridge_link_t *));
944 }
945 
946 static void
947 fwd_unref(bridge_fwd_t *bfp)
948 {
949 	if (atomic_dec_uint_nv(&bfp->bf_refs) == 0) {
950 		ASSERT(!(bfp->bf_flags & BFF_INTREE));
951 		fwd_free(bfp);
952 	}
953 }
954 
955 static void
956 fwd_delete(bridge_fwd_t *bfp)
957 {
958 	bridge_inst_t *bip;
959 	bridge_fwd_t *bfpzero;
960 
961 	if (bfp->bf_flags & BFF_INTREE) {
962 		ASSERT(bfp->bf_nlinks > 0);
963 		bip = bfp->bf_links[0]->bl_inst;
964 		rw_enter(&bip->bi_rwlock, RW_WRITER);
965 		/* Another thread could beat us to this */
966 		if (bfp->bf_flags & BFF_INTREE) {
967 			avl_remove(&bip->bi_fwd, bfp);
968 			bfp->bf_flags &= ~BFF_INTREE;
969 			if (bfp->bf_flags & BFF_VLANLOCAL) {
970 				bfp->bf_flags &= ~BFF_VLANLOCAL;
971 				bfpzero = avl_find(&bip->bi_fwd, bfp, NULL);
972 				if (bfpzero != NULL && bfpzero->bf_vcnt > 0)
973 					bfpzero->bf_vcnt--;
974 			}
975 			rw_exit(&bip->bi_rwlock);
976 			fwd_unref(bfp);		/* no longer in avl tree */
977 		} else {
978 			rw_exit(&bip->bi_rwlock);
979 		}
980 	}
981 }
982 
983 static boolean_t
984 fwd_insert(bridge_inst_t *bip, bridge_fwd_t *bfp)
985 {
986 	avl_index_t idx;
987 	boolean_t retv;
988 
989 	rw_enter(&bip->bi_rwlock, RW_WRITER);
990 	if (!(bip->bi_flags & BIF_SHUTDOWN) &&
991 	    avl_numnodes(&bip->bi_fwd) < bip->bi_tablemax &&
992 	    avl_find(&bip->bi_fwd, bfp, &idx) == NULL) {
993 		avl_insert(&bip->bi_fwd, bfp, idx);
994 		bfp->bf_flags |= BFF_INTREE;
995 		atomic_inc_uint(&bfp->bf_refs);	/* avl entry */
996 		retv = B_TRUE;
997 	} else {
998 		retv = B_FALSE;
999 	}
1000 	rw_exit(&bip->bi_rwlock);
1001 	return (retv);
1002 }
1003 
1004 static void
1005 fwd_update_local(bridge_link_t *blp, const uint8_t *oldaddr,
1006     const uint8_t *newaddr)
1007 {
1008 	bridge_inst_t *bip = blp->bl_inst;
1009 	bridge_fwd_t *bfp, *bfnew;
1010 	bridge_fwd_t match;
1011 	avl_index_t idx;
1012 	boolean_t drop_ref = B_FALSE;
1013 
1014 	if (bcmp(oldaddr, newaddr, ETHERADDRL) == 0)
1015 		return;
1016 
1017 	if (bcmp(oldaddr, zero_addr, ETHERADDRL) == 0)
1018 		goto no_old_addr;
1019 
1020 	/*
1021 	 * Find the previous entry, and remove our link from it.
1022 	 */
1023 	bcopy(oldaddr, match.bf_dest, ETHERADDRL);
1024 	rw_enter(&bip->bi_rwlock, RW_WRITER);
1025 	if ((bfp = avl_find(&bip->bi_fwd, &match, NULL)) != NULL) {
1026 		int i;
1027 
1028 		/*
1029 		 * See if we're in the list, and remove if so.
1030 		 */
1031 		for (i = 0; i < bfp->bf_nlinks; i++) {
1032 			if (bfp->bf_links[i] == blp) {
1033 				/*
1034 				 * We assume writes are atomic, so no special
1035 				 * MT handling is needed.  The list length is
1036 				 * decremented first, and then we remove
1037 				 * entries.
1038 				 */
1039 				bfp->bf_nlinks--;
1040 				for (; i < bfp->bf_nlinks; i++)
1041 					bfp->bf_links[i] = bfp->bf_links[i + 1];
1042 				drop_ref = B_TRUE;
1043 				break;
1044 			}
1045 		}
1046 		/* If no more links, then remove and free up */
1047 		if (bfp->bf_nlinks == 0) {
1048 			avl_remove(&bip->bi_fwd, bfp);
1049 			bfp->bf_flags &= ~BFF_INTREE;
1050 		} else {
1051 			bfp = NULL;
1052 		}
1053 	}
1054 	rw_exit(&bip->bi_rwlock);
1055 	if (bfp != NULL)
1056 		fwd_unref(bfp);		/* no longer in avl tree */
1057 
1058 	/*
1059 	 * Now get the new link address and add this link to the list.  The
1060 	 * list should be of length 1 unless the user has configured multiple
1061 	 * NICs with the same address.  (That's an incorrect configuration, but
1062 	 * we support it anyway.)
1063 	 */
1064 no_old_addr:
1065 	bfp = NULL;
1066 	if ((bip->bi_flags & BIF_SHUTDOWN) ||
1067 	    bcmp(newaddr, zero_addr, ETHERADDRL) == 0)
1068 		goto no_new_addr;
1069 
1070 	bcopy(newaddr, match.bf_dest, ETHERADDRL);
1071 	rw_enter(&bip->bi_rwlock, RW_WRITER);
1072 	if ((bfp = avl_find(&bip->bi_fwd, &match, &idx)) == NULL) {
1073 		bfnew = fwd_alloc(newaddr, 1, RBRIDGE_NICKNAME_NONE);
1074 		if (bfnew != NULL)
1075 			KIINCR(bki_count);
1076 	} else if (bfp->bf_nlinks < bfp->bf_maxlinks) {
1077 		/* special case: link fits in existing entry */
1078 		bfnew = bfp;
1079 	} else {
1080 		bfnew = fwd_alloc(newaddr, bfp->bf_nlinks + 1,
1081 		    RBRIDGE_NICKNAME_NONE);
1082 		if (bfnew != NULL) {
1083 			KIINCR(bki_count);
1084 			avl_remove(&bip->bi_fwd, bfp);
1085 			bfp->bf_flags &= ~BFF_INTREE;
1086 			bfnew->bf_nlinks = bfp->bf_nlinks;
1087 			bcopy(bfp->bf_links, bfnew->bf_links,
1088 			    bfp->bf_nlinks * sizeof (bfp));
1089 			/* reset the idx value due to removal above */
1090 			(void) avl_find(&bip->bi_fwd, &match, &idx);
1091 		}
1092 	}
1093 
1094 	if (bfnew != NULL) {
1095 		bfnew->bf_links[bfnew->bf_nlinks++] = blp;
1096 		if (drop_ref)
1097 			drop_ref = B_FALSE;
1098 		else
1099 			atomic_inc_uint(&blp->bl_refs);	/* bf_links entry */
1100 
1101 		if (bfnew != bfp) {
1102 			/* local addresses are not subject to table limits */
1103 			avl_insert(&bip->bi_fwd, bfnew, idx);
1104 			bfnew->bf_flags |= (BFF_INTREE | BFF_LOCALADDR);
1105 			atomic_inc_uint(&bfnew->bf_refs);	/* avl entry */
1106 		}
1107 	}
1108 	rw_exit(&bip->bi_rwlock);
1109 
1110 no_new_addr:
1111 	/*
1112 	 * If we found an existing entry and we replaced it with a new one,
1113 	 * then drop the table reference from the old one.  We removed it from
1114 	 * the AVL tree above.
1115 	 */
1116 	if (bfnew != NULL && bfp != NULL && bfnew != bfp)
1117 		fwd_unref(bfp);
1118 
1119 	/* Account for removed entry. */
1120 	if (drop_ref)
1121 		link_unref(blp);
1122 }
1123 
1124 static void
1125 bridge_new_unicst(bridge_link_t *blp)
1126 {
1127 	uint8_t new_mac[ETHERADDRL];
1128 
1129 	mac_unicast_primary_get(blp->bl_mh, new_mac);
1130 	fwd_update_local(blp, blp->bl_local_mac, new_mac);
1131 	bcopy(new_mac, blp->bl_local_mac, ETHERADDRL);
1132 }
1133 
1134 /*
1135  * We must shut down a link prior to freeing it, and doing that requires
1136  * blocking to wait for running MAC threads while holding a reference.  This is
1137  * run from a taskq to accomplish proper link shutdown followed by reference
1138  * drop.
1139  */
1140 static void
1141 link_shutdown(void *arg)
1142 {
1143 	bridge_link_t *blp = arg;
1144 	mac_handle_t mh = blp->bl_mh;
1145 	bridge_inst_t *bip;
1146 	bridge_fwd_t *bfp, *bfnext;
1147 	avl_tree_t fwd_scavenge;
1148 	int i;
1149 
1150 	/*
1151 	 * This link is being destroyed.  Notify TRILL now that it's no longer
1152 	 * possible to send packets.  Data packets may still arrive until TRILL
1153 	 * calls bridge_trill_lnunref.
1154 	 */
1155 	if (blp->bl_trilldata != NULL)
1156 		trill_lndstr_fn(blp->bl_trilldata, blp);
1157 
1158 	if (blp->bl_flags & BLF_PROM_ADDED)
1159 		(void) mac_promisc_remove(blp->bl_mphp);
1160 
1161 	if (blp->bl_flags & BLF_SET_BRIDGE)
1162 		mac_bridge_clear(mh, (mac_handle_t)blp);
1163 
1164 	if (blp->bl_flags & BLF_MARGIN_ADDED) {
1165 		(void) mac_notify_remove(blp->bl_mnh, B_TRUE);
1166 		(void) mac_margin_remove(mh, blp->bl_margin);
1167 	}
1168 
1169 	/* Tell the clients the real link state when we leave */
1170 	mac_link_redo(blp->bl_mh,
1171 	    mac_stat_get(blp->bl_mh, MAC_STAT_LOWLINK_STATE));
1172 
1173 	/* Destroy all of the forwarding entries related to this link */
1174 	avl_create(&fwd_scavenge, fwd_compare, sizeof (bridge_fwd_t),
1175 	    offsetof(bridge_fwd_t, bf_node));
1176 	bip = blp->bl_inst;
1177 	rw_enter(&bip->bi_rwlock, RW_WRITER);
1178 	bfnext = avl_first(&bip->bi_fwd);
1179 	while ((bfp = bfnext) != NULL) {
1180 		bfnext = AVL_NEXT(&bip->bi_fwd, bfp);
1181 		for (i = 0; i < bfp->bf_nlinks; i++) {
1182 			if (bfp->bf_links[i] == blp)
1183 				break;
1184 		}
1185 		if (i >= bfp->bf_nlinks)
1186 			continue;
1187 		if (bfp->bf_nlinks > 1) {
1188 			/* note that this can't be the last reference */
1189 			link_unref(blp);
1190 			bfp->bf_nlinks--;
1191 			for (; i < bfp->bf_nlinks; i++)
1192 				bfp->bf_links[i] = bfp->bf_links[i + 1];
1193 		} else {
1194 			ASSERT(bfp->bf_flags & BFF_INTREE);
1195 			avl_remove(&bip->bi_fwd, bfp);
1196 			bfp->bf_flags &= ~BFF_INTREE;
1197 			avl_add(&fwd_scavenge, bfp);
1198 		}
1199 	}
1200 	rw_exit(&bip->bi_rwlock);
1201 	bfnext = avl_first(&fwd_scavenge);
1202 	while ((bfp = bfnext) != NULL) {
1203 		bfnext = AVL_NEXT(&fwd_scavenge, bfp);
1204 		avl_remove(&fwd_scavenge, bfp);
1205 		fwd_unref(bfp);
1206 	}
1207 	avl_destroy(&fwd_scavenge);
1208 
1209 	if (blp->bl_flags & BLF_CLIENT_OPEN)
1210 		mac_client_close(blp->bl_mch, 0);
1211 
1212 	mac_close(mh);
1213 
1214 	/*
1215 	 * We are now completely removed from the active list, so drop the
1216 	 * reference (see bridge_add_link).
1217 	 */
1218 	link_unref(blp);
1219 }
1220 
1221 static void
1222 shutdown_inst(bridge_inst_t *bip)
1223 {
1224 	bridge_link_t *blp, *blnext;
1225 	bridge_fwd_t *bfp;
1226 
1227 	mutex_enter(&inst_lock);
1228 	if (bip->bi_flags & BIF_SHUTDOWN) {
1229 		mutex_exit(&inst_lock);
1230 		return;
1231 	}
1232 
1233 	/*
1234 	 * Once on the inst_list, the bridge instance must not leave that list
1235 	 * without having the shutdown flag set first.  When the shutdown flag
1236 	 * is set, we own the list reference, so we must drop it before
1237 	 * returning.
1238 	 */
1239 	bip->bi_flags |= BIF_SHUTDOWN;
1240 	mutex_exit(&inst_lock);
1241 
1242 	bip->bi_control = NULL;
1243 
1244 	rw_enter(&bip->bi_rwlock, RW_READER);
1245 	blnext = list_head(&bip->bi_links);
1246 	while ((blp = blnext) != NULL) {
1247 		blnext = list_next(&bip->bi_links, blp);
1248 		if (!(blp->bl_flags & BLF_DELETED)) {
1249 			blp->bl_flags |= BLF_DELETED;
1250 			(void) ddi_taskq_dispatch(bridge_taskq, link_shutdown,
1251 			    blp, DDI_SLEEP);
1252 		}
1253 	}
1254 	while ((bfp = avl_first(&bip->bi_fwd)) != NULL) {
1255 		atomic_inc_uint(&bfp->bf_refs);
1256 		rw_exit(&bip->bi_rwlock);
1257 		fwd_delete(bfp);
1258 		fwd_unref(bfp);
1259 		rw_enter(&bip->bi_rwlock, RW_READER);
1260 	}
1261 	rw_exit(&bip->bi_rwlock);
1262 
1263 	/*
1264 	 * This bridge is being destroyed.  Notify TRILL once all of the
1265 	 * links are all gone.
1266 	 */
1267 	mutex_enter(&inst_lock);
1268 	while (bip->bi_trilldata != NULL && !list_is_empty(&bip->bi_links))
1269 		cv_wait(&bip->bi_linkwait, &inst_lock);
1270 	mutex_exit(&inst_lock);
1271 	if (bip->bi_trilldata != NULL)
1272 		trill_brdstr_fn(bip->bi_trilldata, bip);
1273 
1274 	bridge_unref(bip);
1275 }
1276 
1277 /*
1278  * This is called once by the TRILL module when it starts up.  It just sets the
1279  * global TRILL callback function pointers -- data transmit/receive and bridge
1280  * and link destroy notification.  There's only one TRILL module, so only one
1281  * registration is needed.
1282  *
1283  * TRILL should call this function with NULL pointers before unloading.  It
1284  * must not do so before dropping all references to bridges and links.  We
1285  * assert that this is true on debug builds.
1286  */
1287 void
1288 bridge_trill_register_cb(trill_recv_pkt_t recv_fn, trill_encap_pkt_t encap_fn,
1289     trill_br_dstr_t brdstr_fn, trill_ln_dstr_t lndstr_fn)
1290 {
1291 #ifdef DEBUG
1292 	if (recv_fn == NULL && trill_recv_fn != NULL) {
1293 		bridge_inst_t *bip;
1294 		bridge_link_t *blp;
1295 
1296 		mutex_enter(&inst_lock);
1297 		for (bip = list_head(&inst_list); bip != NULL;
1298 		    bip = list_next(&inst_list, bip)) {
1299 			ASSERT(bip->bi_trilldata == NULL);
1300 			rw_enter(&bip->bi_rwlock, RW_READER);
1301 			for (blp = list_head(&bip->bi_links); blp != NULL;
1302 			    blp = list_next(&bip->bi_links, blp)) {
1303 				ASSERT(blp->bl_trilldata == NULL);
1304 			}
1305 			rw_exit(&bip->bi_rwlock);
1306 		}
1307 		mutex_exit(&inst_lock);
1308 	}
1309 #endif
1310 	trill_recv_fn = recv_fn;
1311 	trill_encap_fn = encap_fn;
1312 	trill_brdstr_fn = brdstr_fn;
1313 	trill_lndstr_fn = lndstr_fn;
1314 }
1315 
1316 /*
1317  * This registers the TRILL instance pointer with a bridge.  Before this
1318  * pointer is set, the forwarding, TRILL receive, and bridge destructor
1319  * functions won't be called.
1320  *
1321  * TRILL holds a reference on a bridge with this call.  It must free the
1322  * reference by calling the unregister function below.
1323  */
1324 bridge_inst_t *
1325 bridge_trill_brref(const char *bname, void *ptr)
1326 {
1327 	char bridge[MAXLINKNAMELEN];
1328 	bridge_inst_t *bip;
1329 
1330 	(void) snprintf(bridge, MAXLINKNAMELEN, "%s0", bname);
1331 	bip = bridge_find_name(bridge);
1332 	if (bip != NULL) {
1333 		ASSERT(bip->bi_trilldata == NULL && ptr != NULL);
1334 		bip->bi_trilldata = ptr;
1335 	}
1336 	return (bip);
1337 }
1338 
1339 void
1340 bridge_trill_brunref(bridge_inst_t *bip)
1341 {
1342 	ASSERT(bip->bi_trilldata != NULL);
1343 	bip->bi_trilldata = NULL;
1344 	bridge_unref(bip);
1345 }
1346 
1347 /*
1348  * TRILL calls this function when referencing a particular link on a bridge.
1349  *
1350  * It holds a reference on the link, so TRILL must clear out the reference when
1351  * it's done with the link (on unbinding).
1352  */
1353 bridge_link_t *
1354 bridge_trill_lnref(bridge_inst_t *bip, datalink_id_t linkid, void *ptr)
1355 {
1356 	bridge_link_t *blp;
1357 
1358 	ASSERT(ptr != NULL);
1359 	rw_enter(&bip->bi_rwlock, RW_READER);
1360 	for (blp = list_head(&bip->bi_links); blp != NULL;
1361 	    blp = list_next(&bip->bi_links, blp)) {
1362 		if (!(blp->bl_flags & BLF_DELETED) &&
1363 		    blp->bl_linkid == linkid && blp->bl_trilldata == NULL) {
1364 			blp->bl_trilldata = ptr;
1365 			blp->bl_flags &= ~BLF_TRILLACTIVE;
1366 			(void) memset(blp->bl_afs, 0, sizeof (blp->bl_afs));
1367 			atomic_inc_uint(&blp->bl_refs);
1368 			break;
1369 		}
1370 	}
1371 	rw_exit(&bip->bi_rwlock);
1372 	return (blp);
1373 }
1374 
1375 void
1376 bridge_trill_lnunref(bridge_link_t *blp)
1377 {
1378 	mutex_enter(&blp->bl_trilllock);
1379 	ASSERT(blp->bl_trilldata != NULL);
1380 	blp->bl_trilldata = NULL;
1381 	blp->bl_flags &= ~BLF_TRILLACTIVE;
1382 	while (blp->bl_trillthreads > 0)
1383 		cv_wait(&blp->bl_trillwait, &blp->bl_trilllock);
1384 	mutex_exit(&blp->bl_trilllock);
1385 	(void) memset(blp->bl_afs, 0xff, sizeof (blp->bl_afs));
1386 	link_unref(blp);
1387 }
1388 
1389 /*
1390  * This periodic timer performs three functions:
1391  *  1. It scans the list of learned forwarding entries, and removes ones that
1392  *     haven't been heard from in a while.  The time limit is backed down if
1393  *     we're above the configured table limit.
1394  *  2. It walks the links and decays away the bl_learns counter.
1395  *  3. It scans the observability node entries looking for ones that can be
1396  *     freed up.
1397  */
1398 /* ARGSUSED */
1399 static void
1400 bridge_timer(void *arg)
1401 {
1402 	bridge_inst_t *bip;
1403 	bridge_fwd_t *bfp, *bfnext;
1404 	bridge_mac_t *bmp, *bmnext;
1405 	bridge_link_t *blp;
1406 	int err;
1407 	datalink_id_t tmpid;
1408 	avl_tree_t fwd_scavenge;
1409 	clock_t age_limit;
1410 	uint32_t ldecay;
1411 
1412 	avl_create(&fwd_scavenge, fwd_compare, sizeof (bridge_fwd_t),
1413 	    offsetof(bridge_fwd_t, bf_node));
1414 	mutex_enter(&inst_lock);
1415 	for (bip = list_head(&inst_list); bip != NULL;
1416 	    bip = list_next(&inst_list, bip)) {
1417 		if (bip->bi_flags & BIF_SHUTDOWN)
1418 			continue;
1419 		rw_enter(&bip->bi_rwlock, RW_WRITER);
1420 		/* compute scaled maximum age based on table limit */
1421 		if (avl_numnodes(&bip->bi_fwd) > bip->bi_tablemax)
1422 			bip->bi_tshift++;
1423 		else
1424 			bip->bi_tshift = 0;
1425 		if ((age_limit = bridge_fwd_age >> bip->bi_tshift) == 0) {
1426 			if (bip->bi_tshift != 0)
1427 				bip->bi_tshift--;
1428 			age_limit = 1;
1429 		}
1430 		bfnext = avl_first(&bip->bi_fwd);
1431 		while ((bfp = bfnext) != NULL) {
1432 			bfnext = AVL_NEXT(&bip->bi_fwd, bfp);
1433 			if (!(bfp->bf_flags & BFF_LOCALADDR) &&
1434 			    (ddi_get_lbolt() - bfp->bf_lastheard) > age_limit) {
1435 				ASSERT(bfp->bf_flags & BFF_INTREE);
1436 				avl_remove(&bip->bi_fwd, bfp);
1437 				bfp->bf_flags &= ~BFF_INTREE;
1438 				avl_add(&fwd_scavenge, bfp);
1439 			}
1440 		}
1441 		for (blp = list_head(&bip->bi_links); blp != NULL;
1442 		    blp = list_next(&bip->bi_links, blp)) {
1443 			ldecay = mac_get_ldecay(blp->bl_mh);
1444 			if (ldecay >= blp->bl_learns)
1445 				blp->bl_learns = 0;
1446 			else
1447 				atomic_add_int(&blp->bl_learns, -(int)ldecay);
1448 		}
1449 		rw_exit(&bip->bi_rwlock);
1450 		bfnext = avl_first(&fwd_scavenge);
1451 		while ((bfp = bfnext) != NULL) {
1452 			bfnext = AVL_NEXT(&fwd_scavenge, bfp);
1453 			avl_remove(&fwd_scavenge, bfp);
1454 			KIINCR(bki_expire);
1455 			fwd_unref(bfp);	/* drop tree reference */
1456 		}
1457 	}
1458 	mutex_exit(&inst_lock);
1459 	avl_destroy(&fwd_scavenge);
1460 
1461 	/*
1462 	 * Scan the bridge_mac_t entries and try to free up the ones that are
1463 	 * no longer active.  This must be done by polling, as neither DLS nor
1464 	 * MAC provides a driver any sort of positive control over clients.
1465 	 */
1466 	rw_enter(&bmac_rwlock, RW_WRITER);
1467 	bmnext = list_head(&bmac_list);
1468 	while ((bmp = bmnext) != NULL) {
1469 		bmnext = list_next(&bmac_list, bmp);
1470 
1471 		/* ignore active bridges */
1472 		if (bmp->bm_inst != NULL)
1473 			continue;
1474 
1475 		if (bmp->bm_flags & BMF_DLS) {
1476 			err = dls_devnet_destroy(bmp->bm_mh, &tmpid, B_FALSE);
1477 			ASSERT(err == 0 || err == EBUSY);
1478 			if (err == 0)
1479 				bmp->bm_flags &= ~BMF_DLS;
1480 		}
1481 
1482 		if (!(bmp->bm_flags & BMF_DLS)) {
1483 			err = mac_unregister(bmp->bm_mh);
1484 			ASSERT(err == 0 || err == EBUSY);
1485 			if (err == 0) {
1486 				list_remove(&bmac_list, bmp);
1487 				kmem_free(bmp, sizeof (*bmp));
1488 			}
1489 		}
1490 	}
1491 	if (list_is_empty(&bmac_list)) {
1492 		bridge_timerid = 0;
1493 	} else {
1494 		bridge_timerid = timeout(bridge_timer, NULL,
1495 		    bridge_scan_interval);
1496 	}
1497 	rw_exit(&bmac_rwlock);
1498 }
1499 
1500 static int
1501 bridge_open(queue_t *rq, dev_t *devp, int oflag, int sflag, cred_t *credp)
1502 {
1503 	bridge_stream_t	*bsp;
1504 
1505 	if (rq->q_ptr != NULL)
1506 		return (0);
1507 
1508 	if (sflag & MODOPEN)
1509 		return (EINVAL);
1510 
1511 	/*
1512 	 * Check the minor node number being opened.  This tells us which
1513 	 * bridge instance the user wants.
1514 	 */
1515 	if (getminor(*devp) != 0) {
1516 		/*
1517 		 * This is a regular DLPI stream for snoop or the like.
1518 		 * Redirect it through DLD.
1519 		 */
1520 		rq->q_qinfo = &bridge_dld_rinit;
1521 		OTHERQ(rq)->q_qinfo = &bridge_dld_winit;
1522 		return (dld_open(rq, devp, oflag, sflag, credp));
1523 	} else {
1524 		/*
1525 		 * Allocate the bridge control stream structure.
1526 		 */
1527 		if ((bsp = stream_alloc()) == NULL)
1528 			return (ENOSR);
1529 		rq->q_ptr = WR(rq)->q_ptr = (caddr_t)bsp;
1530 		bsp->bs_wq = WR(rq);
1531 		*devp = makedevice(getmajor(*devp), bsp->bs_minor);
1532 		qprocson(rq);
1533 		return (0);
1534 	}
1535 }
1536 
1537 /*
1538  * This is used only for bridge control streams.  DLPI goes through dld
1539  * instead.
1540  */
1541 static int
1542 bridge_close(queue_t *rq)
1543 {
1544 	bridge_stream_t	*bsp = rq->q_ptr;
1545 	bridge_inst_t *bip;
1546 
1547 	/*
1548 	 * Wait for any stray taskq (add/delete link) entries related to this
1549 	 * stream to leave the system.
1550 	 */
1551 	mutex_enter(&stream_ref_lock);
1552 	while (bsp->bs_taskq_cnt != 0)
1553 		cv_wait(&stream_ref_cv, &stream_ref_lock);
1554 	mutex_exit(&stream_ref_lock);
1555 
1556 	qprocsoff(rq);
1557 	if ((bip = bsp->bs_inst) != NULL)
1558 		shutdown_inst(bip);
1559 	rq->q_ptr = WR(rq)->q_ptr = NULL;
1560 	stream_free(bsp);
1561 	if (bip != NULL)
1562 		bridge_unref(bip);
1563 
1564 	return (0);
1565 }
1566 
1567 static void
1568 bridge_learn(bridge_link_t *blp, const uint8_t *saddr, uint16_t ingress_nick,
1569     uint16_t vlanid)
1570 {
1571 	bridge_inst_t *bip = blp->bl_inst;
1572 	bridge_fwd_t *bfp, *bfpnew;
1573 	int i;
1574 	boolean_t replaced = B_FALSE;
1575 
1576 	/* Ignore multi-destination address used as source; it's nonsense. */
1577 	if (*saddr & 1)
1578 		return;
1579 
1580 	/*
1581 	 * If the source is known, then check whether it belongs on this link.
1582 	 * If not, and this isn't a fixed local address, then we've detected a
1583 	 * move.  If it's not known, learn it.
1584 	 */
1585 	if ((bfp = fwd_find(bip, saddr, vlanid)) != NULL) {
1586 		/*
1587 		 * If the packet has a fixed local source address, then there's
1588 		 * nothing we can learn.  We must quit.  If this was a received
1589 		 * packet, then the sender has stolen our address, but there's
1590 		 * nothing we can do.  If it's a transmitted packet, then
1591 		 * that's the normal case.
1592 		 */
1593 		if (bfp->bf_flags & BFF_LOCALADDR) {
1594 			fwd_unref(bfp);
1595 			return;
1596 		}
1597 
1598 		/*
1599 		 * Check if the link (and TRILL sender, if any) being used is
1600 		 * among the ones registered for this address.  If so, then
1601 		 * this is information that we already know.
1602 		 */
1603 		if (bfp->bf_trill_nick == ingress_nick) {
1604 			for (i = 0; i < bfp->bf_nlinks; i++) {
1605 				if (bfp->bf_links[i] == blp) {
1606 					bfp->bf_lastheard = ddi_get_lbolt();
1607 					fwd_unref(bfp);
1608 					return;
1609 				}
1610 			}
1611 		}
1612 	}
1613 
1614 	/*
1615 	 * Note that we intentionally "unlearn" things that appear to be under
1616 	 * attack on this link.  The forwarding cache is a negative thing for
1617 	 * security -- it disables reachability as a performance optimization
1618 	 * -- so leaving out entries optimizes for success and defends against
1619 	 * the attack.  Thus, the bare increment without a check in the delete
1620 	 * code above is right.  (And it's ok if we skid over the limit a
1621 	 * little, so there's no syncronization needed on the test.)
1622 	 */
1623 	if (blp->bl_learns >= mac_get_llimit(blp->bl_mh)) {
1624 		if (bfp != NULL) {
1625 			if (bfp->bf_vcnt == 0)
1626 				fwd_delete(bfp);
1627 			fwd_unref(bfp);
1628 		}
1629 		return;
1630 	}
1631 
1632 	atomic_inc_uint(&blp->bl_learns);
1633 
1634 	if ((bfpnew = fwd_alloc(saddr, 1, ingress_nick)) == NULL) {
1635 		if (bfp != NULL)
1636 			fwd_unref(bfp);
1637 		return;
1638 	}
1639 	KIINCR(bki_count);
1640 
1641 	if (bfp != NULL) {
1642 		/*
1643 		 * If this is a new destination for the same VLAN, then delete
1644 		 * so that we can update.  If it's a different VLAN, then we're
1645 		 * not going to delete the original.  Split off instead into an
1646 		 * IVL entry.
1647 		 */
1648 		if (bfp->bf_vlanid == vlanid) {
1649 			/* save the count of IVL duplicates */
1650 			bfpnew->bf_vcnt = bfp->bf_vcnt;
1651 
1652 			/* entry deletes count as learning events */
1653 			atomic_inc_uint(&blp->bl_learns);
1654 
1655 			/* destroy and create anew; node moved */
1656 			fwd_delete(bfp);
1657 			replaced = B_TRUE;
1658 			KIINCR(bki_moved);
1659 		} else {
1660 			bfp->bf_vcnt++;
1661 			bfpnew->bf_flags |= BFF_VLANLOCAL;
1662 		}
1663 		fwd_unref(bfp);
1664 	}
1665 	bfpnew->bf_links[0] = blp;
1666 	bfpnew->bf_nlinks = 1;
1667 	atomic_inc_uint(&blp->bl_refs);	/* bf_links entry */
1668 	if (!fwd_insert(bip, bfpnew))
1669 		fwd_free(bfpnew);
1670 	else if (!replaced)
1671 		KIINCR(bki_source);
1672 }
1673 
1674 /*
1675  * Process the VLAN headers for output on a given link.  There are several
1676  * cases (noting that we don't map VLANs):
1677  *   1. The input packet is good as it is; either
1678  *	a. It has no tag, and output has same PVID
1679  *	b. It has a non-zero priority-only tag for PVID, and b_band is same
1680  *	c. It has a tag with VLAN different from PVID, and b_band is same
1681  *   2. The tag must change: non-zero b_band is different from tag priority
1682  *   3. The packet has a tag and should not (VLAN same as PVID, b_band zero)
1683  *   4. The packet has no tag and needs one:
1684  *      a. VLAN ID same as PVID, but b_band is non-zero
1685  *      b. VLAN ID different from PVID
1686  * We exclude case 1 first, then modify the packet.  Note that output packets
1687  * get a priority set by the mblk, not by the header, because QoS in bridging
1688  * requires priority recalculation at each node.
1689  *
1690  * The passed-in tci is the "impossible" value 0xFFFF when no tag is present.
1691  */
1692 static mblk_t *
1693 reform_vlan_header(mblk_t *mp, uint16_t vlanid, uint16_t tci, uint16_t pvid)
1694 {
1695 	boolean_t source_has_tag = (tci != 0xFFFF);
1696 	mblk_t *mpcopy;
1697 	size_t mlen, minlen;
1698 	struct ether_vlan_header *evh;
1699 	int pri;
1700 
1701 	/* This helps centralize error handling in the caller. */
1702 	if (mp == NULL)
1703 		return (mp);
1704 
1705 	/* No forwarded packet can have hardware checksum enabled */
1706 	DB_CKSUMFLAGS(mp) = 0;
1707 
1708 	/* Get the no-modification cases out of the way first */
1709 	if (!source_has_tag && vlanid == pvid)		/* 1a */
1710 		return (mp);
1711 
1712 	pri = VLAN_PRI(tci);
1713 	if (source_has_tag && mp->b_band == pri) {
1714 		if (vlanid != pvid)			/* 1c */
1715 			return (mp);
1716 		if (pri != 0 && VLAN_ID(tci) == 0)	/* 1b */
1717 			return (mp);
1718 	}
1719 
1720 	/*
1721 	 * We now know that we must modify the packet.  Prepare for that.  Note
1722 	 * that if a tag is present, the caller has already done a pullup for
1723 	 * the VLAN header, so we're good to go.
1724 	 */
1725 	if (MBLKL(mp) < sizeof (struct ether_header)) {
1726 		mpcopy = msgpullup(mp, sizeof (struct ether_header));
1727 		if (mpcopy == NULL) {
1728 			freemsg(mp);
1729 			return (NULL);
1730 		}
1731 		mp = mpcopy;
1732 	}
1733 	if (DB_REF(mp) > 1 || !IS_P2ALIGNED(mp->b_rptr, sizeof (uint16_t)) ||
1734 	    (!source_has_tag && MBLKTAIL(mp) < VLAN_INCR)) {
1735 		minlen = mlen = MBLKL(mp);
1736 		if (!source_has_tag)
1737 			minlen += VLAN_INCR;
1738 		ASSERT(minlen >= sizeof (struct ether_vlan_header));
1739 		/*
1740 		 * We're willing to copy some data to avoid fragmentation, but
1741 		 * not a lot.
1742 		 */
1743 		if (minlen > 256)
1744 			minlen = sizeof (struct ether_vlan_header);
1745 		mpcopy = allocb(minlen, BPRI_MED);
1746 		if (mpcopy == NULL) {
1747 			freemsg(mp);
1748 			return (NULL);
1749 		}
1750 		if (mlen <= minlen) {
1751 			/* We toss the first mblk when we can. */
1752 			bcopy(mp->b_rptr, mpcopy->b_rptr, mlen);
1753 			mpcopy->b_wptr += mlen;
1754 			mpcopy->b_cont = mp->b_cont;
1755 			freeb(mp);
1756 		} else {
1757 			/* If not, then just copy what we need */
1758 			if (!source_has_tag)
1759 				minlen = sizeof (struct ether_header);
1760 			bcopy(mp->b_rptr, mpcopy->b_rptr, minlen);
1761 			mpcopy->b_wptr += minlen;
1762 			mpcopy->b_cont = mp;
1763 			mp->b_rptr += minlen;
1764 		}
1765 		mp = mpcopy;
1766 	}
1767 
1768 	/* LINTED: pointer alignment */
1769 	evh = (struct ether_vlan_header *)mp->b_rptr;
1770 	if (source_has_tag) {
1771 		if (mp->b_band == 0 && vlanid == pvid) {	/* 3 */
1772 			evh->ether_tpid = evh->ether_type;
1773 			mlen = MBLKL(mp);
1774 			if (mlen > sizeof (struct ether_vlan_header))
1775 				ovbcopy(mp->b_rptr +
1776 				    sizeof (struct ether_vlan_header),
1777 				    mp->b_rptr + sizeof (struct ether_header),
1778 				    mlen - sizeof (struct ether_vlan_header));
1779 			mp->b_wptr -= VLAN_INCR;
1780 		} else {					/* 2 */
1781 			if (vlanid == pvid)
1782 				vlanid = VLAN_ID_NONE;
1783 			tci = VLAN_TCI(mp->b_band, ETHER_CFI, vlanid);
1784 			evh->ether_tci = htons(tci);
1785 		}
1786 	} else {
1787 		/* case 4: no header present, but one is needed */
1788 		mlen = MBLKL(mp);
1789 		if (mlen > sizeof (struct ether_header))
1790 			ovbcopy(mp->b_rptr + sizeof (struct ether_header),
1791 			    mp->b_rptr + sizeof (struct ether_vlan_header),
1792 			    mlen - sizeof (struct ether_header));
1793 		mp->b_wptr += VLAN_INCR;
1794 		ASSERT(mp->b_wptr <= DB_LIM(mp));
1795 		if (vlanid == pvid)
1796 			vlanid = VLAN_ID_NONE;
1797 		tci = VLAN_TCI(mp->b_band, ETHER_CFI, vlanid);
1798 		evh->ether_type = evh->ether_tpid;
1799 		evh->ether_tpid = htons(ETHERTYPE_VLAN);
1800 		evh->ether_tci = htons(tci);
1801 	}
1802 	return (mp);
1803 }
1804 
1805 /* Record VLAN information and strip header if requested . */
1806 static void
1807 update_header(mblk_t *mp, mac_header_info_t *hdr_info, boolean_t striphdr)
1808 {
1809 	if (hdr_info->mhi_bindsap == ETHERTYPE_VLAN) {
1810 		struct ether_vlan_header *evhp;
1811 		uint16_t ether_type;
1812 
1813 		/* LINTED: alignment */
1814 		evhp = (struct ether_vlan_header *)mp->b_rptr;
1815 		hdr_info->mhi_istagged = B_TRUE;
1816 		hdr_info->mhi_tci = ntohs(evhp->ether_tci);
1817 		if (striphdr) {
1818 			/*
1819 			 * For VLAN tagged frames update the ether_type
1820 			 * in hdr_info before stripping the header.
1821 			 */
1822 			ether_type = ntohs(evhp->ether_type);
1823 			hdr_info->mhi_origsap = ether_type;
1824 			hdr_info->mhi_bindsap = (ether_type > ETHERMTU) ?
1825 			    ether_type : DLS_SAP_LLC;
1826 			mp->b_rptr = (uchar_t *)(evhp + 1);
1827 		}
1828 	} else {
1829 		hdr_info->mhi_istagged = B_FALSE;
1830 		hdr_info->mhi_tci = VLAN_ID_NONE;
1831 		if (striphdr)
1832 			mp->b_rptr += sizeof (struct ether_header);
1833 	}
1834 }
1835 
1836 /*
1837  * Return B_TRUE if we're allowed to send on this link with the given VLAN ID.
1838  */
1839 static boolean_t
1840 bridge_can_send(bridge_link_t *blp, uint16_t vlanid)
1841 {
1842 	ASSERT(vlanid != VLAN_ID_NONE);
1843 	if (blp->bl_flags & BLF_DELETED)
1844 		return (B_FALSE);
1845 	if (blp->bl_trilldata == NULL && blp->bl_state != BLS_FORWARDING)
1846 		return (B_FALSE);
1847 	return (BRIDGE_VLAN_ISSET(blp, vlanid) && BRIDGE_AF_ISSET(blp, vlanid));
1848 }
1849 
1850 /*
1851  * This function scans the bridge forwarding tables in order to forward a given
1852  * packet.  If the packet either doesn't need forwarding (the current link is
1853  * correct) or the current link needs a copy as well, then the packet is
1854  * returned to the caller.
1855  *
1856  * If a packet has been decapsulated from TRILL, then it must *NOT* reenter a
1857  * TRILL tunnel.  If the destination points there, then drop instead.
1858  */
1859 static mblk_t *
1860 bridge_forward(bridge_link_t *blp, mac_header_info_t *hdr_info, mblk_t *mp,
1861     uint16_t vlanid, uint16_t tci, boolean_t from_trill, boolean_t is_xmit)
1862 {
1863 	mblk_t *mpsend, *mpcopy;
1864 	bridge_inst_t *bip = blp->bl_inst;
1865 	bridge_link_t *blpsend, *blpnext;
1866 	bridge_fwd_t *bfp;
1867 	uint_t i;
1868 	boolean_t selfseen = B_FALSE;
1869 	void *tdp;
1870 	const uint8_t *daddr = hdr_info->mhi_daddr;
1871 
1872 	/*
1873 	 * Check for the IEEE "reserved" multicast addresses.  Messages sent to
1874 	 * these addresses are used for link-local control (STP and pause), and
1875 	 * are never forwarded or redirected.
1876 	 */
1877 	if (daddr[0] == 1 && daddr[1] == 0x80 && daddr[2] == 0xc2 &&
1878 	    daddr[3] == 0 && daddr[4] == 0 && (daddr[5] & 0xf0) == 0) {
1879 		if (from_trill) {
1880 			freemsg(mp);
1881 			mp = NULL;
1882 		}
1883 		return (mp);
1884 	}
1885 
1886 	if ((bfp = fwd_find(bip, daddr, vlanid)) != NULL) {
1887 
1888 		/*
1889 		 * If trill indicates a destination for this node, then it's
1890 		 * clearly not intended for local delivery.  We must tell TRILL
1891 		 * to encapsulate, as long as we didn't just decapsulate it.
1892 		 */
1893 		if (bfp->bf_trill_nick != RBRIDGE_NICKNAME_NONE) {
1894 			/*
1895 			 * Error case: can't reencapsulate if the protocols are
1896 			 * working correctly.
1897 			 */
1898 			if (from_trill) {
1899 				freemsg(mp);
1900 				return (NULL);
1901 			}
1902 			mutex_enter(&blp->bl_trilllock);
1903 			if ((tdp = blp->bl_trilldata) != NULL) {
1904 				blp->bl_trillthreads++;
1905 				mutex_exit(&blp->bl_trilllock);
1906 				update_header(mp, hdr_info, B_FALSE);
1907 				if (is_xmit)
1908 					mp = mac_fix_cksum(mp);
1909 				/* all trill data frames have Inner.VLAN */
1910 				mp = reform_vlan_header(mp, vlanid, tci, 0);
1911 				if (mp == NULL) {
1912 					KIINCR(bki_drops);
1913 					fwd_unref(bfp);
1914 					return (NULL);
1915 				}
1916 				trill_encap_fn(tdp, blp, hdr_info, mp,
1917 				    bfp->bf_trill_nick);
1918 				mutex_enter(&blp->bl_trilllock);
1919 				if (--blp->bl_trillthreads == 0 &&
1920 				    blp->bl_trilldata == NULL)
1921 					cv_broadcast(&blp->bl_trillwait);
1922 			}
1923 			mutex_exit(&blp->bl_trilllock);
1924 
1925 			/* if TRILL has been disabled, then kill this stray */
1926 			if (tdp == NULL) {
1927 				freemsg(mp);
1928 				fwd_delete(bfp);
1929 			}
1930 			fwd_unref(bfp);
1931 			return (NULL);
1932 		}
1933 
1934 		/* find first link we can send on */
1935 		for (i = 0; i < bfp->bf_nlinks; i++) {
1936 			blpsend = bfp->bf_links[i];
1937 			if (blpsend == blp)
1938 				selfseen = B_TRUE;
1939 			else if (bridge_can_send(blpsend, vlanid))
1940 				break;
1941 		}
1942 
1943 		while (i < bfp->bf_nlinks) {
1944 			blpsend = bfp->bf_links[i];
1945 			for (i++; i < bfp->bf_nlinks; i++) {
1946 				blpnext = bfp->bf_links[i];
1947 				if (blpnext == blp)
1948 					selfseen = B_TRUE;
1949 				else if (bridge_can_send(blpnext, vlanid))
1950 					break;
1951 			}
1952 			if (i == bfp->bf_nlinks && !selfseen) {
1953 				mpsend = mp;
1954 				mp = NULL;
1955 			} else {
1956 				mpsend = copymsg(mp);
1957 			}
1958 
1959 			if (!from_trill && is_xmit)
1960 				mpsend = mac_fix_cksum(mpsend);
1961 
1962 			mpsend = reform_vlan_header(mpsend, vlanid, tci,
1963 			    blpsend->bl_pvid);
1964 			if (mpsend == NULL) {
1965 				KIINCR(bki_drops);
1966 				continue;
1967 			}
1968 
1969 			KIINCR(bki_forwards);
1970 			/*
1971 			 * No need to bump up the link reference count, as
1972 			 * the forwarding entry itself holds a reference to
1973 			 * the link.
1974 			 */
1975 			if (bfp->bf_flags & BFF_LOCALADDR) {
1976 				mac_rx_common(blpsend->bl_mh, NULL, mpsend);
1977 			} else {
1978 				KLPINCR(blpsend, bkl_xmit);
1979 				MAC_RING_TX(blpsend->bl_mh, NULL, mpsend,
1980 				    mpsend);
1981 				freemsg(mpsend);
1982 			}
1983 		}
1984 		/*
1985 		 * Handle a special case: if we're transmitting to the original
1986 		 * link, then check whether the localaddr flag is set.  If it
1987 		 * is, then receive instead.  This doesn't happen with ordinary
1988 		 * bridging, but does happen often with TRILL decapsulation.
1989 		 */
1990 		if (mp != NULL && is_xmit && (bfp->bf_flags & BFF_LOCALADDR)) {
1991 			mac_rx_common(blp->bl_mh, NULL, mp);
1992 			mp = NULL;
1993 		}
1994 		fwd_unref(bfp);
1995 	} else {
1996 		/*
1997 		 * TRILL has two cases to handle.  If the packet is off the
1998 		 * wire (not from TRILL), then we need to send up into the
1999 		 * TRILL module to have the distribution tree computed.  If the
2000 		 * packet is from TRILL (decapsulated), then we're part of the
2001 		 * distribution tree, and we need to copy the packet on member
2002 		 * interfaces.
2003 		 *
2004 		 * Thus, the from TRILL case is identical to the STP case.
2005 		 */
2006 		if (!from_trill && blp->bl_trilldata != NULL) {
2007 			mutex_enter(&blp->bl_trilllock);
2008 			if ((tdp = blp->bl_trilldata) != NULL) {
2009 				blp->bl_trillthreads++;
2010 				mutex_exit(&blp->bl_trilllock);
2011 				if ((mpsend = copymsg(mp)) != NULL) {
2012 					update_header(mpsend,
2013 					    hdr_info, B_FALSE);
2014 					/*
2015 					 * all trill data frames have
2016 					 * Inner.VLAN
2017 					 */
2018 					mpsend = reform_vlan_header(mpsend,
2019 					    vlanid, tci, 0);
2020 					if (mpsend == NULL) {
2021 						KIINCR(bki_drops);
2022 					} else {
2023 						trill_encap_fn(tdp, blp,
2024 						    hdr_info, mpsend,
2025 						    RBRIDGE_NICKNAME_NONE);
2026 					}
2027 				}
2028 				mutex_enter(&blp->bl_trilllock);
2029 				if (--blp->bl_trillthreads == 0 &&
2030 				    blp->bl_trilldata == NULL)
2031 					cv_broadcast(&blp->bl_trillwait);
2032 			}
2033 			mutex_exit(&blp->bl_trilllock);
2034 		}
2035 
2036 		/*
2037 		 * This is an unknown destination, so flood.
2038 		 */
2039 		rw_enter(&bip->bi_rwlock, RW_READER);
2040 		for (blpnext = list_head(&bip->bi_links); blpnext != NULL;
2041 		    blpnext = list_next(&bip->bi_links, blpnext)) {
2042 			if (blpnext == blp)
2043 				selfseen = B_TRUE;
2044 			else if (bridge_can_send(blpnext, vlanid))
2045 				break;
2046 		}
2047 		if (blpnext != NULL)
2048 			atomic_inc_uint(&blpnext->bl_refs);
2049 		rw_exit(&bip->bi_rwlock);
2050 		while ((blpsend = blpnext) != NULL) {
2051 			rw_enter(&bip->bi_rwlock, RW_READER);
2052 			for (blpnext = list_next(&bip->bi_links, blpsend);
2053 			    blpnext != NULL;
2054 			    blpnext = list_next(&bip->bi_links, blpnext)) {
2055 				if (blpnext == blp)
2056 					selfseen = B_TRUE;
2057 				else if (bridge_can_send(blpnext, vlanid))
2058 					break;
2059 			}
2060 			if (blpnext != NULL)
2061 				atomic_inc_uint(&blpnext->bl_refs);
2062 			rw_exit(&bip->bi_rwlock);
2063 			if (blpnext == NULL && !selfseen) {
2064 				mpsend = mp;
2065 				mp = NULL;
2066 			} else {
2067 				mpsend = copymsg(mp);
2068 			}
2069 
2070 			if (!from_trill && is_xmit)
2071 				mpsend = mac_fix_cksum(mpsend);
2072 
2073 			mpsend = reform_vlan_header(mpsend, vlanid, tci,
2074 			    blpsend->bl_pvid);
2075 			if (mpsend == NULL) {
2076 				KIINCR(bki_drops);
2077 				continue;
2078 			}
2079 
2080 			if (hdr_info->mhi_dsttype == MAC_ADDRTYPE_UNICAST)
2081 				KIINCR(bki_unknown);
2082 			else
2083 				KIINCR(bki_mbcast);
2084 			KLPINCR(blpsend, bkl_xmit);
2085 			if ((mpcopy = copymsg(mpsend)) != NULL)
2086 				mac_rx_common(blpsend->bl_mh, NULL, mpcopy);
2087 			MAC_RING_TX(blpsend->bl_mh, NULL, mpsend, mpsend);
2088 			freemsg(mpsend);
2089 			link_unref(blpsend);
2090 		}
2091 	}
2092 
2093 	/*
2094 	 * At this point, if mp is non-NULL, it means that the caller needs to
2095 	 * continue on the selected link.
2096 	 */
2097 	return (mp);
2098 }
2099 
2100 /*
2101  * Extract and validate the VLAN information for a given packet.  This checks
2102  * conformance with the rules for use of the PVID on the link, and for the
2103  * allowed (configured) VLAN set.
2104  *
2105  * Returns B_TRUE if the packet passes, B_FALSE if it fails.
2106  */
2107 static boolean_t
2108 bridge_get_vlan(bridge_link_t *blp, mac_header_info_t *hdr_info, mblk_t *mp,
2109     uint16_t *vlanidp, uint16_t *tcip)
2110 {
2111 	uint16_t tci, vlanid;
2112 
2113 	if (hdr_info->mhi_bindsap == ETHERTYPE_VLAN) {
2114 		ptrdiff_t tpos = offsetof(struct ether_vlan_header, ether_tci);
2115 		ptrdiff_t mlen;
2116 
2117 		/*
2118 		 * Extract the VLAN ID information, regardless of alignment,
2119 		 * and without a pullup.  This isn't attractive, but we do this
2120 		 * to avoid having to deal with the pointers stashed in
2121 		 * hdr_info moving around or having the caller deal with a new
2122 		 * mblk_t pointer.
2123 		 */
2124 		while (mp != NULL) {
2125 			mlen = MBLKL(mp);
2126 			if (mlen > tpos && mlen > 0)
2127 				break;
2128 			tpos -= mlen;
2129 			mp = mp->b_cont;
2130 		}
2131 		if (mp == NULL)
2132 			return (B_FALSE);
2133 		tci = mp->b_rptr[tpos] << 8;
2134 		if (++tpos >= mlen) {
2135 			do {
2136 				mp = mp->b_cont;
2137 			} while (mp != NULL && MBLKL(mp) == 0);
2138 			if (mp == NULL)
2139 				return (B_FALSE);
2140 			tpos = 0;
2141 		}
2142 		tci |= mp->b_rptr[tpos];
2143 
2144 		vlanid = VLAN_ID(tci);
2145 		if (VLAN_CFI(tci) != ETHER_CFI || vlanid > VLAN_ID_MAX)
2146 			return (B_FALSE);
2147 		if (vlanid == VLAN_ID_NONE || vlanid == blp->bl_pvid)
2148 			goto input_no_vlan;
2149 		if (!BRIDGE_VLAN_ISSET(blp, vlanid))
2150 			return (B_FALSE);
2151 	} else {
2152 		tci = 0xFFFF;
2153 input_no_vlan:
2154 		/*
2155 		 * If PVID is set to zero, then untagged traffic is not
2156 		 * supported here.  Do not learn or forward.
2157 		 */
2158 		if ((vlanid = blp->bl_pvid) == VLAN_ID_NONE)
2159 			return (B_FALSE);
2160 	}
2161 
2162 	*tcip = tci;
2163 	*vlanidp = vlanid;
2164 	return (B_TRUE);
2165 }
2166 
2167 /*
2168  * Handle MAC notifications.
2169  */
2170 static void
2171 bridge_notify_cb(void *arg, mac_notify_type_t note_type)
2172 {
2173 	bridge_link_t *blp = arg;
2174 
2175 	switch (note_type) {
2176 	case MAC_NOTE_UNICST:
2177 		bridge_new_unicst(blp);
2178 		break;
2179 
2180 	case MAC_NOTE_SDU_SIZE: {
2181 		uint_t maxsdu;
2182 		bridge_inst_t *bip = blp->bl_inst;
2183 		bridge_mac_t *bmp = bip->bi_mac;
2184 		boolean_t notify = B_FALSE;
2185 		mblk_t *mlist = NULL;
2186 
2187 		mac_sdu_get(blp->bl_mh, NULL, &maxsdu);
2188 		rw_enter(&bip->bi_rwlock, RW_READER);
2189 		if (list_prev(&bip->bi_links, blp) == NULL &&
2190 		    list_next(&bip->bi_links, blp) == NULL) {
2191 			notify = (maxsdu != bmp->bm_maxsdu);
2192 			bmp->bm_maxsdu = maxsdu;
2193 		}
2194 		blp->bl_maxsdu = maxsdu;
2195 		if (maxsdu != bmp->bm_maxsdu)
2196 			link_sdu_fail(blp, B_TRUE, &mlist);
2197 		else if (notify)
2198 			(void) mac_maxsdu_update(bmp->bm_mh, maxsdu);
2199 		rw_exit(&bip->bi_rwlock);
2200 		send_up_messages(bip, mlist);
2201 		break;
2202 	}
2203 	}
2204 }
2205 
2206 /*
2207  * This is called by the MAC layer.  As with the transmit side, we're right in
2208  * the data path for all I/O on this port, so if we don't need to forward this
2209  * packet anywhere, we have to send it upwards via mac_rx_common.
 *
 * Arguments:
 *	mh	the receiving link; this is really the bridge_link_t pointer
 *		cast to mac_handle_t
 *	rsrc	passed through untouched to mac_rx_common() and trill_recv_fn()
 *	mpnext	chain of received packets, linked through b_next
 *
 * Every packet on the chain is disposed of here: it is handed to
 * trill_recv_fn(), bridge_forward() or mac_rx_common(), or it is freed.
2210  */
2211 static void
2212 bridge_recv_cb(mac_handle_t mh, mac_resource_handle_t rsrc, mblk_t *mpnext)
2213 {
2214 	mblk_t *mp, *mpcopy;
2215 	bridge_link_t *blp = (bridge_link_t *)mh;
2216 	bridge_inst_t *bip = blp->bl_inst;
2217 	bridge_mac_t *bmp = bip->bi_mac;
2218 	mac_header_info_t hdr_info;
2219 	uint16_t vlanid, tci;
2220 	boolean_t trillmode = B_FALSE;
2221 
2222 	KIINCR(bki_recv);
2223 	KLINCR(bkl_recv);
2224 
2225 	/*
2226 	 * Regardless of state, check for inbound TRILL packets when TRILL is
2227 	 * active.  These are pulled out of band and sent for TRILL handling.
	 *
	 * The first test of bl_trilldata is made without bl_trilllock; it is
	 * repeated under the lock before the value is used.
2228 	 */
2229 	if (blp->bl_trilldata != NULL) {
2230 		void *tdp;
2231 		mblk_t *newhead;
2232 		mblk_t *tail = NULL;
2233 
2234 		mutex_enter(&blp->bl_trilllock);
2235 		if ((tdp = blp->bl_trilldata) != NULL) {
			/*
			 * Count this thread as a TRILL user while the lock is
			 * dropped.  The matching decrement below wakes any
			 * bl_trillwait waiters once the count reaches zero and
			 * bl_trilldata has been cleared.
			 */
2236 			blp->bl_trillthreads++;
2237 			mutex_exit(&blp->bl_trilllock);
2238 			trillmode = B_TRUE;
			/*
			 * newhead is the first packet still on the chain;
			 * tail is the most recent packet left on the chain,
			 * and is used to unlink TRILL packets that follow it.
			 */
2239 			newhead = mpnext;
2240 			while ((mp = mpnext) != NULL) {
2241 				boolean_t raw_isis, bridge_group;
2242 
2243 				mpnext = mp->b_next;
2244 
2245 				/*
2246 				 * If the header isn't readable, then leave on
2247 				 * the list and continue.
2248 				 */
2249 				if (mac_header_info(blp->bl_mh, mp,
2250 				    &hdr_info) != 0) {
2251 					tail = mp;
2252 					continue;
2253 				}
2254 
2255 				/*
2256 				 * The TRILL document specifies that, on
2257 				 * Ethernet alone, IS-IS packets arrive with
2258 				 * LLC rather than Ethertype, and using a
2259 				 * specific destination address.  We must check
2260 				 * for that here.  Also, we need to give BPDUs
2261 				 * to TRILL for processing.
2262 				 */
2263 				raw_isis = bridge_group = B_FALSE;
2264 				if (hdr_info.mhi_dsttype ==
2265 				    MAC_ADDRTYPE_MULTICAST) {
2266 					if (memcmp(hdr_info.mhi_daddr,
2267 					    all_isis_rbridges, ETHERADDRL) == 0)
2268 						raw_isis = B_TRUE;
2269 					else if (memcmp(hdr_info.mhi_daddr,
2270 					    bridge_group_address, ETHERADDRL) ==
2271 					    0)
2272 						bridge_group = B_TRUE;
2273 				}
				/*
				 * Anything that is not raw IS-IS, not a BPDU,
				 * and not TRILL-typed (directly or inside a
				 * VLAN tag) stays on the chain for ordinary
				 * processing below.
				 */
2274 				if (!raw_isis && !bridge_group &&
2275 				    hdr_info.mhi_bindsap != ETHERTYPE_TRILL &&
2276 				    (hdr_info.mhi_bindsap != ETHERTYPE_VLAN ||
2277 				    /* LINTED: alignment */
2278 				    ((struct ether_vlan_header *)mp->b_rptr)->
2279 				    ether_type != htons(ETHERTYPE_TRILL))) {
2280 					tail = mp;
2281 					continue;
2282 				}
2283 
2284 				/*
2285 				 * We've got TRILL input.  Remove from the list
2286 				 * and send up through the TRILL module.  (Send
2287 				 * a copy through promiscuous receive just to
2288 				 * support snooping on TRILL.  Order isn't
2289 				 * preserved strictly, but that doesn't matter
2290 				 * here.)
2291 				 */
2292 				if (tail != NULL)
2293 					tail->b_next = mpnext;
2294 				mp->b_next = NULL;
2295 				if (mp == newhead)
2296 					newhead = mpnext;
2297 				mac_trill_snoop(blp->bl_mh, mp);
2298 				update_header(mp, &hdr_info, B_TRUE);
2299 				/*
2300 				 * On raw IS-IS and BPDU frames, we have to
2301 				 * make sure that the length is trimmed
2302 				 * properly.  We use origsap in order to cope
2303 				 * with jumbograms for IS-IS.  (Regular mac
2304 				 * can't.)
				 *
				 * Too-long messages are trimmed with a
				 * negative adjmsg() length; too-short ones
				 * are dropped.
2305 				 */
2306 				if (raw_isis || bridge_group) {
2307 					size_t msglen = msgdsize(mp);
2308 
2309 					if (msglen > hdr_info.mhi_origsap) {
2310 						(void) adjmsg(mp,
2311 						    hdr_info.mhi_origsap -
2312 						    msglen);
2313 					} else if (msglen <
2314 					    hdr_info.mhi_origsap) {
2315 						freemsg(mp);
2316 						continue;
2317 					}
2318 				}
2319 				trill_recv_fn(tdp, blp, rsrc, mp, &hdr_info);
2320 			}
			/* Continue below with whatever was left on the chain. */
2321 			mpnext = newhead;
2322 			mutex_enter(&blp->bl_trilllock);
2323 			if (--blp->bl_trillthreads == 0 &&
2324 			    blp->bl_trilldata == NULL)
2325 				cv_broadcast(&blp->bl_trillwait);
2326 		}
2327 		mutex_exit(&blp->bl_trilllock);
2328 		if (mpnext == NULL)
2329 			return;
2330 	}
2331 
2332 	/*
2333 	 * If this is a TRILL RBridge, then just check whether this link is
2334 	 * used at all for forwarding.  If not, then we're done.
2335 	 */
2336 	if (trillmode) {
2337 		if (!(blp->bl_flags & BLF_TRILLACTIVE) ||
2338 		    (blp->bl_flags & BLF_SDUFAIL)) {
2339 			mac_rx_common(blp->bl_mh, rsrc, mpnext);
2340 			return;
2341 		}
2342 	} else {
2343 		/*
2344 		 * For regular (STP) bridges, if we're in blocking or listening
2345 		 * state, then do nothing.  We don't learn or forward until
2346 		 * told to do so.
2347 		 */
2348 		if (blp->bl_state == BLS_BLOCKLISTEN) {
2349 			mac_rx_common(blp->bl_mh, rsrc, mpnext);
2350 			return;
2351 		}
2352 	}
2353 
2354 	/*
2355 	 * Send a copy of the message chain up to the observability node users.
2356 	 * For TRILL, we must obey the VLAN AF rules, so we go packet-by-
2357 	 * packet.
2358 	 */
2359 	if (!trillmode && blp->bl_state == BLS_FORWARDING &&
2360 	    (bmp->bm_flags & BMF_STARTED) &&
2361 	    (mp = copymsgchain(mpnext)) != NULL) {
2362 		mac_rx(bmp->bm_mh, NULL, mp);
2363 	}
2364 
2365 	/*
2366 	 * We must be in learning or forwarding state, or using TRILL on a link
2367 	 * with one or more VLANs active.  For each packet in the list, process
2368 	 * the source address, and then attempt to forward.
2369 	 */
2370 	while ((mp = mpnext) != NULL) {
2371 		mpnext = mp->b_next;
2372 		mp->b_next = NULL;
2373 
2374 		/*
2375 		 * If we can't decode the header or if the header specifies a
2376 		 * multicast source address (impossible!), then don't bother
2377 		 * learning or forwarding, but go ahead and forward up the
2378 		 * stack for subsequent processing.
2379 		 */
2380 		if (mac_header_info(blp->bl_mh, mp, &hdr_info) != 0 ||
2381 		    (hdr_info.mhi_saddr[0] & 1) != 0) {
2382 			KIINCR(bki_drops);
2383 			KLINCR(bkl_drops);
2384 			mac_rx_common(blp->bl_mh, rsrc, mp);
2385 			continue;
2386 		}
2387 
2388 		/*
2389 		 * Extract and validate the VLAN ID for this packet.
		 * Packets that fail are not learned from or forwarded, but
		 * are still passed up the stack.
2390 		 */
2391 		if (!bridge_get_vlan(blp, &hdr_info, mp, &vlanid, &tci) ||
2392 		    !BRIDGE_AF_ISSET(blp, vlanid)) {
2393 			mac_rx_common(blp->bl_mh, rsrc, mp);
2394 			continue;
2395 		}
2396 
2397 		if (trillmode) {
2398 			/*
2399 			 * Special test required by TRILL document: must
2400 			 * discard frames with outer address set to ESADI.
			 * (The frame is not bridged, but it is still passed
			 * up the stack.)
2401 			 */
2402 			if (memcmp(hdr_info.mhi_daddr, all_esadi_rbridges,
2403 			    ETHERADDRL) == 0) {
2404 				mac_rx_common(blp->bl_mh, rsrc, mp);
2405 				continue;
2406 			}
2407 
2408 			/*
2409 			 * If we're in TRILL mode, then the checks above
2410 			 * (bridge_get_vlan and BRIDGE_AF_ISSET) have also
2411 			 * established that we're the appointed forwarder, so
2412 			 * report that we're handling this packet to any
			 * observability node users.
2413 			 */
2414 			if ((bmp->bm_flags & BMF_STARTED) &&
2415 			    (mpcopy = copymsg(mp)) != NULL)
2416 				mac_rx(bmp->bm_mh, NULL, mpcopy);
2417 		}
2418 
2419 		/*
2420 		 * First process the source address and learn from it.  For
2421 		 * TRILL, we learn only if we're the appointed forwarder.
2422 		 */
2423 		bridge_learn(blp, hdr_info.mhi_saddr, RBRIDGE_NICKNAME_NONE,
2424 		    vlanid);
2425 
2426 		/*
2427 		 * Now check whether we're forwarding and look up the
2428 		 * destination.  If we can forward, do so.
		 * bridge_forward() hands the packet back if it must also be
		 * delivered on this link; in that case, or if we're only
		 * learning, pass it up the stack.
2429 		 */
2430 		if (trillmode || blp->bl_state == BLS_FORWARDING) {
2431 			mp = bridge_forward(blp, &hdr_info, mp, vlanid, tci,
2432 			    B_FALSE, B_FALSE);
2433 		}
2434 		if (mp != NULL)
2435 			mac_rx_common(blp->bl_mh, rsrc, mp);
2436 	}
2437 }
2438 
2439 
2440 /* ARGSUSED */
2441 static mblk_t *
2442 bridge_xmit_cb(mac_handle_t mh, mac_ring_handle_t rh, mblk_t *mpnext)
2443 {
2444 	bridge_link_t *blp = (bridge_link_t *)mh;
2445 	bridge_inst_t *bip = blp->bl_inst;
2446 	bridge_mac_t *bmp = bip->bi_mac;
2447 	mac_header_info_t hdr_info;
2448 	uint16_t vlanid, tci;
2449 	mblk_t *mp, *mpcopy;
2450 	boolean_t trillmode;
2451 
2452 	trillmode = blp->bl_trilldata != NULL;
2453 
2454 	/*
2455 	 * If we're using STP and we're in blocking or listening state, or if
2456 	 * we're using TRILL and no VLANs are active, then behave as though the
2457 	 * bridge isn't here at all, and send on the local link alone.
2458 	 */
2459 	if ((!trillmode && blp->bl_state == BLS_BLOCKLISTEN) ||
2460 	    (trillmode &&
2461 	    (!(blp->bl_flags & BLF_TRILLACTIVE) ||
2462 	    (blp->bl_flags & BLF_SDUFAIL)))) {
2463 		KIINCR(bki_sent);
2464 		KLINCR(bkl_xmit);
2465 		MAC_RING_TX(blp->bl_mh, rh, mpnext, mp);
2466 		return (mp);
2467 	}
2468 
2469 	/*
2470 	 * Send a copy of the message up to the observability node users.
2471 	 * TRILL needs to check on a packet-by-packet basis.
2472 	 */
2473 	if (!trillmode && blp->bl_state == BLS_FORWARDING &&
2474 	    (bmp->bm_flags & BMF_STARTED) &&
2475 	    (mp = copymsgchain(mpnext)) != NULL) {
2476 		mac_rx(bmp->bm_mh, NULL, mp);
2477 	}
2478 
2479 	while ((mp = mpnext) != NULL) {
2480 		mpnext = mp->b_next;
2481 		mp->b_next = NULL;
2482 
2483 		if (mac_header_info(blp->bl_mh, mp, &hdr_info) != 0) {
2484 			freemsg(mp);
2485 			continue;
2486 		}
2487 
2488 		/*
2489 		 * Extract and validate the VLAN ID for this packet.
2490 		 */
2491 		if (!bridge_get_vlan(blp, &hdr_info, mp, &vlanid, &tci) ||
2492 		    !BRIDGE_AF_ISSET(blp, vlanid)) {
2493 			freemsg(mp);
2494 			continue;
2495 		}
2496 
2497 		/*
2498 		 * If we're using TRILL, then we've now validated that we're
2499 		 * the forwarder for this VLAN, so go ahead and let
2500 		 * observability node users know about the packet.
2501 		 */
2502 		if (trillmode && (bmp->bm_flags & BMF_STARTED) &&
2503 		    (mpcopy = copymsg(mp)) != NULL) {
2504 			mac_rx(bmp->bm_mh, NULL, mpcopy);
2505 		}
2506 
2507 		/*
2508 		 * We have to learn from our own transmitted packets, because
2509 		 * there may be a Solaris DLPI raw sender (who can specify his
2510 		 * own source address) using promiscuous mode for receive.  The
2511 		 * mac layer information won't (and can't) tell us everything
2512 		 * we need to know.
2513 		 */
2514 		bridge_learn(blp, hdr_info.mhi_saddr, RBRIDGE_NICKNAME_NONE,
2515 		    vlanid);
2516 
2517 		/* attempt forwarding */
2518 		if (trillmode || blp->bl_state == BLS_FORWARDING) {
2519 			mp = bridge_forward(blp, &hdr_info, mp, vlanid, tci,
2520 			    B_FALSE, B_TRUE);
2521 		}
2522 		if (mp != NULL) {
2523 			MAC_RING_TX(blp->bl_mh, rh, mp, mp);
2524 			if (mp == NULL) {
2525 				KIINCR(bki_sent);
2526 				KLINCR(bkl_xmit);
2527 			}
2528 		}
2529 		/*
2530 		 * If we get stuck, then stop.  Don't let the user's output
2531 		 * packets get out of order.  (More importantly: don't try to
2532 		 * bridge the same packet multiple times if flow control is
2533 		 * asserted.)
2534 		 */
2535 		if (mp != NULL) {
2536 			mp->b_next = mpnext;
2537 			break;
2538 		}
2539 	}
2540 	return (mp);
2541 }
2542 
2543 /*
2544  * This is called by TRILL when it decapsulates an packet, and we must forward
2545  * locally.  On failure, we just drop.
2546  *
2547  * Note that the ingress_nick reported by TRILL must not represent this local
2548  * node.
2549  */
2550 void
2551 bridge_trill_decaps(bridge_link_t *blp, mblk_t *mp, uint16_t ingress_nick)
2552 {
2553 	mac_header_info_t hdr_info;
2554 	uint16_t vlanid, tci;
2555 	bridge_inst_t *bip = blp->bl_inst;	/* used by macros */
2556 	mblk_t *mpcopy;
2557 
2558 	if (mac_header_info(blp->bl_mh, mp, &hdr_info) != 0) {
2559 		freemsg(mp);
2560 		return;
2561 	}
2562 
2563 	/* Extract VLAN ID for this packet. */
2564 	if (hdr_info.mhi_bindsap == ETHERTYPE_VLAN) {
2565 		struct ether_vlan_header *evhp;
2566 
2567 		/* LINTED: alignment */
2568 		evhp = (struct ether_vlan_header *)mp->b_rptr;
2569 		tci = ntohs(evhp->ether_tci);
2570 		vlanid = VLAN_ID(tci);
2571 	} else {
2572 		/* Inner VLAN headers are required in TRILL data packets */
2573 		DTRACE_PROBE3(bridge__trill__decaps__novlan, bridge_link_t *,
2574 		    blp, mblk_t *, mp, uint16_t, ingress_nick);
2575 		freemsg(mp);
2576 		return;
2577 	}
2578 
2579 	/* Learn the location of this sender in the RBridge network */
2580 	bridge_learn(blp, hdr_info.mhi_saddr, ingress_nick, vlanid);
2581 
2582 	/* attempt forwarding */
2583 	mp = bridge_forward(blp, &hdr_info, mp, vlanid, tci, B_TRUE, B_TRUE);
2584 	if (mp != NULL) {
2585 		if (bridge_can_send(blp, vlanid)) {
2586 			/* Deliver a copy locally as well */
2587 			if ((mpcopy = copymsg(mp)) != NULL)
2588 				mac_rx_common(blp->bl_mh, NULL, mpcopy);
2589 			MAC_RING_TX(blp->bl_mh, NULL, mp, mp);
2590 		}
2591 		if (mp == NULL) {
2592 			KIINCR(bki_sent);
2593 			KLINCR(bkl_xmit);
2594 		} else {
2595 			freemsg(mp);
2596 		}
2597 	}
2598 }
2599 
2600 /*
2601  * This function is used by TRILL _only_ to transmit TRILL-encapsulated
2602  * packets.  It sends on a single underlying link and does not bridge.
2603  */
2604 mblk_t *
2605 bridge_trill_output(bridge_link_t *blp, mblk_t *mp)
2606 {
2607 	bridge_inst_t *bip = blp->bl_inst;	/* used by macros */
2608 
2609 	mac_trill_snoop(blp->bl_mh, mp);
2610 	MAC_RING_TX(blp->bl_mh, NULL, mp, mp);
2611 	if (mp == NULL) {
2612 		KIINCR(bki_sent);
2613 		KLINCR(bkl_xmit);
2614 	}
2615 	return (mp);
2616 }
2617 
2618 /*
2619  * Set the "appointed forwarder" flag array for this link.  TRILL controls
2620  * forwarding on a VLAN basis.  The "trillactive" flag is an optimization for
2621  * the forwarder.
2622  */
2623 void
2624 bridge_trill_setvlans(bridge_link_t *blp, const uint8_t *arr)
2625 {
2626 	int i;
2627 	uint_t newflags = 0;
2628 
2629 	for (i = 0; i < BRIDGE_VLAN_ARR_SIZE; i++) {
2630 		if ((blp->bl_afs[i] = arr[i]) != 0)
2631 			newflags = BLF_TRILLACTIVE;
2632 	}
2633 	blp->bl_flags = (blp->bl_flags & ~BLF_TRILLACTIVE) | newflags;
2634 }
2635 
2636 void
2637 bridge_trill_flush(bridge_link_t *blp, uint16_t vlan, boolean_t dotrill)
2638 {
2639 	bridge_inst_t *bip = blp->bl_inst;
2640 	bridge_fwd_t *bfp, *bfnext;
2641 	avl_tree_t fwd_scavenge;
2642 	int i;
2643 
2644 	_NOTE(ARGUNUSED(vlan));
2645 
2646 	avl_create(&fwd_scavenge, fwd_compare, sizeof (bridge_fwd_t),
2647 	    offsetof(bridge_fwd_t, bf_node));
2648 	rw_enter(&bip->bi_rwlock, RW_WRITER);
2649 	bfnext = avl_first(&bip->bi_fwd);
2650 	while ((bfp = bfnext) != NULL) {
2651 		bfnext = AVL_NEXT(&bip->bi_fwd, bfp);
2652 		if (bfp->bf_flags & BFF_LOCALADDR)
2653 			continue;
2654 		if (dotrill) {
2655 			/* port doesn't matter if we're flushing TRILL */
2656 			if (bfp->bf_trill_nick == RBRIDGE_NICKNAME_NONE)
2657 				continue;
2658 		} else {
2659 			if (bfp->bf_trill_nick != RBRIDGE_NICKNAME_NONE)
2660 				continue;
2661 			for (i = 0; i < bfp->bf_nlinks; i++) {
2662 				if (bfp->bf_links[i] == blp)
2663 					break;
2664 			}
2665 			if (i >= bfp->bf_nlinks)
2666 				continue;
2667 		}
2668 		ASSERT(bfp->bf_flags & BFF_INTREE);
2669 		avl_remove(&bip->bi_fwd, bfp);
2670 		bfp->bf_flags &= ~BFF_INTREE;
2671 		avl_add(&fwd_scavenge, bfp);
2672 	}
2673 	rw_exit(&bip->bi_rwlock);
2674 	bfnext = avl_first(&fwd_scavenge);
2675 	while ((bfp = bfnext) != NULL) {
2676 		bfnext = AVL_NEXT(&fwd_scavenge, bfp);
2677 		avl_remove(&fwd_scavenge, bfp);
2678 		fwd_unref(bfp);
2679 	}
2680 	avl_destroy(&fwd_scavenge);
2681 }
2682 
2683 /*
2684  * Let the mac module take or drop a reference to a bridge link.  When this is
2685  * called, the mac module is holding the mi_bridge_lock, so the link cannot be
2686  * in the process of entering or leaving a bridge.
2687  */
2688 static void
2689 bridge_ref_cb(mac_handle_t mh, boolean_t hold)
2690 {
2691 	bridge_link_t *blp = (bridge_link_t *)mh;
2692 
2693 	if (hold)
2694 		atomic_inc_uint(&blp->bl_refs);
2695 	else
2696 		link_unref(blp);
2697 }
2698 
2699 /*
2700  * Handle link state changes reported by the mac layer.  This acts as a filter
2701  * for link state changes: if a link is reporting down, but there are other
2702  * links still up on the bridge, then the state is changed to "up."  When the
2703  * last link goes down, all are marked down, and when the first link goes up,
2704  * all are marked up.  (Recursion is avoided by the use of the "redo" function.)
2705  *
2706  * We treat unknown as equivalent to "up."
2707  */
2708 static link_state_t
2709 bridge_ls_cb(mac_handle_t mh, link_state_t newls)
2710 {
2711 	bridge_link_t *blp = (bridge_link_t *)mh;
2712 	bridge_link_t *blcmp;
2713 	bridge_inst_t *bip;
2714 	bridge_mac_t *bmp;
2715 
2716 	if (newls != LINK_STATE_DOWN && blp->bl_linkstate != LINK_STATE_DOWN ||
2717 	    (blp->bl_flags & (BLF_DELETED|BLF_SDUFAIL))) {
2718 		blp->bl_linkstate = newls;
2719 		return (newls);
2720 	}
2721 
2722 	/*
2723 	 * Scan first to see if there are any other non-down links.  If there
2724 	 * are, then we're done.  Otherwise, if all others are down, then the
2725 	 * state of this link is the state of the bridge.
2726 	 */
2727 	bip = blp->bl_inst;
2728 	rw_enter(&bip->bi_rwlock, RW_WRITER);
2729 	for (blcmp = list_head(&bip->bi_links); blcmp != NULL;
2730 	    blcmp = list_next(&bip->bi_links, blcmp)) {
2731 		if (blcmp != blp &&
2732 		    !(blcmp->bl_flags & (BLF_DELETED|BLF_SDUFAIL)) &&
2733 		    blcmp->bl_linkstate != LINK_STATE_DOWN)
2734 			break;
2735 	}
2736 
2737 	if (blcmp != NULL) {
2738 		/*
2739 		 * If there are other links that are considered up, then tell
2740 		 * the caller that the link is actually still up, regardless of
2741 		 * this link's underlying state.
2742 		 */
2743 		blp->bl_linkstate = newls;
2744 		newls = LINK_STATE_UP;
2745 	} else if (blp->bl_linkstate != newls) {
2746 		/*
2747 		 * If we've found no other 'up' links, and this link has
2748 		 * changed state, then report the new state of the bridge to
2749 		 * all other clients.
2750 		 */
2751 		blp->bl_linkstate = newls;
2752 		for (blcmp = list_head(&bip->bi_links); blcmp != NULL;
2753 		    blcmp = list_next(&bip->bi_links, blcmp)) {
2754 			if (blcmp != blp && !(blcmp->bl_flags & BLF_DELETED))
2755 				mac_link_redo(blcmp->bl_mh, newls);
2756 		}
2757 		bmp = bip->bi_mac;
2758 		if ((bmp->bm_linkstate = newls) != LINK_STATE_DOWN)
2759 			bmp->bm_linkstate = LINK_STATE_UP;
2760 		mac_link_redo(bmp->bm_mh, bmp->bm_linkstate);
2761 	}
2762 	rw_exit(&bip->bi_rwlock);
2763 	return (newls);
2764 }
2765 
/*
 * Add a link to a bridge instance in response to an ioctl.
 *
 * 'arg' is the ioctl message.  Its b_next field carries the bridge_stream_t
 * of the requesting stream, and its b_cont data starts with the datalink_id_t
 * of the link to add, followed by the link name bytes.
 *
 * On success the link is on the instance's bi_links list and the ioctl is
 * acknowledged with miocack(); on failure any partial setup is undone and the
 * ioctl is rejected with miocnak() and an errno value.  Either way the
 * reference held on the stream is dropped before returning.
 *
 * NOTE(review): this appears to run on the bridge taskq (see the single-thread
 * remark below); the dispatch site is not in this part of the file.
 */
2766 static void
2767 bridge_add_link(void *arg)
2768 {
2769 	mblk_t *mp = arg;
2770 	bridge_stream_t *bsp;
2771 	bridge_inst_t *bip, *bipt;
2772 	bridge_mac_t *bmp;
2773 	datalink_id_t linkid;
2774 	int err;
2775 	mac_handle_t mh;
2776 	uint_t maxsdu;
2777 	bridge_link_t *blp = NULL, *blpt;
2778 	const mac_info_t *mip;
2779 	boolean_t macopen = B_FALSE;
2780 	char linkname[MAXLINKNAMELEN];
2781 	char kstatname[KSTAT_STRLEN];
2782 	int i;
2783 	link_state_t linkstate;
2784 	mblk_t *mlist;
2785 
	/* Recover the stream pointer that was stashed in b_next. */
2786 	bsp = (bridge_stream_t *)mp->b_next;
2787 	mp->b_next = NULL;
2788 	bip = bsp->bs_inst;
2789 	/* LINTED: alignment */
2790 	linkid = *(datalink_id_t *)mp->b_cont->b_rptr;
2791 
2792 	/*
2793 	 * First make sure that there is no other bridge that has this link.
2794 	 * We don't want to overlap operations from two bridges; the MAC layer
2795 	 * supports only one bridge on a given MAC at a time.
2796 	 *
2797 	 * We rely on the fact that there's just one taskq thread for the
2798 	 * bridging module: once we've checked for a duplicate, we can drop the
2799 	 * lock, because no other thread could possibly be adding another link
2800 	 * until we're done.
2801 	 */
2802 	mutex_enter(&inst_lock);
2803 	for (bipt = list_head(&inst_list); bipt != NULL;
2804 	    bipt = list_next(&inst_list, bipt)) {
2805 		rw_enter(&bipt->bi_rwlock, RW_READER);
2806 		for (blpt = list_head(&bipt->bi_links); blpt != NULL;
2807 		    blpt = list_next(&bipt->bi_links, blpt)) {
2808 			if (linkid == blpt->bl_linkid)
2809 				break;
2810 		}
2811 		rw_exit(&bipt->bi_rwlock);
2812 		if (blpt != NULL)
2813 			break;
2814 	}
2815 	mutex_exit(&inst_lock);
	/* bipt is non-NULL only if some instance already has this link. */
2816 	if (bipt != NULL) {
2817 		err = EBUSY;
2818 		goto fail;
2819 	}
2820 
2821 	if ((err = mac_open_by_linkid(linkid, &mh)) != 0)
2822 		goto fail;
2823 	macopen = B_TRUE;
2824 
2825 	/* we bridge only Ethernet */
2826 	mip = mac_info(mh);
2827 	if (mip->mi_media != DL_ETHER) {
2828 		err = ENOTSUP;
2829 		goto fail;
2830 	}
2831 
2832 	/*
2833 	 * Get the current maximum SDU on this interface.  If this is the
2834 	 * first link, it becomes the standard for the new bridge.  If there
2835 	 * are other links on the bridge and this one doesn't match, the link
	 * is still added, but the mismatch is reported through
	 * link_sdu_fail() below.
2836 	 */
2837 	mac_sdu_get(mh, NULL, &maxsdu);
2838 	bmp = bip->bi_mac;
2839 	if (list_is_empty(&bip->bi_links)) {
2840 		bmp->bm_maxsdu = maxsdu;
2841 		(void) mac_maxsdu_update(bmp->bm_mh, maxsdu);
2842 	}
2843 
2844 	/* figure the kstat name; also used as the mac client name */
	/*
	 * The link name is whatever follows the link ID in the message.  A
	 * length that is negative or too large is replaced by
	 * MAXLINKNAMELEN - 1.
	 *
	 * NOTE(review): in the negative case that copies more than the mblk
	 * holds; presumably the ioctl path guarantees at least
	 * sizeof (datalink_id_t) bytes here -- confirm against the caller.
	 */
2845 	i = MBLKL(mp->b_cont) - sizeof (datalink_id_t);
2846 	if (i < 0 || i >= MAXLINKNAMELEN)
2847 		i = MAXLINKNAMELEN - 1;
2848 	bcopy(mp->b_cont->b_rptr + sizeof (datalink_id_t), linkname, i);
2849 	linkname[i] = '\0';
2850 	(void) snprintf(kstatname, sizeof (kstatname), "%s-%s", bip->bi_name,
2851 	    linkname);
2852 
2853 	if ((blp = kmem_zalloc(sizeof (*blp), KM_NOSLEEP)) == NULL) {
2854 		err = ENOMEM;
2855 		goto fail;
2856 	}
2857 	blp->bl_lfailmp = allocb(sizeof (bridge_ctl_t), BPRI_MED);
2858 	if (blp->bl_lfailmp == NULL) {
		/*
		 * Free the link by hand and clear blp so that the failure
		 * path below only closes the MAC handle.
		 */
2859 		kmem_free(blp, sizeof (*blp));
2860 		blp = NULL;
2861 		err = ENOMEM;
2862 		goto fail;
2863 	}
2864 
	/*
	 * From here on, failures are cleaned up through link_shutdown().
	 * The appointed-forwarder array starts with every bit set.
	 */
2865 	blp->bl_refs = 1;
2866 	atomic_inc_uint(&bip->bi_refs);
2867 	blp->bl_inst = bip;
2868 	blp->bl_mh = mh;
2869 	blp->bl_linkid = linkid;
2870 	blp->bl_maxsdu = maxsdu;
2871 	cv_init(&blp->bl_trillwait, NULL, CV_DRIVER, NULL);
2872 	mutex_init(&blp->bl_trilllock, NULL, MUTEX_DRIVER, NULL);
2873 	(void) memset(blp->bl_afs, 0xff, sizeof (blp->bl_afs));
2874 
	/* Each completed setup step is recorded in bl_flags. */
2875 	err = mac_client_open(mh, &blp->bl_mch, kstatname, 0);
2876 	if (err != 0)
2877 		goto fail;
2878 	blp->bl_flags |= BLF_CLIENT_OPEN;
2879 
2880 	err = mac_margin_add(mh, &blp->bl_margin, B_TRUE);
2881 	if (err != 0)
2882 		goto fail;
2883 	blp->bl_flags |= BLF_MARGIN_ADDED;
2884 
2885 	blp->bl_mnh = mac_notify_add(mh, bridge_notify_cb, blp);
2886 
2887 	/* Enable Bridging on the link */
2888 	err = mac_bridge_set(mh, (mac_handle_t)blp);
2889 	if (err != 0)
2890 		goto fail;
2891 	blp->bl_flags |= BLF_SET_BRIDGE;
2892 
2893 	err = mac_promisc_add(blp->bl_mch, MAC_CLIENT_PROMISC_ALL, NULL,
2894 	    blp, &blp->bl_mphp, MAC_PROMISC_FLAGS_NO_TX_LOOP);
2895 	if (err != 0)
2896 		goto fail;
2897 	blp->bl_flags |= BLF_PROM_ADDED;
2898 
2899 	bridge_new_unicst(blp);
2900 
2901 	blp->bl_ksp = kstat_setup((kstat_named_t *)&blp->bl_kstats,
2902 	    link_kstats_list, Dim(link_kstats_list), kstatname);
2903 
2904 	/*
2905 	 * The link holds a reference to the bridge instance, so that the
2906 	 * instance can't go away before the link is freed.  The insertion into
2907 	 * bi_links holds a reference on the link (reference set to 1 above).
2908 	 * When marking as removed from bi_links (BLF_DELETED), drop the
2909 	 * reference on the link. When freeing the link, drop the reference on
2910 	 * the instance. BLF_LINK_ADDED tracks link insertion in bi_links list.
2911 	 */
2912 	rw_enter(&bip->bi_rwlock, RW_WRITER);
2913 	list_insert_tail(&bip->bi_links, blp);
2914 	blp->bl_flags |= BLF_LINK_ADDED;
2915 
2916 	/*
2917 	 * If the new link is no good on this bridge, then let the daemon know
2918 	 * about the problem.
	 * Any resulting messages are collected on mlist and sent only after
	 * the lock has been dropped.
2919 	 */
2920 	mlist = NULL;
2921 	if (maxsdu != bmp->bm_maxsdu)
2922 		link_sdu_fail(blp, B_TRUE, &mlist);
2923 	rw_exit(&bip->bi_rwlock);
2924 	send_up_messages(bip, mlist);
2925 
2926 	/*
2927 	 * Trigger a link state update so that if this link is the first one
2928 	 * "up" in the bridge, then we notify everyone.  This triggers a trip
2929 	 * through bridge_ls_cb.
	 * The recorded state is forced to "down" first so that the update
	 * looks like a change to bridge_ls_cb.
2930 	 */
2931 	linkstate = mac_stat_get(mh, MAC_STAT_LOWLINK_STATE);
2932 	blp->bl_linkstate = LINK_STATE_DOWN;
2933 	mac_link_update(mh, linkstate);
2934 
2935 	/*
2936 	 * We now need to report back to the stream that invoked us, and then
2937 	 * drop the reference on the stream that we're holding.
2938 	 */
2939 	miocack(bsp->bs_wq, mp, 0, 0);
2940 	stream_unref(bsp);
2941 	return;
2942 
2943 fail:
	/*
	 * If no link structure exists, only the MAC handle (when open) needs
	 * closing.  Otherwise the partially set up link is handed to
	 * link_shutdown().
	 *
	 * NOTE(review): link_shutdown() presumably uses the BLF_* flags set
	 * above to undo only the steps that completed -- confirm there.
	 */
2944 	if (blp == NULL) {
2945 		if (macopen)
2946 			mac_close(mh);
2947 	} else {
2948 		link_shutdown(blp);
2949 	}
2950 	miocnak(bsp->bs_wq, mp, 0, err);
2951 	stream_unref(bsp);
2952 }
2953 
2954 static void
2955 bridge_rem_link(void *arg)
2956 {
2957 	mblk_t *mp = arg;
2958 	bridge_stream_t *bsp;
2959 	bridge_inst_t *bip;
2960 	bridge_mac_t *bmp;
2961 	datalink_id_t linkid;
2962 	bridge_link_t *blp, *blsave;
2963 	boolean_t found;
2964 	mblk_t *mlist;
2965 
2966 	bsp = (bridge_stream_t *)mp->b_next;
2967 	mp->b_next = NULL;
2968 	bip = bsp->bs_inst;
2969 	/* LINTED: alignment */
2970 	linkid = *(datalink_id_t *)mp->b_cont->b_rptr;
2971 
2972 	/*
2973 	 * We become reader here so that we can loop over the other links and
2974 	 * deliver link up/down notification.
2975 	 */
2976 	rw_enter(&bip->bi_rwlock, RW_READER);
2977 	found = B_FALSE;
2978 	for (blp = list_head(&bip->bi_links); blp != NULL;
2979 	    blp = list_next(&bip->bi_links, blp)) {
2980 		if (blp->bl_linkid == linkid &&
2981 		    !(blp->bl_flags & BLF_DELETED)) {
2982 			blp->bl_flags |= BLF_DELETED;
2983 			(void) ddi_taskq_dispatch(bridge_taskq, link_shutdown,
2984 			    blp, DDI_SLEEP);
2985 			found = B_TRUE;
2986 			break;
2987 		}
2988 	}
2989 
2990 	/*
2991 	 * Check if this link is up and the remainder of the links are all
2992 	 * down.
2993 	 */
2994 	if (blp != NULL && blp->bl_linkstate != LINK_STATE_DOWN) {
2995 		for (blp = list_head(&bip->bi_links); blp != NULL;
2996 		    blp = list_next(&bip->bi_links, blp)) {
2997 			if (blp->bl_linkstate != LINK_STATE_DOWN &&
2998 			    !(blp->bl_flags & (BLF_DELETED|BLF_SDUFAIL)))
2999 				break;
3000 		}
3001 		if (blp == NULL) {
3002 			for (blp = list_head(&bip->bi_links); blp != NULL;
3003 			    blp = list_next(&bip->bi_links, blp)) {
3004 				if (!(blp->bl_flags & BLF_DELETED))
3005 					mac_link_redo(blp->bl_mh,
3006 					    LINK_STATE_DOWN);
3007 			}
3008 			bmp = bip->bi_mac;
3009 			bmp->bm_linkstate = LINK_STATE_DOWN;
3010 			mac_link_redo(bmp->bm_mh, LINK_STATE_DOWN);
3011 		}
3012 	}
3013 
3014 	/*
3015 	 * Check if there's just one working link left on the bridge.  If so,
3016 	 * then that link is now authoritative for bridge MTU.
3017 	 */
3018 	blsave = NULL;
3019 	for (blp = list_head(&bip->bi_links); blp != NULL;
3020 	    blp = list_next(&bip->bi_links, blp)) {
3021 		if (!(blp->bl_flags & BLF_DELETED)) {
3022 			if (blsave == NULL)
3023 				blsave = blp;
3024 			else
3025 				break;
3026 		}
3027 	}
3028 	mlist = NULL;
3029 	bmp = bip->bi_mac;
3030 	if (blsave != NULL && blp == NULL &&
3031 	    blsave->bl_maxsdu != bmp->bm_maxsdu) {
3032 		bmp->bm_maxsdu = blsave->bl_maxsdu;
3033 		(void) mac_maxsdu_update(bmp->bm_mh, blsave->bl_maxsdu);
3034 		link_sdu_fail(blsave, B_FALSE, &mlist);
3035 	}
3036 	rw_exit(&bip->bi_rwlock);
3037 	send_up_messages(bip, mlist);
3038 
3039 	if (found)
3040 		miocack(bsp->bs_wq, mp, 0, 0);
3041 	else
3042 		miocnak(bsp->bs_wq, mp, 0, ENOENT);
3043 	stream_unref(bsp);
3044 }
3045 
3046 /*
3047  * This function intentionally returns with bi_rwlock held; it is intended for
3048  * quick checks and updates.
3049  */
3050 static bridge_link_t *
3051 enter_link(bridge_inst_t *bip, datalink_id_t linkid)
3052 {
3053 	bridge_link_t *blp;
3054 
3055 	rw_enter(&bip->bi_rwlock, RW_READER);
3056 	for (blp = list_head(&bip->bi_links); blp != NULL;
3057 	    blp = list_next(&bip->bi_links, blp)) {
3058 		if (blp->bl_linkid == linkid && !(blp->bl_flags & BLF_DELETED))
3059 			break;
3060 	}
3061 	return (blp);
3062 }
3063 
3064 static void
3065 bridge_ioctl(queue_t *wq, mblk_t *mp)
3066 {
3067 	bridge_stream_t *bsp = wq->q_ptr;
3068 	bridge_inst_t *bip;
3069 	struct iocblk *iop;
3070 	int rc = EINVAL;
3071 	int len = 0;
3072 	bridge_link_t *blp;
3073 	cred_t *cr;
3074 
3075 	/* LINTED: alignment */
3076 	iop = (struct iocblk *)mp->b_rptr;
3077 
3078 	/*
3079 	 * For now, all of the bridge ioctls are privileged.
3080 	 */
3081 	if ((cr = msg_getcred(mp, NULL)) == NULL)
3082 		cr = iop->ioc_cr;
3083 	if (cr != NULL && secpolicy_net_config(cr, B_FALSE) != 0) {
3084 		miocnak(wq, mp, 0, EPERM);
3085 		return;
3086 	}
3087 
3088 	switch (iop->ioc_cmd) {
3089 	case BRIOC_NEWBRIDGE: {
3090 		bridge_newbridge_t *bnb;
3091 
3092 		if (bsp->bs_inst != NULL ||
3093 		    (rc = miocpullup(mp, sizeof (bridge_newbridge_t))) != 0)
3094 			break;
3095 		/* LINTED: alignment */
3096 		bnb = (bridge_newbridge_t *)mp->b_cont->b_rptr;
3097 		bnb->bnb_name[MAXNAMELEN-1] = '\0';
3098 		rc = bridge_create(bnb->bnb_linkid, bnb->bnb_name, &bip, cr);
3099 		if (rc != 0)
3100 			break;
3101 
3102 		rw_enter(&bip->bi_rwlock, RW_WRITER);
3103 		if (bip->bi_control != NULL) {
3104 			rw_exit(&bip->bi_rwlock);
3105 			bridge_unref(bip);
3106 			rc = EBUSY;
3107 		} else {
3108 			atomic_inc_uint(&bip->bi_refs);
3109 			bsp->bs_inst = bip;	/* stream holds reference */
3110 			bip->bi_control = bsp;
3111 			rw_exit(&bip->bi_rwlock);
3112 			rc = 0;
3113 		}
3114 		break;
3115 	}
3116 
3117 	case BRIOC_ADDLINK:
3118 		if ((bip = bsp->bs_inst) == NULL ||
3119 		    (rc = miocpullup(mp, sizeof (datalink_id_t))) != 0)
3120 			break;
3121 		/*
3122 		 * We cannot perform the action in this thread, because we're
3123 		 * not in process context, and we may already be holding
3124 		 * MAC-related locks.  Place the request on taskq.
3125 		 */
3126 		mp->b_next = (mblk_t *)bsp;
3127 		stream_ref(bsp);
3128 		(void) ddi_taskq_dispatch(bridge_taskq, bridge_add_link, mp,
3129 		    DDI_SLEEP);
3130 		return;
3131 
3132 	case BRIOC_REMLINK:
3133 		if ((bip = bsp->bs_inst) == NULL ||
3134 		    (rc = miocpullup(mp, sizeof (datalink_id_t))) != 0)
3135 			break;
3136 		/*
3137 		 * We cannot perform the action in this thread, because we're
3138 		 * not in process context, and we may already be holding
3139 		 * MAC-related locks.  Place the request on taskq.
3140 		 */
3141 		mp->b_next = (mblk_t *)bsp;
3142 		stream_ref(bsp);
3143 		(void) ddi_taskq_dispatch(bridge_taskq, bridge_rem_link, mp,
3144 		    DDI_SLEEP);
3145 		return;
3146 
3147 	case BRIOC_SETSTATE: {
3148 		bridge_setstate_t *bss;
3149 
3150 		if ((bip = bsp->bs_inst) == NULL ||
3151 		    (rc = miocpullup(mp, sizeof (*bss))) != 0)
3152 			break;
3153 		/* LINTED: alignment */
3154 		bss = (bridge_setstate_t *)mp->b_cont->b_rptr;
3155 		if ((blp = enter_link(bip, bss->bss_linkid)) == NULL) {
3156 			rc = ENOENT;
3157 		} else {
3158 			rc = 0;
3159 			blp->bl_state = bss->bss_state;
3160 		}
3161 		rw_exit(&bip->bi_rwlock);
3162 		break;
3163 	}
3164 
3165 	case BRIOC_SETPVID: {
3166 		bridge_setpvid_t *bsv;
3167 
3168 		if ((bip = bsp->bs_inst) == NULL ||
3169 		    (rc = miocpullup(mp, sizeof (*bsv))) != 0)
3170 			break;
3171 		/* LINTED: alignment */
3172 		bsv = (bridge_setpvid_t *)mp->b_cont->b_rptr;
3173 		if (bsv->bsv_vlan > VLAN_ID_MAX)
3174 			break;
3175 		if ((blp = enter_link(bip, bsv->bsv_linkid)) == NULL) {
3176 			rc = ENOENT;
3177 		} else if (blp->bl_pvid == bsv->bsv_vlan) {
3178 			rc = 0;
3179 		} else {
3180 			rc = 0;
3181 			BRIDGE_VLAN_CLR(blp, blp->bl_pvid);
3182 			blp->bl_pvid = bsv->bsv_vlan;
3183 			if (blp->bl_pvid != 0)
3184 				BRIDGE_VLAN_SET(blp, blp->bl_pvid);
3185 		}
3186 		rw_exit(&bip->bi_rwlock);
3187 		break;
3188 	}
3189 
3190 	case BRIOC_VLANENAB: {
3191 		bridge_vlanenab_t *bve;
3192 
3193 		if ((bip = bsp->bs_inst) == NULL ||
3194 		    (rc = miocpullup(mp, sizeof (*bve))) != 0)
3195 			break;
3196 		/* LINTED: alignment */
3197 		bve = (bridge_vlanenab_t *)mp->b_cont->b_rptr;
3198 		if (bve->bve_vlan > VLAN_ID_MAX)
3199 			break;
3200 		if ((blp = enter_link(bip, bve->bve_linkid)) == NULL) {
3201 			rc = ENOENT;
3202 		} else {
3203 			rc = 0;
3204 			/* special case: vlan 0 means "all" */
3205 			if (bve->bve_vlan == 0) {
3206 				(void) memset(blp->bl_vlans,
3207 				    bve->bve_onoff ? ~0 : 0,
3208 				    sizeof (blp->bl_vlans));
3209 				BRIDGE_VLAN_CLR(blp, 0);
3210 				if (blp->bl_pvid != 0)
3211 					BRIDGE_VLAN_SET(blp, blp->bl_pvid);
3212 			} else if (bve->bve_vlan == blp->bl_pvid) {
3213 				rc = EINVAL;
3214 			} else if (bve->bve_onoff) {
3215 				BRIDGE_VLAN_SET(blp, bve->bve_vlan);
3216 			} else {
3217 				BRIDGE_VLAN_CLR(blp, bve->bve_vlan);
3218 			}
3219 		}
3220 		rw_exit(&bip->bi_rwlock);
3221 		break;
3222 	}
3223 
3224 	case BRIOC_FLUSHFWD: {
3225 		bridge_flushfwd_t *bff;
3226 		bridge_fwd_t *bfp, *bfnext;
3227 		avl_tree_t fwd_scavenge;
3228 		int i;
3229 
3230 		if ((bip = bsp->bs_inst) == NULL ||
3231 		    (rc = miocpullup(mp, sizeof (*bff))) != 0)
3232 			break;
3233 		/* LINTED: alignment */
3234 		bff = (bridge_flushfwd_t *)mp->b_cont->b_rptr;
3235 		rw_enter(&bip->bi_rwlock, RW_WRITER);
3236 		/* This case means "all" */
3237 		if (bff->bff_linkid == DATALINK_INVALID_LINKID) {
3238 			blp = NULL;
3239 		} else {
3240 			for (blp = list_head(&bip->bi_links); blp != NULL;
3241 			    blp = list_next(&bip->bi_links, blp)) {
3242 				if (blp->bl_linkid == bff->bff_linkid &&
3243 				    !(blp->bl_flags & BLF_DELETED))
3244 					break;
3245 			}
3246 			if (blp == NULL) {
3247 				rc = ENOENT;
3248 				rw_exit(&bip->bi_rwlock);
3249 				break;
3250 			}
3251 		}
3252 		avl_create(&fwd_scavenge, fwd_compare, sizeof (bridge_fwd_t),
3253 		    offsetof(bridge_fwd_t, bf_node));
3254 		bfnext = avl_first(&bip->bi_fwd);
3255 		while ((bfp = bfnext) != NULL) {
3256 			bfnext = AVL_NEXT(&bip->bi_fwd, bfp);
3257 			if (bfp->bf_flags & BFF_LOCALADDR)
3258 				continue;
3259 			if (blp != NULL) {
3260 				for (i = 0; i < bfp->bf_maxlinks; i++) {
3261 					if (bfp->bf_links[i] == blp)
3262 						break;
3263 				}
3264 				/*
3265 				 * If the link is there and we're excluding,
3266 				 * then skip.  If the link is not there and
3267 				 * we're doing only that link, then skip.
3268 				 */
3269 				if ((i < bfp->bf_maxlinks) == bff->bff_exclude)
3270 					continue;
3271 			}
3272 			ASSERT(bfp->bf_flags & BFF_INTREE);
3273 			avl_remove(&bip->bi_fwd, bfp);
3274 			bfp->bf_flags &= ~BFF_INTREE;
3275 			avl_add(&fwd_scavenge, bfp);
3276 		}
3277 		rw_exit(&bip->bi_rwlock);
3278 		bfnext = avl_first(&fwd_scavenge);
3279 		while ((bfp = bfnext) != NULL) {
3280 			bfnext = AVL_NEXT(&fwd_scavenge, bfp);
3281 			avl_remove(&fwd_scavenge, bfp);
3282 			fwd_unref(bfp);	/* drop tree reference */
3283 		}
3284 		avl_destroy(&fwd_scavenge);
3285 		break;
3286 	}
3287 
3288 	case BRIOC_TABLEMAX:
3289 		if ((bip = bsp->bs_inst) == NULL ||
3290 		    (rc = miocpullup(mp, sizeof (uint32_t))) != 0)
3291 			break;
3292 		/* LINTED: alignment */
3293 		bip->bi_tablemax = *(uint32_t *)mp->b_cont->b_rptr;
3294 		break;
3295 	}
3296 
3297 	if (rc == 0)
3298 		miocack(wq, mp, len, 0);
3299 	else
3300 		miocnak(wq, mp, 0, rc);
3301 }
3302 
3303 static void
3304 bridge_wput(queue_t *wq, mblk_t *mp)
3305 {
3306 	switch (DB_TYPE(mp)) {
3307 	case M_IOCTL:
3308 		bridge_ioctl(wq, mp);
3309 		break;
3310 	case M_FLUSH:
3311 		if (*mp->b_rptr & FLUSHW)
3312 			*mp->b_rptr &= ~FLUSHW;
3313 		if (*mp->b_rptr & FLUSHR)
3314 			qreply(wq, mp);
3315 		else
3316 			freemsg(mp);
3317 		break;
3318 	default:
3319 		freemsg(mp);
3320 		break;
3321 	}
3322 }
3323 
3324 /*
3325  * This function allocates the main data structures for the bridge driver and
3326  * connects us into devfs.
3327  */
3328 static void
3329 bridge_inst_init(void)
3330 {
3331 	bridge_scan_interval = 5 * drv_usectohz(1000000);
3332 	bridge_fwd_age = 25 * drv_usectohz(1000000);
3333 
3334 	rw_init(&bmac_rwlock, NULL, RW_DRIVER, NULL);
3335 	list_create(&bmac_list, sizeof (bridge_mac_t),
3336 	    offsetof(bridge_mac_t, bm_node));
3337 	list_create(&inst_list, sizeof (bridge_inst_t),
3338 	    offsetof(bridge_inst_t, bi_node));
3339 	cv_init(&inst_cv, NULL, CV_DRIVER, NULL);
3340 	mutex_init(&inst_lock, NULL, MUTEX_DRIVER, NULL);
3341 	cv_init(&stream_ref_cv, NULL, CV_DRIVER, NULL);
3342 	mutex_init(&stream_ref_lock, NULL, MUTEX_DRIVER, NULL);
3343 
3344 	mac_bridge_vectors(bridge_xmit_cb, bridge_recv_cb, bridge_ref_cb,
3345 	    bridge_ls_cb);
3346 }
3347 
3348 /*
3349  * This function disconnects from devfs and destroys all data structures in
3350  * preparation for unload.  It's assumed that there are no active bridge
3351  * references left at this point.
3352  */
3353 static void
3354 bridge_inst_fini(void)
3355 {
3356 	mac_bridge_vectors(NULL, NULL, NULL, NULL);
3357 	if (bridge_timerid != 0)
3358 		(void) untimeout(bridge_timerid);
3359 	rw_destroy(&bmac_rwlock);
3360 	list_destroy(&bmac_list);
3361 	list_destroy(&inst_list);
3362 	cv_destroy(&inst_cv);
3363 	mutex_destroy(&inst_lock);
3364 	cv_destroy(&stream_ref_cv);
3365 	mutex_destroy(&stream_ref_lock);
3366 }
3367 
3368 /*
3369  * bridge_attach()
3370  *
3371  * Description:
3372  *    Attach bridge driver to the system.
3373  */
3374 static int
3375 bridge_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
3376 {
3377 	if (cmd != DDI_ATTACH)
3378 		return (DDI_FAILURE);
3379 
3380 	if (ddi_create_minor_node(dip, BRIDGE_CTL, S_IFCHR, 0, DDI_PSEUDO,
3381 	    CLONE_DEV) == DDI_FAILURE) {
3382 		return (DDI_FAILURE);
3383 	}
3384 
3385 	if (dld_ioc_register(BRIDGE_IOC, bridge_ioc_list,
3386 	    DLDIOCCNT(bridge_ioc_list)) != 0) {
3387 		ddi_remove_minor_node(dip, BRIDGE_CTL);
3388 		return (DDI_FAILURE);
3389 	}
3390 
3391 	bridge_dev_info = dip;
3392 	bridge_major = ddi_driver_major(dip);
3393 	bridge_taskq = ddi_taskq_create(dip, BRIDGE_DEV_NAME, 1,
3394 	    TASKQ_DEFAULTPRI, 0);
3395 	return (DDI_SUCCESS);
3396 }
3397 
3398 /*
3399  * bridge_detach()
3400  *
3401  * Description:
3402  *    Detach an interface to the system.
3403  */
3404 static int
3405 bridge_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
3406 {
3407 	if (cmd != DDI_DETACH)
3408 		return (DDI_FAILURE);
3409 
3410 	ddi_remove_minor_node(dip, NULL);
3411 	ddi_taskq_destroy(bridge_taskq);
3412 	bridge_dev_info = NULL;
3413 	return (DDI_SUCCESS);
3414 }
3415 
3416 /*
3417  * bridge_info()
3418  *
3419  * Description:
3420  *    Translate "dev_t" to a pointer to the associated "dev_info_t".
3421  */
3422 /* ARGSUSED */
3423 static int
3424 bridge_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg,
3425 	void **result)
3426 {
3427 	int	rc;
3428 
3429 	switch (infocmd) {
3430 	case DDI_INFO_DEVT2DEVINFO:
3431 		if (bridge_dev_info == NULL) {
3432 			rc = DDI_FAILURE;
3433 		} else {
3434 			*result = (void *)bridge_dev_info;
3435 			rc = DDI_SUCCESS;
3436 		}
3437 		break;
3438 	case DDI_INFO_DEVT2INSTANCE:
3439 		*result = NULL;
3440 		rc = DDI_SUCCESS;
3441 		break;
3442 	default:
3443 		rc = DDI_FAILURE;
3444 		break;
3445 	}
3446 	return (rc);
3447 }
3448 
/*
 * STREAMS module information for the bridge driver.  The same structure is
 * referenced by both the read-side and the write-side qinit below.
 */
static struct module_info bridge_modinfo = {
	2105,			/* mi_idnum */
	BRIDGE_DEV_NAME,	/* mi_idname */
	0,			/* mi_minpsz */
	16384,			/* mi_maxpsz */
	65536,			/* mi_hiwat */
	128			/* mi_lowat */
};
3457 
/*
 * Read-side queue initialization.  The read side has neither a put nor a
 * service procedure; it carries the open and close entry points for the
 * stream.
 */
static struct qinit bridge_rinit = {
	NULL,			/* qi_putp */
	NULL,			/* qi_srvp */
	bridge_open,		/* qi_qopen */
	bridge_close,		/* qi_qclose */
	NULL,			/* qi_qadmin */
	&bridge_modinfo,	/* qi_minfo */
	NULL			/* qi_mstat */
};
3467 
/*
 * Write-side queue initialization.  Only a put procedure is supplied; there
 * is no service procedure.  bridge_wput() is declared void, hence the cast
 * to the function pointer type used for the qi_putp slot.
 */
static struct qinit bridge_winit = {
	(int (*)())bridge_wput, /* qi_putp */
	NULL,			/* qi_srvp */
	NULL,			/* qi_qopen */
	NULL,			/* qi_qclose */
	NULL,			/* qi_qadmin */
	&bridge_modinfo,	/* qi_minfo */
	NULL			/* qi_mstat */
};
3477 
/* Stream table tying together the read-side and write-side qinit entries. */
static struct streamtab bridge_tab = {
	&bridge_rinit,	/* st_rdinit */
	&bridge_winit	/* st_wrinit */
};
3482 
/*
 * Device operations for the driver, defined as bridge_ops.  This wires in
 * bridge_attach(), bridge_detach(), bridge_info(), and the stream table
 * bridge_tab.
 *
 * No STREAMS perimeters; we do all our own locking
 */
DDI_DEFINE_STREAM_OPS(bridge_ops, nulldev, nulldev, bridge_attach,
    bridge_detach, nodev, bridge_info, D_NEW | D_MP, &bridge_tab,
    ddi_quiesce_not_supported);
3487 
/*
 * Loadable-module description of this driver: the driver module operations,
 * a descriptive name string, and the device operations defined above as
 * bridge_ops.
 */
static struct modldrv modldrv = {
	&mod_driverops,
	"bridging driver",
	&bridge_ops
};
3493 
/*
 * Module linkage passed to mod_install(), mod_remove(), and mod_info() by
 * _init(), _fini(), and _info() respectively.  It holds the single modldrv
 * entry above, followed by a NULL terminator.
 */
static struct modlinkage modlinkage = {
	MODREV_1,
	(void *)&modldrv,
	NULL
};
3499 
3500 int
3501 _init(void)
3502 {
3503 	int retv;
3504 
3505 	mac_init_ops(NULL, BRIDGE_DEV_NAME);
3506 	bridge_inst_init();
3507 	if ((retv = mod_install(&modlinkage)) != 0)
3508 		bridge_inst_fini();
3509 	return (retv);
3510 }
3511 
3512 int
3513 _fini(void)
3514 {
3515 	int retv;
3516 
3517 	rw_enter(&bmac_rwlock, RW_READER);
3518 	retv = list_is_empty(&bmac_list) ? 0 : EBUSY;
3519 	rw_exit(&bmac_rwlock);
3520 	if (retv == 0 &&
3521 	    (retv = mod_remove(&modlinkage)) == 0)
3522 		bridge_inst_fini();
3523 	return (retv);
3524 }
3525 
3526 int
3527 _info(struct modinfo *modinfop)
3528 {
3529 	return (mod_info(&modlinkage, modinfop));
3530 }
3531