1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 * Copyright (c) 2016 by Delphix. All rights reserved.
26 * Copyright 2019 Joyent, Inc.
27 */
28
29 /*
30 * This module implements a STREAMS driver that provides layer-two (Ethernet)
31 * bridging functionality. The STREAMS interface is used to provide
32 * observability (snoop/wireshark) and control, but not for interface plumbing.
33 */
34
35 #include <sys/types.h>
36 #include <sys/bitmap.h>
37 #include <sys/cmn_err.h>
38 #include <sys/conf.h>
39 #include <sys/ddi.h>
40 #include <sys/errno.h>
41 #include <sys/kstat.h>
42 #include <sys/modctl.h>
43 #include <sys/note.h>
44 #include <sys/param.h>
45 #include <sys/pattr.h>
46 #include <sys/policy.h>
47 #include <sys/sdt.h>
48 #include <sys/stat.h>
49 #include <sys/stream.h>
50 #include <sys/stropts.h>
51 #include <sys/strsun.h>
52 #include <sys/sunddi.h>
53 #include <sys/sysmacros.h>
54 #include <sys/systm.h>
55 #include <sys/time.h>
56 #include <sys/dlpi.h>
57 #include <sys/dls.h>
58 #include <sys/mac_ether.h>
59 #include <sys/mac_provider.h>
60 #include <sys/mac_client_priv.h>
61 #include <sys/mac_impl.h>
62 #include <sys/vlan.h>
63 #include <net/bridge.h>
64 #include <net/bridge_impl.h>
65 #include <net/trill.h>
66 #include <sys/dld_ioc.h>
67
68 /*
69 * Locks and reference counts: object lifetime and design.
70 *
71 * bridge_mac_t
72 * Bridge mac (snoop) instances are in bmac_list, which is protected by
73 * bmac_rwlock. They're allocated by bmac_alloc() and freed by bridge_timer().
74 * Every bridge_inst_t has a single bridge_mac_t, but when bridge_inst_t goes
75 * away, the bridge_mac_t remains until either all of the users go away
76 * (detected by a timer) or until the instance is picked up again by the same
77 * bridge starting back up.
78 *
79 * bridge_inst_t
80 * Bridge instances are in inst_list, which is protected by inst_lock.
81 * They're allocated by inst_alloc() and freed by inst_free(). After
82 * allocation, an instance is placed in inst_list, and the reference count is
83 * incremented to represent this. That reference is decremented when the
84 * BIF_SHUTDOWN flag is set, and no new increments may occur. When the last
85 * reference is freed, the instance is removed from the list.
86 *
87 * Bridge instances have lists of links and an AVL tree of forwarding
88 * entries. Each of these structures holds one reference on the bridge
89 * instance. These lists and tree are protected by bi_rwlock.
90 *
91 * bridge_stream_t
92 * Bridge streams are allocated by stream_alloc() and freed by stream_free().
93 * These streams are created when "bridged" opens /dev/bridgectl, and are
94 * used to create new bridge instances (via BRIOC_NEWBRIDGE) and control the
95 * links on the bridge. When a stream closes, the bridge instance created is
96 * destroyed. There's at most one bridge instance for a given control
97 * stream.
98 *
99 * bridge_link_t
100 * Links are allocated by bridge_add_link() and freed by link_free(). The
101 * bi_links list holds a reference to the link. When the BLF_DELETED flag is
102 * set, that reference is dropped. The link isn't removed from the list
103 * until the last reference drops. Each forwarding entry that uses a given
104 * link holds a reference, as does each thread transmitting a packet via the
105 * link. The MAC layer calls in via bridge_ref_cb() to hold a reference on
106 * a link when transmitting.
107 *
108 * It's important that once BLF_DELETED is set, there's no way for the
109 * reference count to increase again. If it can, then the link may be
110 * double-freed. The BLF_FREED flag is intended for use with assertions to
111 * guard against this in testing.
112 *
113 * bridge_fwd_t
114 * Bridge forwarding entries are allocated by bridge_recv_cb() and freed by
115 * fwd_free(). The bi_fwd AVL tree holds one reference to the entry. Unlike
116 * other data structures, the reference is dropped when the entry is removed
117 * from the tree by fwd_delete(), and the BFF_INTREE flag is removed. Each
118 * thread that's forwarding a packet to a known destination holds a reference
119 * to a forwarding entry.
120 *
121 * TRILL notes:
122 *
123 * The TRILL module does all of its I/O through bridging. It uses references
124 * on the bridge_inst_t and bridge_link_t structures, and has seven entry
125 * points and four callbacks. One entry point is for setting the callbacks
126 * (bridge_trill_register_cb). There are four entry points for taking bridge
127 * and link references (bridge_trill_{br,ln}{ref,unref}). The final two
128 * entry points are for decapsulated packets from TRILL (bridge_trill_decaps)
129 * that need to be bridged locally, and for TRILL-encapsulated output packets
130 * (bridge_trill_output).
131 *
132 * The four callbacks comprise two notification functions for bridges and
133 * links being deleted, one function for raw received TRILL packets, and one
134 * for bridge output to non-local TRILL destinations (tunnel entry).
135 */
136
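/*
 * Illustrative sketch (not part of the driver logic): callers that look up
 * an instance by name receive it with a reference held and must drop that
 * reference when done, e.g.
 *
 *	bridge_inst_t *bip;
 *
 *	if ((bip = bridge_find_name("mybridge0")) != NULL) {
 *		... use bip; the held reference keeps it from being freed ...
 *		bridge_unref(bip);
 *	}
 *
 * The name "mybridge0" above is only an example.
 */
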
137 /*
138 * Ethernet reserved multicast addresses for TRILL; used also in TRILL module.
139 */
140 const uint8_t all_isis_rbridges[] = ALL_ISIS_RBRIDGES;
141 static const uint8_t all_esadi_rbridges[] = ALL_ESADI_RBRIDGES;
142 const uint8_t bridge_group_address[] = BRIDGE_GROUP_ADDRESS;
143
144 static const char *inst_kstats_list[] = { KSINST_NAMES };
145 static const char *link_kstats_list[] = { KSLINK_NAMES };
146
147 #define KREF(p, m, vn) p->m.vn.value.ui64
148 #define KINCR(p, m, vn) ++KREF(p, m, vn)
149 #define KDECR(p, m, vn) --KREF(p, m, vn)
150
151 #define KIPINCR(p, vn) KINCR(p, bi_kstats, vn)
152 #define KIPDECR(p, vn) KDECR(p, bi_kstats, vn)
153 #define KLPINCR(p, vn) KINCR(p, bl_kstats, vn)
154
155 #define KIINCR(vn) KIPINCR(bip, vn)
156 #define KIDECR(vn) KIPDECR(bip, vn)
157 #define KLINCR(vn) KLPINCR(blp, vn)
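/*
 * For example (assuming a local bridge_inst_t pointer named bip, as in the
 * functions below), KIINCR(bki_count) expands to
 * ++bip->bi_kstats.bki_count.value.ui64.
 */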
158
159 #define Dim(x) (sizeof (x) / sizeof (*(x)))
160
161 /* Amount of overhead added when encapsulating with VLAN headers */
162 #define VLAN_INCR (sizeof (struct ether_vlan_header) - \
163 sizeof (struct ether_header))
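/* (This works out to the 4-byte 802.1Q tag: the TPID and TCI fields.) */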
164
165 static dev_info_t *bridge_dev_info;
166 static major_t bridge_major;
167 static ddi_taskq_t *bridge_taskq;
168
169 /*
170 * These are the bridge instance management data structures. The mutex lock
171 * protects the list of bridge instances. A reference count is then used on
172 * each instance to determine when to free it. We use mac_minor_hold() to
173 * allocate minor_t values, which are used both for self-cloning /dev/net/
174 * device nodes as well as client streams. Minor node 0 is reserved for the
175 * allocation control node.
176 */
177 static list_t inst_list;
178 static kcondvar_t inst_cv; /* Allows us to wait for shutdown */
179 static kmutex_t inst_lock;
180
181 static krwlock_t bmac_rwlock;
182 static list_t bmac_list;
183
184 /* Wait for taskq entries that use STREAMS */
185 static kcondvar_t stream_ref_cv;
186 static kmutex_t stream_ref_lock;
187
188 static timeout_id_t bridge_timerid;
189 static clock_t bridge_scan_interval;
190 static clock_t bridge_fwd_age;
191
192 static bridge_inst_t *bridge_find_name(const char *);
193 static void bridge_timer(void *);
194 static void bridge_unref(bridge_inst_t *);
195
196 static const uint8_t zero_addr[ETHERADDRL] = { 0 };
197
198 /* Global TRILL linkage */
199 static trill_recv_pkt_t trill_recv_fn;
200 static trill_encap_pkt_t trill_encap_fn;
201 static trill_br_dstr_t trill_brdstr_fn;
202 static trill_ln_dstr_t trill_lndstr_fn;
203
204 /* special settings to accommodate DLD flow control; see dld_str.c */
205 static struct module_info bridge_dld_modinfo = {
206 0, /* mi_idnum */
207 BRIDGE_DEV_NAME, /* mi_idname */
208 0, /* mi_minpsz */
209 INFPSZ, /* mi_maxpsz */
210 1, /* mi_hiwat */
211 0 /* mi_lowat */
212 };
213
214 static struct qinit bridge_dld_rinit = {
215 NULL, /* qi_putp */
216 NULL, /* qi_srvp */
217 dld_open, /* qi_qopen */
218 dld_close, /* qi_qclose */
219 NULL, /* qi_qadmin */
220 &bridge_dld_modinfo, /* qi_minfo */
221 NULL /* qi_mstat */
222 };
223
224 static struct qinit bridge_dld_winit = {
225 dld_wput, /* qi_putp */
226 dld_wsrv, /* qi_srvp */
227 NULL, /* qi_qopen */
228 NULL, /* qi_qclose */
229 NULL, /* qi_qadmin */
230 &bridge_dld_modinfo, /* qi_minfo */
231 NULL /* qi_mstat */
232 };
233
234 static int bridge_ioc_listfwd(void *, intptr_t, int, cred_t *, int *);
235
236 /* GLDv3 control ioctls used by Bridging */
237 static dld_ioc_info_t bridge_ioc_list[] = {
238 {BRIDGE_IOC_LISTFWD, DLDCOPYINOUT, sizeof (bridge_listfwd_t),
239 bridge_ioc_listfwd, NULL},
240 };
241
242 /*
243 * Given a bridge mac pointer, get a ref-held pointer to the corresponding
244 * bridge instance, if any. We must hold the global bmac_rwlock so that
245 * bm_inst doesn't slide out from under us.
246 */
247 static bridge_inst_t *
248 mac_to_inst(const bridge_mac_t *bmp)
249 {
250 bridge_inst_t *bip;
251
252 rw_enter(&bmac_rwlock, RW_READER);
253 if ((bip = bmp->bm_inst) != NULL)
254 atomic_inc_uint(&bip->bi_refs);
255 rw_exit(&bmac_rwlock);
256 return (bip);
257 }
258
259 static void
260 link_sdu_fail(bridge_link_t *blp, boolean_t failed, mblk_t **mlist)
261 {
262 mblk_t *mp;
263 bridge_ctl_t *bcp;
264 bridge_link_t *blcmp;
265 bridge_inst_t *bip;
266 bridge_mac_t *bmp;
267
268 if (failed) {
269 if (blp->bl_flags & BLF_SDUFAIL)
270 return;
271 blp->bl_flags |= BLF_SDUFAIL;
272 } else {
273 if (!(blp->bl_flags & BLF_SDUFAIL))
274 return;
275 blp->bl_flags &= ~BLF_SDUFAIL;
276 }
277
278 /*
279 * If this link is otherwise up, then check if there are any other
280 * non-failed non-down links. If not, then we control the state of the
281 * whole bridge.
282 */
283 bip = blp->bl_inst;
284 bmp = bip->bi_mac;
285 if (blp->bl_linkstate != LINK_STATE_DOWN) {
286 for (blcmp = list_head(&bip->bi_links); blcmp != NULL;
287 blcmp = list_next(&bip->bi_links, blcmp)) {
288 if (blp != blcmp &&
289 !(blcmp->bl_flags & (BLF_DELETED|BLF_SDUFAIL)) &&
290 blcmp->bl_linkstate != LINK_STATE_DOWN)
291 break;
292 }
293 if (blcmp == NULL) {
294 bmp->bm_linkstate = failed ? LINK_STATE_DOWN :
295 LINK_STATE_UP;
296 mac_link_redo(bmp->bm_mh, bmp->bm_linkstate);
297 }
298 }
299
300 /*
301 * If we're becoming failed, then the link's current true state needs
302 * to be reflected upwards to this link's clients. If we're becoming
303 * unfailed, then we report the state of the bridge to those clients
304 * instead.
305 */
306 if (failed) {
307 if (bmp->bm_linkstate != blp->bl_linkstate)
308 mac_link_redo(blp->bl_mh, blp->bl_linkstate);
309 } else {
310 mac_link_redo(blp->bl_mh, bmp->bm_linkstate);
311 }
312
313 /* get the current mblk we're going to send up */
314 if ((mp = blp->bl_lfailmp) == NULL &&
315 (mp = allocb(sizeof (bridge_ctl_t), BPRI_MED)) == NULL)
316 return;
317
318 /* get a new one for next time */
319 blp->bl_lfailmp = allocb(sizeof (bridge_ctl_t), BPRI_MED);
320
321 /* if none for next time, then report only failures */
322 if (blp->bl_lfailmp == NULL && !failed) {
323 blp->bl_lfailmp = mp;
324 return;
325 }
326
327 /* LINTED: alignment */
328 bcp = (bridge_ctl_t *)mp->b_rptr;
329 bcp->bc_linkid = blp->bl_linkid;
330 bcp->bc_failed = failed;
331 mp->b_wptr = (uchar_t *)(bcp + 1);
332 mp->b_next = *mlist;
333 *mlist = mp;
334 }
335
336 /*
337 * Send control messages (link SDU changes) using the stream to the
338 * bridge instance daemon.
339 */
340 static void
341 send_up_messages(bridge_inst_t *bip, mblk_t *mp)
342 {
343 mblk_t *mnext;
344 queue_t *rq;
345
346 rq = bip->bi_control->bs_wq;
347 rq = OTHERQ(rq);
348 while (mp != NULL) {
349 mnext = mp->b_next;
350 mp->b_next = NULL;
351 putnext(rq, mp);
352 mp = mnext;
353 }
354 }
355
356 /* ARGSUSED */
357 static int
358 bridge_m_getstat(void *arg, uint_t stat, uint64_t *val)
359 {
360 return (ENOTSUP);
361 }
362
363 static int
364 bridge_m_start(void *arg)
365 {
366 bridge_mac_t *bmp = arg;
367
368 bmp->bm_flags |= BMF_STARTED;
369 return (0);
370 }
371
372 static void
373 bridge_m_stop(void *arg)
374 {
375 bridge_mac_t *bmp = arg;
376
377 bmp->bm_flags &= ~BMF_STARTED;
378 }
379
380 /* ARGSUSED */
381 static int
382 bridge_m_setpromisc(void *arg, boolean_t on)
383 {
384 return (0);
385 }
386
387 /* ARGSUSED */
388 static int
389 bridge_m_multicst(void *arg, boolean_t add, const uint8_t *mca)
390 {
391 return (0);
392 }
393
394 /* ARGSUSED */
395 static int
396 bridge_m_unicst(void *arg, const uint8_t *macaddr)
397 {
398 return (ENOTSUP);
399 }
400
401 static mblk_t *
402 bridge_m_tx(void *arg, mblk_t *mp)
403 {
404 _NOTE(ARGUNUSED(arg));
405 freemsgchain(mp);
406 return (NULL);
407 }
408
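/*
 * Iteration interface for dumping the forwarding table: the caller passes in
 * the last destination address it saw (typically all-zero to start), and we
 * return the entry that follows it in the AVL tree. An all-zeroed result
 * marks the end of the walk.
 */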
409 /* ARGSUSED */
410 static int
411 bridge_ioc_listfwd(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
412 {
413 bridge_listfwd_t *blf = karg;
414 bridge_inst_t *bip;
415 bridge_fwd_t *bfp, match;
416 avl_index_t where;
417
418 bip = bridge_find_name(blf->blf_name);
419 if (bip == NULL)
420 return (ENOENT);
421
422 bcopy(blf->blf_dest, match.bf_dest, ETHERADDRL);
423 match.bf_flags |= BFF_VLANLOCAL;
424 rw_enter(&bip->bi_rwlock, RW_READER);
425 if ((bfp = avl_find(&bip->bi_fwd, &match, &where)) == NULL)
426 bfp = avl_nearest(&bip->bi_fwd, where, AVL_AFTER);
427 else
428 bfp = AVL_NEXT(&bip->bi_fwd, bfp);
429 if (bfp == NULL) {
430 bzero(blf, sizeof (*blf));
431 } else {
432 bcopy(bfp->bf_dest, blf->blf_dest, ETHERADDRL);
433 blf->blf_trill_nick = bfp->bf_trill_nick;
434 blf->blf_ms_age =
435 drv_hztousec(ddi_get_lbolt() - bfp->bf_lastheard) / 1000;
436 blf->blf_is_local =
437 (bfp->bf_flags & BFF_LOCALADDR) != 0;
438 blf->blf_linkid = bfp->bf_links[0]->bl_linkid;
439 }
440 rw_exit(&bip->bi_rwlock);
441 bridge_unref(bip);
442 return (0);
443 }
444
445 static int
446 bridge_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
447 uint_t pr_valsize, const void *pr_val)
448 {
449 bridge_mac_t *bmp = arg;
450 bridge_inst_t *bip;
451 bridge_link_t *blp;
452 int err;
453 uint_t maxsdu;
454 mblk_t *mlist;
455
456 _NOTE(ARGUNUSED(pr_name));
457 switch (pr_num) {
458 case MAC_PROP_MTU:
459 if (pr_valsize < sizeof (bmp->bm_maxsdu)) {
460 err = EINVAL;
461 break;
462 }
463 (void) bcopy(pr_val, &maxsdu, sizeof (maxsdu));
464 if (maxsdu == bmp->bm_maxsdu) {
465 err = 0;
466 } else if ((bip = mac_to_inst(bmp)) == NULL) {
467 err = ENXIO;
468 } else {
469 rw_enter(&bip->bi_rwlock, RW_WRITER);
470 mlist = NULL;
471 for (blp = list_head(&bip->bi_links); blp != NULL;
472 blp = list_next(&bip->bi_links, blp)) {
473 if (blp->bl_flags & BLF_DELETED)
474 continue;
475 if (blp->bl_maxsdu == maxsdu)
476 link_sdu_fail(blp, B_FALSE, &mlist);
477 else if (blp->bl_maxsdu == bmp->bm_maxsdu)
478 link_sdu_fail(blp, B_TRUE, &mlist);
479 }
480 rw_exit(&bip->bi_rwlock);
481 bmp->bm_maxsdu = maxsdu;
482 (void) mac_maxsdu_update(bmp->bm_mh, maxsdu);
483 send_up_messages(bip, mlist);
484 bridge_unref(bip);
485 err = 0;
486 }
487 break;
488
489 default:
490 err = ENOTSUP;
491 break;
492 }
493 return (err);
494 }
495
496 static int
497 bridge_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
498 uint_t pr_valsize, void *pr_val)
499 {
500 bridge_mac_t *bmp = arg;
501 int err = 0;
502
503 _NOTE(ARGUNUSED(pr_name));
504 switch (pr_num) {
505 case MAC_PROP_STATUS:
506 ASSERT(pr_valsize >= sizeof (bmp->bm_linkstate));
507 bcopy(&bmp->bm_linkstate, pr_val, sizeof (bmp->bm_linkstate));
508 break;
509
510 default:
511 err = ENOTSUP;
512 break;
513 }
514 return (err);
515 }
516
517 static void
518 bridge_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num,
519 mac_prop_info_handle_t prh)
520 {
521 bridge_mac_t *bmp = arg;
522
523 _NOTE(ARGUNUSED(pr_name));
524
525 switch (pr_num) {
526 case MAC_PROP_MTU:
527 mac_prop_info_set_range_uint32(prh, bmp->bm_maxsdu,
528 bmp->bm_maxsdu);
529 break;
530 case MAC_PROP_STATUS:
531 mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
532 break;
533 }
534 }
535
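/*
 * Most of these callbacks are trivial: the bridge MAC node exists only for
 * observability (snoop) and control, and never transmits in its own right,
 * so bridge_m_tx simply frees the chain and the stat/unicast entry points
 * return ENOTSUP.
 */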
536 static mac_callbacks_t bridge_m_callbacks = {
537 MC_SETPROP | MC_GETPROP | MC_PROPINFO,
538 bridge_m_getstat,
539 bridge_m_start,
540 bridge_m_stop,
541 bridge_m_setpromisc,
542 bridge_m_multicst,
543 bridge_m_unicst,
544 bridge_m_tx,
545 NULL, /* reserved */
546 NULL, /* ioctl */
547 NULL, /* getcapab */
548 NULL, /* open */
549 NULL, /* close */
550 bridge_m_setprop,
551 bridge_m_getprop,
552 bridge_m_propinfo
553 };
554
555 /*
556 * Create kstats from a list.
557 */
558 static kstat_t *
559 kstat_setup(kstat_named_t *knt, const char **names, int nstat,
560 const char *unitname)
561 {
562 kstat_t *ksp;
563 int i;
564
565 for (i = 0; i < nstat; i++)
566 kstat_named_init(&knt[i], names[i], KSTAT_DATA_UINT64);
567
568 ksp = kstat_create_zone(BRIDGE_DEV_NAME, 0, unitname, "net",
569 KSTAT_TYPE_NAMED, nstat, KSTAT_FLAG_VIRTUAL, GLOBAL_ZONEID);
570 if (ksp != NULL) {
571 ksp->ks_data = knt;
572 kstat_install(ksp);
573 }
574 return (ksp);
575 }
576
577 /*
578 * Find an existing bridge_mac_t structure or allocate a new one for the given
579 * bridge instance. This creates the mac driver instance that snoop can use.
580 */
581 static int
582 bmac_alloc(bridge_inst_t *bip, bridge_mac_t **bmacp)
583 {
584 bridge_mac_t *bmp, *bnew;
585 mac_register_t *mac;
586 int err;
587
588 *bmacp = NULL;
589 if ((mac = mac_alloc(MAC_VERSION)) == NULL)
590 return (EINVAL);
591
592 bnew = kmem_zalloc(sizeof (*bnew), KM_SLEEP);
593
594 rw_enter(&bmac_rwlock, RW_WRITER);
595 for (bmp = list_head(&bmac_list); bmp != NULL;
596 bmp = list_next(&bmac_list, bmp)) {
597 if (strcmp(bip->bi_name, bmp->bm_name) == 0) {
598 ASSERT(bmp->bm_inst == NULL);
599 bmp->bm_inst = bip;
600 rw_exit(&bmac_rwlock);
601 kmem_free(bnew, sizeof (*bnew));
602 mac_free(mac);
603 *bmacp = bmp;
604 return (0);
605 }
606 }
607
608 mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
609 mac->m_driver = bnew;
610 mac->m_dip = bridge_dev_info;
611 mac->m_instance = (uint_t)-1;
612 mac->m_src_addr = (uint8_t *)zero_addr;
613 mac->m_callbacks = &bridge_m_callbacks;
614
615 /*
616 * Note that the SDU limits are irrelevant, as nobody transmits on the
617 * bridge node itself. It's mainly for monitoring, but we allow
618 * setting the bridge MTU so that all links that are part of the
619 * bridge can be moved to a new MTU quickly.
620 */
621 mac->m_min_sdu = 1;
622 mac->m_max_sdu = 1500;
623 err = mac_register(mac, &bnew->bm_mh);
624 mac_free(mac);
625 if (err != 0) {
626 rw_exit(&bmac_rwlock);
627 kmem_free(bnew, sizeof (*bnew));
628 return (err);
629 }
630
631 bnew->bm_inst = bip;
632 (void) strcpy(bnew->bm_name, bip->bi_name);
633 if (list_is_empty(&bmac_list)) {
634 bridge_timerid = timeout(bridge_timer, NULL,
635 bridge_scan_interval);
636 }
637 list_insert_tail(&bmac_list, bnew);
638 rw_exit(&bmac_rwlock);
639
640 /*
641 * Mark the MAC as unable to go "active" so that only passive clients
642 * (such as snoop) can bind to it.
643 */
644 mac_no_active(bnew->bm_mh);
645 *bmacp = bnew;
646 return (0);
647 }
648
649 /*
650 * Disconnect the given bridge_mac_t from its bridge instance. The bridge
651 * instance is going away. The mac instance can't go away until the clients
652 * are gone (see bridge_timer).
653 */
654 static void
655 bmac_disconnect(bridge_mac_t *bmp)
656 {
657 bridge_inst_t *bip;
658
659 bmp->bm_linkstate = LINK_STATE_DOWN;
660 mac_link_redo(bmp->bm_mh, LINK_STATE_DOWN);
661
662 rw_enter(&bmac_rwlock, RW_READER);
663 bip = bmp->bm_inst;
664 bip->bi_mac = NULL;
665 bmp->bm_inst = NULL;
666 rw_exit(&bmac_rwlock);
667 }
668
669 /* This is used by the avl trees to sort forwarding table entries */
670 static int
671 fwd_compare(const void *addr1, const void *addr2)
672 {
673 const bridge_fwd_t *fwd1 = addr1;
674 const bridge_fwd_t *fwd2 = addr2;
675 int diff = memcmp(fwd1->bf_dest, fwd2->bf_dest, ETHERADDRL);
676
677 if (diff != 0)
678 return (diff > 0 ? 1 : -1);
679
680 if ((fwd1->bf_flags ^ fwd2->bf_flags) & BFF_VLANLOCAL) {
681 if (fwd1->bf_vlanid > fwd2->bf_vlanid)
682 return (1);
683 else if (fwd1->bf_vlanid < fwd2->bf_vlanid)
684 return (-1);
685 }
686 return (0);
687 }
688
689 static void
690 inst_free(bridge_inst_t *bip)
691 {
692 ASSERT(bip->bi_mac == NULL);
693 rw_destroy(&bip->bi_rwlock);
694 list_destroy(&bip->bi_links);
695 cv_destroy(&bip->bi_linkwait);
696 avl_destroy(&bip->bi_fwd);
697 if (bip->bi_ksp != NULL)
698 kstat_delete(bip->bi_ksp);
699 kmem_free(bip, sizeof (*bip));
700 }
701
702 static bridge_inst_t *
703 inst_alloc(const char *bridge)
704 {
705 bridge_inst_t *bip;
706
707 bip = kmem_zalloc(sizeof (*bip), KM_SLEEP);
708 bip->bi_refs = 1;
709 (void) strcpy(bip->bi_name, bridge);
710 rw_init(&bip->bi_rwlock, NULL, RW_DRIVER, NULL);
711 list_create(&bip->bi_links, sizeof (bridge_link_t),
712 offsetof(bridge_link_t, bl_node));
713 cv_init(&bip->bi_linkwait, NULL, CV_DRIVER, NULL);
714 avl_create(&bip->bi_fwd, fwd_compare, sizeof (bridge_fwd_t),
715 offsetof(bridge_fwd_t, bf_node));
716 return (bip);
717 }
718
719 static bridge_inst_t *
720 bridge_find_name(const char *bridge)
721 {
722 bridge_inst_t *bip;
723
724 mutex_enter(&inst_lock);
725 for (bip = list_head(&inst_list); bip != NULL;
726 bip = list_next(&inst_list, bip)) {
727 if (!(bip->bi_flags & BIF_SHUTDOWN) &&
728 strcmp(bridge, bip->bi_name) == 0) {
729 atomic_inc_uint(&bip->bi_refs);
730 break;
731 }
732 }
733 mutex_exit(&inst_lock);
734
735 return (bip);
736 }
737
738 static int
739 bridge_create(datalink_id_t linkid, const char *bridge, bridge_inst_t **bipc,
740 cred_t *cred)
741 {
742 bridge_inst_t *bip, *bipnew;
743 bridge_mac_t *bmp = NULL;
744 int err;
745
746 *bipc = NULL;
747 bipnew = inst_alloc(bridge);
748
749 mutex_enter(&inst_lock);
750 lookup_retry:
751 for (bip = list_head(&inst_list); bip != NULL;
752 bip = list_next(&inst_list, bip)) {
753 if (strcmp(bridge, bip->bi_name) == 0)
754 break;
755 }
756
757 /* This should not take long; if it does, we've got a design problem */
758 if (bip != NULL && (bip->bi_flags & BIF_SHUTDOWN)) {
759 cv_wait(&inst_cv, &inst_lock);
760 goto lookup_retry;
761 }
762
763 if (bip == NULL) {
764 bip = bipnew;
765 bipnew = NULL;
766 list_insert_tail(&inst_list, bip);
767 }
768
769 mutex_exit(&inst_lock);
770 if (bipnew != NULL) {
771 inst_free(bipnew);
772 return (EEXIST);
773 }
774
775 bip->bi_ksp = kstat_setup((kstat_named_t *)&bip->bi_kstats,
776 inst_kstats_list, Dim(inst_kstats_list), bip->bi_name);
777
778 err = bmac_alloc(bip, &bmp);
779 if ((bip->bi_mac = bmp) == NULL)
780 goto fail_create;
781
782 /*
783 * bm_inst is set, so the timer cannot yank the DLS rug from under us.
784 * No extra locking is needed here.
785 */
786 if (!(bmp->bm_flags & BMF_DLS)) {
787 err = dls_devnet_create(bmp->bm_mh, linkid, crgetzoneid(cred));
788 if (err != 0)
789 goto fail_create;
790 bmp->bm_flags |= BMF_DLS;
791 }
792
793 bip->bi_dev = makedevice(bridge_major, mac_minor(bmp->bm_mh));
794 *bipc = bip;
795 return (0);
796
797 fail_create:
798 ASSERT(bip->bi_trilldata == NULL);
799 bip->bi_flags |= BIF_SHUTDOWN;
800 bridge_unref(bip);
801 return (err);
802 }
803
804 static void
805 bridge_unref(bridge_inst_t *bip)
806 {
807 if (atomic_dec_uint_nv(&bip->bi_refs) == 0) {
808 ASSERT(bip->bi_flags & BIF_SHUTDOWN);
809 /* free up mac for reuse before leaving global list */
810 if (bip->bi_mac != NULL)
811 bmac_disconnect(bip->bi_mac);
812 mutex_enter(&inst_lock);
813 list_remove(&inst_list, bip);
814 cv_broadcast(&inst_cv);
815 mutex_exit(&inst_lock);
816 inst_free(bip);
817 }
818 }
819
820 /*
821 * Stream instances are used only for allocating bridges and serving as a
822 * control node. They serve no data-handling function.
823 */
824 static bridge_stream_t *
825 stream_alloc(void)
826 {
827 bridge_stream_t *bsp;
828 minor_t mn;
829
830 if ((mn = mac_minor_hold(B_FALSE)) == 0)
831 return (NULL);
832 bsp = kmem_zalloc(sizeof (*bsp), KM_SLEEP);
833 bsp->bs_minor = mn;
834 return (bsp);
835 }
836
837 static void
838 stream_free(bridge_stream_t *bsp)
839 {
840 mac_minor_rele(bsp->bs_minor);
841 kmem_free(bsp, sizeof (*bsp));
842 }
843
844 /* Reference hold/release functions for STREAMS-related taskq */
845 static void
846 stream_ref(bridge_stream_t *bsp)
847 {
848 mutex_enter(&stream_ref_lock);
849 bsp->bs_taskq_cnt++;
850 mutex_exit(&stream_ref_lock);
851 }
852
853 static void
854 stream_unref(bridge_stream_t *bsp)
855 {
856 mutex_enter(&stream_ref_lock);
857 if (--bsp->bs_taskq_cnt == 0)
858 cv_broadcast(&stream_ref_cv);
859 mutex_exit(&stream_ref_lock);
860 }
861
862 static void
863 link_free(bridge_link_t *blp)
864 {
865 bridge_inst_t *bip = blp->bl_inst;
866
867 ASSERT(!(blp->bl_flags & BLF_FREED));
868 blp->bl_flags |= BLF_FREED;
869 if (blp->bl_ksp != NULL)
870 kstat_delete(blp->bl_ksp);
871 if (blp->bl_lfailmp != NULL)
872 freeb(blp->bl_lfailmp);
873 cv_destroy(&blp->bl_trillwait);
874 mutex_destroy(&blp->bl_trilllock);
875 kmem_free(blp, sizeof (*blp));
876 /* Don't unreference the bridge until the MAC is closed */
877 bridge_unref(bip);
878 }
879
880 static void
881 link_unref(bridge_link_t *blp)
882 {
883 if (atomic_dec_uint_nv(&blp->bl_refs) == 0) {
884 bridge_inst_t *bip = blp->bl_inst;
885
886 ASSERT(blp->bl_flags & BLF_DELETED);
887 rw_enter(&bip->bi_rwlock, RW_WRITER);
888 if (blp->bl_flags & BLF_LINK_ADDED)
889 list_remove(&bip->bi_links, blp);
890 rw_exit(&bip->bi_rwlock);
891 if (bip->bi_trilldata != NULL && list_is_empty(&bip->bi_links))
892 cv_broadcast(&bip->bi_linkwait);
893 link_free(blp);
894 }
895 }
896
897 static bridge_fwd_t *
898 fwd_alloc(const uint8_t *addr, uint_t nlinks, uint16_t nick)
899 {
900 bridge_fwd_t *bfp;
901
902 bfp = kmem_zalloc(sizeof (*bfp) + (nlinks * sizeof (bridge_link_t *)),
903 KM_NOSLEEP);
904 if (bfp != NULL) {
905 bcopy(addr, bfp->bf_dest, ETHERADDRL);
906 bfp->bf_lastheard = ddi_get_lbolt();
907 bfp->bf_maxlinks = nlinks;
908 bfp->bf_links = (bridge_link_t **)(bfp + 1);
909 bfp->bf_trill_nick = nick;
910 }
911 return (bfp);
912 }
913
914 static bridge_fwd_t *
915 fwd_find(bridge_inst_t *bip, const uint8_t *addr, uint16_t vlanid)
916 {
917 bridge_fwd_t *bfp, *vbfp;
918 bridge_fwd_t match;
919
920 bcopy(addr, match.bf_dest, ETHERADDRL);
921 match.bf_flags = 0;
922 rw_enter(&bip->bi_rwlock, RW_READER);
923 if ((bfp = avl_find(&bip->bi_fwd, &match, NULL)) != NULL) {
924 if (bfp->bf_vlanid != vlanid && bfp->bf_vcnt > 0) {
925 match.bf_vlanid = vlanid;
926 match.bf_flags = BFF_VLANLOCAL;
927 vbfp = avl_find(&bip->bi_fwd, &match, NULL);
928 if (vbfp != NULL)
929 bfp = vbfp;
930 }
931 atomic_inc_uint(&bfp->bf_refs);
932 }
933 rw_exit(&bip->bi_rwlock);
934 return (bfp);
935 }
936
937 static void
938 fwd_free(bridge_fwd_t *bfp)
939 {
940 uint_t i;
941 bridge_inst_t *bip = bfp->bf_links[0]->bl_inst;
942
943 KIDECR(bki_count);
944 for (i = 0; i < bfp->bf_nlinks; i++)
945 link_unref(bfp->bf_links[i]);
946 kmem_free(bfp,
947 sizeof (*bfp) + bfp->bf_maxlinks * sizeof (bridge_link_t *));
948 }
949
950 static void
951 fwd_unref(bridge_fwd_t *bfp)
952 {
953 if (atomic_dec_uint_nv(&bfp->bf_refs) == 0) {
954 ASSERT(!(bfp->bf_flags & BFF_INTREE));
955 fwd_free(bfp);
956 }
957 }
958
959 static void
960 fwd_delete(bridge_fwd_t *bfp)
961 {
962 bridge_inst_t *bip;
963 bridge_fwd_t *bfpzero;
964
965 if (bfp->bf_flags & BFF_INTREE) {
966 ASSERT(bfp->bf_nlinks > 0);
967 bip = bfp->bf_links[0]->bl_inst;
968 rw_enter(&bip->bi_rwlock, RW_WRITER);
969 /* Another thread could beat us to this */
970 if (bfp->bf_flags & BFF_INTREE) {
971 avl_remove(&bip->bi_fwd, bfp);
972 bfp->bf_flags &= ~BFF_INTREE;
973 if (bfp->bf_flags & BFF_VLANLOCAL) {
974 bfp->bf_flags &= ~BFF_VLANLOCAL;
975 bfpzero = avl_find(&bip->bi_fwd, bfp, NULL);
976 if (bfpzero != NULL && bfpzero->bf_vcnt > 0)
977 bfpzero->bf_vcnt--;
978 }
979 rw_exit(&bip->bi_rwlock);
980 fwd_unref(bfp); /* no longer in avl tree */
981 } else {
982 rw_exit(&bip->bi_rwlock);
983 }
984 }
985 }
986
987 static boolean_t
988 fwd_insert(bridge_inst_t *bip, bridge_fwd_t *bfp)
989 {
990 avl_index_t idx;
991 boolean_t retv;
992
993 rw_enter(&bip->bi_rwlock, RW_WRITER);
994 if (!(bip->bi_flags & BIF_SHUTDOWN) &&
995 avl_numnodes(&bip->bi_fwd) < bip->bi_tablemax &&
996 avl_find(&bip->bi_fwd, bfp, &idx) == NULL) {
997 avl_insert(&bip->bi_fwd, bfp, idx);
998 bfp->bf_flags |= BFF_INTREE;
999 atomic_inc_uint(&bfp->bf_refs); /* avl entry */
1000 retv = B_TRUE;
1001 } else {
1002 retv = B_FALSE;
1003 }
1004 rw_exit(&bip->bi_rwlock);
1005 return (retv);
1006 }
1007
1008 static void
1009 fwd_update_local(bridge_link_t *blp, const uint8_t *oldaddr,
1010 const uint8_t *newaddr)
1011 {
1012 bridge_inst_t *bip = blp->bl_inst;
1013 bridge_fwd_t *bfp, *bfnew;
1014 bridge_fwd_t match;
1015 avl_index_t idx;
1016 boolean_t drop_ref = B_FALSE;
1017
1018 if (bcmp(oldaddr, newaddr, ETHERADDRL) == 0)
1019 return;
1020
1021 if (bcmp(oldaddr, zero_addr, ETHERADDRL) == 0)
1022 goto no_old_addr;
1023
1024 /*
1025 * Find the previous entry, and remove our link from it.
1026 */
1027 bcopy(oldaddr, match.bf_dest, ETHERADDRL);
1028 rw_enter(&bip->bi_rwlock, RW_WRITER);
1029 if ((bfp = avl_find(&bip->bi_fwd, &match, NULL)) != NULL) {
1030 int i;
1031
1032 /*
1033 * See if we're in the list, and remove if so.
1034 */
1035 for (i = 0; i < bfp->bf_nlinks; i++) {
1036 if (bfp->bf_links[i] == blp) {
1037 /*
1038 * We assume writes are atomic, so no special
1039 * MT handling is needed. The list length is
1040 * decremented first, and then we remove
1041 * entries.
1042 */
1043 bfp->bf_nlinks--;
1044 for (; i < bfp->bf_nlinks; i++)
1045 bfp->bf_links[i] = bfp->bf_links[i + 1];
1046 drop_ref = B_TRUE;
1047 break;
1048 }
1049 }
1050 /* If no more links, then remove and free up */
1051 if (bfp->bf_nlinks == 0) {
1052 avl_remove(&bip->bi_fwd, bfp);
1053 bfp->bf_flags &= ~BFF_INTREE;
1054 } else {
1055 bfp = NULL;
1056 }
1057 }
1058 rw_exit(&bip->bi_rwlock);
1059 if (bfp != NULL)
1060 fwd_unref(bfp); /* no longer in avl tree */
1061
1062 /*
1063 * Now get the new link address and add this link to the list. The
1064 * list should be of length 1 unless the user has configured multiple
1065 * NICs with the same address. (That's an incorrect configuration, but
1066 * we support it anyway.)
1067 */
1068 no_old_addr:
1069 bfp = NULL;
1070 if ((bip->bi_flags & BIF_SHUTDOWN) ||
1071 bcmp(newaddr, zero_addr, ETHERADDRL) == 0)
1072 goto no_new_addr;
1073
1074 bcopy(newaddr, match.bf_dest, ETHERADDRL);
1075 rw_enter(&bip->bi_rwlock, RW_WRITER);
1076 if ((bfp = avl_find(&bip->bi_fwd, &match, &idx)) == NULL) {
1077 bfnew = fwd_alloc(newaddr, 1, RBRIDGE_NICKNAME_NONE);
1078 if (bfnew != NULL)
1079 KIINCR(bki_count);
1080 } else if (bfp->bf_nlinks < bfp->bf_maxlinks) {
1081 /* special case: link fits in existing entry */
1082 bfnew = bfp;
1083 } else {
1084 bfnew = fwd_alloc(newaddr, bfp->bf_nlinks + 1,
1085 RBRIDGE_NICKNAME_NONE);
1086 if (bfnew != NULL) {
1087 KIINCR(bki_count);
1088 avl_remove(&bip->bi_fwd, bfp);
1089 bfp->bf_flags &= ~BFF_INTREE;
1090 bfnew->bf_nlinks = bfp->bf_nlinks;
1091 bcopy(bfp->bf_links, bfnew->bf_links,
1092 bfp->bf_nlinks * sizeof (bfp));
1093 /* reset the idx value due to removal above */
1094 (void) avl_find(&bip->bi_fwd, &match, &idx);
1095 }
1096 }
1097
1098 if (bfnew != NULL) {
1099 bfnew->bf_links[bfnew->bf_nlinks++] = blp;
1100 if (drop_ref)
1101 drop_ref = B_FALSE;
1102 else
1103 atomic_inc_uint(&blp->bl_refs); /* bf_links entry */
1104
1105 if (bfnew != bfp) {
1106 /* local addresses are not subject to table limits */
1107 avl_insert(&bip->bi_fwd, bfnew, idx);
1108 bfnew->bf_flags |= (BFF_INTREE | BFF_LOCALADDR);
1109 atomic_inc_uint(&bfnew->bf_refs); /* avl entry */
1110 }
1111 }
1112 rw_exit(&bip->bi_rwlock);
1113
1114 no_new_addr:
1115 /*
1116 * If we found an existing entry and we replaced it with a new one,
1117 * then drop the table reference from the old one. We removed it from
1118 * the AVL tree above.
1119 */
1120 if (bfnew != NULL && bfp != NULL && bfnew != bfp)
1121 fwd_unref(bfp);
1122
1123 /* Account for removed entry. */
1124 if (drop_ref)
1125 link_unref(blp);
1126 }
1127
1128 static void
1129 bridge_new_unicst(bridge_link_t *blp)
1130 {
1131 uint8_t new_mac[ETHERADDRL];
1132
1133 mac_unicast_primary_get(blp->bl_mh, new_mac);
1134 fwd_update_local(blp, blp->bl_local_mac, new_mac);
1135 bcopy(new_mac, blp->bl_local_mac, ETHERADDRL);
1136 }
1137
1138 /*
1139 * We must shut down a link prior to freeing it, and doing that requires
1140 * blocking to wait for running MAC threads while holding a reference. This is
1141 * run from a taskq to accomplish proper link shutdown followed by reference
1142 * drop.
1143 */
1144 static void
1145 link_shutdown(void *arg)
1146 {
1147 bridge_link_t *blp = arg;
1148 mac_handle_t mh = blp->bl_mh;
1149 bridge_inst_t *bip;
1150 bridge_fwd_t *bfp, *bfnext;
1151 avl_tree_t fwd_scavenge;
1152 int i;
1153
1154 /*
1155 * This link is being destroyed. Notify TRILL now that it's no longer
1156 * possible to send packets. Data packets may still arrive until TRILL
1157 * calls bridge_trill_lnunref.
1158 */
1159 if (blp->bl_trilldata != NULL)
1160 trill_lndstr_fn(blp->bl_trilldata, blp);
1161
1162 if (blp->bl_flags & BLF_PROM_ADDED)
1163 (void) mac_promisc_remove(blp->bl_mphp);
1164
1165 if (blp->bl_flags & BLF_SET_BRIDGE)
1166 mac_bridge_clear(mh, (mac_handle_t)blp);
1167
1168 if (blp->bl_flags & BLF_MARGIN_ADDED) {
1169 (void) mac_notify_remove(blp->bl_mnh, B_TRUE);
1170 (void) mac_margin_remove(mh, blp->bl_margin);
1171 }
1172
1173 /* Tell the clients the real link state when we leave */
1174 mac_link_redo(blp->bl_mh,
1175 mac_stat_get(blp->bl_mh, MAC_STAT_LOWLINK_STATE));
1176
1177 /* Destroy all of the forwarding entries related to this link */
1178 avl_create(&fwd_scavenge, fwd_compare, sizeof (bridge_fwd_t),
1179 offsetof(bridge_fwd_t, bf_node));
1180 bip = blp->bl_inst;
1181 rw_enter(&bip->bi_rwlock, RW_WRITER);
1182 bfnext = avl_first(&bip->bi_fwd);
1183 while ((bfp = bfnext) != NULL) {
1184 bfnext = AVL_NEXT(&bip->bi_fwd, bfp);
1185 for (i = 0; i < bfp->bf_nlinks; i++) {
1186 if (bfp->bf_links[i] == blp)
1187 break;
1188 }
1189 if (i >= bfp->bf_nlinks)
1190 continue;
1191 if (bfp->bf_nlinks > 1) {
1192 /* note that this can't be the last reference */
1193 link_unref(blp);
1194 bfp->bf_nlinks--;
1195 for (; i < bfp->bf_nlinks; i++)
1196 bfp->bf_links[i] = bfp->bf_links[i + 1];
1197 } else {
1198 ASSERT(bfp->bf_flags & BFF_INTREE);
1199 avl_remove(&bip->bi_fwd, bfp);
1200 bfp->bf_flags &= ~BFF_INTREE;
1201 avl_add(&fwd_scavenge, bfp);
1202 }
1203 }
1204 rw_exit(&bip->bi_rwlock);
1205 bfnext = avl_first(&fwd_scavenge);
1206 while ((bfp = bfnext) != NULL) {
1207 bfnext = AVL_NEXT(&fwd_scavenge, bfp);
1208 avl_remove(&fwd_scavenge, bfp);
1209 fwd_unref(bfp);
1210 }
1211 avl_destroy(&fwd_scavenge);
1212
1213 if (blp->bl_flags & BLF_CLIENT_OPEN)
1214 mac_client_close(blp->bl_mch, 0);
1215
1216 mac_close(mh);
1217
1218 /*
1219 * We are now completely removed from the active list, so drop the
1220 * reference (see bridge_add_link).
1221 */
1222 link_unref(blp);
1223 }
1224
1225 static void
1226 shutdown_inst(bridge_inst_t *bip)
1227 {
1228 bridge_link_t *blp, *blnext;
1229 bridge_fwd_t *bfp;
1230
1231 mutex_enter(&inst_lock);
1232 if (bip->bi_flags & BIF_SHUTDOWN) {
1233 mutex_exit(&inst_lock);
1234 return;
1235 }
1236
1237 /*
1238 * Once on the inst_list, the bridge instance must not leave that list
1239 * without having the shutdown flag set first. When the shutdown flag
1240 * is set, we own the list reference, so we must drop it before
1241 * returning.
1242 */
1243 bip->bi_flags |= BIF_SHUTDOWN;
1244 mutex_exit(&inst_lock);
1245
1246 bip->bi_control = NULL;
1247
1248 rw_enter(&bip->bi_rwlock, RW_READER);
1249 blnext = list_head(&bip->bi_links);
1250 while ((blp = blnext) != NULL) {
1251 blnext = list_next(&bip->bi_links, blp);
1252 if (!(blp->bl_flags & BLF_DELETED)) {
1253 blp->bl_flags |= BLF_DELETED;
1254 (void) ddi_taskq_dispatch(bridge_taskq, link_shutdown,
1255 blp, DDI_SLEEP);
1256 }
1257 }
1258 while ((bfp = avl_first(&bip->bi_fwd)) != NULL) {
1259 atomic_inc_uint(&bfp->bf_refs);
1260 rw_exit(&bip->bi_rwlock);
1261 fwd_delete(bfp);
1262 fwd_unref(bfp);
1263 rw_enter(&bip->bi_rwlock, RW_READER);
1264 }
1265 rw_exit(&bip->bi_rwlock);
1266
1267 /*
1268 * This bridge is being destroyed. Notify TRILL once all of the
1269 * links are gone.
1270 */
1271 mutex_enter(&inst_lock);
1272 while (bip->bi_trilldata != NULL && !list_is_empty(&bip->bi_links))
1273 cv_wait(&bip->bi_linkwait, &inst_lock);
1274 mutex_exit(&inst_lock);
1275 if (bip->bi_trilldata != NULL)
1276 trill_brdstr_fn(bip->bi_trilldata, bip);
1277
1278 bridge_unref(bip);
1279 }
1280
1281 /*
1282 * This is called once by the TRILL module when it starts up. It just sets the
1283 * global TRILL callback function pointers -- data transmit/receive and bridge
1284 * and link destroy notification. There's only one TRILL module, so only one
1285 * registration is needed.
1286 *
1287 * TRILL should call this function with NULL pointers before unloading. It
1288 * must not do so before dropping all references to bridges and links. We
1289 * assert that this is true on debug builds.
1290 */
1291 void
1292 bridge_trill_register_cb(trill_recv_pkt_t recv_fn, trill_encap_pkt_t encap_fn,
1293 trill_br_dstr_t brdstr_fn, trill_ln_dstr_t lndstr_fn)
1294 {
1295 #ifdef DEBUG
1296 if (recv_fn == NULL && trill_recv_fn != NULL) {
1297 bridge_inst_t *bip;
1298 bridge_link_t *blp;
1299
1300 mutex_enter(&inst_lock);
1301 for (bip = list_head(&inst_list); bip != NULL;
1302 bip = list_next(&inst_list, bip)) {
1303 ASSERT(bip->bi_trilldata == NULL);
1304 rw_enter(&bip->bi_rwlock, RW_READER);
1305 for (blp = list_head(&bip->bi_links); blp != NULL;
1306 blp = list_next(&bip->bi_links, blp)) {
1307 ASSERT(blp->bl_trilldata == NULL);
1308 }
1309 rw_exit(&bip->bi_rwlock);
1310 }
1311 mutex_exit(&inst_lock);
1312 }
1313 #endif
1314 trill_recv_fn = recv_fn;
1315 trill_encap_fn = encap_fn;
1316 trill_brdstr_fn = brdstr_fn;
1317 trill_lndstr_fn = lndstr_fn;
1318 }
1319
1320 /*
1321 * This registers the TRILL instance pointer with a bridge. Before this
1322 * pointer is set, the forwarding, TRILL receive, and bridge destructor
1323 * functions won't be called.
1324 *
1325 * TRILL holds a reference on a bridge with this call. It must free the
1326 * reference by calling the unregister function below.
1327 */
1328 bridge_inst_t *
1329 bridge_trill_brref(const char *bname, void *ptr)
1330 {
1331 char bridge[MAXLINKNAMELEN];
1332 bridge_inst_t *bip;
1333
1334 (void) snprintf(bridge, MAXLINKNAMELEN, "%s0", bname);
1335 bip = bridge_find_name(bridge);
1336 if (bip != NULL) {
1337 ASSERT(bip->bi_trilldata == NULL && ptr != NULL);
1338 bip->bi_trilldata = ptr;
1339 }
1340 return (bip);
1341 }
1342
1343 void
1344 bridge_trill_brunref(bridge_inst_t *bip)
1345 {
1346 ASSERT(bip->bi_trilldata != NULL);
1347 bip->bi_trilldata = NULL;
1348 bridge_unref(bip);
1349 }
1350
1351 /*
1352 * TRILL calls this function when referencing a particular link on a bridge.
1353 *
1354 * It holds a reference on the link, so TRILL must clear out the reference when
1355 * it's done with the link (on unbinding).
1356 */
1357 bridge_link_t *
1358 bridge_trill_lnref(bridge_inst_t *bip, datalink_id_t linkid, void *ptr)
1359 {
1360 bridge_link_t *blp;
1361
1362 ASSERT(ptr != NULL);
1363 rw_enter(&bip->bi_rwlock, RW_READER);
1364 for (blp = list_head(&bip->bi_links); blp != NULL;
1365 blp = list_next(&bip->bi_links, blp)) {
1366 if (!(blp->bl_flags & BLF_DELETED) &&
1367 blp->bl_linkid == linkid && blp->bl_trilldata == NULL) {
1368 blp->bl_trilldata = ptr;
1369 blp->bl_flags &= ~BLF_TRILLACTIVE;
1370 (void) memset(blp->bl_afs, 0, sizeof (blp->bl_afs));
1371 atomic_inc_uint(&blp->bl_refs);
1372 break;
1373 }
1374 }
1375 rw_exit(&bip->bi_rwlock);
1376 return (blp);
1377 }
1378
1379 void
1380 bridge_trill_lnunref(bridge_link_t *blp)
1381 {
1382 mutex_enter(&blp->bl_trilllock);
1383 ASSERT(blp->bl_trilldata != NULL);
1384 blp->bl_trilldata = NULL;
1385 blp->bl_flags &= ~BLF_TRILLACTIVE;
1386 while (blp->bl_trillthreads > 0)
1387 cv_wait(&blp->bl_trillwait, &blp->bl_trilllock);
1388 mutex_exit(&blp->bl_trilllock);
1389 (void) memset(blp->bl_afs, 0xff, sizeof (blp->bl_afs));
1390 link_unref(blp);
1391 }
1392
1393 /*
1394 * This periodic timer performs three functions:
1395 * 1. It scans the list of learned forwarding entries, and removes ones that
1396 * haven't been heard from in a while. The time limit is backed down if
1397 * we're above the configured table limit.
1398 * 2. It walks the links and decays away the bl_learns counter.
1399 * 3. It scans the observability node entries looking for ones that can be
1400 * freed up.
1401 */
1402 /* ARGSUSED */
1403 static void
1404 bridge_timer(void *arg)
1405 {
1406 bridge_inst_t *bip;
1407 bridge_fwd_t *bfp, *bfnext;
1408 bridge_mac_t *bmp, *bmnext;
1409 bridge_link_t *blp;
1410 int err;
1411 datalink_id_t tmpid;
1412 avl_tree_t fwd_scavenge;
1413 clock_t age_limit;
1414 uint32_t ldecay;
1415
1416 avl_create(&fwd_scavenge, fwd_compare, sizeof (bridge_fwd_t),
1417 offsetof(bridge_fwd_t, bf_node));
1418 mutex_enter(&inst_lock);
1419 for (bip = list_head(&inst_list); bip != NULL;
1420 bip = list_next(&inst_list, bip)) {
1421 if (bip->bi_flags & BIF_SHUTDOWN)
1422 continue;
1423 rw_enter(&bip->bi_rwlock, RW_WRITER);
1424 /* compute scaled maximum age based on table limit */
1425 if (avl_numnodes(&bip->bi_fwd) > bip->bi_tablemax)
1426 bip->bi_tshift++;
1427 else
1428 bip->bi_tshift = 0;
1429 if ((age_limit = bridge_fwd_age >> bip->bi_tshift) == 0) {
1430 if (bip->bi_tshift != 0)
1431 bip->bi_tshift--;
1432 age_limit = 1;
1433 }
1434 bfnext = avl_first(&bip->bi_fwd);
1435 while ((bfp = bfnext) != NULL) {
1436 bfnext = AVL_NEXT(&bip->bi_fwd, bfp);
1437 if (!(bfp->bf_flags & BFF_LOCALADDR) &&
1438 (ddi_get_lbolt() - bfp->bf_lastheard) > age_limit) {
1439 ASSERT(bfp->bf_flags & BFF_INTREE);
1440 avl_remove(&bip->bi_fwd, bfp);
1441 bfp->bf_flags &= ~BFF_INTREE;
1442 avl_add(&fwd_scavenge, bfp);
1443 }
1444 }
1445 for (blp = list_head(&bip->bi_links); blp != NULL;
1446 blp = list_next(&bip->bi_links, blp)) {
1447 ldecay = mac_get_ldecay(blp->bl_mh);
1448 if (ldecay >= blp->bl_learns)
1449 blp->bl_learns = 0;
1450 else
1451 atomic_add_int(&blp->bl_learns, -(int)ldecay);
1452 }
1453 rw_exit(&bip->bi_rwlock);
1454 bfnext = avl_first(&fwd_scavenge);
1455 while ((bfp = bfnext) != NULL) {
1456 bfnext = AVL_NEXT(&fwd_scavenge, bfp);
1457 avl_remove(&fwd_scavenge, bfp);
1458 KIINCR(bki_expire);
1459 fwd_unref(bfp); /* drop tree reference */
1460 }
1461 }
1462 mutex_exit(&inst_lock);
1463 avl_destroy(&fwd_scavenge);
1464
1465 /*
1466 * Scan the bridge_mac_t entries and try to free up the ones that are
1467 * no longer active. This must be done by polling, as neither DLS nor
1468 * MAC provides a driver any sort of positive control over clients.
1469 */
1470 rw_enter(&bmac_rwlock, RW_WRITER);
1471 bmnext = list_head(&bmac_list);
1472 while ((bmp = bmnext) != NULL) {
1473 bmnext = list_next(&bmac_list, bmp);
1474
1475 /* ignore active bridges */
1476 if (bmp->bm_inst != NULL)
1477 continue;
1478
1479 if (bmp->bm_flags & BMF_DLS) {
1480 err = dls_devnet_destroy(bmp->bm_mh, &tmpid, B_FALSE);
1481 ASSERT(err == 0 || err == EBUSY);
1482 if (err == 0)
1483 bmp->bm_flags &= ~BMF_DLS;
1484 }
1485
1486 if (!(bmp->bm_flags & BMF_DLS)) {
1487 err = mac_unregister(bmp->bm_mh);
1488 ASSERT(err == 0 || err == EBUSY);
1489 if (err == 0) {
1490 list_remove(&bmac_list, bmp);
1491 kmem_free(bmp, sizeof (*bmp));
1492 }
1493 }
1494 }
1495 if (list_is_empty(&bmac_list)) {
1496 bridge_timerid = 0;
1497 } else {
1498 bridge_timerid = timeout(bridge_timer, NULL,
1499 bridge_scan_interval);
1500 }
1501 rw_exit(&bmac_rwlock);
1502 }
1503
1504 static int
1505 bridge_open(queue_t *rq, dev_t *devp, int oflag, int sflag, cred_t *credp)
1506 {
1507 bridge_stream_t *bsp;
1508
1509 if (rq->q_ptr != NULL)
1510 return (0);
1511
1512 if (sflag & MODOPEN)
1513 return (EINVAL);
1514
1515 /*
1516 * Check the minor node number being opened. This tells us which
1517 * bridge instance the user wants.
1518 */
1519 if (getminor(*devp) != 0) {
1520 /*
1521 * This is a regular DLPI stream for snoop or the like.
1522 * Redirect it through DLD.
1523 */
1524 rq->q_qinfo = &bridge_dld_rinit;
1525 OTHERQ(rq)->q_qinfo = &bridge_dld_winit;
1526 return (dld_open(rq, devp, oflag, sflag, credp));
1527 } else {
1528 /*
1529 * Allocate the bridge control stream structure.
1530 */
1531 if ((bsp = stream_alloc()) == NULL)
1532 return (ENOSR);
1533 rq->q_ptr = WR(rq)->q_ptr = (caddr_t)bsp;
1534 bsp->bs_wq = WR(rq);
1535 *devp = makedevice(getmajor(*devp), bsp->bs_minor);
1536 qprocson(rq);
1537 return (0);
1538 }
1539 }
1540
1541 /*
1542 * This is used only for bridge control streams. DLPI goes through dld
1543 * instead.
1544 */
1545 /* ARGSUSED */
1546 static int
1547 bridge_close(queue_t *rq, int flags __unused, cred_t *credp __unused)
1548 {
1549 bridge_stream_t *bsp = rq->q_ptr;
1550 bridge_inst_t *bip;
1551
1552 /*
1553 * Wait for any stray taskq (add/delete link) entries related to this
1554 * stream to leave the system.
1555 */
1556 mutex_enter(&stream_ref_lock);
1557 while (bsp->bs_taskq_cnt != 0)
1558 cv_wait(&stream_ref_cv, &stream_ref_lock);
1559 mutex_exit(&stream_ref_lock);
1560
1561 qprocsoff(rq);
1562 if ((bip = bsp->bs_inst) != NULL)
1563 shutdown_inst(bip);
1564 rq->q_ptr = WR(rq)->q_ptr = NULL;
1565 stream_free(bsp);
1566 if (bip != NULL)
1567 bridge_unref(bip);
1568
1569 return (0);
1570 }
1571
1572 static void
1573 bridge_learn(bridge_link_t *blp, const uint8_t *saddr, uint16_t ingress_nick,
1574 uint16_t vlanid)
1575 {
1576 bridge_inst_t *bip = blp->bl_inst;
1577 bridge_fwd_t *bfp, *bfpnew;
1578 int i;
1579 boolean_t replaced = B_FALSE;
1580
1581 /* Ignore multi-destination address used as source; it's nonsense. */
1582 if (*saddr & 1)
1583 return;
1584
1585 /*
1586 * If the source is known, then check whether it belongs on this link.
1587 * If not, and this isn't a fixed local address, then we've detected a
1588 * move. If it's not known, learn it.
1589 */
1590 if ((bfp = fwd_find(bip, saddr, vlanid)) != NULL) {
1591 /*
1592 * If the packet has a fixed local source address, then there's
1593 * nothing we can learn. We must quit. If this was a received
1594 * packet, then the sender has stolen our address, but there's
1595 * nothing we can do. If it's a transmitted packet, then
1596 * that's the normal case.
1597 */
1598 if (bfp->bf_flags & BFF_LOCALADDR) {
1599 fwd_unref(bfp);
1600 return;
1601 }
1602
1603 /*
1604 * Check if the link (and TRILL sender, if any) being used is
1605 * among the ones registered for this address. If so, then
1606 * this is information that we already know.
1607 */
1608 if (bfp->bf_trill_nick == ingress_nick) {
1609 for (i = 0; i < bfp->bf_nlinks; i++) {
1610 if (bfp->bf_links[i] == blp) {
1611 bfp->bf_lastheard = ddi_get_lbolt();
1612 fwd_unref(bfp);
1613 return;
1614 }
1615 }
1616 }
1617 }
1618
1619 /*
1620 * Note that we intentionally "unlearn" things that appear to be under
1621 * attack on this link. The forwarding cache is a negative thing for
1622 * security -- it disables reachability as a performance optimization
1623 * -- so leaving out entries optimizes for success and defends against
1624 * the attack. Thus, the bare increment without a check in the delete
1625 * code above is right. (And it's ok if we skid over the limit a
1626 * little, so there's no synchronization needed on the test.)
1627 */
1628 if (blp->bl_learns >= mac_get_llimit(blp->bl_mh)) {
1629 if (bfp != NULL) {
1630 if (bfp->bf_vcnt == 0)
1631 fwd_delete(bfp);
1632 fwd_unref(bfp);
1633 }
1634 return;
1635 }
1636
1637 atomic_inc_uint(&blp->bl_learns);
1638
1639 if ((bfpnew = fwd_alloc(saddr, 1, ingress_nick)) == NULL) {
1640 if (bfp != NULL)
1641 fwd_unref(bfp);
1642 return;
1643 }
1644 KIINCR(bki_count);
1645
1646 if (bfp != NULL) {
1647 /*
1648 * If this is a new destination for the same VLAN, then delete
1649 * so that we can update. If it's a different VLAN, then we're
1650 * not going to delete the original. Split off instead into an
1651 * IVL entry.
1652 */
1653 if (bfp->bf_vlanid == vlanid) {
1654 /* save the count of IVL duplicates */
1655 bfpnew->bf_vcnt = bfp->bf_vcnt;
1656
1657 /* entry deletes count as learning events */
1658 atomic_inc_uint(&blp->bl_learns);
1659
1660 /* destroy and create anew; node moved */
1661 fwd_delete(bfp);
1662 replaced = B_TRUE;
1663 KIINCR(bki_moved);
1664 } else {
1665 bfp->bf_vcnt++;
1666 bfpnew->bf_flags |= BFF_VLANLOCAL;
1667 }
1668 fwd_unref(bfp);
1669 }
1670 bfpnew->bf_links[0] = blp;
1671 bfpnew->bf_nlinks = 1;
1672 atomic_inc_uint(&blp->bl_refs); /* bf_links entry */
1673 if (!fwd_insert(bip, bfpnew))
1674 fwd_free(bfpnew);
1675 else if (!replaced)
1676 KIINCR(bki_source);
1677 }
1678
1679 /*
1680 * Process the VLAN headers for output on a given link. There are several
1681 * cases (noting that we don't map VLANs):
1682 * 1. The input packet is good as it is; either
1683 * a. It has no tag, and output has same PVID
1684 * b. It has a non-zero priority-only tag for PVID, and b_band is same
1685 * c. It has a tag with VLAN different from PVID, and b_band is same
1686 * 2. The tag must change: non-zero b_band is different from tag priority
1687 * 3. The packet has a tag and should not (VLAN same as PVID, b_band zero)
1688 * 4. The packet has no tag and needs one:
1689 * a. VLAN ID same as PVID, but b_band is non-zero
1690 * b. VLAN ID different from PVID
1691 * We exclude case 1 first, then modify the packet. Note that output packets
1692 * get a priority set by the mblk, not by the header, because QoS in bridging
1693 * requires priority recalculation at each node.
1694 *
1695 * The passed-in tci is the "impossible" value 0xFFFF when no tag is present.
1696 */
1697 static mblk_t *
1698 reform_vlan_header(mblk_t *mp, uint16_t vlanid, uint16_t tci, uint16_t pvid)
1699 {
1700 boolean_t source_has_tag = (tci != 0xFFFF);
1701 mblk_t *mpcopy;
1702 size_t mlen, minlen;
1703 struct ether_vlan_header *evh;
1704 int pri;
1705
1706 /* This helps centralize error handling in the caller. */
1707 if (mp == NULL)
1708 return (mp);
1709
1710 /*
1711 * A forwarded packet cannot have hardware offloads enabled
1712 * because we don't know if the destination can handle them.
1713 * By this point, any hardware offloads present should have
1714 * been emulated.
1715 */
1716 DB_CKSUMFLAGS(mp) = 0;
1717
1718 /* Get the no-modification cases out of the way first */
1719 if (!source_has_tag && vlanid == pvid) /* 1a */
1720 return (mp);
1721
1722 pri = VLAN_PRI(tci);
1723 if (source_has_tag && mp->b_band == pri) {
1724 if (vlanid != pvid) /* 1c */
1725 return (mp);
1726 if (pri != 0 && VLAN_ID(tci) == 0) /* 1b */
1727 return (mp);
1728 }
1729
1730 /*
1731 * We now know that we must modify the packet. Prepare for that. Note
1732 * that if a tag is present, the caller has already done a pullup for
1733 * the VLAN header, so we're good to go.
1734 */
1735 if (MBLKL(mp) < sizeof (struct ether_header)) {
1736 mpcopy = msgpullup(mp, sizeof (struct ether_header));
1737 if (mpcopy == NULL) {
1738 freemsg(mp);
1739 return (NULL);
1740 }
1741 mp = mpcopy;
1742 }
1743 if (DB_REF(mp) > 1 || !IS_P2ALIGNED(mp->b_rptr, sizeof (uint16_t)) ||
1744 (!source_has_tag && MBLKTAIL(mp) < VLAN_INCR)) {
1745 minlen = mlen = MBLKL(mp);
1746 if (!source_has_tag)
1747 minlen += VLAN_INCR;
1748 ASSERT(minlen >= sizeof (struct ether_vlan_header));
1749 /*
1750 * We're willing to copy some data to avoid fragmentation, but
1751 * not a lot.
1752 */
1753 if (minlen > 256)
1754 minlen = sizeof (struct ether_vlan_header);
1755 mpcopy = allocb(minlen, BPRI_MED);
1756 if (mpcopy == NULL) {
1757 freemsg(mp);
1758 return (NULL);
1759 }
1760 if (mlen <= minlen) {
1761 /* We toss the first mblk when we can. */
1762 bcopy(mp->b_rptr, mpcopy->b_rptr, mlen);
1763 mpcopy->b_wptr += mlen;
1764 mpcopy->b_cont = mp->b_cont;
1765 freeb(mp);
1766 } else {
1767 /* If not, then just copy what we need */
1768 if (!source_has_tag)
1769 minlen = sizeof (struct ether_header);
1770 bcopy(mp->b_rptr, mpcopy->b_rptr, minlen);
1771 mpcopy->b_wptr += minlen;
1772 mpcopy->b_cont = mp;
1773 mp->b_rptr += minlen;
1774 }
1775 mp = mpcopy;
1776 }
1777
1778 /* LINTED: pointer alignment */
1779 evh = (struct ether_vlan_header *)mp->b_rptr;
1780 if (source_has_tag) {
1781 if (mp->b_band == 0 && vlanid == pvid) { /* 3 */
1782 evh->ether_tpid = evh->ether_type;
1783 mlen = MBLKL(mp);
1784 if (mlen > sizeof (struct ether_vlan_header))
1785 ovbcopy(mp->b_rptr +
1786 sizeof (struct ether_vlan_header),
1787 mp->b_rptr + sizeof (struct ether_header),
1788 mlen - sizeof (struct ether_vlan_header));
1789 mp->b_wptr -= VLAN_INCR;
1790 } else { /* 2 */
1791 if (vlanid == pvid)
1792 vlanid = VLAN_ID_NONE;
1793 tci = VLAN_TCI(mp->b_band, ETHER_CFI, vlanid);
1794 evh->ether_tci = htons(tci);
1795 }
1796 } else {
1797 /* case 4: no header present, but one is needed */
1798 mlen = MBLKL(mp);
1799 if (mlen > sizeof (struct ether_header))
1800 ovbcopy(mp->b_rptr + sizeof (struct ether_header),
1801 mp->b_rptr + sizeof (struct ether_vlan_header),
1802 mlen - sizeof (struct ether_header));
1803 mp->b_wptr += VLAN_INCR;
1804 ASSERT(mp->b_wptr <= DB_LIM(mp));
1805 if (vlanid == pvid)
1806 vlanid = VLAN_ID_NONE;
1807 tci = VLAN_TCI(mp->b_band, ETHER_CFI, vlanid);
1808 evh->ether_type = evh->ether_tpid;
1809 evh->ether_tpid = htons(ETHERTYPE_VLAN);
1810 evh->ether_tci = htons(tci);
1811 }
1812 return (mp);
1813 }
1814
1815 /* Record VLAN information and strip the header if requested. */
1816 static void
1817 update_header(mblk_t *mp, mac_header_info_t *hdr_info, boolean_t striphdr)
1818 {
1819 if (hdr_info->mhi_bindsap == ETHERTYPE_VLAN) {
1820 struct ether_vlan_header *evhp;
1821 uint16_t ether_type;
1822
1823 /* LINTED: alignment */
1824 evhp = (struct ether_vlan_header *)mp->b_rptr;
1825 hdr_info->mhi_istagged = B_TRUE;
1826 hdr_info->mhi_tci = ntohs(evhp->ether_tci);
1827 if (striphdr) {
1828 /*
1829 * For VLAN tagged frames update the ether_type
1830 * in hdr_info before stripping the header.
1831 */
1832 ether_type = ntohs(evhp->ether_type);
1833 hdr_info->mhi_origsap = ether_type;
1834 hdr_info->mhi_bindsap = (ether_type > ETHERMTU) ?
1835 ether_type : DLS_SAP_LLC;
1836 mp->b_rptr = (uchar_t *)(evhp + 1);
1837 }
1838 } else {
1839 hdr_info->mhi_istagged = B_FALSE;
1840 hdr_info->mhi_tci = VLAN_ID_NONE;
1841 if (striphdr)
1842 mp->b_rptr += sizeof (struct ether_header);
1843 }
1844 }
1845
1846 /*
1847 * Return B_TRUE if we're allowed to send on this link with the given VLAN ID.
1848 */
1849 static boolean_t
1850 bridge_can_send(bridge_link_t *blp, uint16_t vlanid)
1851 {
1852 ASSERT(vlanid != VLAN_ID_NONE);
1853 if (blp->bl_flags & BLF_DELETED)
1854 return (B_FALSE);
1855 if (blp->bl_trilldata == NULL && blp->bl_state != BLS_FORWARDING)
1856 return (B_FALSE);
1857 return (BRIDGE_VLAN_ISSET(blp, vlanid) && BRIDGE_AF_ISSET(blp, vlanid));
1858 }
1859
1860 /*
1861 * This function scans the bridge forwarding tables in order to forward a given
1862 * packet. If the packet either doesn't need forwarding (the current link is
1863 * correct) or the current link needs a copy as well, then the packet is
1864 * returned to the caller.
1865 *
1866 * If a packet has been decapsulated from TRILL, then it must *NOT* reenter a
1867 * TRILL tunnel. If the destination points there, then drop instead.
1868 */
1869 static mblk_t *
1870 bridge_forward(bridge_link_t *blp, mac_header_info_t *hdr_info, mblk_t *mp,
1871 uint16_t vlanid, uint16_t tci, boolean_t from_trill, boolean_t is_xmit)
1872 {
1873 mblk_t *mpsend, *mpcopy;
1874 bridge_inst_t *bip = blp->bl_inst;
1875 bridge_link_t *blpsend, *blpnext;
1876 bridge_fwd_t *bfp;
1877 uint_t i;
1878 boolean_t selfseen = B_FALSE;
1879 void *tdp;
1880 const uint8_t *daddr = hdr_info->mhi_daddr;
1881
1882 /*
1883 * Check for the IEEE "reserved" multicast addresses. Messages sent to
1884 * these addresses are used for link-local control (STP and pause), and
1885 * are never forwarded or redirected.
1886 */
1887 if (daddr[0] == 1 && daddr[1] == 0x80 && daddr[2] == 0xc2 &&
1888 daddr[3] == 0 && daddr[4] == 0 && (daddr[5] & 0xf0) == 0) {
1889 if (from_trill) {
1890 freemsg(mp);
1891 mp = NULL;
1892 }
1893 return (mp);
1894 }
1895
1896 if ((bfp = fwd_find(bip, daddr, vlanid)) != NULL) {
1897
1898 /*
1899 * If trill indicates a destination for this node, then it's
1900 * clearly not intended for local delivery. We must tell TRILL
1901 * to encapsulate, as long as we didn't just decapsulate it.
1902 */
1903 if (bfp->bf_trill_nick != RBRIDGE_NICKNAME_NONE) {
1904 /*
1905 * Error case: can't reencapsulate if the protocols are
1906 * working correctly.
1907 */
1908 if (from_trill) {
1909 freemsg(mp);
1910 return (NULL);
1911 }
1912 mutex_enter(&blp->bl_trilllock);
1913 if ((tdp = blp->bl_trilldata) != NULL) {
1914 blp->bl_trillthreads++;
1915 mutex_exit(&blp->bl_trilllock);
1916 update_header(mp, hdr_info, B_FALSE);
1917
1918 /*
1919 * All trill data frames have
1920 * Inner.VLAN.
1921 */
1922 mp = reform_vlan_header(mp, vlanid, tci, 0);
1923
1924 if (mp == NULL) {
1925 KIINCR(bki_drops);
1926 goto done;
1927 }
1928
1929 trill_encap_fn(tdp, blp, hdr_info, mp,
1930 bfp->bf_trill_nick);
1931
1932 done:
1933 mutex_enter(&blp->bl_trilllock);
1934 if (--blp->bl_trillthreads == 0 &&
1935 blp->bl_trilldata == NULL)
1936 cv_broadcast(&blp->bl_trillwait);
1937 }
1938 mutex_exit(&blp->bl_trilllock);
1939
1940 /* if TRILL has been disabled, then kill this stray */
1941 if (tdp == NULL) {
1942 freemsg(mp);
1943 fwd_delete(bfp);
1944 }
1945 fwd_unref(bfp);
1946 return (NULL);
1947 }
1948
1949 /* find first link we can send on */
1950 for (i = 0; i < bfp->bf_nlinks; i++) {
1951 blpsend = bfp->bf_links[i];
1952 if (blpsend == blp)
1953 selfseen = B_TRUE;
1954 else if (bridge_can_send(blpsend, vlanid))
1955 break;
1956 }
1957
1958 while (i < bfp->bf_nlinks) {
1959 blpsend = bfp->bf_links[i];
1960 for (i++; i < bfp->bf_nlinks; i++) {
1961 blpnext = bfp->bf_links[i];
1962 if (blpnext == blp)
1963 selfseen = B_TRUE;
1964 else if (bridge_can_send(blpnext, vlanid))
1965 break;
1966 }
1967 if (i == bfp->bf_nlinks && !selfseen) {
1968 mpsend = mp;
1969 mp = NULL;
1970 } else {
1971 mpsend = copymsg(mp);
1972 }
1973
1974 mpsend = reform_vlan_header(mpsend, vlanid, tci,
1975 blpsend->bl_pvid);
1976
1977 if (mpsend == NULL) {
1978 KIINCR(bki_drops);
1979 continue;
1980 }
1981
1982 KIINCR(bki_forwards);
1983
1984 /*
1985 * No need to bump up the link reference count, as
1986 * the forwarding entry itself holds a reference to
1987 * the link.
1988 */
1989 if (bfp->bf_flags & BFF_LOCALADDR) {
1990 mac_rx_common(blpsend->bl_mh, NULL, mpsend);
1991 } else {
1992 KLPINCR(blpsend, bkl_xmit);
1993 mpsend = mac_ring_tx(blpsend->bl_mh, NULL,
1994 mpsend);
1995 freemsg(mpsend);
1996 }
1997 }
1998
1999 /*
2000 * Handle a special case: if we're transmitting to the original
2001 * link, then check whether the localaddr flag is set. If it
2002 * is, then receive instead. This doesn't happen with ordinary
2003 * bridging, but does happen often with TRILL decapsulation.
2004 */
2005 if (mp != NULL && is_xmit && (bfp->bf_flags & BFF_LOCALADDR)) {
2006 mac_rx_common(blp->bl_mh, NULL, mp);
2007 mp = NULL;
2008 }
2009 fwd_unref(bfp);
2010 } else {
2011 /*
2012 * TRILL has two cases to handle. If the packet is off the
2013 * wire (not from TRILL), then we need to send up into the
2014 * TRILL module to have the distribution tree computed. If the
2015 * packet is from TRILL (decapsulated), then we're part of the
2016 * distribution tree, and we need to copy the packet on member
2017 * interfaces.
2018 *
2019 * Thus, the from TRILL case is identical to the STP case.
2020 */
2021 if (!from_trill && blp->bl_trilldata != NULL) {
2022 mutex_enter(&blp->bl_trilllock);
2023 if ((tdp = blp->bl_trilldata) != NULL) {
2024 blp->bl_trillthreads++;
2025 mutex_exit(&blp->bl_trilllock);
2026 if ((mpsend = copymsg(mp)) != NULL) {
2027 update_header(mpsend,
2028 hdr_info, B_FALSE);
2029 /*
2030 * all trill data frames have
2031 * Inner.VLAN
2032 */
2033 mpsend = reform_vlan_header(mpsend,
2034 vlanid, tci, 0);
2035 if (mpsend == NULL) {
2036 KIINCR(bki_drops);
2037 } else {
2038 trill_encap_fn(tdp, blp,
2039 hdr_info, mpsend,
2040 RBRIDGE_NICKNAME_NONE);
2041 }
2042 }
2043 mutex_enter(&blp->bl_trilllock);
2044 if (--blp->bl_trillthreads == 0 &&
2045 blp->bl_trilldata == NULL)
2046 cv_broadcast(&blp->bl_trillwait);
2047 }
2048 mutex_exit(&blp->bl_trilllock);
2049 }
2050
2051 /*
2052 * This is an unknown destination, so flood.
2053 */
2054 rw_enter(&bip->bi_rwlock, RW_READER);
2055 for (blpnext = list_head(&bip->bi_links); blpnext != NULL;
2056 blpnext = list_next(&bip->bi_links, blpnext)) {
2057 if (blpnext == blp)
2058 selfseen = B_TRUE;
2059 else if (bridge_can_send(blpnext, vlanid))
2060 break;
2061 }
2062 if (blpnext != NULL)
2063 atomic_inc_uint(&blpnext->bl_refs);
2064 rw_exit(&bip->bi_rwlock);
2065 while ((blpsend = blpnext) != NULL) {
2066 rw_enter(&bip->bi_rwlock, RW_READER);
2067 for (blpnext = list_next(&bip->bi_links, blpsend);
2068 blpnext != NULL;
2069 blpnext = list_next(&bip->bi_links, blpnext)) {
2070 if (blpnext == blp)
2071 selfseen = B_TRUE;
2072 else if (bridge_can_send(blpnext, vlanid))
2073 break;
2074 }
2075 if (blpnext != NULL)
2076 atomic_inc_uint(&blpnext->bl_refs);
2077 rw_exit(&bip->bi_rwlock);
2078 if (blpnext == NULL && !selfseen) {
2079 mpsend = mp;
2080 mp = NULL;
2081 } else {
2082 mpsend = copymsg(mp);
2083 }
2084
2085 mpsend = reform_vlan_header(mpsend, vlanid, tci,
2086 blpsend->bl_pvid);
2087
2088 if (mpsend == NULL) {
2089 KIINCR(bki_drops);
2090 continue;
2091 }
2092
2093 if (hdr_info->mhi_dsttype == MAC_ADDRTYPE_UNICAST)
2094 KIINCR(bki_unknown);
2095 else
2096 KIINCR(bki_mbcast);
2097
2098 KLPINCR(blpsend, bkl_xmit);
2099 if ((mpcopy = copymsg(mpsend)) != NULL) {
2100 mac_rx_common(blpsend->bl_mh, NULL, mpcopy);
2101 }
2102
2103 mpsend = mac_ring_tx(blpsend->bl_mh, NULL, mpsend);
2104 freemsg(mpsend);
2105 link_unref(blpsend);
2106 }
2107 }
2108
2109 /*
2110 * At this point, if mp is non-NULL, it means that the caller needs to
2111 * continue on the selected link.
2112 */
2113 return (mp);
2114 }
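/*
 * A minimal sketch (illustrative only) of the copy-vs-consume pattern used by
 * the forwarding loops above: each eligible output link gets its own
 * copymsg() of the packet, except that the final target may take over the
 * original chain when the ingress link itself will not also need it
 * (!selfseen), saving one allocation on the common path.  "no_more_targets"
 * below stands in for the loop's end-of-list test:
 *
 *	if (no_more_targets && !selfseen) {
 *		mpsend = mp;		(hand off the original chain)
 *		mp = NULL;
 *	} else {
 *		mpsend = copymsg(mp);	(all other targets get a copy)
 *	}
 */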
2115
2116 /*
2117 * Extract and validate the VLAN information for a given packet. This checks
2118 * conformance with the rules for use of the PVID on the link, and for the
2119 * allowed (configured) VLAN set.
2120 *
2121 * Returns B_TRUE if the packet passes, B_FALSE if it fails.
2122 */
2123 static boolean_t
2124 bridge_get_vlan(bridge_link_t *blp, mac_header_info_t *hdr_info, mblk_t *mp,
2125 uint16_t *vlanidp, uint16_t *tcip)
2126 {
2127 uint16_t tci, vlanid;
2128
2129 if (hdr_info->mhi_bindsap == ETHERTYPE_VLAN) {
2130 ptrdiff_t tpos = offsetof(struct ether_vlan_header, ether_tci);
2131 ptrdiff_t mlen;
2132
2133 /*
2134 * Extract the VLAN ID information, regardless of alignment,
2135 * and without a pullup. This isn't attractive, but we do this
2136 * to avoid having to deal with the pointers stashed in
2137 * hdr_info moving around or having the caller deal with a new
2138 * mblk_t pointer.
2139 */
2140 while (mp != NULL) {
2141 mlen = MBLKL(mp);
2142 if (mlen > tpos && mlen > 0)
2143 break;
2144 tpos -= mlen;
2145 mp = mp->b_cont;
2146 }
2147 if (mp == NULL)
2148 return (B_FALSE);
2149 tci = mp->b_rptr[tpos] << 8;
2150 if (++tpos >= mlen) {
2151 do {
2152 mp = mp->b_cont;
2153 } while (mp != NULL && MBLKL(mp) == 0);
2154 if (mp == NULL)
2155 return (B_FALSE);
2156 tpos = 0;
2157 }
2158 tci |= mp->b_rptr[tpos];
2159
2160 vlanid = VLAN_ID(tci);
2161 if (VLAN_CFI(tci) != ETHER_CFI || vlanid > VLAN_ID_MAX)
2162 return (B_FALSE);
2163 if (vlanid == VLAN_ID_NONE || vlanid == blp->bl_pvid)
2164 goto input_no_vlan;
2165 if (!BRIDGE_VLAN_ISSET(blp, vlanid))
2166 return (B_FALSE);
2167 } else {
2168 tci = 0xFFFF;
2169 input_no_vlan:
2170 /*
2171 * If PVID is set to zero, then untagged traffic is not
2172 * supported here. Do not learn or forward.
2173 */
2174 if ((vlanid = blp->bl_pvid) == VLAN_ID_NONE)
2175 return (B_FALSE);
2176 }
2177
2178 *tcip = tci;
2179 *vlanidp = vlanid;
2180 return (B_TRUE);
2181 }
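/*
 * Worked example (not from the source): with bl_pvid = 5 and VLAN 10 enabled
 * on the link, an untagged frame classifies as vlanid 5 with tci 0xFFFF, a
 * frame tagged with VLAN 5 (or priority-tagged with VLAN 0) also classifies
 * as vlanid 5, a frame tagged with VLAN 10 classifies as vlanid 10, and a
 * frame tagged with any other VLAN is rejected.  With bl_pvid = 0, untagged
 * and priority-tagged frames are rejected as well.
 */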
2182
2183 /*
2184 * Handle MAC notifications.
2185 */
2186 static void
2187 bridge_notify_cb(void *arg, mac_notify_type_t note_type)
2188 {
2189 bridge_link_t *blp = arg;
2190
2191 switch (note_type) {
2192 case MAC_NOTE_UNICST:
2193 bridge_new_unicst(blp);
2194 break;
2195
2196 case MAC_NOTE_SDU_SIZE: {
2197 uint_t maxsdu;
2198 bridge_inst_t *bip = blp->bl_inst;
2199 bridge_mac_t *bmp = bip->bi_mac;
2200 boolean_t notify = B_FALSE;
2201 mblk_t *mlist = NULL;
2202
2203 mac_sdu_get(blp->bl_mh, NULL, &maxsdu);
2204 rw_enter(&bip->bi_rwlock, RW_READER);
2205 if (list_prev(&bip->bi_links, blp) == NULL &&
2206 list_next(&bip->bi_links, blp) == NULL) {
2207 notify = (maxsdu != bmp->bm_maxsdu);
2208 bmp->bm_maxsdu = maxsdu;
2209 }
2210 blp->bl_maxsdu = maxsdu;
2211 if (maxsdu != bmp->bm_maxsdu)
2212 link_sdu_fail(blp, B_TRUE, &mlist);
2213 else if (notify)
2214 (void) mac_maxsdu_update(bmp->bm_mh, maxsdu);
2215 rw_exit(&bip->bi_rwlock);
2216 send_up_messages(bip, mlist);
2217 break;
2218 }
2219 }
2220 }
2221
2222 /*
2223 * This is called by the MAC layer. As with the transmit side, we're right in
2224 * the data path for all I/O on this port, so if we don't need to forward this
2225 * packet anywhere, we have to send it upwards via mac_rx_common.
2226 */
2227 static void
2228 bridge_recv_cb(mac_handle_t mh, mac_resource_handle_t rsrc, mblk_t *mpnext)
2229 {
2230 mblk_t *mp, *mpcopy;
2231 bridge_link_t *blp = (bridge_link_t *)mh;
2232 bridge_inst_t *bip = blp->bl_inst;
2233 bridge_mac_t *bmp = bip->bi_mac;
2234 mac_header_info_t hdr_info;
2235 uint16_t vlanid, tci;
2236 boolean_t trillmode = B_FALSE;
2237
2238 KIINCR(bki_recv);
2239 KLINCR(bkl_recv);
2240
2241 /*
2242 * Regardless of state, check for inbound TRILL packets when TRILL is
2243 * active. These are pulled out of band and sent for TRILL handling.
2244 */
2245 if (blp->bl_trilldata != NULL) {
2246 void *tdp;
2247 mblk_t *newhead;
2248 mblk_t *tail = NULL;
2249
2250 mutex_enter(&blp->bl_trilllock);
2251 if ((tdp = blp->bl_trilldata) != NULL) {
2252 blp->bl_trillthreads++;
2253 mutex_exit(&blp->bl_trilllock);
2254 trillmode = B_TRUE;
2255 newhead = mpnext;
2256 while ((mp = mpnext) != NULL) {
2257 boolean_t raw_isis, bridge_group;
2258
2259 mpnext = mp->b_next;
2260
2261 /*
2262 * If the header isn't readable, then leave on
2263 * the list and continue.
2264 */
2265 if (mac_header_info(blp->bl_mh, mp,
2266 &hdr_info) != 0) {
2267 tail = mp;
2268 continue;
2269 }
2270
2271 /*
2272 * The TRILL document specifies that, on
2273 * Ethernet alone, IS-IS packets arrive with
2274 * LLC rather than Ethertype, and use a
2275 * specific destination address. We must check
2276 * for that here. Also, we need to give BPDUs
2277 * to TRILL for processing.
2278 */
2279 raw_isis = bridge_group = B_FALSE;
2280 if (hdr_info.mhi_dsttype ==
2281 MAC_ADDRTYPE_MULTICAST) {
2282 if (memcmp(hdr_info.mhi_daddr,
2283 all_isis_rbridges, ETHERADDRL) == 0)
2284 raw_isis = B_TRUE;
2285 else if (memcmp(hdr_info.mhi_daddr,
2286 bridge_group_address, ETHERADDRL) ==
2287 0)
2288 bridge_group = B_TRUE;
2289 }
2290 if (!raw_isis && !bridge_group &&
2291 hdr_info.mhi_bindsap != ETHERTYPE_TRILL &&
2292 (hdr_info.mhi_bindsap != ETHERTYPE_VLAN ||
2293 /* LINTED: alignment */
2294 ((struct ether_vlan_header *)mp->b_rptr)->
2295 ether_type != htons(ETHERTYPE_TRILL))) {
2296 tail = mp;
2297 continue;
2298 }
2299
2300 /*
2301 * We've got TRILL input. Remove from the list
2302 * and send up through the TRILL module. (Send
2303 * a copy through promiscuous receive just to
2304 * support snooping on TRILL. Order isn't
2305 * preserved strictly, but that doesn't matter
2306 * here.)
2307 */
2308 if (tail != NULL)
2309 tail->b_next = mpnext;
2310 mp->b_next = NULL;
2311 if (mp == newhead)
2312 newhead = mpnext;
2313 mac_trill_snoop(blp->bl_mh, mp);
2314 update_header(mp, &hdr_info, B_TRUE);
2315 /*
2316 * On raw IS-IS and BPDU frames, we have to
2317 * make sure that the length is trimmed
2318 * properly. We use origsap in order to cope
2319 * with jumbograms for IS-IS. (Regular mac
2320 * can't.)
2321 */
2322 if (raw_isis || bridge_group) {
2323 size_t msglen = msgdsize(mp);
2324
2325 if (msglen > hdr_info.mhi_origsap) {
2326 (void) adjmsg(mp,
2327 hdr_info.mhi_origsap -
2328 msglen);
2329 } else if (msglen <
2330 hdr_info.mhi_origsap) {
2331 freemsg(mp);
2332 continue;
2333 }
2334 }
2335 trill_recv_fn(tdp, blp, rsrc, mp, &hdr_info);
2336 }
2337 mpnext = newhead;
2338 mutex_enter(&blp->bl_trilllock);
2339 if (--blp->bl_trillthreads == 0 &&
2340 blp->bl_trilldata == NULL)
2341 cv_broadcast(&blp->bl_trillwait);
2342 }
2343 mutex_exit(&blp->bl_trilllock);
2344 if (mpnext == NULL)
2345 return;
2346 }
2347
2348 /*
2349 * If this is a TRILL RBridge, then just check whether this link is
2350 * used at all for forwarding. If not, then we're done.
2351 */
2352 if (trillmode) {
2353 if (!(blp->bl_flags & BLF_TRILLACTIVE) ||
2354 (blp->bl_flags & BLF_SDUFAIL)) {
2355 mac_rx_common(blp->bl_mh, rsrc, mpnext);
2356 return;
2357 }
2358 } else {
2359 /*
2360 * For regular (STP) bridges, if we're in blocking or listening
2361 * state, then do nothing. We don't learn or forward until
2362 * told to do so.
2363 */
2364 if (blp->bl_state == BLS_BLOCKLISTEN) {
2365 mac_rx_common(blp->bl_mh, rsrc, mpnext);
2366 return;
2367 }
2368 }
2369
2370 /*
2371 * Send a copy of the message chain up to the observability node users.
2372 * For TRILL, we must obey the VLAN AF rules, so we go packet-by-
2373 * packet.
2374 */
2375 if (!trillmode && blp->bl_state == BLS_FORWARDING &&
2376 (bmp->bm_flags & BMF_STARTED) &&
2377 (mp = copymsgchain(mpnext)) != NULL) {
2378 mac_rx(bmp->bm_mh, NULL, mp);
2379 }
2380
2381 /*
2382 * We must be in learning or forwarding state, or using TRILL on a link
2383 * with one or more VLANs active. For each packet in the list, process
2384 * the source address, and then attempt to forward.
2385 */
2386 while ((mp = mpnext) != NULL) {
2387 mpnext = mp->b_next;
2388 mp->b_next = NULL;
2389
2390 /*
2391 * If we can't decode the header or if the header specifies a
2392 * multicast source address (impossible!), then don't bother
2393 * learning or forwarding, but go ahead and send it up the
2394 * stack for subsequent processing.
2395 */
2396 if (mac_header_info(blp->bl_mh, mp, &hdr_info) != 0 ||
2397 (hdr_info.mhi_saddr[0] & 1) != 0) {
2398 KIINCR(bki_drops);
2399 KLINCR(bkl_drops);
2400 mac_rx_common(blp->bl_mh, rsrc, mp);
2401 continue;
2402 }
2403
2404 /*
2405 * Extract and validate the VLAN ID for this packet.
2406 */
2407 if (!bridge_get_vlan(blp, &hdr_info, mp, &vlanid, &tci) ||
2408 !BRIDGE_AF_ISSET(blp, vlanid)) {
2409 mac_rx_common(blp->bl_mh, rsrc, mp);
2410 continue;
2411 }
2412
2413 if (trillmode) {
2414 /*
2415 * Special test required by TRILL document: must
2416 * discard frames with outer address set to ESADI.
2417 */
2418 if (memcmp(hdr_info.mhi_daddr, all_esadi_rbridges,
2419 ETHERADDRL) == 0) {
2420 mac_rx_common(blp->bl_mh, rsrc, mp);
2421 continue;
2422 }
2423
2424 /*
2425 * If we're in TRILL mode, then the call above to get
2426 * the VLAN ID has also checked that we're the
2427 * appointed forwarder, so report that we're handling
2428 * this packet to any observability node users.
2429 */
2430 if ((bmp->bm_flags & BMF_STARTED) &&
2431 (mpcopy = copymsg(mp)) != NULL)
2432 mac_rx(bmp->bm_mh, NULL, mpcopy);
2433 }
2434
2435 /*
2436 * First process the source address and learn from it. For
2437 * TRILL, we learn only if we're the appointed forwarder.
2438 */
2439 bridge_learn(blp, hdr_info.mhi_saddr, RBRIDGE_NICKNAME_NONE,
2440 vlanid);
2441
2442 /*
2443 * Now check whether we're forwarding and look up the
2444 * destination. If we can forward, do so.
2445 */
2446 if (trillmode || blp->bl_state == BLS_FORWARDING) {
2447 mp = bridge_forward(blp, &hdr_info, mp, vlanid, tci,
2448 B_FALSE, B_FALSE);
2449 }
2450 if (mp != NULL)
2451 mac_rx_common(blp->bl_mh, rsrc, mp);
2452 }
2453 }
2454
2455
2456 /* ARGSUSED */
2457 static mblk_t *
2458 bridge_xmit_cb(mac_handle_t mh, mac_ring_handle_t rh, mblk_t *mpnext)
2459 {
2460 bridge_link_t *blp = (bridge_link_t *)mh;
2461 bridge_inst_t *bip = blp->bl_inst;
2462 bridge_mac_t *bmp = bip->bi_mac;
2463 mac_header_info_t hdr_info;
2464 uint16_t vlanid, tci;
2465 mblk_t *mp, *mpcopy;
2466 boolean_t trillmode;
2467
2468 trillmode = blp->bl_trilldata != NULL;
2469
2470 /*
2471 * If we're using STP and we're in blocking or listening state, or if
2472 * we're using TRILL and no VLANs are active, then behave as though the
2473 * bridge isn't here at all, and send on the local link alone.
2474 */
2475 if ((!trillmode && blp->bl_state == BLS_BLOCKLISTEN) ||
2476 (trillmode &&
2477 (!(blp->bl_flags & BLF_TRILLACTIVE) ||
2478 (blp->bl_flags & BLF_SDUFAIL)))) {
2479 KIINCR(bki_sent);
2480 KLINCR(bkl_xmit);
2481 mp = mac_ring_tx(blp->bl_mh, rh, mpnext);
2482 return (mp);
2483 }
2484
2485 /*
2486 * Send a copy of the message up to the observability node users.
2487 * TRILL needs to check on a packet-by-packet basis.
2488 */
2489 if (!trillmode && blp->bl_state == BLS_FORWARDING &&
2490 (bmp->bm_flags & BMF_STARTED) &&
2491 (mp = copymsgchain(mpnext)) != NULL) {
2492 mac_rx(bmp->bm_mh, NULL, mp);
2493 }
2494
2495 while ((mp = mpnext) != NULL) {
2496 mpnext = mp->b_next;
2497 mp->b_next = NULL;
2498
2499 if (mac_header_info(blp->bl_mh, mp, &hdr_info) != 0) {
2500 freemsg(mp);
2501 continue;
2502 }
2503
2504 /*
2505 * Extract and validate the VLAN ID for this packet.
2506 */
2507 if (!bridge_get_vlan(blp, &hdr_info, mp, &vlanid, &tci) ||
2508 !BRIDGE_AF_ISSET(blp, vlanid)) {
2509 freemsg(mp);
2510 continue;
2511 }
2512
2513 /*
2514 * If we're using TRILL, then we've now validated that we're
2515 * the forwarder for this VLAN, so go ahead and let
2516 * observability node users know about the packet.
2517 */
2518 if (trillmode && (bmp->bm_flags & BMF_STARTED) &&
2519 (mpcopy = copymsg(mp)) != NULL) {
2520 mac_rx(bmp->bm_mh, NULL, mpcopy);
2521 }
2522
2523 /*
2524 * We have to learn from our own transmitted packets, because
2525 * there may be a Solaris DLPI raw sender (which can specify its
2526 * own source address) using promiscuous mode for receive. The
2527 * mac layer information won't (and can't) tell us everything
2528 * we need to know.
2529 */
2530 bridge_learn(blp, hdr_info.mhi_saddr, RBRIDGE_NICKNAME_NONE,
2531 vlanid);
2532
2533 /* attempt forwarding */
2534 if (trillmode || blp->bl_state == BLS_FORWARDING) {
2535 mp = bridge_forward(blp, &hdr_info, mp, vlanid, tci,
2536 B_FALSE, B_TRUE);
2537 }
2538 if (mp != NULL) {
2539 mp = mac_ring_tx(blp->bl_mh, rh, mp);
2540 if (mp == NULL) {
2541 KIINCR(bki_sent);
2542 KLINCR(bkl_xmit);
2543 }
2544 }
2545 /*
2546 * If we get stuck, then stop. Don't let the user's output
2547 * packets get out of order. (More importantly: don't try to
2548 * bridge the same packet multiple times if flow control is
2549 * asserted.)
2550 */
2551 if (mp != NULL) {
2552 mp->b_next = mpnext;
2553 break;
2554 }
2555 }
2556 return (mp);
2557 }
2558
2559 /*
2560 * This is called by TRILL when it decapsulates a packet, and we must forward
2561 * locally. On failure, we just drop.
2562 *
2563 * Note that the ingress_nick reported by TRILL must not represent this local
2564 * node.
2565 */
2566 void
2567 bridge_trill_decaps(bridge_link_t *blp, mblk_t *mp, uint16_t ingress_nick)
2568 {
2569 mac_header_info_t hdr_info;
2570 uint16_t vlanid, tci;
2571 bridge_inst_t *bip = blp->bl_inst; /* used by macros */
2572 mblk_t *mpcopy;
2573
2574 if (mac_header_info(blp->bl_mh, mp, &hdr_info) != 0) {
2575 freemsg(mp);
2576 return;
2577 }
2578
2579 /* Extract VLAN ID for this packet. */
2580 if (hdr_info.mhi_bindsap == ETHERTYPE_VLAN) {
2581 struct ether_vlan_header *evhp;
2582
2583 /* LINTED: alignment */
2584 evhp = (struct ether_vlan_header *)mp->b_rptr;
2585 tci = ntohs(evhp->ether_tci);
2586 vlanid = VLAN_ID(tci);
2587 } else {
2588 /* Inner VLAN headers are required in TRILL data packets */
2589 DTRACE_PROBE3(bridge__trill__decaps__novlan, bridge_link_t *,
2590 blp, mblk_t *, mp, uint16_t, ingress_nick);
2591 freemsg(mp);
2592 return;
2593 }
2594
2595 /* Learn the location of this sender in the RBridge network */
2596 bridge_learn(blp, hdr_info.mhi_saddr, ingress_nick, vlanid);
2597
2598 /* attempt forwarding */
2599 mp = bridge_forward(blp, &hdr_info, mp, vlanid, tci, B_TRUE, B_TRUE);
2600 if (mp != NULL) {
2601 if (bridge_can_send(blp, vlanid)) {
2602 /* Deliver a copy locally as well */
2603 if ((mpcopy = copymsg(mp)) != NULL)
2604 mac_rx_common(blp->bl_mh, NULL, mpcopy);
2605 mp = mac_ring_tx(blp->bl_mh, NULL, mp);
2606 }
2607 if (mp == NULL) {
2608 KIINCR(bki_sent);
2609 KLINCR(bkl_xmit);
2610 } else {
2611 freemsg(mp);
2612 }
2613 }
2614 }
2615
2616 /*
2617 * This function is used by TRILL _only_ to transmit TRILL-encapsulated
2618 * packets. It sends on a single underlying link and does not bridge.
2619 */
2620 mblk_t *
2621 bridge_trill_output(bridge_link_t *blp, mblk_t *mp)
2622 {
2623 bridge_inst_t *bip = blp->bl_inst; /* used by macros */
2624
2625 mac_trill_snoop(blp->bl_mh, mp);
2626 mp = mac_ring_tx(blp->bl_mh, NULL, mp);
2627 if (mp == NULL) {
2628 KIINCR(bki_sent);
2629 KLINCR(bkl_xmit);
2630 }
2631 return (mp);
2632 }
2633
2634 /*
2635 * Set the "appointed forwarder" flag array for this link. TRILL controls
2636 * forwarding on a VLAN basis. The "trillactive" flag is an optimization for
2637 * the forwarder.
2638 */
2639 void
2640 bridge_trill_setvlans(bridge_link_t *blp, const uint8_t *arr)
2641 {
2642 int i;
2643 uint_t newflags = 0;
2644
2645 for (i = 0; i < BRIDGE_VLAN_ARR_SIZE; i++) {
2646 if ((blp->bl_afs[i] = arr[i]) != 0)
2647 newflags = BLF_TRILLACTIVE;
2648 }
2649 blp->bl_flags = (blp->bl_flags & ~BLF_TRILLACTIVE) | newflags;
2650 }
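/*
 * Sketch of how a caller might build the 'arr' argument, assuming the
 * conventional one-bit-per-VLAN-ID, NBBY-bits-per-byte layout that
 * BRIDGE_AF_ISSET() tests:
 *
 *	uint8_t afs[BRIDGE_VLAN_ARR_SIZE] = { 0 };
 *
 *	afs[10 / NBBY] |= 1 << (10 % NBBY);	(appointed for VLAN 10)
 *	afs[20 / NBBY] |= 1 << (20 % NBBY);	(... and VLAN 20)
 *	bridge_trill_setvlans(blp, afs);
 *
 * Passing an all-zero array clears BLF_TRILLACTIVE, so the link stops
 * bridging until reappointed.
 */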
2651
2652 void
2653 bridge_trill_flush(bridge_link_t *blp, uint16_t vlan, boolean_t dotrill)
2654 {
2655 bridge_inst_t *bip = blp->bl_inst;
2656 bridge_fwd_t *bfp, *bfnext;
2657 avl_tree_t fwd_scavenge;
2658 int i;
2659
2660 _NOTE(ARGUNUSED(vlan));
2661
2662 avl_create(&fwd_scavenge, fwd_compare, sizeof (bridge_fwd_t),
2663 offsetof(bridge_fwd_t, bf_node));
2664 rw_enter(&bip->bi_rwlock, RW_WRITER);
2665 bfnext = avl_first(&bip->bi_fwd);
2666 while ((bfp = bfnext) != NULL) {
2667 bfnext = AVL_NEXT(&bip->bi_fwd, bfp);
2668 if (bfp->bf_flags & BFF_LOCALADDR)
2669 continue;
2670 if (dotrill) {
2671 /* port doesn't matter if we're flushing TRILL */
2672 if (bfp->bf_trill_nick == RBRIDGE_NICKNAME_NONE)
2673 continue;
2674 } else {
2675 if (bfp->bf_trill_nick != RBRIDGE_NICKNAME_NONE)
2676 continue;
2677 for (i = 0; i < bfp->bf_nlinks; i++) {
2678 if (bfp->bf_links[i] == blp)
2679 break;
2680 }
2681 if (i >= bfp->bf_nlinks)
2682 continue;
2683 }
2684 ASSERT(bfp->bf_flags & BFF_INTREE);
2685 avl_remove(&bip->bi_fwd, bfp);
2686 bfp->bf_flags &= ~BFF_INTREE;
2687 avl_add(&fwd_scavenge, bfp);
2688 }
2689 rw_exit(&bip->bi_rwlock);
2690 bfnext = avl_first(&fwd_scavenge);
2691 while ((bfp = bfnext) != NULL) {
2692 bfnext = AVL_NEXT(&fwd_scavenge, bfp);
2693 avl_remove(&fwd_scavenge, bfp);
2694 fwd_unref(bfp);
2695 }
2696 avl_destroy(&fwd_scavenge);
2697 }
2698
2699 /*
2700 * Let the mac module take or drop a reference to a bridge link. When this is
2701 * called, the mac module is holding the mi_bridge_lock, so the link cannot be
2702 * in the process of entering or leaving a bridge.
2703 */
2704 static void
2705 bridge_ref_cb(mac_handle_t mh, boolean_t hold)
2706 {
2707 bridge_link_t *blp = (bridge_link_t *)mh;
2708
2709 if (hold)
2710 atomic_inc_uint(&blp->bl_refs);
2711 else
2712 link_unref(blp);
2713 }
2714
2715 /*
2716 * Handle link state changes reported by the mac layer. This acts as a filter
2717 * for link state changes: if a link is reporting down, but there are other
2718 * links still up on the bridge, then the state is changed to "up." When the
2719 * last link goes down, all are marked down, and when the first link goes up,
2720 * all are marked up. (Recursion is avoided by the use of the "redo" function.)
2721 *
2722 * We treat unknown as equivalent to "up."
2723 */
2724 static link_state_t
2725 bridge_ls_cb(mac_handle_t mh, link_state_t newls)
2726 {
2727 bridge_link_t *blp = (bridge_link_t *)mh;
2728 bridge_link_t *blcmp;
2729 bridge_inst_t *bip;
2730 bridge_mac_t *bmp;
2731
2732 if (newls != LINK_STATE_DOWN && blp->bl_linkstate != LINK_STATE_DOWN ||
2733 (blp->bl_flags & (BLF_DELETED|BLF_SDUFAIL))) {
2734 blp->bl_linkstate = newls;
2735 return (newls);
2736 }
2737
2738 /*
2739 * Scan first to see if there are any other non-down links. If there
2740 * are, then we're done. Otherwise, if all others are down, then the
2741 * state of this link is the state of the bridge.
2742 */
2743 bip = blp->bl_inst;
2744 rw_enter(&bip->bi_rwlock, RW_WRITER);
2745 for (blcmp = list_head(&bip->bi_links); blcmp != NULL;
2746 blcmp = list_next(&bip->bi_links, blcmp)) {
2747 if (blcmp != blp &&
2748 !(blcmp->bl_flags & (BLF_DELETED|BLF_SDUFAIL)) &&
2749 blcmp->bl_linkstate != LINK_STATE_DOWN)
2750 break;
2751 }
2752
2753 if (blcmp != NULL) {
2754 /*
2755 * If there are other links that are considered up, then tell
2756 * the caller that the link is actually still up, regardless of
2757 * this link's underlying state.
2758 */
2759 blp->bl_linkstate = newls;
2760 newls = LINK_STATE_UP;
2761 } else if (blp->bl_linkstate != newls) {
2762 /*
2763 * If we've found no other 'up' links, and this link has
2764 * changed state, then report the new state of the bridge to
2765 * all other clients.
2766 */
2767 blp->bl_linkstate = newls;
2768 for (blcmp = list_head(&bip->bi_links); blcmp != NULL;
2769 blcmp = list_next(&bip->bi_links, blcmp)) {
2770 if (blcmp != blp && !(blcmp->bl_flags & BLF_DELETED))
2771 mac_link_redo(blcmp->bl_mh, newls);
2772 }
2773 bmp = bip->bi_mac;
2774 if ((bmp->bm_linkstate = newls) != LINK_STATE_DOWN)
2775 bmp->bm_linkstate = LINK_STATE_UP;
2776 mac_link_redo(bmp->bm_mh, bmp->bm_linkstate);
2777 }
2778 rw_exit(&bip->bi_rwlock);
2779 return (newls);
2780 }
2781
2782 static void
2783 bridge_add_link(void *arg)
2784 {
2785 mblk_t *mp = arg;
2786 bridge_stream_t *bsp;
2787 bridge_inst_t *bip, *bipt;
2788 bridge_mac_t *bmp;
2789 datalink_id_t linkid;
2790 int err;
2791 mac_handle_t mh;
2792 uint_t maxsdu;
2793 bridge_link_t *blp = NULL, *blpt;
2794 const mac_info_t *mip;
2795 boolean_t macopen = B_FALSE;
2796 char linkname[MAXLINKNAMELEN];
2797 char kstatname[KSTAT_STRLEN];
2798 int i;
2799 link_state_t linkstate;
2800 mblk_t *mlist;
2801
2802 bsp = (bridge_stream_t *)mp->b_next;
2803 mp->b_next = NULL;
2804 bip = bsp->bs_inst;
2805 /* LINTED: alignment */
2806 linkid = *(datalink_id_t *)mp->b_cont->b_rptr;
2807
2808 /*
2809 * First make sure that there is no other bridge that has this link.
2810 * We don't want to overlap operations from two bridges; the MAC layer
2811 * supports only one bridge on a given MAC at a time.
2812 *
2813 * We rely on the fact that there's just one taskq thread for the
2814 * bridging module: once we've checked for a duplicate, we can drop the
2815 * lock, because no other thread could possibly be adding another link
2816 * until we're done.
2817 */
2818 mutex_enter(&inst_lock);
2819 for (bipt = list_head(&inst_list); bipt != NULL;
2820 bipt = list_next(&inst_list, bipt)) {
2821 rw_enter(&bipt->bi_rwlock, RW_READER);
2822 for (blpt = list_head(&bipt->bi_links); blpt != NULL;
2823 blpt = list_next(&bipt->bi_links, blpt)) {
2824 if (linkid == blpt->bl_linkid)
2825 break;
2826 }
2827 rw_exit(&bipt->bi_rwlock);
2828 if (blpt != NULL)
2829 break;
2830 }
2831 mutex_exit(&inst_lock);
2832 if (bipt != NULL) {
2833 err = EBUSY;
2834 goto fail;
2835 }
2836
2837 if ((err = mac_open_by_linkid(linkid, &mh)) != 0)
2838 goto fail;
2839 macopen = B_TRUE;
2840
2841 /* we bridge only Ethernet */
2842 mip = mac_info(mh);
2843 if (mip->mi_media != DL_ETHER) {
2844 err = ENOTSUP;
2845 goto fail;
2846 }
2847
2848 /*
2849 * Get the current maximum SDU on this interface. If there are other
2850 * links on the bridge, then this one must match, or it errors out.
2851 * Otherwise, the first link becomes the standard for the new bridge.
2852 */
2853 mac_sdu_get(mh, NULL, &maxsdu);
2854 bmp = bip->bi_mac;
2855 if (list_is_empty(&bip->bi_links)) {
2856 bmp->bm_maxsdu = maxsdu;
2857 (void) mac_maxsdu_update(bmp->bm_mh, maxsdu);
2858 }
2859
2860 /* figure the kstat name; also used as the mac client name */
2861 i = MBLKL(mp->b_cont) - sizeof (datalink_id_t);
2862 if (i < 0 || i >= MAXLINKNAMELEN)
2863 i = MAXLINKNAMELEN - 1;
2864 bcopy(mp->b_cont->b_rptr + sizeof (datalink_id_t), linkname, i);
2865 linkname[i] = '\0';
2866 (void) snprintf(kstatname, sizeof (kstatname), "%s-%s", bip->bi_name,
2867 linkname);
2868
2869 if ((blp = kmem_zalloc(sizeof (*blp), KM_NOSLEEP)) == NULL) {
2870 err = ENOMEM;
2871 goto fail;
2872 }
2873 blp->bl_lfailmp = allocb(sizeof (bridge_ctl_t), BPRI_MED);
2874 if (blp->bl_lfailmp == NULL) {
2875 kmem_free(blp, sizeof (*blp));
2876 blp = NULL;
2877 err = ENOMEM;
2878 goto fail;
2879 }
2880
2881 blp->bl_refs = 1;
2882 atomic_inc_uint(&bip->bi_refs);
2883 blp->bl_inst = bip;
2884 blp->bl_mh = mh;
2885 blp->bl_linkid = linkid;
2886 blp->bl_maxsdu = maxsdu;
2887 cv_init(&blp->bl_trillwait, NULL, CV_DRIVER, NULL);
2888 mutex_init(&blp->bl_trilllock, NULL, MUTEX_DRIVER, NULL);
2889 (void) memset(blp->bl_afs, 0xff, sizeof (blp->bl_afs));
2890
2891 err = mac_client_open(mh, &blp->bl_mch, kstatname, 0);
2892 if (err != 0)
2893 goto fail;
2894 blp->bl_flags |= BLF_CLIENT_OPEN;
2895
2896 err = mac_margin_add(mh, &blp->bl_margin, B_TRUE);
2897 if (err != 0)
2898 goto fail;
2899 blp->bl_flags |= BLF_MARGIN_ADDED;
2900
2901 blp->bl_mnh = mac_notify_add(mh, bridge_notify_cb, blp);
2902
2903 /* Enable Bridging on the link */
2904 err = mac_bridge_set(mh, (mac_handle_t)blp);
2905 if (err != 0)
2906 goto fail;
2907 blp->bl_flags |= BLF_SET_BRIDGE;
2908
2909 err = mac_promisc_add(blp->bl_mch, MAC_CLIENT_PROMISC_ALL, NULL,
2910 blp, &blp->bl_mphp, MAC_PROMISC_FLAGS_NO_TX_LOOP);
2911 if (err != 0)
2912 goto fail;
2913 blp->bl_flags |= BLF_PROM_ADDED;
2914
2915 bridge_new_unicst(blp);
2916
2917 blp->bl_ksp = kstat_setup((kstat_named_t *)&blp->bl_kstats,
2918 link_kstats_list, Dim(link_kstats_list), kstatname);
2919
2920 /*
2921 * The link holds a reference to the bridge instance, so that the
2922 * instance can't go away before the link is freed. The insertion into
2923 * bi_links holds a reference on the link (reference set to 1 above).
2924 * When marking as removed from bi_links (BLF_DELETED), drop the
2925 * reference on the link. When freeing the link, drop the reference on
2926 * the instance. BLF_LINK_ADDED tracks link insertion in bi_links list.
2927 */
2928 rw_enter(&bip->bi_rwlock, RW_WRITER);
2929 list_insert_tail(&bip->bi_links, blp);
2930 blp->bl_flags |= BLF_LINK_ADDED;
2931
2932 /*
2933 * If the new link is no good on this bridge, then let the daemon know
2934 * about the problem.
2935 */
2936 mlist = NULL;
2937 if (maxsdu != bmp->bm_maxsdu)
2938 link_sdu_fail(blp, B_TRUE, &mlist);
2939 rw_exit(&bip->bi_rwlock);
2940 send_up_messages(bip, mlist);
2941
2942 /*
2943 * Trigger a link state update so that if this link is the first one
2944 * "up" in the bridge, then we notify everyone. This triggers a trip
2945 * through bridge_ls_cb.
2946 */
2947 linkstate = mac_stat_get(mh, MAC_STAT_LOWLINK_STATE);
2948 blp->bl_linkstate = LINK_STATE_DOWN;
2949 mac_link_update(mh, linkstate);
2950
2951 /*
2952 * We now need to report back to the stream that invoked us, and then
2953 * drop the reference on the stream that we're holding.
2954 */
2955 miocack(bsp->bs_wq, mp, 0, 0);
2956 stream_unref(bsp);
2957 return;
2958
2959 fail:
2960 if (blp == NULL) {
2961 if (macopen)
2962 mac_close(mh);
2963 } else {
2964 link_shutdown(blp);
2965 }
2966 miocnak(bsp->bs_wq, mp, 0, err);
2967 stream_unref(bsp);
2968 }
2969
2970 static void
2971 bridge_rem_link(void *arg)
2972 {
2973 mblk_t *mp = arg;
2974 bridge_stream_t *bsp;
2975 bridge_inst_t *bip;
2976 bridge_mac_t *bmp;
2977 datalink_id_t linkid;
2978 bridge_link_t *blp, *blsave;
2979 boolean_t found;
2980 mblk_t *mlist;
2981
2982 bsp = (bridge_stream_t *)mp->b_next;
2983 mp->b_next = NULL;
2984 bip = bsp->bs_inst;
2985 /* LINTED: alignment */
2986 linkid = *(datalink_id_t *)mp->b_cont->b_rptr;
2987
2988 /*
2989 * We become reader here so that we can loop over the other links and
2990 * deliver link up/down notification.
2991 */
2992 rw_enter(&bip->bi_rwlock, RW_READER);
2993 found = B_FALSE;
2994 for (blp = list_head(&bip->bi_links); blp != NULL;
2995 blp = list_next(&bip->bi_links, blp)) {
2996 if (blp->bl_linkid == linkid &&
2997 !(blp->bl_flags & BLF_DELETED)) {
2998 blp->bl_flags |= BLF_DELETED;
2999 (void) ddi_taskq_dispatch(bridge_taskq, link_shutdown,
3000 blp, DDI_SLEEP);
3001 found = B_TRUE;
3002 break;
3003 }
3004 }
3005
3006 /*
3007 * Check if this link is up and the remainder of the links are all
3008 * down.
3009 */
3010 if (blp != NULL && blp->bl_linkstate != LINK_STATE_DOWN) {
3011 for (blp = list_head(&bip->bi_links); blp != NULL;
3012 blp = list_next(&bip->bi_links, blp)) {
3013 if (blp->bl_linkstate != LINK_STATE_DOWN &&
3014 !(blp->bl_flags & (BLF_DELETED|BLF_SDUFAIL)))
3015 break;
3016 }
3017 if (blp == NULL) {
3018 for (blp = list_head(&bip->bi_links); blp != NULL;
3019 blp = list_next(&bip->bi_links, blp)) {
3020 if (!(blp->bl_flags & BLF_DELETED))
3021 mac_link_redo(blp->bl_mh,
3022 LINK_STATE_DOWN);
3023 }
3024 bmp = bip->bi_mac;
3025 bmp->bm_linkstate = LINK_STATE_DOWN;
3026 mac_link_redo(bmp->bm_mh, LINK_STATE_DOWN);
3027 }
3028 }
3029
3030 /*
3031 * Check if there's just one working link left on the bridge. If so,
3032 * then that link is now authoritative for bridge MTU.
3033 */
3034 blsave = NULL;
3035 for (blp = list_head(&bip->bi_links); blp != NULL;
3036 blp = list_next(&bip->bi_links, blp)) {
3037 if (!(blp->bl_flags & BLF_DELETED)) {
3038 if (blsave == NULL)
3039 blsave = blp;
3040 else
3041 break;
3042 }
3043 }
3044 mlist = NULL;
3045 bmp = bip->bi_mac;
3046 if (blsave != NULL && blp == NULL &&
3047 blsave->bl_maxsdu != bmp->bm_maxsdu) {
3048 bmp->bm_maxsdu = blsave->bl_maxsdu;
3049 (void) mac_maxsdu_update(bmp->bm_mh, blsave->bl_maxsdu);
3050 link_sdu_fail(blsave, B_FALSE, &mlist);
3051 }
3052 rw_exit(&bip->bi_rwlock);
3053 send_up_messages(bip, mlist);
3054
3055 if (found)
3056 miocack(bsp->bs_wq, mp, 0, 0);
3057 else
3058 miocnak(bsp->bs_wq, mp, 0, ENOENT);
3059 stream_unref(bsp);
3060 }
3061
3062 /*
3063 * This function intentionally returns with bi_rwlock held; it is intended for
3064 * quick checks and updates.
3065 */
3066 static bridge_link_t *
3067 enter_link(bridge_inst_t *bip, datalink_id_t linkid)
3068 {
3069 bridge_link_t *blp;
3070
3071 rw_enter(&bip->bi_rwlock, RW_READER);
3072 for (blp = list_head(&bip->bi_links); blp != NULL;
3073 blp = list_next(&bip->bi_links, blp)) {
3074 if (blp->bl_linkid == linkid && !(blp->bl_flags & BLF_DELETED))
3075 break;
3076 }
3077 return (blp);
3078 }
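/*
 * Usage sketch (mirrors the BRIOC_SETSTATE handler below): because
 * enter_link() returns with bi_rwlock still held as reader, the caller must
 * drop the lock itself, on the not-found path as well.  "newstate" here is a
 * hypothetical value:
 *
 *	if ((blp = enter_link(bip, linkid)) == NULL)
 *		rc = ENOENT;
 *	else
 *		blp->bl_state = newstate;
 *	rw_exit(&bip->bi_rwlock);
 */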
3079
3080 static void
3081 bridge_ioctl(queue_t *wq, mblk_t *mp)
3082 {
3083 bridge_stream_t *bsp = wq->q_ptr;
3084 bridge_inst_t *bip;
3085 struct iocblk *iop;
3086 int rc = EINVAL;
3087 int len = 0;
3088 bridge_link_t *blp;
3089 cred_t *cr;
3090
3091 /* LINTED: alignment */
3092 iop = (struct iocblk *)mp->b_rptr;
3093
3094 /*
3095 * For now, all of the bridge ioctls are privileged.
3096 */
3097 if ((cr = msg_getcred(mp, NULL)) == NULL)
3098 cr = iop->ioc_cr;
3099 if (cr != NULL && secpolicy_net_config(cr, B_FALSE) != 0) {
3100 miocnak(wq, mp, 0, EPERM);
3101 return;
3102 }
3103
3104 switch (iop->ioc_cmd) {
3105 case BRIOC_NEWBRIDGE: {
3106 bridge_newbridge_t *bnb;
3107
3108 if (bsp->bs_inst != NULL ||
3109 (rc = miocpullup(mp, sizeof (bridge_newbridge_t))) != 0)
3110 break;
3111 /* LINTED: alignment */
3112 bnb = (bridge_newbridge_t *)mp->b_cont->b_rptr;
3113 bnb->bnb_name[MAXNAMELEN-1] = '\0';
3114 rc = bridge_create(bnb->bnb_linkid, bnb->bnb_name, &bip, cr);
3115 if (rc != 0)
3116 break;
3117
3118 rw_enter(&bip->bi_rwlock, RW_WRITER);
3119 if (bip->bi_control != NULL) {
3120 rw_exit(&bip->bi_rwlock);
3121 bridge_unref(bip);
3122 rc = EBUSY;
3123 } else {
3124 atomic_inc_uint(&bip->bi_refs);
3125 bsp->bs_inst = bip; /* stream holds reference */
3126 bip->bi_control = bsp;
3127 rw_exit(&bip->bi_rwlock);
3128 rc = 0;
3129 }
3130 break;
3131 }
3132
3133 case BRIOC_ADDLINK:
3134 if ((bip = bsp->bs_inst) == NULL ||
3135 (rc = miocpullup(mp, sizeof (datalink_id_t))) != 0)
3136 break;
3137 /*
3138 * We cannot perform the action in this thread, because we're
3139 * not in process context, and we may already be holding
3140 * MAC-related locks. Place the request on taskq.
3141 */
3142 mp->b_next = (mblk_t *)bsp;
3143 stream_ref(bsp);
3144 (void) ddi_taskq_dispatch(bridge_taskq, bridge_add_link, mp,
3145 DDI_SLEEP);
3146 return;
3147
3148 case BRIOC_REMLINK:
3149 if ((bip = bsp->bs_inst) == NULL ||
3150 (rc = miocpullup(mp, sizeof (datalink_id_t))) != 0)
3151 break;
3152 /*
3153 * We cannot perform the action in this thread, because we're
3154 * not in process context, and we may already be holding
3155 * MAC-related locks. Place the request on taskq.
3156 */
3157 mp->b_next = (mblk_t *)bsp;
3158 stream_ref(bsp);
3159 (void) ddi_taskq_dispatch(bridge_taskq, bridge_rem_link, mp,
3160 DDI_SLEEP);
3161 return;
3162
3163 case BRIOC_SETSTATE: {
3164 bridge_setstate_t *bss;
3165
3166 if ((bip = bsp->bs_inst) == NULL ||
3167 (rc = miocpullup(mp, sizeof (*bss))) != 0)
3168 break;
3169 /* LINTED: alignment */
3170 bss = (bridge_setstate_t *)mp->b_cont->b_rptr;
3171 if ((blp = enter_link(bip, bss->bss_linkid)) == NULL) {
3172 rc = ENOENT;
3173 } else {
3174 rc = 0;
3175 blp->bl_state = bss->bss_state;
3176 }
3177 rw_exit(&bip->bi_rwlock);
3178 break;
3179 }
3180
3181 case BRIOC_SETPVID: {
3182 bridge_setpvid_t *bsv;
3183
3184 if ((bip = bsp->bs_inst) == NULL ||
3185 (rc = miocpullup(mp, sizeof (*bsv))) != 0)
3186 break;
3187 /* LINTED: alignment */
3188 bsv = (bridge_setpvid_t *)mp->b_cont->b_rptr;
3189 if (bsv->bsv_vlan > VLAN_ID_MAX)
3190 break;
3191 if ((blp = enter_link(bip, bsv->bsv_linkid)) == NULL) {
3192 rc = ENOENT;
3193 } else if (blp->bl_pvid == bsv->bsv_vlan) {
3194 rc = 0;
3195 } else {
3196 rc = 0;
3197 BRIDGE_VLAN_CLR(blp, blp->bl_pvid);
3198 blp->bl_pvid = bsv->bsv_vlan;
3199 if (blp->bl_pvid != 0)
3200 BRIDGE_VLAN_SET(blp, blp->bl_pvid);
3201 }
3202 rw_exit(&bip->bi_rwlock);
3203 break;
3204 }
3205
3206 case BRIOC_VLANENAB: {
3207 bridge_vlanenab_t *bve;
3208
3209 if ((bip = bsp->bs_inst) == NULL ||
3210 (rc = miocpullup(mp, sizeof (*bve))) != 0)
3211 break;
3212 /* LINTED: alignment */
3213 bve = (bridge_vlanenab_t *)mp->b_cont->b_rptr;
3214 if (bve->bve_vlan > VLAN_ID_MAX)
3215 break;
3216 if ((blp = enter_link(bip, bve->bve_linkid)) == NULL) {
3217 rc = ENOENT;
3218 } else {
3219 rc = 0;
3220 /* special case: vlan 0 means "all" */
3221 if (bve->bve_vlan == 0) {
3222 (void) memset(blp->bl_vlans,
3223 bve->bve_onoff ? ~0 : 0,
3224 sizeof (blp->bl_vlans));
3225 BRIDGE_VLAN_CLR(blp, 0);
3226 if (blp->bl_pvid != 0)
3227 BRIDGE_VLAN_SET(blp, blp->bl_pvid);
3228 } else if (bve->bve_vlan == blp->bl_pvid) {
3229 rc = EINVAL;
3230 } else if (bve->bve_onoff) {
3231 BRIDGE_VLAN_SET(blp, bve->bve_vlan);
3232 } else {
3233 BRIDGE_VLAN_CLR(blp, bve->bve_vlan);
3234 }
3235 }
3236 rw_exit(&bip->bi_rwlock);
3237 break;
3238 }
3239
3240 case BRIOC_FLUSHFWD: {
3241 bridge_flushfwd_t *bff;
3242 bridge_fwd_t *bfp, *bfnext;
3243 avl_tree_t fwd_scavenge;
3244 int i;
3245
3246 if ((bip = bsp->bs_inst) == NULL ||
3247 (rc = miocpullup(mp, sizeof (*bff))) != 0)
3248 break;
3249 /* LINTED: alignment */
3250 bff = (bridge_flushfwd_t *)mp->b_cont->b_rptr;
3251 rw_enter(&bip->bi_rwlock, RW_WRITER);
3252 /* This case means "all" */
3253 if (bff->bff_linkid == DATALINK_INVALID_LINKID) {
3254 blp = NULL;
3255 } else {
3256 for (blp = list_head(&bip->bi_links); blp != NULL;
3257 blp = list_next(&bip->bi_links, blp)) {
3258 if (blp->bl_linkid == bff->bff_linkid &&
3259 !(blp->bl_flags & BLF_DELETED))
3260 break;
3261 }
3262 if (blp == NULL) {
3263 rc = ENOENT;
3264 rw_exit(&bip->bi_rwlock);
3265 break;
3266 }
3267 }
3268 avl_create(&fwd_scavenge, fwd_compare, sizeof (bridge_fwd_t),
3269 offsetof(bridge_fwd_t, bf_node));
3270 bfnext = avl_first(&bip->bi_fwd);
3271 while ((bfp = bfnext) != NULL) {
3272 bfnext = AVL_NEXT(&bip->bi_fwd, bfp);
3273 if (bfp->bf_flags & BFF_LOCALADDR)
3274 continue;
3275 if (blp != NULL) {
3276 for (i = 0; i < bfp->bf_maxlinks; i++) {
3277 if (bfp->bf_links[i] == blp)
3278 break;
3279 }
3280 /*
3281 * If the link is there and we're excluding,
3282 * then skip. If the link is not there and
3283 * we're doing only that link, then skip.
3284 */
3285 if ((i < bfp->bf_maxlinks) == bff->bff_exclude)
3286 continue;
3287 }
3288 ASSERT(bfp->bf_flags & BFF_INTREE);
3289 avl_remove(&bip->bi_fwd, bfp);
3290 bfp->bf_flags &= ~BFF_INTREE;
3291 avl_add(&fwd_scavenge, bfp);
3292 }
3293 rw_exit(&bip->bi_rwlock);
3294 bfnext = avl_first(&fwd_scavenge);
3295 while ((bfp = bfnext) != NULL) {
3296 bfnext = AVL_NEXT(&fwd_scavenge, bfp);
3297 avl_remove(&fwd_scavenge, bfp);
3298 fwd_unref(bfp); /* drop tree reference */
3299 }
3300 avl_destroy(&fwd_scavenge);
3301 break;
3302 }
3303
3304 case BRIOC_TABLEMAX:
3305 if ((bip = bsp->bs_inst) == NULL ||
3306 (rc = miocpullup(mp, sizeof (uint32_t))) != 0)
3307 break;
3308 /* LINTED: alignment */
3309 bip->bi_tablemax = *(uint32_t *)mp->b_cont->b_rptr;
3310 break;
3311 }
3312
3313 if (rc == 0)
3314 miocack(wq, mp, len, 0);
3315 else
3316 miocnak(wq, mp, 0, rc);
3317 }
3318
3319 static int
3320 bridge_wput(queue_t *wq, mblk_t *mp)
3321 {
3322 switch (DB_TYPE(mp)) {
3323 case M_IOCTL:
3324 bridge_ioctl(wq, mp);
3325 break;
3326 case M_FLUSH:
3327 if (*mp->b_rptr & FLUSHW)
3328 *mp->b_rptr &= ~FLUSHW;
3329 if (*mp->b_rptr & FLUSHR)
3330 qreply(wq, mp);
3331 else
3332 freemsg(mp);
3333 break;
3334 default:
3335 freemsg(mp);
3336 break;
3337 }
3338 return (0);
3339 }
3340
3341 /*
3342 * This function allocates the main data structures for the bridge driver and
3343 * connects us into devfs.
3344 */
3345 static void
3346 bridge_inst_init(void)
3347 {
3348 bridge_scan_interval = 5 * drv_usectohz(1000000);
3349 bridge_fwd_age = 25 * drv_usectohz(1000000);
3350
3351 rw_init(&bmac_rwlock, NULL, RW_DRIVER, NULL);
3352 list_create(&bmac_list, sizeof (bridge_mac_t),
3353 offsetof(bridge_mac_t, bm_node));
3354 list_create(&inst_list, sizeof (bridge_inst_t),
3355 offsetof(bridge_inst_t, bi_node));
3356 cv_init(&inst_cv, NULL, CV_DRIVER, NULL);
3357 mutex_init(&inst_lock, NULL, MUTEX_DRIVER, NULL);
3358 cv_init(&stream_ref_cv, NULL, CV_DRIVER, NULL);
3359 mutex_init(&stream_ref_lock, NULL, MUTEX_DRIVER, NULL);
3360
3361 mac_bridge_vectors(bridge_xmit_cb, bridge_recv_cb, bridge_ref_cb,
3362 bridge_ls_cb);
3363 }
3364
3365 /*
3366 * This function disconnects from devfs and destroys all data structures in
3367 * preparation for unload. It's assumed that there are no active bridge
3368 * references left at this point.
3369 */
3370 static void
3371 bridge_inst_fini(void)
3372 {
3373 mac_bridge_vectors(NULL, NULL, NULL, NULL);
3374 if (bridge_timerid != 0)
3375 (void) untimeout(bridge_timerid);
3376 rw_destroy(&bmac_rwlock);
3377 list_destroy(&bmac_list);
3378 list_destroy(&inst_list);
3379 cv_destroy(&inst_cv);
3380 mutex_destroy(&inst_lock);
3381 cv_destroy(&stream_ref_cv);
3382 mutex_destroy(&stream_ref_lock);
3383 }
3384
3385 /*
3386 * bridge_attach()
3387 *
3388 * Description:
3389 * Attach bridge driver to the system.
3390 */
3391 static int
3392 bridge_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
3393 {
3394 if (cmd != DDI_ATTACH)
3395 return (DDI_FAILURE);
3396
3397 if (ddi_create_minor_node(dip, BRIDGE_CTL, S_IFCHR, 0, DDI_PSEUDO,
3398 CLONE_DEV) == DDI_FAILURE) {
3399 return (DDI_FAILURE);
3400 }
3401
3402 if (dld_ioc_register(BRIDGE_IOC, bridge_ioc_list,
3403 DLDIOCCNT(bridge_ioc_list)) != 0) {
3404 ddi_remove_minor_node(dip, BRIDGE_CTL);
3405 return (DDI_FAILURE);
3406 }
3407
3408 bridge_dev_info = dip;
3409 bridge_major = ddi_driver_major(dip);
3410 bridge_taskq = ddi_taskq_create(dip, BRIDGE_DEV_NAME, 1,
3411 TASKQ_DEFAULTPRI, 0);
3412 return (DDI_SUCCESS);
3413 }
3414
3415 /*
3416 * bridge_detach()
3417 *
3418 * Description:
3419 * Detach the bridge driver from the system.
3420 */
3421 static int
3422 bridge_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
3423 {
3424 if (cmd != DDI_DETACH)
3425 return (DDI_FAILURE);
3426
3427 ddi_remove_minor_node(dip, NULL);
3428 ddi_taskq_destroy(bridge_taskq);
3429 bridge_dev_info = NULL;
3430 return (DDI_SUCCESS);
3431 }
3432
3433 /*
3434 * bridge_info()
3435 *
3436 * Description:
3437 * Translate "dev_t" to a pointer to the associated "dev_info_t".
3438 */
3439 /* ARGSUSED */
3440 static int
3441 bridge_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg,
3442 void **result)
3443 {
3444 int rc;
3445
3446 switch (infocmd) {
3447 case DDI_INFO_DEVT2DEVINFO:
3448 if (bridge_dev_info == NULL) {
3449 rc = DDI_FAILURE;
3450 } else {
3451 *result = (void *)bridge_dev_info;
3452 rc = DDI_SUCCESS;
3453 }
3454 break;
3455 case DDI_INFO_DEVT2INSTANCE:
3456 *result = NULL;
3457 rc = DDI_SUCCESS;
3458 break;
3459 default:
3460 rc = DDI_FAILURE;
3461 break;
3462 }
3463 return (rc);
3464 }
3465
3466 static struct module_info bridge_modinfo = {
3467 2105, /* mi_idnum */
3468 BRIDGE_DEV_NAME, /* mi_idname */
3469 0, /* mi_minpsz */
3470 16384, /* mi_maxpsz */
3471 65536, /* mi_hiwat */
3472 128 /* mi_lowat */
3473 };
3474
3475 static struct qinit bridge_rinit = {
3476 NULL, /* qi_putp */
3477 NULL, /* qi_srvp */
3478 bridge_open, /* qi_qopen */
3479 bridge_close, /* qi_qclose */
3480 NULL, /* qi_qadmin */
3481 &bridge_modinfo, /* qi_minfo */
3482 NULL /* qi_mstat */
3483 };
3484
3485 static struct qinit bridge_winit = {
3486 (int (*)())bridge_wput, /* qi_putp */
3487 NULL, /* qi_srvp */
3488 NULL, /* qi_qopen */
3489 NULL, /* qi_qclose */
3490 NULL, /* qi_qadmin */
3491 &bridge_modinfo, /* qi_minfo */
3492 NULL /* qi_mstat */
3493 };
3494
3495 static struct streamtab bridge_tab = {
3496 &bridge_rinit, /* st_rdinit */
3497 &bridge_winit /* st_wrinit */
3498 };
3499
3500 /* No STREAMS perimeters; we do all our own locking */
3501 DDI_DEFINE_STREAM_OPS(bridge_ops, nulldev, nulldev, bridge_attach,
3502 bridge_detach, nodev, bridge_info, D_NEW | D_MP, &bridge_tab,
3503 ddi_quiesce_not_supported);
3504
3505 static struct modldrv modldrv = {
3506 &mod_driverops,
3507 "bridging driver",
3508 &bridge_ops
3509 };
3510
3511 static struct modlinkage modlinkage = {
3512 MODREV_1,
3513 (void *)&modldrv,
3514 NULL
3515 };
3516
3517 int
3518 _init(void)
3519 {
3520 int retv;
3521
3522 mac_init_ops(NULL, BRIDGE_DEV_NAME);
3523 bridge_inst_init();
3524 if ((retv = mod_install(&modlinkage)) != 0)
3525 bridge_inst_fini();
3526 return (retv);
3527 }
3528
3529 int
3530 _fini(void)
3531 {
3532 int retv;
3533
3534 rw_enter(&bmac_rwlock, RW_READER);
3535 retv = list_is_empty(&bmac_list) ? 0 : EBUSY;
3536 rw_exit(&bmac_rwlock);
3537 if (retv == 0 &&
3538 (retv = mod_remove(&modlinkage)) == 0)
3539 bridge_inst_fini();
3540 return (retv);
3541 }
3542
3543 int
3544 _info(struct modinfo *modinfop)
3545 {
3546 return (mod_info(&modlinkage, modinfop));
3547 }
3548