1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 /*
28 * This module implements a STREAMS driver that provides layer-two (Ethernet)
29 * bridging functionality. The STREAMS interface is used to provide
30 * observability (snoop/wireshark) and control, but not for interface plumbing.
31 */
32
33 #include <sys/types.h>
34 #include <sys/bitmap.h>
35 #include <sys/cmn_err.h>
36 #include <sys/conf.h>
37 #include <sys/ddi.h>
38 #include <sys/errno.h>
39 #include <sys/kstat.h>
40 #include <sys/modctl.h>
41 #include <sys/note.h>
42 #include <sys/param.h>
43 #include <sys/policy.h>
44 #include <sys/sdt.h>
45 #include <sys/stat.h>
46 #include <sys/stream.h>
47 #include <sys/stropts.h>
48 #include <sys/strsun.h>
49 #include <sys/sunddi.h>
50 #include <sys/sysmacros.h>
51 #include <sys/systm.h>
52 #include <sys/time.h>
53 #include <sys/dlpi.h>
54 #include <sys/dls.h>
55 #include <sys/mac_ether.h>
56 #include <sys/mac_provider.h>
57 #include <sys/mac_client_priv.h>
58 #include <sys/mac_impl.h>
59 #include <sys/vlan.h>
60 #include <net/bridge.h>
61 #include <net/bridge_impl.h>
62 #include <net/trill.h>
63 #include <sys/dld_ioc.h>
64
65 /*
66 * Locks and reference counts: object lifetime and design.
67 *
68 * bridge_mac_t
69 * Bridge mac (snoop) instances are in bmac_list, which is protected by
70 * bmac_rwlock. They're allocated by bmac_alloc and freed by bridge_timer().
71 * Every bridge_inst_t has a single bridge_mac_t, but when bridge_inst_t goes
72 * away, the bridge_mac_t remains until either all of the users go away
73 * (detected by a timer) or until the instance is picked up again by the same
74 * bridge starting back up.
75 *
76 * bridge_inst_t
77 * Bridge instances are in inst_list, which is protected by inst_lock.
78 * They're allocated by inst_alloc() and freed by inst_free(). After
79 * allocation, an instance is placed in inst_list, and the reference count is
80 * incremented to represent this. That reference is decremented when the
81 * BIF_SHUTDOWN flag is set, and no new increments may occur. When the last
82 * reference is freed, the instance is removed from the list.
83 *
84 * Bridge instances have lists of links and an AVL tree of forwarding
85 * entries. Each of these structures holds one reference on the bridge
86 * instance. These lists and tree are protected by bi_rwlock.
87 *
88 * bridge_stream_t
89 * Bridge streams are allocated by stream_alloc() and freed by stream_free().
90 * These streams are created when "bridged" opens /dev/bridgectl, and are
91 * used to create new bridge instances (via BRIOC_NEWBRIDGE) and control the
92 * links on the bridge. When a stream closes, the bridge instance created is
93 * destroyed. There's at most one bridge instance for a given control
94 * stream.
95 *
96 * bridge_link_t
97 * Links are allocated by bridge_add_link() and freed by link_free(). The
98 * bi_links list holds a reference to the link. When the BLF_DELETED flag is
99 * set, that reference is dropped. The link isn't removed from the list
100 * until the last reference drops. Each forwarding entry that uses a given
101 * link holds a reference, as does each thread transmitting a packet via the
102 * link. The MAC layer calls in via bridge_ref_cb() to hold a reference on
103 * a link when transmitting.
104 *
105 * It's important that once BLF_DELETED is set, there's no way for the
106 * reference count to increase again. If it can, then the link may be
107 * double-freed. The BLF_FREED flag is intended for use with assertions to
108 * guard against this in testing.
109 *
110 * bridge_fwd_t
111 * Bridge forwarding entries are allocated by bridge_recv_cb() and freed by
112 * fwd_free(). The bi_fwd AVL tree holds one reference to the entry. Unlike
113 * other data structures, the reference is dropped when the entry is removed
114 * from the tree by fwd_delete(), and the BFF_INTREE flag is removed. Each
115 * thread that's forwarding a packet to a known destination holds a reference
116 * to a forwarding entry.
117 *
118 * TRILL notes:
119 *
120 * The TRILL module does all of its I/O through bridging. It uses references
121 * on the bridge_inst_t and bridge_link_t structures, and has seven entry
122 * points and four callbacks. One entry point is for setting the callbacks
123 * (bridge_trill_register_cb). There are four entry points for taking bridge
124 * and link references (bridge_trill_{br,ln}{ref,unref}). The final two
125 * entry points are for decapsulated packets from TRILL (bridge_trill_decaps)
126 * that need to be bridged locally, and for TRILL-encapsulated output packets
127 * (bridge_trill_output).
128 *
129 * The four callbacks comprise two notification functions for bridges and
130 * links being deleted, one function for raw received TRILL packets, and one
131 * for bridge output to non-local TRILL destinations (tunnel entry).
132 */
133
134 /*
135 * Ethernet reserved multicast addresses for TRILL; used also in TRILL module.
136 */
137 const uint8_t all_isis_rbridges[] = ALL_ISIS_RBRIDGES;
138 static const uint8_t all_esadi_rbridges[] = ALL_ESADI_RBRIDGES;
139 const uint8_t bridge_group_address[] = BRIDGE_GROUP_ADDRESS;
140
141 static const char *inst_kstats_list[] = { KSINST_NAMES };
142 static const char *link_kstats_list[] = { KSLINK_NAMES };
143
144 #define KREF(p, m, vn) p->m.vn.value.ui64
145 #define KINCR(p, m, vn) ++KREF(p, m, vn)
146 #define KDECR(p, m, vn) --KREF(p, m, vn)
147
148 #define KIPINCR(p, vn) KINCR(p, bi_kstats, vn)
149 #define KIPDECR(p, vn) KDECR(p, bi_kstats, vn)
150 #define KLPINCR(p, vn) KINCR(p, bl_kstats, vn)
151
152 #define KIINCR(vn) KIPINCR(bip, vn)
153 #define KIDECR(vn) KIPDECR(bip, vn)
154 #define KLINCR(vn) KLPINCR(blp, vn)
155
156 #define Dim(x) (sizeof (x) / sizeof (*(x)))
157
158 /* Amount of overhead added when encapsulating with VLAN headers */
159 #define VLAN_INCR (sizeof (struct ether_vlan_header) - \
160 sizeof (struct ether_header))
161
162 static dev_info_t *bridge_dev_info;
163 static major_t bridge_major;
164 static ddi_taskq_t *bridge_taskq;
165
166 /*
167 * These are the bridge instance management data structures. The mutex lock
168 * protects the list of bridge instances. A reference count is then used on
169 * each instance to determine when to free it. We use mac_minor_hold() to
170 * allocate minor_t values, which are used both for self-cloning /dev/net/
171 * device nodes as well as client streams. Minor node 0 is reserved for the
172 * allocation control node.
173 */
174 static list_t inst_list;
175 static kcondvar_t inst_cv; /* Allows us to wait for shutdown */
176 static kmutex_t inst_lock;
177
178 static krwlock_t bmac_rwlock;
179 static list_t bmac_list;
180
181 /* Wait for taskq entries that use STREAMS */
182 static kcondvar_t stream_ref_cv;
183 static kmutex_t stream_ref_lock;
184
185 static timeout_id_t bridge_timerid;
186 static clock_t bridge_scan_interval;
187 static clock_t bridge_fwd_age;
188
189 static bridge_inst_t *bridge_find_name(const char *);
190 static void bridge_timer(void *);
191 static void bridge_unref(bridge_inst_t *);
192
193 static const uint8_t zero_addr[ETHERADDRL] = { 0 };
194
195 /* Global TRILL linkage */
196 static trill_recv_pkt_t trill_recv_fn;
197 static trill_encap_pkt_t trill_encap_fn;
198 static trill_br_dstr_t trill_brdstr_fn;
199 static trill_ln_dstr_t trill_lndstr_fn;
200
201 /* special settings to accommodate DLD flow control; see dld_str.c */
202 static struct module_info bridge_dld_modinfo = {
203 0, /* mi_idnum */
204 BRIDGE_DEV_NAME, /* mi_idname */
205 0, /* mi_minpsz */
206 INFPSZ, /* mi_maxpsz */
207 1, /* mi_hiwat */
208 0 /* mi_lowat */
209 };
210
211 static struct qinit bridge_dld_rinit = {
212 NULL, /* qi_putp */
213 NULL, /* qi_srvp */
214 dld_open, /* qi_qopen */
215 dld_close, /* qi_qclose */
216 NULL, /* qi_qadmin */
217 &bridge_dld_modinfo, /* qi_minfo */
218 NULL /* qi_mstat */
219 };
220
221 static struct qinit bridge_dld_winit = {
222 (int (*)())dld_wput, /* qi_putp */
223 (int (*)())dld_wsrv, /* qi_srvp */
224 NULL, /* qi_qopen */
225 NULL, /* qi_qclose */
226 NULL, /* qi_qadmin */
227 &bridge_dld_modinfo, /* qi_minfo */
228 NULL /* qi_mstat */
229 };
230
231 static int bridge_ioc_listfwd(void *, intptr_t, int, cred_t *, int *);
232
233 /* GLDv3 control ioctls used by Bridging */
234 static dld_ioc_info_t bridge_ioc_list[] = {
235 {BRIDGE_IOC_LISTFWD, DLDCOPYINOUT, sizeof (bridge_listfwd_t),
236 bridge_ioc_listfwd, NULL},
237 };
238
239 /*
240 * Given a bridge mac pointer, get a ref-held pointer to the corresponding
241 * bridge instance, if any. We must hold the global bmac_rwlock so that
242 * bm_inst doesn't slide out from under us.
243 */
244 static bridge_inst_t *
245 mac_to_inst(const bridge_mac_t *bmp)
246 {
247 bridge_inst_t *bip;
248
249 rw_enter(&bmac_rwlock, RW_READER);
250 if ((bip = bmp->bm_inst) != NULL)
251 atomic_inc_uint(&bip->bi_refs);
252 rw_exit(&bmac_rwlock);
253 return (bip);
254 }
255
256 static void
257 link_sdu_fail(bridge_link_t *blp, boolean_t failed, mblk_t **mlist)
258 {
259 mblk_t *mp;
260 bridge_ctl_t *bcp;
261 bridge_link_t *blcmp;
262 bridge_inst_t *bip;
263 bridge_mac_t *bmp;
264
265 if (failed) {
266 if (blp->bl_flags & BLF_SDUFAIL)
267 return;
268 blp->bl_flags |= BLF_SDUFAIL;
269 } else {
270 if (!(blp->bl_flags & BLF_SDUFAIL))
271 return;
272 blp->bl_flags &= ~BLF_SDUFAIL;
273 }
274
275 /*
276 * If this link is otherwise up, then check if there are any other
277 * non-failed non-down links. If not, then we control the state of the
278 * whole bridge.
279 */
280 bip = blp->bl_inst;
281 bmp = bip->bi_mac;
282 if (blp->bl_linkstate != LINK_STATE_DOWN) {
283 for (blcmp = list_head(&bip->bi_links); blcmp != NULL;
284 blcmp = list_next(&bip->bi_links, blcmp)) {
285 if (blp != blcmp &&
286 !(blcmp->bl_flags & (BLF_DELETED|BLF_SDUFAIL)) &&
287 blcmp->bl_linkstate != LINK_STATE_DOWN)
288 break;
289 }
290 if (blcmp == NULL) {
291 bmp->bm_linkstate = failed ? LINK_STATE_DOWN :
292 LINK_STATE_UP;
293 mac_link_redo(bmp->bm_mh, bmp->bm_linkstate);
294 }
295 }
296
297 /*
298 * If we're becoming failed, then the link's current true state needs
299 * to be reflected upwards to this link's clients. If we're becoming
300 * unfailed, then we get the state of the bridge instead on all
301 * clients.
302 */
303 if (failed) {
304 if (bmp->bm_linkstate != blp->bl_linkstate)
305 mac_link_redo(blp->bl_mh, blp->bl_linkstate);
306 } else {
307 mac_link_redo(blp->bl_mh, bmp->bm_linkstate);
308 }
309
310 /* get the current mblk we're going to send up */
311 if ((mp = blp->bl_lfailmp) == NULL &&
312 (mp = allocb(sizeof (bridge_ctl_t), BPRI_MED)) == NULL)
313 return;
314
315 /* get a new one for next time */
316 blp->bl_lfailmp = allocb(sizeof (bridge_ctl_t), BPRI_MED);
317
318 /* if none for next time, then report only failures */
319 if (blp->bl_lfailmp == NULL && !failed) {
320 blp->bl_lfailmp = mp;
321 return;
322 }
323
324 /* LINTED: alignment */
325 bcp = (bridge_ctl_t *)mp->b_rptr;
326 bcp->bc_linkid = blp->bl_linkid;
327 bcp->bc_failed = failed;
328 mp->b_wptr = (uchar_t *)(bcp + 1);
329 mp->b_next = *mlist;
330 *mlist = mp;
331 }
332
333 /*
334 * Send control messages (link SDU changes) using the stream to the
335 * bridge instance daemon.
336 */
337 static void
338 send_up_messages(bridge_inst_t *bip, mblk_t *mp)
339 {
340 mblk_t *mnext;
341 queue_t *rq;
342
343 rq = bip->bi_control->bs_wq;
344 rq = OTHERQ(rq);
345 while (mp != NULL) {
346 mnext = mp->b_next;
347 mp->b_next = NULL;
348 putnext(rq, mp);
349 mp = mnext;
350 }
351 }
352
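/*
 * MAC callbacks for the observability node. The bridge MAC exists only so
 * that passive clients such as snoop can open it; nothing ever transmits
 * through the bridge node itself, so most of these entry points are stubs.
 */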
353 /* ARGSUSED */
354 static int
355 bridge_m_getstat(void *arg, uint_t stat, uint64_t *val)
356 {
357 return (ENOTSUP);
358 }
359
360 static int
361 bridge_m_start(void *arg)
362 {
363 bridge_mac_t *bmp = arg;
364
365 bmp->bm_flags |= BMF_STARTED;
366 return (0);
367 }
368
369 static void
370 bridge_m_stop(void *arg)
371 {
372 bridge_mac_t *bmp = arg;
373
374 bmp->bm_flags &= ~BMF_STARTED;
375 }
376
377 /* ARGSUSED */
378 static int
379 bridge_m_setpromisc(void *arg, boolean_t on)
380 {
381 return (0);
382 }
383
384 /* ARGSUSED */
385 static int
386 bridge_m_multicst(void *arg, boolean_t add, const uint8_t *mca)
387 {
388 return (0);
389 }
390
391 /* ARGSUSED */
392 static int
393 bridge_m_unicst(void *arg, const uint8_t *macaddr)
394 {
395 return (ENOTSUP);
396 }
397
398 static mblk_t *
399 bridge_m_tx(void *arg, mblk_t *mp)
400 {
401 _NOTE(ARGUNUSED(arg));
402 freemsgchain(mp);
403 return (NULL);
404 }
405
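/*
 * BRIDGE_IOC_LISTFWD handler. This behaves as an iterator over the
 * forwarding table: the caller passes in the last destination address it
 * saw, and we return the next entry in AVL order. An all-zeroes structure
 * means the end of the table has been reached.
 */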
406 /* ARGSUSED */
407 static int
408 bridge_ioc_listfwd(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
409 {
410 bridge_listfwd_t *blf = karg;
411 bridge_inst_t *bip;
412 bridge_fwd_t *bfp, match;
413 avl_index_t where;
414
415 bip = bridge_find_name(blf->blf_name);
416 if (bip == NULL)
417 return (ENOENT);
418
419 bcopy(blf->blf_dest, match.bf_dest, ETHERADDRL);
420 match.bf_flags = BFF_VLANLOCAL;
421 rw_enter(&bip->bi_rwlock, RW_READER);
422 if ((bfp = avl_find(&bip->bi_fwd, &match, &where)) == NULL)
423 bfp = avl_nearest(&bip->bi_fwd, where, AVL_AFTER);
424 else
425 bfp = AVL_NEXT(&bip->bi_fwd, bfp);
426 if (bfp == NULL) {
427 bzero(blf, sizeof (*blf));
428 } else {
429 bcopy(bfp->bf_dest, blf->blf_dest, ETHERADDRL);
430 blf->blf_trill_nick = bfp->bf_trill_nick;
431 blf->blf_ms_age =
432 drv_hztousec(ddi_get_lbolt() - bfp->bf_lastheard) / 1000;
433 blf->blf_is_local =
434 (bfp->bf_flags & BFF_LOCALADDR) != 0;
435 blf->blf_linkid = bfp->bf_links[0]->bl_linkid;
436 }
437 rw_exit(&bip->bi_rwlock);
438 bridge_unref(bip);
439 return (0);
440 }
441
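/*
 * Setting MAC_PROP_MTU on the bridge node changes the MTU of the bridge as
 * a whole. Each attached link is re-evaluated against the new value: links
 * that match it are unmarked, links still at the old bridge MTU are marked
 * failed via link_sdu_fail(), and the resulting control messages are sent
 * up to the bridge daemon.
 */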
442 static int
443 bridge_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
444 uint_t pr_valsize, const void *pr_val)
445 {
446 bridge_mac_t *bmp = arg;
447 bridge_inst_t *bip;
448 bridge_link_t *blp;
449 int err;
450 uint_t maxsdu;
451 mblk_t *mlist;
452
453 _NOTE(ARGUNUSED(pr_name));
454 switch (pr_num) {
455 case MAC_PROP_MTU:
456 if (pr_valsize < sizeof (bmp->bm_maxsdu)) {
457 err = EINVAL;
458 break;
459 }
460 (void) bcopy(pr_val, &maxsdu, sizeof (maxsdu));
461 if (maxsdu == bmp->bm_maxsdu) {
462 err = 0;
463 } else if ((bip = mac_to_inst(bmp)) == NULL) {
464 err = ENXIO;
465 } else {
466 rw_enter(&bip->bi_rwlock, RW_WRITER);
467 mlist = NULL;
468 for (blp = list_head(&bip->bi_links); blp != NULL;
469 blp = list_next(&bip->bi_links, blp)) {
470 if (blp->bl_flags & BLF_DELETED)
471 continue;
472 if (blp->bl_maxsdu == maxsdu)
473 link_sdu_fail(blp, B_FALSE, &mlist);
474 else if (blp->bl_maxsdu == bmp->bm_maxsdu)
475 link_sdu_fail(blp, B_TRUE, &mlist);
476 }
477 rw_exit(&bip->bi_rwlock);
478 bmp->bm_maxsdu = maxsdu;
479 (void) mac_maxsdu_update(bmp->bm_mh, maxsdu);
480 send_up_messages(bip, mlist);
481 bridge_unref(bip);
482 err = 0;
483 }
484 break;
485
486 default:
487 err = ENOTSUP;
488 break;
489 }
490 return (err);
491 }
492
493 static int
494 bridge_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
495 uint_t pr_valsize, void *pr_val)
496 {
497 bridge_mac_t *bmp = arg;
498 int err = 0;
499
500 _NOTE(ARGUNUSED(pr_name));
501 switch (pr_num) {
502 case MAC_PROP_STATUS:
503 ASSERT(pr_valsize >= sizeof (bmp->bm_linkstate));
504 bcopy(&bmp->bm_linkstate, pr_val, sizeof (bmp->bm_linkstate));
505 break;
506
507 default:
508 err = ENOTSUP;
509 break;
510 }
511 return (err);
512 }
513
514 static void
515 bridge_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num,
516 mac_prop_info_handle_t prh)
517 {
518 bridge_mac_t *bmp = arg;
519
520 _NOTE(ARGUNUSED(pr_name));
521
522 switch (pr_num) {
523 case MAC_PROP_MTU:
524 mac_prop_info_set_range_uint32(prh, bmp->bm_maxsdu,
525 bmp->bm_maxsdu);
526 break;
527 case MAC_PROP_STATUS:
528 mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
529 break;
530 }
531 }
532
533 static mac_callbacks_t bridge_m_callbacks = {
534 MC_SETPROP | MC_GETPROP | MC_PROPINFO,
535 bridge_m_getstat,
536 bridge_m_start,
537 bridge_m_stop,
538 bridge_m_setpromisc,
539 bridge_m_multicst,
540 bridge_m_unicst,
541 bridge_m_tx,
542 NULL, /* reserved */
543 NULL, /* ioctl */
544 NULL, /* getcapab */
545 NULL, /* open */
546 NULL, /* close */
547 bridge_m_setprop,
548 bridge_m_getprop,
549 bridge_m_propinfo
550 };
551
552 /*
553 * Create kstats from a list.
554 */
555 static kstat_t *
556 kstat_setup(kstat_named_t *knt, const char **names, int nstat,
557 const char *unitname)
558 {
559 kstat_t *ksp;
560 int i;
561
562 for (i = 0; i < nstat; i++)
563 kstat_named_init(&knt[i], names[i], KSTAT_DATA_UINT64);
564
565 ksp = kstat_create_zone(BRIDGE_DEV_NAME, 0, unitname, "net",
566 KSTAT_TYPE_NAMED, nstat, KSTAT_FLAG_VIRTUAL, GLOBAL_ZONEID);
567 if (ksp != NULL) {
568 ksp->ks_data = knt;
569 kstat_install(ksp);
570 }
571 return (ksp);
572 }
573
574 /*
575 * Find an existing bridge_mac_t structure or allocate a new one for the given
576 * bridge instance. This creates the mac driver instance that snoop can use.
577 */
578 static int
579 bmac_alloc(bridge_inst_t *bip, bridge_mac_t **bmacp)
580 {
581 bridge_mac_t *bmp, *bnew;
582 mac_register_t *mac;
583 int err;
584
585 *bmacp = NULL;
586 if ((mac = mac_alloc(MAC_VERSION)) == NULL)
587 return (EINVAL);
588
589 bnew = kmem_zalloc(sizeof (*bnew), KM_SLEEP);
590
591 rw_enter(&bmac_rwlock, RW_WRITER);
592 for (bmp = list_head(&bmac_list); bmp != NULL;
593 bmp = list_next(&bmac_list, bmp)) {
594 if (strcmp(bip->bi_name, bmp->bm_name) == 0) {
595 ASSERT(bmp->bm_inst == NULL);
596 bmp->bm_inst = bip;
597 rw_exit(&bmac_rwlock);
598 kmem_free(bnew, sizeof (*bnew));
599 mac_free(mac);
600 *bmacp = bmp;
601 return (0);
602 }
603 }
604
605 mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
606 mac->m_driver = bnew;
607 mac->m_dip = bridge_dev_info;
608 mac->m_instance = (uint_t)-1;
609 mac->m_src_addr = (uint8_t *)zero_addr;
610 mac->m_callbacks = &bridge_m_callbacks;
611
612 /*
613 * Note that the SDU limits are irrelevant, as nobody transmits on the
614 * bridge node itself. The node exists mainly for monitoring, but we allow
615 * setting the bridge MTU so that all links that are part of the bridge
616 * can be transitioned quickly to a new MTU.
617 */
618 mac->m_min_sdu = 1;
619 mac->m_max_sdu = 1500;
620 err = mac_register(mac, &bnew->bm_mh);
621 mac_free(mac);
622 if (err != 0) {
623 rw_exit(&bmac_rwlock);
624 kmem_free(bnew, sizeof (*bnew));
625 return (err);
626 }
627
628 bnew->bm_inst = bip;
629 (void) strcpy(bnew->bm_name, bip->bi_name);
630 if (list_is_empty(&bmac_list)) {
631 bridge_timerid = timeout(bridge_timer, NULL,
632 bridge_scan_interval);
633 }
634 list_insert_tail(&bmac_list, bnew);
635 rw_exit(&bmac_rwlock);
636
637 /*
638 * Mark the MAC as unable to go "active" so that only passive clients
639 * (such as snoop) can bind to it.
640 */
641 mac_no_active(bnew->bm_mh);
642 *bmacp = bnew;
643 return (0);
644 }
645
646 /*
647 * Disconnect the given bridge_mac_t from its bridge instance. The bridge
648 * instance is going away. The mac instance can't go away until the clients
649 * are gone (see bridge_timer).
650 */
651 static void
652 bmac_disconnect(bridge_mac_t *bmp)
653 {
654 bridge_inst_t *bip;
655
656 bmp->bm_linkstate = LINK_STATE_DOWN;
657 mac_link_redo(bmp->bm_mh, LINK_STATE_DOWN);
658
659 rw_enter(&bmac_rwlock, RW_READER);
660 bip = bmp->bm_inst;
661 bip->bi_mac = NULL;
662 bmp->bm_inst = NULL;
663 rw_exit(&bmac_rwlock);
664 }
665
666 /* This is used by the avl trees to sort forwarding table entries */
667 static int
668 fwd_compare(const void *addr1, const void *addr2)
669 {
670 const bridge_fwd_t *fwd1 = addr1;
671 const bridge_fwd_t *fwd2 = addr2;
672 int diff = memcmp(fwd1->bf_dest, fwd2->bf_dest, ETHERADDRL);
673
674 if (diff != 0)
675 return (diff > 0 ? 1 : -1);
676
677 if ((fwd1->bf_flags ^ fwd2->bf_flags) & BFF_VLANLOCAL) {
678 if (fwd1->bf_vlanid > fwd2->bf_vlanid)
679 return (1);
680 else if (fwd1->bf_vlanid < fwd2->bf_vlanid)
681 return (-1);
682 }
683 return (0);
684 }
685
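/*
 * Free a bridge instance once it's no longer referenced. The observability
 * MAC must already have been disconnected (or never attached).
 */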
686 static void
687 inst_free(bridge_inst_t *bip)
688 {
689 ASSERT(bip->bi_mac == NULL);
690 rw_destroy(&bip->bi_rwlock);
691 list_destroy(&bip->bi_links);
692 cv_destroy(&bip->bi_linkwait);
693 avl_destroy(&bip->bi_fwd);
694 if (bip->bi_ksp != NULL)
695 kstat_delete(bip->bi_ksp);
696 kmem_free(bip, sizeof (*bip));
697 }
698
699 static bridge_inst_t *
700 inst_alloc(const char *bridge)
701 {
702 bridge_inst_t *bip;
703
704 bip = kmem_zalloc(sizeof (*bip), KM_SLEEP);
705 bip->bi_refs = 1;
706 (void) strcpy(bip->bi_name, bridge);
707 rw_init(&bip->bi_rwlock, NULL, RW_DRIVER, NULL);
708 list_create(&bip->bi_links, sizeof (bridge_link_t),
709 offsetof(bridge_link_t, bl_node));
710 cv_init(&bip->bi_linkwait, NULL, CV_DRIVER, NULL);
711 avl_create(&bip->bi_fwd, fwd_compare, sizeof (bridge_fwd_t),
712 offsetof(bridge_fwd_t, bf_node));
713 return (bip);
714 }
715
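/*
 * Look up a bridge instance by name and return it with a reference held.
 * Instances that are in the process of shutting down are skipped.
 */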
716 static bridge_inst_t *
717 bridge_find_name(const char *bridge)
718 {
719 bridge_inst_t *bip;
720
721 mutex_enter(&inst_lock);
722 for (bip = list_head(&inst_list); bip != NULL;
723 bip = list_next(&inst_list, bip)) {
724 if (!(bip->bi_flags & BIF_SHUTDOWN) &&
725 strcmp(bridge, bip->bi_name) == 0) {
726 atomic_inc_uint(&bip->bi_refs);
727 break;
728 }
729 }
730 mutex_exit(&inst_lock);
731
732 return (bip);
733 }
734
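/*
 * Create (or revive) the bridge instance for the given name. If an
 * instance with this name is currently shutting down, wait for it to finish
 * and retry the lookup; if one already exists, fail with EEXIST. Otherwise
 * set up the kstats, the observability MAC, and the DLS devnet node for the
 * new instance.
 */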
735 static int
736 bridge_create(datalink_id_t linkid, const char *bridge, bridge_inst_t **bipc,
737 cred_t *cred)
738 {
739 bridge_inst_t *bip, *bipnew;
740 bridge_mac_t *bmp = NULL;
741 int err;
742
743 *bipc = NULL;
744 bipnew = inst_alloc(bridge);
745
746 mutex_enter(&inst_lock);
747 lookup_retry:
748 for (bip = list_head(&inst_list); bip != NULL;
749 bip = list_next(&inst_list, bip)) {
750 if (strcmp(bridge, bip->bi_name) == 0)
751 break;
752 }
753
754 /* This should not take long; if it does, we've got a design problem */
755 if (bip != NULL && (bip->bi_flags & BIF_SHUTDOWN)) {
756 cv_wait(&inst_cv, &inst_lock);
757 goto lookup_retry;
758 }
759
760 if (bip == NULL) {
761 bip = bipnew;
762 bipnew = NULL;
763 list_insert_tail(&inst_list, bip);
764 }
765
766 mutex_exit(&inst_lock);
767 if (bipnew != NULL) {
768 inst_free(bipnew);
769 return (EEXIST);
770 }
771
772 bip->bi_ksp = kstat_setup((kstat_named_t *)&bip->bi_kstats,
773 inst_kstats_list, Dim(inst_kstats_list), bip->bi_name);
774
775 err = bmac_alloc(bip, &bmp);
776 if ((bip->bi_mac = bmp) == NULL)
777 goto fail_create;
778
779 /*
780 * bm_inst is set, so the timer cannot yank the DLS rug from under us.
781 * No extra locking is needed here.
782 */
783 if (!(bmp->bm_flags & BMF_DLS)) {
784 err = dls_devnet_create(bmp->bm_mh, linkid, crgetzoneid(cred));
785 if (err != 0)
786 goto fail_create;
787 bmp->bm_flags |= BMF_DLS;
788 }
789
790 bip->bi_dev = makedevice(bridge_major, mac_minor(bmp->bm_mh));
791 *bipc = bip;
792 return (0);
793
794 fail_create:
795 ASSERT(bip->bi_trilldata == NULL);
796 bip->bi_flags |= BIF_SHUTDOWN;
797 bridge_unref(bip);
798 return (err);
799 }
800
801 static void
802 bridge_unref(bridge_inst_t *bip)
803 {
804 if (atomic_dec_uint_nv(&bip->bi_refs) == 0) {
805 ASSERT(bip->bi_flags & BIF_SHUTDOWN);
806 /* free up mac for reuse before leaving global list */
807 if (bip->bi_mac != NULL)
808 bmac_disconnect(bip->bi_mac);
809 mutex_enter(&inst_lock);
810 list_remove(&inst_list, bip);
811 cv_broadcast(&inst_cv);
812 mutex_exit(&inst_lock);
813 inst_free(bip);
814 }
815 }
816
817 /*
818 * Stream instances are used only for allocating bridges and serving as a
819 * control node. They serve no data-handling function.
820 */
821 static bridge_stream_t *
822 stream_alloc(void)
823 {
824 bridge_stream_t *bsp;
825 minor_t mn;
826
827 if ((mn = mac_minor_hold(B_FALSE)) == 0)
828 return (NULL);
829 bsp = kmem_zalloc(sizeof (*bsp), KM_SLEEP);
830 bsp->bs_minor = mn;
831 return (bsp);
832 }
833
834 static void
835 stream_free(bridge_stream_t *bsp)
836 {
837 mac_minor_rele(bsp->bs_minor);
838 kmem_free(bsp, sizeof (*bsp));
839 }
840
841 /* Reference hold/release functions for STREAMS-related taskq */
842 static void
843 stream_ref(bridge_stream_t *bsp)
844 {
845 mutex_enter(&stream_ref_lock);
846 bsp->bs_taskq_cnt++;
847 mutex_exit(&stream_ref_lock);
848 }
849
850 static void
851 stream_unref(bridge_stream_t *bsp)
852 {
853 mutex_enter(&stream_ref_lock);
854 if (--bsp->bs_taskq_cnt == 0)
855 cv_broadcast(&stream_ref_cv);
856 mutex_exit(&stream_ref_lock);
857 }
858
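/*
 * Final teardown of a link: delete its kstats, free the saved control
 * message, and drop the reference on the bridge instance. BLF_FREED is set
 * only so that assertions can catch use after free.
 */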
859 static void
860 link_free(bridge_link_t *blp)
861 {
862 bridge_inst_t *bip = blp->bl_inst;
863
864 ASSERT(!(blp->bl_flags & BLF_FREED));
865 blp->bl_flags |= BLF_FREED;
866 if (blp->bl_ksp != NULL)
867 kstat_delete(blp->bl_ksp);
868 if (blp->bl_lfailmp != NULL)
869 freeb(blp->bl_lfailmp);
870 cv_destroy(&blp->bl_trillwait);
871 mutex_destroy(&blp->bl_trilllock);
872 kmem_free(blp, sizeof (*blp));
873 /* Don't unreference the bridge until the MAC is closed */
874 bridge_unref(bip);
875 }
876
877 static void
878 link_unref(bridge_link_t *blp)
879 {
880 if (atomic_dec_uint_nv(&blp->bl_refs) == 0) {
881 bridge_inst_t *bip = blp->bl_inst;
882
883 ASSERT(blp->bl_flags & BLF_DELETED);
884 rw_enter(&bip->bi_rwlock, RW_WRITER);
885 if (blp->bl_flags & BLF_LINK_ADDED)
886 list_remove(&bip->bi_links, blp);
887 rw_exit(&bip->bi_rwlock);
888 if (bip->bi_trilldata != NULL && list_is_empty(&bip->bi_links))
889 cv_broadcast(&bip->bi_linkwait);
890 link_free(blp);
891 }
892 }
893
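/*
 * Allocate a forwarding entry with room for nlinks link pointers. The link
 * pointer array lives in the same allocation, immediately after the
 * structure itself.
 */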
894 static bridge_fwd_t *
895 fwd_alloc(const uint8_t *addr, uint_t nlinks, uint16_t nick)
896 {
897 bridge_fwd_t *bfp;
898
899 bfp = kmem_zalloc(sizeof (*bfp) + (nlinks * sizeof (bridge_link_t *)),
900 KM_NOSLEEP);
901 if (bfp != NULL) {
902 bcopy(addr, bfp->bf_dest, ETHERADDRL);
903 bfp->bf_lastheard = ddi_get_lbolt();
904 bfp->bf_maxlinks = nlinks;
905 bfp->bf_links = (bridge_link_t **)(bfp + 1);
906 bfp->bf_trill_nick = nick;
907 }
908 return (bfp);
909 }
910
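/*
 * Find the forwarding entry for a given destination and VLAN, and return it
 * with a reference held. The shared (non-VLAN-local) entry is located
 * first; if its VLAN differs and IVL duplicates exist, the matching
 * VLAN-local entry is preferred.
 */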
911 static bridge_fwd_t *
912 fwd_find(bridge_inst_t *bip, const uint8_t *addr, uint16_t vlanid)
913 {
914 bridge_fwd_t *bfp, *vbfp;
915 bridge_fwd_t match;
916
917 bcopy(addr, match.bf_dest, ETHERADDRL);
918 match.bf_flags = 0;
919 rw_enter(&bip->bi_rwlock, RW_READER);
920 if ((bfp = avl_find(&bip->bi_fwd, &match, NULL)) != NULL) {
921 if (bfp->bf_vlanid != vlanid && bfp->bf_vcnt > 0) {
922 match.bf_vlanid = vlanid;
923 match.bf_flags = BFF_VLANLOCAL;
924 vbfp = avl_find(&bip->bi_fwd, &match, NULL);
925 if (vbfp != NULL)
926 bfp = vbfp;
927 }
928 atomic_inc_uint(&bfp->bf_refs);
929 }
930 rw_exit(&bip->bi_rwlock);
931 return (bfp);
932 }
933
934 static void
935 fwd_free(bridge_fwd_t *bfp)
936 {
937 uint_t i;
938 bridge_inst_t *bip = bfp->bf_links[0]->bl_inst;
939
940 KIDECR(bki_count);
941 for (i = 0; i < bfp->bf_nlinks; i++)
942 link_unref(bfp->bf_links[i]);
943 kmem_free(bfp,
944 sizeof (*bfp) + bfp->bf_maxlinks * sizeof (bridge_link_t *));
945 }
946
947 static void
948 fwd_unref(bridge_fwd_t *bfp)
949 {
950 if (atomic_dec_uint_nv(&bfp->bf_refs) == 0) {
951 ASSERT(!(bfp->bf_flags & BFF_INTREE));
952 fwd_free(bfp);
953 }
954 }
955
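/*
 * Remove a forwarding entry from the AVL tree, if it's still there, and
 * drop the tree's reference. Another thread may have removed it first, so
 * BFF_INTREE is rechecked under the write lock. When a VLAN-local entry is
 * removed, the duplicate count on the corresponding shared entry is
 * adjusted.
 */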
956 static void
957 fwd_delete(bridge_fwd_t *bfp)
958 {
959 bridge_inst_t *bip;
960 bridge_fwd_t *bfpzero;
961
962 if (bfp->bf_flags & BFF_INTREE) {
963 ASSERT(bfp->bf_nlinks > 0);
964 bip = bfp->bf_links[0]->bl_inst;
965 rw_enter(&bip->bi_rwlock, RW_WRITER);
966 /* Another thread could beat us to this */
967 if (bfp->bf_flags & BFF_INTREE) {
968 avl_remove(&bip->bi_fwd, bfp);
969 bfp->bf_flags &= ~BFF_INTREE;
970 if (bfp->bf_flags & BFF_VLANLOCAL) {
971 bfp->bf_flags &= ~BFF_VLANLOCAL;
972 bfpzero = avl_find(&bip->bi_fwd, bfp, NULL);
973 if (bfpzero != NULL && bfpzero->bf_vcnt > 0)
974 bfpzero->bf_vcnt--;
975 }
976 rw_exit(&bip->bi_rwlock);
977 fwd_unref(bfp); /* no longer in avl tree */
978 } else {
979 rw_exit(&bip->bi_rwlock);
980 }
981 }
982 }
983
984 static boolean_t
985 fwd_insert(bridge_inst_t *bip, bridge_fwd_t *bfp)
986 {
987 avl_index_t idx;
988 boolean_t retv;
989
990 rw_enter(&bip->bi_rwlock, RW_WRITER);
991 if (!(bip->bi_flags & BIF_SHUTDOWN) &&
992 avl_numnodes(&bip->bi_fwd) < bip->bi_tablemax &&
993 avl_find(&bip->bi_fwd, bfp, &idx) == NULL) {
994 avl_insert(&bip->bi_fwd, bfp, idx);
995 bfp->bf_flags |= BFF_INTREE;
996 atomic_inc_uint(&bfp->bf_refs); /* avl entry */
997 retv = B_TRUE;
998 } else {
999 retv = B_FALSE;
1000 }
1001 rw_exit(&bip->bi_rwlock);
1002 return (retv);
1003 }
1004
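/*
 * Handle a change in a link's local (primary) MAC address: remove the link
 * from the forwarding entry for the old address, then add it to the entry
 * for the new address, creating one if needed. Local-address entries are
 * not subject to the forwarding table size limit.
 */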
1005 static void
1006 fwd_update_local(bridge_link_t *blp, const uint8_t *oldaddr,
1007 const uint8_t *newaddr)
1008 {
1009 bridge_inst_t *bip = blp->bl_inst;
1010 bridge_fwd_t *bfp, *bfnew;
1011 bridge_fwd_t match;
1012 avl_index_t idx;
1013 boolean_t drop_ref = B_FALSE;
1014
1015 if (bcmp(oldaddr, newaddr, ETHERADDRL) == 0)
1016 return;
1017
1018 if (bcmp(oldaddr, zero_addr, ETHERADDRL) == 0)
1019 goto no_old_addr;
1020
1021 /*
1022 * Find the previous entry, and remove our link from it.
1023 */
1024 bcopy(oldaddr, match.bf_dest, ETHERADDRL);
1025 rw_enter(&bip->bi_rwlock, RW_WRITER);
1026 if ((bfp = avl_find(&bip->bi_fwd, &match, NULL)) != NULL) {
1027 int i;
1028
1029 /*
1030 * See if we're in the list, and remove if so.
1031 */
1032 for (i = 0; i < bfp->bf_nlinks; i++) {
1033 if (bfp->bf_links[i] == blp) {
1034 /*
1035 * We assume writes are atomic, so no special
1036 * MT handling is needed. The list length is
1037 * decremented first, and then we remove
1038 * entries.
1039 */
1040 bfp->bf_nlinks--;
1041 for (; i < bfp->bf_nlinks; i++)
1042 bfp->bf_links[i] = bfp->bf_links[i + 1];
1043 drop_ref = B_TRUE;
1044 break;
1045 }
1046 }
1047 /* If no more links, then remove and free up */
1048 if (bfp->bf_nlinks == 0) {
1049 avl_remove(&bip->bi_fwd, bfp);
1050 bfp->bf_flags &= ~BFF_INTREE;
1051 } else {
1052 bfp = NULL;
1053 }
1054 }
1055 rw_exit(&bip->bi_rwlock);
1056 if (bfp != NULL)
1057 fwd_unref(bfp); /* no longer in avl tree */
1058
1059 /*
1060 * Now get the new link address and add this link to the list. The
1061 * list should be of length 1 unless the user has configured multiple
1062 * NICs with the same address. (That's an incorrect configuration, but
1063 * we support it anyway.)
1064 */
1065 no_old_addr:
1066 bfp = NULL;
1067 if ((bip->bi_flags & BIF_SHUTDOWN) ||
1068 bcmp(newaddr, zero_addr, ETHERADDRL) == 0)
1069 goto no_new_addr;
1070
1071 bcopy(newaddr, match.bf_dest, ETHERADDRL);
1072 rw_enter(&bip->bi_rwlock, RW_WRITER);
1073 if ((bfp = avl_find(&bip->bi_fwd, &match, &idx)) == NULL) {
1074 bfnew = fwd_alloc(newaddr, 1, RBRIDGE_NICKNAME_NONE);
1075 if (bfnew != NULL)
1076 KIINCR(bki_count);
1077 } else if (bfp->bf_nlinks < bfp->bf_maxlinks) {
1078 /* special case: link fits in existing entry */
1079 bfnew = bfp;
1080 } else {
1081 bfnew = fwd_alloc(newaddr, bfp->bf_nlinks + 1,
1082 RBRIDGE_NICKNAME_NONE);
1083 if (bfnew != NULL) {
1084 KIINCR(bki_count);
1085 avl_remove(&bip->bi_fwd, bfp);
1086 bfp->bf_flags &= ~BFF_INTREE;
1087 bfnew->bf_nlinks = bfp->bf_nlinks;
1088 bcopy(bfp->bf_links, bfnew->bf_links,
1089 bfp->bf_nlinks * sizeof (bfp));
1090 /* reset the idx value due to removal above */
1091 (void) avl_find(&bip->bi_fwd, &match, &idx);
1092 }
1093 }
1094
1095 if (bfnew != NULL) {
1096 bfnew->bf_links[bfnew->bf_nlinks++] = blp;
1097 if (drop_ref)
1098 drop_ref = B_FALSE;
1099 else
1100 atomic_inc_uint(&blp->bl_refs); /* bf_links entry */
1101
1102 if (bfnew != bfp) {
1103 /* local addresses are not subject to table limits */
1104 avl_insert(&bip->bi_fwd, bfnew, idx);
1105 bfnew->bf_flags |= (BFF_INTREE | BFF_LOCALADDR);
1106 atomic_inc_uint(&bfnew->bf_refs); /* avl entry */
1107 }
1108 }
1109 rw_exit(&bip->bi_rwlock);
1110
1111 no_new_addr:
1112 /*
1113 * If we found an existing entry and we replaced it with a new one,
1114 * then drop the table reference from the old one. We removed it from
1115 * the AVL tree above.
1116 */
1117 if (bfnew != NULL && bfp != NULL && bfnew != bfp)
1118 fwd_unref(bfp);
1119
1120 /* Account for removed entry. */
1121 if (drop_ref)
1122 link_unref(blp);
1123 }
1124
1125 static void
1126 bridge_new_unicst(bridge_link_t *blp)
1127 {
1128 uint8_t new_mac[ETHERADDRL];
1129
1130 mac_unicast_primary_get(blp->bl_mh, new_mac);
1131 fwd_update_local(blp, blp->bl_local_mac, new_mac);
1132 bcopy(new_mac, blp->bl_local_mac, ETHERADDRL);
1133 }
1134
1135 /*
1136 * We must shut down a link prior to freeing it, and doing that requires
1137 * blocking to wait for running MAC threads while holding a reference. This is
1138 * run from a taskq to accomplish proper link shutdown followed by reference
1139 * drop.
1140 */
1141 static void
1142 link_shutdown(void *arg)
1143 {
1144 bridge_link_t *blp = arg;
1145 mac_handle_t mh = blp->bl_mh;
1146 bridge_inst_t *bip;
1147 bridge_fwd_t *bfp, *bfnext;
1148 avl_tree_t fwd_scavenge;
1149 int i;
1150
1151 /*
1152 * This link is being destroyed. Notify TRILL now that it's no longer
1153 * possible to send packets. Data packets may still arrive until TRILL
1154 * calls bridge_trill_lnunref.
1155 */
1156 if (blp->bl_trilldata != NULL)
1157 trill_lndstr_fn(blp->bl_trilldata, blp);
1158
1159 if (blp->bl_flags & BLF_PROM_ADDED)
1160 (void) mac_promisc_remove(blp->bl_mphp);
1161
1162 if (blp->bl_flags & BLF_SET_BRIDGE)
1163 mac_bridge_clear(mh, (mac_handle_t)blp);
1164
1165 if (blp->bl_flags & BLF_MARGIN_ADDED) {
1166 (void) mac_notify_remove(blp->bl_mnh, B_TRUE);
1167 (void) mac_margin_remove(mh, blp->bl_margin);
1168 }
1169
1170 /* Tell the clients the real link state when we leave */
1171 mac_link_redo(blp->bl_mh,
1172 mac_stat_get(blp->bl_mh, MAC_STAT_LOWLINK_STATE));
1173
1174 /* Destroy all of the forwarding entries related to this link */
1175 avl_create(&fwd_scavenge, fwd_compare, sizeof (bridge_fwd_t),
1176 offsetof(bridge_fwd_t, bf_node));
1177 bip = blp->bl_inst;
1178 rw_enter(&bip->bi_rwlock, RW_WRITER);
1179 bfnext = avl_first(&bip->bi_fwd);
1180 while ((bfp = bfnext) != NULL) {
1181 bfnext = AVL_NEXT(&bip->bi_fwd, bfp);
1182 for (i = 0; i < bfp->bf_nlinks; i++) {
1183 if (bfp->bf_links[i] == blp)
1184 break;
1185 }
1186 if (i >= bfp->bf_nlinks)
1187 continue;
1188 if (bfp->bf_nlinks > 1) {
1189 /* note that this can't be the last reference */
1190 link_unref(blp);
1191 bfp->bf_nlinks--;
1192 for (; i < bfp->bf_nlinks; i++)
1193 bfp->bf_links[i] = bfp->bf_links[i + 1];
1194 } else {
1195 ASSERT(bfp->bf_flags & BFF_INTREE);
1196 avl_remove(&bip->bi_fwd, bfp);
1197 bfp->bf_flags &= ~BFF_INTREE;
1198 avl_add(&fwd_scavenge, bfp);
1199 }
1200 }
1201 rw_exit(&bip->bi_rwlock);
1202 bfnext = avl_first(&fwd_scavenge);
1203 while ((bfp = bfnext) != NULL) {
1204 bfnext = AVL_NEXT(&fwd_scavenge, bfp);
1205 avl_remove(&fwd_scavenge, bfp);
1206 fwd_unref(bfp);
1207 }
1208 avl_destroy(&fwd_scavenge);
1209
1210 if (blp->bl_flags & BLF_CLIENT_OPEN)
1211 mac_client_close(blp->bl_mch, 0);
1212
1213 mac_close(mh);
1214
1215 /*
1216 * We are now completely removed from the active list, so drop the
1217 * reference (see bridge_add_link).
1218 */
1219 link_unref(blp);
1220 }
1221
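/*
 * Tear down a bridge instance: mark it BIF_SHUTDOWN (at most once), detach
 * the control stream, dispatch link_shutdown() for every remaining link,
 * flush the forwarding table, wait for TRILL to release its link
 * references, notify TRILL that the bridge is going away, and finally drop
 * the list reference.
 */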
1222 static void
1223 shutdown_inst(bridge_inst_t *bip)
1224 {
1225 bridge_link_t *blp, *blnext;
1226 bridge_fwd_t *bfp;
1227
1228 mutex_enter(&inst_lock);
1229 if (bip->bi_flags & BIF_SHUTDOWN) {
1230 mutex_exit(&inst_lock);
1231 return;
1232 }
1233
1234 /*
1235 * Once on the inst_list, the bridge instance must not leave that list
1236 * without having the shutdown flag set first. When the shutdown flag
1237 * is set, we own the list reference, so we must drop it before
1238 * returning.
1239 */
1240 bip->bi_flags |= BIF_SHUTDOWN;
1241 mutex_exit(&inst_lock);
1242
1243 bip->bi_control = NULL;
1244
1245 rw_enter(&bip->bi_rwlock, RW_READER);
1246 blnext = list_head(&bip->bi_links);
1247 while ((blp = blnext) != NULL) {
1248 blnext = list_next(&bip->bi_links, blp);
1249 if (!(blp->bl_flags & BLF_DELETED)) {
1250 blp->bl_flags |= BLF_DELETED;
1251 (void) ddi_taskq_dispatch(bridge_taskq, link_shutdown,
1252 blp, DDI_SLEEP);
1253 }
1254 }
1255 while ((bfp = avl_first(&bip->bi_fwd)) != NULL) {
1256 atomic_inc_uint(&bfp->bf_refs);
1257 rw_exit(&bip->bi_rwlock);
1258 fwd_delete(bfp);
1259 fwd_unref(bfp);
1260 rw_enter(&bip->bi_rwlock, RW_READER);
1261 }
1262 rw_exit(&bip->bi_rwlock);
1263
1264 /*
1265 * This bridge is being destroyed. Notify TRILL once all of the
1266 * links are gone.
1267 */
1268 mutex_enter(&inst_lock);
1269 while (bip->bi_trilldata != NULL && !list_is_empty(&bip->bi_links))
1270 cv_wait(&bip->bi_linkwait, &inst_lock);
1271 mutex_exit(&inst_lock);
1272 if (bip->bi_trilldata != NULL)
1273 trill_brdstr_fn(bip->bi_trilldata, bip);
1274
1275 bridge_unref(bip);
1276 }
1277
1278 /*
1279 * This is called once by the TRILL module when it starts up. It just sets the
1280 * global TRILL callback function pointers -- data transmit/receive and bridge
1281 * and link destroy notification. There's only one TRILL module, so only one
1282 * registration is needed.
1283 *
1284 * TRILL should call this function with NULL pointers before unloading. It
1285 * must not do so before dropping all references to bridges and links. We
1286 * assert that this is true on debug builds.
1287 */
1288 void
1289 bridge_trill_register_cb(trill_recv_pkt_t recv_fn, trill_encap_pkt_t encap_fn,
1290 trill_br_dstr_t brdstr_fn, trill_ln_dstr_t lndstr_fn)
1291 {
1292 #ifdef DEBUG
1293 if (recv_fn == NULL && trill_recv_fn != NULL) {
1294 bridge_inst_t *bip;
1295 bridge_link_t *blp;
1296
1297 mutex_enter(&inst_lock);
1298 for (bip = list_head(&inst_list); bip != NULL;
1299 bip = list_next(&inst_list, bip)) {
1300 ASSERT(bip->bi_trilldata == NULL);
1301 rw_enter(&bip->bi_rwlock, RW_READER);
1302 for (blp = list_head(&bip->bi_links); blp != NULL;
1303 blp = list_next(&bip->bi_links, blp)) {
1304 ASSERT(blp->bl_trilldata == NULL);
1305 }
1306 rw_exit(&bip->bi_rwlock);
1307 }
1308 mutex_exit(&inst_lock);
1309 }
1310 #endif
1311 trill_recv_fn = recv_fn;
1312 trill_encap_fn = encap_fn;
1313 trill_brdstr_fn = brdstr_fn;
1314 trill_lndstr_fn = lndstr_fn;
1315 }
1316
1317 /*
1318 * This registers the TRILL instance pointer with a bridge. Before this
1319 * pointer is set, the forwarding, TRILL receive, and bridge destructor
1320 * functions won't be called.
1321 *
1322 * TRILL holds a reference on a bridge with this call. It must free the
1323 * reference by calling the unregister function below.
1324 */
1325 bridge_inst_t *
1326 bridge_trill_brref(const char *bname, void *ptr)
1327 {
1328 char bridge[MAXLINKNAMELEN];
1329 bridge_inst_t *bip;
1330
1331 (void) snprintf(bridge, MAXLINKNAMELEN, "%s0", bname);
1332 bip = bridge_find_name(bridge);
1333 if (bip != NULL) {
1334 ASSERT(bip->bi_trilldata == NULL && ptr != NULL);
1335 bip->bi_trilldata = ptr;
1336 }
1337 return (bip);
1338 }
1339
1340 void
1341 bridge_trill_brunref(bridge_inst_t *bip)
1342 {
1343 ASSERT(bip->bi_trilldata != NULL);
1344 bip->bi_trilldata = NULL;
1345 bridge_unref(bip);
1346 }
1347
1348 /*
1349 * TRILL calls this function when referencing a particular link on a bridge.
1350 *
1351 * It holds a reference on the link, so TRILL must clear out the reference when
1352 * it's done with the link (on unbinding).
1353 */
1354 bridge_link_t *
1355 bridge_trill_lnref(bridge_inst_t *bip, datalink_id_t linkid, void *ptr)
1356 {
1357 bridge_link_t *blp;
1358
1359 ASSERT(ptr != NULL);
1360 rw_enter(&bip->bi_rwlock, RW_READER);
1361 for (blp = list_head(&bip->bi_links); blp != NULL;
1362 blp = list_next(&bip->bi_links, blp)) {
1363 if (!(blp->bl_flags & BLF_DELETED) &&
1364 blp->bl_linkid == linkid && blp->bl_trilldata == NULL) {
1365 blp->bl_trilldata = ptr;
1366 blp->bl_flags &= ~BLF_TRILLACTIVE;
1367 (void) memset(blp->bl_afs, 0, sizeof (blp->bl_afs));
1368 atomic_inc_uint(&blp->bl_refs);
1369 break;
1370 }
1371 }
1372 rw_exit(&bip->bi_rwlock);
1373 return (blp);
1374 }
1375
1376 void
1377 bridge_trill_lnunref(bridge_link_t *blp)
1378 {
1379 mutex_enter(&blp->bl_trilllock);
1380 ASSERT(blp->bl_trilldata != NULL);
1381 blp->bl_trilldata = NULL;
1382 blp->bl_flags &= ~BLF_TRILLACTIVE;
1383 while (blp->bl_trillthreads > 0)
1384 cv_wait(&blp->bl_trillwait, &blp->bl_trilllock);
1385 mutex_exit(&blp->bl_trilllock);
1386 (void) memset(blp->bl_afs, 0xff, sizeof (blp->bl_afs));
1387 link_unref(blp);
1388 }
1389
1390 /*
1391 * This periodic timer performs three functions:
1392 * 1. It scans the list of learned forwarding entries, and removes ones that
1393 * haven't been heard from in a while. The time limit is backed down if
1394 * we're above the configured table limit.
1395 * 2. It walks the links and decays away the bl_learns counter.
1396 * 3. It scans the observability node entries looking for ones that can be
1397 * freed up.
1398 */
1399 /* ARGSUSED */
1400 static void
1401 bridge_timer(void *arg)
1402 {
1403 bridge_inst_t *bip;
1404 bridge_fwd_t *bfp, *bfnext;
1405 bridge_mac_t *bmp, *bmnext;
1406 bridge_link_t *blp;
1407 int err;
1408 datalink_id_t tmpid;
1409 avl_tree_t fwd_scavenge;
1410 clock_t age_limit;
1411 uint32_t ldecay;
1412
1413 avl_create(&fwd_scavenge, fwd_compare, sizeof (bridge_fwd_t),
1414 offsetof(bridge_fwd_t, bf_node));
1415 mutex_enter(&inst_lock);
1416 for (bip = list_head(&inst_list); bip != NULL;
1417 bip = list_next(&inst_list, bip)) {
1418 if (bip->bi_flags & BIF_SHUTDOWN)
1419 continue;
1420 rw_enter(&bip->bi_rwlock, RW_WRITER);
1421 /* compute scaled maximum age based on table limit */
1422 if (avl_numnodes(&bip->bi_fwd) > bip->bi_tablemax)
1423 bip->bi_tshift++;
1424 else
1425 bip->bi_tshift = 0;
1426 if ((age_limit = bridge_fwd_age >> bip->bi_tshift) == 0) {
1427 if (bip->bi_tshift != 0)
1428 bip->bi_tshift--;
1429 age_limit = 1;
1430 }
1431 bfnext = avl_first(&bip->bi_fwd);
1432 while ((bfp = bfnext) != NULL) {
1433 bfnext = AVL_NEXT(&bip->bi_fwd, bfp);
1434 if (!(bfp->bf_flags & BFF_LOCALADDR) &&
1435 (ddi_get_lbolt() - bfp->bf_lastheard) > age_limit) {
1436 ASSERT(bfp->bf_flags & BFF_INTREE);
1437 avl_remove(&bip->bi_fwd, bfp);
1438 bfp->bf_flags &= ~BFF_INTREE;
1439 avl_add(&fwd_scavenge, bfp);
1440 }
1441 }
1442 for (blp = list_head(&bip->bi_links); blp != NULL;
1443 blp = list_next(&bip->bi_links, blp)) {
1444 ldecay = mac_get_ldecay(blp->bl_mh);
1445 if (ldecay >= blp->bl_learns)
1446 blp->bl_learns = 0;
1447 else
1448 atomic_add_int(&blp->bl_learns, -(int)ldecay);
1449 }
1450 rw_exit(&bip->bi_rwlock);
1451 bfnext = avl_first(&fwd_scavenge);
1452 while ((bfp = bfnext) != NULL) {
1453 bfnext = AVL_NEXT(&fwd_scavenge, bfp);
1454 avl_remove(&fwd_scavenge, bfp);
1455 KIINCR(bki_expire);
1456 fwd_unref(bfp); /* drop tree reference */
1457 }
1458 }
1459 mutex_exit(&inst_lock);
1460 avl_destroy(&fwd_scavenge);
1461
1462 /*
1463 * Scan the bridge_mac_t entries and try to free up the ones that are
1464 * no longer active. This must be done by polling, as neither DLS nor
1465 * MAC provides a driver any sort of positive control over clients.
1466 */
1467 rw_enter(&bmac_rwlock, RW_WRITER);
1468 bmnext = list_head(&bmac_list);
1469 while ((bmp = bmnext) != NULL) {
1470 bmnext = list_next(&bmac_list, bmp);
1471
1472 /* ignore active bridges */
1473 if (bmp->bm_inst != NULL)
1474 continue;
1475
1476 if (bmp->bm_flags & BMF_DLS) {
1477 err = dls_devnet_destroy(bmp->bm_mh, &tmpid, B_FALSE);
1478 ASSERT(err == 0 || err == EBUSY);
1479 if (err == 0)
1480 bmp->bm_flags &= ~BMF_DLS;
1481 }
1482
1483 if (!(bmp->bm_flags & BMF_DLS)) {
1484 err = mac_unregister(bmp->bm_mh);
1485 ASSERT(err == 0 || err == EBUSY);
1486 if (err == 0) {
1487 list_remove(&bmac_list, bmp);
1488 kmem_free(bmp, sizeof (*bmp));
1489 }
1490 }
1491 }
1492 if (list_is_empty(&bmac_list)) {
1493 bridge_timerid = 0;
1494 } else {
1495 bridge_timerid = timeout(bridge_timer, NULL,
1496 bridge_scan_interval);
1497 }
1498 rw_exit(&bmac_rwlock);
1499 }
1500
1501 static int
1502 bridge_open(queue_t *rq, dev_t *devp, int oflag, int sflag, cred_t *credp)
1503 {
1504 bridge_stream_t *bsp;
1505
1506 if (rq->q_ptr != NULL)
1507 return (0);
1508
1509 if (sflag & MODOPEN)
1510 return (EINVAL);
1511
1512 /*
1513 * Check the minor node number being opened. This tells us which
1514 * bridge instance the user wants.
1515 */
1516 if (getminor(*devp) != 0) {
1517 /*
1518 * This is a regular DLPI stream for snoop or the like.
1519 * Redirect it through DLD.
1520 */
1521 rq->q_qinfo = &bridge_dld_rinit;
1522 OTHERQ(rq)->q_qinfo = &bridge_dld_winit;
1523 return (dld_open(rq, devp, oflag, sflag, credp));
1524 } else {
1525 /*
1526 * Allocate the bridge control stream structure.
1527 */
1528 if ((bsp = stream_alloc()) == NULL)
1529 return (ENOSR);
1530 rq->q_ptr = WR(rq)->q_ptr = (caddr_t)bsp;
1531 bsp->bs_wq = WR(rq);
1532 *devp = makedevice(getmajor(*devp), bsp->bs_minor);
1533 qprocson(rq);
1534 return (0);
1535 }
1536 }
1537
1538 /*
1539 * This is used only for bridge control streams. DLPI goes through dld
1540 * instead.
1541 */
1542 static int
1543 bridge_close(queue_t *rq)
1544 {
1545 bridge_stream_t *bsp = rq->q_ptr;
1546 bridge_inst_t *bip;
1547
1548 /*
1549 * Wait for any stray taskq (add/delete link) entries related to this
1550 * stream to leave the system.
1551 */
1552 mutex_enter(&stream_ref_lock);
1553 while (bsp->bs_taskq_cnt != 0)
1554 cv_wait(&stream_ref_cv, &stream_ref_lock);
1555 mutex_exit(&stream_ref_lock);
1556
1557 qprocsoff(rq);
1558 if ((bip = bsp->bs_inst) != NULL)
1559 shutdown_inst(bip);
1560 rq->q_ptr = WR(rq)->q_ptr = NULL;
1561 stream_free(bsp);
1562 if (bip != NULL)
1563 bridge_unref(bip);
1564
1565 return (0);
1566 }
1567
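/*
 * Source address learning. Refresh the timestamp on entries we already
 * know, ignore multicast sources and fixed local addresses, and treat a
 * link that is learning too quickly as being under attack: such addresses
 * are unlearned rather than added. A known source that shows up on a new
 * link or from a new TRILL nickname replaces the old entry (a "move"),
 * while one seen with a different VLAN gets a separate VLAN-local (IVL)
 * entry.
 */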
1568 static void
1569 bridge_learn(bridge_link_t *blp, const uint8_t *saddr, uint16_t ingress_nick,
1570 uint16_t vlanid)
1571 {
1572 bridge_inst_t *bip = blp->bl_inst;
1573 bridge_fwd_t *bfp, *bfpnew;
1574 int i;
1575 boolean_t replaced = B_FALSE;
1576
1577 /* Ignore multi-destination address used as source; it's nonsense. */
1578 if (*saddr & 1)
1579 return;
1580
1581 /*
1582 * If the source is known, then check whether it belongs on this link.
1583 * If not, and this isn't a fixed local address, then we've detected a
1584 * move. If it's not known, learn it.
1585 */
1586 if ((bfp = fwd_find(bip, saddr, vlanid)) != NULL) {
1587 /*
1588 * If the packet has a fixed local source address, then there's
1589 * nothing we can learn. We must quit. If this was a received
1590 * packet, then the sender has stolen our address, but there's
1591 * nothing we can do. If it's a transmitted packet, then
1592 * that's the normal case.
1593 */
1594 if (bfp->bf_flags & BFF_LOCALADDR) {
1595 fwd_unref(bfp);
1596 return;
1597 }
1598
1599 /*
1600 * Check if the link (and TRILL sender, if any) being used is
1601 * among the ones registered for this address. If so, then
1602 * this is information that we already know.
1603 */
1604 if (bfp->bf_trill_nick == ingress_nick) {
1605 for (i = 0; i < bfp->bf_nlinks; i++) {
1606 if (bfp->bf_links[i] == blp) {
1607 bfp->bf_lastheard = ddi_get_lbolt();
1608 fwd_unref(bfp);
1609 return;
1610 }
1611 }
1612 }
1613 }
1614
1615 /*
1616 * Note that we intentionally "unlearn" things that appear to be under
1617 * attack on this link. The forwarding cache is a negative thing for
1618 * security -- it disables reachability as a performance optimization
1619 * -- so leaving out entries optimizes for success and defends against
1620 * the attack. Thus, the bare increment without a check in the delete
1621 * code above is right. (And it's ok if we skid over the limit a
1622 * little, so there's no synchronization needed on the test.)
1623 */
1624 if (blp->bl_learns >= mac_get_llimit(blp->bl_mh)) {
1625 if (bfp != NULL) {
1626 if (bfp->bf_vcnt == 0)
1627 fwd_delete(bfp);
1628 fwd_unref(bfp);
1629 }
1630 return;
1631 }
1632
1633 atomic_inc_uint(&blp->bl_learns);
1634
1635 if ((bfpnew = fwd_alloc(saddr, 1, ingress_nick)) == NULL) {
1636 if (bfp != NULL)
1637 fwd_unref(bfp);
1638 return;
1639 }
1640 KIINCR(bki_count);
1641
1642 if (bfp != NULL) {
1643 /*
1644 * If this is a new destination for the same VLAN, then delete
1645 * so that we can update. If it's a different VLAN, then we're
1646 * not going to delete the original. Split off instead into an
1647 * IVL entry.
1648 */
1649 if (bfp->bf_vlanid == vlanid) {
1650 /* save the count of IVL duplicates */
1651 bfpnew->bf_vcnt = bfp->bf_vcnt;
1652
1653 /* entry deletes count as learning events */
1654 atomic_inc_uint(&blp->bl_learns);
1655
1656 /* destroy and create anew; node moved */
1657 fwd_delete(bfp);
1658 replaced = B_TRUE;
1659 KIINCR(bki_moved);
1660 } else {
1661 bfp->bf_vcnt++;
1662 bfpnew->bf_flags |= BFF_VLANLOCAL;
1663 }
1664 fwd_unref(bfp);
1665 }
1666 bfpnew->bf_links[0] = blp;
1667 bfpnew->bf_nlinks = 1;
1668 atomic_inc_uint(&blp->bl_refs); /* bf_links entry */
1669 if (!fwd_insert(bip, bfpnew))
1670 fwd_free(bfpnew);
1671 else if (!replaced)
1672 KIINCR(bki_source);
1673 }
1674
1675 /*
1676 * Process the VLAN headers for output on a given link. There are several
1677 * cases (noting that we don't map VLANs):
1678 * 1. The input packet is good as it is; either
1679 * a. It has no tag, and output has same PVID
1680 * b. It has a non-zero priority-only tag for PVID, and b_band is same
1681 * c. It has a tag with VLAN different from PVID, and b_band is same
1682 * 2. The tag must change: non-zero b_band is different from tag priority
1683 * 3. The packet has a tag and should not (VLAN same as PVID, b_band zero)
1684 * 4. The packet has no tag and needs one:
1685 * a. VLAN ID same as PVID, but b_band is non-zero
1686 * b. VLAN ID different from PVID
1687 * We exclude case 1 first, then modify the packet. Note that output packets
1688 * get a priority set by the mblk, not by the header, because QoS in bridging
1689 * requires priority recalculation at each node.
1690 *
1691 * The passed-in tci is the "impossible" value 0xFFFF when no tag is present.
1692 */
1693 static mblk_t *
1694 reform_vlan_header(mblk_t *mp, uint16_t vlanid, uint16_t tci, uint16_t pvid)
1695 {
1696 boolean_t source_has_tag = (tci != 0xFFFF);
1697 mblk_t *mpcopy;
1698 size_t mlen, minlen;
1699 struct ether_vlan_header *evh;
1700 int pri;
1701
1702 /* This helps centralize error handling in the caller. */
1703 if (mp == NULL)
1704 return (mp);
1705
1706 /* No forwarded packet can have hardware checksum enabled */
1707 DB_CKSUMFLAGS(mp) = 0;
1708
1709 /* Get the no-modification cases out of the way first */
1710 if (!source_has_tag && vlanid == pvid) /* 1a */
1711 return (mp);
1712
1713 pri = VLAN_PRI(tci);
1714 if (source_has_tag && mp->b_band == pri) {
1715 if (vlanid != pvid) /* 1c */
1716 return (mp);
1717 if (pri != 0 && VLAN_ID(tci) == 0) /* 1b */
1718 return (mp);
1719 }
1720
1721 /*
1722 * We now know that we must modify the packet. Prepare for that. Note
1723 * that if a tag is present, the caller has already done a pullup for
1724 * the VLAN header, so we're good to go.
1725 */
1726 if (MBLKL(mp) < sizeof (struct ether_header)) {
1727 mpcopy = msgpullup(mp, sizeof (struct ether_header));
1728 if (mpcopy == NULL) {
1729 freemsg(mp);
1730 return (NULL);
1731 }
1732 mp = mpcopy;
1733 }
1734 if (DB_REF(mp) > 1 || !IS_P2ALIGNED(mp->b_rptr, sizeof (uint16_t)) ||
1735 (!source_has_tag && MBLKTAIL(mp) < VLAN_INCR)) {
1736 minlen = mlen = MBLKL(mp);
1737 if (!source_has_tag)
1738 minlen += VLAN_INCR;
1739 ASSERT(minlen >= sizeof (struct ether_vlan_header));
1740 /*
1741 * We're willing to copy some data to avoid fragmentation, but
1742 * not a lot.
1743 */
1744 if (minlen > 256)
1745 minlen = sizeof (struct ether_vlan_header);
1746 mpcopy = allocb(minlen, BPRI_MED);
1747 if (mpcopy == NULL) {
1748 freemsg(mp);
1749 return (NULL);
1750 }
1751 if (mlen <= minlen) {
1752 /* We toss the first mblk when we can. */
1753 bcopy(mp->b_rptr, mpcopy->b_rptr, mlen);
1754 mpcopy->b_wptr += mlen;
1755 mpcopy->b_cont = mp->b_cont;
1756 freeb(mp);
1757 } else {
1758 /* If not, then just copy what we need */
1759 if (!source_has_tag)
1760 minlen = sizeof (struct ether_header);
1761 bcopy(mp->b_rptr, mpcopy->b_rptr, minlen);
1762 mpcopy->b_wptr += minlen;
1763 mpcopy->b_cont = mp;
1764 mp->b_rptr += minlen;
1765 }
1766 mp = mpcopy;
1767 }
1768
1769 /* LINTED: pointer alignment */
1770 evh = (struct ether_vlan_header *)mp->b_rptr;
1771 if (source_has_tag) {
1772 if (mp->b_band == 0 && vlanid == pvid) { /* 3 */
1773 evh->ether_tpid = evh->ether_type;
1774 mlen = MBLKL(mp);
1775 if (mlen > sizeof (struct ether_vlan_header))
1776 ovbcopy(mp->b_rptr +
1777 sizeof (struct ether_vlan_header),
1778 mp->b_rptr + sizeof (struct ether_header),
1779 mlen - sizeof (struct ether_vlan_header));
1780 mp->b_wptr -= VLAN_INCR;
1781 } else { /* 2 */
1782 if (vlanid == pvid)
1783 vlanid = VLAN_ID_NONE;
1784 tci = VLAN_TCI(mp->b_band, ETHER_CFI, vlanid);
1785 evh->ether_tci = htons(tci);
1786 }
1787 } else {
1788 /* case 4: no header present, but one is needed */
1789 mlen = MBLKL(mp);
1790 if (mlen > sizeof (struct ether_header))
1791 ovbcopy(mp->b_rptr + sizeof (struct ether_header),
1792 mp->b_rptr + sizeof (struct ether_vlan_header),
1793 mlen - sizeof (struct ether_header));
1794 mp->b_wptr += VLAN_INCR;
1795 ASSERT(mp->b_wptr <= DB_LIM(mp));
1796 if (vlanid == pvid)
1797 vlanid = VLAN_ID_NONE;
1798 tci = VLAN_TCI(mp->b_band, ETHER_CFI, vlanid);
1799 evh->ether_type = evh->ether_tpid;
1800 evh->ether_tpid = htons(ETHERTYPE_VLAN);
1801 evh->ether_tci = htons(tci);
1802 }
1803 return (mp);
1804 }
1805
1806 /* Record VLAN information and strip the header if requested. */
1807 static void
1808 update_header(mblk_t *mp, mac_header_info_t *hdr_info, boolean_t striphdr)
1809 {
1810 if (hdr_info->mhi_bindsap == ETHERTYPE_VLAN) {
1811 struct ether_vlan_header *evhp;
1812 uint16_t ether_type;
1813
1814 /* LINTED: alignment */
1815 evhp = (struct ether_vlan_header *)mp->b_rptr;
1816 hdr_info->mhi_istagged = B_TRUE;
1817 hdr_info->mhi_tci = ntohs(evhp->ether_tci);
1818 if (striphdr) {
1819 /*
1820 * For VLAN tagged frames update the ether_type
1821 * in hdr_info before stripping the header.
1822 */
1823 ether_type = ntohs(evhp->ether_type);
1824 hdr_info->mhi_origsap = ether_type;
1825 hdr_info->mhi_bindsap = (ether_type > ETHERMTU) ?
1826 ether_type : DLS_SAP_LLC;
1827 mp->b_rptr = (uchar_t *)(evhp + 1);
1828 }
1829 } else {
1830 hdr_info->mhi_istagged = B_FALSE;
1831 hdr_info->mhi_tci = VLAN_ID_NONE;
1832 if (striphdr)
1833 mp->b_rptr += sizeof (struct ether_header);
1834 }
1835 }
1836
1837 /*
1838 * Return B_TRUE if we're allowed to send on this link with the given VLAN ID.
1839 */
1840 static boolean_t
1841 bridge_can_send(bridge_link_t *blp, uint16_t vlanid)
1842 {
1843 ASSERT(vlanid != VLAN_ID_NONE);
1844 if (blp->bl_flags & BLF_DELETED)
1845 return (B_FALSE);
1846 if (blp->bl_trilldata == NULL && blp->bl_state != BLS_FORWARDING)
1847 return (B_FALSE);
1848 return (BRIDGE_VLAN_ISSET(blp, vlanid) && BRIDGE_AF_ISSET(blp, vlanid));
1849 }
1850
1851 /*
1852 * This function scans the bridge forwarding tables in order to forward a given
1853 * packet. If the packet either doesn't need forwarding (the current link is
1854 * correct) or the current link needs a copy as well, then the packet is
1855 * returned to the caller.
1856 *
1857 * If a packet has been decapsulated from TRILL, then it must *NOT* reenter a
1858 * TRILL tunnel. If the destination points there, then drop instead.
1859 */
1860 static mblk_t *
1861 bridge_forward(bridge_link_t *blp, mac_header_info_t *hdr_info, mblk_t *mp,
1862 uint16_t vlanid, uint16_t tci, boolean_t from_trill, boolean_t is_xmit)
1863 {
1864 mblk_t *mpsend, *mpcopy;
1865 bridge_inst_t *bip = blp->bl_inst;
1866 bridge_link_t *blpsend, *blpnext;
1867 bridge_fwd_t *bfp;
1868 uint_t i;
1869 boolean_t selfseen = B_FALSE;
1870 void *tdp;
1871 const uint8_t *daddr = hdr_info->mhi_daddr;
1872
1873 /*
1874 * Check for the IEEE "reserved" multicast addresses. Messages sent to
1875 * these addresses are used for link-local control (STP and pause), and
1876 * are never forwarded or redirected.
1877 */
1878 if (daddr[0] == 1 && daddr[1] == 0x80 && daddr[2] == 0xc2 &&
1879 daddr[3] == 0 && daddr[4] == 0 && (daddr[5] & 0xf0) == 0) {
1880 if (from_trill) {
1881 freemsg(mp);
1882 mp = NULL;
1883 }
1884 return (mp);
1885 }
1886
1887 if ((bfp = fwd_find(bip, daddr, vlanid)) != NULL) {
1888
1889 /*
1890 * If trill indicates a destination for this node, then it's
1891 * clearly not intended for local delivery. We must tell TRILL
1892 * to encapsulate, as long as we didn't just decapsulate it.
1893 */
1894 if (bfp->bf_trill_nick != RBRIDGE_NICKNAME_NONE) {
1895 /*
1896 * Error case: can't reencapsulate if the protocols are
1897 * working correctly.
1898 */
1899 if (from_trill) {
1900 freemsg(mp);
1901 return (NULL);
1902 }
1903 mutex_enter(&blp->bl_trilllock);
1904 if ((tdp = blp->bl_trilldata) != NULL) {
1905 blp->bl_trillthreads++;
1906 mutex_exit(&blp->bl_trilllock);
1907 update_header(mp, hdr_info, B_FALSE);
1908 if (is_xmit)
1909 mp = mac_fix_cksum(mp);
1910 /* all trill data frames have Inner.VLAN */
1911 mp = reform_vlan_header(mp, vlanid, tci, 0);
1912 if (mp == NULL) {
1913 KIINCR(bki_drops);
1914 fwd_unref(bfp);
1915 return (NULL);
1916 }
1917 trill_encap_fn(tdp, blp, hdr_info, mp,
1918 bfp->bf_trill_nick);
1919 mutex_enter(&blp->bl_trilllock);
1920 if (--blp->bl_trillthreads == 0 &&
1921 blp->bl_trilldata == NULL)
1922 cv_broadcast(&blp->bl_trillwait);
1923 }
1924 mutex_exit(&blp->bl_trilllock);
1925
1926 /* if TRILL has been disabled, then kill this stray */
1927 if (tdp == NULL) {
1928 freemsg(mp);
1929 fwd_delete(bfp);
1930 }
1931 fwd_unref(bfp);
1932 return (NULL);
1933 }
1934
1935 /* find first link we can send on */
1936 for (i = 0; i < bfp->bf_nlinks; i++) {
1937 blpsend = bfp->bf_links[i];
1938 if (blpsend == blp)
1939 selfseen = B_TRUE;
1940 else if (bridge_can_send(blpsend, vlanid))
1941 break;
1942 }
1943
1944 while (i < bfp->bf_nlinks) {
1945 blpsend = bfp->bf_links[i];
1946 for (i++; i < bfp->bf_nlinks; i++) {
1947 blpnext = bfp->bf_links[i];
1948 if (blpnext == blp)
1949 selfseen = B_TRUE;
1950 else if (bridge_can_send(blpnext, vlanid))
1951 break;
1952 }
1953 if (i == bfp->bf_nlinks && !selfseen) {
1954 mpsend = mp;
1955 mp = NULL;
1956 } else {
1957 mpsend = copymsg(mp);
1958 }
1959
1960 if (!from_trill && is_xmit)
1961 mpsend = mac_fix_cksum(mpsend);
1962
1963 mpsend = reform_vlan_header(mpsend, vlanid, tci,
1964 blpsend->bl_pvid);
1965 if (mpsend == NULL) {
1966 KIINCR(bki_drops);
1967 continue;
1968 }
1969
1970 KIINCR(bki_forwards);
1971 /*
1972 * No need to bump up the link reference count, as
1973 * the forwarding entry itself holds a reference to
1974 * the link.
1975 */
1976 if (bfp->bf_flags & BFF_LOCALADDR) {
1977 mac_rx_common(blpsend->bl_mh, NULL, mpsend);
1978 } else {
1979 KLPINCR(blpsend, bkl_xmit);
1980 MAC_RING_TX(blpsend->bl_mh, NULL, mpsend,
1981 mpsend);
1982 freemsg(mpsend);
1983 }
1984 }
1985 /*
1986 * Handle a special case: if we're transmitting to the original
1987 * link, then check whether the localaddr flag is set. If it
1988 * is, then receive instead. This doesn't happen with ordinary
1989 * bridging, but does happen often with TRILL decapsulation.
1990 */
1991 if (mp != NULL && is_xmit && (bfp->bf_flags & BFF_LOCALADDR)) {
1992 mac_rx_common(blp->bl_mh, NULL, mp);
1993 mp = NULL;
1994 }
1995 fwd_unref(bfp);
1996 } else {
1997 /*
1998 * TRILL has two cases to handle. If the packet is off the
1999 * wire (not from TRILL), then we need to send up into the
2000 * TRILL module to have the distribution tree computed. If the
2001 * packet is from TRILL (decapsulated), then we're part of the
2002 * distribution tree, and we need to copy the packet on member
2003 * interfaces.
2004 *
2005 * Thus, the from TRILL case is identical to the STP case.
2006 */
2007 if (!from_trill && blp->bl_trilldata != NULL) {
2008 mutex_enter(&blp->bl_trilllock);
2009 if ((tdp = blp->bl_trilldata) != NULL) {
2010 blp->bl_trillthreads++;
2011 mutex_exit(&blp->bl_trilllock);
2012 if ((mpsend = copymsg(mp)) != NULL) {
2013 update_header(mpsend,
2014 hdr_info, B_FALSE);
2015 /*
2016 * all trill data frames have
2017 * Inner.VLAN
2018 */
2019 mpsend = reform_vlan_header(mpsend,
2020 vlanid, tci, 0);
2021 if (mpsend == NULL) {
2022 KIINCR(bki_drops);
2023 } else {
2024 trill_encap_fn(tdp, blp,
2025 hdr_info, mpsend,
2026 RBRIDGE_NICKNAME_NONE);
2027 }
2028 }
2029 mutex_enter(&blp->bl_trilllock);
2030 if (--blp->bl_trillthreads == 0 &&
2031 blp->bl_trilldata == NULL)
2032 cv_broadcast(&blp->bl_trillwait);
2033 }
2034 mutex_exit(&blp->bl_trilllock);
2035 }
2036
2037 /*
2038 * This is an unknown destination, so flood.
2039 */
2040 rw_enter(&bip->bi_rwlock, RW_READER);
2041 for (blpnext = list_head(&bip->bi_links); blpnext != NULL;
2042 blpnext = list_next(&bip->bi_links, blpnext)) {
2043 if (blpnext == blp)
2044 selfseen = B_TRUE;
2045 else if (bridge_can_send(blpnext, vlanid))
2046 break;
2047 }
2048 if (blpnext != NULL)
2049 atomic_inc_uint(&blpnext->bl_refs);
2050 rw_exit(&bip->bi_rwlock);
2051 while ((blpsend = blpnext) != NULL) {
2052 rw_enter(&bip->bi_rwlock, RW_READER);
2053 for (blpnext = list_next(&bip->bi_links, blpsend);
2054 blpnext != NULL;
2055 blpnext = list_next(&bip->bi_links, blpnext)) {
2056 if (blpnext == blp)
2057 selfseen = B_TRUE;
2058 else if (bridge_can_send(blpnext, vlanid))
2059 break;
2060 }
2061 if (blpnext != NULL)
2062 atomic_inc_uint(&blpnext->bl_refs);
2063 rw_exit(&bip->bi_rwlock);
2064 if (blpnext == NULL && !selfseen) {
2065 mpsend = mp;
2066 mp = NULL;
2067 } else {
2068 mpsend = copymsg(mp);
2069 }
2070
2071 if (!from_trill && is_xmit)
2072 mpsend = mac_fix_cksum(mpsend);
2073
2074 mpsend = reform_vlan_header(mpsend, vlanid, tci,
2075 blpsend->bl_pvid);
2076 if (mpsend == NULL) {
2077 KIINCR(bki_drops);
2078 continue;
2079 }
2080
2081 if (hdr_info->mhi_dsttype == MAC_ADDRTYPE_UNICAST)
2082 KIINCR(bki_unknown);
2083 else
2084 KIINCR(bki_mbcast);
2085 KLPINCR(blpsend, bkl_xmit);
2086 if ((mpcopy = copymsg(mpsend)) != NULL)
2087 mac_rx_common(blpsend->bl_mh, NULL, mpcopy);
2088 MAC_RING_TX(blpsend->bl_mh, NULL, mpsend, mpsend);
2089 freemsg(mpsend);
2090 link_unref(blpsend);
2091 }
2092 }
2093
2094 /*
2095 	 * At this point, if mp is non-NULL, it means that the caller needs to
2096 	 * continue with the packet on the original link.
2097 */
2098 return (mp);
2099 }
2100
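/*
 * A note on the forwarding loops above (illustrative pseudo-C, with
 * hypothetical helpers standing in for the inline link scans and the
 * MAC_RING_TX / mac_rx_common calls): the code always looks one eligible
 * link ahead so that every recipient except the last gets a copymsg(),
 * while the final recipient consumes the original mblk and no copy is
 * wasted.
 *
 *	cur = first_eligible(vlanid);
 *	while (cur != NULL) {
 *		next = next_eligible(cur, vlanid);
 *		if (next == NULL && !selfseen) {
 *			tosend = mp;		-- hand off the original
 *			mp = NULL;
 *		} else {
 *			tosend = copymsg(mp);
 *		}
 *		if (tosend != NULL)
 *			transmit(cur, tosend);
 *		cur = next;
 *	}
 *
 * If mp is still non-NULL afterwards, the caller owns it and must handle
 * delivery on the original link itself.
 */
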
2101 /*
2102 * Extract and validate the VLAN information for a given packet. This checks
2103 * conformance with the rules for use of the PVID on the link, and for the
2104 * allowed (configured) VLAN set.
2105 *
2106 * Returns B_TRUE if the packet passes, B_FALSE if it fails.
2107 */
2108 static boolean_t
2109 bridge_get_vlan(bridge_link_t *blp, mac_header_info_t *hdr_info, mblk_t *mp,
2110 uint16_t *vlanidp, uint16_t *tcip)
2111 {
2112 uint16_t tci, vlanid;
2113
2114 if (hdr_info->mhi_bindsap == ETHERTYPE_VLAN) {
2115 ptrdiff_t tpos = offsetof(struct ether_vlan_header, ether_tci);
2116 ptrdiff_t mlen;
2117
2118 /*
2119 * Extract the VLAN ID information, regardless of alignment,
2120 * and without a pullup. This isn't attractive, but we do this
2121 * to avoid having to deal with the pointers stashed in
2122 * hdr_info moving around or having the caller deal with a new
2123 * mblk_t pointer.
2124 */
2125 while (mp != NULL) {
2126 mlen = MBLKL(mp);
2127 if (mlen > tpos && mlen > 0)
2128 break;
2129 tpos -= mlen;
2130 mp = mp->b_cont;
2131 }
2132 if (mp == NULL)
2133 return (B_FALSE);
2134 tci = mp->b_rptr[tpos] << 8;
2135 if (++tpos >= mlen) {
2136 do {
2137 mp = mp->b_cont;
2138 } while (mp != NULL && MBLKL(mp) == 0);
2139 if (mp == NULL)
2140 return (B_FALSE);
2141 tpos = 0;
2142 }
2143 tci |= mp->b_rptr[tpos];
2144
2145 vlanid = VLAN_ID(tci);
2146 if (VLAN_CFI(tci) != ETHER_CFI || vlanid > VLAN_ID_MAX)
2147 return (B_FALSE);
2148 if (vlanid == VLAN_ID_NONE || vlanid == blp->bl_pvid)
2149 goto input_no_vlan;
2150 if (!BRIDGE_VLAN_ISSET(blp, vlanid))
2151 return (B_FALSE);
2152 } else {
2153 tci = 0xFFFF;
2154 input_no_vlan:
2155 /*
2156 * If PVID is set to zero, then untagged traffic is not
2157 * supported here. Do not learn or forward.
2158 */
2159 if ((vlanid = blp->bl_pvid) == VLAN_ID_NONE)
2160 return (B_FALSE);
2161 }
2162
2163 *tcip = tci;
2164 *vlanidp = vlanid;
2165 return (B_TRUE);
2166 }
2167
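/*
 * Illustrative sketch, assuming a contiguous and aligned header, of what the
 * mblk-walking extraction above reduces to in the common case:
 *
 *	struct ether_vlan_header *evhp =
 *	    (struct ether_vlan_header *)(void *)mp->b_rptr;
 *	uint16_t tci = ntohs(evhp->ether_tci);
 *	uint16_t vid = VLAN_ID(tci);
 *
 * Untagged frames, priority-tagged frames (VLAN ID zero) and frames tagged
 * with the link's own PVID are all classified to bl_pvid; a PVID of zero
 * means untagged traffic is neither learned nor forwarded.
 */
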
2168 /*
2169 * Handle MAC notifications.
2170 */
2171 static void
2172 bridge_notify_cb(void *arg, mac_notify_type_t note_type)
2173 {
2174 bridge_link_t *blp = arg;
2175
2176 switch (note_type) {
2177 case MAC_NOTE_UNICST:
2178 bridge_new_unicst(blp);
2179 break;
2180
2181 case MAC_NOTE_SDU_SIZE: {
2182 uint_t maxsdu;
2183 bridge_inst_t *bip = blp->bl_inst;
2184 bridge_mac_t *bmp = bip->bi_mac;
2185 boolean_t notify = B_FALSE;
2186 mblk_t *mlist = NULL;
2187
2188 mac_sdu_get(blp->bl_mh, NULL, &maxsdu);
2189 rw_enter(&bip->bi_rwlock, RW_READER);
2190 if (list_prev(&bip->bi_links, blp) == NULL &&
2191 list_next(&bip->bi_links, blp) == NULL) {
2192 notify = (maxsdu != bmp->bm_maxsdu);
2193 bmp->bm_maxsdu = maxsdu;
2194 }
2195 blp->bl_maxsdu = maxsdu;
2196 if (maxsdu != bmp->bm_maxsdu)
2197 link_sdu_fail(blp, B_TRUE, &mlist);
2198 else if (notify)
2199 (void) mac_maxsdu_update(bmp->bm_mh, maxsdu);
2200 rw_exit(&bip->bi_rwlock);
2201 send_up_messages(bip, mlist);
2202 break;
2203 }
2204 }
2205 }
2206
2207 /*
2208 * This is called by the MAC layer. As with the transmit side, we're right in
2209 * the data path for all I/O on this port, so if we don't need to forward this
2210 * packet anywhere, we have to send it upwards via mac_rx_common.
2211 */
2212 static void
2213 bridge_recv_cb(mac_handle_t mh, mac_resource_handle_t rsrc, mblk_t *mpnext)
2214 {
2215 mblk_t *mp, *mpcopy;
2216 bridge_link_t *blp = (bridge_link_t *)mh;
2217 bridge_inst_t *bip = blp->bl_inst;
2218 bridge_mac_t *bmp = bip->bi_mac;
2219 mac_header_info_t hdr_info;
2220 uint16_t vlanid, tci;
2221 boolean_t trillmode = B_FALSE;
2222
2223 KIINCR(bki_recv);
2224 KLINCR(bkl_recv);
2225
2226 /*
2227 * Regardless of state, check for inbound TRILL packets when TRILL is
2228 * active. These are pulled out of band and sent for TRILL handling.
2229 */
2230 if (blp->bl_trilldata != NULL) {
2231 void *tdp;
2232 mblk_t *newhead;
2233 mblk_t *tail = NULL;
2234
2235 mutex_enter(&blp->bl_trilllock);
2236 if ((tdp = blp->bl_trilldata) != NULL) {
2237 blp->bl_trillthreads++;
2238 mutex_exit(&blp->bl_trilllock);
2239 trillmode = B_TRUE;
2240 newhead = mpnext;
2241 while ((mp = mpnext) != NULL) {
2242 boolean_t raw_isis, bridge_group;
2243
2244 mpnext = mp->b_next;
2245
2246 /*
2247 * If the header isn't readable, then leave on
2248 * the list and continue.
2249 */
2250 if (mac_header_info(blp->bl_mh, mp,
2251 &hdr_info) != 0) {
2252 tail = mp;
2253 continue;
2254 }
2255
2256 /*
2257 * The TRILL document specifies that, on
2258 * Ethernet alone, IS-IS packets arrive with
2259 * LLC rather than Ethertype, and using a
2260 * specific destination address. We must check
2261 * for that here. Also, we need to give BPDUs
2262 * to TRILL for processing.
2263 */
2264 raw_isis = bridge_group = B_FALSE;
2265 if (hdr_info.mhi_dsttype ==
2266 MAC_ADDRTYPE_MULTICAST) {
2267 if (memcmp(hdr_info.mhi_daddr,
2268 all_isis_rbridges, ETHERADDRL) == 0)
2269 raw_isis = B_TRUE;
2270 else if (memcmp(hdr_info.mhi_daddr,
2271 bridge_group_address, ETHERADDRL) ==
2272 0)
2273 bridge_group = B_TRUE;
2274 }
2275 if (!raw_isis && !bridge_group &&
2276 hdr_info.mhi_bindsap != ETHERTYPE_TRILL &&
2277 (hdr_info.mhi_bindsap != ETHERTYPE_VLAN ||
2278 /* LINTED: alignment */
2279 ((struct ether_vlan_header *)mp->b_rptr)->
2280 ether_type != htons(ETHERTYPE_TRILL))) {
2281 tail = mp;
2282 continue;
2283 }
2284
2285 /*
2286 * We've got TRILL input. Remove from the list
2287 * and send up through the TRILL module. (Send
2288 * a copy through promiscuous receive just to
2289 * support snooping on TRILL. Order isn't
2290 * preserved strictly, but that doesn't matter
2291 * here.)
2292 */
2293 if (tail != NULL)
2294 tail->b_next = mpnext;
2295 mp->b_next = NULL;
2296 if (mp == newhead)
2297 newhead = mpnext;
2298 mac_trill_snoop(blp->bl_mh, mp);
2299 update_header(mp, &hdr_info, B_TRUE);
2300 /*
2301 * On raw IS-IS and BPDU frames, we have to
2302 * make sure that the length is trimmed
2303 * properly. We use origsap in order to cope
2304 * with jumbograms for IS-IS. (Regular mac
2305 * can't.)
2306 */
2307 if (raw_isis || bridge_group) {
2308 size_t msglen = msgdsize(mp);
2309
2310 if (msglen > hdr_info.mhi_origsap) {
2311 (void) adjmsg(mp,
2312 hdr_info.mhi_origsap -
2313 msglen);
2314 } else if (msglen <
2315 hdr_info.mhi_origsap) {
2316 freemsg(mp);
2317 continue;
2318 }
2319 }
2320 trill_recv_fn(tdp, blp, rsrc, mp, &hdr_info);
2321 }
2322 mpnext = newhead;
2323 mutex_enter(&blp->bl_trilllock);
2324 if (--blp->bl_trillthreads == 0 &&
2325 blp->bl_trilldata == NULL)
2326 cv_broadcast(&blp->bl_trillwait);
2327 }
2328 mutex_exit(&blp->bl_trilllock);
2329 if (mpnext == NULL)
2330 return;
2331 }
2332
2333 /*
2334 * If this is a TRILL RBridge, then just check whether this link is
2335 * used at all for forwarding. If not, then we're done.
2336 */
2337 if (trillmode) {
2338 if (!(blp->bl_flags & BLF_TRILLACTIVE) ||
2339 (blp->bl_flags & BLF_SDUFAIL)) {
2340 mac_rx_common(blp->bl_mh, rsrc, mpnext);
2341 return;
2342 }
2343 } else {
2344 /*
2345 * For regular (STP) bridges, if we're in blocking or listening
2346 * state, then do nothing. We don't learn or forward until
2347 * told to do so.
2348 */
2349 if (blp->bl_state == BLS_BLOCKLISTEN) {
2350 mac_rx_common(blp->bl_mh, rsrc, mpnext);
2351 return;
2352 }
2353 }
2354
2355 /*
2356 * Send a copy of the message chain up to the observability node users.
2357 * For TRILL, we must obey the VLAN AF rules, so we go packet-by-
2358 * packet.
2359 */
2360 if (!trillmode && blp->bl_state == BLS_FORWARDING &&
2361 (bmp->bm_flags & BMF_STARTED) &&
2362 (mp = copymsgchain(mpnext)) != NULL) {
2363 mac_rx(bmp->bm_mh, NULL, mp);
2364 }
2365
2366 /*
2367 * We must be in learning or forwarding state, or using TRILL on a link
2368 * with one or more VLANs active. For each packet in the list, process
2369 * the source address, and then attempt to forward.
2370 */
2371 while ((mp = mpnext) != NULL) {
2372 mpnext = mp->b_next;
2373 mp->b_next = NULL;
2374
2375 /*
2376 * If we can't decode the header or if the header specifies a
2377 * multicast source address (impossible!), then don't bother
2378 * learning or forwarding, but go ahead and forward up the
2379 * stack for subsequent processing.
2380 */
2381 if (mac_header_info(blp->bl_mh, mp, &hdr_info) != 0 ||
2382 (hdr_info.mhi_saddr[0] & 1) != 0) {
2383 KIINCR(bki_drops);
2384 KLINCR(bkl_drops);
2385 mac_rx_common(blp->bl_mh, rsrc, mp);
2386 continue;
2387 }
2388
2389 /*
2390 * Extract and validate the VLAN ID for this packet.
2391 */
2392 if (!bridge_get_vlan(blp, &hdr_info, mp, &vlanid, &tci) ||
2393 !BRIDGE_AF_ISSET(blp, vlanid)) {
2394 mac_rx_common(blp->bl_mh, rsrc, mp);
2395 continue;
2396 }
2397
2398 if (trillmode) {
2399 /*
2400 * Special test required by TRILL document: must
2401 * discard frames with outer address set to ESADI.
2402 */
2403 if (memcmp(hdr_info.mhi_daddr, all_esadi_rbridges,
2404 ETHERADDRL) == 0) {
2405 mac_rx_common(blp->bl_mh, rsrc, mp);
2406 continue;
2407 }
2408
2409 /*
2410 * If we're in TRILL mode, then the call above to get
2411 * the VLAN ID has also checked that we're the
2412 * appointed forwarder, so report that we're handling
2413 * this packet to any observability node users.
2414 */
2415 if ((bmp->bm_flags & BMF_STARTED) &&
2416 (mpcopy = copymsg(mp)) != NULL)
2417 mac_rx(bmp->bm_mh, NULL, mpcopy);
2418 }
2419
2420 /*
2421 * First process the source address and learn from it. For
2422 * TRILL, we learn only if we're the appointed forwarder.
2423 */
2424 bridge_learn(blp, hdr_info.mhi_saddr, RBRIDGE_NICKNAME_NONE,
2425 vlanid);
2426
2427 /*
2428 * Now check whether we're forwarding and look up the
2429 * destination. If we can forward, do so.
2430 */
2431 if (trillmode || blp->bl_state == BLS_FORWARDING) {
2432 mp = bridge_forward(blp, &hdr_info, mp, vlanid, tci,
2433 B_FALSE, B_FALSE);
2434 }
2435 if (mp != NULL)
2436 mac_rx_common(blp->bl_mh, rsrc, mp);
2437 }
2438 }
2439
2440
2441 /* ARGSUSED */
2442 static mblk_t *
2443 bridge_xmit_cb(mac_handle_t mh, mac_ring_handle_t rh, mblk_t *mpnext)
2444 {
2445 bridge_link_t *blp = (bridge_link_t *)mh;
2446 bridge_inst_t *bip = blp->bl_inst;
2447 bridge_mac_t *bmp = bip->bi_mac;
2448 mac_header_info_t hdr_info;
2449 uint16_t vlanid, tci;
2450 mblk_t *mp, *mpcopy;
2451 boolean_t trillmode;
2452
2453 trillmode = blp->bl_trilldata != NULL;
2454
2455 /*
2456 * If we're using STP and we're in blocking or listening state, or if
2457 * we're using TRILL and no VLANs are active, then behave as though the
2458 * bridge isn't here at all, and send on the local link alone.
2459 */
2460 if ((!trillmode && blp->bl_state == BLS_BLOCKLISTEN) ||
2461 (trillmode &&
2462 (!(blp->bl_flags & BLF_TRILLACTIVE) ||
2463 (blp->bl_flags & BLF_SDUFAIL)))) {
2464 KIINCR(bki_sent);
2465 KLINCR(bkl_xmit);
2466 MAC_RING_TX(blp->bl_mh, rh, mpnext, mp);
2467 return (mp);
2468 }
2469
2470 /*
2471 * Send a copy of the message up to the observability node users.
2472 * TRILL needs to check on a packet-by-packet basis.
2473 */
2474 if (!trillmode && blp->bl_state == BLS_FORWARDING &&
2475 (bmp->bm_flags & BMF_STARTED) &&
2476 (mp = copymsgchain(mpnext)) != NULL) {
2477 mac_rx(bmp->bm_mh, NULL, mp);
2478 }
2479
2480 while ((mp = mpnext) != NULL) {
2481 mpnext = mp->b_next;
2482 mp->b_next = NULL;
2483
2484 if (mac_header_info(blp->bl_mh, mp, &hdr_info) != 0) {
2485 freemsg(mp);
2486 continue;
2487 }
2488
2489 /*
2490 * Extract and validate the VLAN ID for this packet.
2491 */
2492 if (!bridge_get_vlan(blp, &hdr_info, mp, &vlanid, &tci) ||
2493 !BRIDGE_AF_ISSET(blp, vlanid)) {
2494 freemsg(mp);
2495 continue;
2496 }
2497
2498 /*
2499 * If we're using TRILL, then we've now validated that we're
2500 * the forwarder for this VLAN, so go ahead and let
2501 * observability node users know about the packet.
2502 */
2503 if (trillmode && (bmp->bm_flags & BMF_STARTED) &&
2504 (mpcopy = copymsg(mp)) != NULL) {
2505 mac_rx(bmp->bm_mh, NULL, mpcopy);
2506 }
2507
2508 /*
2509 * We have to learn from our own transmitted packets, because
2510 * there may be a Solaris DLPI raw sender (who can specify his
2511 * own source address) using promiscuous mode for receive. The
2512 * mac layer information won't (and can't) tell us everything
2513 * we need to know.
2514 */
2515 bridge_learn(blp, hdr_info.mhi_saddr, RBRIDGE_NICKNAME_NONE,
2516 vlanid);
2517
2518 /* attempt forwarding */
2519 if (trillmode || blp->bl_state == BLS_FORWARDING) {
2520 mp = bridge_forward(blp, &hdr_info, mp, vlanid, tci,
2521 B_FALSE, B_TRUE);
2522 }
2523 if (mp != NULL) {
2524 MAC_RING_TX(blp->bl_mh, rh, mp, mp);
2525 if (mp == NULL) {
2526 KIINCR(bki_sent);
2527 KLINCR(bkl_xmit);
2528 }
2529 }
2530 /*
2531 * If we get stuck, then stop. Don't let the user's output
2532 * packets get out of order. (More importantly: don't try to
2533 * bridge the same packet multiple times if flow control is
2534 * asserted.)
2535 */
2536 if (mp != NULL) {
2537 mp->b_next = mpnext;
2538 break;
2539 }
2540 }
2541 return (mp);
2542 }
2543
2544 /*
2545  * This is called by TRILL when it decapsulates a packet, and we must forward
2546  * it locally. On failure, we just drop it.
2547 *
2548 * Note that the ingress_nick reported by TRILL must not represent this local
2549 * node.
2550 */
2551 void
2552 bridge_trill_decaps(bridge_link_t *blp, mblk_t *mp, uint16_t ingress_nick)
2553 {
2554 mac_header_info_t hdr_info;
2555 uint16_t vlanid, tci;
2556 bridge_inst_t *bip = blp->bl_inst; /* used by macros */
2557 mblk_t *mpcopy;
2558
2559 if (mac_header_info(blp->bl_mh, mp, &hdr_info) != 0) {
2560 freemsg(mp);
2561 return;
2562 }
2563
2564 /* Extract VLAN ID for this packet. */
2565 if (hdr_info.mhi_bindsap == ETHERTYPE_VLAN) {
2566 struct ether_vlan_header *evhp;
2567
2568 /* LINTED: alignment */
2569 evhp = (struct ether_vlan_header *)mp->b_rptr;
2570 tci = ntohs(evhp->ether_tci);
2571 vlanid = VLAN_ID(tci);
2572 } else {
2573 /* Inner VLAN headers are required in TRILL data packets */
2574 DTRACE_PROBE3(bridge__trill__decaps__novlan, bridge_link_t *,
2575 blp, mblk_t *, mp, uint16_t, ingress_nick);
2576 freemsg(mp);
2577 return;
2578 }
2579
2580 /* Learn the location of this sender in the RBridge network */
2581 bridge_learn(blp, hdr_info.mhi_saddr, ingress_nick, vlanid);
2582
2583 /* attempt forwarding */
2584 mp = bridge_forward(blp, &hdr_info, mp, vlanid, tci, B_TRUE, B_TRUE);
2585 if (mp != NULL) {
2586 if (bridge_can_send(blp, vlanid)) {
2587 /* Deliver a copy locally as well */
2588 if ((mpcopy = copymsg(mp)) != NULL)
2589 mac_rx_common(blp->bl_mh, NULL, mpcopy);
2590 MAC_RING_TX(blp->bl_mh, NULL, mp, mp);
2591 }
2592 if (mp == NULL) {
2593 KIINCR(bki_sent);
2594 KLINCR(bkl_xmit);
2595 } else {
2596 freemsg(mp);
2597 }
2598 }
2599 }
2600
2601 /*
2602 * This function is used by TRILL _only_ to transmit TRILL-encapsulated
2603 * packets. It sends on a single underlying link and does not bridge.
2604 */
2605 mblk_t *
2606 bridge_trill_output(bridge_link_t *blp, mblk_t *mp)
2607 {
2608 bridge_inst_t *bip = blp->bl_inst; /* used by macros */
2609
2610 mac_trill_snoop(blp->bl_mh, mp);
2611 MAC_RING_TX(blp->bl_mh, NULL, mp, mp);
2612 if (mp == NULL) {
2613 KIINCR(bki_sent);
2614 KLINCR(bkl_xmit);
2615 }
2616 return (mp);
2617 }
2618
2619 /*
2620 * Set the "appointed forwarder" flag array for this link. TRILL controls
2621 * forwarding on a VLAN basis. The "trillactive" flag is an optimization for
2622 * the forwarder.
2623 */
2624 void
2625 bridge_trill_setvlans(bridge_link_t *blp, const uint8_t *arr)
2626 {
2627 int i;
2628 uint_t newflags = 0;
2629
2630 for (i = 0; i < BRIDGE_VLAN_ARR_SIZE; i++) {
2631 if ((blp->bl_afs[i] = arr[i]) != 0)
2632 newflags = BLF_TRILLACTIVE;
2633 }
2634 blp->bl_flags = (blp->bl_flags & ~BLF_TRILLACTIVE) | newflags;
2635 }
2636
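/*
 * Illustrative sketch, not driver code: bl_afs behaves as a bitmap with one
 * bit per VLAN ID (BRIDGE_VLAN_ARR_SIZE bytes in all), so TRILL hands us one
 * byte per eight VLANs and the loop above only records whether any
 * appointed-forwarder bit is set at all.  A single-VLAN test then amounts to
 * an index plus a mask; af_test() below is a hypothetical helper shown only
 * for illustration, while the driver itself uses the BRIDGE_AF_ISSET macro.
 *
 *	boolean_t
 *	af_test(const uint8_t *arr, uint16_t vid)
 *	{
 *		return ((arr[vid / NBBY] & (1 << (vid % NBBY))) != 0);
 *	}
 */
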
2637 void
2638 bridge_trill_flush(bridge_link_t *blp, uint16_t vlan, boolean_t dotrill)
2639 {
2640 bridge_inst_t *bip = blp->bl_inst;
2641 bridge_fwd_t *bfp, *bfnext;
2642 avl_tree_t fwd_scavenge;
2643 int i;
2644
2645 _NOTE(ARGUNUSED(vlan));
2646
2647 avl_create(&fwd_scavenge, fwd_compare, sizeof (bridge_fwd_t),
2648 offsetof(bridge_fwd_t, bf_node));
2649 rw_enter(&bip->bi_rwlock, RW_WRITER);
2650 bfnext = avl_first(&bip->bi_fwd);
2651 while ((bfp = bfnext) != NULL) {
2652 bfnext = AVL_NEXT(&bip->bi_fwd, bfp);
2653 if (bfp->bf_flags & BFF_LOCALADDR)
2654 continue;
2655 if (dotrill) {
2656 /* port doesn't matter if we're flushing TRILL */
2657 if (bfp->bf_trill_nick == RBRIDGE_NICKNAME_NONE)
2658 continue;
2659 } else {
2660 if (bfp->bf_trill_nick != RBRIDGE_NICKNAME_NONE)
2661 continue;
2662 for (i = 0; i < bfp->bf_nlinks; i++) {
2663 if (bfp->bf_links[i] == blp)
2664 break;
2665 }
2666 if (i >= bfp->bf_nlinks)
2667 continue;
2668 }
2669 ASSERT(bfp->bf_flags & BFF_INTREE);
2670 avl_remove(&bip->bi_fwd, bfp);
2671 bfp->bf_flags &= ~BFF_INTREE;
2672 avl_add(&fwd_scavenge, bfp);
2673 }
2674 rw_exit(&bip->bi_rwlock);
2675 bfnext = avl_first(&fwd_scavenge);
2676 while ((bfp = bfnext) != NULL) {
2677 bfnext = AVL_NEXT(&fwd_scavenge, bfp);
2678 avl_remove(&fwd_scavenge, bfp);
2679 fwd_unref(bfp);
2680 }
2681 avl_destroy(&fwd_scavenge);
2682 }
2683
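/*
 * A note on the scavenge pattern above (also used for BRIOC_FLUSHFWD): the
 * doomed forwarding entries are unhooked from bi_fwd while bi_rwlock is held
 * as writer and parked on a private AVL tree; the fwd_unref() calls, which
 * may free the entries and take other locks, run only after the rwlock has
 * been dropped.  A minimal sketch of the shape:
 *
 *	rw_enter(&bip->bi_rwlock, RW_WRITER);
 *	... move doomed entries from bi_fwd onto a local scavenge tree ...
 *	rw_exit(&bip->bi_rwlock);
 *	... walk the local tree and fwd_unref() each entry, lock-free ...
 */
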
2684 /*
2685 * Let the mac module take or drop a reference to a bridge link. When this is
2686 * called, the mac module is holding the mi_bridge_lock, so the link cannot be
2687 * in the process of entering or leaving a bridge.
2688 */
2689 static void
2690 bridge_ref_cb(mac_handle_t mh, boolean_t hold)
2691 {
2692 bridge_link_t *blp = (bridge_link_t *)mh;
2693
2694 if (hold)
2695 atomic_inc_uint(&blp->bl_refs);
2696 else
2697 link_unref(blp);
2698 }
2699
2700 /*
2701 * Handle link state changes reported by the mac layer. This acts as a filter
2702 * for link state changes: if a link is reporting down, but there are other
2703 * links still up on the bridge, then the state is changed to "up." When the
2704 * last link goes down, all are marked down, and when the first link goes up,
2705 * all are marked up. (Recursion is avoided by the use of the "redo" function.)
2706 *
2707 * We treat unknown as equivalent to "up."
2708 */
2709 static link_state_t
2710 bridge_ls_cb(mac_handle_t mh, link_state_t newls)
2711 {
2712 bridge_link_t *blp = (bridge_link_t *)mh;
2713 bridge_link_t *blcmp;
2714 bridge_inst_t *bip;
2715 bridge_mac_t *bmp;
2716
2717 if (newls != LINK_STATE_DOWN && blp->bl_linkstate != LINK_STATE_DOWN ||
2718 (blp->bl_flags & (BLF_DELETED|BLF_SDUFAIL))) {
2719 blp->bl_linkstate = newls;
2720 return (newls);
2721 }
2722
2723 /*
2724 * Scan first to see if there are any other non-down links. If there
2725 * are, then we're done. Otherwise, if all others are down, then the
2726 * state of this link is the state of the bridge.
2727 */
2728 bip = blp->bl_inst;
2729 rw_enter(&bip->bi_rwlock, RW_WRITER);
2730 for (blcmp = list_head(&bip->bi_links); blcmp != NULL;
2731 blcmp = list_next(&bip->bi_links, blcmp)) {
2732 if (blcmp != blp &&
2733 !(blcmp->bl_flags & (BLF_DELETED|BLF_SDUFAIL)) &&
2734 blcmp->bl_linkstate != LINK_STATE_DOWN)
2735 break;
2736 }
2737
2738 if (blcmp != NULL) {
2739 /*
2740 * If there are other links that are considered up, then tell
2741 * the caller that the link is actually still up, regardless of
2742 * this link's underlying state.
2743 */
2744 blp->bl_linkstate = newls;
2745 newls = LINK_STATE_UP;
2746 } else if (blp->bl_linkstate != newls) {
2747 /*
2748 * If we've found no other 'up' links, and this link has
2749 * changed state, then report the new state of the bridge to
2750 * all other clients.
2751 */
2752 blp->bl_linkstate = newls;
2753 for (blcmp = list_head(&bip->bi_links); blcmp != NULL;
2754 blcmp = list_next(&bip->bi_links, blcmp)) {
2755 if (blcmp != blp && !(blcmp->bl_flags & BLF_DELETED))
2756 mac_link_redo(blcmp->bl_mh, newls);
2757 }
2758 bmp = bip->bi_mac;
2759 if ((bmp->bm_linkstate = newls) != LINK_STATE_DOWN)
2760 bmp->bm_linkstate = LINK_STATE_UP;
2761 mac_link_redo(bmp->bm_mh, bmp->bm_linkstate);
2762 }
2763 rw_exit(&bip->bi_rwlock);
2764 return (newls);
2765 }
2766
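/*
 * Illustrative summary of the aggregation rule above, for a bridge with two
 * member links A and B (unknown is treated as up throughout):
 *
 *	A up,   B up	-> both report up
 *	A up,   B down	-> both still report up (B is masked by A)
 *	A down, B down	-> both report down and the bridge MAC goes down
 */
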
2767 static void
2768 bridge_add_link(void *arg)
2769 {
2770 mblk_t *mp = arg;
2771 bridge_stream_t *bsp;
2772 bridge_inst_t *bip, *bipt;
2773 bridge_mac_t *bmp;
2774 datalink_id_t linkid;
2775 int err;
2776 mac_handle_t mh;
2777 uint_t maxsdu;
2778 bridge_link_t *blp = NULL, *blpt;
2779 const mac_info_t *mip;
2780 boolean_t macopen = B_FALSE;
2781 char linkname[MAXLINKNAMELEN];
2782 char kstatname[KSTAT_STRLEN];
2783 int i;
2784 link_state_t linkstate;
2785 mblk_t *mlist;
2786
2787 bsp = (bridge_stream_t *)mp->b_next;
2788 mp->b_next = NULL;
2789 bip = bsp->bs_inst;
2790 /* LINTED: alignment */
2791 linkid = *(datalink_id_t *)mp->b_cont->b_rptr;
2792
2793 /*
2794 * First make sure that there is no other bridge that has this link.
2795 * We don't want to overlap operations from two bridges; the MAC layer
2796 * supports only one bridge on a given MAC at a time.
2797 *
2798 * We rely on the fact that there's just one taskq thread for the
2799 * bridging module: once we've checked for a duplicate, we can drop the
2800 * lock, because no other thread could possibly be adding another link
2801 * until we're done.
2802 */
2803 mutex_enter(&inst_lock);
2804 for (bipt = list_head(&inst_list); bipt != NULL;
2805 bipt = list_next(&inst_list, bipt)) {
2806 rw_enter(&bipt->bi_rwlock, RW_READER);
2807 for (blpt = list_head(&bipt->bi_links); blpt != NULL;
2808 blpt = list_next(&bipt->bi_links, blpt)) {
2809 if (linkid == blpt->bl_linkid)
2810 break;
2811 }
2812 rw_exit(&bipt->bi_rwlock);
2813 if (blpt != NULL)
2814 break;
2815 }
2816 mutex_exit(&inst_lock);
2817 if (bipt != NULL) {
2818 err = EBUSY;
2819 goto fail;
2820 }
2821
2822 if ((err = mac_open_by_linkid(linkid, &mh)) != 0)
2823 goto fail;
2824 macopen = B_TRUE;
2825
2826 /* we bridge only Ethernet */
2827 mip = mac_info(mh);
2828 if (mip->mi_media != DL_ETHER) {
2829 err = ENOTSUP;
2830 goto fail;
2831 }
2832
2833 /*
2834 * Get the current maximum SDU on this interface. If there are other
2835 * links on the bridge, then this one must match, or it errors out.
2836 * Otherwise, the first link becomes the standard for the new bridge.
2837 */
2838 mac_sdu_get(mh, NULL, &maxsdu);
2839 bmp = bip->bi_mac;
2840 if (list_is_empty(&bip->bi_links)) {
2841 bmp->bm_maxsdu = maxsdu;
2842 (void) mac_maxsdu_update(bmp->bm_mh, maxsdu);
2843 }
2844
2845 /* figure the kstat name; also used as the mac client name */
2846 i = MBLKL(mp->b_cont) - sizeof (datalink_id_t);
2847 if (i < 0 || i >= MAXLINKNAMELEN)
2848 i = MAXLINKNAMELEN - 1;
2849 bcopy(mp->b_cont->b_rptr + sizeof (datalink_id_t), linkname, i);
2850 linkname[i] = '\0';
2851 (void) snprintf(kstatname, sizeof (kstatname), "%s-%s", bip->bi_name,
2852 linkname);
2853
2854 if ((blp = kmem_zalloc(sizeof (*blp), KM_NOSLEEP)) == NULL) {
2855 err = ENOMEM;
2856 goto fail;
2857 }
2858 blp->bl_lfailmp = allocb(sizeof (bridge_ctl_t), BPRI_MED);
2859 if (blp->bl_lfailmp == NULL) {
2860 kmem_free(blp, sizeof (*blp));
2861 blp = NULL;
2862 err = ENOMEM;
2863 goto fail;
2864 }
2865
2866 blp->bl_refs = 1;
2867 atomic_inc_uint(&bip->bi_refs);
2868 blp->bl_inst = bip;
2869 blp->bl_mh = mh;
2870 blp->bl_linkid = linkid;
2871 blp->bl_maxsdu = maxsdu;
2872 cv_init(&blp->bl_trillwait, NULL, CV_DRIVER, NULL);
2873 mutex_init(&blp->bl_trilllock, NULL, MUTEX_DRIVER, NULL);
2874 (void) memset(blp->bl_afs, 0xff, sizeof (blp->bl_afs));
2875
2876 err = mac_client_open(mh, &blp->bl_mch, kstatname, 0);
2877 if (err != 0)
2878 goto fail;
2879 blp->bl_flags |= BLF_CLIENT_OPEN;
2880
2881 err = mac_margin_add(mh, &blp->bl_margin, B_TRUE);
2882 if (err != 0)
2883 goto fail;
2884 blp->bl_flags |= BLF_MARGIN_ADDED;
2885
2886 blp->bl_mnh = mac_notify_add(mh, bridge_notify_cb, blp);
2887
2888 /* Enable Bridging on the link */
2889 err = mac_bridge_set(mh, (mac_handle_t)blp);
2890 if (err != 0)
2891 goto fail;
2892 blp->bl_flags |= BLF_SET_BRIDGE;
2893
2894 err = mac_promisc_add(blp->bl_mch, MAC_CLIENT_PROMISC_ALL, NULL,
2895 blp, &blp->bl_mphp, MAC_PROMISC_FLAGS_NO_TX_LOOP);
2896 if (err != 0)
2897 goto fail;
2898 blp->bl_flags |= BLF_PROM_ADDED;
2899
2900 bridge_new_unicst(blp);
2901
2902 blp->bl_ksp = kstat_setup((kstat_named_t *)&blp->bl_kstats,
2903 link_kstats_list, Dim(link_kstats_list), kstatname);
2904
2905 /*
2906 * The link holds a reference to the bridge instance, so that the
2907 * instance can't go away before the link is freed. The insertion into
2908 * bi_links holds a reference on the link (reference set to 1 above).
2909 * When marking as removed from bi_links (BLF_DELETED), drop the
2910 * reference on the link. When freeing the link, drop the reference on
2911 * the instance. BLF_LINK_ADDED tracks link insertion in bi_links list.
2912 */
2913 rw_enter(&bip->bi_rwlock, RW_WRITER);
2914 list_insert_tail(&bip->bi_links, blp);
2915 blp->bl_flags |= BLF_LINK_ADDED;
2916
2917 /*
2918 * If the new link is no good on this bridge, then let the daemon know
2919 * about the problem.
2920 */
2921 mlist = NULL;
2922 if (maxsdu != bmp->bm_maxsdu)
2923 link_sdu_fail(blp, B_TRUE, &mlist);
2924 rw_exit(&bip->bi_rwlock);
2925 send_up_messages(bip, mlist);
2926
2927 /*
2928 * Trigger a link state update so that if this link is the first one
2929 * "up" in the bridge, then we notify everyone. This triggers a trip
2930 * through bridge_ls_cb.
2931 */
2932 linkstate = mac_stat_get(mh, MAC_STAT_LOWLINK_STATE);
2933 blp->bl_linkstate = LINK_STATE_DOWN;
2934 mac_link_update(mh, linkstate);
2935
2936 /*
2937 * We now need to report back to the stream that invoked us, and then
2938 * drop the reference on the stream that we're holding.
2939 */
2940 miocack(bsp->bs_wq, mp, 0, 0);
2941 stream_unref(bsp);
2942 return;
2943
2944 fail:
2945 if (blp == NULL) {
2946 if (macopen)
2947 mac_close(mh);
2948 } else {
2949 link_shutdown(blp);
2950 }
2951 miocnak(bsp->bs_wq, mp, 0, err);
2952 stream_unref(bsp);
2953 }
2954
2955 static void
2956 bridge_rem_link(void *arg)
2957 {
2958 mblk_t *mp = arg;
2959 bridge_stream_t *bsp;
2960 bridge_inst_t *bip;
2961 bridge_mac_t *bmp;
2962 datalink_id_t linkid;
2963 bridge_link_t *blp, *blsave;
2964 boolean_t found;
2965 mblk_t *mlist;
2966
2967 bsp = (bridge_stream_t *)mp->b_next;
2968 mp->b_next = NULL;
2969 bip = bsp->bs_inst;
2970 /* LINTED: alignment */
2971 linkid = *(datalink_id_t *)mp->b_cont->b_rptr;
2972
2973 /*
2974 * We become reader here so that we can loop over the other links and
2975 * deliver link up/down notification.
2976 */
2977 rw_enter(&bip->bi_rwlock, RW_READER);
2978 found = B_FALSE;
2979 for (blp = list_head(&bip->bi_links); blp != NULL;
2980 blp = list_next(&bip->bi_links, blp)) {
2981 if (blp->bl_linkid == linkid &&
2982 !(blp->bl_flags & BLF_DELETED)) {
2983 blp->bl_flags |= BLF_DELETED;
2984 (void) ddi_taskq_dispatch(bridge_taskq, link_shutdown,
2985 blp, DDI_SLEEP);
2986 found = B_TRUE;
2987 break;
2988 }
2989 }
2990
2991 /*
2992 * Check if this link is up and the remainder of the links are all
2993 * down.
2994 */
2995 if (blp != NULL && blp->bl_linkstate != LINK_STATE_DOWN) {
2996 for (blp = list_head(&bip->bi_links); blp != NULL;
2997 blp = list_next(&bip->bi_links, blp)) {
2998 if (blp->bl_linkstate != LINK_STATE_DOWN &&
2999 !(blp->bl_flags & (BLF_DELETED|BLF_SDUFAIL)))
3000 break;
3001 }
3002 if (blp == NULL) {
3003 for (blp = list_head(&bip->bi_links); blp != NULL;
3004 blp = list_next(&bip->bi_links, blp)) {
3005 if (!(blp->bl_flags & BLF_DELETED))
3006 mac_link_redo(blp->bl_mh,
3007 LINK_STATE_DOWN);
3008 }
3009 bmp = bip->bi_mac;
3010 bmp->bm_linkstate = LINK_STATE_DOWN;
3011 mac_link_redo(bmp->bm_mh, LINK_STATE_DOWN);
3012 }
3013 }
3014
3015 /*
3016 * Check if there's just one working link left on the bridge. If so,
3017 * then that link is now authoritative for bridge MTU.
3018 */
3019 blsave = NULL;
3020 for (blp = list_head(&bip->bi_links); blp != NULL;
3021 blp = list_next(&bip->bi_links, blp)) {
3022 if (!(blp->bl_flags & BLF_DELETED)) {
3023 if (blsave == NULL)
3024 blsave = blp;
3025 else
3026 break;
3027 }
3028 }
3029 mlist = NULL;
3030 bmp = bip->bi_mac;
3031 if (blsave != NULL && blp == NULL &&
3032 blsave->bl_maxsdu != bmp->bm_maxsdu) {
3033 bmp->bm_maxsdu = blsave->bl_maxsdu;
3034 (void) mac_maxsdu_update(bmp->bm_mh, blsave->bl_maxsdu);
3035 link_sdu_fail(blsave, B_FALSE, &mlist);
3036 }
3037 rw_exit(&bip->bi_rwlock);
3038 send_up_messages(bip, mlist);
3039
3040 if (found)
3041 miocack(bsp->bs_wq, mp, 0, 0);
3042 else
3043 miocnak(bsp->bs_wq, mp, 0, ENOENT);
3044 stream_unref(bsp);
3045 }
3046
3047 /*
3048 * This function intentionally returns with bi_rwlock held; it is intended for
3049 * quick checks and updates.
3050 */
3051 static bridge_link_t *
3052 enter_link(bridge_inst_t *bip, datalink_id_t linkid)
3053 {
3054 bridge_link_t *blp;
3055
3056 rw_enter(&bip->bi_rwlock, RW_READER);
3057 for (blp = list_head(&bip->bi_links); blp != NULL;
3058 blp = list_next(&bip->bi_links, blp)) {
3059 if (blp->bl_linkid == linkid && !(blp->bl_flags & BLF_DELETED))
3060 break;
3061 }
3062 return (blp);
3063 }
3064
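/*
 * Usage note (illustrative, matching the callers in bridge_ioctl() below):
 * enter_link() returns with bi_rwlock held as reader whether or not the
 * link is found, so every caller is responsible for dropping the lock:
 *
 *	if ((blp = enter_link(bip, linkid)) == NULL) {
 *		rc = ENOENT;
 *	} else {
 *		rc = 0;
 *		... quick read or update of blp fields ...
 *	}
 *	rw_exit(&bip->bi_rwlock);
 */
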
3065 static void
3066 bridge_ioctl(queue_t *wq, mblk_t *mp)
3067 {
3068 bridge_stream_t *bsp = wq->q_ptr;
3069 bridge_inst_t *bip;
3070 struct iocblk *iop;
3071 int rc = EINVAL;
3072 int len = 0;
3073 bridge_link_t *blp;
3074 cred_t *cr;
3075
3076 /* LINTED: alignment */
3077 iop = (struct iocblk *)mp->b_rptr;
3078
3079 /*
3080 * For now, all of the bridge ioctls are privileged.
3081 */
3082 if ((cr = msg_getcred(mp, NULL)) == NULL)
3083 cr = iop->ioc_cr;
3084 if (cr != NULL && secpolicy_net_config(cr, B_FALSE) != 0) {
3085 miocnak(wq, mp, 0, EPERM);
3086 return;
3087 }
3088
3089 switch (iop->ioc_cmd) {
3090 case BRIOC_NEWBRIDGE: {
3091 bridge_newbridge_t *bnb;
3092
3093 if (bsp->bs_inst != NULL ||
3094 (rc = miocpullup(mp, sizeof (bridge_newbridge_t))) != 0)
3095 break;
3096 /* LINTED: alignment */
3097 bnb = (bridge_newbridge_t *)mp->b_cont->b_rptr;
3098 bnb->bnb_name[MAXNAMELEN-1] = '\0';
3099 rc = bridge_create(bnb->bnb_linkid, bnb->bnb_name, &bip, cr);
3100 if (rc != 0)
3101 break;
3102
3103 rw_enter(&bip->bi_rwlock, RW_WRITER);
3104 if (bip->bi_control != NULL) {
3105 rw_exit(&bip->bi_rwlock);
3106 bridge_unref(bip);
3107 rc = EBUSY;
3108 } else {
3109 atomic_inc_uint(&bip->bi_refs);
3110 bsp->bs_inst = bip; /* stream holds reference */
3111 bip->bi_control = bsp;
3112 rw_exit(&bip->bi_rwlock);
3113 rc = 0;
3114 }
3115 break;
3116 }
3117
3118 case BRIOC_ADDLINK:
3119 if ((bip = bsp->bs_inst) == NULL ||
3120 (rc = miocpullup(mp, sizeof (datalink_id_t))) != 0)
3121 break;
3122 /*
3123 * We cannot perform the action in this thread, because we're
3124 * not in process context, and we may already be holding
3125 * MAC-related locks. Place the request on taskq.
3126 */
3127 mp->b_next = (mblk_t *)bsp;
3128 stream_ref(bsp);
3129 (void) ddi_taskq_dispatch(bridge_taskq, bridge_add_link, mp,
3130 DDI_SLEEP);
3131 return;
3132
3133 case BRIOC_REMLINK:
3134 if ((bip = bsp->bs_inst) == NULL ||
3135 (rc = miocpullup(mp, sizeof (datalink_id_t))) != 0)
3136 break;
3137 /*
3138 * We cannot perform the action in this thread, because we're
3139 * not in process context, and we may already be holding
3140 * MAC-related locks. Place the request on taskq.
3141 */
3142 mp->b_next = (mblk_t *)bsp;
3143 stream_ref(bsp);
3144 (void) ddi_taskq_dispatch(bridge_taskq, bridge_rem_link, mp,
3145 DDI_SLEEP);
3146 return;
3147
3148 case BRIOC_SETSTATE: {
3149 bridge_setstate_t *bss;
3150
3151 if ((bip = bsp->bs_inst) == NULL ||
3152 (rc = miocpullup(mp, sizeof (*bss))) != 0)
3153 break;
3154 /* LINTED: alignment */
3155 bss = (bridge_setstate_t *)mp->b_cont->b_rptr;
3156 if ((blp = enter_link(bip, bss->bss_linkid)) == NULL) {
3157 rc = ENOENT;
3158 } else {
3159 rc = 0;
3160 blp->bl_state = bss->bss_state;
3161 }
3162 rw_exit(&bip->bi_rwlock);
3163 break;
3164 }
3165
3166 case BRIOC_SETPVID: {
3167 bridge_setpvid_t *bsv;
3168
3169 if ((bip = bsp->bs_inst) == NULL ||
3170 (rc = miocpullup(mp, sizeof (*bsv))) != 0)
3171 break;
3172 /* LINTED: alignment */
3173 bsv = (bridge_setpvid_t *)mp->b_cont->b_rptr;
3174 if (bsv->bsv_vlan > VLAN_ID_MAX)
3175 break;
3176 if ((blp = enter_link(bip, bsv->bsv_linkid)) == NULL) {
3177 rc = ENOENT;
3178 } else if (blp->bl_pvid == bsv->bsv_vlan) {
3179 rc = 0;
3180 } else {
3181 rc = 0;
3182 BRIDGE_VLAN_CLR(blp, blp->bl_pvid);
3183 blp->bl_pvid = bsv->bsv_vlan;
3184 if (blp->bl_pvid != 0)
3185 BRIDGE_VLAN_SET(blp, blp->bl_pvid);
3186 }
3187 rw_exit(&bip->bi_rwlock);
3188 break;
3189 }
3190
3191 case BRIOC_VLANENAB: {
3192 bridge_vlanenab_t *bve;
3193
3194 if ((bip = bsp->bs_inst) == NULL ||
3195 (rc = miocpullup(mp, sizeof (*bve))) != 0)
3196 break;
3197 /* LINTED: alignment */
3198 bve = (bridge_vlanenab_t *)mp->b_cont->b_rptr;
3199 if (bve->bve_vlan > VLAN_ID_MAX)
3200 break;
3201 if ((blp = enter_link(bip, bve->bve_linkid)) == NULL) {
3202 rc = ENOENT;
3203 } else {
3204 rc = 0;
3205 /* special case: vlan 0 means "all" */
3206 if (bve->bve_vlan == 0) {
3207 (void) memset(blp->bl_vlans,
3208 bve->bve_onoff ? ~0 : 0,
3209 sizeof (blp->bl_vlans));
3210 BRIDGE_VLAN_CLR(blp, 0);
3211 if (blp->bl_pvid != 0)
3212 BRIDGE_VLAN_SET(blp, blp->bl_pvid);
3213 } else if (bve->bve_vlan == blp->bl_pvid) {
3214 rc = EINVAL;
3215 } else if (bve->bve_onoff) {
3216 BRIDGE_VLAN_SET(blp, bve->bve_vlan);
3217 } else {
3218 BRIDGE_VLAN_CLR(blp, bve->bve_vlan);
3219 }
3220 }
3221 rw_exit(&bip->bi_rwlock);
3222 break;
3223 }
3224
3225 case BRIOC_FLUSHFWD: {
3226 bridge_flushfwd_t *bff;
3227 bridge_fwd_t *bfp, *bfnext;
3228 avl_tree_t fwd_scavenge;
3229 int i;
3230
3231 if ((bip = bsp->bs_inst) == NULL ||
3232 (rc = miocpullup(mp, sizeof (*bff))) != 0)
3233 break;
3234 /* LINTED: alignment */
3235 bff = (bridge_flushfwd_t *)mp->b_cont->b_rptr;
3236 rw_enter(&bip->bi_rwlock, RW_WRITER);
3237 /* This case means "all" */
3238 if (bff->bff_linkid == DATALINK_INVALID_LINKID) {
3239 blp = NULL;
3240 } else {
3241 for (blp = list_head(&bip->bi_links); blp != NULL;
3242 blp = list_next(&bip->bi_links, blp)) {
3243 if (blp->bl_linkid == bff->bff_linkid &&
3244 !(blp->bl_flags & BLF_DELETED))
3245 break;
3246 }
3247 if (blp == NULL) {
3248 rc = ENOENT;
3249 rw_exit(&bip->bi_rwlock);
3250 break;
3251 }
3252 }
3253 avl_create(&fwd_scavenge, fwd_compare, sizeof (bridge_fwd_t),
3254 offsetof(bridge_fwd_t, bf_node));
3255 bfnext = avl_first(&bip->bi_fwd);
3256 while ((bfp = bfnext) != NULL) {
3257 bfnext = AVL_NEXT(&bip->bi_fwd, bfp);
3258 if (bfp->bf_flags & BFF_LOCALADDR)
3259 continue;
3260 if (blp != NULL) {
3261 for (i = 0; i < bfp->bf_maxlinks; i++) {
3262 if (bfp->bf_links[i] == blp)
3263 break;
3264 }
3265 /*
3266 * If the link is there and we're excluding,
3267 * then skip. If the link is not there and
3268 * we're doing only that link, then skip.
3269 */
3270 if ((i < bfp->bf_maxlinks) == bff->bff_exclude)
3271 continue;
3272 }
3273 ASSERT(bfp->bf_flags & BFF_INTREE);
3274 avl_remove(&bip->bi_fwd, bfp);
3275 bfp->bf_flags &= ~BFF_INTREE;
3276 avl_add(&fwd_scavenge, bfp);
3277 }
3278 rw_exit(&bip->bi_rwlock);
3279 bfnext = avl_first(&fwd_scavenge);
3280 while ((bfp = bfnext) != NULL) {
3281 bfnext = AVL_NEXT(&fwd_scavenge, bfp);
3282 avl_remove(&fwd_scavenge, bfp);
3283 fwd_unref(bfp); /* drop tree reference */
3284 }
3285 avl_destroy(&fwd_scavenge);
3286 break;
3287 }
3288
3289 case BRIOC_TABLEMAX:
3290 if ((bip = bsp->bs_inst) == NULL ||
3291 (rc = miocpullup(mp, sizeof (uint32_t))) != 0)
3292 break;
3293 /* LINTED: alignment */
3294 bip->bi_tablemax = *(uint32_t *)mp->b_cont->b_rptr;
3295 break;
3296 }
3297
3298 if (rc == 0)
3299 miocack(wq, mp, len, 0);
3300 else
3301 miocnak(wq, mp, 0, rc);
3302 }
3303
3304 static void
3305 bridge_wput(queue_t *wq, mblk_t *mp)
3306 {
3307 switch (DB_TYPE(mp)) {
3308 case M_IOCTL:
3309 bridge_ioctl(wq, mp);
3310 break;
3311 case M_FLUSH:
3312 if (*mp->b_rptr & FLUSHW)
3313 *mp->b_rptr &= ~FLUSHW;
3314 if (*mp->b_rptr & FLUSHR)
3315 qreply(wq, mp);
3316 else
3317 freemsg(mp);
3318 break;
3319 default:
3320 freemsg(mp);
3321 break;
3322 }
3323 }
3324
3325 /*
3326 * This function allocates the main data structures for the bridge driver and
3327 * connects us into devfs.
3328 */
3329 static void
3330 bridge_inst_init(void)
3331 {
3332 bridge_scan_interval = 5 * drv_usectohz(1000000);
3333 bridge_fwd_age = 25 * drv_usectohz(1000000);
3334
3335 rw_init(&bmac_rwlock, NULL, RW_DRIVER, NULL);
3336 list_create(&bmac_list, sizeof (bridge_mac_t),
3337 offsetof(bridge_mac_t, bm_node));
3338 list_create(&inst_list, sizeof (bridge_inst_t),
3339 offsetof(bridge_inst_t, bi_node));
3340 cv_init(&inst_cv, NULL, CV_DRIVER, NULL);
3341 mutex_init(&inst_lock, NULL, MUTEX_DRIVER, NULL);
3342 cv_init(&stream_ref_cv, NULL, CV_DRIVER, NULL);
3343 mutex_init(&stream_ref_lock, NULL, MUTEX_DRIVER, NULL);
3344
3345 mac_bridge_vectors(bridge_xmit_cb, bridge_recv_cb, bridge_ref_cb,
3346 bridge_ls_cb);
3347 }
3348
3349 /*
3350 * This function disconnects from devfs and destroys all data structures in
3351 * preparation for unload. It's assumed that there are no active bridge
3352 * references left at this point.
3353 */
3354 static void
3355 bridge_inst_fini(void)
3356 {
3357 mac_bridge_vectors(NULL, NULL, NULL, NULL);
3358 if (bridge_timerid != 0)
3359 (void) untimeout(bridge_timerid);
3360 rw_destroy(&bmac_rwlock);
3361 list_destroy(&bmac_list);
3362 list_destroy(&inst_list);
3363 cv_destroy(&inst_cv);
3364 mutex_destroy(&inst_lock);
3365 cv_destroy(&stream_ref_cv);
3366 mutex_destroy(&stream_ref_lock);
3367 }
3368
3369 /*
3370 * bridge_attach()
3371 *
3372 * Description:
3373 * Attach bridge driver to the system.
3374 */
3375 static int
3376 bridge_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
3377 {
3378 if (cmd != DDI_ATTACH)
3379 return (DDI_FAILURE);
3380
3381 if (ddi_create_minor_node(dip, BRIDGE_CTL, S_IFCHR, 0, DDI_PSEUDO,
3382 CLONE_DEV) == DDI_FAILURE) {
3383 return (DDI_FAILURE);
3384 }
3385
3386 if (dld_ioc_register(BRIDGE_IOC, bridge_ioc_list,
3387 DLDIOCCNT(bridge_ioc_list)) != 0) {
3388 ddi_remove_minor_node(dip, BRIDGE_CTL);
3389 return (DDI_FAILURE);
3390 }
3391
3392 bridge_dev_info = dip;
3393 bridge_major = ddi_driver_major(dip);
3394 bridge_taskq = ddi_taskq_create(dip, BRIDGE_DEV_NAME, 1,
3395 TASKQ_DEFAULTPRI, 0);
3396 return (DDI_SUCCESS);
3397 }
3398
3399 /*
3400 * bridge_detach()
3401 *
3402 * Description:
3403  *	Detach the bridge driver from the system.
3404 */
3405 static int
3406 bridge_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
3407 {
3408 if (cmd != DDI_DETACH)
3409 return (DDI_FAILURE);
3410
3411 ddi_remove_minor_node(dip, NULL);
3412 ddi_taskq_destroy(bridge_taskq);
3413 bridge_dev_info = NULL;
3414 return (DDI_SUCCESS);
3415 }
3416
3417 /*
3418 * bridge_info()
3419 *
3420 * Description:
3421 * Translate "dev_t" to a pointer to the associated "dev_info_t".
3422 */
3423 /* ARGSUSED */
3424 static int
3425 bridge_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg,
3426 void **result)
3427 {
3428 int rc;
3429
3430 switch (infocmd) {
3431 case DDI_INFO_DEVT2DEVINFO:
3432 if (bridge_dev_info == NULL) {
3433 rc = DDI_FAILURE;
3434 } else {
3435 *result = (void *)bridge_dev_info;
3436 rc = DDI_SUCCESS;
3437 }
3438 break;
3439 case DDI_INFO_DEVT2INSTANCE:
3440 *result = NULL;
3441 rc = DDI_SUCCESS;
3442 break;
3443 default:
3444 rc = DDI_FAILURE;
3445 break;
3446 }
3447 return (rc);
3448 }
3449
3450 static struct module_info bridge_modinfo = {
3451 2105, /* mi_idnum */
3452 BRIDGE_DEV_NAME, /* mi_idname */
3453 0, /* mi_minpsz */
3454 16384, /* mi_maxpsz */
3455 65536, /* mi_hiwat */
3456 128 /* mi_lowat */
3457 };
3458
3459 static struct qinit bridge_rinit = {
3460 NULL, /* qi_putp */
3461 NULL, /* qi_srvp */
3462 bridge_open, /* qi_qopen */
3463 bridge_close, /* qi_qclose */
3464 NULL, /* qi_qadmin */
3465 &bridge_modinfo, /* qi_minfo */
3466 NULL /* qi_mstat */
3467 };
3468
3469 static struct qinit bridge_winit = {
3470 (int (*)())bridge_wput, /* qi_putp */
3471 NULL, /* qi_srvp */
3472 NULL, /* qi_qopen */
3473 NULL, /* qi_qclose */
3474 NULL, /* qi_qadmin */
3475 &bridge_modinfo, /* qi_minfo */
3476 NULL /* qi_mstat */
3477 };
3478
3479 static struct streamtab bridge_tab = {
3480 &bridge_rinit, /* st_rdinit */
3481 &bridge_winit /* st_wrinit */
3482 };
3483
3484 /* No STREAMS perimeters; we do all our own locking */
3485 DDI_DEFINE_STREAM_OPS(bridge_ops, nulldev, nulldev, bridge_attach,
3486 bridge_detach, nodev, bridge_info, D_NEW | D_MP, &bridge_tab,
3487 ddi_quiesce_not_supported);
3488
3489 static struct modldrv modldrv = {
3490 &mod_driverops,
3491 "bridging driver",
3492 &bridge_ops
3493 };
3494
3495 static struct modlinkage modlinkage = {
3496 MODREV_1,
3497 (void *)&modldrv,
3498 NULL
3499 };
3500
3501 int
3502 _init(void)
3503 {
3504 int retv;
3505
3506 mac_init_ops(NULL, BRIDGE_DEV_NAME);
3507 bridge_inst_init();
3508 if ((retv = mod_install(&modlinkage)) != 0)
3509 bridge_inst_fini();
3510 return (retv);
3511 }
3512
3513 int
3514 _fini(void)
3515 {
3516 int retv;
3517
3518 rw_enter(&bmac_rwlock, RW_READER);
3519 retv = list_is_empty(&bmac_list) ? 0 : EBUSY;
3520 rw_exit(&bmac_rwlock);
3521 if (retv == 0 &&
3522 (retv = mod_remove(&modlinkage)) == 0)
3523 bridge_inst_fini();
3524 return (retv);
3525 }
3526
3527 int
3528 _info(struct modinfo *modinfop)
3529 {
3530 return (mod_info(&modlinkage, modinfop));
3531 }
3532