xref: /illumos-gate/usr/src/uts/common/io/trill.c (revision bc0ee17c150fbf29e52c0ff365163e4e7b1c2f0a)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  * Copyright 2024 Oxide Computer Company
26  */
27 
28 /*
29  *  This module supports AF_TRILL sockets and TRILL layer-2 forwarding.
30  */
31 
32 #include <sys/strsubr.h>
33 #include <sys/socket.h>
34 #include <sys/socketvar.h>
35 #include <sys/modctl.h>
36 #include <sys/cmn_err.h>
37 #include <sys/tihdr.h>
38 #include <sys/strsun.h>
39 #include <sys/policy.h>
40 #include <sys/ethernet.h>
41 #include <sys/vlan.h>
42 #include <net/trill.h>
43 #include <net/if_dl.h>
44 #include <sys/mac.h>
45 #include <sys/mac_client.h>
46 #include <sys/mac_provider.h>
47 #include <sys/mac_client_priv.h>
48 #include <sys/sdt.h>
49 #include <sys/dls.h>
50 #include <sys/sunddi.h>
51 
52 #include "trill_impl.h"
53 
/* Forward declarations for static helpers that are used before they appear. */
static void trill_del_all(trill_inst_t *, boolean_t);
static int trill_del_nick(trill_inst_t *, uint16_t, boolean_t);
static void trill_stop_recv(trill_sock_t *);
static void trill_ctrl_input(trill_sock_t *, mblk_t *, const uint8_t *,
    uint16_t);
static trill_node_t *trill_node_lookup(trill_inst_t *, uint16_t);
static void trill_node_unref(trill_inst_t *, trill_node_t *);
static void trill_sock_unref(trill_sock_t *);
static void trill_kstats_init(trill_sock_t *, const char *);

/*
 * Global list of TRILL instances (trill_inst_t, linked through ti_instnode).
 * The code in this file walks and modifies it with trill_inst_rwlock held.
 */
static list_t trill_inst_list;
static krwlock_t trill_inst_rwlock;

/* Socket creation entry point; referenced by the sinfo registration. */
static sock_lower_handle_t trill_create(int, int, int, sock_downcalls_t **,
    uint_t *, int *, int, cred_t *);
69 
/*
 * Socket module registration: names the module "trill" and supplies
 * trill_create as its create entry point.
 */
static smod_reg_t sinfo = {
	SOCKMOD_VERSION,
	"trill",
	SOCK_UC_VERSION,
	SOCK_DC_VERSION,
	trill_create,
	NULL,
};

/* modlsockmod structure: ties the socket module ops to sinfo above */
static struct modlsockmod sockmod = {
	&mod_sockmodops, "AF_TRILL socket module", &sinfo
};

/* modlinkage structure handed to mod_install/mod_info/mod_remove */
static struct modlinkage ml = {
	MODREV_1,
	&sockmod,
	NULL
};
90 
/*
 * A nickname is usable when it is neither RBRIDGE_NICKNAME_NONE nor
 * RBRIDGE_NICKNAME_UNUSED.  Note that the argument is evaluated twice.
 */
#define	VALID_NICK(n)	((n) != RBRIDGE_NICKNAME_NONE && \
			(n) != RBRIDGE_NICKNAME_UNUSED)
93 
94 static mblk_t *
95 create_trill_header(trill_sock_t *tsock, mblk_t *mp, const uint8_t *daddr,
96     boolean_t trill_hdr_ok, boolean_t multidest, uint16_t tci,
97     size_t msglen)
98 {
99 	int extra_hdr_len;
100 	struct ether_vlan_header *ethvlanhdr;
101 	mblk_t *hdr_mp;
102 	uint16_t etype;
103 
104 	etype = msglen > 0 ? (uint16_t)msglen : ETHERTYPE_TRILL;
105 
106 	/* When sending on the PVID, we must not give a VLAN ID */
107 	if (tci == tsock->ts_link->bl_pvid)
108 		tci = TRILL_NO_TCI;
109 
110 	/*
111 	 * Create new Ethernet header and include additional space
112 	 * for writing TRILL header and/or VLAN tag.
113 	 */
114 	extra_hdr_len = (trill_hdr_ok ? 0 : sizeof (trill_header_t)) +
115 	    (tci != TRILL_NO_TCI ? sizeof (struct ether_vlan_extinfo) : 0);
116 	hdr_mp = mac_header(tsock->ts_link->bl_mh, daddr,
117 	    tci != TRILL_NO_TCI ? ETHERTYPE_VLAN : etype, mp, extra_hdr_len);
118 	if (hdr_mp == NULL) {
119 		freemsg(mp);
120 		return (NULL);
121 	}
122 
123 	if (tci != TRILL_NO_TCI) {
124 		/* LINTED: alignment */
125 		ethvlanhdr = (struct ether_vlan_header *)hdr_mp->b_rptr;
126 		ethvlanhdr->ether_tci = htons(tci);
127 		ethvlanhdr->ether_type = htons(etype);
128 		hdr_mp->b_wptr += sizeof (struct ether_vlan_extinfo);
129 	}
130 
131 	if (!trill_hdr_ok) {
132 		trill_header_t *thp;
133 		/* LINTED: alignment */
134 		thp = (trill_header_t *)hdr_mp->b_wptr;
135 		(void) memset(thp, 0, sizeof (trill_header_t));
136 		thp->th_hopcount = TRILL_DEFAULT_HOPS;
137 		thp->th_multidest = (multidest ? 1:0);
138 		hdr_mp->b_wptr += sizeof (trill_header_t);
139 	}
140 
141 	hdr_mp->b_cont = mp;
142 	return (hdr_mp);
143 }
144 
145 /*
146  * TRILL local recv function. TRILL data frames that should be received
147  * by the local system are decapsulated here and passed to bridging for
148  * learning and local system receive. Only called when we are the forwarder
149  * on the link (multi-dest frames) or the frame was destined for us.
150  */
151 static void
152 trill_recv_local(trill_sock_t *tsock, mblk_t *mp, uint16_t ingressnick)
153 {
154 	struct ether_header *inner_ethhdr;
155 
156 	/* LINTED: alignment */
157 	inner_ethhdr = (struct ether_header *)mp->b_rptr;
158 	DTRACE_PROBE1(trill__recv__local, struct ether_header *, inner_ethhdr);
159 
160 	DB_CKSUMFLAGS(mp) = 0;
161 	/*
162 	 * Transmit the decapsulated frame on the link via Bridging.
163 	 * Bridging does source address learning and appropriate forwarding.
164 	 */
165 	bridge_trill_decaps(tsock->ts_link, mp, ingressnick);
166 	KSPINCR(tks_decap);
167 }
168 
169 /*
170  * Determines the outgoing link to reach a RBridge having the given nick
171  * Assumes caller has acquired the trill instance rwlock.
172  */
173 static trill_sock_t *
174 find_trill_link(trill_inst_t *tip, datalink_id_t linkid)
175 {
176 	trill_sock_t *tsp = NULL;
177 
178 	ASSERT(RW_LOCK_HELD(&tip->ti_rwlock));
179 	for (tsp = list_head(&tip->ti_socklist); tsp != NULL;
180 	    tsp = list_next(&tip->ti_socklist, tsp)) {
181 		if (tsp->ts_link != NULL && tsp->ts_link->bl_linkid == linkid) {
182 			ASSERT(tsp->ts_link->bl_mh != NULL);
183 			ASSERT(!(tsp->ts_flags & TSF_SHUTDOWN));
184 			atomic_inc_uint(&tsp->ts_refs);
185 			break;
186 		}
187 	}
188 	return (tsp);
189 }
190 
191 /*
192  * TRILL destination forwarding function. Transmits the TRILL data packet
193  * to the next-hop, adjacent RBridge.  Consumes passed mblk_t.
194  */
195 static void
196 trill_dest_fwd(trill_inst_t *tip, mblk_t *fwd_mp, uint16_t adj_nick,
197     boolean_t has_trill_hdr, boolean_t multidest, uint16_t dtnick)
198 {
199 	trill_node_t *adj;
200 	trill_sock_t *tsock = NULL;
201 	trill_header_t *trillhdr;
202 	struct ether_header *ethhdr;
203 	int ethtype;
204 	int ethhdrlen;
205 
206 	adj = trill_node_lookup(tip, adj_nick);
207 	if (adj == NULL || ((tsock = adj->tn_tsp) == NULL))
208 		goto dest_fwd_fail;
209 
210 	ASSERT(tsock->ts_link != NULL);
211 	ASSERT(!(tsock->ts_flags & TSF_SHUTDOWN));
212 	ASSERT(adj->tn_ni != NULL);
213 
214 	DTRACE_PROBE3(trill__dest__fwd, uint16_t, adj_nick, trill_node_t,
215 	    adj, trill_sock_t, tsock);
216 
217 	/*
218 	 * For broadcast links by using the dest address of
219 	 * the RBridge to forward the frame should result in
220 	 * savings. When the link is a bridged LAN or there are
221 	 * many end stations the frame will not always be flooded.
222 	 */
223 	fwd_mp = create_trill_header(tsock, fwd_mp, adj->tn_ni->tni_adjsnpa,
224 	    has_trill_hdr, multidest, tsock->ts_desigvlan, 0);
225 	if (fwd_mp == NULL)
226 		goto dest_fwd_fail;
227 
228 	/* LINTED: alignment */
229 	ethhdr = (struct ether_header *)fwd_mp->b_rptr;
230 	ethtype = ntohs(ethhdr->ether_type);
231 	ASSERT(ethtype == ETHERTYPE_VLAN || ethtype == ETHERTYPE_TRILL);
232 
233 	/* Pullup Ethernet and TRILL header (w/o TRILL options) */
234 	ethhdrlen = sizeof (struct ether_header) +
235 	    (ethtype == ETHERTYPE_VLAN ? sizeof (struct ether_vlan_extinfo):0);
236 	if (!pullupmsg(fwd_mp, ethhdrlen + sizeof (trill_header_t)))
237 		goto dest_fwd_fail;
238 	/* LINTED: alignment */
239 	trillhdr = (struct trill_header *)(fwd_mp->b_rptr + ethhdrlen);
240 
241 	/* Update TRILL header with ingress and egress nicks for new frames */
242 	if (!has_trill_hdr) {
243 		/* We are creating a new TRILL frame */
244 		trillhdr->th_egressnick = (multidest ? dtnick:adj_nick);
245 		rw_enter(&tip->ti_rwlock, RW_READER);
246 		trillhdr->th_ingressnick = tip->ti_nick;
247 		rw_exit(&tip->ti_rwlock);
248 		if (!VALID_NICK(trillhdr->th_ingressnick))
249 			goto dest_fwd_fail;
250 	}
251 
252 	/* Set hop count and update header in packet */
253 	ASSERT(trillhdr->th_hopcount != 0);
254 	trillhdr->th_hopcount--;
255 
256 	/* Clear checksum flag and transmit frame on the link */
257 	DB_CKSUMFLAGS(fwd_mp) = 0;
258 	DTRACE_PROBE1(trill__dest__fwd__tx, trill_header_t *, &trillhdr);
259 	fwd_mp = bridge_trill_output(tsock->ts_link, fwd_mp);
260 	if (fwd_mp == NULL) {
261 		KSPINCR(tks_sent);
262 		KSPINCR(tks_forward);
263 	} else {
264 		freemsg(fwd_mp);
265 		KSPINCR(tks_drops);
266 	}
267 	trill_node_unref(tip, adj);
268 	return;
269 
270 dest_fwd_fail:
271 	if (adj != NULL)
272 		trill_node_unref(tip, adj);
273 	if (tsock != NULL)
274 		KSPINCR(tks_drops);
275 	freemsg(fwd_mp);
276 }
277 
/*
 * TRILL multi-destination forwarding. Transmits the packet to the adjacencies
 * on the distribution tree determined by the egress nick. Source addr (saddr)
 * is NULL for new TRILL packets originating from us.
 *
 * tip           TRILL instance.
 * mp            frame to forward.
 * egressnick    distribution tree root; its node info lists the adjacencies.
 * ingressnick   ingress RBridge nickname; never forwarded back to it.
 * is_trill_pkt  passed to trill_dest_fwd() as has_trill_hdr.
 * saddr         outer source MAC of the received frame, or NULL.
 * inner_vlan    inner VLAN ID used for pruning, or VLAN_ID_NONE.
 * free_mblk     B_TRUE if 'mp' is to be consumed here; B_FALSE if the caller
 *               keeps ownership, in which case only copies are sent.
 *
 * Note that when free_mblk is B_FALSE control always reaches the
 * fail_multidest_fwd label (and fires its probe), even when copies were
 * forwarded successfully.
 */
static void
trill_multidest_fwd(trill_inst_t *tip, mblk_t *mp, uint16_t egressnick,
    uint16_t ingressnick, boolean_t is_trill_pkt, const uint8_t *saddr,
    int inner_vlan, boolean_t free_mblk)
{
	int idx;
	uint16_t adjnick;
	trill_node_t *dest;
	trill_node_t *adj;
	mblk_t *fwd_mp;
	boolean_t nicksaved = B_FALSE;
	uint16_t adjnicksaved;	/* only meaningful when nicksaved is B_TRUE */

	/* Lookup the egress nick info, this is the DT root */
	if ((dest = trill_node_lookup(tip, egressnick)) == NULL)
		goto fail_multidest_fwd;

	/* Send a copy to all our adjacencies on the DT root  */
	ASSERT(dest->tn_ni);
	for (idx = 0; idx < dest->tn_ni->tni_adjcount; idx++) {

		/* Check for a valid adjacency node */
		adjnick = TNI_ADJNICK(dest->tn_ni, idx);
		if (!VALID_NICK(adjnick) || ingressnick == adjnick ||
		    ((adj = trill_node_lookup(tip, adjnick)) == NULL))
			continue;

		/* Do not forward back to adjacency that sent the pkt to us */
		ASSERT(adj->tn_ni != NULL);
		if ((saddr != NULL) &&
		    (memcmp(adj->tn_ni->tni_adjsnpa, saddr,
		    ETHERADDRL) == 0)) {
			trill_node_unref(tip, adj);
			continue;
		}

		/* Check if adj is marked as reaching inner VLAN downstream */
		if ((inner_vlan != VLAN_ID_NONE) &&
		    !TRILL_VLANISSET(TNI_VLANFILTERMAP(dest->tn_ni, idx),
		    inner_vlan)) {
			trill_node_unref(tip, adj);
			DTRACE_PROBE4(trill__multi__dest__fwd__vlanfiltered,
			    uint16_t, adjnick, uint16_t, ingressnick,
			    uint16_t, egressnick, int, inner_vlan);
			continue;
		}

		/*
		 * The adjacency was only needed for the checks above;
		 * trill_dest_fwd() does its own lookup by nickname.
		 */
		trill_node_unref(tip, adj);

		/*
		 * Save the nick and look ahead to see if we should forward the
		 * frame to more adjacencies. We avoid doing a copy for this
		 * nick and use the passed mblk when we can consume the passed
		 * mblk.
		 */
		if (free_mblk && !nicksaved) {
			adjnicksaved = adjnick;
			nicksaved = B_TRUE;
			continue;
		}

		/* Stop sending to further adjacencies if we cannot copy */
		fwd_mp = copymsg(mp);
		if (fwd_mp == NULL)
			break;
		DTRACE_PROBE2(trill__multi__dest__fwd, uint16_t,
		    adjnick, uint16_t, ingressnick);
		trill_dest_fwd(tip, fwd_mp, adjnick, is_trill_pkt,
		    B_TRUE, egressnick);
	}
	trill_node_unref(tip, dest);

	/* Send the original mblk to the adjacency saved in the loop */
	if (nicksaved) {
		ASSERT(free_mblk);
		DTRACE_PROBE2(trill__multi__dest__fwd, uint16_t,
		    adjnicksaved, uint16_t, ingressnick);
		trill_dest_fwd(tip, mp, adjnicksaved, is_trill_pkt,
		    B_TRUE, egressnick);
		return;
	}

fail_multidest_fwd:
	DTRACE_PROBE2(trill__multi__dest__fwd__fail, uint16_t,
	    egressnick, uint16_t, ingressnick);
	/* Nothing consumed 'mp' above; free it only if we own it */
	if (free_mblk) {
		freemsg(mp);
	}
}
370 
/*
 * TRILL data receive function. Forwards the received frame if necessary
 * and also determines if the received frame should be consumed locally.
 * Consumes passed mblk.
 *
 * tsock    socket bound to the link the frame arrived on.
 * mp       frame; b_rptr points at the TRILL header.
 * mpsaddr  outer Ethernet source address of the frame.
 */
static void
trill_recv(trill_sock_t *tsock, mblk_t *mp, const uint8_t *mpsaddr)
{
	trill_header_t *trillhdr;
	trill_node_t *dest = NULL;
	trill_node_t *source = NULL;
	trill_node_t *adj;
	uint16_t ournick, adjnick, treeroot;
	struct ether_header *ethhdr;
	trill_inst_t *tip = tsock->ts_tip;
	uint8_t srcaddr[ETHERADDRL];
	size_t trillhdrlen;
	int inner_vlan = VLAN_ID_NONE;
	int tci;
	int idx;
	size_t min_size;

	/* Copy Ethernet source address before modifying packet */
	(void) memcpy(srcaddr, mpsaddr, ETHERADDRL);

	/* Pull up TRILL header if necessary. */
	min_size = sizeof (trill_header_t);
	if ((MBLKL(mp) < min_size ||
	    !IS_P2ALIGNED(mp->b_rptr, TRILL_HDR_ALIGN)) &&
	    !pullupmsg(mp, min_size))
		goto fail;

	/* LINTED: alignment */
	trillhdr = (trill_header_t *)mp->b_rptr;
	if (trillhdr->th_version != TRILL_PROTOCOL_VERS) {
		DTRACE_PROBE1(trill__recv__wrongversion,
		    trill_header_t *, trillhdr);
		goto fail;
	}

	/* Drop if unknown or invalid nickname */
	if (!VALID_NICK(trillhdr->th_egressnick) ||
	    !VALID_NICK(trillhdr->th_ingressnick)) {
		DTRACE_PROBE1(trill__recv__invalidnick,
		    trill_header_t *, trillhdr);
		goto fail;
	}

	/* Snapshot our nickname and tree root under the instance lock */
	rw_enter(&tip->ti_rwlock, RW_READER);
	ournick = tip->ti_nick;
	treeroot = tip->ti_treeroot;
	rw_exit(&tip->ti_rwlock);
	/* Drop if we received a packet with our nick as ingress */
	if (trillhdr->th_ingressnick == ournick)
		goto fail;

	/* Re-pull any TRILL options and inner Ethernet header */
	min_size += GET_TRILL_OPTS_LEN(trillhdr) * sizeof (uint32_t) +
	    sizeof (struct ether_header);
	if (MBLKL(mp) < min_size) {
		if (!pullupmsg(mp, min_size))
			goto fail;
		/* Re-derive the header pointer from b_rptr after the pullup */
		/* LINTED: alignment */
		trillhdr = (trill_header_t *)mp->b_rptr;
	}
	trillhdrlen = sizeof (trill_header_t) +
	    (GET_TRILL_OPTS_LEN(trillhdr) * sizeof (uint32_t));

	/*
	 * Get the inner Ethernet header, plus the inner VLAN header if there
	 * is one.
	 */
	/* LINTED: alignment */
	ethhdr = (struct ether_header *)(mp->b_rptr + trillhdrlen);
	if (ethhdr->ether_type == htons(ETHERTYPE_VLAN)) {
		min_size += sizeof (struct ether_vlan_extinfo);
		if (MBLKL(mp) < min_size) {
			if (!pullupmsg(mp, min_size))
				goto fail;
			/* Both pointers are re-derived after the pullup */
			/* LINTED: alignment */
			trillhdr = (trill_header_t *)mp->b_rptr;
			/* LINTED: alignment */
			ethhdr = (struct ether_header *)(mp->b_rptr +
			    trillhdrlen);
		}

		tci = ntohs(((struct ether_vlan_header *)ethhdr)->ether_tci);
		inner_vlan = VLAN_ID(tci);
	}

	/* Known/single destination forwarding. */
	if (!trillhdr->th_multidest) {

		/* Inner MacDA must be unicast */
		if (ethhdr->ether_dhost.ether_addr_octet[0] & 1)
			goto fail;

		/* Ingress and Egress nicks must be different */
		if (trillhdr->th_egressnick == trillhdr->th_ingressnick)
			goto fail;

		DTRACE_PROBE1(trill__recv__singledest,
		    trill_header_t *, trillhdr);
		if (trillhdr->th_egressnick == ournick) {
			/* Addressed to us: strip TRILL header and receive */
			mp->b_rptr += trillhdrlen;
			trill_recv_local(tsock, mp, trillhdr->th_ingressnick);
		} else if (trillhdr->th_hopcount > 0) {
			/* Transit frame: forward towards the egress RBridge */
			trill_dest_fwd(tip, mp, trillhdr->th_egressnick,
			    B_TRUE, B_FALSE, RBRIDGE_NICKNAME_NONE);
		} else {
			/* Hop count exhausted */
			goto fail;
		}
		return;
	}

	/*
	 * Multi-destination frame: perform checks verifying we have
	 * received a valid multi-destination frame before receiving the
	 * frame locally and forwarding the frame to other RBridges.
	 *
	 * Check if we received this multi-destination frame on a
	 * adjacency in the distribution tree indicated by the frame's
	 * egress nickname.
	 */
	if ((dest = trill_node_lookup(tip, trillhdr->th_egressnick)) == NULL)
		goto fail;
	for (idx = 0; idx < dest->tn_ni->tni_adjcount; idx++) {
		adjnick = TNI_ADJNICK(dest->tn_ni, idx);
		if ((adj = trill_node_lookup(tip, adjnick)) == NULL)
			continue;
		if (memcmp(adj->tn_ni->tni_adjsnpa, srcaddr, ETHERADDRL) == 0) {
			trill_node_unref(tip, adj);
			break;
		}
		trill_node_unref(tip, adj);
	}

	/* The loop ran to completion: sender is not a tree adjacency */
	if (idx >= dest->tn_ni->tni_adjcount) {
		DTRACE_PROBE2(trill__recv__multidest__adjcheckfail,
		    trill_header_t *, trillhdr, trill_node_t *, dest);
		goto fail;
	}

	/*
	 * Reverse path forwarding check. Check if the ingress RBridge
	 * that has forwarded the frame advertised the use of the
	 * distribution tree specified in the egress nick.
	 */
	if ((source = trill_node_lookup(tip, trillhdr->th_ingressnick)) == NULL)
		goto fail;
	for (idx = 0; idx < source->tn_ni->tni_dtrootcount; idx++) {
		if (TNI_DTROOTNICK(source->tn_ni, idx) ==
		    trillhdr->th_egressnick)
			break;
	}

	if (idx >= source->tn_ni->tni_dtrootcount) {
		/*
		 * Allow receipt of forwarded frame with the highest
		 * tree root RBridge as the egress RBridge when the
		 * ingress RBridge has not advertised the use of any
		 * distribution trees.
		 */
		if (source->tn_ni->tni_dtrootcount != 0 ||
		    trillhdr->th_egressnick != treeroot) {
			DTRACE_PROBE3(
			    trill__recv__multidest__rpfcheckfail,
			    trill_header_t *, trillhdr, trill_node_t *,
			    source, trill_inst_t *, tip);
			goto fail;
		}
	}

	/* Check hop count before doing any forwarding */
	if (trillhdr->th_hopcount == 0)
		goto fail;

	/* Forward frame using the distribution tree specified by egress nick */
	DTRACE_PROBE2(trill__recv__multidest, trill_header_t *,
	    trillhdr, trill_node_t *, source);
	/* No "goto fail" follows, so both node references are dropped here */
	trill_node_unref(tip, source);
	trill_node_unref(tip, dest);

	/* Tell forwarding not to free if we're the link forwarder. */
	trill_multidest_fwd(tip, mp, trillhdr->th_egressnick,
	    trillhdr->th_ingressnick, B_TRUE, srcaddr, inner_vlan,
	    B_FALSE);

	/*
	 * Send de-capsulated frame locally if we are the link forwarder (also
	 * does bridge learning).
	 */
	mp->b_rptr += trillhdrlen;
	trill_recv_local(tsock, mp, trillhdr->th_ingressnick);
	KSPINCR(tks_recv);
	return;

fail:
	DTRACE_PROBE2(trill__recv__multidest__fail, mblk_t *, mp,
	    trill_sock_t *, tsock);
	/* Release whichever node references were taken before the failure */
	if (dest != NULL)
		trill_node_unref(tip, dest);
	if (source != NULL)
		trill_node_unref(tip, source);
	freemsg(mp);
	KSPINCR(tks_drops);
}
578 
579 static void
580 trill_stop_recv(trill_sock_t *tsock)
581 {
582 	mutex_enter(&tsock->ts_socklock);
583 stop_retry:
584 	if (tsock->ts_state == TS_UNBND || tsock->ts_link == NULL) {
585 		mutex_exit(&tsock->ts_socklock);
586 		return;
587 	}
588 
589 	/*
590 	 * If another thread is closing the socket then wait. Our callers
591 	 * expect us to return only after the socket is closed.
592 	 */
593 	if (tsock->ts_flags & TSF_CLOSEWAIT) {
594 		cv_wait(&tsock->ts_sockclosewait, &tsock->ts_socklock);
595 		goto stop_retry;
596 	}
597 
598 	/*
599 	 * Set state and flags to block new bind or close calls
600 	 * while we close the socket.
601 	 */
602 	tsock->ts_flags |= TSF_CLOSEWAIT;
603 
604 	/* Wait until all AF_TRILL socket transmit operations are done */
605 	while (tsock->ts_sockthreadcount > 0)
606 		cv_wait(&tsock->ts_sockthreadwait, &tsock->ts_socklock);
607 
608 	/*
609 	 * We are guaranteed to be the only thread closing on the
610 	 * socket while the TSF_CLOSEWAIT flag is set, all others cv_wait
611 	 * for us to finish.
612 	 */
613 	ASSERT(tsock->ts_link != NULL);
614 	if (tsock->ts_ksp != NULL)
615 		kstat_delete(tsock->ts_ksp);
616 
617 	/*
618 	 * Release lock before bridge_trill_lnunref to prevent deadlock
619 	 * between trill_ctrl_input thread waiting to acquire ts_socklock
620 	 * and bridge_trill_lnunref waiting for the trill thread to finish.
621 	 */
622 	mutex_exit(&tsock->ts_socklock);
623 
624 	/*
625 	 * Release TRILL link reference from Bridging. On return from
626 	 * bridge_trill_lnunref we can be sure there are no active TRILL data
627 	 * threads for this link.
628 	 */
629 	bridge_trill_lnunref(tsock->ts_link);
630 
631 	/* Set socket as unbound & wakeup threads waiting for socket to close */
632 	mutex_enter(&tsock->ts_socklock);
633 	ASSERT(tsock->ts_link != NULL);
634 	tsock->ts_link = NULL;
635 	tsock->ts_state = TS_UNBND;
636 	tsock->ts_flags &= ~TSF_CLOSEWAIT;
637 	cv_broadcast(&tsock->ts_sockclosewait);
638 	mutex_exit(&tsock->ts_socklock);
639 }
640 
641 static int
642 trill_start_recv(trill_sock_t *tsock, const struct sockaddr *sa, socklen_t len)
643 {
644 	struct sockaddr_dl *lladdr = (struct sockaddr_dl *)sa;
645 	datalink_id_t linkid;
646 	int err = 0;
647 
648 	if (len != sizeof (*lladdr))
649 		return (EINVAL);
650 
651 	mutex_enter(&tsock->ts_socklock);
652 	if (tsock->ts_tip == NULL || tsock->ts_state != TS_UNBND) {
653 		err = EINVAL;
654 		goto bind_error;
655 	}
656 
657 	if (tsock->ts_flags & TSF_CLOSEWAIT || tsock->ts_link != NULL) {
658 		err = EBUSY;
659 		goto bind_error;
660 	}
661 
662 	(void) memcpy(&(tsock->ts_lladdr), lladdr,
663 	    sizeof (struct sockaddr_dl));
664 	(void) memcpy(&linkid, tsock->ts_lladdr.sdl_data,
665 	    sizeof (datalink_id_t));
666 
667 	tsock->ts_link = bridge_trill_lnref(tsock->ts_tip->ti_binst,
668 	    linkid, tsock);
669 	if (tsock->ts_link == NULL) {
670 		err = EINVAL;
671 		goto bind_error;
672 	}
673 
674 	trill_kstats_init(tsock, tsock->ts_tip->ti_bridgename);
675 	tsock->ts_state = TS_IDLE;
676 
677 bind_error:
678 	mutex_exit(&tsock->ts_socklock);
679 	return (err);
680 }
681 
682 static int
683 trill_do_unbind(trill_sock_t *tsock)
684 {
685 	/* If a bind has not been done, we can't unbind. */
686 	if (tsock->ts_state != TS_IDLE)
687 		return (EINVAL);
688 
689 	trill_stop_recv(tsock);
690 	return (0);
691 }
692 
693 static void
694 trill_instance_unref(trill_inst_t *tip)
695 {
696 	rw_enter(&trill_inst_rwlock, RW_WRITER);
697 	rw_enter(&tip->ti_rwlock, RW_WRITER);
698 	if (atomic_dec_uint_nv(&tip->ti_refs) == 0) {
699 		list_remove(&trill_inst_list, tip);
700 		rw_exit(&tip->ti_rwlock);
701 		rw_exit(&trill_inst_rwlock);
702 		if (tip->ti_binst != NULL)
703 			bridge_trill_brunref(tip->ti_binst);
704 		list_destroy(&tip->ti_socklist);
705 		rw_destroy(&tip->ti_rwlock);
706 		kmem_free(tip, sizeof (*tip));
707 	} else {
708 		rw_exit(&tip->ti_rwlock);
709 		rw_exit(&trill_inst_rwlock);
710 	}
711 }
712 
713 /*
714  * This is called when the bridge module receives a TRILL-encapsulated packet
715  * on a given link or a packet identified as "TRILL control."  We must verify
716  * that it's for us (it almost certainly will be), and then either decapsulate
717  * (if it's to our nickname), forward (if it's to someone else), or send up one
718  * of the sockets (if it's control traffic).
719  *
720  * Sadly, on Ethernet, the control traffic is identified by Outer.MacDA, and
721  * not by TRILL header information.
722  */
723 static void
724 trill_recv_pkt_cb(void *lptr, bridge_link_t *blp, mac_resource_handle_t rsrc,
725     mblk_t *mp, mac_header_info_t *hdr_info)
726 {
727 	trill_sock_t *tsock = lptr;
728 
729 	_NOTE(ARGUNUSED(rsrc));
730 
731 	ASSERT(tsock->ts_tip != NULL);
732 	ASSERT(tsock->ts_link != NULL);
733 	ASSERT(!(tsock->ts_flags & TSF_SHUTDOWN));
734 
735 	/*
736 	 * Only receive packet if the source address is not multicast (which is
737 	 * bogus).
738 	 */
739 	if (hdr_info->mhi_saddr[0] & 1)
740 		goto discard;
741 
742 	/*
743 	 * Check if this is our own packet reflected back.  It should not be.
744 	 */
745 	if (bcmp(hdr_info->mhi_saddr, blp->bl_local_mac, ETHERADDRL) == 0)
746 		goto discard;
747 
748 	/* Only receive unicast packet if addressed to us */
749 	if (hdr_info->mhi_dsttype == MAC_ADDRTYPE_UNICAST &&
750 	    bcmp(hdr_info->mhi_daddr, blp->bl_local_mac, ETHERADDRL) != 0)
751 		goto discard;
752 
753 	if (hdr_info->mhi_bindsap == ETHERTYPE_TRILL) {
754 		/* TRILL data packets */
755 		trill_recv(tsock, mp, hdr_info->mhi_saddr);
756 	} else {
757 		/* Design constraint for cheap IS-IS/BPDU comparison */
758 		ASSERT(all_isis_rbridges[4] != bridge_group_address[4]);
759 		/* Send received control packet upstream */
760 		trill_ctrl_input(tsock, mp, hdr_info->mhi_saddr,
761 		    hdr_info->mhi_daddr[4] == all_isis_rbridges[4] ?
762 		    hdr_info->mhi_tci : TRILL_TCI_BPDU);
763 	}
764 
765 	return;
766 
767 discard:
768 	freemsg(mp);
769 	KSPINCR(tks_drops);
770 }
771 
772 /*
773  * This is called when the bridge module discovers that the destination address
774  * for a packet is not local -- it's through some remote node.  We must verify
775  * that the remote node isn't our nickname (it shouldn't be), add a TRILL
776  * header, and then use the IS-IS data to determine which link and which
777  * next-hop RBridge should be used for output.  We then transmit on that link.
778  *
779  * The egress_nick is RBRIDGE_NICKNAME_NONE for the "unknown destination" case.
780  */
781 static void
782 trill_encap_pkt_cb(void *lptr, bridge_link_t *blp, mac_header_info_t *hdr_info,
783     mblk_t *mp, uint16_t egress_nick)
784 {
785 	uint16_t ournick;
786 	uint16_t dtnick;
787 	trill_node_t *self = NULL;
788 	trill_sock_t *tsock = lptr;
789 	trill_inst_t *tip = tsock->ts_tip;
790 	int vlan = VLAN_ID_NONE;
791 
792 	_NOTE(ARGUNUSED(blp));
793 	ASSERT(hdr_info->mhi_bindsap != ETHERTYPE_TRILL);
794 
795 	/* egress_nick = RBRIDGE_NICKNAME_NONE is valid */
796 	if (egress_nick != RBRIDGE_NICKNAME_NONE && !VALID_NICK(egress_nick))
797 		goto discard;
798 
799 	/* Check if our own nick is valid before we do any forwarding */
800 	rw_enter(&tip->ti_rwlock, RW_READER);
801 	ournick = tip->ti_nick;
802 	dtnick = tip->ti_treeroot;
803 	rw_exit(&tip->ti_rwlock);
804 	if (!VALID_NICK(ournick))
805 		goto discard;
806 
807 	/*
808 	 * For Multi-Destination forwarding determine our choice of
809 	 * root distribution tree. If we didn't choose a distribution
810 	 * tree (dtroots_count=0) then we use the highest priority tree
811 	 * root (t_treeroot) else we drop the packet without forwarding.
812 	 */
813 	if (egress_nick == RBRIDGE_NICKNAME_NONE) {
814 		if ((self = trill_node_lookup(tip, ournick)) == NULL)
815 			goto discard;
816 
817 		/*
818 		 * Use the first DT configured for now. In future we
819 		 * should have DT selection code here.
820 		 */
821 		if (self->tn_ni->tni_dtrootcount > 0) {
822 			dtnick = TNI_DTROOTNICK(self->tn_ni, 0);
823 		}
824 
825 		trill_node_unref(tip, self);
826 		if (!VALID_NICK(dtnick)) {
827 			DTRACE_PROBE(trill__fwd__packet__nodtroot);
828 			goto discard;
829 		}
830 	}
831 
832 	/*
833 	 * Retrieve VLAN ID of the native frame used for VLAN
834 	 * pruning of multi-destination frames.
835 	 */
836 	if (hdr_info->mhi_istagged) {
837 		vlan = VLAN_ID(hdr_info->mhi_tci);
838 	}
839 
840 	DTRACE_PROBE2(trill__fwd__packet, mac_header_info_t *, hdr_info,
841 	    uint16_t, egress_nick);
842 	if (egress_nick == RBRIDGE_NICKNAME_NONE) {
843 		trill_multidest_fwd(tip, mp, dtnick,
844 		    ournick, B_FALSE, NULL, vlan, B_TRUE);
845 	} else {
846 		trill_dest_fwd(tip, mp, egress_nick, B_FALSE, B_FALSE,
847 		    RBRIDGE_NICKNAME_NONE);
848 	}
849 	KSPINCR(tks_encap);
850 	return;
851 
852 discard:
853 	freemsg(mp);
854 }
855 
856 /*
857  * This is called when the bridge module has completely torn down a bridge
858  * instance and all of the attached links.  We need to make the TRILL instance
859  * go away at this point.
860  */
861 static void
862 trill_br_dstr_cb(void *bptr, bridge_inst_t *bip)
863 {
864 	trill_inst_t *tip = bptr;
865 
866 	_NOTE(ARGUNUSED(bip));
867 	rw_enter(&tip->ti_rwlock, RW_WRITER);
868 	if (tip->ti_binst != NULL)
869 		bridge_trill_brunref(tip->ti_binst);
870 	tip->ti_binst = NULL;
871 	rw_exit(&tip->ti_rwlock);
872 }
873 
874 /*
875  * This is called when the bridge module is tearing down a link, but before the
876  * actual tear-down starts.  When this function returns, we must make sure that
877  * we will not initiate any new transmits on this link.
878  */
879 static void
880 trill_ln_dstr_cb(void *lptr, bridge_link_t *blp)
881 {
882 	trill_sock_t *tsock = lptr;
883 
884 	_NOTE(ARGUNUSED(blp));
885 	trill_stop_recv(tsock);
886 }
887 
888 static void
889 trill_init(void)
890 {
891 	list_create(&trill_inst_list, sizeof (trill_inst_t),
892 	    offsetof(trill_inst_t, ti_instnode));
893 	rw_init(&trill_inst_rwlock, NULL, RW_DRIVER, NULL);
894 	bridge_trill_register_cb(trill_recv_pkt_cb, trill_encap_pkt_cb,
895 	    trill_br_dstr_cb, trill_ln_dstr_cb);
896 }
897 
898 static void
899 trill_fini(void)
900 {
901 	bridge_trill_register_cb(NULL, NULL, NULL, NULL);
902 	rw_destroy(&trill_inst_rwlock);
903 	list_destroy(&trill_inst_list);
904 }
905 
906 /* Loadable module configuration entry points */
907 int
908 _init(void)
909 {
910 	int rc;
911 
912 	trill_init();
913 	if ((rc = mod_install(&ml)) != 0)
914 		trill_fini();
915 	return (rc);
916 }
917 
918 int
919 _info(struct modinfo *modinfop)
920 {
921 	return (mod_info(&ml, modinfop));
922 }
923 
924 int
925 _fini(void)
926 {
927 	int rc;
928 
929 	rw_enter(&trill_inst_rwlock, RW_READER);
930 	rc = list_is_empty(&trill_inst_list) ? 0 : EBUSY;
931 	rw_exit(&trill_inst_rwlock);
932 	if (rc == 0 && ((rc = mod_remove(&ml)) == 0))
933 		trill_fini();
934 	return (rc);
935 }
936 
937 static void
938 trill_kstats_init(trill_sock_t *tsock, const char *bname)
939 {
940 	int i;
941 	char kstatname[KSTAT_STRLEN];
942 	kstat_named_t  *knt;
943 	static const char *sock_kstats_list[] = { TRILL_KSSOCK_NAMES };
944 	char link_name[MAXNAMELEN];
945 	int num;
946 	int err;
947 
948 	bzero(link_name, sizeof (link_name));
949 	if ((err = dls_mgmt_get_linkinfo(tsock->ts_link->bl_linkid, link_name,
950 	    NULL, NULL, NULL)) != 0) {
951 		cmn_err(CE_WARN, "%s: trill_kstats_init: error %d retrieving"
952 		    " linkinfo for linkid:%d", "trill", err,
953 		    tsock->ts_link->bl_linkid);
954 		return;
955 	}
956 
957 	bzero(kstatname, sizeof (kstatname));
958 	(void) snprintf(kstatname, sizeof (kstatname), "%s-%s",
959 	    bname, link_name);
960 
961 	num = sizeof (sock_kstats_list) / sizeof (*sock_kstats_list);
962 	for (i = 0; i < num; i++) {
963 		knt = (kstat_named_t *)&(tsock->ts_kstats);
964 		kstat_named_init(&knt[i], sock_kstats_list[i],
965 		    KSTAT_DATA_UINT64);
966 	}
967 
968 	tsock->ts_ksp = kstat_create_zone("trill", 0, kstatname, "sock",
969 	    KSTAT_TYPE_NAMED, num, KSTAT_FLAG_VIRTUAL, GLOBAL_ZONEID);
970 	if (tsock->ts_ksp != NULL) {
971 		tsock->ts_ksp->ks_data = &tsock->ts_kstats;
972 		kstat_install(tsock->ts_ksp);
973 	}
974 }
975 
976 static trill_sock_t *
977 trill_do_open(int flags)
978 {
979 	trill_sock_t *tsock;
980 	int kmflag = ((flags & SOCKET_NOSLEEP)) ? KM_NOSLEEP:KM_SLEEP;
981 
982 	tsock = kmem_zalloc(sizeof (trill_sock_t), kmflag);
983 	if (tsock != NULL) {
984 		tsock->ts_state = TS_UNBND;
985 		tsock->ts_refs++;
986 		mutex_init(&tsock->ts_socklock, NULL, MUTEX_DRIVER, NULL);
987 		cv_init(&tsock->ts_sockthreadwait, NULL, CV_DRIVER, NULL);
988 		cv_init(&tsock->ts_sockclosewait, NULL, CV_DRIVER, NULL);
989 	}
990 	return (tsock);
991 }
992 
993 static int
994 trill_find_bridge(trill_sock_t *tsock, const char *bname, boolean_t can_create)
995 {
996 	trill_inst_t *tip, *newtip = NULL;
997 
998 	/* Allocate some memory (speculatively) before taking locks */
999 	if (can_create)
1000 		newtip = kmem_zalloc(sizeof (*tip), KM_NOSLEEP);
1001 
1002 	rw_enter(&trill_inst_rwlock, RW_WRITER);
1003 	for (tip = list_head(&trill_inst_list); tip != NULL;
1004 	    tip = list_next(&trill_inst_list, tip)) {
1005 		if (strcmp(tip->ti_bridgename, bname) == 0)
1006 			break;
1007 	}
1008 	if (tip == NULL) {
1009 		if (!can_create || newtip == NULL) {
1010 			rw_exit(&trill_inst_rwlock);
1011 			return (can_create ? ENOMEM : ENOENT);
1012 		}
1013 
1014 		tip = newtip;
1015 		newtip = NULL;
1016 		(void) strcpy(tip->ti_bridgename, bname);
1017 
1018 		/* Register TRILL instance with bridging */
1019 		tip->ti_binst = bridge_trill_brref(bname, tip);
1020 		if (tip->ti_binst == NULL) {
1021 			rw_exit(&trill_inst_rwlock);
1022 			kmem_free(tip, sizeof (*tip));
1023 			return (ENOENT);
1024 		}
1025 
1026 		rw_init(&tip->ti_rwlock, NULL, RW_DRIVER, NULL);
1027 		list_create(&tip->ti_socklist, sizeof (trill_sock_t),
1028 		    offsetof(trill_sock_t, ts_socklistnode));
1029 		list_insert_tail(&trill_inst_list, tip);
1030 	}
1031 	atomic_inc_uint(&tip->ti_refs);
1032 	rw_exit(&trill_inst_rwlock);
1033 
1034 	/* If we didn't need the preallocated memory, then discard now. */
1035 	if (newtip != NULL)
1036 		kmem_free(newtip, sizeof (*newtip));
1037 
1038 	rw_enter(&tip->ti_rwlock, RW_WRITER);
1039 	list_insert_tail(&(tip->ti_socklist), tsock);
1040 	tsock->ts_tip = tip;
1041 	rw_exit(&tip->ti_rwlock);
1042 	return (0);
1043 }
1044 
1045 static void
1046 trill_clear_bridge(trill_sock_t *tsock)
1047 {
1048 	trill_inst_t *tip;
1049 
1050 	if ((tip = tsock->ts_tip) == NULL)
1051 		return;
1052 	rw_enter(&tip->ti_rwlock, RW_WRITER);
1053 	list_remove(&tip->ti_socklist, tsock);
1054 	if (list_is_empty(&tip->ti_socklist))
1055 		trill_del_all(tip, B_TRUE);
1056 	rw_exit(&tip->ti_rwlock);
1057 }
1058 
1059 static void
1060 trill_sock_unref(trill_sock_t *tsock)
1061 {
1062 	if (atomic_dec_uint_nv(&tsock->ts_refs) == 0) {
1063 		mutex_destroy(&tsock->ts_socklock);
1064 		cv_destroy(&tsock->ts_sockthreadwait);
1065 		cv_destroy(&tsock->ts_sockclosewait);
1066 		kmem_free(tsock, sizeof (trill_sock_t));
1067 	}
1068 }
1069 
1070 static void
1071 trill_do_close(trill_sock_t *tsock)
1072 {
1073 	trill_inst_t *tip;
1074 
1075 	tip = tsock->ts_tip;
1076 	trill_stop_recv(tsock);
1077 	/* Remove socket from TRILL instance socket list */
1078 	trill_clear_bridge(tsock);
1079 	tsock->ts_flags |= TSF_SHUTDOWN;
1080 	trill_sock_unref(tsock);
1081 	if (tip != NULL)
1082 		trill_instance_unref(tip);
1083 }
1084 
1085 static void
1086 trill_del_all(trill_inst_t *tip, boolean_t lockheld)
1087 {
1088 	int i;
1089 
1090 	if (!lockheld)
1091 		rw_enter(&tip->ti_rwlock, RW_WRITER);
1092 	for (i = RBRIDGE_NICKNAME_MIN; i < RBRIDGE_NICKNAME_MAX; i++) {
1093 		if (tip->ti_nodes[i] != NULL)
1094 			(void) trill_del_nick(tip, i, B_TRUE);
1095 	}
1096 	if (!lockheld)
1097 		rw_exit(&tip->ti_rwlock);
1098 }
1099 
1100 static void
1101 trill_node_free(trill_node_t *nick_entry)
1102 {
1103 	trill_nickinfo_t *tni;
1104 
1105 	tni = nick_entry->tn_ni;
1106 	kmem_free(tni, TNI_TOTALSIZE(tni));
1107 	kmem_free(nick_entry, sizeof (trill_node_t));
1108 }
1109 
1110 static void
1111 trill_node_unref(trill_inst_t *tip, trill_node_t *tnp)
1112 {
1113 	if (atomic_dec_uint_nv(&tnp->tn_refs) == 0) {
1114 		if (tnp->tn_tsp != NULL)
1115 			trill_sock_unref(tnp->tn_tsp);
1116 		trill_node_free(tnp);
1117 		atomic_dec_uint(&tip->ti_nodecount);
1118 	}
1119 }
1120 
1121 static trill_node_t *
1122 trill_node_lookup(trill_inst_t *tip, uint16_t nick)
1123 {
1124 	trill_node_t *nick_entry;
1125 
1126 	if (!VALID_NICK(nick))
1127 		return (NULL);
1128 	rw_enter(&tip->ti_rwlock, RW_READER);
1129 	nick_entry = tip->ti_nodes[nick];
1130 	if (nick_entry != NULL) {
1131 		atomic_inc_uint(&nick_entry->tn_refs);
1132 	}
1133 	rw_exit(&tip->ti_rwlock);
1134 	return (nick_entry);
1135 }
1136 
1137 static int
1138 trill_del_nick(trill_inst_t *tip, uint16_t nick, boolean_t lockheld)
1139 {
1140 	trill_node_t *nick_entry;
1141 	int rc = ENOENT;
1142 
1143 	if (!lockheld)
1144 		rw_enter(&tip->ti_rwlock, RW_WRITER);
1145 	if (VALID_NICK(nick)) {
1146 		nick_entry = tip->ti_nodes[nick];
1147 		if (nick_entry != NULL) {
1148 			trill_node_unref(tip, nick_entry);
1149 			tip->ti_nodes[nick] = NULL;
1150 			rc = 0;
1151 		}
1152 	}
1153 	if (!lockheld)
1154 		rw_exit(&tip->ti_rwlock);
1155 	return (rc);
1156 }
1157 
1158 static int
1159 trill_add_nick(trill_inst_t *tip, void *arg, boolean_t self, int mode)
1160 {
1161 	uint16_t nick;
1162 	int size;
1163 	trill_node_t *tnode;
1164 	trill_nickinfo_t tnihdr;
1165 
1166 	/* First make sure we have at least the header available */
1167 	if (ddi_copyin(arg, &tnihdr, sizeof (trill_nickinfo_t), mode) != 0)
1168 		return (EFAULT);
1169 
1170 	nick = tnihdr.tni_nick;
1171 	if (!VALID_NICK(nick)) {
1172 		DTRACE_PROBE1(trill__add__nick__bad, trill_nickinfo_t *,
1173 		    &tnihdr);
1174 		return (EINVAL);
1175 	}
1176 
1177 	size = TNI_TOTALSIZE(&tnihdr);
1178 	if (size > TNI_MAXSIZE)
1179 		return (EINVAL);
1180 	tnode = kmem_zalloc(sizeof (trill_node_t), KM_SLEEP);
1181 	tnode->tn_ni = kmem_zalloc(size, KM_SLEEP);
1182 	if (ddi_copyin(arg, tnode->tn_ni, size, mode) != 0) {
1183 		kmem_free(tnode->tn_ni, size);
1184 		kmem_free(tnode, sizeof (trill_node_t));
1185 		return (EFAULT);
1186 	}
1187 
1188 	tnode->tn_refs++;
1189 	rw_enter(&tip->ti_rwlock, RW_WRITER);
1190 	if (tip->ti_nodes[nick] != NULL)
1191 		(void) trill_del_nick(tip, nick, B_TRUE);
1192 
1193 	if (self) {
1194 		tip->ti_nick = nick;
1195 	} else {
1196 		tnode->tn_tsp = find_trill_link(tip,
1197 		    tnode->tn_ni->tni_linkid);
1198 	}
1199 	DTRACE_PROBE2(trill__add__nick, trill_node_t *, tnode,
1200 	    uint16_t, nick);
1201 	tip->ti_nodes[nick] = tnode;
1202 	tip->ti_nodecount++;
1203 	rw_exit(&tip->ti_rwlock);
1204 	return (0);
1205 }
1206 
1207 static int
1208 trill_do_ioctl(trill_sock_t *tsock, int cmd, void *arg, int mode)
1209 {
1210 	int error = 0;
1211 	trill_inst_t *tip = tsock->ts_tip;
1212 
1213 	switch (cmd) {
1214 	case TRILL_DESIGVLAN: {
1215 		uint16_t desigvlan;
1216 
1217 		if (ddi_copyin(arg, &desigvlan, sizeof (desigvlan), mode) != 0)
1218 			return (EFAULT);
1219 		tsock->ts_desigvlan = desigvlan;
1220 		break;
1221 	}
1222 	case TRILL_VLANFWDER: {
1223 		uint8_t vlans[TRILL_VLANS_ARRSIZE];
1224 
1225 		if (tsock->ts_link == NULL)
1226 			return (EINVAL);
1227 		if ((ddi_copyin(arg, vlans, sizeof (vlans), mode)) != 0)
1228 			return (EFAULT);
1229 		bridge_trill_setvlans(tsock->ts_link, vlans);
1230 		break;
1231 	}
1232 	case TRILL_SETNICK:
1233 		if (tip == NULL)
1234 			return (EINVAL);
1235 		error = trill_add_nick(tip, arg, B_TRUE, mode);
1236 		break;
1237 
1238 	case TRILL_GETNICK:
1239 		if (tip == NULL)
1240 			return (EINVAL);
1241 		rw_enter(&tip->ti_rwlock, RW_READER);
1242 		if (ddi_copyout(&tip->ti_nick, arg, sizeof (tip->ti_nick),
1243 		    mode) != 0)
1244 			error = EFAULT;
1245 		rw_exit(&tip->ti_rwlock);
1246 		break;
1247 
1248 	case TRILL_ADDNICK:
1249 		if (tip == NULL)
1250 			break;
1251 		error = trill_add_nick(tip, arg, B_FALSE, mode);
1252 		break;
1253 
1254 	case TRILL_DELNICK: {
1255 		uint16_t delnick;
1256 
1257 		if (tip == NULL)
1258 			break;
1259 		if (ddi_copyin(arg, &delnick, sizeof (delnick), mode) != 0)
1260 			return (EFAULT);
1261 		error = trill_del_nick(tip, delnick, B_FALSE);
1262 		break;
1263 	}
1264 	case TRILL_DELALL:
1265 		if (tip == NULL)
1266 			break;
1267 		trill_del_all(tip, B_FALSE);
1268 		break;
1269 
1270 	case TRILL_TREEROOT: {
1271 		uint16_t treeroot;
1272 
1273 		if (tip == NULL)
1274 			break;
1275 		if (ddi_copyin(arg, &treeroot, sizeof (treeroot), mode) != 0)
1276 			return (EFAULT);
1277 		if (!VALID_NICK(treeroot))
1278 			return (EINVAL);
1279 		rw_enter(&tip->ti_rwlock, RW_WRITER);
1280 		tip->ti_treeroot = treeroot;
1281 		rw_exit(&tip->ti_rwlock);
1282 		break;
1283 	}
1284 	case TRILL_HWADDR:
1285 		if (tsock->ts_link == NULL)
1286 			break;
1287 		if (ddi_copyout(tsock->ts_link->bl_local_mac, arg, ETHERADDRL,
1288 		    mode) != 0)
1289 			return (EFAULT);
1290 		break;
1291 
1292 	case TRILL_NEWBRIDGE: {
1293 		char bname[MAXLINKNAMELEN];
1294 
1295 		if (tsock->ts_state != TS_UNBND)
1296 			return (ENOTSUP);
1297 		/* ts_tip can only be set once */
1298 		if (tip != NULL)
1299 			return (EEXIST);
1300 		if (ddi_copyin(arg, bname, sizeof (bname), mode) != 0)
1301 			return (EFAULT);
1302 		bname[MAXLINKNAMELEN-1] = '\0';
1303 		error = trill_find_bridge(tsock, bname, B_TRUE);
1304 		break;
1305 	}
1306 
1307 	case TRILL_GETBRIDGE: {
1308 		char bname[MAXLINKNAMELEN];
1309 
1310 		/* ts_tip can only be set once */
1311 		if (tip != NULL)
1312 			return (EEXIST);
1313 		if (ddi_copyin(arg, bname, sizeof (bname), mode) != 0)
1314 			return (EFAULT);
1315 		bname[MAXLINKNAMELEN - 1] = '\0';
1316 		error = trill_find_bridge(tsock, bname, B_FALSE);
1317 		break;
1318 	}
1319 
1320 	case TRILL_LISTNICK: {
1321 		trill_listnick_t tln;
1322 		trill_node_t *tnp;
1323 		trill_nickinfo_t *tnip;
1324 		uint16_t nick;
1325 
1326 		if (tip == NULL)
1327 			return (EINVAL);
1328 		if (ddi_copyin(arg, &tln, sizeof (tln), mode) != 0)
1329 			return (EFAULT);
1330 		nick = tln.tln_nick;
1331 		if (nick >= RBRIDGE_NICKNAME_MAX) {
1332 			error = EINVAL;
1333 			break;
1334 		}
1335 		rw_enter(&tip->ti_rwlock, RW_READER);
1336 		while (++nick < RBRIDGE_NICKNAME_MAX) {
1337 			if ((tnp = tip->ti_nodes[nick]) != NULL) {
1338 				tnip = tnp->tn_ni;
1339 				ASSERT(nick == tnip->tni_nick);
1340 				tln.tln_nick = nick;
1341 				bcopy(tnip->tni_adjsnpa, tln.tln_nexthop,
1342 				    ETHERADDRL);
1343 				tln.tln_ours = nick == tip->ti_nick;
1344 				if (tln.tln_ours || tnp->tn_tsp == NULL) {
1345 					tln.tln_linkid =
1346 					    DATALINK_INVALID_LINKID;
1347 				} else {
1348 					tln.tln_linkid =
1349 					    tnp->tn_tsp->ts_link->bl_linkid;
1350 				}
1351 				break;
1352 			}
1353 		}
1354 		rw_exit(&tip->ti_rwlock);
1355 		if (nick >= RBRIDGE_NICKNAME_MAX)
1356 			bzero(&tln, sizeof (tln));
1357 		if (ddi_copyout(&tln, arg, sizeof (tln), mode) != 0)
1358 			return (EFAULT);
1359 		break;
1360 	}
1361 
1362 	/*
1363 	 * Port flush: this is used when we lose AF on a port.  We must discard
1364 	 * all regular bridge forwarding entries on this port with the
1365 	 * indicated VLAN.
1366 	 */
1367 	case TRILL_PORTFLUSH: {
1368 		uint16_t vlan = (uint16_t)(uintptr_t)arg;
1369 
1370 		if (tsock->ts_link == NULL)
1371 			return (EINVAL);
1372 		bridge_trill_flush(tsock->ts_link, vlan, B_FALSE);
1373 		break;
1374 	}
1375 
1376 	/*
1377 	 * Nick flush: this is used when we lose AF on a port.  We must discard
1378 	 * all bridge TRILL forwarding entries on this port with the indicated
1379 	 * VLAN.
1380 	 */
1381 	case TRILL_NICKFLUSH: {
1382 		uint16_t vlan = (uint16_t)(uintptr_t)arg;
1383 
1384 		if (tsock->ts_link == NULL)
1385 			return (EINVAL);
1386 		bridge_trill_flush(tsock->ts_link, vlan, B_TRUE);
1387 		break;
1388 	}
1389 
1390 	case TRILL_GETMTU:
1391 		if (tsock->ts_link == NULL)
1392 			break;
1393 		if (ddi_copyout(&tsock->ts_link->bl_maxsdu, arg,
1394 		    sizeof (uint_t), mode) != 0)
1395 			return (EFAULT);
1396 		break;
1397 
1398 	default:
1399 		error = ENOTSUP;
1400 		break;
1401 	}
1402 
1403 	return (error);
1404 }
1405 
1406 /*
1407  * Sends received packet back upstream on the TRILL socket.
1408  * Consumes passed mblk_t.
1409  */
1410 static void
1411 trill_ctrl_input(trill_sock_t *tsock, mblk_t *mp, const uint8_t *saddr,
1412     uint16_t tci)
1413 {
1414 	int udi_size;
1415 	mblk_t *mp1;
1416 	struct T_unitdata_ind *tudi;
1417 	struct sockaddr_dl *sdl;
1418 	char *lladdr;
1419 	int error;
1420 
1421 	ASSERT(!(tsock->ts_flags & TSF_SHUTDOWN));
1422 	if (tsock->ts_flow_ctrld) {
1423 		freemsg(mp);
1424 		KSPINCR(tks_drops);
1425 		return;
1426 	}
1427 
1428 	udi_size =  sizeof (struct T_unitdata_ind) +
1429 	    sizeof (struct sockaddr_dl);
1430 	mp1 = allocb(udi_size, BPRI_MED);
1431 	if (mp1 == NULL) {
1432 		freemsg(mp);
1433 		KSPINCR(tks_drops);
1434 		return;
1435 	}
1436 
1437 	mp1->b_cont = mp;
1438 	mp = mp1;
1439 	mp->b_datap->db_type = M_PROTO;
1440 	/* LINTED: alignment */
1441 	tudi = (struct T_unitdata_ind *)mp->b_rptr;
1442 	mp->b_wptr = (uchar_t *)tudi + udi_size;
1443 
1444 	tudi->PRIM_type = T_UNITDATA_IND;
1445 	tudi->SRC_length = sizeof (struct sockaddr_dl);
1446 	tudi->SRC_offset = sizeof (struct T_unitdata_ind);
1447 	tudi->OPT_length = 0;
1448 	tudi->OPT_offset = sizeof (struct T_unitdata_ind) +
1449 	    sizeof (struct sockaddr_dl);
1450 
1451 	/* Information of the link on which packet was received. */
1452 	sdl = (struct sockaddr_dl *)&tudi[1];
1453 	(void) memset(sdl, 0, sizeof (struct sockaddr_dl));
1454 	sdl->sdl_family = AF_TRILL;
1455 
1456 	/* LINTED: alignment */
1457 	*(datalink_id_t *)sdl->sdl_data = tsock->ts_link->bl_linkid;
1458 	sdl->sdl_nlen = sizeof (tsock->ts_link->bl_linkid);
1459 
1460 	lladdr = LLADDR(sdl);
1461 	(void) memcpy(lladdr, saddr, ETHERADDRL);
1462 	lladdr += ETHERADDRL;
1463 	sdl->sdl_alen = ETHERADDRL;
1464 
1465 	/* LINTED: alignment */
1466 	*(uint16_t *)lladdr = tci;
1467 	sdl->sdl_slen = sizeof (uint16_t);
1468 
1469 	DTRACE_PROBE2(trill__ctrl__input, trill_sock_t *, tsock, mblk_t *, mp);
1470 	(*tsock->ts_conn_upcalls->su_recv)(tsock->ts_conn_upper_handle,
1471 	    mp, msgdsize(mp), 0, &error, NULL);
1472 
1473 	if (error == ENOSPC) {
1474 		mutex_enter(&tsock->ts_socklock);
1475 		(*tsock->ts_conn_upcalls->su_recv)(tsock->ts_conn_upper_handle,
1476 		    NULL, 0, 0, &error, NULL);
1477 		if (error == ENOSPC)
1478 			tsock->ts_flow_ctrld = B_TRUE;
1479 		mutex_exit(&tsock->ts_socklock);
1480 		KSPINCR(tks_drops);
1481 	} else if (error != 0) {
1482 		KSPINCR(tks_drops);
1483 	} else {
1484 		KSPINCR(tks_recv);
1485 	}
1486 
1487 	DTRACE_PROBE2(trill__ctrl__input__done, trill_sock_t *,
1488 	    tsock, int, error);
1489 }
1490 
1491 /* ARGSUSED */
1492 static void
1493 trill_activate(sock_lower_handle_t proto_handle,
1494     sock_upper_handle_t sock_handle, sock_upcalls_t *sock_upcalls,
1495     int flags, cred_t *cr)
1496 {
1497 	trill_sock_t *tsock = (trill_sock_t *)proto_handle;
1498 	struct sock_proto_props sopp;
1499 
1500 	tsock->ts_conn_upcalls = sock_upcalls;
1501 	tsock->ts_conn_upper_handle = sock_handle;
1502 
1503 	sopp.sopp_flags = SOCKOPT_WROFF | SOCKOPT_RCVHIWAT |
1504 	    SOCKOPT_RCVLOWAT | SOCKOPT_MAXADDRLEN | SOCKOPT_MAXPSZ |
1505 	    SOCKOPT_MAXBLK | SOCKOPT_MINPSZ;
1506 	sopp.sopp_wroff = 0;
1507 	sopp.sopp_rxhiwat = SOCKET_RECVHIWATER;
1508 	sopp.sopp_rxlowat = SOCKET_RECVLOWATER;
1509 	sopp.sopp_maxaddrlen = sizeof (struct sockaddr_dl);
1510 	sopp.sopp_maxpsz = INFPSZ;
1511 	sopp.sopp_maxblk = INFPSZ;
1512 	sopp.sopp_minpsz = 0;
1513 	(*tsock->ts_conn_upcalls->su_set_proto_props)(
1514 	    tsock->ts_conn_upper_handle, &sopp);
1515 }
1516 
1517 /* ARGSUSED */
1518 static int
1519 trill_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr)
1520 {
1521 	trill_sock_t *tsock = (trill_sock_t *)proto_handle;
1522 
1523 	trill_do_close(tsock);
1524 	return (0);
1525 }
1526 
1527 /* ARGSUSED */
1528 static int
1529 trill_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa,
1530     socklen_t len, cred_t *cr)
1531 {
1532 	int error;
1533 	trill_sock_t *tsock = (trill_sock_t *)proto_handle;
1534 
1535 	if (sa == NULL)
1536 		error = trill_do_unbind(tsock);
1537 	else
1538 		error = trill_start_recv(tsock, sa, len);
1539 
1540 	return (error);
1541 }
1542 
1543 /*
1544  * This is a token getsockopt() implementation so we can reply to SO_PROTOCOL.
1545  */
1546 static int
1547 trill_getsockopt(sock_lower_handle_t handle, int level,
1548     int option_name, void *optval, socklen_t *optlenp, struct cred *cr)
1549 {
1550 	int32_t value;
1551 
1552 	if (level != SOL_SOCKET && option_name != SO_PROTOCOL) {
1553 		return (ENOPROTOOPT);
1554 	}
1555 
1556 	if (*optlenp < sizeof (int32_t)) {
1557 		return (EINVAL);
1558 	}
1559 
1560 	value = 0;
1561 	bcopy(&value, optval, sizeof (value));
1562 	*optlenp = sizeof (value);
1563 	return (0);
1564 }
1565 
1566 /* ARGSUSED */
1567 static int
1568 trill_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg,
1569     cred_t *cr)
1570 {
1571 	trill_sock_t *tsock = (trill_sock_t *)proto_handle;
1572 	struct sockaddr_dl *laddr;
1573 	uint16_t tci;
1574 
1575 	ASSERT(DB_TYPE(mp) == M_DATA);
1576 	ASSERT(!(tsock->ts_flags & TSF_SHUTDOWN));
1577 
1578 	if (msg->msg_name == NULL || msg->msg_namelen != sizeof (*laddr))
1579 		goto eproto;
1580 
1581 	/*
1582 	 * The name is a datalink_id_t, the address is an Ethernet address, and
1583 	 * the selector value is the VLAN ID.
1584 	 */
1585 	laddr = (struct sockaddr_dl *)msg->msg_name;
1586 	if (laddr->sdl_nlen != sizeof (datalink_id_t) ||
1587 	    laddr->sdl_alen != ETHERADDRL ||
1588 	    (laddr->sdl_slen != sizeof (tci) && laddr->sdl_slen != 0))
1589 		goto eproto;
1590 
1591 	mutex_enter(&tsock->ts_socklock);
1592 	if (tsock->ts_state != TS_IDLE || tsock->ts_link == NULL) {
1593 		mutex_exit(&tsock->ts_socklock);
1594 		goto eproto;
1595 	}
1596 	atomic_inc_uint(&tsock->ts_sockthreadcount);
1597 	mutex_exit(&tsock->ts_socklock);
1598 
1599 	/*
1600 	 * Safe to dereference VLAN now, as we've checked the user's specified
1601 	 * values, and alignment is now guaranteed.
1602 	 */
1603 	if (laddr->sdl_slen == 0) {
1604 		tci = TRILL_NO_TCI;
1605 	} else {
1606 		/* LINTED: alignment */
1607 		tci = *(uint16_t *)(LLADDR(laddr) + ETHERADDRL);
1608 	}
1609 
1610 	mp = create_trill_header(tsock, mp, (const uchar_t *)LLADDR(laddr),
1611 	    B_TRUE, B_FALSE, tci, msgdsize(mp));
1612 	if (mp != NULL) {
1613 		mp = bridge_trill_output(tsock->ts_link, mp);
1614 		if (mp == NULL) {
1615 			KSPINCR(tks_sent);
1616 		} else {
1617 			freemsg(mp);
1618 			KSPINCR(tks_drops);
1619 		}
1620 	}
1621 
1622 	/* Wake up any threads blocking on us */
1623 	if (atomic_dec_uint_nv(&tsock->ts_sockthreadcount) == 0)
1624 		cv_broadcast(&tsock->ts_sockthreadwait);
1625 	return (0);
1626 
1627 eproto:
1628 	freemsg(mp);
1629 	KSPINCR(tks_drops);
1630 	return (EPROTO);
1631 }
1632 
1633 /* ARGSUSED */
1634 static int
1635 trill_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg,
1636     int mode, int32_t *rvalp, cred_t *cr)
1637 {
1638 	trill_sock_t *tsock = (trill_sock_t *)proto_handle;
1639 	int rc;
1640 
1641 	switch (cmd) {
1642 	/* List of unprivileged TRILL ioctls */
1643 	case TRILL_GETNICK:
1644 	case TRILL_GETBRIDGE:
1645 	case TRILL_LISTNICK:
1646 		break;
1647 	default:
1648 		if (secpolicy_dl_config(cr) != 0)
1649 			return (EPERM);
1650 		break;
1651 	}
1652 
1653 	/* Lock ensures socket state is unchanged during ioctl handling */
1654 	mutex_enter(&tsock->ts_socklock);
1655 	rc = trill_do_ioctl(tsock, cmd, (void *)arg, mode);
1656 	mutex_exit(&tsock->ts_socklock);
1657 	return (rc);
1658 }
1659 
1660 static void
1661 trill_clr_flowctrl(sock_lower_handle_t proto_handle)
1662 {
1663 	trill_sock_t *tsock = (trill_sock_t *)proto_handle;
1664 
1665 	mutex_enter(&tsock->ts_socklock);
1666 	tsock->ts_flow_ctrld = B_FALSE;
1667 	mutex_exit(&tsock->ts_socklock);
1668 }
1669 
/*
 * Downcall vector handed to the socket layer by trill_create().  The
 * initializer is positional, so the order of entries must match the layout
 * of sock_downcalls_t.  Operations that AF_TRILL sockets do not support
 * point at the generic sock_*_notsupp handlers, and the uio/poll entries
 * are left NULL.
 *
 * NOTE(review): the flow-control slot below is labelled sd_setflowctrl but
 * is filled with trill_clr_flowctrl; confirm the label against the
 * sock_downcalls_t definition.
 */
static sock_downcalls_t sock_trill_downcalls = {
	trill_activate,			/* sd_activate */
	sock_accept_notsupp,		/* sd_accept */
	trill_bind,			/* sd_bind */
	sock_listen_notsupp,		/* sd_listen */
	sock_connect_notsupp,		/* sd_connect */
	sock_getpeername_notsupp,	/* sd_getpeername */
	sock_getsockname_notsupp,	/* sd_getsockname */
	trill_getsockopt,		/* sd_getsockopt */
	sock_setsockopt_notsupp,	/* sd_setsockopt */
	trill_send,			/* sd_send */
	NULL,				/* sd_send_uio */
	NULL,				/* sd_recv_uio */
	NULL,				/* sd_poll */
	sock_shutdown_notsupp,		/* sd_shutdown */
	trill_clr_flowctrl,		/* sd_setflowctrl */
	trill_ioctl,			/* sd_ioctl */
	trill_close			/* sd_close */
};
1689 
1690 /* ARGSUSED */
1691 static sock_lower_handle_t
1692 trill_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls,
1693     uint_t *smodep, int *errorp, int flags, cred_t *credp)
1694 {
1695 	trill_sock_t *tsock;
1696 
1697 	if (family != AF_TRILL || type != SOCK_DGRAM || proto != 0) {
1698 		*errorp = EPROTONOSUPPORT;
1699 		return (NULL);
1700 	}
1701 
1702 	*sock_downcalls = &sock_trill_downcalls;
1703 	*smodep = SM_ATOMIC;
1704 	tsock = trill_do_open(flags);
1705 	*errorp = (tsock != NULL) ? 0:ENOMEM;
1706 	return ((sock_lower_handle_t)tsock);
1707 }
1708