xref: /titanic_44/usr/src/uts/common/io/trill.c (revision 726fad2a65f16c200a03969c29cb5c86c2d427db)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  *  This module supports AF_TRILL sockets and TRILL layer-2 forwarding.
29  */
30 
31 #include <sys/strsubr.h>
32 #include <sys/socket.h>
33 #include <sys/socketvar.h>
34 #include <sys/modctl.h>
35 #include <sys/cmn_err.h>
36 #include <sys/tihdr.h>
37 #include <sys/strsun.h>
38 #include <sys/policy.h>
39 #include <sys/ethernet.h>
40 #include <sys/vlan.h>
41 #include <net/trill.h>
42 #include <net/if_dl.h>
43 #include <sys/mac.h>
44 #include <sys/mac_client.h>
45 #include <sys/mac_provider.h>
46 #include <sys/mac_client_priv.h>
47 #include <sys/sdt.h>
48 #include <sys/dls.h>
49 #include <sys/sunddi.h>
50 
51 #include "trill_impl.h"
52 
/* Forward declarations for static functions used before their definition. */
static void trill_del_all(trill_inst_t *, boolean_t);
static int trill_del_nick(trill_inst_t *, uint16_t, boolean_t);
static void trill_stop_recv(trill_sock_t *);
static void trill_ctrl_input(trill_sock_t *, mblk_t *, const uint8_t *,
    uint16_t);
static trill_node_t *trill_node_lookup(trill_inst_t *, uint16_t);
static void trill_node_unref(trill_inst_t *, trill_node_t *);
static void trill_sock_unref(trill_sock_t *);
static void trill_kstats_init(trill_sock_t *, const char *);

/*
 * List of all TRILL instances (trill_inst_t, linked through ti_instnode)
 * and the reader/writer lock taken around accesses to that list.
 */
static list_t trill_inst_list;
static krwlock_t trill_inst_rwlock;

/* Socket creation entry point handed to the socket framework via sinfo. */
static sock_lower_handle_t trill_create(int, int, int, sock_downcalls_t **,
    uint_t *, int *, int, cred_t *);
68 
/*
 * Socket module registration record: names the module "trill" and supplies
 * trill_create as its creation function.  The final member is left NULL.
 */
static smod_reg_t sinfo = {
	SOCKMOD_VERSION,
	"trill",
	SOCK_UC_VERSION,
	SOCK_DC_VERSION,
	trill_create,
	NULL,
};
77 
/* modlsockmod structure: ties the socket module ops to sinfo */
static struct modlsockmod sockmod = {
	&mod_sockmodops, "AF_TRILL socket module", &sinfo
};
82 
/*
 * modlinkage structure passed to mod_install, mod_info and mod_remove;
 * sockmod is its only linkage element.
 */
static struct modlinkage ml = {
	MODREV_1,
	&sockmod,
	NULL
};
89 
/*
 * Evaluates to non-zero when nickname n is neither RBRIDGE_NICKNAME_NONE
 * nor RBRIDGE_NICKNAME_UNUSED.  The argument may be evaluated twice.
 */
#define	VALID_NICK(n)	(!((n) == RBRIDGE_NICKNAME_NONE || \
			(n) == RBRIDGE_NICKNAME_UNUSED))
92 
93 static mblk_t *
94 create_trill_header(trill_sock_t *tsock, mblk_t *mp, const uint8_t *daddr,
95     boolean_t trill_hdr_ok, boolean_t multidest, uint16_t tci,
96     size_t msglen)
97 {
98 	int extra_hdr_len;
99 	struct ether_vlan_header *ethvlanhdr;
100 	mblk_t *hdr_mp;
101 	uint16_t etype;
102 
103 	etype = msglen > 0 ? (uint16_t)msglen : ETHERTYPE_TRILL;
104 
105 	/* When sending on the PVID, we must not give a VLAN ID */
106 	if (tci == tsock->ts_link->bl_pvid)
107 		tci = TRILL_NO_TCI;
108 
109 	/*
110 	 * Create new Ethernet header and include additional space
111 	 * for writing TRILL header and/or VLAN tag.
112 	 */
113 	extra_hdr_len = (trill_hdr_ok ? 0 : sizeof (trill_header_t)) +
114 	    (tci != TRILL_NO_TCI ? sizeof (struct ether_vlan_extinfo) : 0);
115 	hdr_mp = mac_header(tsock->ts_link->bl_mh, daddr,
116 	    tci != TRILL_NO_TCI ? ETHERTYPE_VLAN : etype, mp, extra_hdr_len);
117 	if (hdr_mp == NULL) {
118 		freemsg(mp);
119 		return (NULL);
120 	}
121 
122 	if (tci != TRILL_NO_TCI) {
123 		/* LINTED: alignment */
124 		ethvlanhdr = (struct ether_vlan_header *)hdr_mp->b_rptr;
125 		ethvlanhdr->ether_tci = htons(tci);
126 		ethvlanhdr->ether_type = htons(etype);
127 		hdr_mp->b_wptr += sizeof (struct ether_vlan_extinfo);
128 	}
129 
130 	if (!trill_hdr_ok) {
131 		trill_header_t *thp;
132 		/* LINTED: alignment */
133 		thp = (trill_header_t *)hdr_mp->b_wptr;
134 		(void) memset(thp, 0, sizeof (trill_header_t));
135 		thp->th_hopcount = TRILL_DEFAULT_HOPS;
136 		thp->th_multidest = (multidest ? 1:0);
137 		hdr_mp->b_wptr += sizeof (trill_header_t);
138 	}
139 
140 	hdr_mp->b_cont = mp;
141 	return (hdr_mp);
142 }
143 
144 /*
145  * TRILL local recv function. TRILL data frames that should be received
146  * by the local system are decapsulated here and passed to bridging for
147  * learning and local system receive. Only called when we are the forwarder
148  * on the link (multi-dest frames) or the frame was destined for us.
149  */
150 static void
151 trill_recv_local(trill_sock_t *tsock, mblk_t *mp, uint16_t ingressnick)
152 {
153 	struct ether_header *inner_ethhdr;
154 
155 	/* LINTED: alignment */
156 	inner_ethhdr = (struct ether_header *)mp->b_rptr;
157 	DTRACE_PROBE1(trill__recv__local, struct ether_header *, inner_ethhdr);
158 
159 	DB_CKSUMFLAGS(mp) = 0;
160 	/*
161 	 * Transmit the decapsulated frame on the link via Bridging.
162 	 * Bridging does source address learning and appropriate forwarding.
163 	 */
164 	bridge_trill_decaps(tsock->ts_link, mp, ingressnick);
165 	KSPINCR(tks_decap);
166 }
167 
168 /*
169  * Determines the outgoing link to reach a RBridge having the given nick
170  * Assumes caller has acquired the trill instance rwlock.
171  */
172 static trill_sock_t *
173 find_trill_link(trill_inst_t *tip, datalink_id_t linkid)
174 {
175 	trill_sock_t *tsp = NULL;
176 
177 	ASSERT(RW_LOCK_HELD(&tip->ti_rwlock));
178 	for (tsp = list_head(&tip->ti_socklist); tsp != NULL;
179 	    tsp = list_next(&tip->ti_socklist, tsp)) {
180 		if (tsp->ts_link != NULL && tsp->ts_link->bl_linkid == linkid) {
181 			ASSERT(tsp->ts_link->bl_mh != NULL);
182 			ASSERT(!(tsp->ts_flags & TSF_SHUTDOWN));
183 			atomic_inc_uint(&tsp->ts_refs);
184 			break;
185 		}
186 	}
187 	return (tsp);
188 }
189 
190 /*
191  * TRILL destination forwarding function. Transmits the TRILL data packet
192  * to the next-hop, adjacent RBridge.  Consumes passed mblk_t.
193  */
194 static void
195 trill_dest_fwd(trill_inst_t *tip, mblk_t *fwd_mp, uint16_t adj_nick,
196     boolean_t has_trill_hdr, boolean_t multidest, uint16_t dtnick)
197 {
198 	trill_node_t *adj;
199 	trill_sock_t *tsock = NULL;
200 	trill_header_t *trillhdr;
201 	struct ether_header *ethhdr;
202 	int ethtype;
203 	int ethhdrlen;
204 
205 	adj = trill_node_lookup(tip, adj_nick);
206 	if (adj == NULL || ((tsock = adj->tn_tsp) == NULL))
207 		goto dest_fwd_fail;
208 
209 	ASSERT(tsock->ts_link != NULL);
210 	ASSERT(!(tsock->ts_flags & TSF_SHUTDOWN));
211 	ASSERT(adj->tn_ni != NULL);
212 
213 	DTRACE_PROBE3(trill__dest__fwd, uint16_t, adj_nick, trill_node_t,
214 	    adj, trill_sock_t, tsock);
215 
216 	/*
217 	 * For broadcast links by using the dest address of
218 	 * the RBridge to forward the frame should result in
219 	 * savings. When the link is a bridged LAN or there are
220 	 * many end stations the frame will not always be flooded.
221 	 */
222 	fwd_mp = create_trill_header(tsock, fwd_mp, adj->tn_ni->tni_adjsnpa,
223 	    has_trill_hdr, multidest, tsock->ts_desigvlan, 0);
224 	if (fwd_mp == NULL)
225 		goto dest_fwd_fail;
226 
227 	/* LINTED: alignment */
228 	ethhdr = (struct ether_header *)fwd_mp->b_rptr;
229 	ethtype = ntohs(ethhdr->ether_type);
230 	ASSERT(ethtype == ETHERTYPE_VLAN || ethtype == ETHERTYPE_TRILL);
231 
232 	/* Pullup Ethernet and TRILL header (w/o TRILL options) */
233 	ethhdrlen = sizeof (struct ether_header) +
234 	    (ethtype == ETHERTYPE_VLAN ? sizeof (struct ether_vlan_extinfo):0);
235 	if (!pullupmsg(fwd_mp, ethhdrlen + sizeof (trill_header_t)))
236 		goto dest_fwd_fail;
237 	/* LINTED: alignment */
238 	trillhdr = (struct trill_header *)(fwd_mp->b_rptr + ethhdrlen);
239 
240 	/* Update TRILL header with ingress and egress nicks for new frames */
241 	if (!has_trill_hdr) {
242 		/* We are creating a new TRILL frame */
243 		trillhdr->th_egressnick = (multidest ? dtnick:adj_nick);
244 		rw_enter(&tip->ti_rwlock, RW_READER);
245 		trillhdr->th_ingressnick = tip->ti_nick;
246 		rw_exit(&tip->ti_rwlock);
247 		if (!VALID_NICK(trillhdr->th_ingressnick))
248 			goto dest_fwd_fail;
249 	}
250 
251 	/* Set hop count and update header in packet */
252 	ASSERT(trillhdr->th_hopcount != 0);
253 	trillhdr->th_hopcount--;
254 
255 	/* Clear checksum flag and transmit frame on the link */
256 	DB_CKSUMFLAGS(fwd_mp) = 0;
257 	DTRACE_PROBE1(trill__dest__fwd__tx, trill_header_t *, &trillhdr);
258 	fwd_mp = bridge_trill_output(tsock->ts_link, fwd_mp);
259 	if (fwd_mp == NULL) {
260 		KSPINCR(tks_sent);
261 		KSPINCR(tks_forward);
262 	} else {
263 		freemsg(fwd_mp);
264 		KSPINCR(tks_drops);
265 	}
266 	trill_node_unref(tip, adj);
267 	return;
268 
269 dest_fwd_fail:
270 	if (adj != NULL)
271 		trill_node_unref(tip, adj);
272 	if (tsock != NULL)
273 		KSPINCR(tks_drops);
274 	freemsg(fwd_mp);
275 }
276 
277 /*
278  * TRILL multi-destination forwarding. Transmits the packet to the adjacencies
279  * on the distribution tree determined by the egress nick. Source addr (saddr)
280  * is NULL for new TRILL packets originating from us.
281  */
282 static void
283 trill_multidest_fwd(trill_inst_t *tip, mblk_t *mp, uint16_t egressnick,
284     uint16_t ingressnick, boolean_t is_trill_pkt, const uint8_t *saddr,
285     int inner_vlan, boolean_t free_mblk)
286 {
287 	int idx;
288 	uint16_t adjnick;
289 	trill_node_t *dest;
290 	trill_node_t *adj;
291 	mblk_t *fwd_mp;
292 	boolean_t nicksaved = B_FALSE;
293 	uint16_t adjnicksaved;
294 
295 	/* Lookup the egress nick info, this is the DT root */
296 	if ((dest = trill_node_lookup(tip, egressnick)) == NULL)
297 		goto fail_multidest_fwd;
298 
299 	/* Send a copy to all our adjacencies on the DT root  */
300 	ASSERT(dest->tn_ni);
301 	for (idx = 0; idx < dest->tn_ni->tni_adjcount; idx++) {
302 
303 		/* Check for a valid adjacency node */
304 		adjnick = TNI_ADJNICK(dest->tn_ni, idx);
305 		if (!VALID_NICK(adjnick) || ingressnick == adjnick ||
306 		    ((adj = trill_node_lookup(tip, adjnick)) == NULL))
307 			continue;
308 
309 		/* Do not forward back to adjacency that sent the pkt to us */
310 		ASSERT(adj->tn_ni != NULL);
311 		if ((saddr != NULL) &&
312 		    (memcmp(adj->tn_ni->tni_adjsnpa, saddr,
313 		    ETHERADDRL) == 0)) {
314 			trill_node_unref(tip, adj);
315 			continue;
316 		}
317 
318 		/* Check if adj is marked as reaching inner VLAN downstream */
319 		if ((inner_vlan != VLAN_ID_NONE) &&
320 		    !TRILL_VLANISSET(TNI_VLANFILTERMAP(dest->tn_ni, idx),
321 		    inner_vlan)) {
322 			trill_node_unref(tip, adj);
323 			DTRACE_PROBE4(trill__multi__dest__fwd__vlanfiltered,
324 			    uint16_t, adjnick, uint16_t, ingressnick,
325 			    uint16_t, egressnick, int, inner_vlan);
326 			continue;
327 		}
328 
329 		trill_node_unref(tip, adj);
330 
331 		/*
332 		 * Save the nick and look ahead to see if we should forward the
333 		 * frame to more adjacencies. We avoid doing a copy for this
334 		 * nick and use the passed mblk when we can consume the passed
335 		 * mblk.
336 		 */
337 		if (free_mblk && !nicksaved) {
338 			adjnicksaved = adjnick;
339 			nicksaved = B_TRUE;
340 			continue;
341 		}
342 
343 		fwd_mp = copymsg(mp);
344 		if (fwd_mp == NULL)
345 			break;
346 		DTRACE_PROBE2(trill__multi__dest__fwd, uint16_t,
347 		    adjnick, uint16_t, ingressnick);
348 		trill_dest_fwd(tip, fwd_mp, adjnick, is_trill_pkt,
349 		    B_TRUE, egressnick);
350 	}
351 	trill_node_unref(tip, dest);
352 
353 	if (nicksaved) {
354 		ASSERT(free_mblk);
355 		DTRACE_PROBE2(trill__multi__dest__fwd, uint16_t,
356 		    adjnicksaved, uint16_t, ingressnick);
357 		trill_dest_fwd(tip, mp, adjnicksaved, is_trill_pkt,
358 		    B_TRUE, egressnick);
359 		return;
360 	}
361 
362 fail_multidest_fwd:
363 	DTRACE_PROBE2(trill__multi__dest__fwd__fail, uint16_t,
364 	    egressnick, uint16_t, ingressnick);
365 	if (free_mblk) {
366 		freemsg(mp);
367 	}
368 }
369 
/*
 * TRILL data receive function. Forwards the received frame if necessary
 * and also determines if the received frame should be consumed locally.
 * Consumes passed mblk.
 *
 * mp starts at the TRILL header; mpsaddr is the outer Ethernet source
 * address of the frame.  Processing order:
 *   1. Pull up and validate the TRILL header (version, nicknames, and
 *      that the ingress nick is not our own).
 *   2. Pull up the TRILL options and the inner Ethernet header, plus the
 *      inner VLAN tag when present, recording the inner VLAN ID.
 *   3. Single-destination frames are decapsulated locally when the egress
 *      nick is ours, forwarded to the egress nick when hops remain, and
 *      dropped otherwise.
 *   4. Multi-destination frames must pass the adjacency check, the reverse
 *      path forwarding check and the hop count check; they are then
 *      forwarded along the distribution tree and also received locally.
 * Any failure frees the message and counts a drop.
 */
static void
trill_recv(trill_sock_t *tsock, mblk_t *mp, const uint8_t *mpsaddr)
{
	trill_header_t *trillhdr;
	trill_node_t *dest = NULL;
	trill_node_t *source = NULL;
	trill_node_t *adj;
	uint16_t ournick, adjnick, treeroot;
	struct ether_header *ethhdr;
	trill_inst_t *tip = tsock->ts_tip;
	uint8_t srcaddr[ETHERADDRL];
	size_t trillhdrlen;
	int inner_vlan = VLAN_ID_NONE;
	int tci;
	int idx;
	size_t min_size;

	/* Copy Ethernet source address before modifying packet */
	(void) memcpy(srcaddr, mpsaddr, ETHERADDRL);

	/* Pull up TRILL header if necessary. */
	min_size = sizeof (trill_header_t);
	if ((MBLKL(mp) < min_size ||
	    !IS_P2ALIGNED(mp->b_rptr, TRILL_HDR_ALIGN)) &&
	    !pullupmsg(mp, min_size))
		goto fail;

	/* LINTED: alignment */
	trillhdr = (trill_header_t *)mp->b_rptr;
	if (trillhdr->th_version != TRILL_PROTOCOL_VERS) {
		DTRACE_PROBE1(trill__recv__wrongversion,
		    trill_header_t *, trillhdr);
		goto fail;
	}

	/* Drop if unknown or invalid nickname */
	if (!VALID_NICK(trillhdr->th_egressnick) ||
	    !VALID_NICK(trillhdr->th_ingressnick)) {
		DTRACE_PROBE1(trill__recv__invalidnick,
		    trill_header_t *, trillhdr);
		goto fail;
	}

	/* Snapshot our nickname and tree root under the instance lock */
	rw_enter(&tip->ti_rwlock, RW_READER);
	ournick = tip->ti_nick;
	treeroot = tip->ti_treeroot;
	rw_exit(&tip->ti_rwlock);
	/* Drop if we received a packet with our nick as ingress */
	if (trillhdr->th_ingressnick == ournick)
		goto fail;

	/* Re-pull any TRILL options and inner Ethernet header */
	min_size += GET_TRILL_OPTS_LEN(trillhdr) * sizeof (uint32_t) +
	    sizeof (struct ether_header);
	if (MBLKL(mp) < min_size) {
		if (!pullupmsg(mp, min_size))
			goto fail;
		/* Header pointer is re-derived from b_rptr after the pullup */
		/* LINTED: alignment */
		trillhdr = (trill_header_t *)mp->b_rptr;
	}
	trillhdrlen = sizeof (trill_header_t) +
	    (GET_TRILL_OPTS_LEN(trillhdr) * sizeof (uint32_t));

	/*
	 * Get the inner Ethernet header, plus the inner VLAN header if there
	 * is one.
	 */
	/* LINTED: alignment */
	ethhdr = (struct ether_header *)(mp->b_rptr + trillhdrlen);
	if (ethhdr->ether_type == htons(ETHERTYPE_VLAN)) {
		min_size += sizeof (struct ether_vlan_extinfo);
		if (MBLKL(mp) < min_size) {
			if (!pullupmsg(mp, min_size))
				goto fail;
			/* Both header pointers are re-derived after pullup */
			/* LINTED: alignment */
			trillhdr = (trill_header_t *)mp->b_rptr;
			/* LINTED: alignment */
			ethhdr = (struct ether_header *)(mp->b_rptr +
			    trillhdrlen);
		}

		tci = ntohs(((struct ether_vlan_header *)ethhdr)->ether_tci);
		inner_vlan = VLAN_ID(tci);
	}

	/* Known/single destination forwarding. */
	if (!trillhdr->th_multidest) {

		/* Inner MacDA must be unicast */
		if (ethhdr->ether_dhost.ether_addr_octet[0] & 1)
			goto fail;

		/* Ingress and Egress nicks must be different */
		if (trillhdr->th_egressnick == trillhdr->th_ingressnick)
			goto fail;

		DTRACE_PROBE1(trill__recv__singledest,
		    trill_header_t *, trillhdr);
		if (trillhdr->th_egressnick == ournick) {
			/*
			 * Strip the TRILL header; trillhdr still points at
			 * the bytes in front of the advanced b_rptr.
			 */
			mp->b_rptr += trillhdrlen;
			trill_recv_local(tsock, mp, trillhdr->th_ingressnick);
		} else if (trillhdr->th_hopcount > 0) {
			trill_dest_fwd(tip, mp, trillhdr->th_egressnick,
			    B_TRUE, B_FALSE, RBRIDGE_NICKNAME_NONE);
		} else {
			/* Not for us and no hops left */
			goto fail;
		}
		return;
	}

	/*
	 * Multi-destination frame: perform checks verifying we have
	 * received a valid multi-destination frame before receiving the
	 * frame locally and forwarding the frame to other RBridges.
	 *
	 * Check if we received this multi-destination frame on a
	 * adjacency in the distribution tree indicated by the frame's
	 * egress nickname.
	 */
	if ((dest = trill_node_lookup(tip, trillhdr->th_egressnick)) == NULL)
		goto fail;
	for (idx = 0; idx < dest->tn_ni->tni_adjcount; idx++) {
		adjnick = TNI_ADJNICK(dest->tn_ni, idx);
		if ((adj = trill_node_lookup(tip, adjnick)) == NULL)
			continue;
		/* Match on the outer source address saved at entry */
		if (memcmp(adj->tn_ni->tni_adjsnpa, srcaddr, ETHERADDRL) == 0) {
			trill_node_unref(tip, adj);
			break;
		}
		trill_node_unref(tip, adj);
	}

	/* Loop ran to completion: the sender is not a tree adjacency */
	if (idx >= dest->tn_ni->tni_adjcount) {
		DTRACE_PROBE2(trill__recv__multidest__adjcheckfail,
		    trill_header_t *, trillhdr, trill_node_t *, dest);
		goto fail;
	}

	/*
	 * Reverse path forwarding check. Check if the ingress RBridge
	 * that has forwarded the frame advertised the use of the
	 * distribution tree specified in the egress nick.
	 */
	if ((source = trill_node_lookup(tip, trillhdr->th_ingressnick)) == NULL)
		goto fail;
	for (idx = 0; idx < source->tn_ni->tni_dtrootcount; idx++) {
		if (TNI_DTROOTNICK(source->tn_ni, idx) ==
		    trillhdr->th_egressnick)
			break;
	}

	if (idx >= source->tn_ni->tni_dtrootcount) {
		/*
		 * Allow receipt of forwarded frame with the highest
		 * tree root RBridge as the egress RBridge when the
		 * ingress RBridge has not advertised the use of any
		 * distribution trees.
		 */
		if (source->tn_ni->tni_dtrootcount != 0 ||
		    trillhdr->th_egressnick != treeroot) {
			DTRACE_PROBE3(
			    trill__recv__multidest__rpfcheckfail,
			    trill_header_t *, trillhdr, trill_node_t *,
			    source, trill_inst_t *, tip);
			goto fail;
		}
	}

	/* Check hop count before doing any forwarding */
	if (trillhdr->th_hopcount == 0)
		goto fail;

	/* Forward frame using the distribution tree specified by egress nick */
	DTRACE_PROBE2(trill__recv__multidest, trill_header_t *,
	    trillhdr, trill_node_t *, source);
	trill_node_unref(tip, source);
	trill_node_unref(tip, dest);

	/* Tell forwarding not to free if we're the link forwarder. */
	trill_multidest_fwd(tip, mp, trillhdr->th_egressnick,
	    trillhdr->th_ingressnick, B_TRUE, srcaddr, inner_vlan,
	    B_FALSE);

	/*
	 * Send de-capsulated frame locally if we are the link forwarder (also
	 * does bridge learning).
	 */
	mp->b_rptr += trillhdrlen;
	trill_recv_local(tsock, mp, trillhdr->th_ingressnick);
	KSPINCR(tks_recv);
	return;

fail:
	/* Common drop path: release any node references still held */
	DTRACE_PROBE2(trill__recv__multidest__fail, mblk_t *, mp,
	    trill_sock_t *, tsock);
	if (dest != NULL)
		trill_node_unref(tip, dest);
	if (source != NULL)
		trill_node_unref(tip, source);
	freemsg(mp);
	KSPINCR(tks_drops);
}
577 
578 static void
579 trill_stop_recv(trill_sock_t *tsock)
580 {
581 	mutex_enter(&tsock->ts_socklock);
582 stop_retry:
583 	if (tsock->ts_state == TS_UNBND || tsock->ts_link == NULL) {
584 		mutex_exit(&tsock->ts_socklock);
585 		return;
586 	}
587 
588 	/*
589 	 * If another thread is closing the socket then wait. Our callers
590 	 * expect us to return only after the socket is closed.
591 	 */
592 	if (tsock->ts_flags & TSF_CLOSEWAIT) {
593 		cv_wait(&tsock->ts_sockclosewait, &tsock->ts_socklock);
594 		goto stop_retry;
595 	}
596 
597 	/*
598 	 * Set state and flags to block new bind or close calls
599 	 * while we close the socket.
600 	 */
601 	tsock->ts_flags |= TSF_CLOSEWAIT;
602 
603 	/* Wait until all AF_TRILL socket transmit operations are done */
604 	while (tsock->ts_sockthreadcount > 0)
605 		cv_wait(&tsock->ts_sockthreadwait, &tsock->ts_socklock);
606 
607 	/*
608 	 * We are guaranteed to be the only thread closing on the
609 	 * socket while the TSF_CLOSEWAIT flag is set, all others cv_wait
610 	 * for us to finish.
611 	 */
612 	ASSERT(tsock->ts_link != NULL);
613 	if (tsock->ts_ksp != NULL)
614 		kstat_delete(tsock->ts_ksp);
615 
616 	/*
617 	 * Release lock before bridge_trill_lnunref to prevent deadlock
618 	 * between trill_ctrl_input thread waiting to acquire ts_socklock
619 	 * and bridge_trill_lnunref waiting for the trill thread to finish.
620 	 */
621 	mutex_exit(&tsock->ts_socklock);
622 
623 	/*
624 	 * Release TRILL link reference from Bridging. On return from
625 	 * bridge_trill_lnunref we can be sure there are no active TRILL data
626 	 * threads for this link.
627 	 */
628 	bridge_trill_lnunref(tsock->ts_link);
629 
630 	/* Set socket as unbound & wakeup threads waiting for socket to close */
631 	mutex_enter(&tsock->ts_socklock);
632 	ASSERT(tsock->ts_link != NULL);
633 	tsock->ts_link = NULL;
634 	tsock->ts_state = TS_UNBND;
635 	tsock->ts_flags &= ~TSF_CLOSEWAIT;
636 	cv_broadcast(&tsock->ts_sockclosewait);
637 	mutex_exit(&tsock->ts_socklock);
638 }
639 
640 static int
641 trill_start_recv(trill_sock_t *tsock, const struct sockaddr *sa, socklen_t len)
642 {
643 	struct sockaddr_dl *lladdr = (struct sockaddr_dl *)sa;
644 	datalink_id_t linkid;
645 	int err = 0;
646 
647 	if (len != sizeof (*lladdr))
648 		return (EINVAL);
649 
650 	mutex_enter(&tsock->ts_socklock);
651 	if (tsock->ts_tip == NULL || tsock->ts_state != TS_UNBND) {
652 		err = EINVAL;
653 		goto bind_error;
654 	}
655 
656 	if (tsock->ts_flags & TSF_CLOSEWAIT || tsock->ts_link != NULL) {
657 		err = EBUSY;
658 		goto bind_error;
659 	}
660 
661 	(void) memcpy(&(tsock->ts_lladdr), lladdr,
662 	    sizeof (struct sockaddr_dl));
663 	(void) memcpy(&linkid, tsock->ts_lladdr.sdl_data,
664 	    sizeof (datalink_id_t));
665 
666 	tsock->ts_link = bridge_trill_lnref(tsock->ts_tip->ti_binst,
667 	    linkid, tsock);
668 	if (tsock->ts_link == NULL) {
669 		err = EINVAL;
670 		goto bind_error;
671 	}
672 
673 	trill_kstats_init(tsock, tsock->ts_tip->ti_bridgename);
674 	tsock->ts_state = TS_IDLE;
675 
676 bind_error:
677 	mutex_exit(&tsock->ts_socklock);
678 	return (err);
679 }
680 
681 static int
682 trill_do_unbind(trill_sock_t *tsock)
683 {
684 	/* If a bind has not been done, we can't unbind. */
685 	if (tsock->ts_state != TS_IDLE)
686 		return (EINVAL);
687 
688 	trill_stop_recv(tsock);
689 	return (0);
690 }
691 
692 static void
693 trill_instance_unref(trill_inst_t *tip)
694 {
695 	rw_enter(&trill_inst_rwlock, RW_WRITER);
696 	rw_enter(&tip->ti_rwlock, RW_WRITER);
697 	if (atomic_dec_uint_nv(&tip->ti_refs) == 0) {
698 		list_remove(&trill_inst_list, tip);
699 		rw_exit(&tip->ti_rwlock);
700 		rw_exit(&trill_inst_rwlock);
701 		if (tip->ti_binst != NULL)
702 			bridge_trill_brunref(tip->ti_binst);
703 		list_destroy(&tip->ti_socklist);
704 		rw_destroy(&tip->ti_rwlock);
705 		kmem_free(tip, sizeof (*tip));
706 	} else {
707 		rw_exit(&tip->ti_rwlock);
708 		rw_exit(&trill_inst_rwlock);
709 	}
710 }
711 
712 /*
713  * This is called when the bridge module receives a TRILL-encapsulated packet
714  * on a given link or a packet identified as "TRILL control."  We must verify
715  * that it's for us (it almost certainly will be), and then either decapsulate
716  * (if it's to our nickname), forward (if it's to someone else), or send up one
717  * of the sockets (if it's control traffic).
718  *
719  * Sadly, on Ethernet, the control traffic is identified by Outer.MacDA, and
720  * not by TRILL header information.
721  */
722 static void
723 trill_recv_pkt_cb(void *lptr, bridge_link_t *blp, mac_resource_handle_t rsrc,
724     mblk_t *mp, mac_header_info_t *hdr_info)
725 {
726 	trill_sock_t *tsock = lptr;
727 
728 	_NOTE(ARGUNUSED(rsrc));
729 
730 	ASSERT(tsock->ts_tip != NULL);
731 	ASSERT(tsock->ts_link != NULL);
732 	ASSERT(!(tsock->ts_flags & TSF_SHUTDOWN));
733 
734 	/*
735 	 * Only receive packet if the source address is not multicast (which is
736 	 * bogus).
737 	 */
738 	if (hdr_info->mhi_saddr[0] & 1)
739 		goto discard;
740 
741 	/*
742 	 * Check if this is our own packet reflected back.  It should not be.
743 	 */
744 	if (bcmp(hdr_info->mhi_saddr, blp->bl_local_mac, ETHERADDRL) == 0)
745 		goto discard;
746 
747 	/* Only receive unicast packet if addressed to us */
748 	if (hdr_info->mhi_dsttype == MAC_ADDRTYPE_UNICAST &&
749 	    bcmp(hdr_info->mhi_daddr, blp->bl_local_mac, ETHERADDRL) != 0)
750 		goto discard;
751 
752 	if (hdr_info->mhi_bindsap == ETHERTYPE_TRILL) {
753 		/* TRILL data packets */
754 		trill_recv(tsock, mp, hdr_info->mhi_saddr);
755 	} else {
756 		/* Design constraint for cheap IS-IS/BPDU comparison */
757 		ASSERT(all_isis_rbridges[4] != bridge_group_address[4]);
758 		/* Send received control packet upstream */
759 		trill_ctrl_input(tsock, mp, hdr_info->mhi_saddr,
760 		    hdr_info->mhi_daddr[4] == all_isis_rbridges[4] ?
761 		    hdr_info->mhi_tci : TRILL_TCI_BPDU);
762 	}
763 
764 	return;
765 
766 discard:
767 	freemsg(mp);
768 	KSPINCR(tks_drops);
769 }
770 
771 /*
772  * This is called when the bridge module discovers that the destination address
773  * for a packet is not local -- it's through some remote node.  We must verify
774  * that the remote node isn't our nickname (it shouldn't be), add a TRILL
775  * header, and then use the IS-IS data to determine which link and which
776  * next-hop RBridge should be used for output.  We then transmit on that link.
777  *
778  * The egress_nick is RBRIDGE_NICKNAME_NONE for the "unknown destination" case.
779  */
780 static void
781 trill_encap_pkt_cb(void *lptr, bridge_link_t *blp, mac_header_info_t *hdr_info,
782     mblk_t *mp, uint16_t egress_nick)
783 {
784 	uint16_t ournick;
785 	uint16_t dtnick;
786 	trill_node_t *self = NULL;
787 	trill_sock_t *tsock = lptr;
788 	trill_inst_t *tip = tsock->ts_tip;
789 	int vlan = VLAN_ID_NONE;
790 
791 	_NOTE(ARGUNUSED(blp));
792 	ASSERT(hdr_info->mhi_bindsap != ETHERTYPE_TRILL);
793 
794 	/* egress_nick = RBRIDGE_NICKNAME_NONE is valid */
795 	if (egress_nick != RBRIDGE_NICKNAME_NONE && !VALID_NICK(egress_nick))
796 		goto discard;
797 
798 	/* Check if our own nick is valid before we do any forwarding */
799 	rw_enter(&tip->ti_rwlock, RW_READER);
800 	ournick = tip->ti_nick;
801 	dtnick = tip->ti_treeroot;
802 	rw_exit(&tip->ti_rwlock);
803 	if (!VALID_NICK(ournick))
804 		goto discard;
805 
806 	/*
807 	 * For Multi-Destination forwarding determine our choice of
808 	 * root distribution tree. If we didn't choose a distribution
809 	 * tree (dtroots_count=0) then we use the highest priority tree
810 	 * root (t_treeroot) else we drop the packet without forwarding.
811 	 */
812 	if (egress_nick == RBRIDGE_NICKNAME_NONE) {
813 		if ((self = trill_node_lookup(tip, ournick)) == NULL)
814 			goto discard;
815 
816 		/*
817 		 * Use the first DT configured for now. In future we
818 		 * should have DT selection code here.
819 		 */
820 		if (self->tn_ni->tni_dtrootcount > 0) {
821 			dtnick = TNI_DTROOTNICK(self->tn_ni, 0);
822 		}
823 
824 		trill_node_unref(tip, self);
825 		if (!VALID_NICK(dtnick)) {
826 			DTRACE_PROBE(trill__fwd__packet__nodtroot);
827 			goto discard;
828 		}
829 	}
830 
831 	/*
832 	 * Retrieve VLAN ID of the native frame used for VLAN
833 	 * pruning of multi-destination frames.
834 	 */
835 	if (hdr_info->mhi_istagged) {
836 		vlan = VLAN_ID(hdr_info->mhi_tci);
837 	}
838 
839 	DTRACE_PROBE2(trill__fwd__packet, mac_header_info_t *, hdr_info,
840 	    uint16_t, egress_nick);
841 	if (egress_nick == RBRIDGE_NICKNAME_NONE) {
842 		trill_multidest_fwd(tip, mp, dtnick,
843 		    ournick, B_FALSE, NULL, vlan, B_TRUE);
844 	} else {
845 		trill_dest_fwd(tip, mp, egress_nick, B_FALSE, B_FALSE,
846 		    RBRIDGE_NICKNAME_NONE);
847 	}
848 	KSPINCR(tks_encap);
849 	return;
850 
851 discard:
852 	freemsg(mp);
853 }
854 
855 /*
856  * This is called when the bridge module has completely torn down a bridge
857  * instance and all of the attached links.  We need to make the TRILL instance
858  * go away at this point.
859  */
860 static void
861 trill_br_dstr_cb(void *bptr, bridge_inst_t *bip)
862 {
863 	trill_inst_t *tip = bptr;
864 
865 	_NOTE(ARGUNUSED(bip));
866 	rw_enter(&tip->ti_rwlock, RW_WRITER);
867 	if (tip->ti_binst != NULL)
868 		bridge_trill_brunref(tip->ti_binst);
869 	tip->ti_binst = NULL;
870 	rw_exit(&tip->ti_rwlock);
871 }
872 
873 /*
874  * This is called when the bridge module is tearing down a link, but before the
875  * actual tear-down starts.  When this function returns, we must make sure that
876  * we will not initiate any new transmits on this link.
877  */
878 static void
879 trill_ln_dstr_cb(void *lptr, bridge_link_t *blp)
880 {
881 	trill_sock_t *tsock = lptr;
882 
883 	_NOTE(ARGUNUSED(blp));
884 	trill_stop_recv(tsock);
885 }
886 
887 static void
888 trill_init(void)
889 {
890 	list_create(&trill_inst_list, sizeof (trill_inst_t),
891 	    offsetof(trill_inst_t, ti_instnode));
892 	rw_init(&trill_inst_rwlock, NULL, RW_DRIVER, NULL);
893 	bridge_trill_register_cb(trill_recv_pkt_cb, trill_encap_pkt_cb,
894 	    trill_br_dstr_cb, trill_ln_dstr_cb);
895 }
896 
897 static void
898 trill_fini(void)
899 {
900 	bridge_trill_register_cb(NULL, NULL, NULL, NULL);
901 	rw_destroy(&trill_inst_rwlock);
902 	list_destroy(&trill_inst_list);
903 }
904 
905 /* Loadable module configuration entry points */
906 int
907 _init(void)
908 {
909 	int rc;
910 
911 	trill_init();
912 	if ((rc = mod_install(&ml)) != 0)
913 		trill_fini();
914 	return (rc);
915 }
916 
917 int
918 _info(struct modinfo *modinfop)
919 {
920 	return (mod_info(&ml, modinfop));
921 }
922 
923 int
924 _fini(void)
925 {
926 	int rc;
927 
928 	rw_enter(&trill_inst_rwlock, RW_READER);
929 	rc = list_is_empty(&trill_inst_list) ? 0 : EBUSY;
930 	rw_exit(&trill_inst_rwlock);
931 	if (rc == 0 && ((rc = mod_remove(&ml)) == 0))
932 		trill_fini();
933 	return (rc);
934 }
935 
936 static void
937 trill_kstats_init(trill_sock_t *tsock, const char *bname)
938 {
939 	int i;
940 	char kstatname[KSTAT_STRLEN];
941 	kstat_named_t  *knt;
942 	static const char *sock_kstats_list[] = { TRILL_KSSOCK_NAMES };
943 	char link_name[MAXNAMELEN];
944 	int num;
945 	int err;
946 
947 	bzero(link_name, sizeof (link_name));
948 	if ((err = dls_mgmt_get_linkinfo(tsock->ts_link->bl_linkid, link_name,
949 	    NULL, NULL, NULL)) != 0) {
950 		cmn_err(CE_WARN, "%s: trill_kstats_init: error %d retrieving"
951 		    " linkinfo for linkid:%d", "trill", err,
952 		    tsock->ts_link->bl_linkid);
953 		return;
954 	}
955 
956 	bzero(kstatname, sizeof (kstatname));
957 	(void) snprintf(kstatname, sizeof (kstatname), "%s-%s",
958 	    bname, link_name);
959 
960 	num = sizeof (sock_kstats_list) / sizeof (*sock_kstats_list);
961 	for (i = 0; i < num; i++) {
962 		knt = (kstat_named_t *)&(tsock->ts_kstats);
963 		kstat_named_init(&knt[i], sock_kstats_list[i],
964 		    KSTAT_DATA_UINT64);
965 	}
966 
967 	tsock->ts_ksp = kstat_create_zone("trill", 0, kstatname, "sock",
968 	    KSTAT_TYPE_NAMED, num, KSTAT_FLAG_VIRTUAL, GLOBAL_ZONEID);
969 	if (tsock->ts_ksp != NULL) {
970 		tsock->ts_ksp->ks_data = &tsock->ts_kstats;
971 		kstat_install(tsock->ts_ksp);
972 	}
973 }
974 
975 static trill_sock_t *
976 trill_do_open(int flags)
977 {
978 	trill_sock_t *tsock;
979 	int kmflag = ((flags & SOCKET_NOSLEEP)) ? KM_NOSLEEP:KM_SLEEP;
980 
981 	tsock = kmem_zalloc(sizeof (trill_sock_t), kmflag);
982 	if (tsock != NULL) {
983 		tsock->ts_state = TS_UNBND;
984 		tsock->ts_refs++;
985 		mutex_init(&tsock->ts_socklock, NULL, MUTEX_DRIVER, NULL);
986 		cv_init(&tsock->ts_sockthreadwait, NULL, CV_DRIVER, NULL);
987 		cv_init(&tsock->ts_sockclosewait, NULL, CV_DRIVER, NULL);
988 	}
989 	return (tsock);
990 }
991 
992 static int
993 trill_find_bridge(trill_sock_t *tsock, const char *bname, boolean_t can_create)
994 {
995 	trill_inst_t *tip, *newtip = NULL;
996 
997 	/* Allocate some memory (speculatively) before taking locks */
998 	if (can_create)
999 		newtip = kmem_zalloc(sizeof (*tip), KM_NOSLEEP);
1000 
1001 	rw_enter(&trill_inst_rwlock, RW_WRITER);
1002 	for (tip = list_head(&trill_inst_list); tip != NULL;
1003 	    tip = list_next(&trill_inst_list, tip)) {
1004 		if (strcmp(tip->ti_bridgename, bname) == 0)
1005 			break;
1006 	}
1007 	if (tip == NULL) {
1008 		if (!can_create || newtip == NULL) {
1009 			rw_exit(&trill_inst_rwlock);
1010 			return (can_create ? ENOMEM : ENOENT);
1011 		}
1012 
1013 		tip = newtip;
1014 		newtip = NULL;
1015 		(void) strcpy(tip->ti_bridgename, bname);
1016 
1017 		/* Register TRILL instance with bridging */
1018 		tip->ti_binst = bridge_trill_brref(bname, tip);
1019 		if (tip->ti_binst == NULL) {
1020 			rw_exit(&trill_inst_rwlock);
1021 			kmem_free(tip, sizeof (*tip));
1022 			return (ENOENT);
1023 		}
1024 
1025 		rw_init(&tip->ti_rwlock, NULL, RW_DRIVER, NULL);
1026 		list_create(&tip->ti_socklist, sizeof (trill_sock_t),
1027 		    offsetof(trill_sock_t, ts_socklistnode));
1028 		list_insert_tail(&trill_inst_list, tip);
1029 	}
1030 	atomic_inc_uint(&tip->ti_refs);
1031 	rw_exit(&trill_inst_rwlock);
1032 
1033 	/* If we didn't need the preallocated memory, then discard now. */
1034 	if (newtip != NULL)
1035 		kmem_free(newtip, sizeof (*newtip));
1036 
1037 	rw_enter(&tip->ti_rwlock, RW_WRITER);
1038 	list_insert_tail(&(tip->ti_socklist), tsock);
1039 	tsock->ts_tip = tip;
1040 	rw_exit(&tip->ti_rwlock);
1041 	return (0);
1042 }
1043 
1044 static void
1045 trill_clear_bridge(trill_sock_t *tsock)
1046 {
1047 	trill_inst_t *tip;
1048 
1049 	if ((tip = tsock->ts_tip) == NULL)
1050 		return;
1051 	rw_enter(&tip->ti_rwlock, RW_WRITER);
1052 	list_remove(&tip->ti_socklist, tsock);
1053 	if (list_is_empty(&tip->ti_socklist))
1054 		trill_del_all(tip, B_TRUE);
1055 	rw_exit(&tip->ti_rwlock);
1056 }
1057 
1058 static void
1059 trill_sock_unref(trill_sock_t *tsock)
1060 {
1061 	if (atomic_dec_uint_nv(&tsock->ts_refs) == 0) {
1062 		mutex_destroy(&tsock->ts_socklock);
1063 		cv_destroy(&tsock->ts_sockthreadwait);
1064 		cv_destroy(&tsock->ts_sockclosewait);
1065 		kmem_free(tsock, sizeof (trill_sock_t));
1066 	}
1067 }
1068 
1069 static void
1070 trill_do_close(trill_sock_t *tsock)
1071 {
1072 	trill_inst_t *tip;
1073 
1074 	tip = tsock->ts_tip;
1075 	trill_stop_recv(tsock);
1076 	/* Remove socket from TRILL instance socket list */
1077 	trill_clear_bridge(tsock);
1078 	tsock->ts_flags |= TSF_SHUTDOWN;
1079 	trill_sock_unref(tsock);
1080 	if (tip != NULL)
1081 		trill_instance_unref(tip);
1082 }
1083 
1084 static void
1085 trill_del_all(trill_inst_t *tip, boolean_t lockheld)
1086 {
1087 	int i;
1088 
1089 	if (!lockheld)
1090 		rw_enter(&tip->ti_rwlock, RW_WRITER);
1091 	for (i = RBRIDGE_NICKNAME_MIN; i < RBRIDGE_NICKNAME_MAX; i++) {
1092 		if (tip->ti_nodes[i] != NULL)
1093 			(void) trill_del_nick(tip, i, B_TRUE);
1094 	}
1095 	if (!lockheld)
1096 		rw_exit(&tip->ti_rwlock);
1097 }
1098 
1099 static void
1100 trill_node_free(trill_node_t *nick_entry)
1101 {
1102 	trill_nickinfo_t *tni;
1103 
1104 	tni = nick_entry->tn_ni;
1105 	kmem_free(tni, TNI_TOTALSIZE(tni));
1106 	kmem_free(nick_entry, sizeof (trill_node_t));
1107 }
1108 
1109 static void
1110 trill_node_unref(trill_inst_t *tip, trill_node_t *tnp)
1111 {
1112 	if (atomic_dec_uint_nv(&tnp->tn_refs) == 0) {
1113 		if (tnp->tn_tsp != NULL)
1114 			trill_sock_unref(tnp->tn_tsp);
1115 		trill_node_free(tnp);
1116 		(void) atomic_dec_uint_nv(&tip->ti_nodecount);
1117 	}
1118 }
1119 
1120 static trill_node_t *
1121 trill_node_lookup(trill_inst_t *tip, uint16_t nick)
1122 {
1123 	trill_node_t *nick_entry;
1124 
1125 	if (!VALID_NICK(nick))
1126 		return (NULL);
1127 	rw_enter(&tip->ti_rwlock, RW_READER);
1128 	nick_entry = tip->ti_nodes[nick];
1129 	if (nick_entry != NULL) {
1130 		atomic_inc_uint(&nick_entry->tn_refs);
1131 	}
1132 	rw_exit(&tip->ti_rwlock);
1133 	return (nick_entry);
1134 }
1135 
1136 static int
1137 trill_del_nick(trill_inst_t *tip, uint16_t nick, boolean_t lockheld)
1138 {
1139 	trill_node_t *nick_entry;
1140 	int rc = ENOENT;
1141 
1142 	if (!lockheld)
1143 		rw_enter(&tip->ti_rwlock, RW_WRITER);
1144 	if (VALID_NICK(nick)) {
1145 		nick_entry = tip->ti_nodes[nick];
1146 		if (nick_entry != NULL) {
1147 			trill_node_unref(tip, nick_entry);
1148 			tip->ti_nodes[nick] = NULL;
1149 			rc = 0;
1150 		}
1151 	}
1152 	if (!lockheld)
1153 		rw_exit(&tip->ti_rwlock);
1154 	return (rc);
1155 }
1156 
1157 static int
1158 trill_add_nick(trill_inst_t *tip, void *arg, boolean_t self, int mode)
1159 {
1160 	uint16_t nick;
1161 	int size;
1162 	trill_node_t *tnode;
1163 	trill_nickinfo_t tnihdr;
1164 
1165 	/* First make sure we have at least the header available */
1166 	if (ddi_copyin(arg, &tnihdr, sizeof (trill_nickinfo_t), mode) != 0)
1167 		return (EFAULT);
1168 
1169 	nick = tnihdr.tni_nick;
1170 	if (!VALID_NICK(nick)) {
1171 		DTRACE_PROBE1(trill__add__nick__bad, trill_nickinfo_t *,
1172 		    &tnihdr);
1173 		return (EINVAL);
1174 	}
1175 
1176 	size = TNI_TOTALSIZE(&tnihdr);
1177 	if (size > TNI_MAXSIZE)
1178 		return (EINVAL);
1179 	tnode = kmem_zalloc(sizeof (trill_node_t), KM_SLEEP);
1180 	tnode->tn_ni = kmem_zalloc(size, KM_SLEEP);
1181 	if (ddi_copyin(arg, tnode->tn_ni, size, mode) != 0) {
1182 		kmem_free(tnode->tn_ni, size);
1183 		kmem_free(tnode, sizeof (trill_node_t));
1184 		return (EFAULT);
1185 	}
1186 
1187 	tnode->tn_refs++;
1188 	rw_enter(&tip->ti_rwlock, RW_WRITER);
1189 	if (tip->ti_nodes[nick] != NULL)
1190 		(void) trill_del_nick(tip, nick, B_TRUE);
1191 
1192 	if (self) {
1193 		tip->ti_nick = nick;
1194 	} else {
1195 		tnode->tn_tsp = find_trill_link(tip,
1196 		    tnode->tn_ni->tni_linkid);
1197 	}
1198 	DTRACE_PROBE2(trill__add__nick, trill_node_t *, tnode,
1199 	    uint16_t, nick);
1200 	tip->ti_nodes[nick] = tnode;
1201 	tip->ti_nodecount++;
1202 	rw_exit(&tip->ti_rwlock);
1203 	return (0);
1204 }
1205 
1206 static int
1207 trill_do_ioctl(trill_sock_t *tsock, int cmd, void *arg, int mode)
1208 {
1209 	int error = 0;
1210 	trill_inst_t *tip = tsock->ts_tip;
1211 
1212 	switch (cmd) {
1213 	case TRILL_DESIGVLAN: {
1214 		uint16_t desigvlan;
1215 
1216 		if (ddi_copyin(arg, &desigvlan, sizeof (desigvlan), mode) != 0)
1217 			return (EFAULT);
1218 		tsock->ts_desigvlan = desigvlan;
1219 		break;
1220 	}
1221 	case TRILL_VLANFWDER: {
1222 		uint8_t vlans[TRILL_VLANS_ARRSIZE];
1223 
1224 		if (tsock->ts_link == NULL)
1225 			return (EINVAL);
1226 		if ((ddi_copyin(arg, vlans, sizeof (vlans), mode)) != 0)
1227 			return (EFAULT);
1228 		bridge_trill_setvlans(tsock->ts_link, vlans);
1229 		break;
1230 	}
1231 	case TRILL_SETNICK:
1232 		if (tip == NULL)
1233 			return (EINVAL);
1234 		error = trill_add_nick(tip, arg, B_TRUE, mode);
1235 		break;
1236 
1237 	case TRILL_GETNICK:
1238 		if (tip == NULL)
1239 			return (EINVAL);
1240 		rw_enter(&tip->ti_rwlock, RW_READER);
1241 		if (ddi_copyout(&tip->ti_nick, arg, sizeof (tip->ti_nick),
1242 		    mode) != 0)
1243 			error = EFAULT;
1244 		rw_exit(&tip->ti_rwlock);
1245 		break;
1246 
1247 	case TRILL_ADDNICK:
1248 		if (tip == NULL)
1249 			break;
1250 		error = trill_add_nick(tip, arg, B_FALSE, mode);
1251 		break;
1252 
1253 	case TRILL_DELNICK: {
1254 		uint16_t delnick;
1255 
1256 		if (tip == NULL)
1257 			break;
1258 		if (ddi_copyin(arg, &delnick, sizeof (delnick), mode) != 0)
1259 			return (EFAULT);
1260 		error = trill_del_nick(tip, delnick, B_FALSE);
1261 		break;
1262 	}
1263 	case TRILL_DELALL:
1264 		if (tip == NULL)
1265 			break;
1266 		trill_del_all(tip, B_FALSE);
1267 		break;
1268 
1269 	case TRILL_TREEROOT: {
1270 		uint16_t treeroot;
1271 
1272 		if (tip == NULL)
1273 			break;
1274 		if (ddi_copyin(arg, &treeroot, sizeof (treeroot), mode) != 0)
1275 			return (EFAULT);
1276 		if (!VALID_NICK(treeroot))
1277 			return (EINVAL);
1278 		rw_enter(&tip->ti_rwlock, RW_WRITER);
1279 		tip->ti_treeroot = treeroot;
1280 		rw_exit(&tip->ti_rwlock);
1281 		break;
1282 	}
1283 	case TRILL_HWADDR:
1284 		if (tsock->ts_link == NULL)
1285 			break;
1286 		if (ddi_copyout(tsock->ts_link->bl_local_mac, arg, ETHERADDRL,
1287 		    mode) != 0)
1288 			return (EFAULT);
1289 		break;
1290 
1291 	case TRILL_NEWBRIDGE: {
1292 		char bname[MAXLINKNAMELEN];
1293 
1294 		if (tsock->ts_state != TS_UNBND)
1295 			return (ENOTSUP);
1296 		/* ts_tip can only be set once */
1297 		if (tip != NULL)
1298 			return (EEXIST);
1299 		if (ddi_copyin(arg, bname, sizeof (bname), mode) != 0)
1300 			return (EFAULT);
1301 		bname[MAXLINKNAMELEN-1] = '\0';
1302 		error = trill_find_bridge(tsock, bname, B_TRUE);
1303 		break;
1304 	}
1305 
1306 	case TRILL_GETBRIDGE: {
1307 		char bname[MAXLINKNAMELEN];
1308 
1309 		/* ts_tip can only be set once */
1310 		if (tip != NULL)
1311 			return (EEXIST);
1312 		if (ddi_copyin(arg, bname, sizeof (bname), mode) != 0)
1313 			return (EFAULT);
1314 		bname[MAXLINKNAMELEN - 1] = '\0';
1315 		error = trill_find_bridge(tsock, bname, B_FALSE);
1316 		break;
1317 	}
1318 
1319 	case TRILL_LISTNICK: {
1320 		trill_listnick_t tln;
1321 		trill_node_t *tnp;
1322 		trill_nickinfo_t *tnip;
1323 		uint16_t nick;
1324 
1325 		if (tip == NULL)
1326 			return (EINVAL);
1327 		if (ddi_copyin(arg, &tln, sizeof (tln), mode) != 0)
1328 			return (EFAULT);
1329 		nick = tln.tln_nick;
1330 		if (nick >= RBRIDGE_NICKNAME_MAX) {
1331 			error = EINVAL;
1332 			break;
1333 		}
1334 		rw_enter(&tip->ti_rwlock, RW_READER);
1335 		while (++nick < RBRIDGE_NICKNAME_MAX) {
1336 			if ((tnp = tip->ti_nodes[nick]) != NULL) {
1337 				tnip = tnp->tn_ni;
1338 				ASSERT(nick == tnip->tni_nick);
1339 				tln.tln_nick = nick;
1340 				bcopy(tnip->tni_adjsnpa, tln.tln_nexthop,
1341 				    ETHERADDRL);
1342 				tln.tln_ours = nick == tip->ti_nick;
1343 				if (tln.tln_ours || tnp->tn_tsp == NULL) {
1344 					tln.tln_linkid =
1345 					    DATALINK_INVALID_LINKID;
1346 				} else {
1347 					tln.tln_linkid =
1348 					    tnp->tn_tsp->ts_link->bl_linkid;
1349 				}
1350 				break;
1351 			}
1352 		}
1353 		rw_exit(&tip->ti_rwlock);
1354 		if (nick >= RBRIDGE_NICKNAME_MAX)
1355 			bzero(&tln, sizeof (tln));
1356 		if (ddi_copyout(&tln, arg, sizeof (tln), mode) != 0)
1357 			return (EFAULT);
1358 		break;
1359 	}
1360 
1361 	/*
1362 	 * Port flush: this is used when we lose AF on a port.  We must discard
1363 	 * all regular bridge forwarding entries on this port with the
1364 	 * indicated VLAN.
1365 	 */
1366 	case TRILL_PORTFLUSH: {
1367 		uint16_t vlan = (uint16_t)(uintptr_t)arg;
1368 
1369 		if (tsock->ts_link == NULL)
1370 			return (EINVAL);
1371 		bridge_trill_flush(tsock->ts_link, vlan, B_FALSE);
1372 		break;
1373 	}
1374 
1375 	/*
1376 	 * Nick flush: this is used when we lose AF on a port.  We must discard
1377 	 * all bridge TRILL forwarding entries on this port with the indicated
1378 	 * VLAN.
1379 	 */
1380 	case TRILL_NICKFLUSH: {
1381 		uint16_t vlan = (uint16_t)(uintptr_t)arg;
1382 
1383 		if (tsock->ts_link == NULL)
1384 			return (EINVAL);
1385 		bridge_trill_flush(tsock->ts_link, vlan, B_TRUE);
1386 		break;
1387 	}
1388 
1389 	case TRILL_GETMTU:
1390 		if (tsock->ts_link == NULL)
1391 			break;
1392 		if (ddi_copyout(&tsock->ts_link->bl_maxsdu, arg,
1393 		    sizeof (uint_t), mode) != 0)
1394 			return (EFAULT);
1395 		break;
1396 
1397 	default:
1398 		error = ENOTSUP;
1399 		break;
1400 	}
1401 
1402 	return (error);
1403 }
1404 
1405 /*
1406  * Sends received packet back upstream on the TRILL socket.
1407  * Consumes passed mblk_t.
1408  */
1409 static void
1410 trill_ctrl_input(trill_sock_t *tsock, mblk_t *mp, const uint8_t *saddr,
1411     uint16_t tci)
1412 {
1413 	int udi_size;
1414 	mblk_t *mp1;
1415 	struct T_unitdata_ind *tudi;
1416 	struct sockaddr_dl *sdl;
1417 	char *lladdr;
1418 	int error;
1419 
1420 	ASSERT(!(tsock->ts_flags & TSF_SHUTDOWN));
1421 	if (tsock->ts_flow_ctrld) {
1422 		freemsg(mp);
1423 		KSPINCR(tks_drops);
1424 		return;
1425 	}
1426 
1427 	udi_size =  sizeof (struct T_unitdata_ind) +
1428 	    sizeof (struct sockaddr_dl);
1429 	mp1 = allocb(udi_size, BPRI_MED);
1430 	if (mp1 == NULL) {
1431 		freemsg(mp);
1432 		KSPINCR(tks_drops);
1433 		return;
1434 	}
1435 
1436 	mp1->b_cont = mp;
1437 	mp = mp1;
1438 	mp->b_datap->db_type = M_PROTO;
1439 	/* LINTED: alignment */
1440 	tudi = (struct T_unitdata_ind *)mp->b_rptr;
1441 	mp->b_wptr = (uchar_t *)tudi + udi_size;
1442 
1443 	tudi->PRIM_type = T_UNITDATA_IND;
1444 	tudi->SRC_length = sizeof (struct sockaddr_dl);
1445 	tudi->SRC_offset = sizeof (struct T_unitdata_ind);
1446 	tudi->OPT_length = 0;
1447 	tudi->OPT_offset = sizeof (struct T_unitdata_ind) +
1448 	    sizeof (struct sockaddr_dl);
1449 
1450 	/* Information of the link on which packet was received. */
1451 	sdl = (struct sockaddr_dl *)&tudi[1];
1452 	(void) memset(sdl, 0, sizeof (struct sockaddr_dl));
1453 	sdl->sdl_family = AF_TRILL;
1454 
1455 	/* LINTED: alignment */
1456 	*(datalink_id_t *)sdl->sdl_data = tsock->ts_link->bl_linkid;
1457 	sdl->sdl_nlen = sizeof (tsock->ts_link->bl_linkid);
1458 
1459 	lladdr = LLADDR(sdl);
1460 	(void) memcpy(lladdr, saddr, ETHERADDRL);
1461 	lladdr += ETHERADDRL;
1462 	sdl->sdl_alen = ETHERADDRL;
1463 
1464 	/* LINTED: alignment */
1465 	*(uint16_t *)lladdr = tci;
1466 	sdl->sdl_slen = sizeof (uint16_t);
1467 
1468 	DTRACE_PROBE2(trill__ctrl__input, trill_sock_t *, tsock, mblk_t *, mp);
1469 	(*tsock->ts_conn_upcalls->su_recv)(tsock->ts_conn_upper_handle,
1470 	    mp, msgdsize(mp), 0, &error, NULL);
1471 
1472 	if (error == ENOSPC) {
1473 		mutex_enter(&tsock->ts_socklock);
1474 		(*tsock->ts_conn_upcalls->su_recv)(tsock->ts_conn_upper_handle,
1475 		    NULL, 0, 0, &error, NULL);
1476 		if (error == ENOSPC)
1477 			tsock->ts_flow_ctrld = B_TRUE;
1478 		mutex_exit(&tsock->ts_socklock);
1479 		KSPINCR(tks_drops);
1480 	} else if (error != 0) {
1481 		KSPINCR(tks_drops);
1482 	} else {
1483 		KSPINCR(tks_recv);
1484 	}
1485 
1486 	DTRACE_PROBE2(trill__ctrl__input__done, trill_sock_t *,
1487 	    tsock, int, error);
1488 }
1489 
1490 /* ARGSUSED */
1491 static void
1492 trill_activate(sock_lower_handle_t proto_handle,
1493     sock_upper_handle_t sock_handle, sock_upcalls_t *sock_upcalls,
1494     int flags, cred_t *cr)
1495 {
1496 	trill_sock_t *tsock = (trill_sock_t *)proto_handle;
1497 	struct sock_proto_props sopp;
1498 
1499 	tsock->ts_conn_upcalls = sock_upcalls;
1500 	tsock->ts_conn_upper_handle = sock_handle;
1501 
1502 	sopp.sopp_flags = SOCKOPT_WROFF | SOCKOPT_RCVHIWAT |
1503 	    SOCKOPT_RCVLOWAT | SOCKOPT_MAXADDRLEN | SOCKOPT_MAXPSZ |
1504 	    SOCKOPT_MAXBLK | SOCKOPT_MINPSZ;
1505 	sopp.sopp_wroff = 0;
1506 	sopp.sopp_rxhiwat = SOCKET_RECVHIWATER;
1507 	sopp.sopp_rxlowat = SOCKET_RECVLOWATER;
1508 	sopp.sopp_maxaddrlen = sizeof (struct sockaddr_dl);
1509 	sopp.sopp_maxpsz = INFPSZ;
1510 	sopp.sopp_maxblk = INFPSZ;
1511 	sopp.sopp_minpsz = 0;
1512 	(*tsock->ts_conn_upcalls->su_set_proto_props)(
1513 	    tsock->ts_conn_upper_handle, &sopp);
1514 }
1515 
1516 /* ARGSUSED */
1517 static int
1518 trill_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr)
1519 {
1520 	trill_sock_t *tsock = (trill_sock_t *)proto_handle;
1521 
1522 	trill_do_close(tsock);
1523 	return (0);
1524 }
1525 
1526 /* ARGSUSED */
1527 static int
1528 trill_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa,
1529     socklen_t len, cred_t *cr)
1530 {
1531 	int error;
1532 	trill_sock_t *tsock = (trill_sock_t *)proto_handle;
1533 
1534 	if (sa == NULL)
1535 		error = trill_do_unbind(tsock);
1536 	else
1537 		error = trill_start_recv(tsock, sa, len);
1538 
1539 	return (error);
1540 }
1541 
1542 /* ARGSUSED */
1543 static int
1544 trill_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg,
1545     cred_t *cr)
1546 {
1547 	trill_sock_t *tsock = (trill_sock_t *)proto_handle;
1548 	struct sockaddr_dl *laddr;
1549 	uint16_t tci;
1550 
1551 	ASSERT(DB_TYPE(mp) == M_DATA);
1552 	ASSERT(!(tsock->ts_flags & TSF_SHUTDOWN));
1553 
1554 	if (msg->msg_name == NULL || msg->msg_namelen != sizeof (*laddr))
1555 		goto eproto;
1556 
1557 	/*
1558 	 * The name is a datalink_id_t, the address is an Ethernet address, and
1559 	 * the selector value is the VLAN ID.
1560 	 */
1561 	laddr = (struct sockaddr_dl *)msg->msg_name;
1562 	if (laddr->sdl_nlen != sizeof (datalink_id_t) ||
1563 	    laddr->sdl_alen != ETHERADDRL ||
1564 	    (laddr->sdl_slen != sizeof (tci) && laddr->sdl_slen != 0))
1565 		goto eproto;
1566 
1567 	mutex_enter(&tsock->ts_socklock);
1568 	if (tsock->ts_state != TS_IDLE || tsock->ts_link == NULL) {
1569 		mutex_exit(&tsock->ts_socklock);
1570 		goto eproto;
1571 	}
1572 	atomic_inc_uint(&tsock->ts_sockthreadcount);
1573 	mutex_exit(&tsock->ts_socklock);
1574 
1575 	/*
1576 	 * Safe to dereference VLAN now, as we've checked the user's specified
1577 	 * values, and alignment is now guaranteed.
1578 	 */
1579 	if (laddr->sdl_slen == 0) {
1580 		tci = TRILL_NO_TCI;
1581 	} else {
1582 		/* LINTED: alignment */
1583 		tci = *(uint16_t *)(LLADDR(laddr) + ETHERADDRL);
1584 	}
1585 
1586 	mp = create_trill_header(tsock, mp, (const uchar_t *)LLADDR(laddr),
1587 	    B_TRUE, B_FALSE, tci, msgdsize(mp));
1588 	if (mp != NULL) {
1589 		mp = bridge_trill_output(tsock->ts_link, mp);
1590 		if (mp == NULL) {
1591 			KSPINCR(tks_sent);
1592 		} else {
1593 			freemsg(mp);
1594 			KSPINCR(tks_drops);
1595 		}
1596 	}
1597 
1598 	/* Wake up any threads blocking on us */
1599 	if (atomic_dec_uint_nv(&tsock->ts_sockthreadcount) == 0)
1600 		cv_broadcast(&tsock->ts_sockthreadwait);
1601 	return (0);
1602 
1603 eproto:
1604 	freemsg(mp);
1605 	KSPINCR(tks_drops);
1606 	return (EPROTO);
1607 }
1608 
1609 /* ARGSUSED */
1610 static int
1611 trill_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg,
1612     int mode, int32_t *rvalp, cred_t *cr)
1613 {
1614 	trill_sock_t *tsock = (trill_sock_t *)proto_handle;
1615 	int rc;
1616 
1617 	switch (cmd) {
1618 	/* List of unprivileged TRILL ioctls */
1619 	case TRILL_GETNICK:
1620 	case TRILL_GETBRIDGE:
1621 	case TRILL_LISTNICK:
1622 		break;
1623 	default:
1624 		if (secpolicy_dl_config(cr) != 0)
1625 			return (EPERM);
1626 		break;
1627 	}
1628 
1629 	/* Lock ensures socket state is unchanged during ioctl handling */
1630 	mutex_enter(&tsock->ts_socklock);
1631 	rc = trill_do_ioctl(tsock, cmd, (void *)arg, mode);
1632 	mutex_exit(&tsock->ts_socklock);
1633 	return (rc);
1634 }
1635 
1636 static void
1637 trill_clr_flowctrl(sock_lower_handle_t proto_handle)
1638 {
1639 	trill_sock_t *tsock = (trill_sock_t *)proto_handle;
1640 
1641 	mutex_enter(&tsock->ts_socklock);
1642 	tsock->ts_flow_ctrld = B_FALSE;
1643 	mutex_exit(&tsock->ts_socklock);
1644 }
1645 
/*
 * Downcall vector handed to the socket layer by trill_create().  The
 * entries are positional; the comment on each line names the slot it fills.
 * Operations this module does not implement point at the sock_*_notsupp
 * handlers or are left NULL.
 */
static sock_downcalls_t sock_trill_downcalls = {
	trill_activate,			/* sd_activate */
	sock_accept_notsupp,		/* sd_accept */
	trill_bind,			/* sd_bind */
	sock_listen_notsupp,		/* sd_listen */
	sock_connect_notsupp,		/* sd_connect */
	sock_getpeername_notsupp,	/* sd_getpeername */
	sock_getsockname_notsupp,	/* sd_getsockname */
	sock_getsockopt_notsupp,	/* sd_getsockopt */
	sock_setsockopt_notsupp,	/* sd_setsockopt */
	trill_send,			/* sd_send */
	NULL,				/* sd_send_uio */
	NULL,				/* sd_recv_uio */
	NULL,				/* sd_poll */
	sock_shutdown_notsupp,		/* sd_shutdown */
	trill_clr_flowctrl,		/* sd_setflowctrl */
	trill_ioctl,			/* sd_ioctl */
	trill_close			/* sd_close */
};
1665 
1666 /* ARGSUSED */
1667 static sock_lower_handle_t
1668 trill_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls,
1669     uint_t *smodep, int *errorp, int flags, cred_t *credp)
1670 {
1671 	trill_sock_t *tsock;
1672 
1673 	if (family != AF_TRILL || type != SOCK_DGRAM || proto != 0) {
1674 		*errorp = EPROTONOSUPPORT;
1675 		return (NULL);
1676 	}
1677 
1678 	*sock_downcalls = &sock_trill_downcalls;
1679 	*smodep = SM_ATOMIC;
1680 	tsock = trill_do_open(flags);
1681 	*errorp = (tsock != NULL) ? 0:ENOMEM;
1682 	return ((sock_lower_handle_t)tsock);
1683 }
1684