xref: /illumos-gate/usr/src/uts/common/io/aggr/aggr_send.c (revision 45526e9775395f5d44bad3f5430041f32c84ce1e)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * IEEE 802.3ad Link Aggregation - Send code.
31  *
32  * Implements the Distributor function.
33  */
34 
35 #include <sys/conf.h>
36 #include <sys/modctl.h>
37 #include <sys/sunddi.h>
38 #include <sys/vlan.h>
39 #include <sys/strsun.h>
40 #include <sys/strsubr.h>
41 
42 #include <inet/common.h>
43 #include <inet/led.h>
44 #include <inet/ip.h>
45 #include <inet/ip6.h>
46 #include <inet/tcp.h>
47 #include <netinet/udp.h>
48 #include <inet/ipsecesp.h>
49 #include <inet/ipsecah.h>
50 
51 #include <sys/aggr.h>
52 #include <sys/aggr_impl.h>
53 
/*
 * Hash helpers used to pick a TX port.
 *
 * HASH32(x) XORs x with itself shifted right by 8, 16 and 24 bits.
 * HASH_MAC(x) XORs together the six octets of the MAC address that x
 * points to.
 *
 * Both the arguments and the whole expansions are parenthesized so that
 * the macros can be used with arbitrary expressions as arguments and
 * inside larger expressions.  Note that the arguments are evaluated more
 * than once; do not pass expressions with side effects.
 */
#define	HASH32(x) (((x) >> 24) ^ ((x) >> 16) ^ ((x) >> 8) ^ (x))
#define	HASH_MAC(x) ((x)[0] ^ (x)[1] ^ (x)[2] ^ (x)[3] ^ (x)[4] ^ (x)[5])
56 
57 static uint16_t aggr_send_ip6_hdr_len(mblk_t *, ip6_t *);
58 
59 static uint_t
60 aggr_send_port(aggr_grp_t *grp, mblk_t *mp)
61 {
62 	struct ether_header *ehp;
63 	uint16_t sap;
64 	uint_t skip_len;
65 	uint8_t proto;
66 	uint32_t policy = grp->lg_tx_policy;
67 	uint32_t hash = 0;
68 
69 	ASSERT(IS_P2ALIGNED(mp->b_rptr, sizeof (uint16_t)));
70 	ASSERT(MBLKL(mp) >= sizeof (struct ether_header));
71 
72 	/* compute MAC hash */
73 
74 	ehp = (struct ether_header *)mp->b_rptr;
75 
76 	if (policy & AGGR_POLICY_L2) {
77 		uchar_t *mac_src = ehp->ether_shost.ether_addr_octet;
78 		uchar_t *mac_dst = ehp->ether_dhost.ether_addr_octet;
79 		hash = HASH_MAC(mac_src) ^ HASH_MAC(mac_dst);
80 		policy &= ~AGGR_POLICY_L2;
81 	}
82 
83 	if (policy == 0)
84 		goto done;
85 
86 	/* skip ethernet header */
87 
88 	if (ntohs(ehp->ether_type) == VLAN_TPID) {
89 		struct ether_vlan_header *evhp;
90 
91 		ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
92 		evhp = (struct ether_vlan_header *)mp->b_rptr;
93 		sap = ntohs(evhp->ether_type);
94 		skip_len = sizeof (struct ether_vlan_header);
95 	} else {
96 		sap = ntohs(ehp->ether_type);
97 		skip_len = sizeof (struct ether_header);
98 	}
99 
100 	/* if ethernet header is in its own mblk, skip it */
101 	if (MBLKL(mp) <= skip_len) {
102 		skip_len -= MBLKL(mp);
103 		mp = mp->b_cont;
104 	}
105 
106 	sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap;
107 
108 	/* compute IP src/dst addresses hash and skip IPv{4,6} header */
109 
110 	switch (sap) {
111 	case ETHERTYPE_IP: {
112 		ipha_t *iphp;
113 
114 		ASSERT(MBLKL(mp) >= skip_len + sizeof (ipha_t));
115 		iphp = (ipha_t *)(mp->b_rptr + skip_len);
116 		proto = iphp->ipha_protocol;
117 		skip_len += IPH_HDR_LENGTH(iphp);
118 
119 		if (policy & AGGR_POLICY_L3) {
120 			uint32_t ip_src = iphp->ipha_src;
121 			uint32_t ip_dst = iphp->ipha_dst;
122 			hash ^= (HASH32(htonl(ip_src)) ^ HASH32(htonl(ip_dst)));
123 			policy &= ~AGGR_POLICY_L3;
124 		}
125 		break;
126 	}
127 	case ETHERTYPE_IPV6: {
128 		ip6_t *ip6hp;
129 
130 		/*
131 		 * if ipv6 packet has options, the proto will not be one of the
132 		 * ones handled by the ULP processor below, and will return 0
133 		 * as the index
134 		 */
135 		ASSERT(MBLKL(mp) >= skip_len + sizeof (ip6_t));
136 		ip6hp = (ip6_t *)(mp->b_rptr + skip_len);
137 		proto = ip6hp->ip6_nxt;
138 		skip_len += aggr_send_ip6_hdr_len(mp, ip6hp);
139 
140 		if (policy & AGGR_POLICY_L3) {
141 			uint32_t ip_src = ip6hp->ip6_src.s6_addr32[3];
142 			uint32_t ip_dst = ip6hp->ip6_dst.s6_addr32[3];
143 			hash ^= (HASH32(htonl(ip_src)) ^ HASH32(htonl(ip_dst)));
144 			policy &= ~AGGR_POLICY_L3;
145 		}
146 		break;
147 	}
148 	default:
149 		goto done;
150 	}
151 
152 	if (!(policy & AGGR_POLICY_L4))
153 		goto done;
154 
155 	/* if ip header is in its own mblk, skip it */
156 	if (MBLKL(mp) <= skip_len) {
157 		skip_len -= MBLKL(mp);
158 		mp = mp->b_cont;
159 	}
160 
161 	/* parse ULP header */
162 again:
163 	switch (proto) {
164 	case IPPROTO_TCP:
165 	case IPPROTO_UDP:
166 	case IPPROTO_ESP:
167 	case IPPROTO_SCTP:
168 		/*
169 		 * These Internet Protocols are intentionally designed
170 		 * for hashing from the git-go.  Port numbers are in the first
171 		 * word for transports, SPI is first for ESP.
172 		 */
173 		hash ^= HASH32(*(uint32_t *)(mp->b_rptr + skip_len));
174 		break;
175 
176 	case IPPROTO_AH: {
177 		ah_t *ah = (ah_t *)(mp->b_rptr + skip_len);
178 
179 		uint_t ah_length = AH_TOTAL_LEN(ah);
180 		proto = ah->ah_nexthdr;
181 		skip_len += ah_length;
182 
183 		/* if ip header is in its own mblk, skip it */
184 		if (MBLKL(mp) <= skip_len) {
185 			skip_len -= MBLKL(mp);
186 			mp = mp->b_cont;
187 		}
188 
189 		goto again;
190 	}
191 	}
192 
193 done:
194 	return (hash % grp->lg_ntx_ports);
195 }
196 
197 /*
198  * Update the TX load balancing policy of the specified group.
199  */
200 void
201 aggr_send_update_policy(aggr_grp_t *grp, uint32_t policy)
202 {
203 	ASSERT(AGGR_LACP_LOCK_HELD(grp));
204 	ASSERT(RW_WRITE_HELD(&grp->lg_lock));
205 
206 	grp->lg_tx_policy = policy;
207 }
208 
209 /*
210  * Send function invoked by the MAC service module.
211  */
212 mblk_t *
213 aggr_m_tx(void *arg, mblk_t *mp)
214 {
215 	aggr_grp_t *grp = arg;
216 	aggr_port_t *port;
217 	mblk_t *nextp;
218 	const mac_txinfo_t *mtp;
219 
220 	rw_enter(&grp->lg_lock, RW_READER);
221 
222 	if (grp->lg_ntx_ports == 0) {
223 		/*
224 		 * We could have returned from aggr_m_start() before
225 		 * the ports were actually attached. Drop the chain.
226 		 */
227 		rw_exit(&grp->lg_lock);
228 
229 		freemsgchain(mp);
230 		return (NULL);
231 	}
232 
233 	for (;;) {
234 		nextp = mp->b_next;
235 		mp->b_next = NULL;
236 
237 		port = grp->lg_tx_ports[aggr_send_port(grp, mp)];
238 		ASSERT(port->lp_state == AGGR_PORT_STATE_ATTACHED);
239 
240 		rw_exit(&grp->lg_lock);
241 
242 		/*
243 		 * We store the transmit info pointer locally in case it
244 		 * changes between loading mt_fn and mt_arg.
245 		 */
246 		mtp = port->lp_txinfo;
247 		if ((mp = mtp->mt_fn(mtp->mt_arg, mp)) != NULL) {
248 			mp->b_next = nextp;
249 			goto done;
250 		}
251 
252 		if ((mp = nextp) == NULL)
253 			goto done;
254 
255 		rw_enter(&grp->lg_lock, RW_READER);
256 	}
257 
258 done:
259 	return (mp);
260 }
261 
262 /*
263  * Enable sending on the specified port.
264  */
265 void
266 aggr_send_port_enable(aggr_port_t *port)
267 {
268 	aggr_grp_t *grp = port->lp_grp;
269 
270 	if (port->lp_tx_enabled || (port->lp_state !=
271 	    AGGR_PORT_STATE_ATTACHED)) {
272 		/* already enabled or port not yet attached */
273 		return;
274 	}
275 
276 	/*
277 	 * Add to group's array of tx ports.
278 	 */
279 	if (grp->lg_tx_ports_size < grp->lg_ntx_ports+1) {
280 		/* current array too small */
281 		aggr_port_t **new_ports;
282 		uint_t new_size;
283 
284 		new_size = grp->lg_ntx_ports+1;
285 		new_ports = kmem_zalloc(new_size * sizeof (aggr_port_t *),
286 		    KM_SLEEP);
287 
288 		if (grp->lg_tx_ports_size > 0) {
289 			ASSERT(grp->lg_tx_ports != NULL);
290 			bcopy(grp->lg_tx_ports, new_ports,
291 			    grp->lg_ntx_ports * sizeof (aggr_port_t *));
292 			kmem_free(grp->lg_tx_ports,
293 			    grp->lg_tx_ports_size * sizeof (aggr_port_t *));
294 		}
295 
296 		grp->lg_tx_ports = new_ports;
297 		grp->lg_tx_ports_size = new_size;
298 	}
299 
300 	grp->lg_tx_ports[grp->lg_ntx_ports++] = port;
301 	port->lp_tx_idx = grp->lg_ntx_ports-1;
302 
303 	port->lp_tx_enabled = B_TRUE;
304 }
305 
306 /*
307  * Disable sending from the specified port.
308  */
309 void
310 aggr_send_port_disable(aggr_port_t *port)
311 {
312 	uint_t idx, ntx;
313 	aggr_grp_t *grp = port->lp_grp;
314 
315 	ASSERT(RW_WRITE_HELD(&port->lp_lock));
316 
317 	if (!port->lp_tx_enabled) {
318 		/* not yet enabled */
319 		return;
320 	}
321 
322 	idx = port->lp_tx_idx;
323 	ntx = grp->lg_ntx_ports;
324 	ASSERT(idx < ntx);
325 
326 	/* remove from array of attached ports */
327 	if (idx == (ntx - 1)) {
328 		grp->lg_tx_ports[idx] = NULL;
329 	} else {
330 		/* not the last entry, replace with last one */
331 		aggr_port_t *victim;
332 
333 		victim = grp->lg_tx_ports[ntx - 1];
334 		grp->lg_tx_ports[ntx - 1] = NULL;
335 		victim->lp_tx_idx = idx;
336 		grp->lg_tx_ports[idx] = victim;
337 	}
338 
339 	port->lp_tx_idx = 0;
340 	grp->lg_ntx_ports--;
341 
342 	port->lp_tx_enabled = B_FALSE;
343 }
344 
345 static uint16_t
346 aggr_send_ip6_hdr_len(mblk_t *mp, ip6_t *ip6h)
347 {
348 	uint16_t length;
349 	uint_t	ehdrlen;
350 	uint8_t	*nexthdrp;
351 	uint8_t *whereptr;
352 	uint8_t *endptr;
353 	ip6_dest_t *desthdr;
354 	ip6_rthdr_t *rthdr;
355 	ip6_frag_t *fraghdr;
356 
357 	length = IPV6_HDR_LEN;
358 	whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */
359 	endptr = mp->b_wptr;
360 
361 	nexthdrp = &ip6h->ip6_nxt;
362 	while (whereptr < endptr) {
363 		switch (*nexthdrp) {
364 		case IPPROTO_HOPOPTS:
365 		case IPPROTO_DSTOPTS:
366 			/* Assumes the headers are identical for hbh and dst */
367 			desthdr = (ip6_dest_t *)whereptr;
368 			ehdrlen = 8 * (desthdr->ip6d_len + 1);
369 			nexthdrp = &desthdr->ip6d_nxt;
370 			break;
371 		case IPPROTO_ROUTING:
372 			rthdr = (ip6_rthdr_t *)whereptr;
373 			ehdrlen =  8 * (rthdr->ip6r_len + 1);
374 			nexthdrp = &rthdr->ip6r_nxt;
375 			break;
376 		case IPPROTO_FRAGMENT:
377 			fraghdr = (ip6_frag_t *)whereptr;
378 			ehdrlen = sizeof (ip6_frag_t);
379 			nexthdrp = &fraghdr->ip6f_nxt;
380 			break;
381 		case IPPROTO_NONE:
382 			/* No next header means we're finished */
383 		default:
384 			return (length);
385 		}
386 		length += ehdrlen;
387 		whereptr += ehdrlen;
388 	}
389 
390 	return (length);
391 }
392