xref: /illumos-gate/usr/src/uts/common/io/aggr/aggr_send.c (revision 0a44ef6d9afbfe052a7e975f55ea0d2954b62a82)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * IEEE 802.3ad Link Aggregation - Send code.
30  *
31  * Implements the Distributor function.
32  */
33 
34 #include <sys/conf.h>
35 #include <sys/modctl.h>
36 #include <sys/sunddi.h>
37 #include <sys/vlan.h>
38 #include <sys/strsun.h>
39 #include <sys/strsubr.h>
40 
41 #include <inet/common.h>
42 #include <inet/led.h>
43 #include <inet/ip.h>
44 #include <inet/ip6.h>
45 #include <inet/tcp.h>
46 #include <netinet/udp.h>
47 #include <inet/ipsecesp.h>
48 #include <inet/ipsecah.h>
49 
50 #include <sys/aggr.h>
51 #include <sys/aggr_impl.h>
52 
/*
 * HASH32(x) XORs x with itself shifted right by 8, 16 and 24 bits, so the
 * low-order byte of the result is the XOR of all four bytes of a 32-bit x.
 *
 * HASH_MAC(x) XORs together the six octets of the MAC address x points to.
 * The argument and the expansion are fully parenthesized so that the macro
 * can be used with arbitrary pointer expressions and inside larger
 * expressions.
 */
#define	HASH32(x) (((x) >> 24) ^ ((x) >> 16) ^ ((x) >> 8) ^ (x))
#define	HASH_MAC(x) \
	((x)[0] ^ (x)[1] ^ (x)[2] ^ (x)[3] ^ (x)[4] ^ (x)[5])
55 
56 static uint16_t aggr_send_ip6_hdr_len(mblk_t *, ip6_t *);
57 
58 static uint_t
59 aggr_send_port(aggr_grp_t *grp, mblk_t *mp)
60 {
61 	struct ether_header *ehp;
62 	uint16_t sap;
63 	uint_t skip_len;
64 	uint8_t proto;
65 	uint32_t policy = grp->lg_tx_policy;
66 	uint32_t hash = 0;
67 
68 	ASSERT(IS_P2ALIGNED(mp->b_rptr, sizeof (uint16_t)));
69 	ASSERT(MBLKL(mp) >= sizeof (struct ether_header));
70 
71 	/* compute MAC hash */
72 
73 	ehp = (struct ether_header *)mp->b_rptr;
74 
75 	if (policy & AGGR_POLICY_L2) {
76 		uchar_t *mac_src = ehp->ether_shost.ether_addr_octet;
77 		uchar_t *mac_dst = ehp->ether_dhost.ether_addr_octet;
78 		hash = HASH_MAC(mac_src) ^ HASH_MAC(mac_dst);
79 		policy &= ~AGGR_POLICY_L2;
80 	}
81 
82 	if (policy == 0)
83 		goto done;
84 
85 	/* skip ethernet header */
86 
87 	if (ntohs(ehp->ether_type) == ETHERTYPE_VLAN) {
88 		struct ether_vlan_header *evhp;
89 
90 		ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
91 		evhp = (struct ether_vlan_header *)mp->b_rptr;
92 		sap = ntohs(evhp->ether_type);
93 		skip_len = sizeof (struct ether_vlan_header);
94 	} else {
95 		sap = ntohs(ehp->ether_type);
96 		skip_len = sizeof (struct ether_header);
97 	}
98 
99 	/* if ethernet header is in its own mblk, skip it */
100 	if (MBLKL(mp) <= skip_len) {
101 		skip_len -= MBLKL(mp);
102 		mp = mp->b_cont;
103 	}
104 
105 	sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap;
106 
107 	/* compute IP src/dst addresses hash and skip IPv{4,6} header */
108 
109 	switch (sap) {
110 	case ETHERTYPE_IP: {
111 		ipha_t *iphp;
112 
113 		ASSERT(MBLKL(mp) >= skip_len + sizeof (ipha_t));
114 		iphp = (ipha_t *)(mp->b_rptr + skip_len);
115 		proto = iphp->ipha_protocol;
116 		skip_len += IPH_HDR_LENGTH(iphp);
117 
118 		if (policy & AGGR_POLICY_L3) {
119 			uint32_t ip_src = iphp->ipha_src;
120 			uint32_t ip_dst = iphp->ipha_dst;
121 			hash ^= (HASH32(htonl(ip_src)) ^ HASH32(htonl(ip_dst)));
122 			policy &= ~AGGR_POLICY_L3;
123 		}
124 		break;
125 	}
126 	case ETHERTYPE_IPV6: {
127 		ip6_t *ip6hp;
128 
129 		/*
130 		 * if ipv6 packet has options, the proto will not be one of the
131 		 * ones handled by the ULP processor below, and will return 0
132 		 * as the index
133 		 */
134 		ASSERT(MBLKL(mp) >= skip_len + sizeof (ip6_t));
135 		ip6hp = (ip6_t *)(mp->b_rptr + skip_len);
136 		proto = ip6hp->ip6_nxt;
137 		skip_len += aggr_send_ip6_hdr_len(mp, ip6hp);
138 
139 		if (policy & AGGR_POLICY_L3) {
140 			uint32_t ip_src = ip6hp->ip6_src.s6_addr32[3];
141 			uint32_t ip_dst = ip6hp->ip6_dst.s6_addr32[3];
142 			hash ^= (HASH32(htonl(ip_src)) ^ HASH32(htonl(ip_dst)));
143 			policy &= ~AGGR_POLICY_L3;
144 		}
145 		break;
146 	}
147 	default:
148 		goto done;
149 	}
150 
151 	if (!(policy & AGGR_POLICY_L4))
152 		goto done;
153 
154 	/* if ip header is in its own mblk, skip it */
155 	if (MBLKL(mp) <= skip_len) {
156 		skip_len -= MBLKL(mp);
157 		mp = mp->b_cont;
158 	}
159 
160 	/* parse ULP header */
161 again:
162 	switch (proto) {
163 	case IPPROTO_TCP:
164 	case IPPROTO_UDP:
165 	case IPPROTO_ESP:
166 	case IPPROTO_SCTP:
167 		/*
168 		 * These Internet Protocols are intentionally designed
169 		 * for hashing from the git-go.  Port numbers are in the first
170 		 * word for transports, SPI is first for ESP.
171 		 */
172 		hash ^= HASH32(*(uint32_t *)(mp->b_rptr + skip_len));
173 		break;
174 
175 	case IPPROTO_AH: {
176 		ah_t *ah = (ah_t *)(mp->b_rptr + skip_len);
177 
178 		uint_t ah_length = AH_TOTAL_LEN(ah);
179 		proto = ah->ah_nexthdr;
180 		skip_len += ah_length;
181 
182 		/* if ip header is in its own mblk, skip it */
183 		if (MBLKL(mp) <= skip_len) {
184 			skip_len -= MBLKL(mp);
185 			mp = mp->b_cont;
186 		}
187 
188 		goto again;
189 	}
190 	}
191 
192 done:
193 	return (hash % grp->lg_ntx_ports);
194 }
195 
196 /*
197  * Update the TX load balancing policy of the specified group.
198  */
199 void
200 aggr_send_update_policy(aggr_grp_t *grp, uint32_t policy)
201 {
202 	ASSERT(AGGR_LACP_LOCK_HELD(grp));
203 	ASSERT(RW_WRITE_HELD(&grp->lg_lock));
204 
205 	grp->lg_tx_policy = policy;
206 }
207 
208 /*
209  * Send function invoked by the MAC service module.
210  */
211 mblk_t *
212 aggr_m_tx(void *arg, mblk_t *mp)
213 {
214 	aggr_grp_t *grp = arg;
215 	aggr_port_t *port;
216 	mblk_t *nextp;
217 	const mac_txinfo_t *mtp;
218 
219 	for (;;) {
220 		rw_enter(&grp->lg_lock, RW_READER);
221 		if (grp->lg_ntx_ports == 0) {
222 			/*
223 			 * We could have returned from aggr_m_start() before
224 			 * the ports were actually attached. Drop the chain.
225 			 */
226 			rw_exit(&grp->lg_lock);
227 			freemsgchain(mp);
228 			return (NULL);
229 		}
230 		nextp = mp->b_next;
231 		mp->b_next = NULL;
232 
233 		port = grp->lg_tx_ports[aggr_send_port(grp, mp)];
234 		ASSERT(port->lp_state == AGGR_PORT_STATE_ATTACHED);
235 
236 		rw_exit(&grp->lg_lock);
237 
238 		/*
239 		 * We store the transmit info pointer locally in case it
240 		 * changes between loading mt_fn and mt_arg.
241 		 */
242 		mtp = port->lp_txinfo;
243 		if ((mp = mtp->mt_fn(mtp->mt_arg, mp)) != NULL) {
244 			mp->b_next = nextp;
245 			break;
246 		}
247 
248 		if ((mp = nextp) == NULL)
249 			break;
250 	}
251 	return (mp);
252 }
253 
254 /*
255  * Enable sending on the specified port.
256  */
257 void
258 aggr_send_port_enable(aggr_port_t *port)
259 {
260 	aggr_grp_t *grp = port->lp_grp;
261 
262 	if (port->lp_tx_enabled || (port->lp_state !=
263 	    AGGR_PORT_STATE_ATTACHED)) {
264 		/* already enabled or port not yet attached */
265 		return;
266 	}
267 
268 	/*
269 	 * Add to group's array of tx ports.
270 	 */
271 	if (grp->lg_tx_ports_size < grp->lg_ntx_ports+1) {
272 		/* current array too small */
273 		aggr_port_t **new_ports;
274 		uint_t new_size;
275 
276 		new_size = grp->lg_ntx_ports+1;
277 		new_ports = kmem_zalloc(new_size * sizeof (aggr_port_t *),
278 		    KM_SLEEP);
279 
280 		if (grp->lg_tx_ports_size > 0) {
281 			ASSERT(grp->lg_tx_ports != NULL);
282 			bcopy(grp->lg_tx_ports, new_ports,
283 			    grp->lg_ntx_ports * sizeof (aggr_port_t *));
284 			kmem_free(grp->lg_tx_ports,
285 			    grp->lg_tx_ports_size * sizeof (aggr_port_t *));
286 		}
287 
288 		grp->lg_tx_ports = new_ports;
289 		grp->lg_tx_ports_size = new_size;
290 	}
291 
292 	grp->lg_tx_ports[grp->lg_ntx_ports++] = port;
293 	port->lp_tx_idx = grp->lg_ntx_ports-1;
294 
295 	port->lp_tx_enabled = B_TRUE;
296 }
297 
298 /*
299  * Disable sending from the specified port.
300  */
301 void
302 aggr_send_port_disable(aggr_port_t *port)
303 {
304 	uint_t idx, ntx;
305 	aggr_grp_t *grp = port->lp_grp;
306 
307 	ASSERT(RW_WRITE_HELD(&port->lp_lock));
308 
309 	if (!port->lp_tx_enabled) {
310 		/* not yet enabled */
311 		return;
312 	}
313 
314 	idx = port->lp_tx_idx;
315 	ntx = grp->lg_ntx_ports;
316 	ASSERT(idx < ntx);
317 
318 	/* remove from array of attached ports */
319 	if (idx == (ntx - 1)) {
320 		grp->lg_tx_ports[idx] = NULL;
321 	} else {
322 		/* not the last entry, replace with last one */
323 		aggr_port_t *victim;
324 
325 		victim = grp->lg_tx_ports[ntx - 1];
326 		grp->lg_tx_ports[ntx - 1] = NULL;
327 		victim->lp_tx_idx = idx;
328 		grp->lg_tx_ports[idx] = victim;
329 	}
330 
331 	port->lp_tx_idx = 0;
332 	grp->lg_ntx_ports--;
333 
334 	port->lp_tx_enabled = B_FALSE;
335 }
336 
337 static uint16_t
338 aggr_send_ip6_hdr_len(mblk_t *mp, ip6_t *ip6h)
339 {
340 	uint16_t length;
341 	uint_t	ehdrlen;
342 	uint8_t	*nexthdrp;
343 	uint8_t *whereptr;
344 	uint8_t *endptr;
345 	ip6_dest_t *desthdr;
346 	ip6_rthdr_t *rthdr;
347 	ip6_frag_t *fraghdr;
348 
349 	length = IPV6_HDR_LEN;
350 	whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */
351 	endptr = mp->b_wptr;
352 
353 	nexthdrp = &ip6h->ip6_nxt;
354 	while (whereptr < endptr) {
355 		switch (*nexthdrp) {
356 		case IPPROTO_HOPOPTS:
357 		case IPPROTO_DSTOPTS:
358 			/* Assumes the headers are identical for hbh and dst */
359 			desthdr = (ip6_dest_t *)whereptr;
360 			ehdrlen = 8 * (desthdr->ip6d_len + 1);
361 			nexthdrp = &desthdr->ip6d_nxt;
362 			break;
363 		case IPPROTO_ROUTING:
364 			rthdr = (ip6_rthdr_t *)whereptr;
365 			ehdrlen =  8 * (rthdr->ip6r_len + 1);
366 			nexthdrp = &rthdr->ip6r_nxt;
367 			break;
368 		case IPPROTO_FRAGMENT:
369 			fraghdr = (ip6_frag_t *)whereptr;
370 			ehdrlen = sizeof (ip6_frag_t);
371 			nexthdrp = &fraghdr->ip6f_nxt;
372 			break;
373 		case IPPROTO_NONE:
374 			/* No next header means we're finished */
375 		default:
376 			return (length);
377 		}
378 		length += ehdrlen;
379 		whereptr += ehdrlen;
380 	}
381 
382 	return (length);
383 }
384