xref: /illumos-gate/usr/src/uts/common/io/aggr/aggr_send.c (revision 1b8adde7ba7d5e04395c141c5400dc2cffd7d809)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * IEEE 802.3ad Link Aggregation - Send code.
28  *
29  * Implements the Distributor function.
30  */
31 
32 #include <sys/conf.h>
33 #include <sys/modctl.h>
34 #include <sys/sunddi.h>
35 #include <sys/vlan.h>
36 #include <sys/strsun.h>
37 #include <sys/strsubr.h>
38 
39 #include <inet/common.h>
40 #include <inet/led.h>
41 #include <inet/ip.h>
42 #include <inet/ip6.h>
43 #include <inet/tcp.h>
44 #include <netinet/udp.h>
45 #include <inet/ipsec_impl.h>
46 #include <inet/sadb.h>
47 #include <inet/ipsecesp.h>
48 #include <inet/ipsecah.h>
49 
50 #include <sys/aggr.h>
51 #include <sys/aggr_impl.h>
52 
/*
 * Byte-wise XOR folds used to build the transmit hash.  HASH_4BYTES folds
 * the first four bytes found at x (an IPv4 address, the tail of an IPv6
 * address, or the first word of a transport header); HASH_MAC folds the six
 * bytes of an Ethernet address.
 */
#define	HASH_4BYTES(x)	((x)[0] ^ (x)[1] ^ (x)[2] ^ (x)[3])
#define	HASH_MAC(x)	(HASH_4BYTES(x) ^ (x)[4] ^ (x)[5])
55 
56 static uint16_t aggr_send_ip6_hdr_len(mblk_t *, ip6_t *);
57 
58 static uint_t
59 aggr_send_port(aggr_grp_t *grp, mblk_t *mp)
60 {
61 	struct ether_header *ehp;
62 	uint16_t sap;
63 	uint_t skip_len;
64 	uint8_t proto;
65 	uint32_t policy = grp->lg_tx_policy;
66 	uint32_t hash = 0;
67 
68 	ASSERT(IS_P2ALIGNED(mp->b_rptr, sizeof (uint16_t)));
69 	ASSERT(MBLKL(mp) >= sizeof (struct ether_header));
70 
71 	/* compute MAC hash */
72 
73 	ehp = (struct ether_header *)mp->b_rptr;
74 
75 	if (policy & AGGR_POLICY_L2) {
76 		uchar_t *mac_src = ehp->ether_shost.ether_addr_octet;
77 		uchar_t *mac_dst = ehp->ether_dhost.ether_addr_octet;
78 		hash = HASH_MAC(mac_src) ^ HASH_MAC(mac_dst);
79 		policy &= ~AGGR_POLICY_L2;
80 	}
81 
82 	if (policy == 0)
83 		goto done;
84 
85 	/* skip ethernet header */
86 
87 	if (ntohs(ehp->ether_type) == ETHERTYPE_VLAN) {
88 		struct ether_vlan_header *evhp;
89 		mblk_t *newmp = NULL;
90 
91 		skip_len = sizeof (struct ether_vlan_header);
92 		if (MBLKL(mp) < skip_len) {
93 			/* the vlan tag is the payload, pull up first */
94 			newmp = msgpullup(mp, -1);
95 			if ((newmp == NULL) || (MBLKL(newmp) < skip_len)) {
96 				goto done;
97 			}
98 			evhp = (struct ether_vlan_header *)newmp->b_rptr;
99 		} else {
100 			evhp = (struct ether_vlan_header *)mp->b_rptr;
101 		}
102 
103 		sap = ntohs(evhp->ether_type);
104 		freemsg(newmp);
105 	} else {
106 		sap = ntohs(ehp->ether_type);
107 		skip_len = sizeof (struct ether_header);
108 	}
109 
110 	/* if ethernet header is in its own mblk, skip it */
111 	if (MBLKL(mp) <= skip_len) {
112 		skip_len -= MBLKL(mp);
113 		mp = mp->b_cont;
114 	}
115 
116 	sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap;
117 
118 	/* compute IP src/dst addresses hash and skip IPv{4,6} header */
119 
120 	switch (sap) {
121 	case ETHERTYPE_IP: {
122 		ipha_t *iphp;
123 
124 		if (MBLKL(mp) < (skip_len + sizeof (ipha_t)))
125 			goto done;
126 
127 		iphp = (ipha_t *)(mp->b_rptr + skip_len);
128 		proto = iphp->ipha_protocol;
129 		skip_len += IPH_HDR_LENGTH(iphp);
130 
131 		if (policy & AGGR_POLICY_L3) {
132 			uint8_t *ip_src = (uint8_t *)&(iphp->ipha_src);
133 			uint8_t *ip_dst = (uint8_t *)&(iphp->ipha_dst);
134 
135 			hash ^= (HASH_4BYTES(ip_src) ^ HASH_4BYTES(ip_dst));
136 			policy &= ~AGGR_POLICY_L3;
137 		}
138 		break;
139 	}
140 	case ETHERTYPE_IPV6: {
141 		ip6_t *ip6hp;
142 
143 		/*
144 		 * if ipv6 packet has options, the proto will not be one of the
145 		 * ones handled by the ULP processor below, and will return 0
146 		 * as the index
147 		 */
148 		if (MBLKL(mp) < (skip_len + sizeof (ip6_t)))
149 			goto done;
150 
151 		ip6hp = (ip6_t *)(mp->b_rptr + skip_len);
152 		proto = ip6hp->ip6_nxt;
153 		skip_len += aggr_send_ip6_hdr_len(mp, ip6hp);
154 
155 		if (policy & AGGR_POLICY_L3) {
156 			uint8_t *ip_src = &(ip6hp->ip6_src.s6_addr8[12]);
157 			uint8_t *ip_dst = &(ip6hp->ip6_dst.s6_addr8[12]);
158 
159 			hash ^= (HASH_4BYTES(ip_src) ^ HASH_4BYTES(ip_dst));
160 			policy &= ~AGGR_POLICY_L3;
161 		}
162 		break;
163 	}
164 	default:
165 		goto done;
166 	}
167 
168 	if (!(policy & AGGR_POLICY_L4))
169 		goto done;
170 
171 	/* if ip header is in its own mblk, skip it */
172 	if (MBLKL(mp) <= skip_len) {
173 		skip_len -= MBLKL(mp);
174 		mp = mp->b_cont;
175 	}
176 
177 	/* parse ULP header */
178 again:
179 	switch (proto) {
180 	case IPPROTO_TCP:
181 	case IPPROTO_UDP:
182 	case IPPROTO_ESP:
183 	case IPPROTO_SCTP:
184 		/*
185 		 * These Internet Protocols are intentionally designed
186 		 * for hashing from the git-go.  Port numbers are in the first
187 		 * word for transports, SPI is first for ESP.
188 		 */
189 		hash ^= HASH_4BYTES((mp->b_rptr + skip_len));
190 		break;
191 
192 	case IPPROTO_AH: {
193 		ah_t *ah = (ah_t *)(mp->b_rptr + skip_len);
194 
195 		uint_t ah_length = AH_TOTAL_LEN(ah);
196 		proto = ah->ah_nexthdr;
197 		skip_len += ah_length;
198 
199 		/* if ip header is in its own mblk, skip it */
200 		if (MBLKL(mp) <= skip_len) {
201 			skip_len -= MBLKL(mp);
202 			mp = mp->b_cont;
203 		}
204 
205 		goto again;
206 	}
207 	}
208 
209 done:
210 	return (hash % grp->lg_ntx_ports);
211 }
212 
213 /*
214  * Update the TX load balancing policy of the specified group.
215  */
216 void
217 aggr_send_update_policy(aggr_grp_t *grp, uint32_t policy)
218 {
219 	ASSERT(AGGR_LACP_LOCK_HELD_WRITER(grp));
220 	ASSERT(RW_WRITE_HELD(&grp->lg_lock));
221 
222 	grp->lg_tx_policy = policy;
223 }
224 
225 /*
226  * Send function invoked by the MAC service module.
227  */
228 mblk_t *
229 aggr_m_tx(void *arg, mblk_t *mp)
230 {
231 	aggr_grp_t *grp = arg;
232 	aggr_port_t *port;
233 	mblk_t *nextp;
234 	const mac_txinfo_t *mtp;
235 
236 	for (;;) {
237 		AGGR_LACP_LOCK_READER(grp)
238 		if (grp->lg_ntx_ports == 0) {
239 			/*
240 			 * We could have returned from aggr_m_start() before
241 			 * the ports were actually attached. Drop the chain.
242 			 */
243 			AGGR_LACP_UNLOCK(grp)
244 			freemsgchain(mp);
245 			return (NULL);
246 		}
247 		nextp = mp->b_next;
248 		mp->b_next = NULL;
249 
250 		port = grp->lg_tx_ports[aggr_send_port(grp, mp)];
251 		ASSERT(port->lp_state == AGGR_PORT_STATE_ATTACHED);
252 
253 		/*
254 		 * We store the transmit info pointer locally in case it
255 		 * changes between loading mt_fn and mt_arg.
256 		 */
257 		mtp = port->lp_txinfo;
258 		AGGR_LACP_UNLOCK(grp)
259 
260 		if ((mp = mtp->mt_fn(mtp->mt_arg, mp)) != NULL) {
261 			mp->b_next = nextp;
262 			break;
263 		}
264 
265 		if ((mp = nextp) == NULL)
266 			break;
267 	}
268 	return (mp);
269 }
270 
271 /*
272  * Enable sending on the specified port.
273  */
274 void
275 aggr_send_port_enable(aggr_port_t *port)
276 {
277 	aggr_grp_t *grp = port->lp_grp;
278 
279 	if (port->lp_tx_enabled || (port->lp_state !=
280 	    AGGR_PORT_STATE_ATTACHED)) {
281 		/* already enabled or port not yet attached */
282 		return;
283 	}
284 
285 	/*
286 	 * Add to group's array of tx ports.
287 	 */
288 	if (grp->lg_tx_ports_size < grp->lg_ntx_ports+1) {
289 		/* current array too small */
290 		aggr_port_t **new_ports;
291 		uint_t new_size;
292 
293 		new_size = grp->lg_ntx_ports+1;
294 		new_ports = kmem_zalloc(new_size * sizeof (aggr_port_t *),
295 		    KM_SLEEP);
296 
297 		if (grp->lg_tx_ports_size > 0) {
298 			ASSERT(grp->lg_tx_ports != NULL);
299 			bcopy(grp->lg_tx_ports, new_ports,
300 			    grp->lg_ntx_ports * sizeof (aggr_port_t *));
301 			kmem_free(grp->lg_tx_ports,
302 			    grp->lg_tx_ports_size * sizeof (aggr_port_t *));
303 		}
304 
305 		grp->lg_tx_ports = new_ports;
306 		grp->lg_tx_ports_size = new_size;
307 	}
308 
309 	grp->lg_tx_ports[grp->lg_ntx_ports++] = port;
310 	port->lp_tx_idx = grp->lg_ntx_ports-1;
311 
312 	port->lp_tx_enabled = B_TRUE;
313 }
314 
315 /*
316  * Disable sending from the specified port.
317  */
318 void
319 aggr_send_port_disable(aggr_port_t *port)
320 {
321 	uint_t idx, ntx;
322 	aggr_grp_t *grp = port->lp_grp;
323 
324 	ASSERT(RW_WRITE_HELD(&port->lp_lock));
325 
326 	if (!port->lp_tx_enabled) {
327 		/* not yet enabled */
328 		return;
329 	}
330 
331 	idx = port->lp_tx_idx;
332 	ntx = grp->lg_ntx_ports;
333 	ASSERT(idx < ntx);
334 
335 	/* remove from array of attached ports */
336 	if (idx == (ntx - 1)) {
337 		grp->lg_tx_ports[idx] = NULL;
338 	} else {
339 		/* not the last entry, replace with last one */
340 		aggr_port_t *victim;
341 
342 		victim = grp->lg_tx_ports[ntx - 1];
343 		grp->lg_tx_ports[ntx - 1] = NULL;
344 		victim->lp_tx_idx = idx;
345 		grp->lg_tx_ports[idx] = victim;
346 	}
347 
348 	port->lp_tx_idx = 0;
349 	grp->lg_ntx_ports--;
350 
351 	port->lp_tx_enabled = B_FALSE;
352 }
353 
354 static uint16_t
355 aggr_send_ip6_hdr_len(mblk_t *mp, ip6_t *ip6h)
356 {
357 	uint16_t length;
358 	uint_t	ehdrlen;
359 	uint8_t	*nexthdrp;
360 	uint8_t *whereptr;
361 	uint8_t *endptr;
362 	ip6_dest_t *desthdr;
363 	ip6_rthdr_t *rthdr;
364 	ip6_frag_t *fraghdr;
365 
366 	length = IPV6_HDR_LEN;
367 	whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */
368 	endptr = mp->b_wptr;
369 
370 	nexthdrp = &ip6h->ip6_nxt;
371 	while (whereptr < endptr) {
372 		switch (*nexthdrp) {
373 		case IPPROTO_HOPOPTS:
374 		case IPPROTO_DSTOPTS:
375 			/* Assumes the headers are identical for hbh and dst */
376 			desthdr = (ip6_dest_t *)whereptr;
377 			ehdrlen = 8 * (desthdr->ip6d_len + 1);
378 			nexthdrp = &desthdr->ip6d_nxt;
379 			break;
380 		case IPPROTO_ROUTING:
381 			rthdr = (ip6_rthdr_t *)whereptr;
382 			ehdrlen =  8 * (rthdr->ip6r_len + 1);
383 			nexthdrp = &rthdr->ip6r_nxt;
384 			break;
385 		case IPPROTO_FRAGMENT:
386 			fraghdr = (ip6_frag_t *)whereptr;
387 			ehdrlen = sizeof (ip6_frag_t);
388 			nexthdrp = &fraghdr->ip6f_nxt;
389 			break;
390 		case IPPROTO_NONE:
391 			/* No next header means we're finished */
392 		default:
393 			return (length);
394 		}
395 		length += ehdrlen;
396 		whereptr += ehdrlen;
397 	}
398 
399 	return (length);
400 }
401