xref: /freebsd/sys/netinet/tcp_lro.c (revision 884a2a699669ec61e2366e3e358342dbc94be24a)
1 /******************************************************************************
2 
3 Copyright (c) 2007, Myricom Inc.
4 Copyright (c) 2008, Intel Corporation.
5 All rights reserved.
6 
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions are met:
9 
10  1. Redistributions of source code must retain the above copyright notice,
11     this list of conditions and the following disclaimer.
12 
13  2. Neither the name of the Myricom Inc, nor the names of its
14     contributors may be used to endorse or promote products derived from
15     this software without specific prior written permission.
16 
17  3. Neither the name of the Intel Corporation, nor the names of its
18     contributors may be used to endorse or promote products derived from
19     this software without specific prior written permission.
20 
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
25 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 POSSIBILITY OF SUCH DAMAGE.
32 
33 $FreeBSD$
34 ***************************************************************************/
35 
36 #include <sys/param.h>
37 #include <sys/systm.h>
38 #include <sys/endian.h>
39 #include <sys/mbuf.h>
40 #include <sys/kernel.h>
41 #include <sys/socket.h>
42 
43 #include <net/if.h>
44 #include <net/ethernet.h>
45 #include <net/if_media.h>
46 
47 #include <netinet/in_systm.h>
48 #include <netinet/in.h>
49 #include <netinet/ip.h>
50 #include <netinet/tcp.h>
51 #include <netinet/tcp_lro.h>
52 
53 #include <machine/bus.h>
54 #include <machine/in_cksum.h>
55 
56 
/*
 * Add up `len' bytes starting at `raw' as 16-bit words in host byte order
 * and fold the carries back into the low 16 bits (ones-complement sum).
 * The returned value is the un-inverted sum; callers complement it
 * themselves where needed.
 *
 * The buffer is consumed one 16-bit word at a time so that nothing past
 * `len' bytes is ever read.  A trailing odd byte is summed as if it were
 * followed by a zero pad byte.  A `len' of zero or less yields 0.
 */
static uint16_t
do_csum_data(uint16_t *raw, int len)
{
	uint32_t csum = 0;

	while (len > 1) {
		csum += *raw++;
		len -= 2;
	}
	if (len == 1) {
		/*
		 * Place the last byte in the first memory byte of a zeroed
		 * word so that it carries the same weight it would have had
		 * as part of a full word, independent of host endianness.
		 */
		uint16_t last = 0;

		*(uint8_t *)&last = *(uint8_t *)raw;
		csum += last;
	}
	/* Two folds are enough to absorb any carry out of bit 15. */
	csum = (csum >> 16) + (csum & 0xffff);
	csum = (csum >> 16) + (csum & 0xffff);
	return ((uint16_t)csum);
}
72 
73 /*
74  * Allocate and init the LRO data structures
75  */
76 int
77 tcp_lro_init(struct lro_ctrl *cntl)
78 {
79 	struct lro_entry *lro;
80 	int i, error = 0;
81 
82 	SLIST_INIT(&cntl->lro_free);
83 	SLIST_INIT(&cntl->lro_active);
84 
85 	cntl->lro_bad_csum = 0;
86 	cntl->lro_queued = 0;
87 	cntl->lro_flushed = 0;
88 
89 	for (i = 0; i < LRO_ENTRIES; i++) {
90                 lro = (struct lro_entry *) malloc(sizeof (struct lro_entry),
91 		    M_DEVBUF, M_NOWAIT | M_ZERO);
92                 if (lro == NULL) {
93 			if (i == 0)
94 				error = ENOMEM;
95                         break;
96                 }
97 		cntl->lro_cnt = i;
98                 SLIST_INSERT_HEAD(&cntl->lro_free, lro, next);
99         }
100 
101 	return (error);
102 }
103 
104 void
105 tcp_lro_free(struct lro_ctrl *cntl)
106 {
107 	struct lro_entry *entry;
108 
109 	while (!SLIST_EMPTY(&cntl->lro_free)) {
110 		entry = SLIST_FIRST(&cntl->lro_free);
111 		SLIST_REMOVE_HEAD(&cntl->lro_free, next);
112 		free(entry, M_DEVBUF);
113 	}
114 }
115 
116 void
117 tcp_lro_flush(struct lro_ctrl *cntl, struct lro_entry *lro)
118 {
119 	struct ifnet *ifp;
120 	struct ip *ip;
121 	struct tcphdr *tcp;
122 	uint32_t *ts_ptr;
123 	uint32_t tcplen, tcp_csum;
124 
125 
126 	if (lro->append_cnt) {
127 		/* incorporate the new len into the ip header and
128 		 * re-calculate the checksum */
129 		ip = lro->ip;
130 		ip->ip_len = htons(lro->len - ETHER_HDR_LEN);
131 		ip->ip_sum = 0;
132 		ip->ip_sum = 0xffff ^
133 			do_csum_data((uint16_t*)ip,
134 					      sizeof (*ip));
135 
136 		lro->m_head->m_pkthdr.csum_flags = CSUM_IP_CHECKED |
137 			CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
138 		lro->m_head->m_pkthdr.csum_data = 0xffff;
139 		lro->m_head->m_pkthdr.len = lro->len;
140 
141 		/* incorporate the latest ack into the tcp header */
142 		tcp = (struct tcphdr *) (ip + 1);
143 		tcp->th_ack = lro->ack_seq;
144 		tcp->th_win = lro->window;
145 		/* incorporate latest timestamp into the tcp header */
146 		if (lro->timestamp) {
147 			ts_ptr = (uint32_t *)(tcp + 1);
148 			ts_ptr[1] = htonl(lro->tsval);
149 			ts_ptr[2] = lro->tsecr;
150 		}
151 		/*
152 		 * update checksum in tcp header by re-calculating the
153 		 * tcp pseudoheader checksum, and adding it to the checksum
154 		 * of the tcp payload data
155 		 */
156 		tcp->th_sum = 0;
157 		tcplen = lro->len - sizeof(*ip) - ETHER_HDR_LEN;
158 		tcp_csum = lro->data_csum;
159 		tcp_csum += in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
160 				      htons(tcplen + IPPROTO_TCP));
161 		tcp_csum += do_csum_data((uint16_t*)tcp,
162 						  tcp->th_off << 2);
163 		tcp_csum = (tcp_csum & 0xffff) + (tcp_csum >> 16);
164 		tcp_csum = (tcp_csum & 0xffff) + (tcp_csum >> 16);
165 		tcp->th_sum = 0xffff ^ tcp_csum;
166 	}
167 	ifp = cntl->ifp;
168 	(*ifp->if_input)(cntl->ifp, lro->m_head);
169 	cntl->lro_queued += lro->append_cnt + 1;
170 	cntl->lro_flushed++;
171 	lro->m_head = NULL;
172 	lro->timestamp = 0;
173 	lro->append_cnt = 0;
174 	SLIST_INSERT_HEAD(&cntl->lro_free, lro, next);
175 }
176 
177 int
178 tcp_lro_rx(struct lro_ctrl *cntl, struct mbuf *m_head, uint32_t csum)
179 {
180 	struct ether_header *eh;
181 	struct ip *ip;
182 	struct tcphdr *tcp;
183 	uint32_t *ts_ptr;
184 	struct mbuf *m_nxt, *m_tail;
185 	struct lro_entry *lro;
186 	int hlen, ip_len, tcp_hdr_len, tcp_data_len, tot_len;
187 	int opt_bytes, trim, csum_flags;
188 	uint32_t seq, tmp_csum, device_mtu;
189 
190 
191 	eh = mtod(m_head, struct ether_header *);
192 	if (eh->ether_type != htons(ETHERTYPE_IP))
193 		return 1;
194 	ip = (struct ip *) (eh + 1);
195 	if (ip->ip_p != IPPROTO_TCP)
196 		return 1;
197 
198 	/* ensure there are no options */
199 	if ((ip->ip_hl << 2) != sizeof (*ip))
200 		return -1;
201 
202 	/* .. and the packet is not fragmented */
203 	if (ip->ip_off & htons(IP_MF|IP_OFFMASK))
204 		return -1;
205 
206 	/* verify that the IP header checksum is correct */
207 	csum_flags = m_head->m_pkthdr.csum_flags;
208 	if (csum_flags & CSUM_IP_CHECKED) {
209 		if (__predict_false((csum_flags & CSUM_IP_VALID) == 0)) {
210 			cntl->lro_bad_csum++;
211 			return -1;
212 		}
213 	} else {
214 		tmp_csum = do_csum_data((uint16_t *)ip, sizeof (*ip));
215 		if (__predict_false((tmp_csum ^ 0xffff) != 0)) {
216 			cntl->lro_bad_csum++;
217 			return -1;
218 		}
219 	}
220 
221 	/* find the TCP header */
222 	tcp = (struct tcphdr *) (ip + 1);
223 
224 	/* Get the TCP checksum if we dont have it */
225 	if (!csum)
226 		csum = tcp->th_sum;
227 
228 	/* ensure no bits set besides ack or psh */
229 	if ((tcp->th_flags & ~(TH_ACK | TH_PUSH)) != 0)
230 		return -1;
231 
232 	/* check for timestamps. Since the only option we handle are
233 	   timestamps, we only have to handle the simple case of
234 	   aligned timestamps */
235 
236 	opt_bytes = (tcp->th_off << 2) - sizeof (*tcp);
237 	tcp_hdr_len =  sizeof (*tcp) + opt_bytes;
238 	ts_ptr = (uint32_t *)(tcp + 1);
239 	if (opt_bytes != 0) {
240 		if (__predict_false(opt_bytes != TCPOLEN_TSTAMP_APPA) ||
241 		    (*ts_ptr !=  ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16|
242 		    TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))
243 			return -1;
244 	}
245 
246 	ip_len = ntohs(ip->ip_len);
247 	tcp_data_len = ip_len - (tcp->th_off << 2) - sizeof (*ip);
248 
249 
250 	/*
251 	 * If frame is padded beyond the end of the IP packet,
252 	 * then we must trim the extra bytes off the end.
253 	 */
254 	tot_len = m_head->m_pkthdr.len;
255 	trim = tot_len - (ip_len + ETHER_HDR_LEN);
256 	if (trim != 0) {
257 		if (trim < 0) {
258 			/* truncated packet */
259 			return -1;
260 		}
261 		m_adj(m_head, -trim);
262 		tot_len = m_head->m_pkthdr.len;
263 	}
264 
265 	m_nxt = m_head;
266 	m_tail = NULL; /* -Wuninitialized */
267 	while (m_nxt != NULL) {
268 		m_tail = m_nxt;
269 		m_nxt = m_tail->m_next;
270 	}
271 
272 	hlen = ip_len + ETHER_HDR_LEN - tcp_data_len;
273 	seq = ntohl(tcp->th_seq);
274 
275 	SLIST_FOREACH(lro, &cntl->lro_active, next) {
276 		if (lro->source_port == tcp->th_sport &&
277 		    lro->dest_port == tcp->th_dport &&
278 		    lro->source_ip == ip->ip_src.s_addr &&
279 		    lro->dest_ip == ip->ip_dst.s_addr) {
280 			/* Try to append it */
281 
282 			if (__predict_false(seq != lro->next_seq ||
283 				    (tcp_data_len == 0 &&
284 				    lro->ack_seq == tcp->th_ack))) {
285 				/* out of order packet or dup ack */
286 				SLIST_REMOVE(&cntl->lro_active, lro,
287 					     lro_entry, next);
288 				tcp_lro_flush(cntl, lro);
289 				return -1;
290 			}
291 
292 			if (opt_bytes) {
293 				uint32_t tsval = ntohl(*(ts_ptr + 1));
294 				/* make sure timestamp values are increasing */
295 				if (__predict_false(lro->tsval > tsval ||
296 					     *(ts_ptr + 2) == 0)) {
297 					return -1;
298 				}
299 				lro->tsval = tsval;
300 				lro->tsecr = *(ts_ptr + 2);
301 			}
302 
303 			lro->next_seq += tcp_data_len;
304 			lro->ack_seq = tcp->th_ack;
305 			lro->window = tcp->th_win;
306 			lro->append_cnt++;
307 			if (tcp_data_len == 0) {
308 				m_freem(m_head);
309 				return 0;
310 			}
311 			/* subtract off the checksum of the tcp header
312                          * from the hardware checksum, and add it to the
313                          * stored tcp data checksum.  Byteswap the checksum
314 			 * if the total length so far is odd
315                          */
316 			tmp_csum = do_csum_data((uint16_t*)tcp,
317 							 tcp_hdr_len);
318 			csum = csum + (tmp_csum ^ 0xffff);
319 			csum = (csum & 0xffff) + (csum >> 16);
320 			csum = (csum & 0xffff) + (csum >> 16);
321 			if (lro->len & 0x1) {
322 				/* Odd number of bytes so far, flip bytes */
323 				csum = ((csum << 8) | (csum >> 8)) & 0xffff;
324 			}
325 			csum = csum + lro->data_csum;
326 			csum = (csum & 0xffff) + (csum >> 16);
327 			csum = (csum & 0xffff) + (csum >> 16);
328 			lro->data_csum = csum;
329 
330 			lro->len += tcp_data_len;
331 
332 			/* adjust mbuf so that m->m_data points to
333 			   the first byte of the payload */
334 			m_adj(m_head, hlen);
335 			/* append mbuf chain */
336 			lro->m_tail->m_next = m_head;
337 			/* advance the last pointer */
338 			lro->m_tail = m_tail;
339 			/* flush packet if required */
340 			device_mtu = cntl->ifp->if_mtu;
341 			if (lro->len > (65535 - device_mtu)) {
342 				SLIST_REMOVE(&cntl->lro_active, lro,
343 					     lro_entry, next);
344 				tcp_lro_flush(cntl, lro);
345 			}
346 			return 0;
347 		}
348 	}
349 
350 	if (SLIST_EMPTY(&cntl->lro_free))
351 	    return -1;
352 
353 	/* start a new chain */
354 	lro = SLIST_FIRST(&cntl->lro_free);
355 	SLIST_REMOVE_HEAD(&cntl->lro_free, next);
356 	SLIST_INSERT_HEAD(&cntl->lro_active, lro, next);
357 	lro->source_port = tcp->th_sport;
358 	lro->dest_port = tcp->th_dport;
359 	lro->source_ip = ip->ip_src.s_addr;
360 	lro->dest_ip = ip->ip_dst.s_addr;
361 	lro->next_seq = seq + tcp_data_len;
362 	lro->mss = tcp_data_len;
363 	lro->ack_seq = tcp->th_ack;
364 	lro->window = tcp->th_win;
365 
366 	/* save the checksum of just the TCP payload by
367 	 * subtracting off the checksum of the TCP header from
368 	 * the entire hardware checksum
369 	 * Since IP header checksum is correct, checksum over
370 	 * the IP header is -0.  Substracting -0 is unnecessary.
371 	 */
372 	tmp_csum = do_csum_data((uint16_t*)tcp, tcp_hdr_len);
373 	csum = csum + (tmp_csum ^ 0xffff);
374 	csum = (csum & 0xffff) + (csum >> 16);
375 	csum = (csum & 0xffff) + (csum >> 16);
376 	lro->data_csum = csum;
377 
378 	lro->ip = ip;
379 	/* record timestamp if it is present */
380 	if (opt_bytes) {
381 		lro->timestamp = 1;
382 		lro->tsval = ntohl(*(ts_ptr + 1));
383 		lro->tsecr = *(ts_ptr + 2);
384 	}
385 	lro->len = tot_len;
386 	lro->m_head = m_head;
387 	lro->m_tail = m_tail;
388 	return 0;
389 }
390