xref: /freebsd/sys/netinet/tcp_lro.c (revision 721351876cd4d3a8a700f62d2061331fa951a488)
1 /******************************************************************************
2 
3 Copyright (c) 2007, Myricom Inc.
4 Copyright (c) 2008, Intel Corporation.
5 All rights reserved.
6 
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions are met:
9 
10  1. Redistributions of source code must retain the above copyright notice,
11     this list of conditions and the following disclaimer.
12 
13  2. Neither the name of the Myricom Inc, nor the names of its
14     contributors may be used to endorse or promote products derived from
15     this software without specific prior written permission.
16 
17  3. Neither the name of the Intel Corporation, nor the names of its
18     contributors may be used to endorse or promote products derived from
19     this software without specific prior written permission.
20 
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
25 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 POSSIBILITY OF SUCH DAMAGE.
32 
33 $FreeBSD$
34 ***************************************************************************/
35 
36 #include <sys/param.h>
37 #include <sys/systm.h>
38 #include <sys/endian.h>
39 #include <sys/mbuf.h>
40 #include <sys/kernel.h>
41 #include <sys/socket.h>
42 
43 #include <net/if.h>
44 #include <net/ethernet.h>
45 #include <net/if_media.h>
46 
47 #include <netinet/in_systm.h>
48 #include <netinet/in.h>
49 #include <netinet/ip.h>
50 #include <netinet/tcp.h>
51 #include <netinet/tcp_lro.h>
52 
53 #include <machine/bus.h>
54 #include <machine/in_cksum.h>
55 
56 
/*
 * Sum a buffer as native-order 16-bit words and fold the result down to
 * 16 bits with end-around carry (one's-complement addition).
 *
 * raw: start of the data to sum.
 * len: number of bytes to sum.  Every caller in this file passes a
 *      multiple of 4, but any length is accepted: a trailing odd byte is
 *      summed as if it were followed by a zero pad byte, and nothing is
 *      read beyond raw + len.  A length <= 0 yields 0.
 *
 * Returns the folded sum; the value is NOT complemented, callers that
 * need a checksum field XOR it with 0xffff themselves.
 */
static uint16_t
do_csum_data(const uint16_t *raw, int len)
{
	uint32_t csum = 0;

	/* One word at a time so we never read past the end of the buffer. */
	for (; len >= 2; len -= 2)
		csum += *raw++;

	if (len == 1) {
		/*
		 * Place the last byte in the first memory position of a
		 * zeroed word; this is the zero-padded word in native
		 * order regardless of endianness.
		 */
		uint16_t last = 0;

		*(uint8_t *)&last = *(const uint8_t *)raw;
		csum += last;
	}

	/* Two folds are enough to bring any 32-bit sum below 0x10000. */
	csum = (csum >> 16) + (csum & 0xffff);
	csum = (csum >> 16) + (csum & 0xffff);
	return (uint16_t)csum;
}
72 
73 /*
74  * Allocate and init the LRO data structures
75  */
76 int
77 tcp_lro_init(struct lro_ctrl *cntl)
78 {
79 	struct lro_entry *lro;
80 	int i, error = 0;
81 
82 	SLIST_INIT(&cntl->lro_free);
83 	SLIST_INIT(&cntl->lro_active);
84 
85 	cntl->lro_bad_csum = 0;
86 	cntl->lro_queued = 0;
87 	cntl->lro_flushed = 0;
88 
89 	for (i = 0; i < LRO_ENTRIES; i++) {
90                 lro = (struct lro_entry *) malloc(sizeof (struct lro_entry),
91 		    M_DEVBUF, M_NOWAIT | M_ZERO);
92                 if (lro == NULL) {
93 			if (i == 0)
94 				error = ENOMEM;
95                         break;
96                 }
97 		cntl->lro_cnt = i;
98                 SLIST_INSERT_HEAD(&cntl->lro_free, lro, next);
99         }
100 
101 	return (error);
102 }
103 
104 void
105 tcp_lro_free(struct lro_ctrl *cntl)
106 {
107 	struct lro_entry *entry;
108 
109 	while (!SLIST_EMPTY(&cntl->lro_free)) {
110 		entry = SLIST_FIRST(&cntl->lro_free);
111                	SLIST_REMOVE_HEAD(&cntl->lro_free, next);
112 		free(entry, M_DEVBUF);
113 	}
114 }
115 
116 void
117 tcp_lro_flush(struct lro_ctrl *cntl, struct lro_entry *lro)
118 {
119 	struct ifnet *ifp;
120 	struct ip *ip;
121 	struct tcphdr *tcp;
122 	uint32_t *ts_ptr;
123 	uint32_t tcplen, tcp_csum;
124 
125 
126 	if (lro->append_cnt) {
127 		/* incorporate the new len into the ip header and
128 		 * re-calculate the checksum */
129 		ip = lro->ip;
130 		ip->ip_len = htons(lro->len - ETHER_HDR_LEN);
131 		ip->ip_sum = 0;
132 		ip->ip_sum = 0xffff ^
133 			do_csum_data((uint16_t*)ip,
134 					      sizeof (*ip));
135 
136 		lro->m_head->m_pkthdr.csum_flags = CSUM_IP_CHECKED |
137 			CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
138 		lro->m_head->m_pkthdr.csum_data = 0xffff;
139 		lro->m_head->m_pkthdr.len = lro->len;
140 
141 		/* incorporate the latest ack into the tcp header */
142 		tcp = (struct tcphdr *) (ip + 1);
143 		tcp->th_ack = lro->ack_seq;
144 		tcp->th_win = lro->window;
145 		/* incorporate latest timestamp into the tcp header */
146 		if (lro->timestamp) {
147 			ts_ptr = (uint32_t *)(tcp + 1);
148 			ts_ptr[1] = htonl(lro->tsval);
149 			ts_ptr[2] = lro->tsecr;
150 		}
151 		/*
152 		 * update checksum in tcp header by re-calculating the
153 		 * tcp pseudoheader checksum, and adding it to the checksum
154 		 * of the tcp payload data
155 		 */
156 		tcp->th_sum = 0;
157 		tcplen = lro->len - sizeof(*ip) - ETHER_HDR_LEN;
158 		tcp_csum = lro->data_csum;
159 		tcp_csum += in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
160 				      htons(tcplen + IPPROTO_TCP));
161 		tcp_csum += do_csum_data((uint16_t*)tcp,
162 						  tcp->th_off << 2);
163 		tcp_csum = (tcp_csum & 0xffff) + (tcp_csum >> 16);
164 		tcp_csum = (tcp_csum & 0xffff) + (tcp_csum >> 16);
165 		tcp->th_sum = 0xffff ^ tcp_csum;
166 	}
167 	ifp = cntl->ifp;
168 	(*ifp->if_input)(cntl->ifp, lro->m_head);
169 	cntl->lro_queued += lro->append_cnt + 1;
170 	cntl->lro_flushed++;
171 	lro->m_head = NULL;
172 	lro->timestamp = 0;
173 	lro->append_cnt = 0;
174 	SLIST_INSERT_HEAD(&cntl->lro_free, lro, next);
175 }
176 
177 int
178 tcp_lro_rx(struct lro_ctrl *cntl, struct mbuf *m_head, uint32_t csum)
179 {
180 	struct ether_header *eh;
181 	struct ip *ip;
182 	struct tcphdr *tcp;
183 	uint32_t *ts_ptr;
184 	struct mbuf *m_nxt, *m_tail;
185 	struct lro_entry *lro;
186 	int hlen, ip_len, tcp_hdr_len, tcp_data_len, tot_len;
187 	int opt_bytes, trim;
188 	uint32_t seq, tmp_csum, device_mtu;
189 
190 
191 	eh = mtod(m_head, struct ether_header *);
192 	if (eh->ether_type != htons(ETHERTYPE_IP))
193 		return 1;
194 	ip = (struct ip *) (eh + 1);
195 	if (ip->ip_p != IPPROTO_TCP)
196 		return 1;
197 
198 	/* ensure there are no options */
199 	if ((ip->ip_hl << 2) != sizeof (*ip))
200 		return -1;
201 
202 	/* .. and the packet is not fragmented */
203 	if (ip->ip_off & htons(IP_MF|IP_OFFMASK))
204 		return -1;
205 
206 	/* verify that the IP header checksum is correct */
207 	tmp_csum = do_csum_data((uint16_t *)ip, sizeof (*ip));
208 	if (__predict_false((tmp_csum ^ 0xffff) != 0)) {
209 		cntl->lro_bad_csum++;
210 		return -1;
211 	}
212 
213 	/* find the TCP header */
214 	tcp = (struct tcphdr *) (ip + 1);
215 
216 	/* Get the TCP checksum if we dont have it */
217 	if (!csum)
218 		csum = tcp->th_sum;
219 
220 	/* ensure no bits set besides ack or psh */
221 	if ((tcp->th_flags & ~(TH_ACK | TH_PUSH)) != 0)
222 		return -1;
223 
224 	/* check for timestamps. Since the only option we handle are
225 	   timestamps, we only have to handle the simple case of
226 	   aligned timestamps */
227 
228 	opt_bytes = (tcp->th_off << 2) - sizeof (*tcp);
229 	tcp_hdr_len =  sizeof (*tcp) + opt_bytes;
230 	ts_ptr = (uint32_t *)(tcp + 1);
231 	if (opt_bytes != 0) {
232 		if (__predict_false(opt_bytes != TCPOLEN_TSTAMP_APPA) ||
233 		    (*ts_ptr !=  ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16|
234 		    TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))
235 			return -1;
236 	}
237 
238 	ip_len = ntohs(ip->ip_len);
239 	tcp_data_len = ip_len - (tcp->th_off << 2) - sizeof (*ip);
240 
241 
242 	/*
243 	 * If frame is padded beyond the end of the IP packet,
244 	 * then we must trim the extra bytes off the end.
245 	 */
246 	tot_len = m_head->m_pkthdr.len;
247 	trim = tot_len - (ip_len + ETHER_HDR_LEN);
248 	if (trim != 0) {
249 		if (trim < 0) {
250 			/* truncated packet */
251 			return -1;
252 		}
253 		m_adj(m_head, -trim);
254 		tot_len = m_head->m_pkthdr.len;
255 	}
256 
257 	m_nxt = m_head;
258 	m_tail = NULL; /* -Wuninitialized */
259 	while (m_nxt != NULL) {
260 		m_tail = m_nxt;
261 		m_nxt = m_tail->m_next;
262 	}
263 
264 	hlen = ip_len + ETHER_HDR_LEN - tcp_data_len;
265 	seq = ntohl(tcp->th_seq);
266 
267 	SLIST_FOREACH(lro, &cntl->lro_active, next) {
268 		if (lro->source_port == tcp->th_sport &&
269 		    lro->dest_port == tcp->th_dport &&
270 		    lro->source_ip == ip->ip_src.s_addr &&
271 		    lro->dest_ip == ip->ip_dst.s_addr) {
272 			/* Try to append it */
273 
274 			if (__predict_false(seq != lro->next_seq)) {
275 				/* out of order packet */
276 				SLIST_REMOVE(&cntl->lro_active, lro,
277 					     lro_entry, next);
278 				tcp_lro_flush(cntl, lro);
279 				return -1;
280 			}
281 
282 			if (opt_bytes) {
283 				uint32_t tsval = ntohl(*(ts_ptr + 1));
284 				/* make sure timestamp values are increasing */
285 				if (__predict_false(lro->tsval > tsval ||
286 					     *(ts_ptr + 2) == 0)) {
287 					return -1;
288 				}
289 				lro->tsval = tsval;
290 				lro->tsecr = *(ts_ptr + 2);
291 			}
292 
293 			lro->next_seq += tcp_data_len;
294 			lro->ack_seq = tcp->th_ack;
295 			lro->window = tcp->th_win;
296 			lro->append_cnt++;
297 			if (tcp_data_len == 0) {
298 				m_freem(m_head);
299 				return 0;
300 			}
301 			/* subtract off the checksum of the tcp header
302                          * from the hardware checksum, and add it to the
303                          * stored tcp data checksum.  Byteswap the checksum
304 			 * if the total length so far is odd
305                          */
306 			tmp_csum = do_csum_data((uint16_t*)tcp,
307 							 tcp_hdr_len);
308 			csum = csum + (tmp_csum ^ 0xffff);
309 			csum = (csum & 0xffff) + (csum >> 16);
310 			csum = (csum & 0xffff) + (csum >> 16);
311 			if (lro->len & 0x1) {
312 				/* Odd number of bytes so far, flip bytes */
313 				csum = ((csum << 8) | (csum >> 8)) & 0xffff;
314 			}
315 			csum = csum + lro->data_csum;
316 			csum = (csum & 0xffff) + (csum >> 16);
317 			csum = (csum & 0xffff) + (csum >> 16);
318 			lro->data_csum = csum;
319 
320 			lro->len += tcp_data_len;
321 
322 			/* adjust mbuf so that m->m_data points to
323 			   the first byte of the payload */
324 			m_adj(m_head, hlen);
325 			/* append mbuf chain */
326 			lro->m_tail->m_next = m_head;
327 			/* advance the last pointer */
328 			lro->m_tail = m_tail;
329 			/* flush packet if required */
330 			device_mtu = cntl->ifp->if_mtu;
331 			if (lro->len > (65535 - device_mtu)) {
332 				SLIST_REMOVE(&cntl->lro_active, lro,
333 					     lro_entry, next);
334 				tcp_lro_flush(cntl, lro);
335 			}
336 			return 0;
337 		}
338 	}
339 
340 	if (SLIST_EMPTY(&cntl->lro_free))
341 	    return -1;
342 
343 	/* start a new chain */
344 	lro = SLIST_FIRST(&cntl->lro_free);
345 	SLIST_REMOVE_HEAD(&cntl->lro_free, next);
346 	SLIST_INSERT_HEAD(&cntl->lro_active, lro, next);
347 	lro->source_port = tcp->th_sport;
348 	lro->dest_port = tcp->th_dport;
349 	lro->source_ip = ip->ip_src.s_addr;
350 	lro->dest_ip = ip->ip_dst.s_addr;
351 	lro->next_seq = seq + tcp_data_len;
352 	lro->mss = tcp_data_len;
353 	lro->ack_seq = tcp->th_ack;
354 	lro->window = tcp->th_win;
355 
356 	/* save the checksum of just the TCP payload by
357 	 * subtracting off the checksum of the TCP header from
358 	 * the entire hardware checksum
359 	 * Since IP header checksum is correct, checksum over
360 	 * the IP header is -0.  Substracting -0 is unnecessary.
361 	 */
362 	tmp_csum = do_csum_data((uint16_t*)tcp, tcp_hdr_len);
363 	csum = csum + (tmp_csum ^ 0xffff);
364 	csum = (csum & 0xffff) + (csum >> 16);
365 	csum = (csum & 0xffff) + (csum >> 16);
366 	lro->data_csum = csum;
367 
368 	lro->ip = ip;
369 	/* record timestamp if it is present */
370 	if (opt_bytes) {
371 		lro->timestamp = 1;
372 		lro->tsval = ntohl(*(ts_ptr + 1));
373 		lro->tsecr = *(ts_ptr + 2);
374 	}
375 	lro->len = tot_len;
376 	lro->m_head = m_head;
377 	lro->m_tail = m_tail;
378 	return 0;
379 }
380