xref: /freebsd/sys/netinet/tcp_lro.c (revision bd2228ab3ee0cde6831fe446d793fffda2f48503)
1 /******************************************************************************
2 
3 Copyright (c) 2007, Myricom Inc.
4 Copyright (c) 2008, Intel Corporation.
5 All rights reserved.
6 
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions are met:
9 
10  1. Redistributions of source code must retain the above copyright notice,
11     this list of conditions and the following disclaimer.
12 
13  2. Neither the name of the Myricom Inc, nor the names of its
14     contributors may be used to endorse or promote products derived from
15     this software without specific prior written permission.
16 
17  3. Neither the name of the Intel Corporation, nor the names of its
18     contributors may be used to endorse or promote products derived from
19     this software without specific prior written permission.
20 
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
25 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 POSSIBILITY OF SUCH DAMAGE.
32 
33 $FreeBSD$
34 ***************************************************************************/
35 
36 #include <sys/param.h>
37 #include <sys/systm.h>
38 #include <sys/endian.h>
39 #include <sys/mbuf.h>
40 #include <sys/kernel.h>
41 #include <sys/socket.h>
42 
43 #include <net/if.h>
44 #include <net/ethernet.h>
45 #include <net/if_media.h>
46 
47 #include <netinet/in_systm.h>
48 #include <netinet/in.h>
49 #include <netinet/ip.h>
50 #include <netinet/tcp.h>
51 #include <netinet/tcp_lro.h>
52 
53 #include <machine/bus.h>
54 #include <machine/in_cksum.h>
55 
56 
/*
 * Compute the 16-bit ones'-complement sum (NOT inverted) of `len' bytes
 * starting at `raw'.
 *
 * The words are summed in memory order, so the result is in the same byte
 * order as the data.  Callers in this file pass IP/TCP header lengths,
 * which are multiples of four; other lengths are handled without reading
 * past the end of the buffer: an odd trailing byte is treated as if it
 * were followed by a zero pad byte.
 *
 * The 32-bit accumulator is folded twice, which is sufficient for any
 * input that does not overflow 32 bits (i.e. well beyond 64KB of data).
 */
static uint16_t
do_csum_data(const uint16_t *raw, int len)
{
	uint32_t csum = 0;

	/* Sum every complete 16-bit word that lies inside the buffer. */
	for (; len >= 2; len -= 2)
		csum += *raw++;

	/* Odd trailing byte: pad with a zero byte, preserving memory order. */
	if (len == 1) {
		uint16_t last = 0;

		*(unsigned char *)&last = *(const unsigned char *)raw;
		csum += last;
	}

	/* Fold the carries back into the low 16 bits. */
	csum = (csum >> 16) + (csum & 0xffff);
	csum = (csum >> 16) + (csum & 0xffff);
	return ((uint16_t)csum);
}
72 
73 /*
74  * Allocate and init the LRO data structures
75  */
76 int
77 tcp_lro_init(struct lro_ctrl *cntl)
78 {
79 	struct lro_entry *lro;
80 	int i, error = 0;
81 
82 	SLIST_INIT(&cntl->lro_free);
83 	SLIST_INIT(&cntl->lro_active);
84 
85 	cntl->lro_bad_csum = 0;
86 	cntl->lro_queued = 0;
87 	cntl->lro_flushed = 0;
88 
89 	for (i = 0; i < LRO_ENTRIES; i++) {
90                 lro = (struct lro_entry *) malloc(sizeof (struct lro_entry),
91 		    M_DEVBUF, M_NOWAIT | M_ZERO);
92                 if (lro == NULL) {
93 			if (i == 0)
94 				error = ENOMEM;
95                         break;
96                 }
97 		cntl->lro_cnt = i;
98                 SLIST_INSERT_HEAD(&cntl->lro_free, lro, next);
99         }
100 
101 	return (error);
102 }
103 
104 void
105 tcp_lro_free(struct lro_ctrl *cntl)
106 {
107 	struct lro_entry *entry;
108 
109 	while (!SLIST_EMPTY(&cntl->lro_free)) {
110 		entry = SLIST_FIRST(&cntl->lro_free);
111 		SLIST_REMOVE_HEAD(&cntl->lro_free, next);
112 		free(entry, M_DEVBUF);
113 	}
114 }
115 
116 void
117 tcp_lro_flush(struct lro_ctrl *cntl, struct lro_entry *lro)
118 {
119 	struct ifnet *ifp;
120 	struct ip *ip;
121 	struct tcphdr *tcp;
122 	uint32_t *ts_ptr;
123 	uint32_t tcplen, tcp_csum;
124 
125 
126 	if (lro->append_cnt) {
127 		/* incorporate the new len into the ip header and
128 		 * re-calculate the checksum */
129 		ip = lro->ip;
130 		ip->ip_len = htons(lro->len - ETHER_HDR_LEN);
131 		ip->ip_sum = 0;
132 		ip->ip_sum = 0xffff ^
133 			do_csum_data((uint16_t*)ip,
134 					      sizeof (*ip));
135 
136 		lro->m_head->m_pkthdr.csum_flags = CSUM_IP_CHECKED |
137 			CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
138 		lro->m_head->m_pkthdr.csum_data = 0xffff;
139 		lro->m_head->m_pkthdr.len = lro->len;
140 
141 		/* incorporate the latest ack into the tcp header */
142 		tcp = (struct tcphdr *) (ip + 1);
143 		tcp->th_ack = lro->ack_seq;
144 		tcp->th_win = lro->window;
145 		/* incorporate latest timestamp into the tcp header */
146 		if (lro->timestamp) {
147 			ts_ptr = (uint32_t *)(tcp + 1);
148 			ts_ptr[1] = htonl(lro->tsval);
149 			ts_ptr[2] = lro->tsecr;
150 		}
151 		/*
152 		 * update checksum in tcp header by re-calculating the
153 		 * tcp pseudoheader checksum, and adding it to the checksum
154 		 * of the tcp payload data
155 		 */
156 		tcp->th_sum = 0;
157 		tcplen = lro->len - sizeof(*ip) - ETHER_HDR_LEN;
158 		tcp_csum = lro->data_csum;
159 		tcp_csum += in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
160 				      htons(tcplen + IPPROTO_TCP));
161 		tcp_csum += do_csum_data((uint16_t*)tcp,
162 						  tcp->th_off << 2);
163 		tcp_csum = (tcp_csum & 0xffff) + (tcp_csum >> 16);
164 		tcp_csum = (tcp_csum & 0xffff) + (tcp_csum >> 16);
165 		tcp->th_sum = 0xffff ^ tcp_csum;
166 	}
167 	ifp = cntl->ifp;
168 	(*ifp->if_input)(cntl->ifp, lro->m_head);
169 	cntl->lro_queued += lro->append_cnt + 1;
170 	cntl->lro_flushed++;
171 	lro->m_head = NULL;
172 	lro->timestamp = 0;
173 	lro->append_cnt = 0;
174 	SLIST_INSERT_HEAD(&cntl->lro_free, lro, next);
175 }
176 
177 int
178 tcp_lro_rx(struct lro_ctrl *cntl, struct mbuf *m_head, uint32_t csum)
179 {
180 	struct ether_header *eh;
181 	struct ip *ip;
182 	struct tcphdr *tcp;
183 	uint32_t *ts_ptr;
184 	struct mbuf *m_nxt, *m_tail;
185 	struct lro_entry *lro;
186 	int hlen, ip_len, tcp_hdr_len, tcp_data_len, tot_len;
187 	int opt_bytes, trim, csum_flags;
188 	uint32_t seq, tmp_csum, device_mtu;
189 
190 
191 	eh = mtod(m_head, struct ether_header *);
192 	if (eh->ether_type != htons(ETHERTYPE_IP))
193 		return 1;
194 	ip = (struct ip *) (eh + 1);
195 	if (ip->ip_p != IPPROTO_TCP)
196 		return 1;
197 
198 	/* ensure there are no options */
199 	if ((ip->ip_hl << 2) != sizeof (*ip))
200 		return -1;
201 
202 	/* .. and the packet is not fragmented */
203 	if (ip->ip_off & htons(IP_MF|IP_OFFMASK))
204 		return -1;
205 
206 	/* verify that the IP header checksum is correct */
207 	csum_flags = m_head->m_pkthdr.csum_flags;
208 	if (csum_flags & CSUM_IP_CHECKED) {
209 		if (__predict_false((csum_flags & CSUM_IP_VALID) == 0)) {
210 			cntl->lro_bad_csum++;
211 			return -1;
212 		}
213 	} else {
214 		tmp_csum = do_csum_data((uint16_t *)ip, sizeof (*ip));
215 		if (__predict_false((tmp_csum ^ 0xffff) != 0)) {
216 			cntl->lro_bad_csum++;
217 			return -1;
218 		}
219 	}
220 
221 	/* find the TCP header */
222 	tcp = (struct tcphdr *) (ip + 1);
223 
224 	/* Get the TCP checksum if we dont have it */
225 	if (!csum)
226 		csum = tcp->th_sum;
227 
228 	/* ensure no bits set besides ack or psh */
229 	if ((tcp->th_flags & ~(TH_ACK | TH_PUSH)) != 0)
230 		return -1;
231 
232 	/* check for timestamps. Since the only option we handle are
233 	   timestamps, we only have to handle the simple case of
234 	   aligned timestamps */
235 
236 	opt_bytes = (tcp->th_off << 2) - sizeof (*tcp);
237 	tcp_hdr_len =  sizeof (*tcp) + opt_bytes;
238 	ts_ptr = (uint32_t *)(tcp + 1);
239 	if (opt_bytes != 0) {
240 		if (__predict_false(opt_bytes != TCPOLEN_TSTAMP_APPA) ||
241 		    (*ts_ptr !=  ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16|
242 		    TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))
243 			return -1;
244 	}
245 
246 	ip_len = ntohs(ip->ip_len);
247 	tcp_data_len = ip_len - (tcp->th_off << 2) - sizeof (*ip);
248 
249 
250 	/*
251 	 * If frame is padded beyond the end of the IP packet,
252 	 * then we must trim the extra bytes off the end.
253 	 */
254 	tot_len = m_head->m_pkthdr.len;
255 	trim = tot_len - (ip_len + ETHER_HDR_LEN);
256 	if (trim != 0) {
257 		if (trim < 0) {
258 			/* truncated packet */
259 			return -1;
260 		}
261 		m_adj(m_head, -trim);
262 		tot_len = m_head->m_pkthdr.len;
263 	}
264 
265 	m_nxt = m_head;
266 	m_tail = NULL; /* -Wuninitialized */
267 	while (m_nxt != NULL) {
268 		m_tail = m_nxt;
269 		m_nxt = m_tail->m_next;
270 	}
271 
272 	hlen = ip_len + ETHER_HDR_LEN - tcp_data_len;
273 	seq = ntohl(tcp->th_seq);
274 
275 	SLIST_FOREACH(lro, &cntl->lro_active, next) {
276 		if (lro->source_port == tcp->th_sport &&
277 		    lro->dest_port == tcp->th_dport &&
278 		    lro->source_ip == ip->ip_src.s_addr &&
279 		    lro->dest_ip == ip->ip_dst.s_addr) {
280 			/* Flush now if appending will result in overflow. */
281 			if (lro->len > (65535 - tcp_data_len)) {
282 				SLIST_REMOVE(&cntl->lro_active, lro,
283 					     lro_entry, next);
284 				tcp_lro_flush(cntl, lro);
285 				break;
286 			}
287 
288 			/* Try to append it */
289 
290 			if (__predict_false(seq != lro->next_seq ||
291 				    (tcp_data_len == 0 &&
292 				    lro->ack_seq == tcp->th_ack))) {
293 				/* out of order packet or dup ack */
294 				SLIST_REMOVE(&cntl->lro_active, lro,
295 					     lro_entry, next);
296 				tcp_lro_flush(cntl, lro);
297 				return -1;
298 			}
299 
300 			if (opt_bytes) {
301 				uint32_t tsval = ntohl(*(ts_ptr + 1));
302 				/* make sure timestamp values are increasing */
303 				if (__predict_false(lro->tsval > tsval ||
304 					     *(ts_ptr + 2) == 0)) {
305 					return -1;
306 				}
307 				lro->tsval = tsval;
308 				lro->tsecr = *(ts_ptr + 2);
309 			}
310 
311 			lro->next_seq += tcp_data_len;
312 			lro->ack_seq = tcp->th_ack;
313 			lro->window = tcp->th_win;
314 			lro->append_cnt++;
315 			if (tcp_data_len == 0) {
316 				m_freem(m_head);
317 				return 0;
318 			}
319 			/* subtract off the checksum of the tcp header
320                          * from the hardware checksum, and add it to the
321                          * stored tcp data checksum.  Byteswap the checksum
322 			 * if the total length so far is odd
323                          */
324 			tmp_csum = do_csum_data((uint16_t*)tcp,
325 							 tcp_hdr_len);
326 			csum = csum + (tmp_csum ^ 0xffff);
327 			csum = (csum & 0xffff) + (csum >> 16);
328 			csum = (csum & 0xffff) + (csum >> 16);
329 			if (lro->len & 0x1) {
330 				/* Odd number of bytes so far, flip bytes */
331 				csum = ((csum << 8) | (csum >> 8)) & 0xffff;
332 			}
333 			csum = csum + lro->data_csum;
334 			csum = (csum & 0xffff) + (csum >> 16);
335 			csum = (csum & 0xffff) + (csum >> 16);
336 			lro->data_csum = csum;
337 
338 			lro->len += tcp_data_len;
339 
340 			/* adjust mbuf so that m->m_data points to
341 			   the first byte of the payload */
342 			m_adj(m_head, hlen);
343 			/* append mbuf chain */
344 			lro->m_tail->m_next = m_head;
345 			/* advance the last pointer */
346 			lro->m_tail = m_tail;
347 			/* flush packet if required */
348 			device_mtu = cntl->ifp->if_mtu;
349 			if (lro->len > (65535 - device_mtu)) {
350 				SLIST_REMOVE(&cntl->lro_active, lro,
351 					     lro_entry, next);
352 				tcp_lro_flush(cntl, lro);
353 			}
354 			return 0;
355 		}
356 	}
357 
358 	if (SLIST_EMPTY(&cntl->lro_free))
359 	    return -1;
360 
361 	/* start a new chain */
362 	lro = SLIST_FIRST(&cntl->lro_free);
363 	SLIST_REMOVE_HEAD(&cntl->lro_free, next);
364 	SLIST_INSERT_HEAD(&cntl->lro_active, lro, next);
365 	lro->source_port = tcp->th_sport;
366 	lro->dest_port = tcp->th_dport;
367 	lro->source_ip = ip->ip_src.s_addr;
368 	lro->dest_ip = ip->ip_dst.s_addr;
369 	lro->next_seq = seq + tcp_data_len;
370 	lro->mss = tcp_data_len;
371 	lro->ack_seq = tcp->th_ack;
372 	lro->window = tcp->th_win;
373 
374 	/* save the checksum of just the TCP payload by
375 	 * subtracting off the checksum of the TCP header from
376 	 * the entire hardware checksum
377 	 * Since IP header checksum is correct, checksum over
378 	 * the IP header is -0.  Substracting -0 is unnecessary.
379 	 */
380 	tmp_csum = do_csum_data((uint16_t*)tcp, tcp_hdr_len);
381 	csum = csum + (tmp_csum ^ 0xffff);
382 	csum = (csum & 0xffff) + (csum >> 16);
383 	csum = (csum & 0xffff) + (csum >> 16);
384 	lro->data_csum = csum;
385 
386 	lro->ip = ip;
387 	/* record timestamp if it is present */
388 	if (opt_bytes) {
389 		lro->timestamp = 1;
390 		lro->tsval = ntohl(*(ts_ptr + 1));
391 		lro->tsecr = *(ts_ptr + 2);
392 	}
393 	lro->len = tot_len;
394 	lro->m_head = m_head;
395 	lro->m_tail = m_tail;
396 	return 0;
397 }
398