xref: /freebsd/sys/netinet/tcp_lro.c (revision e39e854e27f53a784c3982cbeb68f4ad1cfd9162)
1 /*-
2  * Copyright (c) 2007, Myricom Inc.
3  * Copyright (c) 2008, Intel Corporation.
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  *
27  * $FreeBSD$
28  */
29 
30 #include <sys/param.h>
31 #include <sys/systm.h>
32 #include <sys/endian.h>
33 #include <sys/mbuf.h>
34 #include <sys/kernel.h>
35 #include <sys/socket.h>
36 
37 #include <net/if.h>
38 #include <net/ethernet.h>
39 #include <net/if_media.h>
40 
41 #include <netinet/in_systm.h>
42 #include <netinet/in.h>
43 #include <netinet/ip.h>
44 #include <netinet/tcp.h>
45 #include <netinet/tcp_lro.h>
46 
47 #include <machine/bus.h>
48 #include <machine/in_cksum.h>
49 
50 
/*
 * Add up a buffer of 16-bit words and fold the total into 16 bits.
 *
 * The words are summed in host representation into a 32-bit accumulator;
 * the carries above bit 15 are then folded back in twice.  The result is
 * returned as-is (it is not complemented here).
 *
 * The buffer is consumed two words (4 bytes) per step while the remaining
 * length is positive, so a length that is not a multiple of 4 is
 * effectively rounded up to the next multiple of 4.
 */
static uint16_t do_csum_data(uint16_t *raw, int len)
{
	uint32_t acc = 0;
	int remaining;

	for (remaining = len; remaining > 0; remaining -= 4) {
		acc += raw[0];
		acc += raw[1];
		raw += 2;
	}

	/* fold the carry bits back into the low 16 bits (twice) */
	acc = (acc & 0xffff) + (acc >> 16);
	acc = (acc & 0xffff) + (acc >> 16);

	return (uint16_t)acc;
}
66 
67 /*
68  * Allocate and init the LRO data structures
69  */
70 int
71 tcp_lro_init(struct lro_ctrl *cntl)
72 {
73 	struct lro_entry *lro;
74 	int i, error = 0;
75 
76 	SLIST_INIT(&cntl->lro_free);
77 	SLIST_INIT(&cntl->lro_active);
78 
79 	cntl->lro_bad_csum = 0;
80 	cntl->lro_queued = 0;
81 	cntl->lro_flushed = 0;
82 
83 	for (i = 0; i < LRO_ENTRIES; i++) {
84                 lro = (struct lro_entry *) malloc(sizeof (struct lro_entry),
85 		    M_DEVBUF, M_NOWAIT | M_ZERO);
86                 if (lro == NULL) {
87 			if (i == 0)
88 				error = ENOMEM;
89                         break;
90                 }
91 		cntl->lro_cnt = i;
92                 SLIST_INSERT_HEAD(&cntl->lro_free, lro, next);
93         }
94 
95 	return (error);
96 }
97 
98 void
99 tcp_lro_free(struct lro_ctrl *cntl)
100 {
101 	struct lro_entry *entry;
102 
103 	while (!SLIST_EMPTY(&cntl->lro_free)) {
104 		entry = SLIST_FIRST(&cntl->lro_free);
105 		SLIST_REMOVE_HEAD(&cntl->lro_free, next);
106 		free(entry, M_DEVBUF);
107 	}
108 }
109 
110 void
111 tcp_lro_flush(struct lro_ctrl *cntl, struct lro_entry *lro)
112 {
113 	struct ifnet *ifp;
114 	struct ip *ip;
115 	struct tcphdr *tcp;
116 	uint32_t *ts_ptr;
117 	uint32_t tcplen, tcp_csum;
118 
119 
120 	if (lro->append_cnt) {
121 		/* incorporate the new len into the ip header and
122 		 * re-calculate the checksum */
123 		ip = lro->ip;
124 		ip->ip_len = htons(lro->len - ETHER_HDR_LEN);
125 		ip->ip_sum = 0;
126 		ip->ip_sum = 0xffff ^
127 			do_csum_data((uint16_t*)ip,
128 					      sizeof (*ip));
129 
130 		lro->m_head->m_pkthdr.csum_flags = CSUM_IP_CHECKED |
131 			CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
132 		lro->m_head->m_pkthdr.csum_data = 0xffff;
133 		lro->m_head->m_pkthdr.len = lro->len;
134 
135 		/* incorporate the latest ack into the tcp header */
136 		tcp = (struct tcphdr *) (ip + 1);
137 		tcp->th_ack = lro->ack_seq;
138 		tcp->th_win = lro->window;
139 		/* incorporate latest timestamp into the tcp header */
140 		if (lro->timestamp) {
141 			ts_ptr = (uint32_t *)(tcp + 1);
142 			ts_ptr[1] = htonl(lro->tsval);
143 			ts_ptr[2] = lro->tsecr;
144 		}
145 		/*
146 		 * update checksum in tcp header by re-calculating the
147 		 * tcp pseudoheader checksum, and adding it to the checksum
148 		 * of the tcp payload data
149 		 */
150 		tcp->th_sum = 0;
151 		tcplen = lro->len - sizeof(*ip) - ETHER_HDR_LEN;
152 		tcp_csum = lro->data_csum;
153 		tcp_csum += in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
154 				      htons(tcplen + IPPROTO_TCP));
155 		tcp_csum += do_csum_data((uint16_t*)tcp,
156 						  tcp->th_off << 2);
157 		tcp_csum = (tcp_csum & 0xffff) + (tcp_csum >> 16);
158 		tcp_csum = (tcp_csum & 0xffff) + (tcp_csum >> 16);
159 		tcp->th_sum = 0xffff ^ tcp_csum;
160 	}
161 	ifp = cntl->ifp;
162 	(*ifp->if_input)(cntl->ifp, lro->m_head);
163 	cntl->lro_queued += lro->append_cnt + 1;
164 	cntl->lro_flushed++;
165 	lro->m_head = NULL;
166 	lro->timestamp = 0;
167 	lro->append_cnt = 0;
168 	SLIST_INSERT_HEAD(&cntl->lro_free, lro, next);
169 }
170 
171 int
172 tcp_lro_rx(struct lro_ctrl *cntl, struct mbuf *m_head, uint32_t csum)
173 {
174 	struct ether_header *eh;
175 	struct ip *ip;
176 	struct tcphdr *tcp;
177 	uint32_t *ts_ptr;
178 	struct mbuf *m_nxt, *m_tail;
179 	struct lro_entry *lro;
180 	int hlen, ip_len, tcp_hdr_len, tcp_data_len, tot_len;
181 	int opt_bytes, trim, csum_flags;
182 	uint32_t seq, tmp_csum, device_mtu;
183 
184 
185 	eh = mtod(m_head, struct ether_header *);
186 	if (eh->ether_type != htons(ETHERTYPE_IP))
187 		return 1;
188 	ip = (struct ip *) (eh + 1);
189 	if (ip->ip_p != IPPROTO_TCP)
190 		return 1;
191 
192 	/* ensure there are no options */
193 	if ((ip->ip_hl << 2) != sizeof (*ip))
194 		return -1;
195 
196 	/* .. and the packet is not fragmented */
197 	if (ip->ip_off & htons(IP_MF|IP_OFFMASK))
198 		return -1;
199 
200 	/* verify that the IP header checksum is correct */
201 	csum_flags = m_head->m_pkthdr.csum_flags;
202 	if (csum_flags & CSUM_IP_CHECKED) {
203 		if (__predict_false((csum_flags & CSUM_IP_VALID) == 0)) {
204 			cntl->lro_bad_csum++;
205 			return -1;
206 		}
207 	} else {
208 		tmp_csum = do_csum_data((uint16_t *)ip, sizeof (*ip));
209 		if (__predict_false((tmp_csum ^ 0xffff) != 0)) {
210 			cntl->lro_bad_csum++;
211 			return -1;
212 		}
213 	}
214 
215 	/* find the TCP header */
216 	tcp = (struct tcphdr *) (ip + 1);
217 
218 	/* Get the TCP checksum if we dont have it */
219 	if (!csum)
220 		csum = tcp->th_sum;
221 
222 	/* ensure no bits set besides ack or psh */
223 	if ((tcp->th_flags & ~(TH_ACK | TH_PUSH)) != 0)
224 		return -1;
225 
226 	/* check for timestamps. Since the only option we handle are
227 	   timestamps, we only have to handle the simple case of
228 	   aligned timestamps */
229 
230 	opt_bytes = (tcp->th_off << 2) - sizeof (*tcp);
231 	tcp_hdr_len =  sizeof (*tcp) + opt_bytes;
232 	ts_ptr = (uint32_t *)(tcp + 1);
233 	if (opt_bytes != 0) {
234 		if (__predict_false(opt_bytes != TCPOLEN_TSTAMP_APPA) ||
235 		    (*ts_ptr !=  ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16|
236 		    TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))
237 			return -1;
238 	}
239 
240 	ip_len = ntohs(ip->ip_len);
241 	tcp_data_len = ip_len - (tcp->th_off << 2) - sizeof (*ip);
242 
243 
244 	/*
245 	 * If frame is padded beyond the end of the IP packet,
246 	 * then we must trim the extra bytes off the end.
247 	 */
248 	tot_len = m_head->m_pkthdr.len;
249 	trim = tot_len - (ip_len + ETHER_HDR_LEN);
250 	if (trim != 0) {
251 		if (trim < 0) {
252 			/* truncated packet */
253 			return -1;
254 		}
255 		m_adj(m_head, -trim);
256 		tot_len = m_head->m_pkthdr.len;
257 	}
258 
259 	m_nxt = m_head;
260 	m_tail = NULL; /* -Wuninitialized */
261 	while (m_nxt != NULL) {
262 		m_tail = m_nxt;
263 		m_nxt = m_tail->m_next;
264 	}
265 
266 	hlen = ip_len + ETHER_HDR_LEN - tcp_data_len;
267 	seq = ntohl(tcp->th_seq);
268 
269 	SLIST_FOREACH(lro, &cntl->lro_active, next) {
270 		if (lro->source_port == tcp->th_sport &&
271 		    lro->dest_port == tcp->th_dport &&
272 		    lro->source_ip == ip->ip_src.s_addr &&
273 		    lro->dest_ip == ip->ip_dst.s_addr) {
274 			/* Flush now if appending will result in overflow. */
275 			if (lro->len > (65535 - tcp_data_len)) {
276 				SLIST_REMOVE(&cntl->lro_active, lro,
277 					     lro_entry, next);
278 				tcp_lro_flush(cntl, lro);
279 				break;
280 			}
281 
282 			/* Try to append it */
283 
284 			if (__predict_false(seq != lro->next_seq ||
285 				    (tcp_data_len == 0 &&
286 				    lro->ack_seq == tcp->th_ack))) {
287 				/* out of order packet or dup ack */
288 				SLIST_REMOVE(&cntl->lro_active, lro,
289 					     lro_entry, next);
290 				tcp_lro_flush(cntl, lro);
291 				return -1;
292 			}
293 
294 			if (opt_bytes) {
295 				uint32_t tsval = ntohl(*(ts_ptr + 1));
296 				/* make sure timestamp values are increasing */
297 				if (__predict_false(lro->tsval > tsval ||
298 					     *(ts_ptr + 2) == 0)) {
299 					return -1;
300 				}
301 				lro->tsval = tsval;
302 				lro->tsecr = *(ts_ptr + 2);
303 			}
304 
305 			lro->next_seq += tcp_data_len;
306 			lro->ack_seq = tcp->th_ack;
307 			lro->window = tcp->th_win;
308 			lro->append_cnt++;
309 			if (tcp_data_len == 0) {
310 				m_freem(m_head);
311 				return 0;
312 			}
313 			/* subtract off the checksum of the tcp header
314                          * from the hardware checksum, and add it to the
315                          * stored tcp data checksum.  Byteswap the checksum
316 			 * if the total length so far is odd
317                          */
318 			tmp_csum = do_csum_data((uint16_t*)tcp,
319 							 tcp_hdr_len);
320 			csum = csum + (tmp_csum ^ 0xffff);
321 			csum = (csum & 0xffff) + (csum >> 16);
322 			csum = (csum & 0xffff) + (csum >> 16);
323 			if (lro->len & 0x1) {
324 				/* Odd number of bytes so far, flip bytes */
325 				csum = ((csum << 8) | (csum >> 8)) & 0xffff;
326 			}
327 			csum = csum + lro->data_csum;
328 			csum = (csum & 0xffff) + (csum >> 16);
329 			csum = (csum & 0xffff) + (csum >> 16);
330 			lro->data_csum = csum;
331 
332 			lro->len += tcp_data_len;
333 
334 			/* adjust mbuf so that m->m_data points to
335 			   the first byte of the payload */
336 			m_adj(m_head, hlen);
337 			/* append mbuf chain */
338 			lro->m_tail->m_next = m_head;
339 			/* advance the last pointer */
340 			lro->m_tail = m_tail;
341 			/* flush packet if required */
342 			device_mtu = cntl->ifp->if_mtu;
343 			if (lro->len > (65535 - device_mtu)) {
344 				SLIST_REMOVE(&cntl->lro_active, lro,
345 					     lro_entry, next);
346 				tcp_lro_flush(cntl, lro);
347 			}
348 			return 0;
349 		}
350 	}
351 
352 	if (SLIST_EMPTY(&cntl->lro_free))
353 	    return -1;
354 
355 	/* start a new chain */
356 	lro = SLIST_FIRST(&cntl->lro_free);
357 	SLIST_REMOVE_HEAD(&cntl->lro_free, next);
358 	SLIST_INSERT_HEAD(&cntl->lro_active, lro, next);
359 	lro->source_port = tcp->th_sport;
360 	lro->dest_port = tcp->th_dport;
361 	lro->source_ip = ip->ip_src.s_addr;
362 	lro->dest_ip = ip->ip_dst.s_addr;
363 	lro->next_seq = seq + tcp_data_len;
364 	lro->mss = tcp_data_len;
365 	lro->ack_seq = tcp->th_ack;
366 	lro->window = tcp->th_win;
367 
368 	/* save the checksum of just the TCP payload by
369 	 * subtracting off the checksum of the TCP header from
370 	 * the entire hardware checksum
371 	 * Since IP header checksum is correct, checksum over
372 	 * the IP header is -0.  Substracting -0 is unnecessary.
373 	 */
374 	tmp_csum = do_csum_data((uint16_t*)tcp, tcp_hdr_len);
375 	csum = csum + (tmp_csum ^ 0xffff);
376 	csum = (csum & 0xffff) + (csum >> 16);
377 	csum = (csum & 0xffff) + (csum >> 16);
378 	lro->data_csum = csum;
379 
380 	lro->ip = ip;
381 	/* record timestamp if it is present */
382 	if (opt_bytes) {
383 		lro->timestamp = 1;
384 		lro->tsval = ntohl(*(ts_ptr + 1));
385 		lro->tsecr = *(ts_ptr + 2);
386 	}
387 	lro->len = tot_len;
388 	lro->m_head = m_head;
389 	lro->m_tail = m_tail;
390 	return 0;
391 }
392