xref: /freebsd/sys/netinet/tcp_lro.c (revision 4a5216a6dc0c3ce4cf5f2d3ee8af0c3ff3402c4f)
1 /******************************************************************************
2 
3 Copyright (c) 2007, Myricom Inc.
4 Copyright (c) 2008, Intel Corporation.
5 All rights reserved.
6 
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions are met:
9 
10  1. Redistributions of source code must retain the above copyright notice,
11     this list of conditions and the following disclaimer.
12 
13  2. Neither the name of the Myricom Inc, nor the names of its
14     contributors may be used to endorse or promote products derived from
15     this software without specific prior written permission.
16 
17  3. Neither the name of the Intel Corporation, nor the names of its
18     contributors may be used to endorse or promote products derived from
19     this software without specific prior written permission.
20 
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
25 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 POSSIBILITY OF SUCH DAMAGE.
32 
33 $FreeBSD$
34 ***************************************************************************/
35 
36 #include <sys/param.h>
37 #include <sys/systm.h>
38 #include <sys/endian.h>
39 #include <sys/mbuf.h>
40 #include <sys/kernel.h>
41 #include <sys/socket.h>
42 
43 #include <net/if.h>
44 #include <net/ethernet.h>
45 #include <net/if_media.h>
46 
47 #include <netinet/in_systm.h>
48 #include <netinet/in.h>
49 #include <netinet/ip.h>
50 #include <netinet/tcp.h>
51 #include <netinet/tcp_lro.h>
52 
53 #include <machine/bus.h>
54 #include <machine/in_cksum.h>
55 
56 
/*
 * Add up `len' bytes starting at `raw' as native-order 16-bit words and
 * fold the carries back into the low 16 bits.  The result is the plain
 * (not complemented) ones'-complement sum.
 *
 * Two words are consumed per iteration, so a `len' that is not a multiple
 * of four is effectively rounded up to the next multiple of four bytes.
 */
static uint16_t do_csum_data(uint16_t *raw, int len)
{
	uint32_t acc = 0;
	int remaining;

	for (remaining = len; remaining > 0; remaining -= 4) {
		acc += raw[0];
		acc += raw[1];
		raw += 2;
	}

	/* Fold twice: the first fold can itself produce a carry. */
	acc = (acc & 0xffff) + (acc >> 16);
	acc = (acc & 0xffff) + (acc >> 16);

	return (uint16_t)acc;
}
72 
73 /*
74  * Allocate and init the LRO data structures
75  */
76 int
77 tcp_lro_init(struct lro_ctrl *cntl)
78 {
79 	struct lro_entry *lro;
80 	int i, error = 0;
81 
82 	SLIST_INIT(&cntl->lro_free);
83 	SLIST_INIT(&cntl->lro_active);
84 
85 	cntl->lro_bad_csum = 0;
86 	cntl->lro_queued = 0;
87 	cntl->lro_flushed = 0;
88 
89 	for (i = 0; i < LRO_ENTRIES; i++) {
90                 lro = (struct lro_entry *) malloc(sizeof (struct lro_entry),
91 		    M_DEVBUF, M_NOWAIT | M_ZERO);
92                 if (lro == NULL) {
93 			if (i == 0)
94 				error = ENOMEM;
95                         break;
96                 }
97 		cntl->lro_cnt = i;
98                 SLIST_INSERT_HEAD(&cntl->lro_free, lro, next);
99         }
100 
101 	return (error);
102 }
103 
104 void
105 tcp_lro_free(struct lro_ctrl *cntl)
106 {
107 	struct lro_entry *entry;
108 
109 	while (!SLIST_EMPTY(&cntl->lro_free)) {
110 		entry = SLIST_FIRST(&cntl->lro_free);
111                	SLIST_REMOVE_HEAD(&cntl->lro_free, next);
112 		free(entry, M_DEVBUF);
113 	}
114 }
115 
116 void
117 tcp_lro_flush(struct lro_ctrl *cntl, struct lro_entry *lro)
118 {
119 	struct ifnet *ifp;
120 	struct ip *ip;
121 	struct tcphdr *tcp;
122 	uint32_t *ts_ptr;
123 	uint32_t tcplen, tcp_csum;
124 
125 
126 	if (lro->append_cnt) {
127 		/* incorporate the new len into the ip header and
128 		 * re-calculate the checksum */
129 		ip = lro->ip;
130 		ip->ip_len = htons(lro->len - ETHER_HDR_LEN);
131 		ip->ip_sum = 0;
132 		ip->ip_sum = 0xffff ^
133 			do_csum_data((uint16_t*)ip,
134 					      sizeof (*ip));
135 
136 		lro->m_head->m_pkthdr.csum_flags = CSUM_IP_CHECKED |
137 			CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
138 		lro->m_head->m_pkthdr.csum_data = 0xffff;
139 		lro->m_head->m_pkthdr.len = lro->len;
140 
141 		/* incorporate the latest ack into the tcp header */
142 		tcp = (struct tcphdr *) (ip + 1);
143 		tcp->th_ack = lro->ack_seq;
144 		tcp->th_win = lro->window;
145 		/* incorporate latest timestamp into the tcp header */
146 		if (lro->timestamp) {
147 			ts_ptr = (uint32_t *)(tcp + 1);
148 			ts_ptr[1] = htonl(lro->tsval);
149 			ts_ptr[2] = lro->tsecr;
150 		}
151 		/*
152 		 * update checksum in tcp header by re-calculating the
153 		 * tcp pseudoheader checksum, and adding it to the checksum
154 		 * of the tcp payload data
155 		 */
156 		tcp->th_sum = 0;
157 		tcplen = lro->len - sizeof(*ip) - ETHER_HDR_LEN;
158 		tcp_csum = lro->data_csum;
159 		tcp_csum += in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
160 				      htons(tcplen + IPPROTO_TCP));
161 		tcp_csum += do_csum_data((uint16_t*)tcp,
162 						  tcp->th_off << 2);
163 		tcp_csum = (tcp_csum & 0xffff) + (tcp_csum >> 16);
164 		tcp_csum = (tcp_csum & 0xffff) + (tcp_csum >> 16);
165 		tcp->th_sum = 0xffff ^ tcp_csum;
166 	}
167 	ifp = cntl->ifp;
168 	(*ifp->if_input)(cntl->ifp, lro->m_head);
169 	cntl->lro_queued += lro->append_cnt + 1;
170 	cntl->lro_flushed++;
171 	lro->m_head = NULL;
172 	lro->timestamp = 0;
173 	lro->append_cnt = 0;
174 	SLIST_INSERT_HEAD(&cntl->lro_free, lro, next);
175 }
176 
177 int
178 tcp_lro_rx(struct lro_ctrl *cntl, struct mbuf *m_head, uint32_t csum)
179 {
180 	struct ether_header *eh;
181 	struct ip *ip;
182 	struct tcphdr *tcp;
183 	uint32_t *ts_ptr;
184 	struct mbuf *m_nxt, *m_tail;
185 	struct lro_entry *lro;
186 	int hlen, ip_len, tcp_hdr_len, tcp_data_len, tot_len;
187 	int opt_bytes, trim, csum_flags;
188 	uint32_t seq, tmp_csum, device_mtu;
189 
190 
191 	eh = mtod(m_head, struct ether_header *);
192 	if (eh->ether_type != htons(ETHERTYPE_IP))
193 		return 1;
194 	ip = (struct ip *) (eh + 1);
195 	if (ip->ip_p != IPPROTO_TCP)
196 		return 1;
197 
198 	/* ensure there are no options */
199 	if ((ip->ip_hl << 2) != sizeof (*ip))
200 		return -1;
201 
202 	/* .. and the packet is not fragmented */
203 	if (ip->ip_off & htons(IP_MF|IP_OFFMASK))
204 		return -1;
205 
206 	/* verify that the IP header checksum is correct */
207 	csum_flags = m_head->m_pkthdr.csum_flags;
208 	if (csum_flags & CSUM_IP_CHECKED) {
209 		if (__predict_false((csum_flags & CSUM_IP_VALID) == 0)) {
210 			cntl->lro_bad_csum++;
211 			return -1;
212 		}
213 	} else {
214 		tmp_csum = do_csum_data((uint16_t *)ip, sizeof (*ip));
215 		if (__predict_false((tmp_csum ^ 0xffff) != 0)) {
216 			cntl->lro_bad_csum++;
217 			return -1;
218 		}
219 	}
220 
221 	/* find the TCP header */
222 	tcp = (struct tcphdr *) (ip + 1);
223 
224 	/* Get the TCP checksum if we dont have it */
225 	if (!csum)
226 		csum = tcp->th_sum;
227 
228 	/* ensure no bits set besides ack or psh */
229 	if ((tcp->th_flags & ~(TH_ACK | TH_PUSH)) != 0)
230 		return -1;
231 
232 	/* check for timestamps. Since the only option we handle are
233 	   timestamps, we only have to handle the simple case of
234 	   aligned timestamps */
235 
236 	opt_bytes = (tcp->th_off << 2) - sizeof (*tcp);
237 	tcp_hdr_len =  sizeof (*tcp) + opt_bytes;
238 	ts_ptr = (uint32_t *)(tcp + 1);
239 	if (opt_bytes != 0) {
240 		if (__predict_false(opt_bytes != TCPOLEN_TSTAMP_APPA) ||
241 		    (*ts_ptr !=  ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16|
242 		    TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))
243 			return -1;
244 	}
245 
246 	ip_len = ntohs(ip->ip_len);
247 	tcp_data_len = ip_len - (tcp->th_off << 2) - sizeof (*ip);
248 
249 
250 	/*
251 	 * If frame is padded beyond the end of the IP packet,
252 	 * then we must trim the extra bytes off the end.
253 	 */
254 	tot_len = m_head->m_pkthdr.len;
255 	trim = tot_len - (ip_len + ETHER_HDR_LEN);
256 	if (trim != 0) {
257 		if (trim < 0) {
258 			/* truncated packet */
259 			return -1;
260 		}
261 		m_adj(m_head, -trim);
262 		tot_len = m_head->m_pkthdr.len;
263 	}
264 
265 	m_nxt = m_head;
266 	m_tail = NULL; /* -Wuninitialized */
267 	while (m_nxt != NULL) {
268 		m_tail = m_nxt;
269 		m_nxt = m_tail->m_next;
270 	}
271 
272 	hlen = ip_len + ETHER_HDR_LEN - tcp_data_len;
273 	seq = ntohl(tcp->th_seq);
274 
275 	SLIST_FOREACH(lro, &cntl->lro_active, next) {
276 		if (lro->source_port == tcp->th_sport &&
277 		    lro->dest_port == tcp->th_dport &&
278 		    lro->source_ip == ip->ip_src.s_addr &&
279 		    lro->dest_ip == ip->ip_dst.s_addr) {
280 			/* Try to append it */
281 
282 			if (__predict_false(seq != lro->next_seq)) {
283 				/* out of order packet */
284 				SLIST_REMOVE(&cntl->lro_active, lro,
285 					     lro_entry, next);
286 				tcp_lro_flush(cntl, lro);
287 				return -1;
288 			}
289 
290 			if (opt_bytes) {
291 				uint32_t tsval = ntohl(*(ts_ptr + 1));
292 				/* make sure timestamp values are increasing */
293 				if (__predict_false(lro->tsval > tsval ||
294 					     *(ts_ptr + 2) == 0)) {
295 					return -1;
296 				}
297 				lro->tsval = tsval;
298 				lro->tsecr = *(ts_ptr + 2);
299 			}
300 
301 			lro->next_seq += tcp_data_len;
302 			lro->ack_seq = tcp->th_ack;
303 			lro->window = tcp->th_win;
304 			lro->append_cnt++;
305 			if (tcp_data_len == 0) {
306 				m_freem(m_head);
307 				return 0;
308 			}
309 			/* subtract off the checksum of the tcp header
310                          * from the hardware checksum, and add it to the
311                          * stored tcp data checksum.  Byteswap the checksum
312 			 * if the total length so far is odd
313                          */
314 			tmp_csum = do_csum_data((uint16_t*)tcp,
315 							 tcp_hdr_len);
316 			csum = csum + (tmp_csum ^ 0xffff);
317 			csum = (csum & 0xffff) + (csum >> 16);
318 			csum = (csum & 0xffff) + (csum >> 16);
319 			if (lro->len & 0x1) {
320 				/* Odd number of bytes so far, flip bytes */
321 				csum = ((csum << 8) | (csum >> 8)) & 0xffff;
322 			}
323 			csum = csum + lro->data_csum;
324 			csum = (csum & 0xffff) + (csum >> 16);
325 			csum = (csum & 0xffff) + (csum >> 16);
326 			lro->data_csum = csum;
327 
328 			lro->len += tcp_data_len;
329 
330 			/* adjust mbuf so that m->m_data points to
331 			   the first byte of the payload */
332 			m_adj(m_head, hlen);
333 			/* append mbuf chain */
334 			lro->m_tail->m_next = m_head;
335 			/* advance the last pointer */
336 			lro->m_tail = m_tail;
337 			/* flush packet if required */
338 			device_mtu = cntl->ifp->if_mtu;
339 			if (lro->len > (65535 - device_mtu)) {
340 				SLIST_REMOVE(&cntl->lro_active, lro,
341 					     lro_entry, next);
342 				tcp_lro_flush(cntl, lro);
343 			}
344 			return 0;
345 		}
346 	}
347 
348 	if (SLIST_EMPTY(&cntl->lro_free))
349 	    return -1;
350 
351 	/* start a new chain */
352 	lro = SLIST_FIRST(&cntl->lro_free);
353 	SLIST_REMOVE_HEAD(&cntl->lro_free, next);
354 	SLIST_INSERT_HEAD(&cntl->lro_active, lro, next);
355 	lro->source_port = tcp->th_sport;
356 	lro->dest_port = tcp->th_dport;
357 	lro->source_ip = ip->ip_src.s_addr;
358 	lro->dest_ip = ip->ip_dst.s_addr;
359 	lro->next_seq = seq + tcp_data_len;
360 	lro->mss = tcp_data_len;
361 	lro->ack_seq = tcp->th_ack;
362 	lro->window = tcp->th_win;
363 
364 	/* save the checksum of just the TCP payload by
365 	 * subtracting off the checksum of the TCP header from
366 	 * the entire hardware checksum
367 	 * Since IP header checksum is correct, checksum over
368 	 * the IP header is -0.  Substracting -0 is unnecessary.
369 	 */
370 	tmp_csum = do_csum_data((uint16_t*)tcp, tcp_hdr_len);
371 	csum = csum + (tmp_csum ^ 0xffff);
372 	csum = (csum & 0xffff) + (csum >> 16);
373 	csum = (csum & 0xffff) + (csum >> 16);
374 	lro->data_csum = csum;
375 
376 	lro->ip = ip;
377 	/* record timestamp if it is present */
378 	if (opt_bytes) {
379 		lro->timestamp = 1;
380 		lro->tsval = ntohl(*(ts_ptr + 1));
381 		lro->tsecr = *(ts_ptr + 2);
382 	}
383 	lro->len = tot_len;
384 	lro->m_head = m_head;
385 	lro->m_tail = m_tail;
386 	return 0;
387 }
388