xref: /illumos-gate/usr/src/uts/common/io/myri10ge/drv/myri10ge_lro.c (revision fe072f421ec51952432306add7d50852ad1921b2)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007-2009 Myricom, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
/*
 * Revision-control identification string for this file.  It is only
 * defined when "lint" is not defined.
 */
#ifndef lint
static const char __idstring[] =
	"@(#)$Id: myri10ge_lro.c,v 1.7 2009-06-29 13:47:22 gallatin Exp $";
#endif
31 
32 #include "myri10ge_var.h"
33 
/* mask ORed with IP_MF and tested against ip_off to detect fragments */
#define	IP_OFFMASK 0x1fff
/* TCP option kind byte expected for the timestamp option */
#define	TCPOPT_TIMESTAMP 8
/* TCP option length byte expected for the timestamp option */
#define	TCPOLEN_TIMESTAMP 10
/* only TCP option size accepted by LRO: NOP, NOP, timestamp option */
#define	TCPOLEN_TSTAMP_APPA 12
38 
39 
40 /*
41  * Assume len is a multiple of 4. Note that "raw" must be
42  * suitably aligned. In practice, it will always enter algned on
43  * at least a 4 bytes bounday, due to the alignment of our rx buffers.
44  */
45 uint16_t
46 myri10ge_csum_generic(uint16_t *raw, int len)
47 {
48 	uint32_t csum;
49 	csum = 0;
50 	while (len > 0) {
51 		csum += *raw;
52 		raw++;
53 		csum += *raw;
54 		raw++;
55 		len -= 4;
56 	}
57 	csum = (csum >> 16) + (csum & 0xffff);
58 	csum = (csum >> 16) + (csum & 0xffff);
59 	return ((uint16_t)csum);
60 }
61 
/*
 * Add the three 32-bit operands (as used here: source address,
 * destination address, and the length/protocol word of the TCP
 * pseudo-header) and reduce the total to a 16-bit ones-complement
 * sum.  The result is NOT complemented.
 *
 * The 64-bit total of three 32-bit values can need more than two
 * folds before it fits in 16 bits (e.g. 0xffff0000 + 0x0001ffff
 * leaves 0x10000 after two folds), so keep folding until no carry
 * remains rather than truncating a leftover carry away.
 */
static uint16_t
myri10ge_in_pseudo(unsigned int a, unsigned int b,
    unsigned int c)
{
	uint64_t csum;

	csum = (uint64_t)a + b + c;
	while ((csum >> 16) != 0)
		csum = (csum >> 16) + (csum & 0xffff);
	return ((uint16_t)csum);
}
73 
74 void
75 myri10ge_lro_flush(struct myri10ge_slice_state *ss, struct lro_entry *lro,
76 	struct myri10ge_mblk_list *mbl)
77 {
78 	struct ip *ip;
79 	struct tcphdr *tcp;
80 	uint32_t *ts_ptr;
81 	uint32_t tcplen, tcp_csum;
82 
83 	if (lro->append_cnt) {
84 		/*
85 		 * incorporate the new len into the ip header and
86 		 * re-calculate the checksum
87 		 */
88 		ip = lro->ip;
89 		ip->ip_len = htons(lro->len - ETHERNET_HEADER_SIZE);
90 		ip->ip_sum = 0;
91 		ip->ip_sum = 0xffff ^
92 		    myri10ge_csum_generic((uint16_t *)ip, sizeof (*ip));
93 		/* incorporate the latest ack into the tcp header */
94 		tcp = (struct tcphdr *)(ip + 1);
95 		tcp->th_ack = lro->ack_seq;
96 		tcp->th_win = lro->window;
97 		tcp->th_flags = lro->flags;
98 		/* incorporate latest timestamp into the tcp header */
99 		if (lro->timestamp) {
100 			ts_ptr = (uint32_t *)(tcp + 1);
101 			ts_ptr[1] = htonl(lro->tsval);
102 			ts_ptr[2] = lro->tsecr;
103 		}
104 		/*
105 		 * update checksum in tcp header by re-calculating the
106 		 * tcp pseudoheader checksum, and adding it to the checksum
107 		 * of the tcp payload data
108 		 */
109 		tcp->th_sum = 0;
110 		tcplen = lro->len - sizeof (*ip) - ETHERNET_HEADER_SIZE;
111 		tcp_csum = lro->data_csum;
112 		tcp_csum += myri10ge_in_pseudo(ip->ip_src.s_addr,
113 		    ip->ip_dst.s_addr, htons(tcplen + IPPROTO_TCP));
114 		tcp_csum += myri10ge_csum_generic((uint16_t *)tcp,
115 		    tcp->th_off << 2);
116 		tcp_csum = (tcp_csum & 0xffff) + (tcp_csum >> 16);
117 		tcp_csum = (tcp_csum & 0xffff) + (tcp_csum >> 16);
118 		tcp->th_sum = 0xffff ^ tcp_csum;
119 	}
120 
121 	(void) hcksum_assoc(lro->m_head, NULL, NULL, 0, 0, 0,
122 	    0, HCK_IPV4_HDRCKSUM | HCK_FULLCKSUM | HCK_FULLCKSUM_OK, 0);
123 
124 	mbl->cnt += lro->append_cnt;
125 	myri10ge_mbl_append(ss, mbl, lro->m_head);
126 	MYRI10GE_SLICE_STAT_INC(lro_flushed);
127 	MYRI10GE_SLICE_STAT_ADD(lro_queued, lro->append_cnt + 1);
128 	lro->m_head = NULL;
129 	lro->timestamp = 0;
130 	lro->append_cnt = 0;
131 	lro->next = ss->lro_free;
132 	ss->lro_free = lro;
133 }
134 
135 int
136 myri10ge_lro_rx(struct myri10ge_slice_state *ss, mblk_t *m_head,
137 		uint32_t csum, struct myri10ge_mblk_list *mbl)
138 {
139 	struct ether_header *eh;
140 	struct ip *ip;
141 	struct tcphdr *tcp;
142 	uint32_t *ts_ptr;
143 	struct lro_entry *lro, *curr;
144 	int hlen, ip_len, tcp_hdr_len, tcp_data_len;
145 	int opt_bytes, trim;
146 	int tot_len = MBLKL(m_head);
147 	uint32_t seq, tmp_csum;
148 
149 	eh = (struct ether_header *)(void *)m_head->b_rptr;
150 	if (eh->ether_type != htons(ETHERTYPE_IP))
151 		return (EINVAL);
152 	ip = (struct ip *)(void *)(eh + 1);
153 	if (ip->ip_p != IPPROTO_TCP)
154 		return (EINVAL);
155 
156 	/* ensure there are no options */
157 	if ((ip->ip_hl << 2) != sizeof (*ip))
158 		return (EINVAL);
159 
160 	/* .. and the packet is not fragmented */
161 	if (ip->ip_off & htons(IP_MF|IP_OFFMASK))
162 		return (EINVAL);
163 
164 	/* verify that the IP header checksum is correct */
165 	tmp_csum = myri10ge_csum_generic((uint16_t *)ip, sizeof (*ip));
166 	if (unlikely((tmp_csum ^ 0xffff) != 0)) {
167 		MYRI10GE_SLICE_STAT_INC(lro_bad_csum);
168 		return (EINVAL);
169 	}
170 
171 	/* find the TCP header */
172 	tcp = (struct tcphdr *)(ip + 1);
173 
174 	/* ensure no bits set besides ack or psh */
175 	if ((tcp->th_flags & ~(TH_ACK | TH_PUSH)) != 0)
176 		return (EINVAL);
177 
178 	/*
179 	 * check for timestamps. Since the only option we handle are
180 	 * timestamps, we only have to handle the simple case of
181 	 * aligned timestamps
182 	 */
183 
184 	opt_bytes = (tcp->th_off << 2) - sizeof (*tcp);
185 	tcp_hdr_len =  sizeof (*tcp) + opt_bytes;
186 	ts_ptr = (uint32_t *)(tcp + 1);
187 	if (opt_bytes != 0) {
188 		if (unlikely(opt_bytes != TCPOLEN_TSTAMP_APPA) ||
189 		    (*ts_ptr !=  ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16|
190 		    TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))
191 			return (EINVAL);
192 	}
193 
194 	ip_len = ntohs(ip->ip_len);
195 	tcp_data_len = ip_len - (tcp->th_off << 2) - sizeof (*ip);
196 
197 	/*
198 	 * If frame is padded beyond the end of the IP packet,
199 	 * then we must trim the extra bytes off the end.
200 	 */
201 	trim = tot_len - (ip_len + ETHERNET_HEADER_SIZE);
202 	if (trim != 0) {
203 		if (trim < 0) {
204 			/* truncated packet */
205 			return (EINVAL);
206 		}
207 		m_head->b_wptr -= trim;
208 		tot_len -= trim;
209 	}
210 
211 	/* Verify TCP checksum */
212 	csum = ntohs((uint16_t)csum);
213 	tmp_csum = csum + myri10ge_in_pseudo(ip->ip_src.s_addr,
214 	    ip->ip_dst.s_addr, htons(tcp_hdr_len + tcp_data_len + IPPROTO_TCP));
215 	tmp_csum = (tmp_csum & 0xffff) + (tmp_csum >> 16);
216 	tmp_csum = (tmp_csum & 0xffff) + (tmp_csum >> 16);
217 	if (tmp_csum != 0xffff) {
218 		MYRI10GE_SLICE_STAT_INC(lro_bad_csum);
219 		return (EINVAL);
220 	}
221 
222 	hlen = ip_len + ETHERNET_HEADER_SIZE - tcp_data_len;
223 	seq = ntohl(tcp->th_seq);
224 
225 	for (lro = ss->lro_active; lro != NULL; lro = lro->next) {
226 		if (lro->source_port == tcp->th_sport &&
227 		    lro->dest_port == tcp->th_dport &&
228 		    lro->source_ip == ip->ip_src.s_addr &&
229 		    lro->dest_ip == ip->ip_dst.s_addr) {
230 			/* Try to append it */
231 
232 			if (unlikely(seq != lro->next_seq)) {
233 				/* out of order packet */
234 				if (ss->lro_active == lro) {
235 					ss->lro_active = lro->next;
236 				} else {
237 					curr = ss->lro_active;
238 					while (curr->next != lro)
239 						curr = curr->next;
240 					curr->next = lro->next;
241 				}
242 				myri10ge_lro_flush(ss, lro, mbl);
243 				return (EINVAL);
244 			}
245 
246 			if (opt_bytes) {
247 				uint32_t tsval = ntohl(*(ts_ptr + 1));
248 				/* make sure timestamp values are increasing */
249 				if (unlikely(lro->tsval > tsval ||
250 				    *(ts_ptr + 2) == 0)) {
251 					return (-8);
252 				}
253 				lro->tsval = tsval;
254 				lro->tsecr = *(ts_ptr + 2);
255 			}
256 
257 			lro->next_seq += tcp_data_len;
258 			lro->ack_seq = tcp->th_ack;
259 			lro->window = tcp->th_win;
260 			lro->flags |= tcp->th_flags;
261 			lro->append_cnt++;
262 			if (tcp_data_len == 0) {
263 				freeb(m_head);
264 				return (0);
265 			}
266 			/*
267 			 * subtract off the checksum of the tcp header
268 			 * from the hardware checksum, and add it to
269 			 * the stored tcp data checksum.  Byteswap
270 			 * the checksum if the total length so far is
271 			 * odd
272 			 */
273 			tmp_csum = myri10ge_csum_generic((uint16_t *)tcp,
274 			    tcp_hdr_len);
275 			csum = csum + (tmp_csum ^ 0xffff);
276 			csum = (csum & 0xffff) + (csum >> 16);
277 			csum = (csum & 0xffff) + (csum >> 16);
278 			if (lro->len & 0x1) {
279 				/* Odd number of bytes so far, flip bytes */
280 				csum = ((csum << 8) | (csum >> 8)) & 0xffff;
281 			}
282 			csum = csum + lro->data_csum;
283 			csum = (csum & 0xffff) + (csum >> 16);
284 			csum = (csum & 0xffff) + (csum >> 16);
285 			lro->data_csum = csum;
286 
287 			lro->len += tcp_data_len;
288 
289 			/*
290 			 * adjust mblk so that rptr points to
291 			 * the first byte of the payload
292 			 */
293 			m_head->b_rptr += hlen;
294 			/* append mbuf chain */
295 			lro->m_tail->b_cont = m_head;
296 			/* advance the last pointer */
297 			lro->m_tail = m_head;
298 			/* flush packet if required */
299 			if (lro->len > (65535 - myri10ge_mtu) ||
300 			    (lro->append_cnt + 1) == myri10ge_lro_max_aggr) {
301 				if (ss->lro_active == lro) {
302 					ss->lro_active = lro->next;
303 				} else {
304 					curr = ss->lro_active;
305 					while (curr->next != lro)
306 						curr = curr->next;
307 					curr->next = lro->next;
308 				}
309 				myri10ge_lro_flush(ss, lro, mbl);
310 			}
311 			return (0);
312 		}
313 	}
314 
315 	if (ss->lro_free == NULL)
316 		return (ENOMEM);
317 
318 	/* start a new chain */
319 	lro = ss->lro_free;
320 	ss->lro_free = lro->next;
321 	lro->next = ss->lro_active;
322 	ss->lro_active = lro;
323 	lro->source_port = tcp->th_sport;
324 	lro->dest_port = tcp->th_dport;
325 	lro->source_ip = ip->ip_src.s_addr;
326 	lro->dest_ip = ip->ip_dst.s_addr;
327 	lro->next_seq = seq + tcp_data_len;
328 	lro->mss = (uint16_t)tcp_data_len;
329 	lro->ack_seq = tcp->th_ack;
330 	lro->window = tcp->th_win;
331 	lro->flags = tcp->th_flags;
332 
333 	/*
334 	 * save the checksum of just the TCP payload by
335 	 * subtracting off the checksum of the TCP header from
336 	 * the entire hardware checksum
337 	 * Since IP header checksum is correct, checksum over
338 	 * the IP header is -0.  Substracting -0 is unnecessary.
339 	 */
340 	tmp_csum = myri10ge_csum_generic((uint16_t *)tcp, tcp_hdr_len);
341 	csum = csum + (tmp_csum ^ 0xffff);
342 	csum = (csum & 0xffff) + (csum >> 16);
343 	csum = (csum & 0xffff) + (csum >> 16);
344 	lro->data_csum = csum;
345 	lro->ip = ip;
346 
347 	/* record timestamp if it is present */
348 	if (opt_bytes) {
349 		lro->timestamp = 1;
350 		lro->tsval = ntohl(*(ts_ptr + 1));
351 		lro->tsecr = *(ts_ptr + 2);
352 	}
353 	lro->len = tot_len;
354 	lro->m_head = m_head;
355 	lro->m_tail = m_head;
356 	return (0);
357 }
358 
359 /*
360  *  This file uses MyriGE driver indentation.
361  *
362  * Local Variables:
363  * c-file-style:"sun"
364  * tab-width:8
365  * End:
366  */
367