xref: /illumos-gate/usr/src/uts/common/io/myri10ge/drv/myri10ge_lro.c (revision a3ef597505463b3755428d2fd18d4672af6e737a)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007-2009 Myricom, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include "myri10ge_var.h"
28 
/* Mask selecting the 13-bit fragment offset held in ip_off. */
#define	IP_OFFMASK		0x1fff

/* Kind and length bytes of the TCP timestamp option. */
#define	TCPOPT_TIMESTAMP	8
#define	TCPOLEN_TIMESTAMP	10

/*
 * Size of the option area when the timestamp option is preceded by
 * two NOPs, which is the only layout myri10ge_lro_rx() accepts.
 */
#define	TCPOLEN_TSTAMP_APPA	12
33 
34 
35 /*
36  * Assume len is a multiple of 4. Note that "raw" must be
37  * suitably aligned. In practice, it will always enter algned on
38  * at least a 4 bytes bounday, due to the alignment of our rx buffers.
39  */
40 uint16_t
myri10ge_csum_generic(uint16_t * raw,int len)41 myri10ge_csum_generic(uint16_t *raw, int len)
42 {
43 	uint32_t csum;
44 	csum = 0;
45 	while (len > 0) {
46 		csum += *raw;
47 		raw++;
48 		csum += *raw;
49 		raw++;
50 		len -= 4;
51 	}
52 	csum = (csum >> 16) + (csum & 0xffff);
53 	csum = (csum >> 16) + (csum & 0xffff);
54 	return ((uint16_t)csum);
55 }
56 
/*
 * Fold the sum of three 32-bit quantities into a 16-bit one's
 * complement sum.  The callers in this file pass the IP source
 * address, the IP destination address and htons(tcp length +
 * IPPROTO_TCP), which yields the TCP pseudo-header checksum.
 *
 * The 64-bit sum can be as large as 3 * 0xffffffff, so a fixed pair
 * of folds is not always sufficient: 0xffffffff + 0x10000 folds to
 * 0x1ffff and then to 0x10000, which the cast to uint16_t would
 * truncate to 0 rather than the correct 1.  Fold until no carry is
 * left above bit 15.
 */
static uint16_t
myri10ge_in_pseudo(unsigned int a, unsigned int b,
    unsigned int c)
{
	uint64_t csum;

	csum = (uint64_t)a + b + c;
	while ((csum >> 16) != 0)
		csum = (csum >> 16) + (csum & 0xffff);
	return ((uint16_t)csum);
}
68 
69 void
myri10ge_lro_flush(struct myri10ge_slice_state * ss,struct lro_entry * lro,struct myri10ge_mblk_list * mbl)70 myri10ge_lro_flush(struct myri10ge_slice_state *ss, struct lro_entry *lro,
71     struct myri10ge_mblk_list *mbl)
72 {
73 	struct ip *ip;
74 	struct tcphdr *tcp;
75 	uint32_t *ts_ptr;
76 	uint32_t tcplen, tcp_csum;
77 
78 	if (lro->append_cnt) {
79 		/*
80 		 * incorporate the new len into the ip header and
81 		 * re-calculate the checksum
82 		 */
83 		ip = lro->ip;
84 		ip->ip_len = htons(lro->len - ETHERNET_HEADER_SIZE);
85 		ip->ip_sum = 0;
86 		ip->ip_sum = 0xffff ^
87 		    myri10ge_csum_generic((uint16_t *)ip, sizeof (*ip));
88 		/* incorporate the latest ack into the tcp header */
89 		tcp = (struct tcphdr *)(ip + 1);
90 		tcp->th_ack = lro->ack_seq;
91 		tcp->th_win = lro->window;
92 		tcp->th_flags = lro->flags;
93 		/* incorporate latest timestamp into the tcp header */
94 		if (lro->timestamp) {
95 			ts_ptr = (uint32_t *)(tcp + 1);
96 			ts_ptr[1] = htonl(lro->tsval);
97 			ts_ptr[2] = lro->tsecr;
98 		}
99 		/*
100 		 * update checksum in tcp header by re-calculating the
101 		 * tcp pseudoheader checksum, and adding it to the checksum
102 		 * of the tcp payload data
103 		 */
104 		tcp->th_sum = 0;
105 		tcplen = lro->len - sizeof (*ip) - ETHERNET_HEADER_SIZE;
106 		tcp_csum = lro->data_csum;
107 		tcp_csum += myri10ge_in_pseudo(ip->ip_src.s_addr,
108 		    ip->ip_dst.s_addr, htons(tcplen + IPPROTO_TCP));
109 		tcp_csum += myri10ge_csum_generic((uint16_t *)tcp,
110 		    tcp->th_off << 2);
111 		tcp_csum = (tcp_csum & 0xffff) + (tcp_csum >> 16);
112 		tcp_csum = (tcp_csum & 0xffff) + (tcp_csum >> 16);
113 		tcp->th_sum = 0xffff ^ tcp_csum;
114 	}
115 
116 	mac_hcksum_set(lro->m_head, 0, 0, 0,
117 	    0, HCK_IPV4_HDRCKSUM_OK | HCK_FULLCKSUM_OK);
118 
119 	mbl->cnt += lro->append_cnt;
120 	myri10ge_mbl_append(ss, mbl, lro->m_head);
121 	MYRI10GE_SLICE_STAT_INC(lro_flushed);
122 	MYRI10GE_SLICE_STAT_ADD(lro_queued, lro->append_cnt + 1);
123 	lro->m_head = NULL;
124 	lro->timestamp = 0;
125 	lro->append_cnt = 0;
126 	lro->next = ss->lro_free;
127 	ss->lro_free = lro;
128 }
129 
130 int
myri10ge_lro_rx(struct myri10ge_slice_state * ss,mblk_t * m_head,uint32_t csum,struct myri10ge_mblk_list * mbl)131 myri10ge_lro_rx(struct myri10ge_slice_state *ss, mblk_t *m_head,
132     uint32_t csum, struct myri10ge_mblk_list *mbl)
133 {
134 	struct ether_header *eh;
135 	struct ip *ip;
136 	struct tcphdr *tcp;
137 	uint32_t *ts_ptr;
138 	struct lro_entry *lro, *curr;
139 	int hlen, ip_len, tcp_hdr_len, tcp_data_len;
140 	int opt_bytes, trim;
141 	int tot_len = MBLKL(m_head);
142 	uint32_t seq, tmp_csum;
143 
144 	eh = (struct ether_header *)(void *)m_head->b_rptr;
145 	if (eh->ether_type != htons(ETHERTYPE_IP))
146 		return (EINVAL);
147 	ip = (struct ip *)(void *)(eh + 1);
148 	if (ip->ip_p != IPPROTO_TCP)
149 		return (EINVAL);
150 
151 	/* ensure there are no options */
152 	if ((ip->ip_hl << 2) != sizeof (*ip))
153 		return (EINVAL);
154 
155 	/* .. and the packet is not fragmented */
156 	if (ip->ip_off & htons(IP_MF|IP_OFFMASK))
157 		return (EINVAL);
158 
159 	/* verify that the IP header checksum is correct */
160 	tmp_csum = myri10ge_csum_generic((uint16_t *)ip, sizeof (*ip));
161 	if (unlikely((tmp_csum ^ 0xffff) != 0)) {
162 		MYRI10GE_SLICE_STAT_INC(lro_bad_csum);
163 		return (EINVAL);
164 	}
165 
166 	/* find the TCP header */
167 	tcp = (struct tcphdr *)(ip + 1);
168 
169 	/* ensure no bits set besides ack or psh */
170 	if ((tcp->th_flags & ~(TH_ACK | TH_PUSH)) != 0)
171 		return (EINVAL);
172 
173 	/*
174 	 * check for timestamps. Since the only option we handle are
175 	 * timestamps, we only have to handle the simple case of
176 	 * aligned timestamps
177 	 */
178 
179 	opt_bytes = (tcp->th_off << 2) - sizeof (*tcp);
180 	tcp_hdr_len =  sizeof (*tcp) + opt_bytes;
181 	ts_ptr = (uint32_t *)(tcp + 1);
182 	if (opt_bytes != 0) {
183 		if (unlikely(opt_bytes != TCPOLEN_TSTAMP_APPA) ||
184 		    (*ts_ptr !=  ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16|
185 		    TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))
186 			return (EINVAL);
187 	}
188 
189 	ip_len = ntohs(ip->ip_len);
190 	tcp_data_len = ip_len - (tcp->th_off << 2) - sizeof (*ip);
191 
192 	/*
193 	 * If frame is padded beyond the end of the IP packet,
194 	 * then we must trim the extra bytes off the end.
195 	 */
196 	trim = tot_len - (ip_len + ETHERNET_HEADER_SIZE);
197 	if (trim != 0) {
198 		if (trim < 0) {
199 			/* truncated packet */
200 			return (EINVAL);
201 		}
202 		m_head->b_wptr -= trim;
203 		tot_len -= trim;
204 	}
205 
206 	/* Verify TCP checksum */
207 	csum = ntohs((uint16_t)csum);
208 	tmp_csum = csum + myri10ge_in_pseudo(ip->ip_src.s_addr,
209 	    ip->ip_dst.s_addr, htons(tcp_hdr_len + tcp_data_len + IPPROTO_TCP));
210 	tmp_csum = (tmp_csum & 0xffff) + (tmp_csum >> 16);
211 	tmp_csum = (tmp_csum & 0xffff) + (tmp_csum >> 16);
212 	if (tmp_csum != 0xffff) {
213 		MYRI10GE_SLICE_STAT_INC(lro_bad_csum);
214 		return (EINVAL);
215 	}
216 
217 	hlen = ip_len + ETHERNET_HEADER_SIZE - tcp_data_len;
218 	seq = ntohl(tcp->th_seq);
219 
220 	for (lro = ss->lro_active; lro != NULL; lro = lro->next) {
221 		if (lro->source_port == tcp->th_sport &&
222 		    lro->dest_port == tcp->th_dport &&
223 		    lro->source_ip == ip->ip_src.s_addr &&
224 		    lro->dest_ip == ip->ip_dst.s_addr) {
225 			/* Try to append it */
226 
227 			if (unlikely(seq != lro->next_seq)) {
228 				/* out of order packet */
229 				if (ss->lro_active == lro) {
230 					ss->lro_active = lro->next;
231 				} else {
232 					curr = ss->lro_active;
233 					while (curr->next != lro)
234 						curr = curr->next;
235 					curr->next = lro->next;
236 				}
237 				myri10ge_lro_flush(ss, lro, mbl);
238 				return (EINVAL);
239 			}
240 
241 			if (opt_bytes) {
242 				uint32_t tsval = ntohl(*(ts_ptr + 1));
243 				/* make sure timestamp values are increasing */
244 				if (unlikely(lro->tsval > tsval ||
245 				    *(ts_ptr + 2) == 0)) {
246 					return (-8);
247 				}
248 				lro->tsval = tsval;
249 				lro->tsecr = *(ts_ptr + 2);
250 			}
251 
252 			lro->next_seq += tcp_data_len;
253 			lro->ack_seq = tcp->th_ack;
254 			lro->window = tcp->th_win;
255 			lro->flags |= tcp->th_flags;
256 			lro->append_cnt++;
257 			if (tcp_data_len == 0) {
258 				freeb(m_head);
259 				return (0);
260 			}
261 			/*
262 			 * subtract off the checksum of the tcp header
263 			 * from the hardware checksum, and add it to
264 			 * the stored tcp data checksum.  Byteswap
265 			 * the checksum if the total length so far is
266 			 * odd
267 			 */
268 			tmp_csum = myri10ge_csum_generic((uint16_t *)tcp,
269 			    tcp_hdr_len);
270 			csum = csum + (tmp_csum ^ 0xffff);
271 			csum = (csum & 0xffff) + (csum >> 16);
272 			csum = (csum & 0xffff) + (csum >> 16);
273 			if (lro->len & 0x1) {
274 				/* Odd number of bytes so far, flip bytes */
275 				csum = ((csum << 8) | (csum >> 8)) & 0xffff;
276 			}
277 			csum = csum + lro->data_csum;
278 			csum = (csum & 0xffff) + (csum >> 16);
279 			csum = (csum & 0xffff) + (csum >> 16);
280 			lro->data_csum = csum;
281 
282 			lro->len += tcp_data_len;
283 
284 			/*
285 			 * adjust mblk so that rptr points to
286 			 * the first byte of the payload
287 			 */
288 			m_head->b_rptr += hlen;
289 			/* append mbuf chain */
290 			lro->m_tail->b_cont = m_head;
291 			/* advance the last pointer */
292 			lro->m_tail = m_head;
293 			/* flush packet if required */
294 			if (lro->len > (65535 - myri10ge_mtu) ||
295 			    (lro->append_cnt + 1) == myri10ge_lro_max_aggr) {
296 				if (ss->lro_active == lro) {
297 					ss->lro_active = lro->next;
298 				} else {
299 					curr = ss->lro_active;
300 					while (curr->next != lro)
301 						curr = curr->next;
302 					curr->next = lro->next;
303 				}
304 				myri10ge_lro_flush(ss, lro, mbl);
305 			}
306 			return (0);
307 		}
308 	}
309 
310 	if (ss->lro_free == NULL)
311 		return (ENOMEM);
312 
313 	/* start a new chain */
314 	lro = ss->lro_free;
315 	ss->lro_free = lro->next;
316 	lro->next = ss->lro_active;
317 	ss->lro_active = lro;
318 	lro->source_port = tcp->th_sport;
319 	lro->dest_port = tcp->th_dport;
320 	lro->source_ip = ip->ip_src.s_addr;
321 	lro->dest_ip = ip->ip_dst.s_addr;
322 	lro->next_seq = seq + tcp_data_len;
323 	lro->mss = (uint16_t)tcp_data_len;
324 	lro->ack_seq = tcp->th_ack;
325 	lro->window = tcp->th_win;
326 	lro->flags = tcp->th_flags;
327 
328 	/*
329 	 * save the checksum of just the TCP payload by
330 	 * subtracting off the checksum of the TCP header from
331 	 * the entire hardware checksum
332 	 * Since IP header checksum is correct, checksum over
333 	 * the IP header is -0.  Substracting -0 is unnecessary.
334 	 */
335 	tmp_csum = myri10ge_csum_generic((uint16_t *)tcp, tcp_hdr_len);
336 	csum = csum + (tmp_csum ^ 0xffff);
337 	csum = (csum & 0xffff) + (csum >> 16);
338 	csum = (csum & 0xffff) + (csum >> 16);
339 	lro->data_csum = csum;
340 	lro->ip = ip;
341 
342 	/* record timestamp if it is present */
343 	if (opt_bytes) {
344 		lro->timestamp = 1;
345 		lro->tsval = ntohl(*(ts_ptr + 1));
346 		lro->tsecr = *(ts_ptr + 2);
347 	}
348 	lro->len = tot_len;
349 	lro->m_head = m_head;
350 	lro->m_tail = m_head;
351 	return (0);
352 }
353 
354 /*
355  *  This file uses MyriGE driver indentation.
356  *
357  * Local Variables:
358  * c-file-style:"sun"
359  * tab-width:8
360  * End:
361  */
362