xref: /freebsd/sys/netinet/tcp_lro_hpts.c (revision bfdd5b643d23171c53920accc2f15f78e984dfae)
1 /*-
2  * Copyright (c) 2016-2018 Netflix, Inc.
3  * Copyright (c) 2016-2021 Mellanox Technologies.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  *
26  */
27 #include <sys/cdefs.h>
28 #include "opt_inet.h"
29 #include "opt_inet6.h"
30 
31 #include <sys/param.h>
32 #include <sys/bus.h>
33 #include <sys/interrupt.h>
34 #include <sys/systm.h>
35 #include <sys/kernel.h>
36 #include <sys/malloc.h>
37 #include <sys/mbuf.h>
38 #include <sys/socket.h>
39 #include <sys/socketvar.h>
40 #include <sys/sysctl.h>
41 
42 #include <net/if.h>
43 #include <net/if_var.h>
44 #include <net/if_private.h>
45 #include <net/ethernet.h>
46 #include <net/bpf.h>
47 #include <net/vnet.h>
48 #include <net/if_dl.h>
49 #include <net/if_media.h>
50 #include <net/if_types.h>
51 #include <net/infiniband.h>
52 #include <net/if_lagg.h>
53 #include <net/pfil.h>
54 
55 #include <netinet/in.h>
56 #include <netinet/in_kdtrace.h>
57 #include <netinet/ip6.h>
58 #include <netinet/ip.h>
59 #include <netinet/ip_var.h>
60 #include <netinet/in_pcb.h>
61 #include <netinet6/in6_pcb.h>
62 #include <netinet6/ip6_var.h>
63 #include <netinet/tcp.h>
64 #include <netinet/tcp_lro.h>
65 #include <netinet/tcp_var.h>
66 #include <netinet/tcp_hpts.h>
67 #include <netinet/tcp_hpts_internal.h>
68 #ifdef TCP_BLACKBOX
69 #include <netinet/tcp_log_buf.h>
70 #endif
71 
72 static void
build_ack_entry(struct tcp_ackent * ae,struct tcphdr * th,struct mbuf * m,uint32_t * ts_ptr,uint16_t iptos)73 build_ack_entry(struct tcp_ackent *ae, struct tcphdr *th, struct mbuf *m,
74     uint32_t *ts_ptr, uint16_t iptos)
75 {
76 	/*
77 	 * Given a TCP ACK, summarize it down into the small TCP ACK
78 	 * entry.
79 	 */
80 	ae->timestamp = m->m_pkthdr.rcv_tstmp;
81 	ae->flags = 0;
82 	if (m->m_flags & M_TSTMP_LRO)
83 		ae->flags |= TSTMP_LRO;
84 	else if (m->m_flags & M_TSTMP)
85 		ae->flags |= TSTMP_HDWR;
86 	ae->seq = th->th_seq;
87 	ae->ack = th->th_ack;
88 	ae->flags |= tcp_get_flags(th);
89 	if (ts_ptr != NULL) {
90 		ae->ts_value = ntohl(ts_ptr[1]);
91 		ae->ts_echo = ntohl(ts_ptr[2]);
92 		ae->flags |= HAS_TSTMP;
93 	}
94 	ae->win = th->th_win;
95 	ae->codepoint = iptos;
96 }
97 
98 static inline bool
tcp_lro_ack_valid(struct mbuf * m,struct tcphdr * th,uint32_t ** ppts,bool * other_opts)99 tcp_lro_ack_valid(struct mbuf *m, struct tcphdr *th, uint32_t **ppts, bool *other_opts)
100 {
101 	/*
102 	 * This function returns two bits of valuable information.
103 	 * a) Is what is present capable of being ack-compressed,
104 	 *    we can ack-compress if there is no options or just
105 	 *    a timestamp option, and of course the th_flags must
106 	 *    be correct as well.
107 	 * b) Our other options present such as SACK. This is
108 	 *    used to determine if we want to wakeup or not.
109 	 */
110 	bool ret = true;
111 
112 	switch (th->th_off << 2) {
113 	case (sizeof(*th) + TCPOLEN_TSTAMP_APPA):
114 		*ppts = (uint32_t *)(th + 1);
115 		/* Check if we have only one timestamp option. */
116 		if (**ppts == TCP_LRO_TS_OPTION)
117 			*other_opts = false;
118 		else {
119 			*other_opts = true;
120 			ret = false;
121 		}
122 		break;
123 	case (sizeof(*th)):
124 		/* No options. */
125 		*ppts = NULL;
126 		*other_opts = false;
127 		break;
128 	default:
129 		*ppts = NULL;
130 		*other_opts = true;
131 		ret = false;
132 		break;
133 	}
134 	/* For ACKCMP we only accept ACK, PUSH, ECE and CWR. */
135 	if ((tcp_get_flags(th) & ~(TH_ACK | TH_PUSH | TH_ECE | TH_CWR)) != 0)
136 		ret = false;
137 	/* If it has data on it we cannot compress it */
138 	if (m->m_pkthdr.lro_tcp_d_len)
139 		ret = false;
140 
141 	/* ACK flag must be set. */
142 	if (!(tcp_get_flags(th) & TH_ACK))
143 		ret = false;
144 	return (ret);
145 }
146 
147 static bool
tcp_lro_check_wake_status(struct tcpcb * tp)148 tcp_lro_check_wake_status(struct tcpcb *tp)
149 {
150 
151 	if (tp->t_fb->tfb_early_wake_check != NULL)
152 		return ((tp->t_fb->tfb_early_wake_check)(tp));
153 	return (false);
154 }
155 
#ifdef TCP_BLACKBOX
/*
 * Emit a TCP_LOG_LRO black-box record for tp.  Nothing happens unless
 * black-box logging is enabled on the connection.
 *
 * tp           - connection the record is logged against.
 * lc           - LRO control block; its last queue time goes into flex6.
 * le           - LRO entry being processed; must not be NULL, since it is
 *                dereferenced unconditionally.
 * m            - mbuf of interest, may be NULL.
 * frm          - caller-chosen tag identifying the log site (flex8).
 * tcp_data_len - stored in flex1.
 * th_seq       - stored in inflight.
 * th_ack       - stored in delivered.
 * th_win       - stored in pacing_gain, but see the note in the body.
 */
static void
tcp_lro_log(struct tcpcb *tp, const struct lro_ctrl *lc,
    const struct lro_entry *le, const struct mbuf *m,
    int frm, int32_t tcp_data_len, uint32_t th_seq,
    uint32_t th_ack, uint16_t th_win)
{
	union tcp_log_stackspecific log;
	struct timeval tv, btv;
	uint32_t cts;

	if (!tcp_bblogging_on(tp))
		return;

	cts = tcp_get_usecs(&tv);
	memset(&log, 0, sizeof(log));

	/* Caller supplied values. */
	log.u_bbr.flex8 = frm;
	log.u_bbr.flex1 = tcp_data_len;
	log.u_bbr.flex2 = (m != NULL) ? m->m_pkthdr.len : 0;

	/* Snapshot of the head of the entry's packet chain, if present. */
	if (le->m_head != NULL) {
		log.u_bbr.flex3 = le->m_head->m_pkthdr.lro_nsegs;
		log.u_bbr.flex4 = le->m_head->m_pkthdr.lro_tcp_d_len;
		log.u_bbr.flex5 = le->m_head->m_pkthdr.len;
		log.u_bbr.delRate = le->m_head->m_flags;
		log.u_bbr.rttProp = le->m_head->m_pkthdr.rcv_tstmp;
	}

	log.u_bbr.inflight = th_seq;
	log.u_bbr.delivered = th_ack;
	log.u_bbr.timeStamp = cts;
	log.u_bbr.epoch = le->next_seq;
	log.u_bbr.lt_epoch = le->ack_seq;
	log.u_bbr.pacing_gain = th_win;
	log.u_bbr.cwnd_gain = le->window;
	log.u_bbr.lost = curcpu;
	log.u_bbr.cur_del_rate = (uintptr_t)m;
	log.u_bbr.bw_inuse = (uintptr_t)le->m_head;

	bintime2timeval(&lc->lro_last_queue_time, &btv);
	log.u_bbr.flex6 = tcp_tv_to_usec(&btv);
	log.u_bbr.flex7 = le->compressed;
	/*
	 * NOTE(review): this store replaces the th_win value written to
	 * pacing_gain above, so th_win never reaches the log record.  Kept
	 * as-is to preserve the existing record layout; confirm with the
	 * log consumers before moving either value to another field.
	 */
	log.u_bbr.pacing_gain = le->uncompressed;
	log.u_bbr.inhpts = in_epoch(net_epoch_preempt) ? 1 : 0;

	TCP_LOG_EVENTP(tp, NULL, &tptosocket(tp)->so_rcv,
	    &tptosocket(tp)->so_snd,
	    TCP_LOG_LRO, 0, 0, &log, false, &tv);
}
#endif
207 
208 static struct mbuf *
tcp_lro_get_last_if_ackcmp(struct lro_ctrl * lc,struct lro_entry * le,struct tcpcb * tp,int32_t * new_m,bool can_append_old_cmp)209 tcp_lro_get_last_if_ackcmp(struct lro_ctrl *lc, struct lro_entry *le,
210     struct tcpcb *tp, int32_t *new_m, bool can_append_old_cmp)
211 {
212 	struct mbuf *m;
213 
214 	/* Look at the last mbuf if any in queue */
215 	if (can_append_old_cmp) {
216 		m = STAILQ_LAST(&tp->t_inqueue, mbuf, m_stailqpkt);
217 		if (m != NULL && (m->m_flags & M_ACKCMP) != 0) {
218 			if (M_TRAILINGSPACE(m) >= sizeof(struct tcp_ackent)) {
219 #ifdef TCP_BLACKBOX
220 				tcp_lro_log(tp, lc, le, NULL, 23, 0, 0, 0, 0);
221 #endif
222 				*new_m = 0;
223 				counter_u64_add(tcp_extra_mbuf, 1);
224 				return (m);
225 			} else {
226 				/* Mark we ran out of space */
227 				tp->t_flags2 |= TF2_MBUF_L_ACKS;
228 			}
229 		}
230 	}
231 	/* Decide mbuf size. */
232 #ifdef TCP_BLACKBOX
233 	tcp_lro_log(tp, lc, le, NULL, 21, 0, 0, 0, 0);
234 #endif
235 	if (tp->t_flags2 & TF2_MBUF_L_ACKS)
236 		m = m_getcl(M_NOWAIT, MT_DATA, M_ACKCMP | M_PKTHDR);
237 	else
238 		m = m_gethdr(M_NOWAIT, MT_DATA);
239 
240 	if (__predict_false(m == NULL)) {
241 		counter_u64_add(tcp_would_have_but, 1);
242 		return (NULL);
243 	}
244 	counter_u64_add(tcp_comp_total, 1);
245 	m->m_pkthdr.rcvif = lc->ifp;
246 	m->m_flags |= M_ACKCMP;
247 	*new_m = 1;
248 	return (m);
249 }
250 
251 /*
252  * Do BPF tap for either ACK_CMP packets or MBUF QUEUE type packets
253  * and strip all, but the IPv4/IPv6 header.
254  */
255 static bool
do_bpf_strip_and_compress(struct tcpcb * tp,struct lro_ctrl * lc,struct lro_entry * le,struct mbuf ** pp,struct mbuf ** cmp,struct mbuf ** mv_to,bool * should_wake,bool bpf_req,bool lagg_bpf_req,struct ifnet * lagg_ifp,bool can_append_old_cmp)256 do_bpf_strip_and_compress(struct tcpcb *tp, struct lro_ctrl *lc,
257     struct lro_entry *le, struct mbuf **pp, struct mbuf **cmp,
258     struct mbuf **mv_to, bool *should_wake, bool bpf_req, bool lagg_bpf_req,
259     struct ifnet *lagg_ifp, bool can_append_old_cmp)
260 {
261 	union {
262 		void *ptr;
263 		struct ip *ip4;
264 		struct ip6_hdr *ip6;
265 	} l3;
266 	struct mbuf *m;
267 	struct mbuf *nm;
268 	struct tcphdr *th;
269 	struct tcp_ackent *ack_ent;
270 	uint32_t *ts_ptr;
271 	int32_t n_mbuf;
272 	bool other_opts, can_compress;
273 	uint8_t lro_type;
274 	uint16_t iptos;
275 	int tcp_hdr_offset;
276 	int idx;
277 
278 	/* Get current mbuf. */
279 	m = *pp;
280 
281 	/* Let the BPF see the packet */
282 	if (__predict_false(bpf_req))
283 		ETHER_BPF_MTAP(lc->ifp, m);
284 
285 	if (__predict_false(lagg_bpf_req))
286 		ETHER_BPF_MTAP(lagg_ifp, m);
287 
288 	tcp_hdr_offset = m->m_pkthdr.lro_tcp_h_off;
289 	lro_type = le->inner.data.lro_type;
290 	switch (lro_type) {
291 	case LRO_TYPE_NONE:
292 		lro_type = le->outer.data.lro_type;
293 		switch (lro_type) {
294 		case LRO_TYPE_IPV4_TCP:
295 			tcp_hdr_offset -= sizeof(*le->outer.ip4);
296 			m->m_pkthdr.lro_etype = ETHERTYPE_IP;
297 			IP_PROBE(receive, NULL, NULL, le->outer.ip4, lc->ifp,
298 			    le->outer.ip4, NULL);
299 			break;
300 		case LRO_TYPE_IPV6_TCP:
301 			tcp_hdr_offset -= sizeof(*le->outer.ip6);
302 			m->m_pkthdr.lro_etype = ETHERTYPE_IPV6;
303 			IP_PROBE(receive, NULL, NULL, le->outer.ip6, lc->ifp,
304 			    NULL, le->outer.ip6);
305 			break;
306 		default:
307 			goto compressed;
308 		}
309 		break;
310 	case LRO_TYPE_IPV4_TCP:
311 		switch (le->outer.data.lro_type) {
312 		case LRO_TYPE_IPV4_UDP:
313 			IP_PROBE(receive, NULL, NULL, le->outer.ip4, lc->ifp,
314 			    le->outer.ip4, NULL);
315 			UDP_PROBE(receive, NULL, NULL, le->outer.ip4, NULL,
316 			    le->outer.udp);
317 			break;
318 		case LRO_TYPE_IPV6_UDP:
319 			IP_PROBE(receive, NULL, NULL, le->outer.ip6, lc->ifp,
320 			    NULL, le->outer.ip6);
321 			UDP_PROBE(receive, NULL, NULL, le->outer.ip6, NULL,
322 			    le->outer.udp);
323 			break;
324 		default:
325 			__assert_unreachable();
326 			break;
327 		}
328 		tcp_hdr_offset -= sizeof(*le->outer.ip4);
329 		m->m_pkthdr.lro_etype = ETHERTYPE_IP;
330 		IP_PROBE(receive, NULL, NULL, le->inner.ip4, NULL,
331 		    le->inner.ip4, NULL);
332 		break;
333 	case LRO_TYPE_IPV6_TCP:
334 		switch (le->outer.data.lro_type) {
335 		case LRO_TYPE_IPV4_UDP:
336 			IP_PROBE(receive, NULL, NULL, le->outer.ip4, lc->ifp,
337 			    le->outer.ip4, NULL);
338 			UDP_PROBE(receive, NULL, NULL, le->outer.ip4, NULL,
339 			    le->outer.udp);
340 			break;
341 		case LRO_TYPE_IPV6_UDP:
342 			IP_PROBE(receive, NULL, NULL, le->outer.ip6, lc->ifp,
343 			    NULL, le->outer.ip6);
344 			UDP_PROBE(receive, NULL, NULL, le->outer.ip6, NULL,
345 			    le->outer.udp);
346 			break;
347 		default:
348 			__assert_unreachable();
349 			break;
350 		}
351 		tcp_hdr_offset -= sizeof(*le->outer.ip6);
352 		m->m_pkthdr.lro_etype = ETHERTYPE_IPV6;
353 		IP_PROBE(receive, NULL, NULL, le->inner.ip6, NULL, NULL,
354 		    le->inner.ip6);
355 		break;
356 	default:
357 		goto compressed;
358 	}
359 
360 	MPASS(tcp_hdr_offset >= 0);
361 
362 	m_adj(m, tcp_hdr_offset);
363 	m->m_flags |= M_LRO_EHDRSTRP;
364 	m->m_flags &= ~M_ACKCMP;
365 	m->m_pkthdr.lro_tcp_h_off -= tcp_hdr_offset;
366 
367 	th = tcp_lro_get_th(m);
368 
369 	th->th_sum = 0;		/* TCP checksum is valid. */
370 	tcp_fields_to_host(th);
371 	TCP_PROBE5(receive, NULL, tp, m, tp, th);
372 
373 	/* Check if ACK can be compressed */
374 	can_compress = tcp_lro_ack_valid(m, th, &ts_ptr, &other_opts);
375 
376 	/* Now lets look at the should wake states */
377 	if ((other_opts == true) &&
378 	    ((tp->t_flags2 & TF2_DONT_SACK_QUEUE) == 0)) {
379 		/*
380 		 * If there are other options (SACK?) and the
381 		 * tcp endpoint has not expressly told us it does
382 		 * not care about SACKS, then we should wake up.
383 		 */
384 		*should_wake = true;
385 	} else if (*should_wake == false) {
386 		/* Wakeup override check if we are false here  */
387 		*should_wake = tcp_lro_check_wake_status(tp);
388 	}
389 	/* Is the ack compressable? */
390 	if (can_compress == false)
391 		goto done;
392 	/* Does the TCP endpoint support ACK compression? */
393 	if ((tp->t_flags2 & TF2_MBUF_ACKCMP) == 0)
394 		goto done;
395 
396 	/* Lets get the TOS/traffic class field */
397 	l3.ptr = mtod(m, void *);
398 	switch (lro_type) {
399 	case LRO_TYPE_IPV4_TCP:
400 		iptos = l3.ip4->ip_tos;
401 		break;
402 	case LRO_TYPE_IPV6_TCP:
403 		iptos = IPV6_TRAFFIC_CLASS(l3.ip6);
404 		break;
405 	default:
406 		iptos = 0;	/* Keep compiler happy. */
407 		break;
408 	}
409 	/* Now lets get space if we don't have some already */
410 	if (*cmp == NULL) {
411 new_one:
412 		nm = tcp_lro_get_last_if_ackcmp(lc, le, tp, &n_mbuf,
413 		    can_append_old_cmp);
414 		if (__predict_false(nm == NULL))
415 			goto done;
416 		*cmp = nm;
417 		if (n_mbuf) {
418 			/*
419 			 *  Link in the new cmp ack to our in-order place,
420 			 * first set our cmp ack's next to where we are.
421 			 */
422 			nm->m_nextpkt = m;
423 			(*pp) = nm;
424 			/*
425 			 * Set it up so mv_to is advanced to our
426 			 * compressed ack. This way the caller can
427 			 * advance pp to the right place.
428 			 */
429 			*mv_to = nm;
430 			/*
431 			 * Advance it here locally as well.
432 			 */
433 			pp = &nm->m_nextpkt;
434 		}
435 	} else {
436 		/* We have one already we are working on */
437 		nm = *cmp;
438 		if (M_TRAILINGSPACE(nm) < sizeof(struct tcp_ackent)) {
439 			/* We ran out of space */
440 			tp->t_flags2 |= TF2_MBUF_L_ACKS;
441 			goto new_one;
442 		}
443 	}
444 	MPASS(M_TRAILINGSPACE(nm) >= sizeof(struct tcp_ackent));
445 	counter_u64_add(tcp_inp_lro_compressed, 1);
446 	le->compressed++;
447 	/* We can add in to the one on the tail */
448 	ack_ent = mtod(nm, struct tcp_ackent *);
449 	idx = (nm->m_len / sizeof(struct tcp_ackent));
450 	build_ack_entry(&ack_ent[idx], th, m, ts_ptr, iptos);
451 
452 	/* Bump the size of both pkt-hdr and len */
453 	nm->m_len += sizeof(struct tcp_ackent);
454 	nm->m_pkthdr.len += sizeof(struct tcp_ackent);
455 compressed:
456 	/* Advance to next mbuf before freeing. */
457 	*pp = m->m_nextpkt;
458 	m->m_nextpkt = NULL;
459 	m_freem(m);
460 	return (true);
461 done:
462 	counter_u64_add(tcp_uncomp_total, 1);
463 	le->uncompressed++;
464 	return (false);
465 }
466 
467 static void
tcp_queue_pkts(struct tcpcb * tp,struct lro_entry * le)468 tcp_queue_pkts(struct tcpcb *tp, struct lro_entry *le)
469 {
470 
471 	INP_WLOCK_ASSERT(tptoinpcb(tp));
472 
473 	STAILQ_HEAD(, mbuf) q = { le->m_head,
474 	    &STAILQ_NEXT(le->m_last_mbuf, m_stailqpkt) };
475 	STAILQ_CONCAT(&tp->t_inqueue, &q);
476 	le->m_head = NULL;
477 	le->m_last_mbuf = NULL;
478 }
479 
480 static struct tcpcb *
tcp_lro_lookup(struct ifnet * ifp,struct lro_parser * pa)481 tcp_lro_lookup(struct ifnet *ifp, struct lro_parser *pa)
482 {
483 	struct inpcb *inp;
484 
485 	CURVNET_ASSERT_SET();
486 	switch (pa->data.lro_type) {
487 #ifdef INET6
488 	case LRO_TYPE_IPV6_TCP:
489 		inp = in6_pcblookup(&V_tcbinfo,
490 		    &pa->data.s_addr.v6,
491 		    pa->data.s_port,
492 		    &pa->data.d_addr.v6,
493 		    pa->data.d_port,
494 		    INPLOOKUP_WLOCKPCB,
495 		    ifp);
496 		break;
497 #endif
498 #ifdef INET
499 	case LRO_TYPE_IPV4_TCP:
500 		inp = in_pcblookup(&V_tcbinfo,
501 		    pa->data.s_addr.v4,
502 		    pa->data.s_port,
503 		    pa->data.d_addr.v4,
504 		    pa->data.d_port,
505 		    INPLOOKUP_WLOCKPCB,
506 		    ifp);
507 		break;
508 #endif
509 	default:
510 		return (NULL);
511 	}
512 
513 	return (intotcpcb(inp));
514 }
515 
516 static int
_tcp_lro_flush_tcphpts(struct lro_ctrl * lc,struct lro_entry * le)517 _tcp_lro_flush_tcphpts(struct lro_ctrl *lc, struct lro_entry *le)
518 {
519 	struct tcpcb *tp;
520 	struct mbuf **pp, *cmp, *mv_to;
521 	struct ifnet *lagg_ifp;
522 	bool bpf_req, lagg_bpf_req, should_wake, can_append_old_cmp;
523 
524 	/* Check if packet doesn't belongs to our network interface. */
525 	if ((tcplro_stacks_wanting_mbufq == 0) ||
526 	    (le->outer.data.vlan_id != 0) ||
527 	    (le->inner.data.lro_type != LRO_TYPE_NONE))
528 		return (TCP_LRO_CANNOT);
529 
530 #ifdef INET6
531 	/*
532 	 * Be proactive about unspecified IPv6 address in source. As
533 	 * we use all-zero to indicate unbounded/unconnected pcb,
534 	 * unspecified IPv6 address can be used to confuse us.
535 	 *
536 	 * Note that packets with unspecified IPv6 destination is
537 	 * already dropped in ip6_input.
538 	 */
539 	if (__predict_false(le->outer.data.lro_type == LRO_TYPE_IPV6_TCP &&
540 	    IN6_IS_ADDR_UNSPECIFIED(&le->outer.data.s_addr.v6)))
541 		return (TCP_LRO_CANNOT);
542 
543 	if (__predict_false(le->inner.data.lro_type == LRO_TYPE_IPV6_TCP &&
544 	    IN6_IS_ADDR_UNSPECIFIED(&le->inner.data.s_addr.v6)))
545 		return (TCP_LRO_CANNOT);
546 #endif
547 
548 	CURVNET_SET(lc->ifp->if_vnet);
549 	/*
550 	 * Ensure that there are no packet filter hooks which would normally
551 	 * being triggered in ether_demux(), ip_input(), or ip6_input().
552 	 */
553 	if (
554 #ifdef INET
555 	    PFIL_HOOKED_IN(V_inet_pfil_head) ||
556 #endif
557 #ifdef INET6
558 	    PFIL_HOOKED_IN(V_inet6_pfil_head) ||
559 #endif
560 	    PFIL_HOOKED_IN(V_link_pfil_head)) {
561 		CURVNET_RESTORE();
562 		return (TCP_LRO_CANNOT);
563 	}
564 
565 	/* Lookup inp, if any.  Returns locked TCP inpcb. */
566 	tp = tcp_lro_lookup(lc->ifp,
567 	    (le->inner.data.lro_type == LRO_TYPE_NONE) ? &le->outer : &le->inner);
568 	CURVNET_RESTORE();
569 	if (tp == NULL)
570 		return (TCP_LRO_CANNOT);
571 
572 	counter_u64_add(tcp_inp_lro_locks_taken, 1);
573 
574 	/* Check if the inp is dead, Jim. */
575 	if (tp->t_state == TCPS_TIME_WAIT) {
576 		INP_WUNLOCK(tptoinpcb(tp));
577 		return (TCP_LRO_CANNOT);
578 	}
579 	if (tp->t_lro_cpu == HPTS_CPU_NONE && lc->lro_cpu_is_set == 1)
580 		tp->t_lro_cpu = lc->lro_last_cpu;
581 	/* Check if the transport doesn't support the needed optimizations. */
582 	if ((tp->t_flags2 & (TF2_SUPPORTS_MBUFQ | TF2_MBUF_ACKCMP)) == 0) {
583 		INP_WUNLOCK(tptoinpcb(tp));
584 		return (TCP_LRO_CANNOT);
585 	}
586 
587 	if (tp->t_flags2 & TF2_MBUF_QUEUE_READY)
588 		should_wake = false;
589 	else
590 		should_wake = true;
591 	/* Check if packets should be tapped to BPF. */
592 	bpf_req = bpf_peers_present(lc->ifp->if_bpf);
593 	lagg_bpf_req = false;
594 	lagg_ifp = NULL;
595 	if (lc->ifp->if_type == IFT_IEEE8023ADLAG ||
596 	    lc->ifp->if_type == IFT_INFINIBANDLAG) {
597 		struct lagg_port *lp = lc->ifp->if_lagg;
598 		struct lagg_softc *sc = lp->lp_softc;
599 
600 		lagg_ifp = sc->sc_ifp;
601 		if (lagg_ifp != NULL)
602 			lagg_bpf_req = bpf_peers_present(lagg_ifp->if_bpf);
603 	}
604 
605 	/* Strip and compress all the incoming packets. */
606 	can_append_old_cmp = true;
607 	cmp = NULL;
608 	for (pp = &le->m_head; *pp != NULL; ) {
609 		mv_to = NULL;
610 		if (do_bpf_strip_and_compress(tp, lc, le, pp, &cmp, &mv_to,
611 		    &should_wake, bpf_req, lagg_bpf_req, lagg_ifp,
612 		    can_append_old_cmp) == false) {
613 			/* Advance to next mbuf. */
614 			pp = &(*pp)->m_nextpkt;
615 			/*
616 			 * Once we have appended we can't look in the pending
617 			 * inbound packets for a compressed ack to append to.
618 			 */
619 			can_append_old_cmp = false;
620 			/*
621 			 * Once we append we also need to stop adding to any
622 			 * compressed ack we were remembering. A new cmp
623 			 * ack will be required.
624 			 */
625 			cmp = NULL;
626 #ifdef TCP_BLACKBOX
627 			tcp_lro_log(tp, lc, le, NULL, 25, 0, 0, 0, 0);
628 #endif
629 		} else if (mv_to != NULL) {
630 			/* We are asked to move pp up */
631 			pp = &mv_to->m_nextpkt;
632 #ifdef TCP_BLACKBOX
633 			tcp_lro_log(tp, lc, le, NULL, 24, 0, 0, 0, 0);
634 		} else
635 			tcp_lro_log(tp, lc, le, NULL, 26, 0, 0, 0, 0);
636 #else
637 		}
638 #endif
639 	}
640 	/* Update "m_last_mbuf", if any. */
641 	if (pp == &le->m_head)
642 		le->m_last_mbuf = *pp;
643 	else
644 		le->m_last_mbuf = __containerof(pp, struct mbuf, m_nextpkt);
645 
646 	/* Check if any data mbufs left. */
647 	if (le->m_head != NULL) {
648 		counter_u64_add(tcp_inp_lro_direct_queue, 1);
649 #ifdef TCP_BLACKBOX
650 		tcp_lro_log(tp, lc, le, NULL, 22, 1, tp->t_flags2, 0, 1);
651 #endif
652 		tcp_queue_pkts(tp, le);
653 	}
654 	if (should_wake) {
655 		/* Wakeup */
656 		counter_u64_add(tcp_inp_lro_wokeup_queue, 1);
657 		if ((*tp->t_fb->tfb_do_queued_segments)(tp, 0))
658 			/* TCP cb gone and unlocked. */
659 			return (0);
660 	}
661 	INP_WUNLOCK(tptoinpcb(tp));
662 
663 	return (0);	/* Success. */
664 }
665 
666 void
tcp_lro_hpts_init(void)667 tcp_lro_hpts_init(void)
668 {
669 	tcp_lro_flush_tcphpts = _tcp_lro_flush_tcphpts;
670 }
671 
672 void
tcp_lro_hpts_uninit(void)673 tcp_lro_hpts_uninit(void)
674 {
675 	atomic_store_ptr(&tcp_lro_flush_tcphpts, NULL);
676 }
677