1 /*-
2 * Copyright (c) 2016-2018 Netflix, Inc.
3 * Copyright (c) 2016-2021 Mellanox Technologies.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 *
26 */
27 #include "opt_inet.h"
28 #include "opt_inet6.h"
29
30 #include <sys/param.h>
31 #include <sys/bus.h>
32 #include <sys/interrupt.h>
33 #include <sys/systm.h>
34 #include <sys/kernel.h>
35 #include <sys/malloc.h>
36 #include <sys/mbuf.h>
37 #include <sys/socket.h>
38 #include <sys/socketvar.h>
39 #include <sys/sysctl.h>
40
41 #include <net/if.h>
42 #include <net/if_var.h>
43 #include <net/if_private.h>
44 #include <net/ethernet.h>
45 #include <net/bpf.h>
46 #include <net/vnet.h>
47 #include <net/if_dl.h>
48 #include <net/if_media.h>
49 #include <net/if_types.h>
50 #include <net/infiniband.h>
51 #include <net/if_lagg.h>
52 #include <net/pfil.h>
53
54 #include <netinet/in.h>
55 #include <netinet/in_kdtrace.h>
56 #include <netinet/ip6.h>
57 #include <netinet/ip.h>
58 #include <netinet/ip_var.h>
59 #include <netinet/in_pcb.h>
60 #include <netinet6/in6_pcb.h>
61 #include <netinet6/ip6_var.h>
62 #include <netinet/tcp.h>
63 #include <netinet/tcp_lro.h>
64 #include <netinet/tcp_var.h>
65 #include <netinet/tcp_hpts.h>
66 #include <netinet/tcp_hpts_internal.h>
67 #ifdef TCP_BLACKBOX
68 #include <netinet/tcp_log_buf.h>
69 #endif
70
71 static void
build_ack_entry(struct tcp_ackent * ae,struct tcphdr * th,struct mbuf * m,uint32_t * ts_ptr,uint16_t iptos)72 build_ack_entry(struct tcp_ackent *ae, struct tcphdr *th, struct mbuf *m,
73 uint32_t *ts_ptr, uint16_t iptos)
74 {
75 /*
76 * Given a TCP ACK, summarize it down into the small TCP ACK
77 * entry.
78 */
79 ae->timestamp = m->m_pkthdr.rcv_tstmp;
80 ae->flags = 0;
81 if (m->m_flags & M_TSTMP_LRO)
82 ae->flags |= TSTMP_LRO;
83 else if (m->m_flags & M_TSTMP)
84 ae->flags |= TSTMP_HDWR;
85 ae->seq = th->th_seq;
86 ae->ack = th->th_ack;
87 ae->flags |= tcp_get_flags(th);
88 if (ts_ptr != NULL) {
89 ae->ts_value = ntohl(ts_ptr[1]);
90 ae->ts_echo = ntohl(ts_ptr[2]);
91 ae->flags |= HAS_TSTMP;
92 }
93 ae->win = th->th_win;
94 ae->codepoint = iptos;
95 }
96
97 static inline bool
tcp_lro_ack_valid(struct mbuf * m,struct tcphdr * th,uint32_t ** ppts,bool * other_opts)98 tcp_lro_ack_valid(struct mbuf *m, struct tcphdr *th, uint32_t **ppts, bool *other_opts)
99 {
100 /*
101 * This function returns two bits of valuable information.
102 * a) Is what is present capable of being ack-compressed,
103 * we can ack-compress if there is no options or just
104 * a timestamp option, and of course the th_flags must
105 * be correct as well.
106 * b) Our other options present such as SACK. This is
107 * used to determine if we want to wakeup or not.
108 */
109 bool ret = true;
110
111 switch (th->th_off << 2) {
112 case (sizeof(*th) + TCPOLEN_TSTAMP_APPA):
113 *ppts = (uint32_t *)(th + 1);
114 /* Check if we have only one timestamp option. */
115 if (**ppts == TCP_LRO_TS_OPTION)
116 *other_opts = false;
117 else {
118 *other_opts = true;
119 ret = false;
120 }
121 break;
122 case (sizeof(*th)):
123 /* No options. */
124 *ppts = NULL;
125 *other_opts = false;
126 break;
127 default:
128 *ppts = NULL;
129 *other_opts = true;
130 ret = false;
131 break;
132 }
133 /* For ACKCMP we only accept ACK, PUSH, ECE and CWR. */
134 if ((tcp_get_flags(th) & ~(TH_ACK | TH_PUSH | TH_ECE | TH_CWR)) != 0)
135 ret = false;
136 /* If it has data on it we cannot compress it */
137 if (m->m_pkthdr.lro_tcp_d_len)
138 ret = false;
139
140 /* ACK flag must be set. */
141 if (!(tcp_get_flags(th) & TH_ACK))
142 ret = false;
143 return (ret);
144 }
145
146 static bool
tcp_lro_check_wake_status(struct tcpcb * tp)147 tcp_lro_check_wake_status(struct tcpcb *tp)
148 {
149
150 if (tp->t_fb->tfb_early_wake_check != NULL)
151 return ((tp->t_fb->tfb_early_wake_check)(tp));
152 return (false);
153 }
154
#ifdef TCP_BLACKBOX
/*
 * Emit a black-box log record describing an LRO event on this
 * connection.  "frm" identifies the call site (numeric codes used by
 * the callers in this file: 21-26); the remaining arguments snapshot
 * the segment being processed.  No-op unless BB logging is enabled on
 * the tcpcb.
 */
static void
tcp_lro_log(struct tcpcb *tp, const struct lro_ctrl *lc,
    const struct lro_entry *le, const struct mbuf *m,
    int frm, int32_t tcp_data_len, uint32_t th_seq,
    uint32_t th_ack, uint16_t th_win)
{
	if (tcp_bblogging_on(tp)) {
		union tcp_log_stackspecific log;
		struct timeval tv, btv;
		uint32_t cts;

		cts = tcp_get_usecs(&tv);
		memset(&log, 0, sizeof(union tcp_log_stackspecific));
		log.u_bbr.flex8 = frm;
		log.u_bbr.flex1 = tcp_data_len;
		if (m)
			log.u_bbr.flex2 = m->m_pkthdr.len;
		else
			log.u_bbr.flex2 = 0;
		if (le->m_head) {
			/* Snapshot the state of the pending LRO chain head. */
			log.u_bbr.flex3 = le->m_head->m_pkthdr.lro_nsegs;
			log.u_bbr.flex4 = le->m_head->m_pkthdr.lro_tcp_d_len;
			log.u_bbr.flex5 = le->m_head->m_pkthdr.len;
			log.u_bbr.delRate = le->m_head->m_flags;
			log.u_bbr.rttProp = le->m_head->m_pkthdr.rcv_tstmp;
		}
		log.u_bbr.inflight = th_seq;
		log.u_bbr.delivered = th_ack;
		log.u_bbr.timeStamp = cts;
		log.u_bbr.epoch = le->next_seq;
		log.u_bbr.lt_epoch = le->ack_seq;
		log.u_bbr.pacing_gain = th_win;
		log.u_bbr.cwnd_gain = le->window;
		log.u_bbr.lost = curcpu;
		log.u_bbr.cur_del_rate = (uintptr_t)m;
		log.u_bbr.bw_inuse = (uintptr_t)le->m_head;
		/* Time this LRO control last queued, in usecs. */
		bintime2timeval(&lc->lro_last_queue_time, &btv);
		log.u_bbr.flex6 = tcp_tv_to_usec(&btv);
		log.u_bbr.flex7 = le->compressed;
		/*
		 * NOTE(review): this overwrites the th_win value stored
		 * into pacing_gain above, so th_win never reaches the
		 * log record -- looks unintentional; confirm against
		 * whatever consumes TCP_LOG_LRO records before changing.
		 */
		log.u_bbr.pacing_gain = le->uncompressed;
		if (in_epoch(net_epoch_preempt))
			log.u_bbr.inhpts = 1;
		else
			log.u_bbr.inhpts = 0;
		TCP_LOG_EVENTP(tp, NULL, &tptosocket(tp)->so_rcv,
		    &tptosocket(tp)->so_snd,
		    TCP_LOG_LRO, 0, 0, &log, false, &tv);
	}
}
#endif
206
/*
 * Return an mbuf with room for another compressed-ACK entry for "tp",
 * or NULL when allocation fails.  When "can_append_old_cmp" is set,
 * the last ACKCMP mbuf already on the connection's input queue is
 * reused if it still has trailing space; otherwise a fresh mbuf is
 * allocated.  *new_m is set to 1 for a fresh mbuf, 0 when an existing
 * queued mbuf is being extended.
 */
static struct mbuf *
tcp_lro_get_last_if_ackcmp(struct lro_ctrl *lc, struct lro_entry *le,
    struct tcpcb *tp, int32_t *new_m, bool can_append_old_cmp)
{
	struct mbuf *m;

	/* Look at the last mbuf if any in queue */
	if (can_append_old_cmp) {
		m = STAILQ_LAST(&tp->t_inqueue, mbuf, m_stailqpkt);
		if (m != NULL && (m->m_flags & M_ACKCMP) != 0) {
			if (M_TRAILINGSPACE(m) >= sizeof(struct tcp_ackent)) {
#ifdef TCP_BLACKBOX
				tcp_lro_log(tp, lc, le, NULL, 23, 0, 0, 0, 0);
#endif
				*new_m = 0;
				counter_u64_add(tcp_extra_mbuf, 1);
				return (m);
			} else {
				/* Mark we ran out of space */
				tp->t_flags2 |= TF2_MBUF_L_ACKS;
			}
		}
	}
	/* Decide mbuf size. */
#ifdef TCP_BLACKBOX
	tcp_lro_log(tp, lc, le, NULL, 21, 0, 0, 0, 0);
#endif
	/*
	 * Once the connection has ever filled a plain header mbuf
	 * (TF2_MBUF_L_ACKS), allocate clusters from then on.
	 */
	if (tp->t_flags2 & TF2_MBUF_L_ACKS)
		m = m_getcl(M_NOWAIT, MT_DATA, M_ACKCMP | M_PKTHDR);
	else
		m = m_gethdr(M_NOWAIT, MT_DATA);

	if (__predict_false(m == NULL)) {
		counter_u64_add(tcp_would_have_but, 1);
		return (NULL);
	}
	counter_u64_add(tcp_comp_total, 1);
	m->m_pkthdr.rcvif = lc->ifp;
	m->m_flags |= M_ACKCMP;
	*new_m = 1;
	return (m);
}
249
250 /*
251 * Do BPF tap for either ACK_CMP packets or MBUF QUEUE type packets
252 * and strip all, but the IPv4/IPv6 header.
253 */
254 static bool
do_bpf_strip_and_compress(struct tcpcb * tp,struct lro_ctrl * lc,struct lro_entry * le,struct mbuf ** pp,struct mbuf ** cmp,struct mbuf ** mv_to,bool * should_wake,bool bpf_req,bool lagg_bpf_req,struct ifnet * lagg_ifp,bool can_append_old_cmp)255 do_bpf_strip_and_compress(struct tcpcb *tp, struct lro_ctrl *lc,
256 struct lro_entry *le, struct mbuf **pp, struct mbuf **cmp,
257 struct mbuf **mv_to, bool *should_wake, bool bpf_req, bool lagg_bpf_req,
258 struct ifnet *lagg_ifp, bool can_append_old_cmp)
259 {
260 union {
261 void *ptr;
262 struct ip *ip4;
263 struct ip6_hdr *ip6;
264 } l3;
265 struct mbuf *m;
266 struct mbuf *nm;
267 struct tcphdr *th;
268 struct tcp_ackent *ack_ent;
269 uint32_t *ts_ptr;
270 int32_t n_mbuf;
271 bool other_opts, can_compress;
272 uint8_t lro_type;
273 uint16_t iptos;
274 int tcp_hdr_offset;
275 int idx;
276
277 /* Get current mbuf. */
278 m = *pp;
279
280 /* Let the BPF see the packet */
281 if (__predict_false(bpf_req))
282 ETHER_BPF_MTAP(lc->ifp, m);
283
284 if (__predict_false(lagg_bpf_req))
285 ETHER_BPF_MTAP(lagg_ifp, m);
286
287 tcp_hdr_offset = m->m_pkthdr.lro_tcp_h_off;
288 lro_type = le->inner.data.lro_type;
289 switch (lro_type) {
290 case LRO_TYPE_NONE:
291 lro_type = le->outer.data.lro_type;
292 switch (lro_type) {
293 case LRO_TYPE_IPV4_TCP:
294 tcp_hdr_offset -= sizeof(*le->outer.ip4);
295 m->m_pkthdr.lro_etype = ETHERTYPE_IP;
296 IP_PROBE(receive, NULL, NULL, le->outer.ip4, lc->ifp,
297 le->outer.ip4, NULL);
298 break;
299 case LRO_TYPE_IPV6_TCP:
300 tcp_hdr_offset -= sizeof(*le->outer.ip6);
301 m->m_pkthdr.lro_etype = ETHERTYPE_IPV6;
302 IP_PROBE(receive, NULL, NULL, le->outer.ip6, lc->ifp,
303 NULL, le->outer.ip6);
304 break;
305 default:
306 goto compressed;
307 }
308 break;
309 case LRO_TYPE_IPV4_TCP:
310 switch (le->outer.data.lro_type) {
311 case LRO_TYPE_IPV4_UDP:
312 IP_PROBE(receive, NULL, NULL, le->outer.ip4, lc->ifp,
313 le->outer.ip4, NULL);
314 UDP_PROBE(receive, NULL, NULL, le->outer.ip4, NULL,
315 le->outer.udp);
316 break;
317 case LRO_TYPE_IPV6_UDP:
318 IP_PROBE(receive, NULL, NULL, le->outer.ip6, lc->ifp,
319 NULL, le->outer.ip6);
320 UDP_PROBE(receive, NULL, NULL, le->outer.ip6, NULL,
321 le->outer.udp);
322 break;
323 default:
324 __assert_unreachable();
325 break;
326 }
327 tcp_hdr_offset -= sizeof(*le->outer.ip4);
328 m->m_pkthdr.lro_etype = ETHERTYPE_IP;
329 IP_PROBE(receive, NULL, NULL, le->inner.ip4, NULL,
330 le->inner.ip4, NULL);
331 break;
332 case LRO_TYPE_IPV6_TCP:
333 switch (le->outer.data.lro_type) {
334 case LRO_TYPE_IPV4_UDP:
335 IP_PROBE(receive, NULL, NULL, le->outer.ip4, lc->ifp,
336 le->outer.ip4, NULL);
337 UDP_PROBE(receive, NULL, NULL, le->outer.ip4, NULL,
338 le->outer.udp);
339 break;
340 case LRO_TYPE_IPV6_UDP:
341 IP_PROBE(receive, NULL, NULL, le->outer.ip6, lc->ifp,
342 NULL, le->outer.ip6);
343 UDP_PROBE(receive, NULL, NULL, le->outer.ip6, NULL,
344 le->outer.udp);
345 break;
346 default:
347 __assert_unreachable();
348 break;
349 }
350 tcp_hdr_offset -= sizeof(*le->outer.ip6);
351 m->m_pkthdr.lro_etype = ETHERTYPE_IPV6;
352 IP_PROBE(receive, NULL, NULL, le->inner.ip6, NULL, NULL,
353 le->inner.ip6);
354 break;
355 default:
356 goto compressed;
357 }
358
359 MPASS(tcp_hdr_offset >= 0);
360
361 m_adj(m, tcp_hdr_offset);
362 m->m_flags |= M_LRO_EHDRSTRP;
363 m->m_flags &= ~M_ACKCMP;
364 m->m_pkthdr.lro_tcp_h_off -= tcp_hdr_offset;
365
366 th = tcp_lro_get_th(m);
367
368 th->th_sum = 0; /* TCP checksum is valid. */
369 tcp_fields_to_host(th);
370 TCP_PROBE5(receive, NULL, tp, m, tp, th);
371
372 /* Check if ACK can be compressed */
373 can_compress = tcp_lro_ack_valid(m, th, &ts_ptr, &other_opts);
374
375 /* Now lets look at the should wake states */
376 if ((other_opts == true) &&
377 ((tp->t_flags2 & TF2_DONT_SACK_QUEUE) == 0)) {
378 /*
379 * If there are other options (SACK?) and the
380 * tcp endpoint has not expressly told us it does
381 * not care about SACKS, then we should wake up.
382 */
383 *should_wake = true;
384 } else if (*should_wake == false) {
385 /* Wakeup override check if we are false here */
386 *should_wake = tcp_lro_check_wake_status(tp);
387 }
388 /* Is the ack compressable? */
389 if (can_compress == false)
390 goto done;
391 /* Does the TCP endpoint support ACK compression? */
392 if ((tp->t_flags2 & TF2_MBUF_ACKCMP) == 0)
393 goto done;
394
395 /* Lets get the TOS/traffic class field */
396 l3.ptr = mtod(m, void *);
397 switch (lro_type) {
398 case LRO_TYPE_IPV4_TCP:
399 iptos = l3.ip4->ip_tos;
400 break;
401 case LRO_TYPE_IPV6_TCP:
402 iptos = IPV6_TRAFFIC_CLASS(l3.ip6);
403 break;
404 default:
405 iptos = 0; /* Keep compiler happy. */
406 break;
407 }
408 /* Now lets get space if we don't have some already */
409 if (*cmp == NULL) {
410 new_one:
411 nm = tcp_lro_get_last_if_ackcmp(lc, le, tp, &n_mbuf,
412 can_append_old_cmp);
413 if (__predict_false(nm == NULL))
414 goto done;
415 *cmp = nm;
416 if (n_mbuf) {
417 /*
418 * Link in the new cmp ack to our in-order place,
419 * first set our cmp ack's next to where we are.
420 */
421 nm->m_nextpkt = m;
422 (*pp) = nm;
423 /*
424 * Set it up so mv_to is advanced to our
425 * compressed ack. This way the caller can
426 * advance pp to the right place.
427 */
428 *mv_to = nm;
429 /*
430 * Advance it here locally as well.
431 */
432 pp = &nm->m_nextpkt;
433 }
434 } else {
435 /* We have one already we are working on */
436 nm = *cmp;
437 if (M_TRAILINGSPACE(nm) < sizeof(struct tcp_ackent)) {
438 /* We ran out of space */
439 tp->t_flags2 |= TF2_MBUF_L_ACKS;
440 goto new_one;
441 }
442 }
443 MPASS(M_TRAILINGSPACE(nm) >= sizeof(struct tcp_ackent));
444 counter_u64_add(tcp_inp_lro_compressed, 1);
445 le->compressed++;
446 /* We can add in to the one on the tail */
447 ack_ent = mtod(nm, struct tcp_ackent *);
448 idx = (nm->m_len / sizeof(struct tcp_ackent));
449 build_ack_entry(&ack_ent[idx], th, m, ts_ptr, iptos);
450
451 /* Bump the size of both pkt-hdr and len */
452 nm->m_len += sizeof(struct tcp_ackent);
453 nm->m_pkthdr.len += sizeof(struct tcp_ackent);
454 compressed:
455 /* Advance to next mbuf before freeing. */
456 *pp = m->m_nextpkt;
457 m->m_nextpkt = NULL;
458 m_freem(m);
459 return (true);
460 done:
461 counter_u64_add(tcp_uncomp_total, 1);
462 le->uncompressed++;
463 return (false);
464 }
465
/*
 * Splice the whole mbuf chain accumulated on the LRO entry onto the
 * connection's input queue in one operation, then detach the chain
 * from the entry.  Caller must hold the inpcb write lock.
 */
static void
tcp_queue_pkts(struct tcpcb *tp, struct lro_entry *le)
{

	INP_WLOCK_ASSERT(tptoinpcb(tp));

	/*
	 * Build a temporary STAILQ head on the stack referencing the
	 * existing chain (first mbuf, and the tail's next-pointer slot)
	 * so the entire list can move via a single STAILQ_CONCAT.
	 */
	STAILQ_HEAD(, mbuf) q = { le->m_head,
	    &STAILQ_NEXT(le->m_last_mbuf, m_stailqpkt) };
	STAILQ_CONCAT(&tp->t_inqueue, &q);
	le->m_head = NULL;
	le->m_last_mbuf = NULL;
}
478
479 static struct tcpcb *
tcp_lro_lookup(struct ifnet * ifp,struct lro_parser * pa)480 tcp_lro_lookup(struct ifnet *ifp, struct lro_parser *pa)
481 {
482 struct inpcb *inp;
483
484 CURVNET_ASSERT_SET();
485 switch (pa->data.lro_type) {
486 #ifdef INET6
487 case LRO_TYPE_IPV6_TCP:
488 inp = in6_pcblookup(&V_tcbinfo,
489 &pa->data.s_addr.v6,
490 pa->data.s_port,
491 &pa->data.d_addr.v6,
492 pa->data.d_port,
493 INPLOOKUP_WLOCKPCB,
494 ifp);
495 break;
496 #endif
497 #ifdef INET
498 case LRO_TYPE_IPV4_TCP:
499 inp = in_pcblookup(&V_tcbinfo,
500 pa->data.s_addr.v4,
501 pa->data.s_port,
502 pa->data.d_addr.v4,
503 pa->data.d_port,
504 INPLOOKUP_WLOCKPCB,
505 ifp);
506 break;
507 #endif
508 default:
509 return (NULL);
510 }
511
512 return (intotcpcb(inp));
513 }
514
/*
 * Flush an LRO entry directly to the owning TCP connection's input
 * queue (the HPTS/mbuf-queue fast path), compressing pure ACKs along
 * the way.  Returns 0 on success (packets queued and/or consumed) or
 * TCP_LRO_CANNOT when the entry must fall back to the normal input
 * path (e.g. VLAN/tunnel traffic, pfil hooks armed, no matching pcb,
 * or a stack that does not support mbuf queueing).
 */
static int
_tcp_lro_flush_tcphpts(struct lro_ctrl *lc, struct lro_entry *le)
{
	struct tcpcb *tp;
	struct mbuf **pp, *cmp, *mv_to;
	struct ifnet *lagg_ifp;
	bool bpf_req, lagg_bpf_req, should_wake, can_append_old_cmp;

	/* Check if packet doesn't belongs to our network interface. */
	if ((tcplro_stacks_wanting_mbufq == 0) ||
	    (le->outer.data.vlan_id != 0) ||
	    (le->inner.data.lro_type != LRO_TYPE_NONE))
		return (TCP_LRO_CANNOT);

#ifdef INET6
	/*
	 * Be proactive about unspecified IPv6 address in source. As
	 * we use all-zero to indicate unbounded/unconnected pcb,
	 * unspecified IPv6 address can be used to confuse us.
	 *
	 * Note that packets with unspecified IPv6 destination is
	 * already dropped in ip6_input.
	 */
	if (__predict_false(le->outer.data.lro_type == LRO_TYPE_IPV6_TCP &&
	    IN6_IS_ADDR_UNSPECIFIED(&le->outer.data.s_addr.v6)))
		return (TCP_LRO_CANNOT);

	if (__predict_false(le->inner.data.lro_type == LRO_TYPE_IPV6_TCP &&
	    IN6_IS_ADDR_UNSPECIFIED(&le->inner.data.s_addr.v6)))
		return (TCP_LRO_CANNOT);
#endif

	CURVNET_SET(lc->ifp->if_vnet);
	/*
	 * Ensure that there are no packet filter hooks which would normally
	 * being triggered in ether_demux(), ip_input(), or ip6_input().
	 */
	if (
#ifdef INET
	    PFIL_HOOKED_IN(V_inet_pfil_head) ||
#endif
#ifdef INET6
	    PFIL_HOOKED_IN(V_inet6_pfil_head) ||
#endif
	    PFIL_HOOKED_IN(V_link_pfil_head)) {
		CURVNET_RESTORE();
		return (TCP_LRO_CANNOT);
	}

	/* Lookup inp, if any.  Returns locked TCP inpcb. */
	tp = tcp_lro_lookup(lc->ifp,
	    (le->inner.data.lro_type == LRO_TYPE_NONE) ? &le->outer : &le->inner);
	CURVNET_RESTORE();
	if (tp == NULL)
		return (TCP_LRO_CANNOT);

	counter_u64_add(tcp_inp_lro_locks_taken, 1);

	/* Check if the inp is dead, Jim. */
	if (tp->t_state == TCPS_TIME_WAIT) {
		INP_WUNLOCK(tptoinpcb(tp));
		return (TCP_LRO_CANNOT);
	}
	/* Remember the CPU LRO traffic arrives on, for HPTS affinity. */
	if (tp->t_lro_cpu == HPTS_CPU_NONE && lc->lro_cpu_is_set == 1)
		tp->t_lro_cpu = lc->lro_last_cpu;
	/* Check if the transport doesn't support the needed optimizations. */
	if ((tp->t_flags2 & (TF2_SUPPORTS_MBUFQ | TF2_MBUF_ACKCMP)) == 0) {
		INP_WUNLOCK(tptoinpcb(tp));
		return (TCP_LRO_CANNOT);
	}

	if (tp->t_flags2 & TF2_MBUF_QUEUE_READY)
		should_wake = false;
	else
		should_wake = true;
	/* Check if packets should be tapped to BPF. */
	bpf_req = bpf_peers_present(lc->ifp->if_bpf);
	lagg_bpf_req = false;
	lagg_ifp = NULL;
	if (lc->ifp->if_type == IFT_IEEE8023ADLAG ||
	    lc->ifp->if_type == IFT_INFINIBANDLAG) {
		/* Also tap the parent lagg interface's BPF listeners. */
		struct lagg_port *lp = lc->ifp->if_lagg;
		struct lagg_softc *sc = lp->lp_softc;

		lagg_ifp = sc->sc_ifp;
		if (lagg_ifp != NULL)
			lagg_bpf_req = bpf_peers_present(lagg_ifp->if_bpf);
	}

	/* Strip and compress all the incoming packets. */
	can_append_old_cmp = true;
	cmp = NULL;
	for (pp = &le->m_head; *pp != NULL; ) {
		mv_to = NULL;
		if (do_bpf_strip_and_compress(tp, lc, le, pp, &cmp, &mv_to,
		    &should_wake, bpf_req, lagg_bpf_req, lagg_ifp,
		    can_append_old_cmp) == false) {
			/* Advance to next mbuf. */
			pp = &(*pp)->m_nextpkt;
			/*
			 * Once we have appended we can't look in the pending
			 * inbound packets for a compressed ack to append to.
			 */
			can_append_old_cmp = false;
			/*
			 * Once we append we also need to stop adding to any
			 * compressed ack we were remembering. A new cmp
			 * ack will be required.
			 */
			cmp = NULL;
#ifdef TCP_BLACKBOX
			tcp_lro_log(tp, lc, le, NULL, 25, 0, 0, 0, 0);
#endif
		} else if (mv_to != NULL) {
			/* We are asked to move pp up */
			pp = &mv_to->m_nextpkt;
#ifdef TCP_BLACKBOX
			tcp_lro_log(tp, lc, le, NULL, 24, 0, 0, 0, 0);
		} else
			tcp_lro_log(tp, lc, le, NULL, 26, 0, 0, 0, 0);
#else
		}
#endif
	}
	/* Update "m_last_mbuf", if any. */
	if (pp == &le->m_head)
		le->m_last_mbuf = *pp;
	else
		le->m_last_mbuf = __containerof(pp, struct mbuf, m_nextpkt);

	/* Check if any data mbufs left. */
	if (le->m_head != NULL) {
		counter_u64_add(tcp_inp_lro_direct_queue, 1);
#ifdef TCP_BLACKBOX
		tcp_lro_log(tp, lc, le, NULL, 22, 1, tp->t_flags2, 0, 1);
#endif
		tcp_queue_pkts(tp, le);
	}
	if (should_wake) {
		/* Wakeup */
		counter_u64_add(tcp_inp_lro_wokeup_queue, 1);
		if ((*tp->t_fb->tfb_do_queued_segments)(tp, 0))
			/* TCP cb gone and unlocked. */
			return (0);
	}
	INP_WUNLOCK(tptoinpcb(tp));

	return (0);	/* Success. */
}
664
/*
 * Publish the HPTS flush routine through the tcp_lro_flush_tcphpts
 * function pointer so the generic LRO code starts using this path.
 *
 * NOTE(review): the store here is plain while the uninit path uses
 * atomic_store_ptr(); presumably init runs before any concurrent
 * readers exist -- confirm against module load ordering.
 */
void
tcp_lro_hpts_init(void)
{
	tcp_lro_flush_tcphpts = _tcp_lro_flush_tcphpts;
}
670
/*
 * Unhook the HPTS flush routine; the atomic store ensures concurrent
 * readers of the pointer observe either the old function or NULL.
 */
void
tcp_lro_hpts_uninit(void)
{
	atomic_store_ptr(&tcp_lro_flush_tcphpts, NULL);
}
676