1 /*-
2 * Copyright (c) 2016-2020 Netflix, Inc.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23 * SUCH DAMAGE.
24 *
25 */
26 /*
27 * Author: Randall Stewart <rrs@netflix.com>
28 * This work is based on the ACM Queue paper
29 * BBR: Congestion-Based Congestion Control
30 * and also numerous discussions with Neal, Yuchung and Van.
31 */
32
33 #include <sys/cdefs.h>
34 #include "opt_inet.h"
35 #include "opt_inet6.h"
36 #include "opt_ipsec.h"
37 #include "opt_ratelimit.h"
38 #include <sys/param.h>
39 #include <sys/arb.h>
40 #include <sys/module.h>
41 #include <sys/kernel.h>
42 #ifdef TCP_HHOOK
43 #include <sys/hhook.h>
44 #endif
45 #include <sys/malloc.h>
46 #include <sys/mbuf.h>
47 #include <sys/proc.h>
48 #include <sys/qmath.h>
49 #include <sys/socket.h>
50 #include <sys/socketvar.h>
51 #include <sys/sysctl.h>
52 #include <sys/systm.h>
53 #include <sys/tree.h>
54 #ifdef NETFLIX_STATS
55 #include <sys/stats.h> /* Must come after qmath.h and tree.h */
56 #endif
57 #include <sys/refcount.h>
58 #include <sys/queue.h>
59 #include <sys/smp.h>
60 #include <sys/kthread.h>
61 #include <sys/lock.h>
62 #include <sys/mutex.h>
63 #include <sys/tim_filter.h>
64 #include <sys/time.h>
65 #include <vm/uma.h>
66 #include <sys/kern_prefetch.h>
67
68 #include <net/route.h>
69 #include <net/vnet.h>
70 #include <net/ethernet.h>
71 #include <net/bpf.h>
72
73 #define TCPSTATES /* for logging */
74
75 #include <netinet/in.h>
76 #include <netinet/in_kdtrace.h>
77 #include <netinet/in_pcb.h>
78 #include <netinet/ip.h>
79 #include <netinet/ip_var.h>
80 #include <netinet/ip6.h>
81 #include <netinet6/in6_pcb.h>
82 #include <netinet6/ip6_var.h>
83 #include <netinet/tcp.h>
84 #include <netinet/tcp_fsm.h>
85 #include <netinet/tcp_seq.h>
86 #include <netinet/tcp_timer.h>
87 #include <netinet/tcp_var.h>
88 #include <netinet/tcpip.h>
89 #include <netinet/tcp_ecn.h>
90 #include <netinet/tcp_hpts.h>
91 #include <netinet/tcp_lro.h>
92 #include <netinet/cc/cc.h>
93 #include <netinet/tcp_log_buf.h>
94 #ifdef TCP_OFFLOAD
95 #include <netinet/tcp_offload.h>
96 #endif
97 #ifdef INET6
98 #include <netinet6/tcp6_var.h>
99 #endif
100 #include <netinet/tcp_fastopen.h>
101
102 #include <netipsec/ipsec_support.h>
103 #include <net/if.h>
104 #include <net/if_var.h>
105 #include <net/if_private.h>
106
107 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
108 #include <netipsec/ipsec.h>
109 #include <netipsec/ipsec6.h>
110 #endif /* IPSEC */
111
112 #include <netinet/udp.h>
113 #include <netinet/udp_var.h>
114 #include <machine/in_cksum.h>
115
116 #ifdef MAC
117 #include <security/mac/mac_framework.h>
118 #endif
119 #include "rack_bbr_common.h"
120
121 /*
122 * Common TCP Functions - These are shared by both
123 * rack and BBR.
124 */
125 static int
126 ctf_get_enet_type(struct ifnet *ifp, struct mbuf *m)
127 {
128 struct ether_header *eh;
129 #ifdef INET6
130 struct ip6_hdr *ip6 = NULL; /* Keep compiler happy. */
131 #endif
132 #ifdef INET
133 struct ip *ip = NULL; /* Keep compiler happy. */
134 #endif
135 #if defined(INET) || defined(INET6)
136 struct tcphdr *th;
137 int32_t tlen;
138 uint16_t drop_hdrlen;
139 #endif
140 uint16_t etype;
141 #ifdef INET
142 uint8_t iptos;
143 #endif
144
145 /* Is it the easy way? */
146 if (m->m_flags & M_LRO_EHDRSTRP)
147 return (m->m_pkthdr.lro_etype);
148 /*
149 * Ok this is the old style call, the ethernet header is here.
150 * This also means no checksum or BPF were done. This
151 * can happen if the race to setup the inp fails and
152 * LRO sees no INP at packet input, but by the time
153 * we queue the packets an INP gets there. It's rare
154 * but it can occur so we will handle it. Note that
155 * this means duplicated work but with the rarity of it
156 * it's not worth worrying about.
157 */
158 /* Let the BPF see the packet */
159 if (bpf_peers_present(ifp->if_bpf))
160 ETHER_BPF_MTAP(ifp, m);
161 /* Now the csum */
162 eh = mtod(m, struct ether_header *);
163 etype = ntohs(eh->ether_type);
164 m_adj(m, sizeof(*eh));
165 switch (etype) {
166 #ifdef INET6
167 case ETHERTYPE_IPV6:
168 {
169 if (m->m_len < (sizeof(*ip6) + sizeof(*th))) {
170 m = m_pullup(m, sizeof(*ip6) + sizeof(*th));
171 if (m == NULL) {
172 KMOD_TCPSTAT_INC(tcps_rcvshort);
173 return (-1);
174 }
175 }
176 ip6 = (struct ip6_hdr *)(eh + 1);
177 th = (struct tcphdr *)(ip6 + 1);
178 drop_hdrlen = sizeof(*ip6);
179 tlen = ntohs(ip6->ip6_plen);
180 if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID_IPV6) {
181 if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
182 th->th_sum = m->m_pkthdr.csum_data;
183 else
184 th->th_sum = in6_cksum_pseudo(ip6, tlen,
185 IPPROTO_TCP,
186 m->m_pkthdr.csum_data);
187 th->th_sum ^= 0xffff;
188 } else
189 th->th_sum = in6_cksum(m, IPPROTO_TCP, drop_hdrlen, tlen);
190 if (th->th_sum) {
191 KMOD_TCPSTAT_INC(tcps_rcvbadsum);
192 m_freem(m);
193 return (-1);
194 }
195 return (etype);
196 }
197 #endif
198 #ifdef INET
199 case ETHERTYPE_IP:
200 {
201 if (m->m_len < sizeof (struct tcpiphdr)) {
202 m = m_pullup(m, sizeof (struct tcpiphdr));
203 if (m == NULL) {
204 KMOD_TCPSTAT_INC(tcps_rcvshort);
205 return (-1);
206 }
207 }
208 ip = (struct ip *)(eh + 1);
209 th = (struct tcphdr *)(ip + 1);
210 drop_hdrlen = sizeof(*ip);
211 iptos = ip->ip_tos;
212 tlen = ntohs(ip->ip_len) - sizeof(struct ip);
213 if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
214 if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
215 th->th_sum = m->m_pkthdr.csum_data;
216 else
217 th->th_sum = in_pseudo(ip->ip_src.s_addr,
218 ip->ip_dst.s_addr,
219 htonl(m->m_pkthdr.csum_data + tlen + IPPROTO_TCP));
220 th->th_sum ^= 0xffff;
221 } else {
222 int len;
223 struct ipovly *ipov = (struct ipovly *)ip;
224 /*
225 * Checksum extended TCP header and data.
226 */
227 len = drop_hdrlen + tlen;
228 bzero(ipov->ih_x1, sizeof(ipov->ih_x1));
229 ipov->ih_len = htons(tlen);
230 th->th_sum = in_cksum(m, len);
231 /* Reset length for SDT probes. */
232 ip->ip_len = htons(len);
233 /* Reset TOS bits */
234 ip->ip_tos = iptos;
235 /* Re-initialization for later version check */
236 ip->ip_v = IPVERSION;
237 ip->ip_hl = sizeof(*ip) >> 2;
238 }
239 if (th->th_sum) {
240 KMOD_TCPSTAT_INC(tcps_rcvbadsum);
241 m_freem(m);
242 return (-1);
243 }
244 break;
245 }
246 #endif
247 };
248 return (etype);
249 }
250
251 /*
252 * The function ctf_process_inbound_raw() is used by
253 * transport developers to do the steps needed to
254 * support MBUF Queuing i.e. the flags in
255 * tp->t_flags2:
256 *
257 * - TF2_SUPPORTS_MBUFQ
258 * - TF2_MBUF_QUEUE_READY
259 * - TF2_DONT_SACK_QUEUE
260 * - TF2_MBUF_ACKCMP
261 *
262 * These flags help control how LRO will deliver
263 * packets to the transport. You first set in t_flags2
264 * TF2_SUPPORTS_MBUFQ to tell the LRO code that you
265 * will gladly take a queue of packets instead of a compressed
266 * single packet. You also set in your t_fb pointer the
267 * tfb_do_queued_segments to point to ctf_do_queued_segments (a sketch follows this comment).
268 *
269 * This then gets you lists of inbound ACKs/Data instead
270 * of a condensed compressed ACK/DATA packet. Why would you
271 * want that? This will get you access to the arrival
272 * times of the actual ACK/DATA, as recorded by LRO and
273 * possibly by the hardware (if the interface card supports that).
274 * In some transport designs this is important since knowing
275 * the actual time we got the packet is useful information.
276 *
277 * A new special type of mbuf may also be supported by the transport
278 * if it has set the TF2_MBUF_ACKCMP flag. If it is set, LRO will
279 * possibly create an M_ACKCMP type mbuf. This is an mbuf with
280 * an array of "acks". One thing also to note is that when this
281 * occurs a subsequent LRO may find at the back of the untouched
282 * mbuf queue chain an M_ACKCMP and append to it. This means
283 * that until the transport pulls in the mbuf chain queued
284 * for it, more acks may be added to the mbufs that were already
285 * delivered. There currently is a limit of 6 acks condensed
286 * into 1 mbuf, which means that often, when this is occurring, we
287 * don't get that effect, but it does happen.
288 *
289 * Now there are some interesting Caveats that the transport
290 * designer needs to take into account when using this feature.
291 *
292 * 1) It is used with HPTS and pacing; when the pacing timer
293 * for output fires, it will first call the input path.
294 * 2) When you set TF2_MBUF_QUEUE_READY, this tells LRO to
295 * queue normal packets: I am busy pacing out data and
296 * will process the queued packets before my tfb_tcp_output
297 * call from pacing. If a non-normal packet arrives (e.g. a SACK),
298 * you will be awoken immediately.
299 * 3) Finally you can add TF2_DONT_SACK_QUEUE to not even
300 * be awoken if a SACK has arrived. You would do this when
301 * you are running not only a pacing-for-output timer
302 * but a Rack timer as well, i.e. you know you are in recovery
303 * and are in the process (via the timers) of dealing with
304 * the loss.
305 *
306 * Now a critical thing you must be aware of here is that the
307 * use of the flags has a far greater scope than just your
308 * typical LRO. Why? Well that's because in the normal compressed
309 * LRO case, at the end of a driver interrupt, all packets are going
310 * to get presented to the transport no matter if there is one
311 * or 100. With the MBUF_QUEUE model, this is not true. You will
312 * only be awoken to process the queue of packets when:
313 * a) The flags discussed above allow it.
314 * <or>
315 * b) You exceed an ack or data limit (by default the
316 * ack limit is effectively infinite (64k acks) and the data
317 * limit is 64k of new TCP data)
318 * <or>
319 * c) The push bit has been set by the peer
320 */
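/*
 * As an illustrative sketch only (the stack name "foo" and its init
 * routine are hypothetical, not code from this file), the opt-in
 * described above amounts to roughly the following in a stack's
 * init routine and function block:
 *
 *	static int
 *	foo_init(struct tcpcb *tp, void **ptr)
 *	{
 *		...
 *		tp->t_flags2 |= TF2_SUPPORTS_MBUFQ;
 *		...
 *		return (0);
 *	}
 *
 *	static struct tcp_function_block __tcp_foo = {
 *		...
 *		.tfb_do_queued_segments = ctf_do_queued_segments,
 *		...
 *	};
 *
 * ctf_do_queued_segments() drains tp->t_inqueue and hands each packet
 * to the stack through ctf_process_inbound_raw() below.
 */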
321
322 static int
323 ctf_process_inbound_raw(struct tcpcb *tp, struct mbuf *m, int has_pkt)
324 {
325 /*
326 * We are passed a raw chain of mbuf packets
327 * that arrived in LRO. They are linked via
328 * the m_nextpkt link in the pkt-headers.
329 *
330 * We process each one by:
331 * a) saving off the next
332 * b) stripping off the ether-header
333 * c) formulating the arguments for tfb_do_segment_nounlock()
334 * d) calling tfb_do_segment_nounlock() on each mbuf
335 * after adjusting the time to match the arrival time.
336 * Note that the LRO code assures no IP options are present.
337 *
338 * The semantics for calling tfb_do_segment_nounlock() are the
339 * following:
340 * 1) It returns 0 if all went well and you (the caller) need
341 * to release the lock.
342 * 2) If nxt_pkt is set, then the function will suppress calls
343 * to tcp_output() since you are promising to call again
344 * with another packet.
345 * 3) If it returns 1, then you must free all the packets being
346 * shipped in; the tcb has been destroyed (or is about to be destroyed).
347 */
348 struct mbuf *m_save;
349 struct tcphdr *th;
350 #ifdef INET6
351 struct ip6_hdr *ip6 = NULL; /* Keep compiler happy. */
352 #endif
353 #ifdef INET
354 struct ip *ip = NULL; /* Keep compiler happy. */
355 #endif
356 struct ifnet *ifp;
357 struct timeval tv;
358 struct inpcb *inp __diagused;
359 int32_t retval, nxt_pkt, tlen, off;
360 int etype = 0;
361 uint16_t drop_hdrlen;
362 uint8_t iptos;
363
364 inp = tptoinpcb(tp);
365 INP_WLOCK_ASSERT(inp);
366 NET_EPOCH_ASSERT();
367 KASSERT(m != NULL, ("ctf_process_inbound_raw: m == NULL"));
368 ifp = m_rcvif(m);
369 KASSERT(ifp != NULL, ("ctf_process_inbound_raw: ifp == NULL"));
370 CURVNET_SET(ifp->if_vnet);
371 tcp_get_usecs(&tv);
372 while (m) {
373 m_save = m->m_nextpkt;
374 m->m_nextpkt = NULL;
375 if ((m->m_flags & M_ACKCMP) == 0) {
376 /* Now let's get the ether header */
377 etype = ctf_get_enet_type(ifp, m);
378 if (etype == -1) {
379 /* Skip this packet, it was freed by the checksum/pullup code */
380 goto skipped_pkt;
381 }
382 KASSERT(((etype == ETHERTYPE_IPV6) || (etype == ETHERTYPE_IP)),
383 ("tp:%p m:%p etype:0x%x -- not IP or IPv6", tp, m, etype));
384 /* Trim off the ethernet header */
385 switch (etype) {
386 #ifdef INET6
387 case ETHERTYPE_IPV6:
388 ip6 = mtod(m, struct ip6_hdr *);
389 th = (struct tcphdr *)(ip6 + 1);
390 tlen = ntohs(ip6->ip6_plen);
391 drop_hdrlen = sizeof(*ip6);
392 iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
393 break;
394 #endif
395 #ifdef INET
396 case ETHERTYPE_IP:
397 ip = mtod(m, struct ip *);
398 th = (struct tcphdr *)(ip + 1);
399 drop_hdrlen = sizeof(*ip);
400 iptos = ip->ip_tos;
401 tlen = ntohs(ip->ip_len) - sizeof(struct ip);
402 break;
403 #endif
404 } /* end switch */
405 off = th->th_off << 2;
406 if (off < sizeof (struct tcphdr) || off > tlen) {
407 printf("off:%d < hdrlen:%zu || > tlen:%u -- dump\n",
408 off,
409 sizeof(struct tcphdr),
410 tlen);
411 KMOD_TCPSTAT_INC(tcps_rcvbadoff);
412 m_freem(m);
413 goto skipped_pkt;
414 }
415 tlen -= off;
416 drop_hdrlen += off;
417 /*
418 * Now let's set up the timeval to be when we should
419 * have been called (if we can).
420 */
421 m->m_pkthdr.lro_nsegs = 1;
422 /* Now what about next packet? */
423 } else {
424 /*
425 * This mbuf is an array of acks that have
426 * been compressed. We assert the inp has
427 * the flag set to enable this!
428 */
429 KASSERT((tp->t_flags2 & TF2_MBUF_ACKCMP),
430 ("tp:%p no TF2_MBUF_ACKCMP flags?", tp));
431 tlen = 0;
432 drop_hdrlen = 0;
433 th = NULL;
434 iptos = 0;
435 }
436 tcp_get_usecs(&tv);
437 if (m_save || has_pkt)
438 nxt_pkt = 1;
439 else
440 nxt_pkt = 0;
441 if ((m->m_flags & M_ACKCMP) == 0)
442 KMOD_TCPSTAT_INC(tcps_rcvtotal);
443 else
444 KMOD_TCPSTAT_ADD(tcps_rcvtotal, (m->m_len / sizeof(struct tcp_ackent)));
445 retval = (*tp->t_fb->tfb_do_segment_nounlock)(tp, m, th,
446 drop_hdrlen, tlen, iptos, nxt_pkt, &tv);
447 if (retval) {
448 /* We lost the lock and probably the tcb */
449 m = m_save;
450 while(m) {
451 m_save = m->m_nextpkt;
452 m->m_nextpkt = NULL;
453 m_freem(m);
454 m = m_save;
455 }
456 CURVNET_RESTORE();
457 INP_UNLOCK_ASSERT(inp);
458 return (retval);
459 }
460 skipped_pkt:
461 m = m_save;
462 }
463 CURVNET_RESTORE();
464 return (0);
465 }
466
467 int
468 ctf_do_queued_segments(struct tcpcb *tp, int have_pkt)
469 {
470 struct mbuf *m;
471
472 /* First let's see if we have old packets */
473 if ((m = STAILQ_FIRST(&tp->t_inqueue)) != NULL) {
474 STAILQ_INIT(&tp->t_inqueue);
475 if (ctf_process_inbound_raw(tp, m, have_pkt)) {
476 /* We lost the tcpcb (maybe a RST came in)? */
477 return(1);
478 }
479 }
480 return (0);
481 }
482
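/*
 * Return the number of sequence bytes currently outstanding
 * (snd_max - snd_una), counting a not-yet-completed SYN (any state
 * before ESTABLISHED) and a sent FIN as one byte each, since both
 * occupy sequence space.
 */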
483 uint32_t
484 ctf_outstanding(struct tcpcb *tp)
485 {
486 uint32_t bytes_out;
487
488 bytes_out = tp->snd_max - tp->snd_una;
489 if (tp->t_state < TCPS_ESTABLISHED)
490 bytes_out++;
491 if (tp->t_flags & TF_SENTFIN)
492 bytes_out++;
493 return (bytes_out);
494 }
495
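/*
 * Return the amount of data considered to be in flight: the
 * outstanding bytes minus what the peer has SACKed, clamped at zero.
 */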
496 uint32_t
497 ctf_flight_size(struct tcpcb *tp, uint32_t rc_sacked)
498 {
499 if (rc_sacked <= ctf_outstanding(tp))
500 return(ctf_outstanding(tp) - rc_sacked);
501 else {
502 return (0);
503 }
504 }
505
506 void
507 ctf_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th,
508 int32_t tlen)
509 {
510 tcp_dropwithreset(m, th, tp, tlen);
511 if (tp != NULL)
512 INP_WUNLOCK(tptoinpcb(tp));
513 }
514
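/*
 * Decide whether we may send a (challenge) ACK right now. When both
 * V_tcp_ack_war_time_window and V_tcp_ack_war_cnt are non-zero, at
 * most V_tcp_ack_war_cnt ACKs are allowed per
 * V_tcp_ack_war_time_window milliseconds; within that budget
 * TF_ACKNOW is set, otherwise it is cleared so the caller stays
 * silent. With the limits disabled we always set TF_ACKNOW.
 */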
515 void
516 ctf_ack_war_checks(struct tcpcb *tp)
517 {
518 sbintime_t now;
519
520 if ((V_tcp_ack_war_time_window > 0) && (V_tcp_ack_war_cnt > 0)) {
521 now = getsbinuptime();
522 if (tp->t_challenge_ack_end < now) {
523 tp->t_challenge_ack_cnt = 0;
524 tp->t_challenge_ack_end = now +
525 V_tcp_ack_war_time_window * SBT_1MS;
526 }
527 if (tp->t_challenge_ack_cnt < V_tcp_ack_war_cnt) {
528 tp->t_challenge_ack_cnt++;
529 tp->t_flags |= TF_ACKNOW;
530 } else
531 tp->t_flags &= ~TF_ACKNOW;
532 } else
533 tp->t_flags |= TF_ACKNOW;
534 }
535
536 /*
537 * ctf_drop_checks returns 1 when you should not proceed. It places
538 * in ret_val what should be returned 1/0 by the caller. The 1 indicates
539 * that the TCB is unlocked and probably dropped. The 0 indicates the
540 * TCB is still valid and locked.
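 *
 * As a worked example (illustrative numbers only): with rcv_nxt at
 * 1000, an incoming segment with th_seq 900 and tlen 300 gives
 * todrop = 100; the first 100 duplicate bytes are trimmed
 * (drop_hdrlen grows by 100, th_seq becomes 1000, tlen becomes 200)
 * and the segment is counted as a partial duplicate.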
541 */
542 int
543 ctf_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th,
544 struct tcpcb *tp, int32_t *tlenp,
545 int32_t *thf, int32_t *drop_hdrlen, int32_t *ret_val)
546 {
547 int32_t todrop;
548 int32_t thflags;
549 int32_t tlen;
550
551 thflags = *thf;
552 tlen = *tlenp;
553 todrop = tp->rcv_nxt - th->th_seq;
554 if (todrop > 0) {
555 if (thflags & TH_SYN) {
556 thflags &= ~TH_SYN;
557 th->th_seq++;
558 if (th->th_urp > 1)
559 th->th_urp--;
560 else
561 thflags &= ~TH_URG;
562 todrop--;
563 }
564 /*
565 * Following if statement from Stevens, vol. 2, p. 960.
566 */
567 if (todrop > tlen
568 || (todrop == tlen && (thflags & TH_FIN) == 0)) {
569 /*
570 * Any valid FIN must be to the left of the window.
571 * At this point the FIN must be a duplicate or out
572 * of sequence; drop it.
573 */
574 thflags &= ~TH_FIN;
575 /*
576 * Send an ACK to resynchronize and drop any data.
577 * But keep on processing for RST or ACK.
578 */
579 ctf_ack_war_checks(tp);
580 todrop = tlen;
581 KMOD_TCPSTAT_INC(tcps_rcvduppack);
582 KMOD_TCPSTAT_ADD(tcps_rcvdupbyte, todrop);
583 } else {
584 KMOD_TCPSTAT_INC(tcps_rcvpartduppack);
585 KMOD_TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop);
586 }
587 /*
588 * DSACK - add SACK block for dropped range
589 */
590 if ((todrop > 0) && (tp->t_flags & TF_SACK_PERMIT)) {
591 /*
592 * ACK now, as the next in-sequence segment
593 * will clear the DSACK block again
594 */
595 ctf_ack_war_checks(tp);
596 if (tp->t_flags & TF_ACKNOW)
597 tcp_update_sack_list(tp, th->th_seq,
598 th->th_seq + todrop);
599 }
600 *drop_hdrlen += todrop; /* drop from the top afterwards */
601 th->th_seq += todrop;
602 tlen -= todrop;
603 if (th->th_urp > todrop)
604 th->th_urp -= todrop;
605 else {
606 thflags &= ~TH_URG;
607 th->th_urp = 0;
608 }
609 }
610 /*
611 * If segment ends after window, drop trailing data (and PUSH and
612 * FIN); if nothing left, just ACK.
613 */
614 todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd);
615 if (todrop > 0) {
616 KMOD_TCPSTAT_INC(tcps_rcvpackafterwin);
617 if (todrop >= tlen) {
618 KMOD_TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen);
619 /*
620 * If window is closed can only take segments at
621 * window edge, and have to drop data and PUSH from
622 * incoming segments. Continue processing, but
623 * remember to ack. Otherwise, drop segment and
624 * ack.
625 */
626 if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
627 ctf_ack_war_checks(tp);
628 KMOD_TCPSTAT_INC(tcps_rcvwinprobe);
629 } else {
630 ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val);
631 return (1);
632 }
633 } else
634 KMOD_TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
635 m_adj(m, -todrop);
636 tlen -= todrop;
637 thflags &= ~(TH_PUSH | TH_FIN);
638 }
639 *thf = thflags;
640 *tlenp = tlen;
641 return (0);
642 }
643
644 /*
645 * The value in ret_val informs the caller
646 * if we dropped the tcb (and lock) or not.
647 * 1 = we dropped it, 0 = the TCB is still locked
648 * and valid.
649 */
650 void
651 ctf_do_dropafterack(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t *ret_val)
652 {
653 /*
654 * Generate an ACK dropping incoming segment if it occupies sequence
655 * space, where the ACK reflects our state.
656 *
657 * We can now skip the test for the RST flag since all paths to this
658 * code happen after packets containing RST have been dropped.
659 *
660 * In the SYN-RECEIVED state, don't send an ACK unless the segment
661 * we received passes the SYN-RECEIVED ACK test. If it fails send a
662 * RST. This breaks the loop in the "LAND" DoS attack, and also
663 * prevents an ACK storm between two listening ports that have been
664 * sent forged SYN segments, each with the source address of the
665 * other.
666 */
667 if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
668 (SEQ_GT(tp->snd_una, th->th_ack) ||
669 SEQ_GT(th->th_ack, tp->snd_max))) {
670 *ret_val = 1;
671 ctf_do_dropwithreset(m, tp, th, tlen);
672 return;
673 } else
674 *ret_val = 0;
675 ctf_ack_war_checks(tp);
676 if (m)
677 m_freem(m);
678 }
679
680 void
681 ctf_do_drop(struct mbuf *m, struct tcpcb *tp)
682 {
683
684 /*
685 * Drop space held by incoming segment and return.
686 */
687 if (tp != NULL)
688 INP_WUNLOCK(tptoinpcb(tp));
689 if (m)
690 m_freem(m);
691 }
692
693 int
694 ctf_process_rst(struct mbuf *m, struct tcphdr *th, struct socket *so,
695 struct tcpcb *tp)
696 {
697 /*
698 * RFC5961 Section 3.2
699 *
700 * - RST drops connection only if SEG.SEQ == RCV.NXT.
701 * - If RST is in window, we send challenge ACK.
702 *
703 * Note: to take into account delayed ACKs, we should test against
704 * last_ack_sent instead of rcv_nxt. Note 2: we handle the special case
705 * of a closed window, not covered by the RFC.
706 */
707 int dropped = 0;
708
709 if ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
710 SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) ||
711 (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) {
712 KASSERT(tp->t_state != TCPS_SYN_SENT,
713 ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p",
714 __func__, th, tp));
715
716 if (V_tcp_insecure_rst ||
717 (tp->last_ack_sent == th->th_seq) ||
718 (tp->rcv_nxt == th->th_seq)) {
719 KMOD_TCPSTAT_INC(tcps_drops);
720 /* Drop the connection. */
721 switch (tp->t_state) {
722 case TCPS_SYN_RECEIVED:
723 so->so_error = ECONNREFUSED;
724 goto close;
725 case TCPS_ESTABLISHED:
726 case TCPS_FIN_WAIT_1:
727 case TCPS_FIN_WAIT_2:
728 case TCPS_CLOSE_WAIT:
729 case TCPS_CLOSING:
730 case TCPS_LAST_ACK:
731 so->so_error = ECONNRESET;
732 close:
733 tcp_state_change(tp, TCPS_CLOSED);
734 /* FALLTHROUGH */
735 default:
736 tcp_log_end_status(tp, TCP_EI_STATUS_CLIENT_RST);
737 tp = tcp_close(tp);
738 }
739 dropped = 1;
740 ctf_do_drop(m, tp);
741 } else {
742 KMOD_TCPSTAT_INC(tcps_badrst);
743 tcp_send_challenge_ack(tp, th, m);
744 }
745 } else {
746 m_freem(m);
747 }
748 return (dropped);
749 }
750
751 /*
752 * The value in ret_val informs the caller
753 * if we dropped the tcb (and lock) or not.
754 * 1 = we dropped it, 0 = the TCB is still locked
755 * and valid.
756 */
757 void
758 ctf_challenge_ack(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, uint8_t iptos, int32_t * ret_val)
759 {
760
761 NET_EPOCH_ASSERT();
762
763 KMOD_TCPSTAT_INC(tcps_badsyn);
764 if (V_tcp_insecure_syn &&
765 SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
766 SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
767 tp = tcp_drop(tp, ECONNRESET);
768 *ret_val = 1;
769 ctf_do_drop(m, tp);
770 } else {
771 tcp_ecn_input_syn_sent(tp, tcp_get_flags(th), iptos);
772 /* Send challenge ACK. */
773 tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt,
774 tp->snd_nxt, TH_ACK);
775 tp->last_ack_sent = tp->rcv_nxt;
776 m = NULL;
777 *ret_val = 0;
778 ctf_do_drop(m, NULL);
779 }
780 }
781
782 /*
783 * ctf_ts_check returns 1 when you should not proceed and the state
784 * machine should return. It places in ret_val what should
785 * be returned 1/0 by the caller (hpts_do_segment). The 1 indicates
786 * that the TCB is unlocked and probably dropped. The 0 indicates the
787 * TCB is still valid and locked.
788 */
789 int
790 ctf_ts_check(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp,
791 int32_t tlen, int32_t thflags, int32_t * ret_val)
792 {
793
794 if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) {
795 /*
796 * Invalidate ts_recent. If this segment updates ts_recent,
797 * the age will be reset later and ts_recent will get a
798 * valid value. If it does not, setting ts_recent to zero
799 * will at least satisfy the requirement that zero be placed
800 * in the timestamp echo reply when ts_recent isn't valid.
801 * The age isn't reset until we get a valid ts_recent
802 * because we don't want out-of-order segments to be dropped
803 * when ts_recent is old.
804 */
805 tp->ts_recent = 0;
806 } else {
807 KMOD_TCPSTAT_INC(tcps_rcvduppack);
808 KMOD_TCPSTAT_ADD(tcps_rcvdupbyte, tlen);
809 KMOD_TCPSTAT_INC(tcps_pawsdrop);
810 *ret_val = 0;
811 if (tlen) {
812 ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val);
813 } else {
814 ctf_do_drop(m, NULL);
815 }
816 return (1);
817 }
818 return (0);
819 }
820
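/*
 * PAWS-only variant of ctf_ts_check() for callers (such as
 * compressed-ack processing) that have no mbuf or segment to drop.
 * It returns 1 when the segment fails the PAWS test and should be
 * ignored, 0 when ts_recent was merely stale and has been invalidated.
 */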
821 int
822 ctf_ts_check_ac(struct tcpcb *tp, int32_t thflags)
823 {
824
825 if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) {
826 /*
827 * Invalidate ts_recent. If this segment updates ts_recent,
828 * the age will be reset later and ts_recent will get a
829 * valid value. If it does not, setting ts_recent to zero
830 * will at least satisfy the requirement that zero be placed
831 * in the timestamp echo reply when ts_recent isn't valid.
832 * The age isn't reset until we get a valid ts_recent
833 * because we don't want out-of-order segments to be dropped
834 * when ts_recent is old.
835 */
836 tp->ts_recent = 0;
837 } else {
838 KMOD_TCPSTAT_INC(tcps_rcvduppack);
839 KMOD_TCPSTAT_INC(tcps_pawsdrop);
840 return (1);
841 }
842 return (0);
843 }
844
845
846
847 void
848 ctf_calc_rwin(struct socket *so, struct tcpcb *tp)
849 {
850 int32_t win;
851
852 /*
853 * Calculate amount of space in receive window, and then do TCP
854 * input processing. Receive window is amount of space in rcv queue,
855 * but not less than advertised window.
856 */
857 win = sbspace(&so->so_rcv);
858 if (win < 0)
859 win = 0;
860 tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
861 }
862
863 void
864 ctf_do_dropwithreset_conn(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th,
865 int32_t tlen)
866 {
867
868 tcp_dropwithreset(m, th, tp, tlen);
869 tp = tcp_drop(tp, ETIMEDOUT);
870 if (tp)
871 INP_WUNLOCK(tptoinpcb(tp));
872 }
873
874 uint32_t
875 ctf_fixed_maxseg(struct tcpcb *tp)
876 {
877 return (tcp_fixed_maxseg(tp));
878 }
879
880 void
881 ctf_log_sack_filter(struct tcpcb *tp, int num_sack_blks, struct sackblk *sack_blocks)
882 {
883 if (tcp_bblogging_on(tp)) {
884 union tcp_log_stackspecific log;
885 struct timeval tv;
886
887 memset(&log, 0, sizeof(log));
888 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
889 log.u_bbr.flex8 = num_sack_blks;
890 if (num_sack_blks > 0) {
891 log.u_bbr.flex1 = sack_blocks[0].start;
892 log.u_bbr.flex2 = sack_blocks[0].end;
893 }
894 if (num_sack_blks > 1) {
895 log.u_bbr.flex3 = sack_blocks[1].start;
896 log.u_bbr.flex4 = sack_blocks[1].end;
897 }
898 if (num_sack_blks > 2) {
899 log.u_bbr.flex5 = sack_blocks[2].start;
900 log.u_bbr.flex6 = sack_blocks[2].end;
901 }
902 if (num_sack_blks > 3) {
903 log.u_bbr.applimited = sack_blocks[3].start;
904 log.u_bbr.pkts_out = sack_blocks[3].end;
905 }
906 TCP_LOG_EVENTP(tp, NULL,
907 &tptosocket(tp)->so_rcv,
908 &tptosocket(tp)->so_snd,
909 TCP_SACK_FILTER_RES, 0,
910 0, &log, false, &tv);
911 }
912 }
913
914 uint32_t
915 ctf_decay_count(uint32_t count, uint32_t decay)
916 {
917 /*
918 * Given a count, decay it by a set percentage. The
919 * percentage is in thousands i.e. 100% = 1000,
920 * 19.3% = 193.
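 *
 * For example (illustrative numbers only), decaying a count of 1000
 * by 19.3% (decay = 193) gives 1000 - (1000 * 193) / 1000 = 807.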
921 */
922 uint64_t perc_count, decay_per;
923 uint32_t decayed_count;
924 if (decay > 1000) {
925 /* We don't raise it */
926 return (count);
927 }
928 perc_count = count;
929 decay_per = decay;
930 perc_count *= decay_per;
931 perc_count /= 1000;
932 /*
933 * So now perc_count holds the
934 * count decay value.
935 */
936 decayed_count = count - (uint32_t)perc_count;
937 return(decayed_count);
938 }
939
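/*
 * Return 1 when the connection has made no forward progress for too
 * long, i.e. data has been outstanding (t_acktime set) for at least
 * t_maxunacktime ticks without being acked; the caller is expected to
 * drop the connection. Return 0 otherwise, or when no progress limit
 * is configured.
 */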
940 int32_t
941 ctf_progress_timeout_check(struct tcpcb *tp, bool log)
942 {
943 if (tp->t_maxunacktime && tp->t_acktime && TSTMP_GT(ticks, tp->t_acktime)) {
944 if ((ticks - tp->t_acktime) >= tp->t_maxunacktime) {
945 /*
946 * There is an assumption that the caller
947 * will drop the connection so we will
948 * increment the counters here.
949 */
950 if (log)
951 tcp_log_end_status(tp, TCP_EI_STATUS_PROGRESS);
952 #ifdef NETFLIX_STATS
953 KMOD_TCPSTAT_INC(tcps_progdrops);
954 #endif
955 return (1);
956 }
957 }
958 return (0);
959 }
960