1 /*-
2 * Copyright (c) 2024- Netflix, Inc.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23 * SUCH DAMAGE.
24 *
25 */
26
27 #include <sys/cdefs.h>
28 #include "opt_inet.h"
29 #include "opt_inet6.h"
30 #include "opt_ipsec.h"
31 #include "opt_ratelimit.h"
32 #include "opt_kern_tls.h"
33 #if defined(INET) || defined(INET6)
34 #include <sys/param.h>
35 #include <sys/arb.h>
36 #include <sys/module.h>
37 #include <sys/kernel.h>
38 #ifdef TCP_HHOOK
39 #include <sys/hhook.h>
40 #endif
41 #include <sys/lock.h>
42 #include <sys/malloc.h>
43 #include <sys/lock.h>
44 #include <sys/mutex.h>
45 #include <sys/mbuf.h>
46 #include <sys/proc.h> /* for proc0 declaration */
47 #include <sys/socket.h>
48 #include <sys/socketvar.h>
49 #include <sys/sysctl.h>
50 #include <sys/systm.h>
51 #ifdef STATS
52 #include <sys/qmath.h>
53 #include <sys/tree.h>
54 #include <sys/stats.h> /* Must come after qmath.h and tree.h */
55 #else
56 #include <sys/tree.h>
57 #endif
58 #include <sys/refcount.h>
59 #include <sys/queue.h>
60 #include <sys/tim_filter.h>
61 #include <sys/smp.h>
62 #include <sys/kthread.h>
63 #include <sys/kern_prefetch.h>
64 #include <sys/protosw.h>
65 #ifdef TCP_ACCOUNTING
66 #include <sys/sched.h>
67 #include <machine/cpu.h>
68 #endif
69 #include <vm/uma.h>
70
71 #include <net/route.h>
72 #include <net/route/nhop.h>
73 #include <net/vnet.h>
74
75 #define TCPSTATES /* for logging */
76
77 #include <netinet/in.h>
78 #include <netinet/in_kdtrace.h>
79 #include <netinet/in_pcb.h>
80 #include <netinet/ip.h>
81 #include <netinet/ip_icmp.h> /* required for icmp_var.h */
82 #include <netinet/icmp_var.h> /* for ICMP_BANDLIM */
83 #include <netinet/ip_var.h>
84 #include <netinet/ip6.h>
85 #include <netinet6/in6_pcb.h>
86 #include <netinet6/ip6_var.h>
87 #include <netinet/tcp.h>
88 #include <netinet/tcp_fsm.h>
89 #include <netinet/tcp_seq.h>
90 #include <netinet/tcp_timer.h>
91 #include <netinet/tcp_var.h>
92 #include <netinet/tcp_log_buf.h>
93 #include <netinet/tcp_syncache.h>
94 #include <netinet/tcp_hpts.h>
95 #include <netinet/tcp_ratelimit.h>
96 #include <netinet/tcp_accounting.h>
97 #include <netinet/tcpip.h>
98 #include <netinet/cc/cc.h>
99 #include <netinet/cc/cc_newreno.h>
100 #include <netinet/tcp_fastopen.h>
101 #include <netinet/tcp_lro.h>
102 #ifdef NETFLIX_SHARED_CWND
103 #include <netinet/tcp_shared_cwnd.h>
104 #endif
105 #ifdef TCP_OFFLOAD
106 #include <netinet/tcp_offload.h>
107 #endif
108 #ifdef INET6
109 #include <netinet6/tcp6_var.h>
110 #endif
111 #include <netinet/tcp_ecn.h>
112
113 #include <netipsec/ipsec_support.h>
114
115 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
116 #include <netipsec/ipsec.h>
117 #include <netipsec/ipsec6.h>
118 #endif /* IPSEC */
119
120 #include <netinet/udp.h>
121 #include <netinet/udp_var.h>
122 #include <machine/in_cksum.h>
123
124 #ifdef MAC
125 #include <security/mac/mac_framework.h>
126 #endif
127 #include "sack_filter.h"
128 #include "tcp_rack.h"
129 #include "tailq_hash.h"
130 #include "rack_bbr_common.h"
131
/*
 * Malloc type used below for the PCM ack-sample array
 * (rack->r_ctl.pcm_s).  Only declared here; the matching definition
 * is expected to live elsewhere in the stack -- TODO confirm where.
 */
MALLOC_DECLARE(M_TCPPCM);
133
134 void
rack_update_pcm_ack(struct tcp_rack * rack,int was_cumack,uint32_t start,uint32_t end)135 rack_update_pcm_ack(struct tcp_rack *rack, int was_cumack, uint32_t start, uint32_t end)
136 {
137 struct rack_pcm_stats *e;
138 int i, completed = 0;
139 uint64_t ack_arrival;
140 int segsiz;
141
142 if (rack->pcm_in_progress == 0)
143 return;
144
145 if (SEQ_LEQ(end, rack->r_ctl.pcm_i.sseq)) {
146 /*
147 * Its not in our range of data sent, it
148 * is before our first seq.
149 */
150 return;
151 }
152 /* We take away 1 mss from the end to avoid delayed ack */
153 segsiz = ctf_fixed_maxseg(rack->rc_tp);
154 if (SEQ_GEQ(end, (rack->r_ctl.pcm_i.eseq - segsiz))) {
155 /*
156 * We have reached beyond the end of the
157 * initial send. Even though things may
158 * still be lost and this could be something
159 * from a different send than our burst.
160 */
161 completed = 1;
162 rack->pcm_in_progress = 0;
163 rack->r_ctl.last_pcm_round = rack->r_ctl.current_round;
164 rack->r_ctl.pcm_idle_rounds = 0;
165 }
166 if (SEQ_GEQ(start, rack->r_ctl.pcm_i.eseq)) {
167 /*
168 * This is outside the scope
169 * of the measurement itself and
170 * is likely a sack above our burst.
171 */
172 goto skip_ack_accounting;
173 }
174 /*
175 * Record ACK data.
176 */
177 ack_arrival = tcp_tv_to_lusectick(&rack->r_ctl.act_rcv_time);
178 if (SEQ_GT(end, rack->r_ctl.pcm_i.eseq)) {
179 /* Trim the end to the end of our range if it is beyond */
180 end = rack->r_ctl.pcm_i.eseq;
181 }
182 if ((rack->r_ctl.pcm_i.cnt + 1) > rack->r_ctl.pcm_i.cnt_alloc) {
183 /* Need to expand, first is there any present? */
184 size_t sz;
185
186 if (rack->r_ctl.pcm_i.cnt_alloc == 0) {
187 /*
188 * Failed at rack_init I suppose.
189 */
190 rack->r_ctl.pcm_i.cnt_alloc = RACK_DEFAULT_PCM_ARRAY;
191 sz = (sizeof(struct rack_pcm_stats) * rack->r_ctl.pcm_i.cnt_alloc);
192 rack->r_ctl.pcm_s = malloc(sz, M_TCPPCM, M_NOWAIT);
193 if (rack->r_ctl.pcm_s == NULL) {
194 rack->r_ctl.pcm_i.cnt_alloc = 0;
195 rack->pcm_in_progress = 0;
196 return;
197 }
198 } else {
199 /* Need to expand the array */
200 struct rack_pcm_stats *n;
201 uint16_t new_cnt;
202
203 new_cnt = rack->r_ctl.pcm_i.cnt_alloc * 2;
204 sz = (sizeof(struct rack_pcm_stats) * new_cnt);
205 n = malloc(sz,M_TCPPCM, M_NOWAIT);
206 if (n == NULL) {
207 /* We are dead, no memory */
208 rack->pcm_in_progress = 0;
209 rack->r_ctl.pcm_i.cnt = 0;
210 return;
211 }
212 sz = (sizeof(struct rack_pcm_stats) * rack->r_ctl.pcm_i.cnt_alloc);
213 memcpy(n, rack->r_ctl.pcm_s, sz);
214 free(rack->r_ctl.pcm_s, M_TCPPCM);
215 rack->r_ctl.pcm_s = n;
216 rack->r_ctl.pcm_i.cnt_alloc = new_cnt;
217 }
218 }
219 e = &rack->r_ctl.pcm_s[rack->r_ctl.pcm_i.cnt];
220 rack->r_ctl.pcm_i.cnt++;
221 e->sseq = start;
222 e->eseq = end;
223 e->ack_time = ack_arrival;
224 skip_ack_accounting:
225 if (completed == 0)
226 return;
227 /*
228 * Ok we are to the point where we can assess what
229 * has happened and make a PCM judgement.
230 */
231
232 if (tcp_bblogging_on(rack->rc_tp)) {
233 union tcp_log_stackspecific log;
234 struct timeval tv;
235 uint64_t prev_time = 0;
236 uint64_t tot_byt = 0;
237 uint32_t tot_lt_12us = 0;
238 uint32_t tot_gt_2mss = 0;
239
240 (void)tcp_get_usecs(&tv);
241 for (i=0; i<rack->r_ctl.pcm_i.cnt; i++) {
242
243 e = &rack->r_ctl.pcm_s[i];
244 memset(&log, 0, sizeof(log));
245 log.u_bbr.timeStamp = tcp_tv_to_usectick(&tv);
246 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
247 log.u_bbr.flex8 = 1;
248 log.u_bbr.flex1 = e->sseq;
249 log.u_bbr.flex2 = e->eseq;
250 tot_byt += (e->eseq - e->sseq);
251 if ((i > 0) &&
252 (e->ack_time > prev_time)) {
253 log.u_bbr.flex3 = (uint32_t)(e->ack_time - prev_time);
254 } else {
255 log.u_bbr.flex3 = 0;
256 }
257 if (e->ack_time > rack->r_ctl.pcm_i.send_time) {
258 log.u_bbr.flex4 = (uint32_t)(e->ack_time - rack->r_ctl.pcm_i.send_time);
259 } else {
260 log.u_bbr.flex4 = 0;
261 }
262 if ((e->eseq - e->sseq) > (segsiz * 2)) {
263 tot_gt_2mss++;
264 }
265 if ((i > 0) &&
266 (log.u_bbr.flex3 < 12)) {
267 tot_lt_12us++;
268 }
269 prev_time = e->ack_time;
270 log.u_bbr.cur_del_rate = rack->r_ctl.pcm_i.send_time;
271 if ((i > 0) &&
272 (log.u_bbr.flex3 > 0)) {
273 /*
274 * Calculate a b/w between this chunk and the previous.
275 */
276 log.u_bbr.delRate = (e->eseq - e->sseq);
277 log.u_bbr.delRate *= HPTS_USEC_IN_SEC;
278 log.u_bbr.delRate /= (uint64_t)log.u_bbr.flex3;
279 }
280 log.u_bbr.rttProp = e->ack_time;
281 (void)tcp_log_event(rack->rc_tp, NULL, NULL, NULL, TCP_PCM_MEASURE, ERRNO_UNK,
282 0, &log, false, NULL, NULL, 0, &tv);
283 }
284 if (prev_time > rack->r_ctl.pcm_i.send_time) {
285 /*
286 * Prev time holds the last ack arrival time.
287 */
288 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
289 log.u_bbr.timeStamp = tcp_tv_to_usectick(&tv);
290 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
291 log.u_bbr.flex8 = 2;
292 log.u_bbr.flex1 = rack->r_ctl.pcm_i.sseq;
293 log.u_bbr.flex2 = rack->r_ctl.pcm_i.eseq;
294 log.u_bbr.flex3 = tot_byt;
295 log.u_bbr.flex4 = tot_lt_12us; /* How many deltas indicate > 2Gbps */
296 log.u_bbr.flex5 = tot_gt_2mss; /* How many acks represent more than 2MSS */
297 log.u_bbr.flex7 = rack->r_ctl.pcm_i.cnt;
298 log.u_bbr.cwnd_gain = rack->r_ctl.pcm_i.cnt_alloc;
299 log.u_bbr.cur_del_rate = rack->r_ctl.pcm_i.send_time;
300 log.u_bbr.rttProp = prev_time;
301 log.u_bbr.delRate = tot_byt;
302 log.u_bbr.delRate *= HPTS_USEC_IN_SEC;
303 log.u_bbr.delRate /= (prev_time - rack->r_ctl.pcm_i.send_time);
304 (void)tcp_log_event(rack->rc_tp, NULL, NULL, NULL, TCP_PCM_MEASURE, ERRNO_UNK,
305 0, &log, false, NULL, NULL, 0, &tv);
306 }
307 }
308 /*
309 * Here we need a lot to be added including:
310 * 1) Some form of measurement, where if we think the measurement
311 * is valid we iterate over the PCM data and come up with a path
312 * capacity estimate.
313 * 2) We may decide that the PCM is invalid due to ack meddlers and
314 * thus need to increase the PCM size (which defaults to 10mss).
315 * 3) We may need to think about shrinking the PCM size if we are
316 * seeing some sort of presistent loss from making the measurement
317 * (i.e. it got to big and our bursts are causing loss).
318 * 4) If we make a measurement we need to place it somewhere in the
319 * stack to be reported later somehow. Is it a WMA in the stack or
320 * the highest or?
321 * 5) Is there a limit on how big we can go PCM size wise, the code
322 * here will send multiple TSO bursts all at once, but how big
323 * is too big, and does that then put some bound (I think it does)
324 * on the largest capacity we can determine?
325 */
326 /* New code here */
327 /* Clear the cnt we are done */
328 rack->r_ctl.pcm_i.cnt = 0;
329 }
330
#endif /* defined(INET) || defined(INET6) */
332