1 /*- 2 * Copyright (c) 2024- Netflix, Inc. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 1. Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * 2. Redistributions in binary form must reproduce the above copyright 10 * notice, this list of conditions and the following disclaimer in the 11 * documentation and/or other materials provided with the distribution. 12 * 13 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 16 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 23 * SUCH DAMAGE. 
24 * 25 */ 26 27 #include <sys/cdefs.h> 28 #include "opt_inet.h" 29 #include "opt_inet6.h" 30 #include "opt_ipsec.h" 31 #include "opt_ratelimit.h" 32 #include "opt_kern_tls.h" 33 #if defined(INET) || defined(INET6) 34 #include <sys/param.h> 35 #include <sys/arb.h> 36 #include <sys/module.h> 37 #include <sys/kernel.h> 38 #ifdef TCP_HHOOK 39 #include <sys/hhook.h> 40 #endif 41 #include <sys/lock.h> 42 #include <sys/malloc.h> 43 #include <sys/lock.h> 44 #include <sys/mutex.h> 45 #include <sys/mbuf.h> 46 #include <sys/proc.h> /* for proc0 declaration */ 47 #include <sys/socket.h> 48 #include <sys/socketvar.h> 49 #include <sys/sysctl.h> 50 #include <sys/systm.h> 51 #ifdef STATS 52 #include <sys/qmath.h> 53 #include <sys/tree.h> 54 #include <sys/stats.h> /* Must come after qmath.h and tree.h */ 55 #else 56 #include <sys/tree.h> 57 #endif 58 #include <sys/refcount.h> 59 #include <sys/queue.h> 60 #include <sys/tim_filter.h> 61 #include <sys/smp.h> 62 #include <sys/kthread.h> 63 #include <sys/kern_prefetch.h> 64 #include <sys/protosw.h> 65 #ifdef TCP_ACCOUNTING 66 #include <sys/sched.h> 67 #include <machine/cpu.h> 68 #endif 69 #include <vm/uma.h> 70 71 #include <net/route.h> 72 #include <net/route/nhop.h> 73 #include <net/vnet.h> 74 75 #define TCPSTATES /* for logging */ 76 77 #include <netinet/in.h> 78 #include <netinet/in_kdtrace.h> 79 #include <netinet/in_pcb.h> 80 #include <netinet/ip.h> 81 #include <netinet/ip_icmp.h> /* required for icmp_var.h */ 82 #include <netinet/icmp_var.h> /* for ICMP_BANDLIM */ 83 #include <netinet/ip_var.h> 84 #include <netinet/ip6.h> 85 #include <netinet6/in6_pcb.h> 86 #include <netinet6/ip6_var.h> 87 #include <netinet/tcp.h> 88 #include <netinet/tcp_fsm.h> 89 #include <netinet/tcp_seq.h> 90 #include <netinet/tcp_timer.h> 91 #include <netinet/tcp_var.h> 92 #include <netinet/tcp_log_buf.h> 93 #include <netinet/tcp_syncache.h> 94 #include <netinet/tcp_hpts.h> 95 #include <netinet/tcp_ratelimit.h> 96 #include <netinet/tcp_accounting.h> 97 
#include <netinet/tcpip.h>
#include <netinet/cc/cc.h>
#include <netinet/cc/cc_newreno.h>
#include <netinet/tcp_fastopen.h>
#include <netinet/tcp_lro.h>
#ifdef NETFLIX_SHARED_CWND
#include <netinet/tcp_shared_cwnd.h>
#endif
#ifdef TCP_OFFLOAD
#include <netinet/tcp_offload.h>
#endif
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif
#include <netinet/tcp_ecn.h>

#include <netipsec/ipsec_support.h>

#if defined(IPSEC) || defined(IPSEC_SUPPORT)
#include <netipsec/ipsec.h>
#include <netipsec/ipsec6.h>
#endif /* IPSEC */

#include <netinet/udp.h>
#include <netinet/udp_var.h>
#include <machine/in_cksum.h>

#ifdef MAC
#include <security/mac/mac_framework.h>
#endif
#include "sack_filter.h"
#include "tcp_rack.h"
#include "tailq_hash.h"
#include "rack_bbr_common.h"

MALLOC_DECLARE(M_TCPPCM);

/*
 * Account for one acknowledged sequence range against the PCM burst that
 * is currently in progress.
 *
 * Each ack that overlaps the burst [pcm_i.sseq, pcm_i.eseq) is appended
 * to the rack->r_ctl.pcm_s sample array together with its arrival time.
 * Once an ack reaches to within one segment of the end of the burst the
 * measurement is marked complete, the collected samples are written to
 * the black box log (when logging is enabled) and the sample count is
 * reset to zero.
 *
 * rack        - the RACK connection state holding the PCM bookkeeping.
 * was_cumack  - currently not examined by this function.
 * start, end  - the sequence range covered by this ack.
 *
 * If the sample array cannot be allocated or grown, the measurement is
 * abandoned (pcm_in_progress is cleared) and the function returns
 * without logging.
 */
void
rack_update_pcm_ack(struct tcp_rack *rack, int was_cumack, uint32_t start, uint32_t end)
{
	struct rack_pcm_stats *e;
	int i, completed = 0;
	uint64_t ack_arrival;
	int segsiz;

	if (rack->pcm_in_progress == 0)
		return;

	if (SEQ_LEQ(end, rack->r_ctl.pcm_i.sseq)) {
		/*
		 * It is not in our range of data sent, it
		 * is before our first seq.
		 */
		return;
	}
	/* We take away 1 mss from the end to avoid delayed ack */
	segsiz = ctf_fixed_maxseg(rack->rc_tp);
	if (SEQ_GEQ(end, (rack->r_ctl.pcm_i.eseq - segsiz))) {
		/*
		 * We have reached beyond the end of the
		 * initial send. Even though things may
		 * still be lost and this could be something
		 * from a different send than our burst.
		 */
		completed = 1;
		rack->pcm_in_progress = 0;
		rack->r_ctl.last_pcm_round = rack->r_ctl.current_round;
		rack->r_ctl.pcm_idle_rounds = 0;
	}
	if (SEQ_GEQ(start, rack->r_ctl.pcm_i.eseq)) {
		/*
		 * This is outside the scope
		 * of the measurement itself and
		 * is likely a sack above our burst.
		 */
		goto skip_ack_accounting;
	}
	/*
	 * Record ACK data.
	 */
	ack_arrival = tcp_tv_to_lusectick(&rack->r_ctl.act_rcv_time);
	if (SEQ_GT(end, rack->r_ctl.pcm_i.eseq)) {
		/* Trim the end to the end of our range if it is beyond */
		end = rack->r_ctl.pcm_i.eseq;
	}
	if ((rack->r_ctl.pcm_i.cnt + 1) > rack->r_ctl.pcm_i.cnt_alloc) {
		/* Need to expand, first is there any present? */
		size_t sz;

		if (rack->r_ctl.pcm_i.cnt_alloc == 0) {
			/*
			 * Failed at rack_init I suppose.
			 */
			rack->r_ctl.pcm_i.cnt_alloc = RACK_DEFAULT_PCM_ARRAY;
			sz = (sizeof(struct rack_pcm_stats) * rack->r_ctl.pcm_i.cnt_alloc);
			rack->r_ctl.pcm_s = malloc(sz, M_TCPPCM, M_NOWAIT);
			if (rack->r_ctl.pcm_s == NULL) {
				rack->r_ctl.pcm_i.cnt_alloc = 0;
				rack->pcm_in_progress = 0;
				return;
			}
		} else {
			/* Need to expand the array */
			struct rack_pcm_stats *n;
			uint16_t new_cnt;

			new_cnt = rack->r_ctl.pcm_i.cnt_alloc * 2;
			if (new_cnt <= rack->r_ctl.pcm_i.cnt_alloc) {
				/*
				 * Doubling wrapped the 16 bit count. A
				 * new array sized from new_cnt would be
				 * smaller than the old one and the copy
				 * below would overrun it, so give up on
				 * this measurement just as we do when we
				 * are out of memory.
				 */
				rack->pcm_in_progress = 0;
				rack->r_ctl.pcm_i.cnt = 0;
				return;
			}
			sz = (sizeof(struct rack_pcm_stats) * new_cnt);
			n = malloc(sz, M_TCPPCM, M_NOWAIT);
			if (n == NULL) {
				/* We are dead, no memory */
				rack->pcm_in_progress = 0;
				rack->r_ctl.pcm_i.cnt = 0;
				return;
			}
			/* Carry the existing samples over into the larger array */
			sz = (sizeof(struct rack_pcm_stats) * rack->r_ctl.pcm_i.cnt_alloc);
			memcpy(n, rack->r_ctl.pcm_s, sz);
			free(rack->r_ctl.pcm_s, M_TCPPCM);
			rack->r_ctl.pcm_s = n;
			rack->r_ctl.pcm_i.cnt_alloc = new_cnt;
		}
	}
	e = &rack->r_ctl.pcm_s[rack->r_ctl.pcm_i.cnt];
	rack->r_ctl.pcm_i.cnt++;
	e->sseq = start;
	e->eseq = end;
	e->ack_time = ack_arrival;
skip_ack_accounting:
	if (completed == 0)
		return;
	/*
	 * Ok we are to the point where we can assess what
	 * has happened and make a PCM judgement.
	 */

	if (tcp_bblogging_on(rack->rc_tp)) {
		union tcp_log_stackspecific log;
		struct timeval tv;
		uint64_t prev_time = 0;
		uint64_t tot_byt = 0;
		uint32_t tot_lt_12us = 0;
		uint32_t tot_gt_2mss = 0;

		(void)tcp_get_usecs(&tv);
		/* One log record (flex8 == 1) per recorded ack sample */
		for (i = 0; i < rack->r_ctl.pcm_i.cnt; i++) {

			e = &rack->r_ctl.pcm_s[i];
			memset(&log.u_bbr, 0, sizeof(log.u_bbr));
			log.u_bbr.timeStamp = tcp_tv_to_usectick(&tv);
			log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
			log.u_bbr.flex8 = 1;
			log.u_bbr.flex1 = e->sseq;
			log.u_bbr.flex2 = e->eseq;
			tot_byt += (e->eseq - e->sseq);
			/* flex3: time since the previous sample's ack, 0 for the first */
			if ((i > 0) &&
			    (e->ack_time > prev_time)) {
				log.u_bbr.flex3 = (uint32_t)(e->ack_time - prev_time);
			} else {
				log.u_bbr.flex3 = 0;
			}
			/* flex4: time from the burst send to this ack */
			if (e->ack_time > rack->r_ctl.pcm_i.send_time) {
				log.u_bbr.flex4 = (uint32_t)(e->ack_time - rack->r_ctl.pcm_i.send_time);
			} else {
				log.u_bbr.flex4 = 0;
			}
			if ((e->eseq - e->sseq) > (segsiz * 2)) {
				tot_gt_2mss++;
			}
			if ((i > 0) &&
			    (log.u_bbr.flex3 < 12)) {
				tot_lt_12us++;
			}
			prev_time = e->ack_time;
			log.u_bbr.cur_del_rate = rack->r_ctl.pcm_i.send_time;
			if ((i > 0) &&
			    (log.u_bbr.flex3 > 0)) {
				/*
				 * Calculate a b/w between this chunk and the previous.
				 */
				log.u_bbr.delRate = (e->eseq - e->sseq);
				log.u_bbr.delRate *= HPTS_USEC_IN_SEC;
				log.u_bbr.delRate /= (uint64_t)log.u_bbr.flex3;
			}
			log.u_bbr.rttProp = e->ack_time;
			(void)tcp_log_event(rack->rc_tp, NULL, NULL, NULL, TCP_PCM_MEASURE, ERRNO_UNK,
					    0, &log, false, NULL, NULL, 0, &tv);
		}
		if (prev_time > rack->r_ctl.pcm_i.send_time) {
			/*
			 * Prev time holds the last ack arrival time.
			 * Emit one summary record (flex8 == 2) for the
			 * whole burst; the check above also keeps the
			 * divisor below non-zero.
			 */
			memset(&log.u_bbr, 0, sizeof(log.u_bbr));
			log.u_bbr.timeStamp = tcp_tv_to_usectick(&tv);
			log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
			log.u_bbr.flex8 = 2;
			log.u_bbr.flex1 = rack->r_ctl.pcm_i.sseq;
			log.u_bbr.flex2 = rack->r_ctl.pcm_i.eseq;
			log.u_bbr.flex3 = tot_byt;
			log.u_bbr.flex4 = tot_lt_12us; /* How many deltas indicate > 2Gbps */
			log.u_bbr.flex5 = tot_gt_2mss; /* How many acks represent more than 2MSS */
			log.u_bbr.flex7 = rack->r_ctl.pcm_i.cnt;
			log.u_bbr.cwnd_gain = rack->r_ctl.pcm_i.cnt_alloc;
			log.u_bbr.cur_del_rate = rack->r_ctl.pcm_i.send_time;
			log.u_bbr.rttProp = prev_time;
			log.u_bbr.delRate = tot_byt;
			log.u_bbr.delRate *= HPTS_USEC_IN_SEC;
			log.u_bbr.delRate /= (prev_time - rack->r_ctl.pcm_i.send_time);
			(void)tcp_log_event(rack->rc_tp, NULL, NULL, NULL, TCP_PCM_MEASURE, ERRNO_UNK,
					    0, &log, false, NULL, NULL, 0, &tv);
		}
	}
	/*
	 * Here we need a lot to be added including:
	 * 1) Some form of measurement, where if we think the measurement
	 *    is valid we iterate over the PCM data and come up with a path
	 *    capacity estimate.
	 * 2) We may decide that the PCM is invalid due to ack meddlers and
	 *    thus need to increase the PCM size (which defaults to 10mss).
	 * 3) We may need to think about shrinking the PCM size if we are
	 *    seeing some sort of persistent loss from making the measurement
	 *    (i.e. it got too big and our bursts are causing loss).
	 * 4) If we make a measurement we need to place it somewhere in the
	 *    stack to be reported later somehow. Is it a WMA in the stack or
	 *    the highest or?
	 * 5) Is there a limit on how big we can go PCM size wise, the code
	 *    here will send multiple TSO bursts all at once, but how big
	 *    is too big, and does that then put some bound (I think it does)
	 *    on the largest capacity we can determine?
	 */
	/* New code here */
	/* Clear the cnt we are done */
	rack->r_ctl.pcm_i.cnt = 0;
}

#endif /* defined(INET) || defined(INET6) */