1 /* SPDX-License-Identifier: GPL-2.0-or-later */ 2 #ifndef _TCP_ECN_H 3 #define _TCP_ECN_H 4 5 #include <linux/tcp.h> 6 #include <linux/skbuff.h> 7 #include <linux/bitfield.h> 8 9 #include <net/inet_connection_sock.h> 10 #include <net/sock.h> 11 #include <net/tcp.h> 12 #include <net/inet_ecn.h> 13 14 /* The highest ECN variant (Accurate ECN, ECN, or no ECN) that is 15 * attemped to be negotiated and requested for incoming connection 16 * and outgoing connection, respectively. 17 */ 18 enum tcp_ecn_mode { 19 TCP_ECN_IN_NOECN_OUT_NOECN = 0, 20 TCP_ECN_IN_ECN_OUT_ECN = 1, 21 TCP_ECN_IN_ECN_OUT_NOECN = 2, 22 TCP_ECN_IN_ACCECN_OUT_ACCECN = 3, 23 TCP_ECN_IN_ACCECN_OUT_ECN = 4, 24 TCP_ECN_IN_ACCECN_OUT_NOECN = 5, 25 }; 26 27 static inline void tcp_ecn_queue_cwr(struct tcp_sock *tp) 28 { 29 /* Do not set CWR if in AccECN mode! */ 30 if (tcp_ecn_mode_rfc3168(tp)) 31 tp->ecn_flags |= TCP_ECN_QUEUE_CWR; 32 } 33 34 static inline void tcp_ecn_accept_cwr(struct sock *sk, 35 const struct sk_buff *skb) 36 { 37 struct tcp_sock *tp = tcp_sk(sk); 38 39 if (tcp_ecn_mode_rfc3168(tp) && tcp_hdr(skb)->cwr) { 40 tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; 41 42 /* If the sender is telling us it has entered CWR, then its 43 * cwnd may be very low (even just 1 packet), so we should ACK 44 * immediately. 45 */ 46 if (TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) 47 inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; 48 } 49 } 50 51 static inline void tcp_ecn_withdraw_cwr(struct tcp_sock *tp) 52 { 53 tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR; 54 } 55 56 /* tp->accecn_fail_mode */ 57 #define TCP_ACCECN_ACE_FAIL_SEND BIT(0) 58 #define TCP_ACCECN_ACE_FAIL_RECV BIT(1) 59 #define TCP_ACCECN_OPT_FAIL_SEND BIT(2) 60 #define TCP_ACCECN_OPT_FAIL_RECV BIT(3) 61 62 static inline bool tcp_accecn_ace_fail_send(const struct tcp_sock *tp) 63 { 64 return tp->accecn_fail_mode & TCP_ACCECN_ACE_FAIL_SEND; 65 } 66 67 static inline bool tcp_accecn_ace_fail_recv(const struct tcp_sock *tp) 68 { 69 return tp->accecn_fail_mode & TCP_ACCECN_ACE_FAIL_RECV; 70 } 71 72 static inline bool tcp_accecn_opt_fail_send(const struct tcp_sock *tp) 73 { 74 return tp->accecn_fail_mode & TCP_ACCECN_OPT_FAIL_SEND; 75 } 76 77 static inline bool tcp_accecn_opt_fail_recv(const struct tcp_sock *tp) 78 { 79 return tp->accecn_fail_mode & TCP_ACCECN_OPT_FAIL_RECV; 80 } 81 82 static inline void tcp_accecn_fail_mode_set(struct tcp_sock *tp, u8 mode) 83 { 84 tp->accecn_fail_mode |= mode; 85 } 86 87 static inline u8 tcp_accecn_ace(const struct tcphdr *th) 88 { 89 return (th->ae << 2) | (th->cwr << 1) | th->ece; 90 } 91 92 /* Infer the ECT value our SYN arrived with from the echoed ACE field */ 93 static inline int tcp_accecn_extract_syn_ect(u8 ace) 94 { 95 /* Below is an excerpt from the 1st block of Table 2 of AccECN spec */ 96 static const int ace_to_ecn[8] = { 97 INET_ECN_ECT_0, /* 0b000 (Undefined) */ 98 INET_ECN_ECT_1, /* 0b001 (Undefined) */ 99 INET_ECN_NOT_ECT, /* 0b010 (Not-ECT is received) */ 100 INET_ECN_ECT_1, /* 0b011 (ECT-1 is received) */ 101 INET_ECN_ECT_0, /* 0b100 (ECT-0 is received) */ 102 INET_ECN_ECT_1, /* 0b101 (Reserved) */ 103 INET_ECN_CE, /* 0b110 (CE is received) */ 104 INET_ECN_ECT_1 /* 0b111 (Undefined) */ 105 }; 106 107 return ace_to_ecn[ace & 0x7]; 108 } 109 110 /* Check ECN field transition to detect invalid transitions */ 111 static inline bool tcp_ect_transition_valid(u8 snt, u8 rcv) 112 { 113 if (rcv == snt) 114 return true; 115 116 /* Non-ECT altered to something or something became non-ECT */ 117 if (snt == INET_ECN_NOT_ECT || rcv == INET_ECN_NOT_ECT) 118 return false; 119 /* CE -> ECT(0/1)? */ 120 if (snt == INET_ECN_CE) 121 return false; 122 return true; 123 } 124 125 static inline bool tcp_accecn_validate_syn_feedback(struct sock *sk, u8 ace, 126 u8 sent_ect) 127 { 128 u8 ect = tcp_accecn_extract_syn_ect(ace); 129 struct tcp_sock *tp = tcp_sk(sk); 130 131 if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)) 132 return true; 133 134 if (!tcp_ect_transition_valid(sent_ect, ect)) { 135 tcp_accecn_fail_mode_set(tp, TCP_ACCECN_ACE_FAIL_RECV); 136 return false; 137 } 138 139 return true; 140 } 141 142 /* Validate the 3rd ACK based on the ACE field, see Table 4 of AccECN spec */ 143 static inline void tcp_accecn_third_ack(struct sock *sk, 144 const struct sk_buff *skb, u8 sent_ect) 145 { 146 u8 ace = tcp_accecn_ace(tcp_hdr(skb)); 147 struct tcp_sock *tp = tcp_sk(sk); 148 149 switch (ace) { 150 case 0x0: 151 /* Invalid value */ 152 tcp_accecn_fail_mode_set(tp, TCP_ACCECN_ACE_FAIL_RECV); 153 break; 154 case 0x7: 155 case 0x5: 156 case 0x1: 157 /* Unused but legal values */ 158 break; 159 default: 160 /* Validation only applies to first non-data packet */ 161 if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq && 162 !TCP_SKB_CB(skb)->sacked && 163 tcp_accecn_validate_syn_feedback(sk, ace, sent_ect)) { 164 if ((tcp_accecn_extract_syn_ect(ace) == INET_ECN_CE) && 165 !tp->delivered_ce) 166 tp->delivered_ce++; 167 } 168 break; 169 } 170 } 171 172 /* Updates Accurate ECN received counters from the received IP ECN field */ 173 static inline void tcp_ecn_received_counters(struct sock *sk, 174 const struct sk_buff *skb, u32 len) 175 { 176 u8 ecnfield = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK; 177 u8 is_ce = INET_ECN_is_ce(ecnfield); 178 struct tcp_sock *tp = tcp_sk(sk); 179 180 if (!INET_ECN_is_not_ect(ecnfield)) { 181 u32 pcount = is_ce * max_t(u16, 1, skb_shinfo(skb)->gso_segs); 182 183 /* As for accurate ECN, the TCP_ECN_SEEN flag is set by 184 * tcp_ecn_received_counters() when the ECN codepoint of 185 * received TCP data or ACK contains ECT(0), ECT(1), or CE. 186 */ 187 if (!tcp_ecn_mode_rfc3168(tp)) 188 tp->ecn_flags |= TCP_ECN_SEEN; 189 190 /* ACE counter tracks *all* segments including pure ACKs */ 191 tp->received_ce += pcount; 192 tp->received_ce_pending = min(tp->received_ce_pending + pcount, 193 0xfU); 194 195 if (len > 0) 196 tp->received_ecn_bytes[ecnfield - 1] += len; 197 } 198 } 199 200 /* AccECN specification, 2.2: [...] A Data Receiver maintains four counters 201 * initialized at the start of the half-connection. [...] These byte counters 202 * reflect only the TCP payload length, excluding TCP header and TCP options. 203 */ 204 static inline void tcp_ecn_received_counters_payload(struct sock *sk, 205 const struct sk_buff *skb) 206 { 207 const struct tcphdr *th = (const struct tcphdr *)skb->data; 208 209 tcp_ecn_received_counters(sk, skb, skb->len - th->doff * 4); 210 } 211 212 /* AccECN specification, 5.1: [...] a server can determine that it 213 * negotiated AccECN as [...] if the ACK contains an ACE field with 214 * the value 0b010 to 0b111 (decimal 2 to 7). 215 */ 216 static inline bool cookie_accecn_ok(const struct tcphdr *th) 217 { 218 return tcp_accecn_ace(th) > 0x1; 219 } 220 221 /* Used to form the ACE flags for SYN/ACK */ 222 static inline u16 tcp_accecn_reflector_flags(u8 ect) 223 { 224 /* TCP ACE flags of SYN/ACK are set based on IP-ECN received from SYN. 225 * Below is an excerpt from the 1st block of Table 2 of AccECN spec, 226 * in which TCP ACE flags are encoded as: (AE << 2) | (CWR << 1) | ECE 227 */ 228 static const u8 ecn_to_ace_flags[4] = { 229 0b010, /* Not-ECT is received */ 230 0b011, /* ECT(1) is received */ 231 0b100, /* ECT(0) is received */ 232 0b110 /* CE is received */ 233 }; 234 235 return FIELD_PREP(TCPHDR_ACE, ecn_to_ace_flags[ect & 0x3]); 236 } 237 238 /* AccECN specification, 3.1.2: If a TCP server that implements AccECN 239 * receives a SYN with the three TCP header flags (AE, CWR and ECE) set 240 * to any combination other than 000, 011 or 111, it MUST negotiate the 241 * use of AccECN as if they had been set to 111. 242 */ 243 static inline bool tcp_accecn_syn_requested(const struct tcphdr *th) 244 { 245 u8 ace = tcp_accecn_ace(th); 246 247 return ace && ace != 0x3; 248 } 249 250 static inline void __tcp_accecn_init_bytes_counters(int *counter_array) 251 { 252 BUILD_BUG_ON(INET_ECN_ECT_1 != 0x1); 253 BUILD_BUG_ON(INET_ECN_ECT_0 != 0x2); 254 BUILD_BUG_ON(INET_ECN_CE != 0x3); 255 256 counter_array[INET_ECN_ECT_1 - 1] = 0; 257 counter_array[INET_ECN_ECT_0 - 1] = 0; 258 counter_array[INET_ECN_CE - 1] = 0; 259 } 260 261 static inline void tcp_accecn_init_counters(struct tcp_sock *tp) 262 { 263 tp->received_ce = 0; 264 tp->received_ce_pending = 0; 265 __tcp_accecn_init_bytes_counters(tp->received_ecn_bytes); 266 } 267 268 /* Used for make_synack to form the ACE flags */ 269 static inline void tcp_accecn_echo_syn_ect(struct tcphdr *th, u8 ect) 270 { 271 /* TCP ACE flags of SYN/ACK are set based on IP-ECN codepoint received 272 * from SYN. Below is an excerpt from Table 2 of the AccECN spec: 273 * +====================+====================================+ 274 * | IP-ECN codepoint | Respective ACE falgs on SYN/ACK | 275 * | received on SYN | AE CWR ECE | 276 * +====================+====================================+ 277 * | Not-ECT | 0 1 0 | 278 * | ECT(1) | 0 1 1 | 279 * | ECT(0) | 1 0 0 | 280 * | CE | 1 1 0 | 281 * +====================+====================================+ 282 */ 283 th->ae = !!(ect & INET_ECN_ECT_0); 284 th->cwr = ect != INET_ECN_ECT_0; 285 th->ece = ect == INET_ECN_ECT_1; 286 } 287 288 static inline void tcp_accecn_set_ace(struct tcp_sock *tp, struct sk_buff *skb, 289 struct tcphdr *th) 290 { 291 u32 wire_ace; 292 293 /* The final packet of the 3WHS or anything like it must reflect 294 * the SYN/ACK ECT instead of putting CEP into ACE field, such 295 * case show up in tcp_flags. 296 */ 297 if (likely(!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACE))) { 298 wire_ace = tp->received_ce + TCP_ACCECN_CEP_INIT_OFFSET; 299 th->ece = !!(wire_ace & 0x1); 300 th->cwr = !!(wire_ace & 0x2); 301 th->ae = !!(wire_ace & 0x4); 302 tp->received_ce_pending = 0; 303 } 304 } 305 306 /* See Table 2 of the AccECN draft */ 307 static inline void tcp_ecn_rcv_synack(struct sock *sk, const struct tcphdr *th, 308 u8 ip_dsfield) 309 { 310 struct tcp_sock *tp = tcp_sk(sk); 311 u8 ace = tcp_accecn_ace(th); 312 313 switch (ace) { 314 case 0x0: 315 case 0x7: 316 /* +========+========+============+=============+ 317 * | A | B | SYN/ACK | Feedback | 318 * | | | B->A | Mode of A | 319 * | | | AE CWR ECE | | 320 * +========+========+============+=============+ 321 * | AccECN | No ECN | 0 0 0 | Not ECN | 322 * | AccECN | Broken | 1 1 1 | Not ECN | 323 * +========+========+============+=============+ 324 */ 325 tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); 326 break; 327 case 0x1: 328 case 0x5: 329 /* +========+========+============+=============+ 330 * | A | B | SYN/ACK | Feedback | 331 * | | | B->A | Mode of A | 332 * | | | AE CWR ECE | | 333 * +========+========+============+=============+ 334 * | AccECN | Nonce | 1 0 1 | (Reserved) | 335 * | AccECN | ECN | 0 0 1 | Classic ECN | 336 * | Nonce | AccECN | 0 0 1 | Classic ECN | 337 * | ECN | AccECN | 0 0 1 | Classic ECN | 338 * +========+========+============+=============+ 339 */ 340 if (tcp_ecn_mode_pending(tp)) 341 /* Downgrade from AccECN, or requested initially */ 342 tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); 343 break; 344 default: 345 tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); 346 tp->syn_ect_rcv = ip_dsfield & INET_ECN_MASK; 347 if (INET_ECN_is_ce(ip_dsfield) && 348 tcp_accecn_validate_syn_feedback(sk, ace, 349 tp->syn_ect_snt)) { 350 tp->received_ce++; 351 tp->received_ce_pending++; 352 } 353 break; 354 } 355 } 356 357 static inline void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th, 358 const struct sk_buff *skb) 359 { 360 if (tcp_ecn_mode_pending(tp)) { 361 if (!tcp_accecn_syn_requested(th)) { 362 /* Downgrade to classic ECN feedback */ 363 tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); 364 } else { 365 tp->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield & 366 INET_ECN_MASK; 367 tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); 368 } 369 } 370 if (tcp_ecn_mode_rfc3168(tp) && (!th->ece || !th->cwr)) 371 tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); 372 } 373 374 static inline bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, 375 const struct tcphdr *th) 376 { 377 if (th->ece && !th->syn && tcp_ecn_mode_rfc3168(tp)) 378 return true; 379 return false; 380 } 381 382 /* Packet ECN state for a SYN-ACK */ 383 static inline void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb) 384 { 385 struct tcp_sock *tp = tcp_sk(sk); 386 387 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR; 388 if (tcp_ecn_disabled(tp)) 389 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE; 390 else if (tcp_ca_needs_ecn(sk) || 391 tcp_bpf_ca_needs_ecn(sk)) 392 INET_ECN_xmit(sk); 393 394 if (tp->ecn_flags & TCP_ECN_MODE_ACCECN) { 395 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ACE; 396 TCP_SKB_CB(skb)->tcp_flags |= 397 tcp_accecn_reflector_flags(tp->syn_ect_rcv); 398 tp->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK; 399 } 400 } 401 402 /* Packet ECN state for a SYN. */ 403 static inline void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) 404 { 405 struct tcp_sock *tp = tcp_sk(sk); 406 bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk); 407 bool use_ecn, use_accecn; 408 u8 tcp_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn); 409 410 use_accecn = tcp_ecn == TCP_ECN_IN_ACCECN_OUT_ACCECN; 411 use_ecn = tcp_ecn == TCP_ECN_IN_ECN_OUT_ECN || 412 tcp_ecn == TCP_ECN_IN_ACCECN_OUT_ECN || 413 tcp_ca_needs_ecn(sk) || bpf_needs_ecn || use_accecn; 414 415 if (!use_ecn) { 416 const struct dst_entry *dst = __sk_dst_get(sk); 417 418 if (dst && dst_feature(dst, RTAX_FEATURE_ECN)) 419 use_ecn = true; 420 } 421 422 tp->ecn_flags = 0; 423 424 if (use_ecn) { 425 if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn) 426 INET_ECN_xmit(sk); 427 428 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR; 429 if (use_accecn) { 430 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_AE; 431 tcp_ecn_mode_set(tp, TCP_ECN_MODE_PENDING); 432 tp->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK; 433 } else { 434 tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); 435 } 436 } 437 } 438 439 static inline void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb) 440 { 441 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)) { 442 /* tp->ecn_flags are cleared at a later point in time when 443 * SYN ACK is ultimatively being received. 444 */ 445 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ACE; 446 } 447 } 448 449 static inline void 450 tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th) 451 { 452 if (tcp_rsk(req)->accecn_ok) 453 tcp_accecn_echo_syn_ect(th, tcp_rsk(req)->syn_ect_rcv); 454 else if (inet_rsk(req)->ecn_ok) 455 th->ece = 1; 456 } 457 458 #endif /* _LINUX_TCP_ECN_H */ 459