xref: /linux/include/net/tcp_ecn.h (revision 9a011277445583bab002fbf5043fab0ea03dc5dd)
1 /* SPDX-License-Identifier: GPL-2.0-or-later */
2 #ifndef _TCP_ECN_H
3 #define _TCP_ECN_H
4 
5 #include <linux/tcp.h>
6 #include <linux/skbuff.h>
7 #include <linux/bitfield.h>
8 
9 #include <net/inet_connection_sock.h>
10 #include <net/sock.h>
11 #include <net/tcp.h>
12 #include <net/inet_ecn.h>
13 
/* The highest ECN variant (Accurate ECN, ECN, or no ECN) that is
 * attempted to be negotiated and requested for incoming connections
 * and outgoing connections, respectively.
 *
 * Enumerators are named TCP_ECN_IN_<variant>_OUT_<variant>: the first
 * variant applies to incoming connections, the second to outgoing ones.
 * tcp_ecn_send_syn() compares the tcp_ecn sysctl value against these.
 */
enum tcp_ecn_mode {
	TCP_ECN_IN_NOECN_OUT_NOECN = 0,
	TCP_ECN_IN_ECN_OUT_ECN = 1,
	TCP_ECN_IN_ECN_OUT_NOECN = 2,
	TCP_ECN_IN_ACCECN_OUT_ACCECN = 3,
	TCP_ECN_IN_ACCECN_OUT_ECN = 4,
	TCP_ECN_IN_ACCECN_OUT_NOECN = 5,
};
26 
27 static inline void tcp_ecn_queue_cwr(struct tcp_sock *tp)
28 {
29 	/* Do not set CWR if in AccECN mode! */
30 	if (tcp_ecn_mode_rfc3168(tp))
31 		tp->ecn_flags |= TCP_ECN_QUEUE_CWR;
32 }
33 
34 static inline void tcp_ecn_accept_cwr(struct sock *sk,
35 				      const struct sk_buff *skb)
36 {
37 	struct tcp_sock *tp = tcp_sk(sk);
38 
39 	if (tcp_ecn_mode_rfc3168(tp) && tcp_hdr(skb)->cwr) {
40 		tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
41 
42 		/* If the sender is telling us it has entered CWR, then its
43 		 * cwnd may be very low (even just 1 packet), so we should ACK
44 		 * immediately.
45 		 */
46 		if (TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq)
47 			inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
48 	}
49 }
50 
51 static inline void tcp_ecn_withdraw_cwr(struct tcp_sock *tp)
52 {
53 	tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
54 }
55 
/* Bit flags kept in tp->accecn_fail_mode. They are OR-ed in through
 * tcp_accecn_fail_mode_set() and tested by the tcp_accecn_*_fail_*()
 * helpers below.
 * NOTE(review): ACE_* presumably refers to the ACE header field and
 * OPT_* to the AccECN TCP option, each for the send/receive direction
 * -- confirm against the users of these flags.
 */
#define TCP_ACCECN_ACE_FAIL_SEND	BIT(0)
#define TCP_ACCECN_ACE_FAIL_RECV	BIT(1)
#define TCP_ACCECN_OPT_FAIL_SEND	BIT(2)
#define TCP_ACCECN_OPT_FAIL_RECV	BIT(3)
61 
62 static inline bool tcp_accecn_ace_fail_send(const struct tcp_sock *tp)
63 {
64 	return tp->accecn_fail_mode & TCP_ACCECN_ACE_FAIL_SEND;
65 }
66 
67 static inline bool tcp_accecn_ace_fail_recv(const struct tcp_sock *tp)
68 {
69 	return tp->accecn_fail_mode & TCP_ACCECN_ACE_FAIL_RECV;
70 }
71 
72 static inline bool tcp_accecn_opt_fail_send(const struct tcp_sock *tp)
73 {
74 	return tp->accecn_fail_mode & TCP_ACCECN_OPT_FAIL_SEND;
75 }
76 
77 static inline bool tcp_accecn_opt_fail_recv(const struct tcp_sock *tp)
78 {
79 	return tp->accecn_fail_mode & TCP_ACCECN_OPT_FAIL_RECV;
80 }
81 
82 static inline void tcp_accecn_fail_mode_set(struct tcp_sock *tp, u8 mode)
83 {
84 	tp->accecn_fail_mode |= mode;
85 }
86 
87 static inline u8 tcp_accecn_ace(const struct tcphdr *th)
88 {
89 	return (th->ae << 2) | (th->cwr << 1) | th->ece;
90 }
91 
92 /* Infer the ECT value our SYN arrived with from the echoed ACE field */
93 static inline int tcp_accecn_extract_syn_ect(u8 ace)
94 {
95 	/* Below is an excerpt from the 1st block of Table 2 of AccECN spec */
96 	static const int ace_to_ecn[8] = {
97 		INET_ECN_ECT_0,		/* 0b000 (Undefined) */
98 		INET_ECN_ECT_1,		/* 0b001 (Undefined) */
99 		INET_ECN_NOT_ECT,	/* 0b010 (Not-ECT is received) */
100 		INET_ECN_ECT_1,		/* 0b011 (ECT-1 is received) */
101 		INET_ECN_ECT_0,		/* 0b100 (ECT-0 is received) */
102 		INET_ECN_ECT_1,		/* 0b101 (Reserved) */
103 		INET_ECN_CE,		/* 0b110 (CE is received) */
104 		INET_ECN_ECT_1		/* 0b111 (Undefined) */
105 	};
106 
107 	return ace_to_ecn[ace & 0x7];
108 }
109 
110 /* Check ECN field transition to detect invalid transitions */
111 static inline bool tcp_ect_transition_valid(u8 snt, u8 rcv)
112 {
113 	if (rcv == snt)
114 		return true;
115 
116 	/* Non-ECT altered to something or something became non-ECT */
117 	if (snt == INET_ECN_NOT_ECT || rcv == INET_ECN_NOT_ECT)
118 		return false;
119 	/* CE -> ECT(0/1)? */
120 	if (snt == INET_ECN_CE)
121 		return false;
122 	return true;
123 }
124 
125 static inline bool tcp_accecn_validate_syn_feedback(struct sock *sk, u8 ace,
126 						    u8 sent_ect)
127 {
128 	u8 ect = tcp_accecn_extract_syn_ect(ace);
129 	struct tcp_sock *tp = tcp_sk(sk);
130 
131 	if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback))
132 		return true;
133 
134 	if (!tcp_ect_transition_valid(sent_ect, ect)) {
135 		tcp_accecn_fail_mode_set(tp, TCP_ACCECN_ACE_FAIL_RECV);
136 		return false;
137 	}
138 
139 	return true;
140 }
141 
142 /* Validate the 3rd ACK based on the ACE field, see Table 4 of AccECN spec */
143 static inline void tcp_accecn_third_ack(struct sock *sk,
144 					const struct sk_buff *skb, u8 sent_ect)
145 {
146 	u8 ace = tcp_accecn_ace(tcp_hdr(skb));
147 	struct tcp_sock *tp = tcp_sk(sk);
148 
149 	switch (ace) {
150 	case 0x0:
151 		/* Invalid value */
152 		tcp_accecn_fail_mode_set(tp, TCP_ACCECN_ACE_FAIL_RECV);
153 		break;
154 	case 0x7:
155 	case 0x5:
156 	case 0x1:
157 		/* Unused but legal values */
158 		break;
159 	default:
160 		/* Validation only applies to first non-data packet */
161 		if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq &&
162 		    !TCP_SKB_CB(skb)->sacked &&
163 		    tcp_accecn_validate_syn_feedback(sk, ace, sent_ect)) {
164 			if ((tcp_accecn_extract_syn_ect(ace) == INET_ECN_CE) &&
165 			    !tp->delivered_ce)
166 				tp->delivered_ce++;
167 		}
168 		break;
169 	}
170 }
171 
172 /* Updates Accurate ECN received counters from the received IP ECN field */
173 static inline void tcp_ecn_received_counters(struct sock *sk,
174 					     const struct sk_buff *skb, u32 len)
175 {
176 	u8 ecnfield = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK;
177 	u8 is_ce = INET_ECN_is_ce(ecnfield);
178 	struct tcp_sock *tp = tcp_sk(sk);
179 
180 	if (!INET_ECN_is_not_ect(ecnfield)) {
181 		u32 pcount = is_ce * max_t(u16, 1, skb_shinfo(skb)->gso_segs);
182 
183 		/* As for accurate ECN, the TCP_ECN_SEEN flag is set by
184 		 * tcp_ecn_received_counters() when the ECN codepoint of
185 		 * received TCP data or ACK contains ECT(0), ECT(1), or CE.
186 		 */
187 		if (!tcp_ecn_mode_rfc3168(tp))
188 			tp->ecn_flags |= TCP_ECN_SEEN;
189 
190 		/* ACE counter tracks *all* segments including pure ACKs */
191 		tp->received_ce += pcount;
192 		tp->received_ce_pending = min(tp->received_ce_pending + pcount,
193 					      0xfU);
194 
195 		if (len > 0)
196 			tp->received_ecn_bytes[ecnfield - 1] += len;
197 	}
198 }
199 
200 /* AccECN specification, 2.2: [...] A Data Receiver maintains four counters
201  * initialized at the start of	the half-connection. [...] These byte counters
202  * reflect only the TCP payload length, excluding TCP header and TCP options.
203  */
204 static inline void tcp_ecn_received_counters_payload(struct sock *sk,
205 						     const struct sk_buff *skb)
206 {
207 	const struct tcphdr *th = (const struct tcphdr *)skb->data;
208 
209 	tcp_ecn_received_counters(sk, skb, skb->len - th->doff * 4);
210 }
211 
212 /* AccECN specification, 5.1: [...] a server can determine that it
213  * negotiated AccECN as [...] if the ACK contains an ACE field with
214  * the value 0b010 to 0b111 (decimal 2 to 7).
215  */
216 static inline bool cookie_accecn_ok(const struct tcphdr *th)
217 {
218 	return tcp_accecn_ace(th) > 0x1;
219 }
220 
221 /* Used to form the ACE flags for SYN/ACK */
222 static inline u16 tcp_accecn_reflector_flags(u8 ect)
223 {
224 	/* TCP ACE flags of SYN/ACK are set based on IP-ECN received from SYN.
225 	 * Below is an excerpt from the 1st block of Table 2 of AccECN spec,
226 	 * in which TCP ACE flags are encoded as: (AE << 2) | (CWR << 1) | ECE
227 	 */
228 	static const u8 ecn_to_ace_flags[4] = {
229 		0b010,	/* Not-ECT is received */
230 		0b011,	/* ECT(1) is received */
231 		0b100,	/* ECT(0) is received */
232 		0b110	/* CE is received */
233 	};
234 
235 	return FIELD_PREP(TCPHDR_ACE, ecn_to_ace_flags[ect & 0x3]);
236 }
237 
238 /* AccECN specification, 3.1.2: If a TCP server that implements AccECN
239  * receives a SYN with the three TCP header flags (AE, CWR and ECE) set
240  * to any combination other than 000, 011 or 111, it MUST negotiate the
241  * use of AccECN as if they had been set to 111.
242  */
243 static inline bool tcp_accecn_syn_requested(const struct tcphdr *th)
244 {
245 	u8 ace = tcp_accecn_ace(th);
246 
247 	return ace && ace != 0x3;
248 }
249 
250 static inline void __tcp_accecn_init_bytes_counters(int *counter_array)
251 {
252 	BUILD_BUG_ON(INET_ECN_ECT_1 != 0x1);
253 	BUILD_BUG_ON(INET_ECN_ECT_0 != 0x2);
254 	BUILD_BUG_ON(INET_ECN_CE != 0x3);
255 
256 	counter_array[INET_ECN_ECT_1 - 1] = 0;
257 	counter_array[INET_ECN_ECT_0 - 1] = 0;
258 	counter_array[INET_ECN_CE - 1] = 0;
259 }
260 
261 static inline void tcp_accecn_init_counters(struct tcp_sock *tp)
262 {
263 	tp->received_ce = 0;
264 	tp->received_ce_pending = 0;
265 	__tcp_accecn_init_bytes_counters(tp->received_ecn_bytes);
266 }
267 
268 /* Used for make_synack to form the ACE flags */
269 static inline void tcp_accecn_echo_syn_ect(struct tcphdr *th, u8 ect)
270 {
271 	/* TCP ACE flags of SYN/ACK are set based on IP-ECN codepoint received
272 	 * from SYN. Below is an excerpt from Table 2 of the AccECN spec:
273 	 * +====================+====================================+
274 	 * |  IP-ECN codepoint  |  Respective ACE falgs on SYN/ACK   |
275 	 * |   received on SYN  |       AE       CWR       ECE       |
276 	 * +====================+====================================+
277 	 * |      Not-ECT       |       0         1         0        |
278 	 * |      ECT(1)        |       0         1         1        |
279 	 * |      ECT(0)        |       1         0         0        |
280 	 * |        CE          |       1         1         0        |
281 	 * +====================+====================================+
282 	 */
283 	th->ae = !!(ect & INET_ECN_ECT_0);
284 	th->cwr = ect != INET_ECN_ECT_0;
285 	th->ece = ect == INET_ECN_ECT_1;
286 }
287 
288 static inline void tcp_accecn_set_ace(struct tcp_sock *tp, struct sk_buff *skb,
289 				      struct tcphdr *th)
290 {
291 	u32 wire_ace;
292 
293 	/* The final packet of the 3WHS or anything like it must reflect
294 	 * the SYN/ACK ECT instead of putting CEP into ACE field, such
295 	 * case show up in tcp_flags.
296 	 */
297 	if (likely(!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACE))) {
298 		wire_ace = tp->received_ce + TCP_ACCECN_CEP_INIT_OFFSET;
299 		th->ece = !!(wire_ace & 0x1);
300 		th->cwr = !!(wire_ace & 0x2);
301 		th->ae = !!(wire_ace & 0x4);
302 		tp->received_ce_pending = 0;
303 	}
304 }
305 
306 /* See Table 2 of the AccECN draft */
307 static inline void tcp_ecn_rcv_synack(struct sock *sk, const struct tcphdr *th,
308 				      u8 ip_dsfield)
309 {
310 	struct tcp_sock *tp = tcp_sk(sk);
311 	u8 ace = tcp_accecn_ace(th);
312 
313 	switch (ace) {
314 	case 0x0:
315 	case 0x7:
316 		/* +========+========+============+=============+
317 		 * | A      | B      |  SYN/ACK   |  Feedback   |
318 		 * |        |        |    B->A    |  Mode of A  |
319 		 * |        |        | AE CWR ECE |             |
320 		 * +========+========+============+=============+
321 		 * | AccECN | No ECN | 0   0   0  |   Not ECN   |
322 		 * | AccECN | Broken | 1   1   1  |   Not ECN   |
323 		 * +========+========+============+=============+
324 		 */
325 		tcp_ecn_mode_set(tp, TCP_ECN_DISABLED);
326 		break;
327 	case 0x1:
328 	case 0x5:
329 		/* +========+========+============+=============+
330 		 * | A      | B      |  SYN/ACK   |  Feedback   |
331 		 * |        |        |    B->A    |  Mode of A  |
332 		 * |        |        | AE CWR ECE |             |
333 		 * +========+========+============+=============+
334 		 * | AccECN | Nonce  | 1   0   1  | (Reserved)  |
335 		 * | AccECN | ECN    | 0   0   1  | Classic ECN |
336 		 * | Nonce  | AccECN | 0   0   1  | Classic ECN |
337 		 * | ECN    | AccECN | 0   0   1  | Classic ECN |
338 		 * +========+========+============+=============+
339 		 */
340 		if (tcp_ecn_mode_pending(tp))
341 			/* Downgrade from AccECN, or requested initially */
342 			tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168);
343 		break;
344 	default:
345 		tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN);
346 		tp->syn_ect_rcv = ip_dsfield & INET_ECN_MASK;
347 		if (INET_ECN_is_ce(ip_dsfield) &&
348 		    tcp_accecn_validate_syn_feedback(sk, ace,
349 						     tp->syn_ect_snt)) {
350 			tp->received_ce++;
351 			tp->received_ce_pending++;
352 		}
353 		break;
354 	}
355 }
356 
357 static inline void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th,
358 				   const struct sk_buff *skb)
359 {
360 	if (tcp_ecn_mode_pending(tp)) {
361 		if (!tcp_accecn_syn_requested(th)) {
362 			/* Downgrade to classic ECN feedback */
363 			tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168);
364 		} else {
365 			tp->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield &
366 					  INET_ECN_MASK;
367 			tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN);
368 		}
369 	}
370 	if (tcp_ecn_mode_rfc3168(tp) && (!th->ece || !th->cwr))
371 		tcp_ecn_mode_set(tp, TCP_ECN_DISABLED);
372 }
373 
374 static inline bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp,
375 					const struct tcphdr *th)
376 {
377 	if (th->ece && !th->syn && tcp_ecn_mode_rfc3168(tp))
378 		return true;
379 	return false;
380 }
381 
382 /* Packet ECN state for a SYN-ACK */
383 static inline void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb)
384 {
385 	struct tcp_sock *tp = tcp_sk(sk);
386 
387 	TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR;
388 	if (tcp_ecn_disabled(tp))
389 		TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE;
390 	else if (tcp_ca_needs_ecn(sk) ||
391 		 tcp_bpf_ca_needs_ecn(sk))
392 		INET_ECN_xmit(sk);
393 
394 	if (tp->ecn_flags & TCP_ECN_MODE_ACCECN) {
395 		TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ACE;
396 		TCP_SKB_CB(skb)->tcp_flags |=
397 			tcp_accecn_reflector_flags(tp->syn_ect_rcv);
398 		tp->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK;
399 	}
400 }
401 
402 /* Packet ECN state for a SYN.  */
403 static inline void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
404 {
405 	struct tcp_sock *tp = tcp_sk(sk);
406 	bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk);
407 	bool use_ecn, use_accecn;
408 	u8 tcp_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn);
409 
410 	use_accecn = tcp_ecn == TCP_ECN_IN_ACCECN_OUT_ACCECN;
411 	use_ecn = tcp_ecn == TCP_ECN_IN_ECN_OUT_ECN ||
412 		  tcp_ecn == TCP_ECN_IN_ACCECN_OUT_ECN ||
413 		  tcp_ca_needs_ecn(sk) || bpf_needs_ecn || use_accecn;
414 
415 	if (!use_ecn) {
416 		const struct dst_entry *dst = __sk_dst_get(sk);
417 
418 		if (dst && dst_feature(dst, RTAX_FEATURE_ECN))
419 			use_ecn = true;
420 	}
421 
422 	tp->ecn_flags = 0;
423 
424 	if (use_ecn) {
425 		if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn)
426 			INET_ECN_xmit(sk);
427 
428 		TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
429 		if (use_accecn) {
430 			TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_AE;
431 			tcp_ecn_mode_set(tp, TCP_ECN_MODE_PENDING);
432 			tp->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK;
433 		} else {
434 			tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168);
435 		}
436 	}
437 }
438 
439 static inline void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb)
440 {
441 	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)) {
442 		/* tp->ecn_flags are cleared at a later point in time when
443 		 * SYN ACK is ultimatively being received.
444 		 */
445 		TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ACE;
446 	}
447 }
448 
449 static inline void
450 tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th)
451 {
452 	if (tcp_rsk(req)->accecn_ok)
453 		tcp_accecn_echo_syn_ect(th, tcp_rsk(req)->syn_ect_rcv);
454 	else if (inet_rsk(req)->ecn_ok)
455 		th->ece = 1;
456 }
457 
#endif /* _TCP_ECN_H */
459