xref: /linux/include/net/tcp_ecn.h (revision 3cae34274c79e0c60ccd1c10516973af1aed2a7c)
1 /* SPDX-License-Identifier: GPL-2.0-or-later */
2 #ifndef _TCP_ECN_H
3 #define _TCP_ECN_H
4 
5 #include <linux/tcp.h>
6 #include <linux/skbuff.h>
7 #include <linux/bitfield.h>
8 
9 #include <net/inet_connection_sock.h>
10 #include <net/sock.h>
11 #include <net/tcp.h>
12 #include <net/inet_ecn.h>
13 
/* The highest ECN variant (Accurate ECN, ECN, or no ECN) that is
 * attempted to be negotiated and requested for incoming connection
 * and outgoing connection, respectively.
 *
 * Each name reads TCP_ECN_IN_<incoming>_OUT_<outgoing>.  The numeric
 * values are compared against the tcp_ecn sysctl value, so they are
 * spelled out explicitly and must not be renumbered.
 */
enum tcp_ecn_mode {
	/* in: no ECN,  out: no ECN */
	TCP_ECN_IN_NOECN_OUT_NOECN	= 0,
	/* in: ECN,     out: ECN */
	TCP_ECN_IN_ECN_OUT_ECN		= 1,
	/* in: ECN,     out: no ECN */
	TCP_ECN_IN_ECN_OUT_NOECN	= 2,
	/* in: AccECN,  out: AccECN */
	TCP_ECN_IN_ACCECN_OUT_ACCECN	= 3,
	/* in: AccECN,  out: ECN */
	TCP_ECN_IN_ACCECN_OUT_ECN	= 4,
	/* in: AccECN,  out: no ECN */
	TCP_ECN_IN_ACCECN_OUT_NOECN	= 5,
};
26 
27 static inline void tcp_ecn_queue_cwr(struct tcp_sock *tp)
28 {
29 	/* Do not set CWR if in AccECN mode! */
30 	if (tcp_ecn_mode_rfc3168(tp))
31 		tp->ecn_flags |= TCP_ECN_QUEUE_CWR;
32 }
33 
34 static inline void tcp_ecn_accept_cwr(struct sock *sk,
35 				      const struct sk_buff *skb)
36 {
37 	struct tcp_sock *tp = tcp_sk(sk);
38 
39 	if (tcp_ecn_mode_rfc3168(tp) && tcp_hdr(skb)->cwr) {
40 		tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
41 
42 		/* If the sender is telling us it has entered CWR, then its
43 		 * cwnd may be very low (even just 1 packet), so we should ACK
44 		 * immediately.
45 		 */
46 		if (TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq)
47 			inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
48 	}
49 }
50 
51 static inline void tcp_ecn_withdraw_cwr(struct tcp_sock *tp)
52 {
53 	tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
54 }
55 
/* Bit flags stored in tp->accecn_fail_mode.
 *
 * Each bit is queried by the matching tcp_accecn_*_fail_*() helper and
 * is set (never cleared) through tcp_accecn_fail_mode_set().  The names
 * distinguish the ACE field from the AccECN option ("OPT") and the send
 * direction from the receive direction.
 */
#define TCP_ACCECN_ACE_FAIL_SEND	BIT(0)	/* ACE field, send side */
#define TCP_ACCECN_ACE_FAIL_RECV	BIT(1)	/* ACE field, receive side */
#define TCP_ACCECN_OPT_FAIL_SEND	BIT(2)	/* AccECN option, send side */
#define TCP_ACCECN_OPT_FAIL_RECV	BIT(3)	/* AccECN option, receive side */
61 
62 static inline bool tcp_accecn_ace_fail_send(const struct tcp_sock *tp)
63 {
64 	return tp->accecn_fail_mode & TCP_ACCECN_ACE_FAIL_SEND;
65 }
66 
67 static inline bool tcp_accecn_ace_fail_recv(const struct tcp_sock *tp)
68 {
69 	return tp->accecn_fail_mode & TCP_ACCECN_ACE_FAIL_RECV;
70 }
71 
72 static inline bool tcp_accecn_opt_fail_send(const struct tcp_sock *tp)
73 {
74 	return tp->accecn_fail_mode & TCP_ACCECN_OPT_FAIL_SEND;
75 }
76 
77 static inline bool tcp_accecn_opt_fail_recv(const struct tcp_sock *tp)
78 {
79 	return tp->accecn_fail_mode & TCP_ACCECN_OPT_FAIL_RECV;
80 }
81 
82 static inline void tcp_accecn_fail_mode_set(struct tcp_sock *tp, u8 mode)
83 {
84 	tp->accecn_fail_mode |= mode;
85 }
86 
87 static inline u8 tcp_accecn_ace(const struct tcphdr *th)
88 {
89 	return (th->ae << 2) | (th->cwr << 1) | th->ece;
90 }
91 
92 /* Infer the ECT value our SYN arrived with from the echoed ACE field */
93 static inline int tcp_accecn_extract_syn_ect(u8 ace)
94 {
95 	/* Below is an excerpt from the 1st block of Table 2 of AccECN spec */
96 	static const int ace_to_ecn[8] = {
97 		INET_ECN_ECT_0,		/* 0b000 (Undefined) */
98 		INET_ECN_ECT_1,		/* 0b001 (Undefined) */
99 		INET_ECN_NOT_ECT,	/* 0b010 (Not-ECT is received) */
100 		INET_ECN_ECT_1,		/* 0b011 (ECT-1 is received) */
101 		INET_ECN_ECT_0,		/* 0b100 (ECT-0 is received) */
102 		INET_ECN_ECT_1,		/* 0b101 (Reserved) */
103 		INET_ECN_CE,		/* 0b110 (CE is received) */
104 		INET_ECN_ECT_1		/* 0b111 (Undefined) */
105 	};
106 
107 	return ace_to_ecn[ace & 0x7];
108 }
109 
110 /* Check ECN field transition to detect invalid transitions */
111 static inline bool tcp_ect_transition_valid(u8 snt, u8 rcv)
112 {
113 	if (rcv == snt)
114 		return true;
115 
116 	/* Non-ECT altered to something or something became non-ECT */
117 	if (snt == INET_ECN_NOT_ECT || rcv == INET_ECN_NOT_ECT)
118 		return false;
119 	/* CE -> ECT(0/1)? */
120 	if (snt == INET_ECN_CE)
121 		return false;
122 	return true;
123 }
124 
125 static inline bool tcp_accecn_validate_syn_feedback(struct sock *sk, u8 ace,
126 						    u8 sent_ect)
127 {
128 	u8 ect = tcp_accecn_extract_syn_ect(ace);
129 	struct tcp_sock *tp = tcp_sk(sk);
130 
131 	if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback))
132 		return true;
133 
134 	if (!tcp_ect_transition_valid(sent_ect, ect)) {
135 		tcp_accecn_fail_mode_set(tp, TCP_ACCECN_ACE_FAIL_RECV);
136 		return false;
137 	}
138 
139 	return true;
140 }
141 
142 /* Validate the 3rd ACK based on the ACE field, see Table 4 of AccECN spec */
143 static inline void tcp_accecn_third_ack(struct sock *sk,
144 					const struct sk_buff *skb, u8 sent_ect)
145 {
146 	u8 ace = tcp_accecn_ace(tcp_hdr(skb));
147 	struct tcp_sock *tp = tcp_sk(sk);
148 
149 	switch (ace) {
150 	case 0x0:
151 		/* Invalid value */
152 		tcp_accecn_fail_mode_set(tp, TCP_ACCECN_ACE_FAIL_RECV);
153 		break;
154 	case 0x7:
155 	case 0x5:
156 	case 0x1:
157 		/* Unused but legal values */
158 		break;
159 	default:
160 		/* Validation only applies to first non-data packet */
161 		if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq &&
162 		    !TCP_SKB_CB(skb)->sacked &&
163 		    tcp_accecn_validate_syn_feedback(sk, ace, sent_ect)) {
164 			if ((tcp_accecn_extract_syn_ect(ace) == INET_ECN_CE) &&
165 			    !tp->delivered_ce)
166 				tp->delivered_ce++;
167 		}
168 		break;
169 	}
170 }
171 
172 /* Updates Accurate ECN received counters from the received IP ECN field */
173 static inline void tcp_ecn_received_counters(struct sock *sk,
174 					     const struct sk_buff *skb)
175 {
176 	u8 ecnfield = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK;
177 	u8 is_ce = INET_ECN_is_ce(ecnfield);
178 	struct tcp_sock *tp = tcp_sk(sk);
179 
180 	if (!INET_ECN_is_not_ect(ecnfield)) {
181 		u32 pcount = is_ce * max_t(u16, 1, skb_shinfo(skb)->gso_segs);
182 
183 		/* As for accurate ECN, the TCP_ECN_SEEN flag is set by
184 		 * tcp_ecn_received_counters() when the ECN codepoint of
185 		 * received TCP data or ACK contains ECT(0), ECT(1), or CE.
186 		 */
187 		if (!tcp_ecn_mode_rfc3168(tp))
188 			tp->ecn_flags |= TCP_ECN_SEEN;
189 
190 		/* ACE counter tracks *all* segments including pure ACKs */
191 		tp->received_ce += pcount;
192 		tp->received_ce_pending = min(tp->received_ce_pending + pcount,
193 					      0xfU);
194 	}
195 }
196 
/* AccECN specification, 5.1: [...] a server can determine that it
 * negotiated AccECN as [...] if the ACK contains an ACE field with
 * the value 0b010 to 0b111 (decimal 2 to 7).
 *
 * Returns true when the ACE field of @th is at least 2.
 */
static inline bool cookie_accecn_ok(const struct tcphdr *th)
{
	return tcp_accecn_ace(th) >= 0x2;
}
205 
206 /* Used to form the ACE flags for SYN/ACK */
207 static inline u16 tcp_accecn_reflector_flags(u8 ect)
208 {
209 	/* TCP ACE flags of SYN/ACK are set based on IP-ECN received from SYN.
210 	 * Below is an excerpt from the 1st block of Table 2 of AccECN spec,
211 	 * in which TCP ACE flags are encoded as: (AE << 2) | (CWR << 1) | ECE
212 	 */
213 	static const u8 ecn_to_ace_flags[4] = {
214 		0b010,	/* Not-ECT is received */
215 		0b011,	/* ECT(1) is received */
216 		0b100,	/* ECT(0) is received */
217 		0b110	/* CE is received */
218 	};
219 
220 	return FIELD_PREP(TCPHDR_ACE, ecn_to_ace_flags[ect & 0x3]);
221 }
222 
223 /* AccECN specification, 3.1.2: If a TCP server that implements AccECN
224  * receives a SYN with the three TCP header flags (AE, CWR and ECE) set
225  * to any combination other than 000, 011 or 111, it MUST negotiate the
226  * use of AccECN as if they had been set to 111.
227  */
228 static inline bool tcp_accecn_syn_requested(const struct tcphdr *th)
229 {
230 	u8 ace = tcp_accecn_ace(th);
231 
232 	return ace && ace != 0x3;
233 }
234 
235 static inline void tcp_accecn_init_counters(struct tcp_sock *tp)
236 {
237 	tp->received_ce = 0;
238 	tp->received_ce_pending = 0;
239 }
240 
241 /* Used for make_synack to form the ACE flags */
242 static inline void tcp_accecn_echo_syn_ect(struct tcphdr *th, u8 ect)
243 {
244 	/* TCP ACE flags of SYN/ACK are set based on IP-ECN codepoint received
245 	 * from SYN. Below is an excerpt from Table 2 of the AccECN spec:
246 	 * +====================+====================================+
247 	 * |  IP-ECN codepoint  |  Respective ACE falgs on SYN/ACK   |
248 	 * |   received on SYN  |       AE       CWR       ECE       |
249 	 * +====================+====================================+
250 	 * |      Not-ECT       |       0         1         0        |
251 	 * |      ECT(1)        |       0         1         1        |
252 	 * |      ECT(0)        |       1         0         0        |
253 	 * |        CE          |       1         1         0        |
254 	 * +====================+====================================+
255 	 */
256 	th->ae = !!(ect & INET_ECN_ECT_0);
257 	th->cwr = ect != INET_ECN_ECT_0;
258 	th->ece = ect == INET_ECN_ECT_1;
259 }
260 
261 static inline void tcp_accecn_set_ace(struct tcp_sock *tp, struct sk_buff *skb,
262 				      struct tcphdr *th)
263 {
264 	u32 wire_ace;
265 
266 	/* The final packet of the 3WHS or anything like it must reflect
267 	 * the SYN/ACK ECT instead of putting CEP into ACE field, such
268 	 * case show up in tcp_flags.
269 	 */
270 	if (likely(!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACE))) {
271 		wire_ace = tp->received_ce + TCP_ACCECN_CEP_INIT_OFFSET;
272 		th->ece = !!(wire_ace & 0x1);
273 		th->cwr = !!(wire_ace & 0x2);
274 		th->ae = !!(wire_ace & 0x4);
275 		tp->received_ce_pending = 0;
276 	}
277 }
278 
279 /* See Table 2 of the AccECN draft */
280 static inline void tcp_ecn_rcv_synack(struct sock *sk, const struct tcphdr *th,
281 				      u8 ip_dsfield)
282 {
283 	struct tcp_sock *tp = tcp_sk(sk);
284 	u8 ace = tcp_accecn_ace(th);
285 
286 	switch (ace) {
287 	case 0x0:
288 	case 0x7:
289 		/* +========+========+============+=============+
290 		 * | A      | B      |  SYN/ACK   |  Feedback   |
291 		 * |        |        |    B->A    |  Mode of A  |
292 		 * |        |        | AE CWR ECE |             |
293 		 * +========+========+============+=============+
294 		 * | AccECN | No ECN | 0   0   0  |   Not ECN   |
295 		 * | AccECN | Broken | 1   1   1  |   Not ECN   |
296 		 * +========+========+============+=============+
297 		 */
298 		tcp_ecn_mode_set(tp, TCP_ECN_DISABLED);
299 		break;
300 	case 0x1:
301 	case 0x5:
302 		/* +========+========+============+=============+
303 		 * | A      | B      |  SYN/ACK   |  Feedback   |
304 		 * |        |        |    B->A    |  Mode of A  |
305 		 * |        |        | AE CWR ECE |             |
306 		 * +========+========+============+=============+
307 		 * | AccECN | Nonce  | 1   0   1  | (Reserved)  |
308 		 * | AccECN | ECN    | 0   0   1  | Classic ECN |
309 		 * | Nonce  | AccECN | 0   0   1  | Classic ECN |
310 		 * | ECN    | AccECN | 0   0   1  | Classic ECN |
311 		 * +========+========+============+=============+
312 		 */
313 		if (tcp_ecn_mode_pending(tp))
314 			/* Downgrade from AccECN, or requested initially */
315 			tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168);
316 		break;
317 	default:
318 		tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN);
319 		tp->syn_ect_rcv = ip_dsfield & INET_ECN_MASK;
320 		if (INET_ECN_is_ce(ip_dsfield) &&
321 		    tcp_accecn_validate_syn_feedback(sk, ace,
322 						     tp->syn_ect_snt)) {
323 			tp->received_ce++;
324 			tp->received_ce_pending++;
325 		}
326 		break;
327 	}
328 }
329 
330 static inline void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th,
331 				   const struct sk_buff *skb)
332 {
333 	if (tcp_ecn_mode_pending(tp)) {
334 		if (!tcp_accecn_syn_requested(th)) {
335 			/* Downgrade to classic ECN feedback */
336 			tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168);
337 		} else {
338 			tp->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield &
339 					  INET_ECN_MASK;
340 			tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN);
341 		}
342 	}
343 	if (tcp_ecn_mode_rfc3168(tp) && (!th->ece || !th->cwr))
344 		tcp_ecn_mode_set(tp, TCP_ECN_DISABLED);
345 }
346 
347 static inline bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp,
348 					const struct tcphdr *th)
349 {
350 	if (th->ece && !th->syn && tcp_ecn_mode_rfc3168(tp))
351 		return true;
352 	return false;
353 }
354 
355 /* Packet ECN state for a SYN-ACK */
356 static inline void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb)
357 {
358 	struct tcp_sock *tp = tcp_sk(sk);
359 
360 	TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR;
361 	if (tcp_ecn_disabled(tp))
362 		TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE;
363 	else if (tcp_ca_needs_ecn(sk) ||
364 		 tcp_bpf_ca_needs_ecn(sk))
365 		INET_ECN_xmit(sk);
366 
367 	if (tp->ecn_flags & TCP_ECN_MODE_ACCECN) {
368 		TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ACE;
369 		TCP_SKB_CB(skb)->tcp_flags |=
370 			tcp_accecn_reflector_flags(tp->syn_ect_rcv);
371 		tp->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK;
372 	}
373 }
374 
375 /* Packet ECN state for a SYN.  */
376 static inline void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
377 {
378 	struct tcp_sock *tp = tcp_sk(sk);
379 	bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk);
380 	bool use_ecn, use_accecn;
381 	u8 tcp_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn);
382 
383 	use_accecn = tcp_ecn == TCP_ECN_IN_ACCECN_OUT_ACCECN;
384 	use_ecn = tcp_ecn == TCP_ECN_IN_ECN_OUT_ECN ||
385 		  tcp_ecn == TCP_ECN_IN_ACCECN_OUT_ECN ||
386 		  tcp_ca_needs_ecn(sk) || bpf_needs_ecn || use_accecn;
387 
388 	if (!use_ecn) {
389 		const struct dst_entry *dst = __sk_dst_get(sk);
390 
391 		if (dst && dst_feature(dst, RTAX_FEATURE_ECN))
392 			use_ecn = true;
393 	}
394 
395 	tp->ecn_flags = 0;
396 
397 	if (use_ecn) {
398 		if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn)
399 			INET_ECN_xmit(sk);
400 
401 		TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
402 		if (use_accecn) {
403 			TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_AE;
404 			tcp_ecn_mode_set(tp, TCP_ECN_MODE_PENDING);
405 			tp->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK;
406 		} else {
407 			tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168);
408 		}
409 	}
410 }
411 
412 static inline void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb)
413 {
414 	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)) {
415 		/* tp->ecn_flags are cleared at a later point in time when
416 		 * SYN ACK is ultimatively being received.
417 		 */
418 		TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ACE;
419 	}
420 }
421 
422 static inline void
423 tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th)
424 {
425 	if (tcp_rsk(req)->accecn_ok)
426 		tcp_accecn_echo_syn_ect(th, tcp_rsk(req)->syn_ect_rcv);
427 	else if (inet_rsk(req)->ecn_ok)
428 		th->ece = 1;
429 }
430 
#endif /* _TCP_ECN_H */
432