xref: /freebsd/sys/netinet/tcp_ecn.c (revision be181ee2a28aa2b4b0e76684bce9f673ef668874)
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
5  *      The Regents of the University of California.  All rights reserved.
6  * Copyright (c) 2007-2008,2010
7  *      Swinburne University of Technology, Melbourne, Australia.
8  * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
9  * Copyright (c) 2010 The FreeBSD Foundation
10  * Copyright (c) 2010-2011 Juniper Networks, Inc.
11  * Copyright (c) 2019 Richard Scheffenegger <srichard@netapp.com>
12  * All rights reserved.
13  *
14  * Portions of this software were developed at the Centre for Advanced Internet
15  * Architectures, Swinburne University of Technology, by Lawrence Stewart,
16  * James Healy and David Hayes, made possible in part by a grant from the Cisco
17  * University Research Program Fund at Community Foundation Silicon Valley.
18  *
19  * Portions of this software were developed at the Centre for Advanced
20  * Internet Architectures, Swinburne University of Technology, Melbourne,
21  * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
22  *
23  * Portions of this software were developed by Robert N. M. Watson under
24  * contract to Juniper Networks, Inc.
25  *
26  * Redistribution and use in source and binary forms, with or without
27  * modification, are permitted provided that the following conditions
28  * are met:
29  * 1. Redistributions of source code must retain the above copyright
30  *    notice, this list of conditions and the following disclaimer.
31  * 2. Redistributions in binary form must reproduce the above copyright
32  *    notice, this list of conditions and the following disclaimer in the
33  *    documentation and/or other materials provided with the distribution.
34  * 3. Neither the name of the University nor the names of its contributors
35  *    may be used to endorse or promote products derived from this software
36  *    without specific prior written permission.
37  *
38  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
39  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
40  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
41  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
42  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
43  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
44  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
45  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
46  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
47  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
48  * SUCH DAMAGE.
49  *
50  *      @(#)tcp_ecn.c 8.12 (Berkeley) 5/24/95
51  */
52 
53 /*
54  * Utility functions to deal with Explicit Congestion Notification in TCP
55  * implementing the essential parts of the Accurate ECN extension
56  * https://tools.ietf.org/html/draft-ietf-tcpm-accurate-ecn-09
57  */
58 
59 #include <sys/cdefs.h>
60 __FBSDID("$FreeBSD$");
61 
62 #include "opt_inet.h"
63 #include "opt_inet6.h"
64 #include "opt_tcpdebug.h"
65 
66 #include <sys/param.h>
67 #include <sys/systm.h>
68 #include <sys/kernel.h>
69 #include <sys/sysctl.h>
70 #include <sys/malloc.h>
71 #include <sys/mbuf.h>
72 #include <sys/socket.h>
73 #include <sys/socketvar.h>
74 
75 #include <machine/cpu.h>
76 
77 #include <vm/uma.h>
78 
79 #include <net/if.h>
80 #include <net/if_var.h>
81 #include <net/route.h>
82 #include <net/vnet.h>
83 
84 #include <netinet/in.h>
85 #include <netinet/in_systm.h>
86 #include <netinet/ip.h>
87 #include <netinet/in_var.h>
88 #include <netinet/in_pcb.h>
89 #include <netinet/ip_var.h>
90 #include <netinet/ip6.h>
91 #include <netinet/icmp6.h>
92 #include <netinet6/nd6.h>
93 #include <netinet6/ip6_var.h>
94 #include <netinet6/in6_pcb.h>
95 #include <netinet/tcp.h>
96 #include <netinet/tcp_fsm.h>
97 #include <netinet/tcp_seq.h>
98 #include <netinet/tcp_var.h>
99 #include <netinet/tcp_syncache.h>
100 #include <netinet/tcp_timer.h>
101 #include <netinet/tcpip.h>
102 #include <netinet/tcp_ecn.h>
103 
104 
105 /*
106  * Process incoming SYN,ACK packet
107  */
108 void
109 tcp_ecn_input_syn_sent(struct tcpcb *tp, uint16_t thflags, int iptos)
110 {
111 
112 	if (V_tcp_do_ecn == 0)
113 		return;
114 	if ((V_tcp_do_ecn == 1) ||
115 	    (V_tcp_do_ecn == 2)) {
116 		/* RFC3168 ECN handling */
117 		if ((thflags & (TH_CWR | TH_ECE)) == (0 | TH_ECE)) {
118 			tp->t_flags2 |= TF2_ECN_PERMIT;
119 			TCPSTAT_INC(tcps_ecn_shs);
120 		}
121 	} else
122 	/* decoding Accurate ECN according to table in section 3.1.1 */
123 	if ((V_tcp_do_ecn == 3) ||
124 	    (V_tcp_do_ecn == 4)) {
125 		/*
126 		 * on the SYN,ACK, process the AccECN
127 		 * flags indicating the state the SYN
128 		 * was delivered.
129 		 * Reactions to Path ECN mangling can
130 		 * come here.
131 		 */
132 		switch (thflags & (TH_AE | TH_CWR | TH_ECE)) {
133 		/* RFC3168 SYN */
134 		case (0|0|TH_ECE):
135 			tp->t_flags2 |= TF2_ECN_PERMIT;
136 			TCPSTAT_INC(tcps_ecn_shs);
137 			break;
138 		/* non-ECT SYN */
139 		case (0|TH_CWR|0):
140 			tp->t_flags2 |= TF2_ACE_PERMIT;
141 			tp->t_scep = 5;
142 			TCPSTAT_INC(tcps_ecn_shs);
143 			TCPSTAT_INC(tcps_ace_nect);
144 			break;
145 		/* ECT0 SYN */
146 		case (TH_AE|0|0):
147 			tp->t_flags2 |= TF2_ACE_PERMIT;
148 			tp->t_scep = 5;
149 			TCPSTAT_INC(tcps_ecn_shs);
150 			TCPSTAT_INC(tcps_ace_ect0);
151 			break;
152 		/* ECT1 SYN */
153 		case (0|TH_CWR|TH_ECE):
154 			tp->t_flags2 |= TF2_ACE_PERMIT;
155 			tp->t_scep = 5;
156 			TCPSTAT_INC(tcps_ecn_shs);
157 			TCPSTAT_INC(tcps_ace_ect1);
158 			break;
159 		/* CE SYN */
160 		case (TH_AE|TH_CWR|0):
161 			tp->t_flags2 |= TF2_ACE_PERMIT;
162 			tp->t_scep = 6;
163 			/*
164 			 * reduce the IW to 2 MSS (to
165 			 * account for delayed acks) if
166 			 * the SYN,ACK was CE marked
167 			 */
168 			tp->snd_cwnd = 2 * tcp_maxseg(tp);
169 			TCPSTAT_INC(tcps_ecn_shs);
170 			TCPSTAT_INC(tcps_ace_nect);
171 			break;
172 		default:
173 			break;
174 		}
175 		/*
176 		 * Set the AccECN Codepoints on
177 		 * the outgoing <ACK> to the ECN
178 		 * state of the <SYN,ACK>
179 		 * according to table 3 in the
180 		 * AccECN draft
181 		 */
182 		switch (iptos & IPTOS_ECN_MASK) {
183 		case (IPTOS_ECN_NOTECT):
184 			tp->t_rcep = 0b010;
185 			break;
186 		case (IPTOS_ECN_ECT0):
187 			tp->t_rcep = 0b100;
188 			break;
189 		case (IPTOS_ECN_ECT1):
190 			tp->t_rcep = 0b011;
191 			break;
192 		case (IPTOS_ECN_CE):
193 			tp->t_rcep = 0b110;
194 			break;
195 		}
196 	}
197 }
198 
199 /*
200  * Handle parallel SYN for ECN
201  */
202 void
203 tcp_ecn_input_parallel_syn(struct tcpcb *tp, uint16_t thflags, int iptos)
204 {
205 	if (thflags & TH_ACK)
206 		return;
207 	if (V_tcp_do_ecn == 0)
208 		return;
209 	if ((V_tcp_do_ecn == 1) ||
210 	    (V_tcp_do_ecn == 2)) {
211 		/* RFC3168 ECN handling */
212 		if ((thflags & (TH_CWR | TH_ECE)) == (TH_CWR | TH_ECE)) {
213 			tp->t_flags2 |= TF2_ECN_PERMIT;
214 			tp->t_flags2 |= TF2_ECN_SND_ECE;
215 			TCPSTAT_INC(tcps_ecn_shs);
216 		}
217 	} else
218 	if ((V_tcp_do_ecn == 3) ||
219 	    (V_tcp_do_ecn == 4)) {
220 		/* AccECN handling */
221 		switch (thflags & (TH_AE | TH_CWR | TH_ECE)) {
222 		default:
223 		case (0|0|0):
224 			break;
225 		case (0|TH_CWR|TH_ECE):
226 			tp->t_flags2 |= TF2_ECN_PERMIT;
227 			tp->t_flags2 |= TF2_ECN_SND_ECE;
228 			TCPSTAT_INC(tcps_ecn_shs);
229 			break;
230 		case (TH_AE|TH_CWR|TH_ECE):
231 			tp->t_flags2 |= TF2_ACE_PERMIT;
232 			TCPSTAT_INC(tcps_ecn_shs);
233 			/*
234 			 * Set the AccECN Codepoints on
235 			 * the outgoing <ACK> to the ECN
236 			 * state of the <SYN,ACK>
237 			 * according to table 3 in the
238 			 * AccECN draft
239 			 */
240 			switch (iptos & IPTOS_ECN_MASK) {
241 			case (IPTOS_ECN_NOTECT):
242 				tp->t_rcep = 0b010;
243 				break;
244 			case (IPTOS_ECN_ECT0):
245 				tp->t_rcep = 0b100;
246 				break;
247 			case (IPTOS_ECN_ECT1):
248 				tp->t_rcep = 0b011;
249 				break;
250 			case (IPTOS_ECN_CE):
251 				tp->t_rcep = 0b110;
252 				break;
253 			}
254 			break;
255 		}
256 	}
257 }
258 
259 /*
260  * TCP ECN processing.
261  */
262 int
263 tcp_ecn_input_segment(struct tcpcb *tp, uint16_t thflags, int iptos)
264 {
265 	int delta_ace = 0;
266 
267 	if (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT)) {
268 		switch (iptos & IPTOS_ECN_MASK) {
269 		case IPTOS_ECN_CE:
270 			TCPSTAT_INC(tcps_ecn_ce);
271 			break;
272 		case IPTOS_ECN_ECT0:
273 			TCPSTAT_INC(tcps_ecn_ect0);
274 			break;
275 		case IPTOS_ECN_ECT1:
276 			TCPSTAT_INC(tcps_ecn_ect1);
277 			break;
278 		}
279 
280 		if (tp->t_flags2 & TF2_ACE_PERMIT) {
281 			if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
282 				tp->t_rcep += 1;
283 			if (tp->t_flags2 & TF2_ECN_PERMIT) {
284 				delta_ace = (tcp_ecn_get_ace(thflags) + 8 -
285 					    (tp->t_scep & 0x07)) & 0x07;
286 				tp->t_scep += delta_ace;
287 			} else {
288 				/*
289 				 * process the final ACK of the 3WHS
290 				 * see table 3 in draft-ietf-tcpm-accurate-ecn
291 				 */
292 				switch (tcp_ecn_get_ace(thflags)) {
293 				case 0b010:
294 					/* nonECT SYN or SYN,ACK */
295 					/* Fallthrough */
296 				case 0b011:
297 					/* ECT1 SYN or SYN,ACK */
298 					/* Fallthrough */
299 				case 0b100:
300 					/* ECT0 SYN or SYN,ACK */
301 					tp->t_scep = 5;
302 					break;
303 				case 0b110:
304 					/* CE SYN or SYN,ACK */
305 					tp->t_scep = 6;
306 					tp->snd_cwnd = 2 * tcp_maxseg(tp);
307 					break;
308 				default:
309 					/* mangled AccECN handshake */
310 					tp->t_scep = 5;
311 					break;
312 				}
313 				tp->t_flags2 |= TF2_ECN_PERMIT;
314 			}
315 		} else {
316 			/* RFC3168 ECN handling */
317 			if ((thflags & (TH_SYN | TH_ECE)) == TH_ECE)
318 				delta_ace = 1;
319 			if (thflags & TH_CWR) {
320 				tp->t_flags2 &= ~TF2_ECN_SND_ECE;
321 				tp->t_flags |= TF_ACKNOW;
322 			}
323 			if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
324 				tp->t_flags2 |= TF2_ECN_SND_ECE;
325 		}
326 
327 		/* Process a packet differently from RFC3168. */
328 		cc_ecnpkt_handler_flags(tp, thflags, iptos);
329 	}
330 
331 	return delta_ace;
332 }
333 
334 /*
335  * Send ECN setup <SYN> packet header flags
336  */
337 uint16_t
338 tcp_ecn_output_syn_sent(struct tcpcb *tp)
339 {
340 	uint16_t thflags = 0;
341 
342 	if (V_tcp_do_ecn == 0)
343 		return thflags;
344 	if (V_tcp_do_ecn == 1) {
345 		/* Send a RFC3168 ECN setup <SYN> packet */
346 		if (tp->t_rxtshift >= 1) {
347 			if (tp->t_rxtshift <= V_tcp_ecn_maxretries)
348 				thflags = TH_ECE|TH_CWR;
349 		} else
350 			thflags = TH_ECE|TH_CWR;
351 	} else
352 	if (V_tcp_do_ecn == 3) {
353 		/* Send an Accurate ECN setup <SYN> packet */
354 		if (tp->t_rxtshift >= 1) {
355 			if (tp->t_rxtshift <= V_tcp_ecn_maxretries)
356 				thflags = TH_ECE|TH_CWR|TH_AE;
357 		} else
358 			thflags = TH_ECE|TH_CWR|TH_AE;
359 	}
360 
361 	return thflags;
362 }
363 
364 /*
365  * output processing of ECN feature
366  * returning IP ECN header codepoint
367  */
368 int
369 tcp_ecn_output_established(struct tcpcb *tp, uint16_t *thflags, int len, bool rxmit)
370 {
371 	int ipecn = IPTOS_ECN_NOTECT;
372 	bool newdata;
373 
374 	/*
375 	 * If the peer has ECN, mark data packets with
376 	 * ECN capable transmission (ECT).
377 	 * Ignore pure control packets, retransmissions
378 	 * and window probes.
379 	 */
380 	newdata = (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) &&
381 		    !rxmit &&
382 		    !((tp->t_flags & TF_FORCEDATA) && len == 1));
383 	/* RFC3168 ECN marking, only new data segments */
384 	if (newdata) {
385 		ipecn = IPTOS_ECN_ECT0;
386 		TCPSTAT_INC(tcps_ecn_ect0);
387 	}
388 	/*
389 	 * Reply with proper ECN notifications.
390 	 */
391 	if (tp->t_flags2 & TF2_ACE_PERMIT) {
392 		*thflags &= ~(TH_AE|TH_CWR|TH_ECE);
393 		if (tp->t_rcep & 0x01)
394 			*thflags |= TH_ECE;
395 		if (tp->t_rcep & 0x02)
396 			*thflags |= TH_CWR;
397 		if (tp->t_rcep & 0x04)
398 			*thflags |= TH_AE;
399 		if (!(tp->t_flags2 & TF2_ECN_PERMIT)) {
400 			/*
401 			 * here we process the final
402 			 * ACK of the 3WHS
403 			 */
404 			if (tp->t_rcep == 0b110) {
405 				tp->t_rcep = 6;
406 			} else {
407 				tp->t_rcep = 5;
408 			}
409 			tp->t_flags2 |= TF2_ECN_PERMIT;
410 		}
411 	} else {
412 		if (newdata &&
413 		    (tp->t_flags2 & TF2_ECN_SND_CWR)) {
414 			*thflags |= TH_CWR;
415 			tp->t_flags2 &= ~TF2_ECN_SND_CWR;
416 		}
417 		if (tp->t_flags2 & TF2_ECN_SND_ECE)
418 			*thflags |= TH_ECE;
419 	}
420 
421 	return ipecn;
422 }
423 
424 /*
425  * Set up the ECN related tcpcb fields from
426  * a syncache entry
427  */
428 void
429 tcp_ecn_syncache_socket(struct tcpcb *tp, struct syncache *sc)
430 {
431 	if (sc->sc_flags & SCF_ECN_MASK) {
432 		switch (sc->sc_flags & SCF_ECN_MASK) {
433 		case SCF_ECN:
434 			tp->t_flags2 |= TF2_ECN_PERMIT;
435 			break;
436 		case SCF_ACE_N:
437 			/* Fallthrough */
438 		case SCF_ACE_0:
439 			/* Fallthrough */
440 		case SCF_ACE_1:
441 			tp->t_flags2 |= TF2_ACE_PERMIT;
442 			tp->t_scep = 5;
443 			tp->t_rcep = 5;
444 			break;
445 		case SCF_ACE_CE:
446 			tp->t_flags2 |= TF2_ACE_PERMIT;
447 			tp->t_scep = 6;
448 			tp->t_rcep = 6;
449 			break;
450 		/* undefined SCF codepoint */
451 		default:
452 			break;
453 		}
454 	}
455 }
456 
457 /*
458  * Process a <SYN> packets ECN information, and provide the
459  * syncache with the relevant information.
460  */
461 int
462 tcp_ecn_syncache_add(uint16_t thflags, int iptos)
463 {
464 	int scflags = 0;
465 
466 	switch (thflags & (TH_AE|TH_CWR|TH_ECE)) {
467 	/* no ECN */
468 	case (0|0|0):
469 		break;
470 	/* legacy ECN */
471 	case (0|TH_CWR|TH_ECE):
472 		scflags = SCF_ECN;
473 		break;
474 	/* Accurate ECN */
475 	case (TH_AE|TH_CWR|TH_ECE):
476 		if ((V_tcp_do_ecn == 3) ||
477 		    (V_tcp_do_ecn == 4)) {
478 			switch (iptos & IPTOS_ECN_MASK) {
479 			case IPTOS_ECN_CE:
480 				scflags = SCF_ACE_CE;
481 				break;
482 			case IPTOS_ECN_ECT0:
483 				scflags = SCF_ACE_0;
484 				break;
485 			case IPTOS_ECN_ECT1:
486 				scflags = SCF_ACE_1;
487 				break;
488 			case IPTOS_ECN_NOTECT:
489 				scflags = SCF_ACE_N;
490 				break;
491 			}
492 		} else
493 			scflags = SCF_ECN;
494 		break;
495 	/* Default Case (section 3.1.2) */
496 	default:
497 		if ((V_tcp_do_ecn == 3) ||
498 		    (V_tcp_do_ecn == 4)) {
499 			switch (iptos & IPTOS_ECN_MASK) {
500 			case IPTOS_ECN_CE:
501 				scflags = SCF_ACE_CE;
502 				break;
503 			case IPTOS_ECN_ECT0:
504 				scflags = SCF_ACE_0;
505 				break;
506 			case IPTOS_ECN_ECT1:
507 				scflags = SCF_ACE_1;
508 				break;
509 			case IPTOS_ECN_NOTECT:
510 				scflags = SCF_ACE_N;
511 				break;
512 			}
513 		}
514 		break;
515 	}
516 	return scflags;
517 }
518 
519 /*
520  * Set up the ECN information for the <SYN,ACK> from
521  * syncache information.
522  */
523 uint16_t
524 tcp_ecn_syncache_respond(uint16_t thflags, struct syncache *sc)
525 {
526 	if ((thflags & TH_SYN) &&
527 	    (sc->sc_flags & SCF_ECN_MASK)) {
528 		switch (sc->sc_flags & SCF_ECN_MASK) {
529 		case SCF_ECN:
530 			thflags |= (0 | 0 | TH_ECE);
531 			TCPSTAT_INC(tcps_ecn_shs);
532 			break;
533 		case SCF_ACE_N:
534 			thflags |= (0 | TH_CWR | 0);
535 			TCPSTAT_INC(tcps_ecn_shs);
536 			TCPSTAT_INC(tcps_ace_nect);
537 			break;
538 		case SCF_ACE_0:
539 			thflags |= (TH_AE | 0 | 0);
540 			TCPSTAT_INC(tcps_ecn_shs);
541 			TCPSTAT_INC(tcps_ace_ect0);
542 			break;
543 		case SCF_ACE_1:
544 			thflags |= (0 | TH_ECE | TH_CWR);
545 			TCPSTAT_INC(tcps_ecn_shs);
546 			TCPSTAT_INC(tcps_ace_ect1);
547 			break;
548 		case SCF_ACE_CE:
549 			thflags |= (TH_AE | TH_CWR | 0);
550 			TCPSTAT_INC(tcps_ecn_shs);
551 			TCPSTAT_INC(tcps_ace_ce);
552 			break;
553 		/* undefined SCF codepoint */
554 		default:
555 			break;
556 		}
557 	}
558 	return thflags;
559 }
560 
561 int
562 tcp_ecn_get_ace(uint16_t thflags)
563 {
564 	int ace = 0;
565 
566 	if (thflags & TH_ECE)
567 		ace += 1;
568 	if (thflags & TH_CWR)
569 		ace += 2;
570 	if (thflags & TH_AE)
571 		ace += 4;
572 	return ace;
573 }
574