xref: /freebsd/sys/netinet/tcp_ecn.c (revision 6fe0a6c80a1aff14236924eb33e4013aa8c14f91)
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
5  *      The Regents of the University of California.  All rights reserved.
6  * Copyright (c) 2007-2008,2010
7  *      Swinburne University of Technology, Melbourne, Australia.
8  * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
9  * Copyright (c) 2010 The FreeBSD Foundation
10  * Copyright (c) 2010-2011 Juniper Networks, Inc.
11  * Copyright (c) 2019 Richard Scheffenegger <srichard@netapp.com>
12  * All rights reserved.
13  *
14  * Portions of this software were developed at the Centre for Advanced Internet
15  * Architectures, Swinburne University of Technology, by Lawrence Stewart,
16  * James Healy and David Hayes, made possible in part by a grant from the Cisco
17  * University Research Program Fund at Community Foundation Silicon Valley.
18  *
19  * Portions of this software were developed at the Centre for Advanced
20  * Internet Architectures, Swinburne University of Technology, Melbourne,
21  * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
22  *
23  * Portions of this software were developed by Robert N. M. Watson under
24  * contract to Juniper Networks, Inc.
25  *
26  * Redistribution and use in source and binary forms, with or without
27  * modification, are permitted provided that the following conditions
28  * are met:
29  * 1. Redistributions of source code must retain the above copyright
30  *    notice, this list of conditions and the following disclaimer.
31  * 2. Redistributions in binary form must reproduce the above copyright
32  *    notice, this list of conditions and the following disclaimer in the
33  *    documentation and/or other materials provided with the distribution.
34  * 3. Neither the name of the University nor the names of its contributors
35  *    may be used to endorse or promote products derived from this software
36  *    without specific prior written permission.
37  *
38  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
39  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
40  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
41  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
42  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
43  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
44  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
45  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
46  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
47  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
48  * SUCH DAMAGE.
49  *
50  *      @(#)tcp_ecn.c 8.12 (Berkeley) 5/24/95
51  */
52 
53 /*
54  * Utility functions to deal with Explicit Congestion Notification in TCP
55  * implementing the essential parts of the Accurate ECN extension
56  * https://tools.ietf.org/html/draft-ietf-tcpm-accurate-ecn-09
57  */
58 
59 #include <sys/cdefs.h>
60 __FBSDID("$FreeBSD$");
61 
62 #include "opt_inet.h"
63 #include "opt_inet6.h"
64 #include "opt_tcpdebug.h"
65 
66 #include <sys/param.h>
67 #include <sys/systm.h>
68 #include <sys/kernel.h>
69 #include <sys/sysctl.h>
70 #include <sys/malloc.h>
71 #include <sys/mbuf.h>
72 #include <sys/socket.h>
73 #include <sys/socketvar.h>
74 
75 #include <machine/cpu.h>
76 
77 #include <vm/uma.h>
78 
79 #include <net/if.h>
80 #include <net/if_var.h>
81 #include <net/route.h>
82 #include <net/vnet.h>
83 
84 #include <netinet/in.h>
85 #include <netinet/in_systm.h>
86 #include <netinet/ip.h>
87 #include <netinet/in_var.h>
88 #include <netinet/in_pcb.h>
89 #include <netinet/ip_var.h>
90 #include <netinet/ip6.h>
91 #include <netinet/icmp6.h>
92 #include <netinet6/nd6.h>
93 #include <netinet6/ip6_var.h>
94 #include <netinet6/in6_pcb.h>
95 #include <netinet/tcp.h>
96 #include <netinet/tcp_fsm.h>
97 #include <netinet/tcp_seq.h>
98 #include <netinet/tcp_var.h>
99 #include <netinet/tcp_syncache.h>
100 #include <netinet/tcp_timer.h>
101 #include <netinet/tcpip.h>
102 #include <netinet/tcp_ecn.h>
103 
104 
105 /*
106  * Process incoming SYN,ACK packet
107  */
108 void
109 tcp_ecn_input_syn_sent(struct tcpcb *tp, uint16_t thflags, int iptos)
110 {
111 
112 	if (V_tcp_do_ecn == 0)
113 		return;
114 	if ((V_tcp_do_ecn == 1) ||
115 	    (V_tcp_do_ecn == 2)) {
116 		/* RFC3168 ECN handling */
117 		if ((thflags & (TH_CWR | TH_ECE)) == (0 | TH_ECE)) {
118 			tp->t_flags2 |= TF2_ECN_PERMIT;
119 			tp->t_flags2 &= ~TF2_ACE_PERMIT;
120 			TCPSTAT_INC(tcps_ecn_shs);
121 		}
122 	} else
123 	/* decoding Accurate ECN according to table in section 3.1.1 */
124 	if ((V_tcp_do_ecn == 3) ||
125 	    (V_tcp_do_ecn == 4)) {
126 		/*
127 		 * on the SYN,ACK, process the AccECN
128 		 * flags indicating the state the SYN
129 		 * was delivered.
130 		 * Reactions to Path ECN mangling can
131 		 * come here.
132 		 */
133 		switch (thflags & (TH_AE | TH_CWR | TH_ECE)) {
134 		/* RFC3168 SYN */
135 		case (0|0|TH_ECE):
136 			tp->t_flags2 |= TF2_ECN_PERMIT;
137 			tp->t_flags2 &= ~TF2_ACE_PERMIT;
138 			TCPSTAT_INC(tcps_ecn_shs);
139 			break;
140 		/* non-ECT SYN */
141 		case (0|TH_CWR|0):
142 			tp->t_flags2 |= TF2_ACE_PERMIT;
143 			tp->t_flags2 &= ~TF2_ECN_PERMIT;
144 			tp->t_scep = 5;
145 			TCPSTAT_INC(tcps_ecn_shs);
146 			TCPSTAT_INC(tcps_ace_nect);
147 			break;
148 		/* ECT0 SYN */
149 		case (TH_AE|0|0):
150 			tp->t_flags2 |= TF2_ACE_PERMIT;
151 			tp->t_flags2 &= ~TF2_ECN_PERMIT;
152 			tp->t_scep = 5;
153 			TCPSTAT_INC(tcps_ecn_shs);
154 			TCPSTAT_INC(tcps_ace_ect0);
155 			break;
156 		/* ECT1 SYN */
157 		case (0|TH_CWR|TH_ECE):
158 			tp->t_flags2 |= TF2_ACE_PERMIT;
159 			tp->t_flags2 &= ~TF2_ECN_PERMIT;
160 			tp->t_scep = 5;
161 			TCPSTAT_INC(tcps_ecn_shs);
162 			TCPSTAT_INC(tcps_ace_ect1);
163 			break;
164 		/* CE SYN */
165 		case (TH_AE|TH_CWR|0):
166 			tp->t_flags2 |= TF2_ACE_PERMIT;
167 			tp->t_flags2 &= ~TF2_ECN_PERMIT;
168 			tp->t_scep = 6;
169 			/*
170 			 * reduce the IW to 2 MSS (to
171 			 * account for delayed acks) if
172 			 * the SYN,ACK was CE marked
173 			 */
174 			tp->snd_cwnd = 2 * tcp_maxseg(tp);
175 			TCPSTAT_INC(tcps_ecn_shs);
176 			TCPSTAT_INC(tcps_ace_nect);
177 			break;
178 		default:
179 			tp->t_flags2 &= ~(TF2_ECN_PERMIT | TF2_ACE_PERMIT);
180 			break;
181 		}
182 		/*
183 		 * Set the AccECN Codepoints on
184 		 * the outgoing <ACK> to the ECN
185 		 * state of the <SYN,ACK>
186 		 * according to table 3 in the
187 		 * AccECN draft
188 		 */
189 		switch (iptos & IPTOS_ECN_MASK) {
190 		case (IPTOS_ECN_NOTECT):
191 			tp->t_rcep = 0b010;
192 			break;
193 		case (IPTOS_ECN_ECT0):
194 			tp->t_rcep = 0b100;
195 			break;
196 		case (IPTOS_ECN_ECT1):
197 			tp->t_rcep = 0b011;
198 			break;
199 		case (IPTOS_ECN_CE):
200 			tp->t_rcep = 0b110;
201 			break;
202 		}
203 	}
204 }
205 
206 /*
207  * Handle parallel SYN for ECN
208  */
209 void
210 tcp_ecn_input_parallel_syn(struct tcpcb *tp, uint16_t thflags, int iptos)
211 {
212 	if (thflags & TH_ACK)
213 		return;
214 	if (V_tcp_do_ecn == 0)
215 		return;
216 	if ((V_tcp_do_ecn == 1) ||
217 	    (V_tcp_do_ecn == 2)) {
218 		/* RFC3168 ECN handling */
219 		if ((thflags & (TH_CWR | TH_ECE)) == (TH_CWR | TH_ECE)) {
220 			tp->t_flags2 |= TF2_ECN_PERMIT;
221 			tp->t_flags2 &= ~TF2_ACE_PERMIT;
222 			tp->t_flags2 |= TF2_ECN_SND_ECE;
223 			TCPSTAT_INC(tcps_ecn_shs);
224 		}
225 	} else
226 	if ((V_tcp_do_ecn == 3) ||
227 	    (V_tcp_do_ecn == 4)) {
228 		/* AccECN handling */
229 		switch (thflags & (TH_AE | TH_CWR | TH_ECE)) {
230 		default:
231 		case (0|0|0):
232 			tp->t_flags2 &= ~(TF2_ECN_PERMIT | TF2_ACE_PERMIT);
233 			break;
234 		case (0|TH_CWR|TH_ECE):
235 			tp->t_flags2 |= TF2_ECN_PERMIT;
236 			tp->t_flags2 &= ~TF2_ACE_PERMIT;
237 			tp->t_flags2 |= TF2_ECN_SND_ECE;
238 			TCPSTAT_INC(tcps_ecn_shs);
239 			break;
240 		case (TH_AE|TH_CWR|TH_ECE):
241 			tp->t_flags2 |= TF2_ACE_PERMIT;
242 			tp->t_flags2 &= ~TF2_ECN_PERMIT;
243 			TCPSTAT_INC(tcps_ecn_shs);
244 			/*
245 			 * Set the AccECN Codepoints on
246 			 * the outgoing <ACK> to the ECN
247 			 * state of the <SYN,ACK>
248 			 * according to table 3 in the
249 			 * AccECN draft
250 			 */
251 			switch (iptos & IPTOS_ECN_MASK) {
252 			case (IPTOS_ECN_NOTECT):
253 				tp->t_rcep = 0b010;
254 				break;
255 			case (IPTOS_ECN_ECT0):
256 				tp->t_rcep = 0b100;
257 				break;
258 			case (IPTOS_ECN_ECT1):
259 				tp->t_rcep = 0b011;
260 				break;
261 			case (IPTOS_ECN_CE):
262 				tp->t_rcep = 0b110;
263 				break;
264 			}
265 			break;
266 		}
267 	}
268 }
269 
270 /*
271  * TCP ECN processing.
272  */
273 int
274 tcp_ecn_input_segment(struct tcpcb *tp, uint16_t thflags, int iptos)
275 {
276 	int delta_ace = 0;
277 
278 	if (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT)) {
279 		switch (iptos & IPTOS_ECN_MASK) {
280 		case IPTOS_ECN_CE:
281 			TCPSTAT_INC(tcps_ecn_ce);
282 			break;
283 		case IPTOS_ECN_ECT0:
284 			TCPSTAT_INC(tcps_ecn_ect0);
285 			break;
286 		case IPTOS_ECN_ECT1:
287 			TCPSTAT_INC(tcps_ecn_ect1);
288 			break;
289 		}
290 
291 		if (tp->t_flags2 & TF2_ACE_PERMIT) {
292 			if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
293 				tp->t_rcep += 1;
294 			if (tp->t_flags2 & TF2_ECN_PERMIT) {
295 				delta_ace = (tcp_ecn_get_ace(thflags) + 8 -
296 					    (tp->t_scep & 0x07)) & 0x07;
297 				tp->t_scep += delta_ace;
298 			} else {
299 				/*
300 				 * process the final ACK of the 3WHS
301 				 * see table 3 in draft-ietf-tcpm-accurate-ecn
302 				 */
303 				switch (tcp_ecn_get_ace(thflags)) {
304 				case 0b010:
305 					/* nonECT SYN or SYN,ACK */
306 					/* Fallthrough */
307 				case 0b011:
308 					/* ECT1 SYN or SYN,ACK */
309 					/* Fallthrough */
310 				case 0b100:
311 					/* ECT0 SYN or SYN,ACK */
312 					tp->t_scep = 5;
313 					break;
314 				case 0b110:
315 					/* CE SYN or SYN,ACK */
316 					tp->t_scep = 6;
317 					tp->snd_cwnd = 2 * tcp_maxseg(tp);
318 					break;
319 				default:
320 					/* mangled AccECN handshake */
321 					tp->t_scep = 5;
322 					break;
323 				}
324 				tp->t_flags2 |= TF2_ECN_PERMIT;
325 			}
326 		} else {
327 			/* RFC3168 ECN handling */
328 			if ((thflags & (TH_SYN | TH_ECE)) == TH_ECE)
329 				delta_ace = 1;
330 			if (thflags & TH_CWR) {
331 				tp->t_flags2 &= ~TF2_ECN_SND_ECE;
332 				tp->t_flags |= TF_ACKNOW;
333 			}
334 			if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
335 				tp->t_flags2 |= TF2_ECN_SND_ECE;
336 		}
337 
338 		/* Process a packet differently from RFC3168. */
339 		cc_ecnpkt_handler_flags(tp, thflags, iptos);
340 	}
341 
342 	return delta_ace;
343 }
344 
345 /*
346  * Send ECN setup <SYN> packet header flags
347  */
348 uint16_t
349 tcp_ecn_output_syn_sent(struct tcpcb *tp)
350 {
351 	uint16_t thflags = 0;
352 
353 	if (V_tcp_do_ecn == 0)
354 		return thflags;
355 	if (V_tcp_do_ecn == 1) {
356 		/* Send a RFC3168 ECN setup <SYN> packet */
357 		if (tp->t_rxtshift >= 1) {
358 			if (tp->t_rxtshift <= V_tcp_ecn_maxretries)
359 				thflags = TH_ECE|TH_CWR;
360 		} else
361 			thflags = TH_ECE|TH_CWR;
362 	} else
363 	if (V_tcp_do_ecn == 3) {
364 		/* Send an Accurate ECN setup <SYN> packet */
365 		if (tp->t_rxtshift >= 1) {
366 			if (tp->t_rxtshift <= V_tcp_ecn_maxretries)
367 				thflags = TH_ECE|TH_CWR|TH_AE;
368 		} else
369 			thflags = TH_ECE|TH_CWR|TH_AE;
370 	}
371 
372 	return thflags;
373 }
374 
375 /*
376  * output processing of ECN feature
377  * returning IP ECN header codepoint
378  */
379 int
380 tcp_ecn_output_established(struct tcpcb *tp, uint16_t *thflags, int len, bool rxmit)
381 {
382 	int ipecn = IPTOS_ECN_NOTECT;
383 	bool newdata;
384 
385 	/*
386 	 * If the peer has ECN, mark data packets with
387 	 * ECN capable transmission (ECT).
388 	 * Ignore pure control packets, retransmissions
389 	 * and window probes.
390 	 */
391 	newdata = (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) &&
392 		    !rxmit &&
393 		    !((tp->t_flags & TF_FORCEDATA) && len == 1));
394 	/* RFC3168 ECN marking, only new data segments */
395 	if (newdata) {
396 		ipecn = IPTOS_ECN_ECT0;
397 		TCPSTAT_INC(tcps_ecn_ect0);
398 	}
399 	/*
400 	 * Reply with proper ECN notifications.
401 	 */
402 	if (tp->t_flags2 & TF2_ACE_PERMIT) {
403 		*thflags &= ~(TH_AE|TH_CWR|TH_ECE);
404 		if (tp->t_rcep & 0x01)
405 			*thflags |= TH_ECE;
406 		if (tp->t_rcep & 0x02)
407 			*thflags |= TH_CWR;
408 		if (tp->t_rcep & 0x04)
409 			*thflags |= TH_AE;
410 		if (!(tp->t_flags2 & TF2_ECN_PERMIT)) {
411 			/*
412 			 * here we process the final
413 			 * ACK of the 3WHS
414 			 */
415 			if (tp->t_rcep == 0b110) {
416 				tp->t_rcep = 6;
417 			} else {
418 				tp->t_rcep = 5;
419 			}
420 			tp->t_flags2 |= TF2_ECN_PERMIT;
421 		}
422 	} else {
423 		if (newdata &&
424 		    (tp->t_flags2 & TF2_ECN_SND_CWR)) {
425 			*thflags |= TH_CWR;
426 			tp->t_flags2 &= ~TF2_ECN_SND_CWR;
427 		}
428 		if (tp->t_flags2 & TF2_ECN_SND_ECE)
429 			*thflags |= TH_ECE;
430 	}
431 
432 	return ipecn;
433 }
434 
435 /*
436  * Set up the ECN related tcpcb fields from
437  * a syncache entry
438  */
439 void
440 tcp_ecn_syncache_socket(struct tcpcb *tp, struct syncache *sc)
441 {
442 	if (sc->sc_flags & SCF_ECN_MASK) {
443 		switch (sc->sc_flags & SCF_ECN_MASK) {
444 		case SCF_ECN:
445 			tp->t_flags2 |= TF2_ECN_PERMIT;
446 			break;
447 		case SCF_ACE_N:
448 			/* Fallthrough */
449 		case SCF_ACE_0:
450 			/* Fallthrough */
451 		case SCF_ACE_1:
452 			tp->t_flags2 |= TF2_ACE_PERMIT;
453 			tp->t_scep = 5;
454 			tp->t_rcep = 5;
455 			break;
456 		case SCF_ACE_CE:
457 			tp->t_flags2 |= TF2_ACE_PERMIT;
458 			tp->t_scep = 6;
459 			tp->t_rcep = 6;
460 			break;
461 		/* undefined SCF codepoint */
462 		default:
463 			break;
464 		}
465 	}
466 }
467 
468 /*
469  * Process a <SYN> packets ECN information, and provide the
470  * syncache with the relevant information.
471  */
472 int
473 tcp_ecn_syncache_add(uint16_t thflags, int iptos)
474 {
475 	int scflags = 0;
476 
477 	switch (thflags & (TH_AE|TH_CWR|TH_ECE)) {
478 	/* no ECN */
479 	case (0|0|0):
480 		break;
481 	/* legacy ECN */
482 	case (0|TH_CWR|TH_ECE):
483 		scflags = SCF_ECN;
484 		break;
485 	/* Accurate ECN */
486 	case (TH_AE|TH_CWR|TH_ECE):
487 		if ((V_tcp_do_ecn == 3) ||
488 		    (V_tcp_do_ecn == 4)) {
489 			switch (iptos & IPTOS_ECN_MASK) {
490 			case IPTOS_ECN_CE:
491 				scflags = SCF_ACE_CE;
492 				break;
493 			case IPTOS_ECN_ECT0:
494 				scflags = SCF_ACE_0;
495 				break;
496 			case IPTOS_ECN_ECT1:
497 				scflags = SCF_ACE_1;
498 				break;
499 			case IPTOS_ECN_NOTECT:
500 				scflags = SCF_ACE_N;
501 				break;
502 			}
503 		} else
504 			scflags = SCF_ECN;
505 		break;
506 	/* Default Case (section 3.1.2) */
507 	default:
508 		if ((V_tcp_do_ecn == 3) ||
509 		    (V_tcp_do_ecn == 4)) {
510 			switch (iptos & IPTOS_ECN_MASK) {
511 			case IPTOS_ECN_CE:
512 				scflags = SCF_ACE_CE;
513 				break;
514 			case IPTOS_ECN_ECT0:
515 				scflags = SCF_ACE_0;
516 				break;
517 			case IPTOS_ECN_ECT1:
518 				scflags = SCF_ACE_1;
519 				break;
520 			case IPTOS_ECN_NOTECT:
521 				scflags = SCF_ACE_N;
522 				break;
523 			}
524 		}
525 		break;
526 	}
527 	return scflags;
528 }
529 
530 /*
531  * Set up the ECN information for the <SYN,ACK> from
532  * syncache information.
533  */
534 uint16_t
535 tcp_ecn_syncache_respond(uint16_t thflags, struct syncache *sc)
536 {
537 	if ((thflags & TH_SYN) &&
538 	    (sc->sc_flags & SCF_ECN_MASK)) {
539 		switch (sc->sc_flags & SCF_ECN_MASK) {
540 		case SCF_ECN:
541 			thflags |= (0 | 0 | TH_ECE);
542 			TCPSTAT_INC(tcps_ecn_shs);
543 			break;
544 		case SCF_ACE_N:
545 			thflags |= (0 | TH_CWR | 0);
546 			TCPSTAT_INC(tcps_ecn_shs);
547 			TCPSTAT_INC(tcps_ace_nect);
548 			break;
549 		case SCF_ACE_0:
550 			thflags |= (TH_AE | 0 | 0);
551 			TCPSTAT_INC(tcps_ecn_shs);
552 			TCPSTAT_INC(tcps_ace_ect0);
553 			break;
554 		case SCF_ACE_1:
555 			thflags |= (0 | TH_ECE | TH_CWR);
556 			TCPSTAT_INC(tcps_ecn_shs);
557 			TCPSTAT_INC(tcps_ace_ect1);
558 			break;
559 		case SCF_ACE_CE:
560 			thflags |= (TH_AE | TH_CWR | 0);
561 			TCPSTAT_INC(tcps_ecn_shs);
562 			TCPSTAT_INC(tcps_ace_ce);
563 			break;
564 		/* undefined SCF codepoint */
565 		default:
566 			break;
567 		}
568 	}
569 	return thflags;
570 }
571 
572 int
573 tcp_ecn_get_ace(uint16_t thflags)
574 {
575 	int ace = 0;
576 
577 	if (thflags & TH_ECE)
578 		ace += 1;
579 	if (thflags & TH_CWR)
580 		ace += 2;
581 	if (thflags & TH_AE)
582 		ace += 4;
583 	return ace;
584 }
585