xref: /freebsd/sys/netinet/tcp_ecn.c (revision b197d4b893974c9eb4d7b38704c6d5c486235d6f)
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
5  *      The Regents of the University of California.  All rights reserved.
6  * Copyright (c) 2007-2008,2010
7  *      Swinburne University of Technology, Melbourne, Australia.
8  * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
9  * Copyright (c) 2010 The FreeBSD Foundation
10  * Copyright (c) 2010-2011 Juniper Networks, Inc.
11  * Copyright (c) 2019 Richard Scheffenegger <srichard@netapp.com>
12  * All rights reserved.
13  *
14  * Portions of this software were developed at the Centre for Advanced Internet
15  * Architectures, Swinburne University of Technology, by Lawrence Stewart,
16  * James Healy and David Hayes, made possible in part by a grant from the Cisco
17  * University Research Program Fund at Community Foundation Silicon Valley.
18  *
19  * Portions of this software were developed at the Centre for Advanced
20  * Internet Architectures, Swinburne University of Technology, Melbourne,
21  * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
22  *
23  * Portions of this software were developed by Robert N. M. Watson under
24  * contract to Juniper Networks, Inc.
25  *
26  * Redistribution and use in source and binary forms, with or without
27  * modification, are permitted provided that the following conditions
28  * are met:
29  * 1. Redistributions of source code must retain the above copyright
30  *    notice, this list of conditions and the following disclaimer.
31  * 2. Redistributions in binary form must reproduce the above copyright
32  *    notice, this list of conditions and the following disclaimer in the
33  *    documentation and/or other materials provided with the distribution.
34  * 3. Neither the name of the University nor the names of its contributors
35  *    may be used to endorse or promote products derived from this software
36  *    without specific prior written permission.
37  *
38  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
39  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
40  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
41  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
42  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
43  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
44  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
45  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
46  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
47  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
48  * SUCH DAMAGE.
49  *
50  *      @(#)tcp_ecn.c 8.12 (Berkeley) 5/24/95
51  */
52 
53 /*
54  * Utility functions to deal with Explicit Congestion Notification in TCP
55  * implementing the essential parts of the Accurate ECN extension
56  * https://tools.ietf.org/html/draft-ietf-tcpm-accurate-ecn-09
57  */
58 
59 #include <sys/cdefs.h>
60 __FBSDID("$FreeBSD$");
61 
62 #include "opt_inet.h"
63 #include "opt_inet6.h"
64 #include "opt_tcpdebug.h"
65 
66 #include <sys/param.h>
67 #include <sys/systm.h>
68 #include <sys/kernel.h>
69 #include <sys/sysctl.h>
70 #include <sys/malloc.h>
71 #include <sys/mbuf.h>
72 #include <sys/socket.h>
73 #include <sys/socketvar.h>
74 
75 #include <machine/cpu.h>
76 
77 #include <vm/uma.h>
78 
79 #include <net/if.h>
80 #include <net/if_var.h>
81 #include <net/route.h>
82 #include <net/vnet.h>
83 
84 #include <netinet/in.h>
85 #include <netinet/in_systm.h>
86 #include <netinet/ip.h>
87 #include <netinet/in_var.h>
88 #include <netinet/in_pcb.h>
89 #include <netinet/ip_var.h>
90 #include <netinet/ip6.h>
91 #include <netinet/icmp6.h>
92 #include <netinet6/nd6.h>
93 #include <netinet6/ip6_var.h>
94 #include <netinet6/in6_pcb.h>
95 #include <netinet/tcp.h>
96 #include <netinet/tcp_fsm.h>
97 #include <netinet/tcp_seq.h>
98 #include <netinet/tcp_var.h>
99 #include <netinet/tcp_syncache.h>
100 #include <netinet/tcp_timer.h>
101 #include <netinet6/tcp6_var.h>
102 #include <netinet/tcpip.h>
103 #include <netinet/tcp_ecn.h>
104 
105 
106 /*
107  * Process incoming SYN,ACK packet
108  */
109 void
110 tcp_ecn_input_syn_sent(struct tcpcb *tp, uint16_t thflags, int iptos)
111 {
112 
113 	if (V_tcp_do_ecn == 0)
114 		return;
115 	if ((V_tcp_do_ecn == 1) ||
116 	    (V_tcp_do_ecn == 2)) {
117 		/* RFC3168 ECN handling */
118 		if ((thflags & (TH_CWR | TH_ECE)) == (0 | TH_ECE)) {
119 			tp->t_flags2 |= TF2_ECN_PERMIT;
120 			TCPSTAT_INC(tcps_ecn_shs);
121 		}
122 	} else
123 	/* decoding Accurate ECN according to table in section 3.1.1 */
124 	if ((V_tcp_do_ecn == 3) ||
125 	    (V_tcp_do_ecn == 4)) {
126 		/*
127 		 * on the SYN,ACK, process the AccECN
128 		 * flags indicating the state the SYN
129 		 * was delivered.
130 		 * Reactions to Path ECN mangling can
131 		 * come here.
132 		 */
133 		switch (thflags & (TH_AE | TH_CWR | TH_ECE)) {
134 		/* RFC3168 SYN */
135 		case (0|0|TH_ECE):
136 			tp->t_flags2 |= TF2_ECN_PERMIT;
137 			TCPSTAT_INC(tcps_ecn_shs);
138 			break;
139 		/* non-ECT SYN */
140 		case (0|TH_CWR|0):
141 			tp->t_flags2 |= TF2_ACE_PERMIT;
142 			tp->t_scep = 5;
143 			TCPSTAT_INC(tcps_ecn_shs);
144 			TCPSTAT_INC(tcps_ace_nect);
145 			break;
146 		/* ECT0 SYN */
147 		case (TH_AE|0|0):
148 			tp->t_flags2 |= TF2_ACE_PERMIT;
149 			tp->t_scep = 5;
150 			TCPSTAT_INC(tcps_ecn_shs);
151 			TCPSTAT_INC(tcps_ace_ect0);
152 			break;
153 		/* ECT1 SYN */
154 		case (0|TH_CWR|TH_ECE):
155 			tp->t_flags2 |= TF2_ACE_PERMIT;
156 			tp->t_scep = 5;
157 			TCPSTAT_INC(tcps_ecn_shs);
158 			TCPSTAT_INC(tcps_ace_ect1);
159 			break;
160 		/* CE SYN */
161 		case (TH_AE|TH_CWR|0):
162 			tp->t_flags2 |= TF2_ACE_PERMIT;
163 			tp->t_scep = 6;
164 			/*
165 			 * reduce the IW to 2 MSS (to
166 			 * account for delayed acks) if
167 			 * the SYN,ACK was CE marked
168 			 */
169 			tp->snd_cwnd = 2 * tcp_maxseg(tp);
170 			TCPSTAT_INC(tcps_ecn_shs);
171 			TCPSTAT_INC(tcps_ace_nect);
172 			break;
173 		default:
174 			break;
175 		}
176 		/*
177 		 * Set the AccECN Codepoints on
178 		 * the outgoing <ACK> to the ECN
179 		 * state of the <SYN,ACK>
180 		 * according to table 3 in the
181 		 * AccECN draft
182 		 */
183 		switch (iptos & IPTOS_ECN_MASK) {
184 		case (IPTOS_ECN_NOTECT):
185 			tp->t_rcep = 0b010;
186 			break;
187 		case (IPTOS_ECN_ECT0):
188 			tp->t_rcep = 0b100;
189 			break;
190 		case (IPTOS_ECN_ECT1):
191 			tp->t_rcep = 0b011;
192 			break;
193 		case (IPTOS_ECN_CE):
194 			tp->t_rcep = 0b110;
195 			break;
196 		}
197 	}
198 }
199 
200 /*
201  * Handle parallel SYN for ECN
202  */
203 void
204 tcp_ecn_input_parallel_syn(struct tcpcb *tp, uint16_t thflags, int iptos)
205 {
206 	if (thflags & TH_ACK)
207 		return;
208 	if (V_tcp_do_ecn == 0)
209 		return;
210 	if ((V_tcp_do_ecn == 1) ||
211 	    (V_tcp_do_ecn == 2)) {
212 		/* RFC3168 ECN handling */
213 		if ((thflags & (TH_CWR | TH_ECE)) == (TH_CWR | TH_ECE)) {
214 			tp->t_flags2 |= TF2_ECN_PERMIT;
215 			tp->t_flags2 |= TF2_ECN_SND_ECE;
216 			TCPSTAT_INC(tcps_ecn_shs);
217 		}
218 	} else
219 	if ((V_tcp_do_ecn == 3) ||
220 	    (V_tcp_do_ecn == 4)) {
221 		/* AccECN handling */
222 		switch (thflags & (TH_AE | TH_CWR | TH_ECE)) {
223 		default:
224 		case (0|0|0):
225 			break;
226 		case (0|TH_CWR|TH_ECE):
227 			tp->t_flags2 |= TF2_ECN_PERMIT;
228 			tp->t_flags2 |= TF2_ECN_SND_ECE;
229 			TCPSTAT_INC(tcps_ecn_shs);
230 			break;
231 		case (TH_AE|TH_CWR|TH_ECE):
232 			tp->t_flags2 |= TF2_ACE_PERMIT;
233 			TCPSTAT_INC(tcps_ecn_shs);
234 			/*
235 			 * Set the AccECN Codepoints on
236 			 * the outgoing <ACK> to the ECN
237 			 * state of the <SYN,ACK>
238 			 * according to table 3 in the
239 			 * AccECN draft
240 			 */
241 			switch (iptos & IPTOS_ECN_MASK) {
242 			case (IPTOS_ECN_NOTECT):
243 				tp->t_rcep = 0b010;
244 				break;
245 			case (IPTOS_ECN_ECT0):
246 				tp->t_rcep = 0b100;
247 				break;
248 			case (IPTOS_ECN_ECT1):
249 				tp->t_rcep = 0b011;
250 				break;
251 			case (IPTOS_ECN_CE):
252 				tp->t_rcep = 0b110;
253 				break;
254 			}
255 			break;
256 		}
257 	}
258 }
259 
260 /*
261  * TCP ECN processing.
262  */
263 int
264 tcp_ecn_input_segment(struct tcpcb *tp, uint16_t thflags, int iptos)
265 {
266 	int delta_ace = 0;
267 
268 	if (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT)) {
269 		switch (iptos & IPTOS_ECN_MASK) {
270 		case IPTOS_ECN_CE:
271 			TCPSTAT_INC(tcps_ecn_ce);
272 			break;
273 		case IPTOS_ECN_ECT0:
274 			TCPSTAT_INC(tcps_ecn_ect0);
275 			break;
276 		case IPTOS_ECN_ECT1:
277 			TCPSTAT_INC(tcps_ecn_ect1);
278 			break;
279 		}
280 
281 		if (tp->t_flags2 & TF2_ACE_PERMIT) {
282 			if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
283 				tp->t_rcep += 1;
284 			if (tp->t_flags2 & TF2_ECN_PERMIT) {
285 				delta_ace = (tcp_ecn_get_ace(thflags) + 8 -
286 					    (tp->t_scep & 0x07)) & 0x07;
287 				tp->t_scep += delta_ace;
288 			} else {
289 				/*
290 				 * process the final ACK of the 3WHS
291 				 * see table 3 in draft-ietf-tcpm-accurate-ecn
292 				 */
293 				switch (tcp_ecn_get_ace(thflags)) {
294 				case 0b010:
295 					/* nonECT SYN or SYN,ACK */
296 					/* Fallthrough */
297 				case 0b011:
298 					/* ECT1 SYN or SYN,ACK */
299 					/* Fallthrough */
300 				case 0b100:
301 					/* ECT0 SYN or SYN,ACK */
302 					tp->t_scep = 5;
303 					break;
304 				case 0b110:
305 					/* CE SYN or SYN,ACK */
306 					tp->t_scep = 6;
307 					tp->snd_cwnd = 2 * tcp_maxseg(tp);
308 					break;
309 				default:
310 					/* mangled AccECN handshake */
311 					tp->t_scep = 5;
312 					break;
313 				}
314 				tp->t_flags2 |= TF2_ECN_PERMIT;
315 			}
316 		} else {
317 			/* RFC3168 ECN handling */
318 			if (thflags & TH_ECE)
319 				delta_ace = 1;
320 			if (thflags & TH_CWR) {
321 				tp->t_flags2 &= ~TF2_ECN_SND_ECE;
322 				tp->t_flags |= TF_ACKNOW;
323 			}
324 			if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
325 				tp->t_flags2 |= TF2_ECN_SND_ECE;
326 		}
327 
328 		/* Process a packet differently from RFC3168. */
329 		cc_ecnpkt_handler_flags(tp, thflags, iptos);
330 	}
331 
332 	return delta_ace;
333 }
334 
335 /*
336  * Send ECN setup <SYN> packet header flags
337  */
338 uint16_t
339 tcp_ecn_output_syn_sent(struct tcpcb *tp)
340 {
341 	uint16_t thflags = 0;
342 
343 	if (V_tcp_do_ecn == 0)
344 		return thflags;
345 	if (V_tcp_do_ecn == 1) {
346 		/* Send a RFC3168 ECN setup <SYN> packet */
347 		if (tp->t_rxtshift >= 1) {
348 			if (tp->t_rxtshift <= V_tcp_ecn_maxretries)
349 				thflags = TH_ECE|TH_CWR;
350 		} else
351 			thflags = TH_ECE|TH_CWR;
352 	} else
353 	if (V_tcp_do_ecn == 3) {
354 		/* Send an Accurate ECN setup <SYN> packet */
355 		if (tp->t_rxtshift >= 1) {
356 			if (tp->t_rxtshift <= V_tcp_ecn_maxretries)
357 				thflags = TH_ECE|TH_CWR|TH_AE;
358 		} else
359 			thflags = TH_ECE|TH_CWR|TH_AE;
360 	}
361 
362 	return thflags;
363 }
364 
365 /*
366  * output processing of ECN feature
367  * returning IP ECN header codepoint
368  */
369 int
370 tcp_ecn_output_established(struct tcpcb *tp, uint16_t *thflags, int len, bool rxmit)
371 {
372 	int ipecn = IPTOS_ECN_NOTECT;
373 	bool newdata;
374 
375 	/*
376 	 * If the peer has ECN, mark data packets with
377 	 * ECN capable transmission (ECT).
378 	 * Ignore pure control packets, retransmissions
379 	 * and window probes.
380 	 */
381 	newdata = (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) &&
382 		    !rxmit &&
383 		    !((tp->t_flags & TF_FORCEDATA) && len == 1));
384 	/* RFC3168 ECN marking, only new data segments */
385 	if (newdata) {
386 		ipecn = IPTOS_ECN_ECT0;
387 		TCPSTAT_INC(tcps_ecn_ect0);
388 	}
389 	/*
390 	 * Reply with proper ECN notifications.
391 	 */
392 	if (tp->t_flags2 & TF2_ACE_PERMIT) {
393 		*thflags &= ~(TH_AE|TH_CWR|TH_ECE);
394 		if (tp->t_rcep & 0x01)
395 			*thflags |= TH_ECE;
396 		if (tp->t_rcep & 0x02)
397 			*thflags |= TH_CWR;
398 		if (tp->t_rcep & 0x04)
399 			*thflags |= TH_AE;
400 		if (!(tp->t_flags2 & TF2_ECN_PERMIT)) {
401 			/*
402 			 * here we process the final
403 			 * ACK of the 3WHS
404 			 */
405 			if (tp->t_rcep == 0b110) {
406 				tp->t_rcep = 6;
407 			} else {
408 				tp->t_rcep = 5;
409 			}
410 			tp->t_flags2 |= TF2_ECN_PERMIT;
411 		}
412 	} else {
413 		if (newdata &&
414 		    (tp->t_flags2 & TF2_ECN_SND_CWR)) {
415 			*thflags |= TH_CWR;
416 			tp->t_flags2 &= ~TF2_ECN_SND_CWR;
417 		}
418 		if (tp->t_flags2 & TF2_ECN_SND_ECE)
419 			*thflags |= TH_ECE;
420 	}
421 
422 	return ipecn;
423 }
424 
425 /*
426  * Set up the ECN related tcpcb fields from
427  * a syncache entry
428  */
429 void
430 tcp_ecn_syncache_socket(struct tcpcb *tp, struct syncache *sc)
431 {
432 	if (sc->sc_flags & SCF_ECN_MASK) {
433 		switch (sc->sc_flags & SCF_ECN_MASK) {
434 		case SCF_ECN:
435 			tp->t_flags2 |= TF2_ECN_PERMIT;
436 			break;
437 		case SCF_ACE_N:
438 			/* Fallthrough */
439 		case SCF_ACE_0:
440 			/* Fallthrough */
441 		case SCF_ACE_1:
442 			tp->t_flags2 |= TF2_ACE_PERMIT;
443 			tp->t_scep = 5;
444 			tp->t_rcep = 5;
445 			break;
446 		case SCF_ACE_CE:
447 			tp->t_flags2 |= TF2_ACE_PERMIT;
448 			tp->t_scep = 6;
449 			tp->t_rcep = 6;
450 			break;
451 		/* undefined SCF codepoint */
452 		default:
453 			break;
454 		}
455 	}
456 }
457 
458 /*
459  * Process a <SYN> packets ECN information, and provide the
460  * syncache with the relevant information.
461  */
462 int
463 tcp_ecn_syncache_add(uint16_t thflags, int iptos)
464 {
465 	int scflags = 0;
466 
467 	switch (thflags & (TH_AE|TH_CWR|TH_ECE)) {
468 	/* no ECN */
469 	case (0|0|0):
470 		break;
471 	/* legacy ECN */
472 	case (0|TH_CWR|TH_ECE):
473 		scflags = SCF_ECN;
474 		break;
475 	/* Accurate ECN */
476 	case (TH_AE|TH_CWR|TH_ECE):
477 		if ((V_tcp_do_ecn == 3) ||
478 		    (V_tcp_do_ecn == 4)) {
479 			switch (iptos & IPTOS_ECN_MASK) {
480 			case IPTOS_ECN_CE:
481 				scflags = SCF_ACE_CE;
482 				break;
483 			case IPTOS_ECN_ECT0:
484 				scflags = SCF_ACE_0;
485 				break;
486 			case IPTOS_ECN_ECT1:
487 				scflags = SCF_ACE_1;
488 				break;
489 			case IPTOS_ECN_NOTECT:
490 				scflags = SCF_ACE_N;
491 				break;
492 			}
493 		} else
494 			scflags = SCF_ECN;
495 		break;
496 	/* Default Case (section 3.1.2) */
497 	default:
498 		if ((V_tcp_do_ecn == 3) ||
499 		    (V_tcp_do_ecn == 4)) {
500 			switch (iptos & IPTOS_ECN_MASK) {
501 			case IPTOS_ECN_CE:
502 				scflags = SCF_ACE_CE;
503 				break;
504 			case IPTOS_ECN_ECT0:
505 				scflags = SCF_ACE_0;
506 				break;
507 			case IPTOS_ECN_ECT1:
508 				scflags = SCF_ACE_1;
509 				break;
510 			case IPTOS_ECN_NOTECT:
511 				scflags = SCF_ACE_N;
512 				break;
513 			}
514 		}
515 		break;
516 	}
517 	return scflags;
518 }
519 
520 /*
521  * Set up the ECN information for the <SYN,ACK> from
522  * syncache information.
523  */
524 uint16_t
525 tcp_ecn_syncache_respond(uint16_t thflags, struct syncache *sc)
526 {
527 	if ((thflags & TH_SYN) &&
528 	    (sc->sc_flags & SCF_ECN_MASK)) {
529 		switch (sc->sc_flags & SCF_ECN_MASK) {
530 		case SCF_ECN:
531 			thflags |= (0 | 0 | TH_ECE);
532 			TCPSTAT_INC(tcps_ecn_shs);
533 			break;
534 		case SCF_ACE_N:
535 			thflags |= (0 | TH_CWR | 0);
536 			TCPSTAT_INC(tcps_ecn_shs);
537 			TCPSTAT_INC(tcps_ace_nect);
538 			break;
539 		case SCF_ACE_0:
540 			thflags |= (TH_AE | 0 | 0);
541 			TCPSTAT_INC(tcps_ecn_shs);
542 			TCPSTAT_INC(tcps_ace_ect0);
543 			break;
544 		case SCF_ACE_1:
545 			thflags |= (0 | TH_ECE | TH_CWR);
546 			TCPSTAT_INC(tcps_ecn_shs);
547 			TCPSTAT_INC(tcps_ace_ect1);
548 			break;
549 		case SCF_ACE_CE:
550 			thflags |= (TH_AE | TH_CWR | 0);
551 			TCPSTAT_INC(tcps_ecn_shs);
552 			TCPSTAT_INC(tcps_ace_ce);
553 			break;
554 		/* undefined SCF codepoint */
555 		default:
556 			break;
557 		}
558 	}
559 	return thflags;
560 }
561 
562 int
563 tcp_ecn_get_ace(uint16_t thflags)
564 {
565 	int ace = 0;
566 
567 	if (thflags & TH_ECE)
568 		ace += 1;
569 	if (thflags & TH_CWR)
570 		ace += 2;
571 	if (thflags & TH_AE)
572 		ace += 4;
573 	return ace;
574 }
575