1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 5 * The Regents of the University of California. All rights reserved. 6 * Copyright (c) 2007-2008,2010 7 * Swinburne University of Technology, Melbourne, Australia. 8 * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org> 9 * Copyright (c) 2010 The FreeBSD Foundation 10 * Copyright (c) 2010-2011 Juniper Networks, Inc. 11 * Copyright (c) 2019 Richard Scheffenegger <srichard@netapp.com> 12 * All rights reserved. 13 * 14 * Portions of this software were developed at the Centre for Advanced Internet 15 * Architectures, Swinburne University of Technology, by Lawrence Stewart, 16 * James Healy and David Hayes, made possible in part by a grant from the Cisco 17 * University Research Program Fund at Community Foundation Silicon Valley. 18 * 19 * Portions of this software were developed at the Centre for Advanced 20 * Internet Architectures, Swinburne University of Technology, Melbourne, 21 * Australia by David Hayes under sponsorship from the FreeBSD Foundation. 22 * 23 * Portions of this software were developed by Robert N. M. Watson under 24 * contract to Juniper Networks, Inc. 25 * 26 * Redistribution and use in source and binary forms, with or without 27 * modification, are permitted provided that the following conditions 28 * are met: 29 * 1. Redistributions of source code must retain the above copyright 30 * notice, this list of conditions and the following disclaimer. 31 * 2. Redistributions in binary form must reproduce the above copyright 32 * notice, this list of conditions and the following disclaimer in the 33 * documentation and/or other materials provided with the distribution. 34 * 3. Neither the name of the University nor the names of its contributors 35 * may be used to endorse or promote products derived from this software 36 * without specific prior written permission. 
37 * 38 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 39 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 40 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 41 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 42 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 43 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 44 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 45 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 46 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 47 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 48 * SUCH DAMAGE. 49 * 50 * @(#)tcp_ecn.c 8.12 (Berkeley) 5/24/95 51 */ 52 53 /* 54 * Utility functions to deal with Explicit Congestion Notification in TCP 55 * implementing the essential parts of the Accurate ECN extension 56 * https://tools.ietf.org/html/draft-ietf-tcpm-accurate-ecn-09 57 */ 58 59 #include <sys/cdefs.h> 60 __FBSDID("$FreeBSD$"); 61 62 #include "opt_inet.h" 63 #include "opt_inet6.h" 64 #include "opt_tcpdebug.h" 65 66 #include <sys/param.h> 67 #include <sys/systm.h> 68 #include <sys/kernel.h> 69 #include <sys/sysctl.h> 70 #include <sys/malloc.h> 71 #include <sys/mbuf.h> 72 #include <sys/socket.h> 73 #include <sys/socketvar.h> 74 75 #include <machine/cpu.h> 76 77 #include <vm/uma.h> 78 79 #include <net/if.h> 80 #include <net/if_var.h> 81 #include <net/route.h> 82 #include <net/vnet.h> 83 84 #include <netinet/in.h> 85 #include <netinet/in_systm.h> 86 #include <netinet/ip.h> 87 #include <netinet/in_var.h> 88 #include <netinet/in_pcb.h> 89 #include <netinet/ip_var.h> 90 #include <netinet/ip6.h> 91 #include <netinet/icmp6.h> 92 #include <netinet6/nd6.h> 93 #include <netinet6/ip6_var.h> 94 #include <netinet6/in6_pcb.h> 95 #include <netinet/tcp.h> 96 #include <netinet/tcp_fsm.h> 97 
#include <netinet/tcp_seq.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_syncache.h>
#include <netinet/tcp_timer.h>
#include <netinet6/tcp6_var.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_ecn.h>


/*
 * Seed tp->t_rcep with the AccECN handshake encoding of the IP ECN
 * codepoint carried in iptos (table 3 in the AccECN draft):
 *
 *	not-ECT -> 0b010, ECT1 -> 0b011, ECT0 -> 0b100, CE -> 0b110
 *
 * t_rcep is left untouched if the masked value matches none of the
 * four codepoints.
 */
static void
tcp_ecn_handshake_rcep(struct tcpcb *tp, int iptos)
{

	switch (iptos & IPTOS_ECN_MASK) {
	case (IPTOS_ECN_NOTECT):
		tp->t_rcep = 0b010;
		break;
	case (IPTOS_ECN_ECT0):
		tp->t_rcep = 0b100;
		break;
	case (IPTOS_ECN_ECT1):
		tp->t_rcep = 0b011;
		break;
	case (IPTOS_ECN_CE):
		tp->t_rcep = 0b110;
		break;
	default:
		break;
	}
}

/*
 * Process incoming SYN,ACK packet
 *
 * tp		connection the SYN,ACK was received on
 * thflags	TCP header flags of the SYN,ACK
 * iptos	IP TOS byte of the SYN,ACK; only the ECN bits are used
 *
 * What is done depends on V_tcp_do_ecn:
 *	0	nothing.
 *	1, 2	RFC3168: ECE set with CWR clear sets TF2_ECN_PERMIT.
 *	3, 4	AccECN: the AE/CWR/ECE combination selects either
 *		TF2_ECN_PERMIT (peer answered with RFC3168 ECN) or
 *		TF2_ACE_PERMIT together with the initial t_scep; after
 *		that t_rcep is seeded from the IP ECN bits of the SYN,ACK.
 * Any other value leaves tp untouched.
 */
void
tcp_ecn_input_syn_sent(struct tcpcb *tp, uint16_t thflags, int iptos)
{

	if (V_tcp_do_ecn == 0)
		return;
	if ((V_tcp_do_ecn == 1) ||
	    (V_tcp_do_ecn == 2)) {
		/* RFC3168 ECN handling */
		if ((thflags & (TH_CWR | TH_ECE)) == (0 | TH_ECE)) {
			tp->t_flags2 |= TF2_ECN_PERMIT;
			TCPSTAT_INC(tcps_ecn_shs);
		}
		return;
	}
	if ((V_tcp_do_ecn != 3) &&
	    (V_tcp_do_ecn != 4))
		return;

	/*
	 * Decoding Accurate ECN according to table in section 3.1.1.
	 *
	 * On the SYN,ACK, process the AccECN flags indicating the
	 * state the SYN was delivered.  Reactions to Path ECN
	 * mangling can come here.
	 */
	switch (thflags & (TH_AE | TH_CWR | TH_ECE)) {
	/* RFC3168 SYN */
	case (0|0|TH_ECE):
		tp->t_flags2 |= TF2_ECN_PERMIT;
		TCPSTAT_INC(tcps_ecn_shs);
		break;
	/* non-ECT SYN */
	case (0|TH_CWR|0):
		tp->t_flags2 |= TF2_ACE_PERMIT;
		tp->t_scep = 5;
		TCPSTAT_INC(tcps_ecn_shs);
		TCPSTAT_INC(tcps_ace_nect);
		break;
	/* ECT0 SYN */
	case (TH_AE|0|0):
		tp->t_flags2 |= TF2_ACE_PERMIT;
		tp->t_scep = 5;
		TCPSTAT_INC(tcps_ecn_shs);
		TCPSTAT_INC(tcps_ace_ect0);
		break;
	/* ECT1 SYN */
	case (0|TH_CWR|TH_ECE):
		tp->t_flags2 |= TF2_ACE_PERMIT;
		tp->t_scep = 5;
		TCPSTAT_INC(tcps_ecn_shs);
		TCPSTAT_INC(tcps_ace_ect1);
		break;
	/* CE SYN */
	case (TH_AE|TH_CWR|0):
		tp->t_flags2 |= TF2_ACE_PERMIT;
		tp->t_scep = 6;
		/*
		 * reduce the IW to 2 MSS (to
		 * account for delayed acks) if
		 * the SYN,ACK was CE marked
		 */
		tp->snd_cwnd = 2 * tcp_maxseg(tp);
		TCPSTAT_INC(tcps_ecn_shs);
		/*
		 * Count this handshake as CE, matching what
		 * tcp_ecn_syncache_respond() does for SCF_ACE_CE;
		 * the non-ECT counter belongs to the case above.
		 */
		TCPSTAT_INC(tcps_ace_ce);
		break;
	default:
		break;
	}
	/*
	 * Set the AccECN Codepoints on the outgoing <ACK> to the ECN
	 * state of the <SYN,ACK> according to table 3 in the AccECN
	 * draft.  This happens for every flag combination above,
	 * including the ones that did not enable AccECN.
	 */
	tcp_ecn_handshake_rcep(tp, iptos);
}

/*
 * Handle parallel SYN for ECN
 *
 * tp		connection the <SYN> was received on
 * thflags	TCP header flags of the segment; nothing is done when
 *		TH_ACK is set
 * iptos	IP TOS byte of the segment; only the ECN bits are used
 *
 * With V_tcp_do_ecn 1 or 2 a <SYN> carrying CWR and ECE enables
 * RFC3168 ECN and arms TF2_ECN_SND_ECE.  With V_tcp_do_ecn 3 or 4 the
 * same flags (AE clear) still select RFC3168 ECN, while AE, CWR and
 * ECE together select AccECN and seed t_rcep from the IP ECN bits.
 * All other flag combinations and modes are ignored.
 */
void
tcp_ecn_input_parallel_syn(struct tcpcb *tp, uint16_t thflags, int iptos)
{
	if (thflags & TH_ACK)
		return;
	if (V_tcp_do_ecn == 0)
		return;
	if ((V_tcp_do_ecn == 1) ||
	    (V_tcp_do_ecn == 2)) {
		/* RFC3168 ECN handling */
		if ((thflags & (TH_CWR | TH_ECE)) == (TH_CWR | TH_ECE)) {
			tp->t_flags2 |= TF2_ECN_PERMIT;
			tp->t_flags2 |= TF2_ECN_SND_ECE;
			TCPSTAT_INC(tcps_ecn_shs);
		}
	} else
	if ((V_tcp_do_ecn == 3) ||
	    (V_tcp_do_ecn == 4)) {
		/* AccECN handling */
		switch (thflags & (TH_AE | TH_CWR | TH_ECE)) {
		default:
		case (0|0|0):
			break;
		case (0|TH_CWR|TH_ECE):
			tp->t_flags2 |= TF2_ECN_PERMIT;
			tp->t_flags2 |= TF2_ECN_SND_ECE;
			TCPSTAT_INC(tcps_ecn_shs);
			break;
		case (TH_AE|TH_CWR|TH_ECE):
			tp->t_flags2 |= TF2_ACE_PERMIT;
			TCPSTAT_INC(tcps_ecn_shs);
			/*
			 * Set the AccECN Codepoints on the outgoing
			 * <ACK> to the ECN state of the received <SYN>
			 * according to table 3 in the AccECN draft.
			 */
			tcp_ecn_handshake_rcep(tp, iptos);
			break;
		}
	}
}

/*
 * TCP ECN processing.
262 */ 263 int 264 tcp_ecn_input_segment(struct tcpcb *tp, uint16_t thflags, int iptos) 265 { 266 int delta_ace = 0; 267 268 if (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT)) { 269 switch (iptos & IPTOS_ECN_MASK) { 270 case IPTOS_ECN_CE: 271 TCPSTAT_INC(tcps_ecn_ce); 272 break; 273 case IPTOS_ECN_ECT0: 274 TCPSTAT_INC(tcps_ecn_ect0); 275 break; 276 case IPTOS_ECN_ECT1: 277 TCPSTAT_INC(tcps_ecn_ect1); 278 break; 279 } 280 281 if (tp->t_flags2 & TF2_ACE_PERMIT) { 282 if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) 283 tp->t_rcep += 1; 284 if (tp->t_flags2 & TF2_ECN_PERMIT) { 285 delta_ace = (tcp_ecn_get_ace(thflags) + 8 - 286 (tp->t_scep & 0x07)) & 0x07; 287 tp->t_scep += delta_ace; 288 } else { 289 /* 290 * process the final ACK of the 3WHS 291 * see table 3 in draft-ietf-tcpm-accurate-ecn 292 */ 293 switch (tcp_ecn_get_ace(thflags)) { 294 case 0b010: 295 /* nonECT SYN or SYN,ACK */ 296 /* Fallthrough */ 297 case 0b011: 298 /* ECT1 SYN or SYN,ACK */ 299 /* Fallthrough */ 300 case 0b100: 301 /* ECT0 SYN or SYN,ACK */ 302 tp->t_scep = 5; 303 break; 304 case 0b110: 305 /* CE SYN or SYN,ACK */ 306 tp->t_scep = 6; 307 tp->snd_cwnd = 2 * tcp_maxseg(tp); 308 break; 309 default: 310 /* mangled AccECN handshake */ 311 tp->t_scep = 5; 312 break; 313 } 314 tp->t_flags2 |= TF2_ECN_PERMIT; 315 } 316 } else { 317 /* RFC3168 ECN handling */ 318 if (thflags & TH_ECE) 319 delta_ace = 1; 320 if (thflags & TH_CWR) { 321 tp->t_flags2 &= ~TF2_ECN_SND_ECE; 322 tp->t_flags |= TF_ACKNOW; 323 } 324 if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) 325 tp->t_flags2 |= TF2_ECN_SND_ECE; 326 } 327 328 /* Process a packet differently from RFC3168. 
*/ 329 cc_ecnpkt_handler_flags(tp, thflags, iptos); 330 } 331 332 return delta_ace; 333 } 334 335 /* 336 * Send ECN setup <SYN> packet header flags 337 */ 338 uint16_t 339 tcp_ecn_output_syn_sent(struct tcpcb *tp) 340 { 341 uint16_t thflags = 0; 342 343 if (V_tcp_do_ecn == 0) 344 return thflags; 345 if (V_tcp_do_ecn == 1) { 346 /* Send a RFC3168 ECN setup <SYN> packet */ 347 if (tp->t_rxtshift >= 1) { 348 if (tp->t_rxtshift <= V_tcp_ecn_maxretries) 349 thflags = TH_ECE|TH_CWR; 350 } else 351 thflags = TH_ECE|TH_CWR; 352 } else 353 if (V_tcp_do_ecn == 3) { 354 /* Send an Accurate ECN setup <SYN> packet */ 355 if (tp->t_rxtshift >= 1) { 356 if (tp->t_rxtshift <= V_tcp_ecn_maxretries) 357 thflags = TH_ECE|TH_CWR|TH_AE; 358 } else 359 thflags = TH_ECE|TH_CWR|TH_AE; 360 } 361 362 return thflags; 363 } 364 365 /* 366 * output processing of ECN feature 367 * returning IP ECN header codepoint 368 */ 369 int 370 tcp_ecn_output_established(struct tcpcb *tp, uint16_t *thflags, int len, bool rxmit) 371 { 372 int ipecn = IPTOS_ECN_NOTECT; 373 bool newdata; 374 375 /* 376 * If the peer has ECN, mark data packets with 377 * ECN capable transmission (ECT). 378 * Ignore pure control packets, retransmissions 379 * and window probes. 380 */ 381 newdata = (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) && 382 !rxmit && 383 !((tp->t_flags & TF_FORCEDATA) && len == 1)); 384 /* RFC3168 ECN marking, only new data segments */ 385 if (newdata) { 386 ipecn = IPTOS_ECN_ECT0; 387 TCPSTAT_INC(tcps_ecn_ect0); 388 } 389 /* 390 * Reply with proper ECN notifications. 
391 */ 392 if (tp->t_flags2 & TF2_ACE_PERMIT) { 393 *thflags &= ~(TH_AE|TH_CWR|TH_ECE); 394 if (tp->t_rcep & 0x01) 395 *thflags |= TH_ECE; 396 if (tp->t_rcep & 0x02) 397 *thflags |= TH_CWR; 398 if (tp->t_rcep & 0x04) 399 *thflags |= TH_AE; 400 if (!(tp->t_flags2 & TF2_ECN_PERMIT)) { 401 /* 402 * here we process the final 403 * ACK of the 3WHS 404 */ 405 if (tp->t_rcep == 0b110) { 406 tp->t_rcep = 6; 407 } else { 408 tp->t_rcep = 5; 409 } 410 tp->t_flags2 |= TF2_ECN_PERMIT; 411 } 412 } else { 413 if (newdata && 414 (tp->t_flags2 & TF2_ECN_SND_CWR)) { 415 *thflags |= TH_CWR; 416 tp->t_flags2 &= ~TF2_ECN_SND_CWR; 417 } 418 if (tp->t_flags2 & TF2_ECN_SND_ECE) 419 *thflags |= TH_ECE; 420 } 421 422 return ipecn; 423 } 424 425 /* 426 * Set up the ECN related tcpcb fields from 427 * a syncache entry 428 */ 429 void 430 tcp_ecn_syncache_socket(struct tcpcb *tp, struct syncache *sc) 431 { 432 if (sc->sc_flags & SCF_ECN_MASK) { 433 switch (sc->sc_flags & SCF_ECN_MASK) { 434 case SCF_ECN: 435 tp->t_flags2 |= TF2_ECN_PERMIT; 436 break; 437 case SCF_ACE_N: 438 /* Fallthrough */ 439 case SCF_ACE_0: 440 /* Fallthrough */ 441 case SCF_ACE_1: 442 tp->t_flags2 |= TF2_ACE_PERMIT; 443 tp->t_scep = 5; 444 tp->t_rcep = 5; 445 break; 446 case SCF_ACE_CE: 447 tp->t_flags2 |= TF2_ACE_PERMIT; 448 tp->t_scep = 6; 449 tp->t_rcep = 6; 450 break; 451 /* undefined SCF codepoint */ 452 default: 453 break; 454 } 455 } 456 } 457 458 /* 459 * Process a <SYN> packets ECN information, and provide the 460 * syncache with the relevant information. 
461 */ 462 int 463 tcp_ecn_syncache_add(uint16_t thflags, int iptos) 464 { 465 int scflags = 0; 466 467 switch (thflags & (TH_AE|TH_CWR|TH_ECE)) { 468 /* no ECN */ 469 case (0|0|0): 470 break; 471 /* legacy ECN */ 472 case (0|TH_CWR|TH_ECE): 473 scflags = SCF_ECN; 474 break; 475 /* Accurate ECN */ 476 case (TH_AE|TH_CWR|TH_ECE): 477 if ((V_tcp_do_ecn == 3) || 478 (V_tcp_do_ecn == 4)) { 479 switch (iptos & IPTOS_ECN_MASK) { 480 case IPTOS_ECN_CE: 481 scflags = SCF_ACE_CE; 482 break; 483 case IPTOS_ECN_ECT0: 484 scflags = SCF_ACE_0; 485 break; 486 case IPTOS_ECN_ECT1: 487 scflags = SCF_ACE_1; 488 break; 489 case IPTOS_ECN_NOTECT: 490 scflags = SCF_ACE_N; 491 break; 492 } 493 } else 494 scflags = SCF_ECN; 495 break; 496 /* Default Case (section 3.1.2) */ 497 default: 498 if ((V_tcp_do_ecn == 3) || 499 (V_tcp_do_ecn == 4)) { 500 switch (iptos & IPTOS_ECN_MASK) { 501 case IPTOS_ECN_CE: 502 scflags = SCF_ACE_CE; 503 break; 504 case IPTOS_ECN_ECT0: 505 scflags = SCF_ACE_0; 506 break; 507 case IPTOS_ECN_ECT1: 508 scflags = SCF_ACE_1; 509 break; 510 case IPTOS_ECN_NOTECT: 511 scflags = SCF_ACE_N; 512 break; 513 } 514 } 515 break; 516 } 517 return scflags; 518 } 519 520 /* 521 * Set up the ECN information for the <SYN,ACK> from 522 * syncache information. 
523 */ 524 uint16_t 525 tcp_ecn_syncache_respond(uint16_t thflags, struct syncache *sc) 526 { 527 if ((thflags & TH_SYN) && 528 (sc->sc_flags & SCF_ECN_MASK)) { 529 switch (sc->sc_flags & SCF_ECN_MASK) { 530 case SCF_ECN: 531 thflags |= (0 | 0 | TH_ECE); 532 TCPSTAT_INC(tcps_ecn_shs); 533 break; 534 case SCF_ACE_N: 535 thflags |= (0 | TH_CWR | 0); 536 TCPSTAT_INC(tcps_ecn_shs); 537 TCPSTAT_INC(tcps_ace_nect); 538 break; 539 case SCF_ACE_0: 540 thflags |= (TH_AE | 0 | 0); 541 TCPSTAT_INC(tcps_ecn_shs); 542 TCPSTAT_INC(tcps_ace_ect0); 543 break; 544 case SCF_ACE_1: 545 thflags |= (0 | TH_ECE | TH_CWR); 546 TCPSTAT_INC(tcps_ecn_shs); 547 TCPSTAT_INC(tcps_ace_ect1); 548 break; 549 case SCF_ACE_CE: 550 thflags |= (TH_AE | TH_CWR | 0); 551 TCPSTAT_INC(tcps_ecn_shs); 552 TCPSTAT_INC(tcps_ace_ce); 553 break; 554 /* undefined SCF codepoint */ 555 default: 556 break; 557 } 558 } 559 return thflags; 560 } 561 562 int 563 tcp_ecn_get_ace(uint16_t thflags) 564 { 565 int ace = 0; 566 567 if (thflags & TH_ECE) 568 ace += 1; 569 if (thflags & TH_CWR) 570 ace += 2; 571 if (thflags & TH_AE) 572 ace += 4; 573 return ace; 574 } 575