1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 5 * The Regents of the University of California. All rights reserved. 6 * Copyright (c) 2007-2008,2010 7 * Swinburne University of Technology, Melbourne, Australia. 8 * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org> 9 * Copyright (c) 2010 The FreeBSD Foundation 10 * Copyright (c) 2010-2011 Juniper Networks, Inc. 11 * Copyright (c) 2019 Richard Scheffenegger <srichard@netapp.com> 12 * All rights reserved. 13 * 14 * Portions of this software were developed at the Centre for Advanced Internet 15 * Architectures, Swinburne University of Technology, by Lawrence Stewart, 16 * James Healy and David Hayes, made possible in part by a grant from the Cisco 17 * University Research Program Fund at Community Foundation Silicon Valley. 18 * 19 * Portions of this software were developed at the Centre for Advanced 20 * Internet Architectures, Swinburne University of Technology, Melbourne, 21 * Australia by David Hayes under sponsorship from the FreeBSD Foundation. 22 * 23 * Portions of this software were developed by Robert N. M. Watson under 24 * contract to Juniper Networks, Inc. 25 * 26 * Redistribution and use in source and binary forms, with or without 27 * modification, are permitted provided that the following conditions 28 * are met: 29 * 1. Redistributions of source code must retain the above copyright 30 * notice, this list of conditions and the following disclaimer. 31 * 2. Redistributions in binary form must reproduce the above copyright 32 * notice, this list of conditions and the following disclaimer in the 33 * documentation and/or other materials provided with the distribution. 34 * 3. Neither the name of the University nor the names of its contributors 35 * may be used to endorse or promote products derived from this software 36 * without specific prior written permission. 
37 * 38 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 39 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 40 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 41 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 42 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 43 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 44 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 45 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 46 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 47 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 48 * SUCH DAMAGE. 49 * 50 * @(#)tcp_ecn.c 8.12 (Berkeley) 5/24/95 51 */ 52 53 /* 54 * Utility functions to deal with Explicit Congestion Notification in TCP 55 * implementing the essential parts of the Accurate ECN extension 56 * https://tools.ietf.org/html/draft-ietf-tcpm-accurate-ecn-09 57 */ 58 59 #include <sys/cdefs.h> 60 #include "opt_inet.h" 61 #include "opt_inet6.h" 62 63 #include <sys/param.h> 64 #include <sys/systm.h> 65 #include <sys/kernel.h> 66 #include <sys/sysctl.h> 67 #include <sys/malloc.h> 68 #include <sys/mbuf.h> 69 #include <sys/socket.h> 70 #include <sys/socketvar.h> 71 72 #include <machine/cpu.h> 73 74 #include <vm/uma.h> 75 76 #include <net/if.h> 77 #include <net/if_var.h> 78 #include <net/route.h> 79 #include <net/vnet.h> 80 81 #include <netinet/in.h> 82 #include <netinet/in_systm.h> 83 #include <netinet/ip.h> 84 #include <netinet/in_var.h> 85 #include <netinet/in_pcb.h> 86 #include <netinet/ip_var.h> 87 #include <netinet/ip6.h> 88 #include <netinet/icmp6.h> 89 #include <netinet6/nd6.h> 90 #include <netinet6/ip6_var.h> 91 #include <netinet6/in6_pcb.h> 92 #include <netinet/tcp.h> 93 #include <netinet/tcp_fsm.h> 94 #include <netinet/tcp_seq.h> 95 #include <netinet/tcp_var.h> 96 
#include <netinet/tcp_syncache.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_ecn.h>

static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, ecn,
    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "TCP ECN");

/*
 * ECN operating mode, as interpreted by the functions in this file:
 *   0     tcp_ecn_input_syn_sent(), tcp_ecn_input_parallel_syn() and
 *         tcp_ecn_output_syn_sent() do nothing
 *   1, 2  RFC3168 ECN handling; only mode 1 sends an ECN-setup <SYN>
 *   3, 4  Accurate ECN handling; only mode 3 sends an AccECN-setup <SYN>
 */
VNET_DEFINE(int, tcp_do_ecn) = 2;
SYSCTL_INT(_net_inet_tcp_ecn, OID_AUTO, enable,
    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_ecn), 0,
    "TCP ECN support");

/*
 * Highest retransmission shift (t_rxtshift) at which a retransmitted <SYN>
 * still carries the ECN setup flags; see tcp_ecn_output_syn_sent().
 */
VNET_DEFINE(int, tcp_ecn_maxretries) = 1;
SYSCTL_INT(_net_inet_tcp_ecn, OID_AUTO, maxretries,
    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_ecn_maxretries), 0,
    "Max retries before giving up on ECN");

/*
 * Account the IP ECN codepoint of a received packet in the TCP statistics.
 * Not-ECT packets are not counted.
 */
static void
tcp_ecn_count_rcv(int iptos)
{

	switch (iptos & IPTOS_ECN_MASK) {
	case IPTOS_ECN_CE:
		TCPSTAT_INC(tcps_ecn_rcvce);
		break;
	case IPTOS_ECN_ECT0:
		TCPSTAT_INC(tcps_ecn_rcvect0);
		break;
	case IPTOS_ECN_ECT1:
		TCPSTAT_INC(tcps_ecn_rcvect1);
		break;
	default:
		break;
	}
}

/*
 * Map the IP ECN field of a received handshake segment to the ACE value
 * that is echoed on the next outgoing <ACK> (table 3 in the AccECN draft).
 */
static int
tcp_ecn_handshake_ace(int iptos)
{

	switch (iptos & IPTOS_ECN_MASK) {
	case IPTOS_ECN_ECT0:
		return (0b100);
	case IPTOS_ECN_ECT1:
		return (0b011);
	case IPTOS_ECN_CE:
		return (0b110);
	case IPTOS_ECN_NOTECT:
	default:
		return (0b010);
	}
}

/*
 * Map the IP ECN field of a received <SYN> to the syncache AccECN flag
 * recording how that <SYN> was marked.
 */
static int
tcp_ecn_ace_scflags(int iptos)
{

	switch (iptos & IPTOS_ECN_MASK) {
	case IPTOS_ECN_CE:
		return (SCF_ACE_CE);
	case IPTOS_ECN_ECT0:
		return (SCF_ACE_0);
	case IPTOS_ECN_ECT1:
		return (SCF_ACE_1);
	case IPTOS_ECN_NOTECT:
	default:
		return (SCF_ACE_N);
	}
}

/*
 * Process incoming SYN,ACK packet
 *
 * tp:      connection that sent the <SYN>
 * thflags: TCP header flags of the received <SYN,ACK>
 * iptos:   IP TOS / traffic class byte of the received <SYN,ACK>
 *
 * Depending on the configured mode this sets exactly one of
 * TF2_ECN_PERMIT / TF2_ACE_PERMIT in tp->t_flags2 (or clears both) and,
 * for AccECN, seeds the t_scep and t_rcep counters.
 */
void
tcp_ecn_input_syn_sent(struct tcpcb *tp, uint16_t thflags, int iptos)
{

	if (V_tcp_do_ecn == 0)
		return;

	if ((V_tcp_do_ecn == 1) || (V_tcp_do_ecn == 2)) {
		/* RFC3168 ECN handling: ECN-setup SYN,ACK has ECE without CWR */
		if ((thflags & (TH_CWR | TH_ECE)) == (0 | TH_ECE)) {
			tp->t_flags2 |= TF2_ECN_PERMIT;
			tp->t_flags2 &= ~TF2_ACE_PERMIT;
			TCPSTAT_INC(tcps_ecn_shs);
		}
	} else if ((V_tcp_do_ecn == 3) || (V_tcp_do_ecn == 4)) {
		/*
		 * Decoding Accurate ECN according to table in section 3.1.1.
		 * On the SYN,ACK, process the AccECN flags indicating the
		 * state the SYN was delivered.
		 * Reactions to Path ECN mangling can come here.
		 */
		switch (thflags & (TH_AE | TH_CWR | TH_ECE)) {
		/* RFC3168 SYN */
		case (0|0|TH_ECE):
			tp->t_flags2 |= TF2_ECN_PERMIT;
			tp->t_flags2 &= ~TF2_ACE_PERMIT;
			TCPSTAT_INC(tcps_ecn_shs);
			break;
		/* non-ECT SYN */
		case (0|TH_CWR|0):
			tp->t_flags2 |= TF2_ACE_PERMIT;
			tp->t_flags2 &= ~TF2_ECN_PERMIT;
			tp->t_scep = 5;
			TCPSTAT_INC(tcps_ecn_shs);
			TCPSTAT_INC(tcps_ace_nect);
			break;
		/* ECT0 SYN */
		case (TH_AE|0|0):
			tp->t_flags2 |= TF2_ACE_PERMIT;
			tp->t_flags2 &= ~TF2_ECN_PERMIT;
			tp->t_scep = 5;
			TCPSTAT_INC(tcps_ecn_shs);
			TCPSTAT_INC(tcps_ace_ect0);
			break;
		/* ECT1 SYN */
		case (0|TH_CWR|TH_ECE):
			tp->t_flags2 |= TF2_ACE_PERMIT;
			tp->t_flags2 &= ~TF2_ECN_PERMIT;
			tp->t_scep = 5;
			TCPSTAT_INC(tcps_ecn_shs);
			TCPSTAT_INC(tcps_ace_ect1);
			break;
		/* CE SYN */
		case (TH_AE|TH_CWR|0):
			tp->t_flags2 |= TF2_ACE_PERMIT;
			tp->t_flags2 &= ~TF2_ECN_PERMIT;
			tp->t_scep = 6;
			/*
			 * reduce the IW to 2 MSS (to
			 * account for delayed acks) if
			 * the SYN,ACK was CE marked
			 */
			tp->snd_cwnd = 2 * tcp_maxseg(tp);
			TCPSTAT_INC(tcps_ecn_shs);
			/*
			 * Count under the CE counter, matching what
			 * tcp_ecn_syncache_respond() does for SCF_ACE_CE.
			 */
			TCPSTAT_INC(tcps_ace_ce);
			break;
		default:
			tp->t_flags2 &= ~(TF2_ECN_PERMIT | TF2_ACE_PERMIT);
			break;
		}
		/*
		 * Set the AccECN Codepoints on the outgoing <ACK> to the
		 * ECN state of the <SYN,ACK> according to table 3 in the
		 * AccECN draft.
		 */
		tp->t_rcep = tcp_ecn_handshake_ace(iptos);
	}
}

/*
 * Handle parallel SYN for ECN
 *
 * tp:      connection that already sent its own <SYN>
 * thflags: TCP header flags of the received segment; segments carrying
 *          TH_ACK are ignored here
 * iptos:   IP TOS / traffic class byte of the received <SYN>
 */
void
tcp_ecn_input_parallel_syn(struct tcpcb *tp, uint16_t thflags, int iptos)
{

	if (thflags & TH_ACK)
		return;
	if (V_tcp_do_ecn == 0)
		return;

	if ((V_tcp_do_ecn == 1) || (V_tcp_do_ecn == 2)) {
		/* RFC3168 ECN handling: ECN-setup SYN has both CWR and ECE */
		if ((thflags & (TH_CWR | TH_ECE)) == (TH_CWR | TH_ECE)) {
			tp->t_flags2 |= TF2_ECN_PERMIT;
			tp->t_flags2 &= ~TF2_ACE_PERMIT;
			tp->t_flags2 |= TF2_ECN_SND_ECE;
			TCPSTAT_INC(tcps_ecn_shs);
		}
	} else if ((V_tcp_do_ecn == 3) || (V_tcp_do_ecn == 4)) {
		/* AccECN handling */
		switch (thflags & (TH_AE | TH_CWR | TH_ECE)) {
		default:
		case (0|0|0):
			/* peer did not ask for any flavour of ECN */
			tp->t_flags2 &= ~(TF2_ECN_PERMIT | TF2_ACE_PERMIT);
			break;
		case (0|TH_CWR|TH_ECE):
			/* peer asked for RFC3168 ECN */
			tp->t_flags2 |= TF2_ECN_PERMIT;
			tp->t_flags2 &= ~TF2_ACE_PERMIT;
			tp->t_flags2 |= TF2_ECN_SND_ECE;
			TCPSTAT_INC(tcps_ecn_shs);
			break;
		case (TH_AE|TH_CWR|TH_ECE):
			/* peer asked for AccECN */
			tp->t_flags2 |= TF2_ACE_PERMIT;
			tp->t_flags2 &= ~TF2_ECN_PERMIT;
			TCPSTAT_INC(tcps_ecn_shs);
			/*
			 * Set the AccECN Codepoints on the outgoing <ACK>
			 * to the ECN state of the received <SYN> according
			 * to table 3 in the AccECN draft.
			 */
			tp->t_rcep = tcp_ecn_handshake_ace(iptos);
			break;
		}
	}
}

/*
 * TCP ECN processing.
 *
 * tp:      connection the segment belongs to
 * thflags: TCP header flags of the received segment
 * tlen:    not used by this function
 * pkts:    lower bound applied to the decoded AccECN CE delta; presumably
 *          the number of packets this ACK covers -- confirm at the caller
 * iptos:   IP TOS / traffic class byte of the received segment
 *
 * Returns the number of newly reported CE marks: the decoded ACE delta
 * for an established AccECN session, 1 for an RFC3168 segment with ECE
 * set and SYN clear, otherwise 0.
 */
int
tcp_ecn_input_segment(struct tcpcb *tp, uint16_t thflags, int tlen, int pkts, int iptos)
{
	int delta_cep = 0;

	tcp_ecn_count_rcv(iptos);

	/* Nothing else to do unless some form of ECN was negotiated. */
	if ((tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT)) == 0)
		return delta_cep;

	if (tp->t_flags2 & TF2_ACE_PERMIT) {
		/* count CE marks we received, to be echoed in the ACE field */
		if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
			tp->t_rcep += 1;
		if (tp->t_flags2 & TF2_ECN_PERMIT) {
			/*
			 * Both flags set: handshake already completed.
			 * The ACE field holds a 3 bit counter, so take the
			 * difference to our copy (t_scep) modulo 8.
			 */
			delta_cep = (tcp_ecn_get_ace(thflags) + 8 -
			    (tp->t_scep & 7)) & 7;
			/*
			 * If the delta is below pkts, pick the largest
			 * value not exceeding pkts that is congruent to
			 * the observed delta modulo 8.
			 */
			if (delta_cep < pkts)
				delta_cep = pkts -
				    ((pkts - delta_cep) & 7);
			tp->t_scep += delta_cep;
		} else {
			/*
			 * process the final ACK of the 3WHS
			 * see table 3 in draft-ietf-tcpm-accurate-ecn
			 */
			switch (tcp_ecn_get_ace(thflags)) {
			case 0b010:
				/* nonECT SYN or SYN,ACK */
				/* FALLTHROUGH */
			case 0b011:
				/* ECT1 SYN or SYN,ACK */
				/* FALLTHROUGH */
			case 0b100:
				/* ECT0 SYN or SYN,ACK */
				tp->t_scep = 5;
				break;
			case 0b110:
				/* CE SYN or SYN,ACK */
				tp->t_scep = 6;
				tp->snd_cwnd = 2 * tcp_maxseg(tp);
				break;
			default:
				/* mangled AccECN handshake */
				tp->t_scep = 5;
				break;
			}
			/* mark the AccECN handshake as completed */
			tp->t_flags2 |= TF2_ECN_PERMIT;
		}
	} else {
		/* RFC3168 ECN handling */
		if ((thflags & (TH_SYN | TH_ECE)) == TH_ECE) {
			delta_cep = 1;
			tp->t_scep++;
		}
		/* peer signalled CWR: stop echoing ECE and ACK right away */
		if (thflags & TH_CWR) {
			tp->t_flags2 &= ~TF2_ECN_SND_ECE;
			tp->t_flags |= TF_ACKNOW;
		}
		/* a CE mark (re)starts echoing ECE */
		if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
			tp->t_flags2 |= TF2_ECN_SND_ECE;
	}

	/* Process a packet differently from RFC3168. */
	cc_ecnpkt_handler_flags(tp, thflags, iptos);

	return delta_cep;
}

/*
 * Send ECN setup <SYN> packet header flags
 *
 * Returns the ECN related TCP header flags for an outgoing <SYN>:
 * TH_ECE|TH_CWR in mode 1, TH_ECE|TH_CWR|TH_AE in mode 3, and 0 in all
 * other modes or once the <SYN> was retransmitted more often than
 * tcp_ecn_maxretries allows.
 */
uint16_t
tcp_ecn_output_syn_sent(struct tcpcb *tp)
{
	uint16_t thflags = 0;
	bool ecn_setup;

	if (V_tcp_do_ecn == 0)
		return thflags;

	/*
	 * The initial <SYN> (t_rxtshift < 1) always carries the flags,
	 * retransmissions only up to tcp_ecn_maxretries.
	 */
	ecn_setup = (tp->t_rxtshift < 1 ||
	    tp->t_rxtshift <= V_tcp_ecn_maxretries);

	if (V_tcp_do_ecn == 1) {
		/* Send a RFC3168 ECN setup <SYN> packet */
		if (ecn_setup)
			thflags = TH_ECE|TH_CWR;
	} else if (V_tcp_do_ecn == 3) {
		/* Send an Accurate ECN setup <SYN> packet */
		if (ecn_setup)
			thflags = TH_ECE|TH_CWR|TH_AE;
	}

	return thflags;
}

/*
 * output processing of ECN feature
 * returning IP ECN header codepoint
 *
 * tp:      connection the segment is sent on
 * thflags: in/out TCP header flags of the outgoing segment; the ECN
 *          related bits are updated in place
 * len:     payload length of the outgoing segment
 * rxmit:   true if the segment is a retransmission
 *
 * Returns IPTOS_ECN_ECT1 or IPTOS_ECN_ECT0 for new data segments and
 * IPTOS_ECN_NOTECT for everything else.
 */
int
tcp_ecn_output_established(struct tcpcb *tp, uint16_t *thflags, int len, bool rxmit)
{
	int ipecn = IPTOS_ECN_NOTECT;
	bool newdata;

	/*
	 * If the peer has ECN, mark data packets with
	 * ECN capable transmission (ECT).
	 * Ignore pure control packets, retransmissions
	 * and window probes.
	 */
	newdata = (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) &&
	    !rxmit &&
	    !((tp->t_flags & TF_FORCEDATA) && len == 1));
	/* RFC3168 ECN marking, only new data segments */
	if (newdata) {
		if (tp->t_flags2 & TF2_ECN_USE_ECT1) {
			ipecn = IPTOS_ECN_ECT1;
			TCPSTAT_INC(tcps_ecn_sndect1);
		} else {
			ipecn = IPTOS_ECN_ECT0;
			TCPSTAT_INC(tcps_ecn_sndect0);
		}
	}
	/*
	 * Reply with proper ECN notifications.
	 */
	if (tp->t_flags2 & TF2_ACE_PERMIT) {
		/* AccECN: the three flag bits carry the low bits of t_rcep */
		*thflags &= ~(TH_AE|TH_CWR|TH_ECE);
		if (tp->t_rcep & 0x01)
			*thflags |= TH_ECE;
		if (tp->t_rcep & 0x02)
			*thflags |= TH_CWR;
		if (tp->t_rcep & 0x04)
			*thflags |= TH_AE;
		if (!(tp->t_flags2 & TF2_ECN_PERMIT)) {
			/*
			 * here we process the final
			 * ACK of the 3WHS
			 *
			 * The handshake encoding has been sent above; now
			 * switch t_rcep to its initial counter value: 6 if
			 * the handshake segment was CE marked, 5 otherwise.
			 */
			tp->t_rcep = (tp->t_rcep == 0b110) ? 6 : 5;
			tp->t_flags2 |= TF2_ECN_PERMIT;
		}
	} else {
		/* RFC3168: CWR goes out once, on a new data segment */
		if (newdata &&
		    (tp->t_flags2 & TF2_ECN_SND_CWR)) {
			*thflags |= TH_CWR;
			tp->t_flags2 &= ~TF2_ECN_SND_CWR;
		}
		if (tp->t_flags2 & TF2_ECN_SND_ECE)
			*thflags |= TH_ECE;
	}

	return ipecn;
}

/*
 * Set up the ECN related tcpcb fields from
 * a syncache entry
 *
 * tp: tcpcb to initialise
 * sc: syncache entry whose SCF_ECN_MASK bits record the negotiated ECN
 *     flavour
 */
void
tcp_ecn_syncache_socket(struct tcpcb *tp, struct syncache *sc)
{

	if (sc->sc_flags & SCF_ECN_MASK) {
		switch (sc->sc_flags & SCF_ECN_MASK) {
		case SCF_ECN:
			/* RFC3168 ECN */
			tp->t_flags2 |= TF2_ECN_PERMIT;
			break;
		case SCF_ACE_N:
			/* FALLTHROUGH */
		case SCF_ACE_0:
			/* FALLTHROUGH */
		case SCF_ACE_1:
			/* AccECN, <SYN> was not CE marked */
			tp->t_flags2 |= TF2_ACE_PERMIT;
			tp->t_scep = 5;
			tp->t_rcep = 5;
			break;
		case SCF_ACE_CE:
			/* AccECN, <SYN> was CE marked */
			tp->t_flags2 |= TF2_ACE_PERMIT;
			tp->t_scep = 6;
			tp->t_rcep = 6;
			break;
		/* undefined SCF codepoint */
		default:
			break;
		}
	}
}

/*
 * Process a <SYN> packets ECN information, and provide the
 * syncache with the relevant information.
 *
 * thflags: TCP header flags of the received <SYN>
 * iptos:   IP TOS / traffic class byte of the received <SYN>
 *
 * Returns the SCF_* ECN flag to store in the syncache entry, or 0 if no
 * ECN flavour applies.
 */
int
tcp_ecn_syncache_add(uint16_t thflags, int iptos)
{
	int scflags = 0;
	bool accecn;

	tcp_ecn_count_rcv(iptos);

	accecn = ((V_tcp_do_ecn == 3) || (V_tcp_do_ecn == 4));

	switch (thflags & (TH_AE|TH_CWR|TH_ECE)) {
	/* no ECN */
	case (0|0|0):
		break;
	/* legacy ECN */
	case (0|TH_CWR|TH_ECE):
		scflags = SCF_ECN;
		break;
	/* Accurate ECN */
	case (TH_AE|TH_CWR|TH_ECE):
		/* without AccECN enabled, fall back to RFC3168 ECN */
		if (accecn)
			scflags = tcp_ecn_ace_scflags(iptos);
		else
			scflags = SCF_ECN;
		break;
	/* Default Case (section 3.1.2) */
	default:
		/* any other flag combination is treated as AccECN, if enabled */
		if (accecn)
			scflags = tcp_ecn_ace_scflags(iptos);
		break;
	}
	return scflags;
}

/*
 * Set up the ECN information for the <SYN,ACK> from
 * syncache information.
 *
 * thflags: TCP header flags built so far for the outgoing segment
 * sc:      syncache entry holding the SCF_* ECN flag
 *
 * Returns thflags with the ECN handshake bits added. Segments without
 * TH_SYN and entries without ECN state are returned unchanged.
 */
uint16_t
tcp_ecn_syncache_respond(uint16_t thflags, struct syncache *sc)
{

	if ((thflags & TH_SYN) &&
	    (sc->sc_flags & SCF_ECN_MASK)) {
		switch (sc->sc_flags & SCF_ECN_MASK) {
		case SCF_ECN:
			/* RFC3168 ECN-setup SYN,ACK */
			thflags |= (0 | 0 | TH_ECE);
			TCPSTAT_INC(tcps_ecn_shs);
			break;
		case SCF_ACE_N:
			/* AccECN, <SYN> arrived as not-ECT */
			thflags |= (0 | TH_CWR | 0);
			TCPSTAT_INC(tcps_ecn_shs);
			TCPSTAT_INC(tcps_ace_nect);
			break;
		case SCF_ACE_0:
			/* AccECN, <SYN> arrived as ECT0 */
			thflags |= (TH_AE | 0 | 0);
			TCPSTAT_INC(tcps_ecn_shs);
			TCPSTAT_INC(tcps_ace_ect0);
			break;
		case SCF_ACE_1:
			/* AccECN, <SYN> arrived as ECT1 */
			thflags |= (0 | TH_ECE | TH_CWR);
			TCPSTAT_INC(tcps_ecn_shs);
			TCPSTAT_INC(tcps_ace_ect1);
			break;
		case SCF_ACE_CE:
			/* AccECN, <SYN> arrived CE marked */
			thflags |= (TH_AE | TH_CWR | 0);
			TCPSTAT_INC(tcps_ecn_shs);
			TCPSTAT_INC(tcps_ace_ce);
			break;
		/* undefined SCF codepoint */
		default:
			break;
		}
	}
	return thflags;
}

/*
 * Extract the 3 bit ACE field from the TCP header flags:
 * TH_AE is bit 2, TH_CWR is bit 1 and TH_ECE is bit 0 of the result.
 */
int
tcp_ecn_get_ace(uint16_t thflags)
{
	int ace = 0;

	if (thflags & TH_ECE)
		ace += 1;
	if (thflags & TH_CWR)
		ace += 2;
	if (thflags & TH_AE)
		ace += 4;
	return ace;
}