1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 5 * The Regents of the University of California. All rights reserved. 6 * Copyright (c) 2007-2008,2010 7 * Swinburne University of Technology, Melbourne, Australia. 8 * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org> 9 * Copyright (c) 2010 The FreeBSD Foundation 10 * Copyright (c) 2010-2011 Juniper Networks, Inc. 11 * Copyright (c) 2019 Richard Scheffenegger <srichard@netapp.com> 12 * All rights reserved. 13 * 14 * Portions of this software were developed at the Centre for Advanced Internet 15 * Architectures, Swinburne University of Technology, by Lawrence Stewart, 16 * James Healy and David Hayes, made possible in part by a grant from the Cisco 17 * University Research Program Fund at Community Foundation Silicon Valley. 18 * 19 * Portions of this software were developed at the Centre for Advanced 20 * Internet Architectures, Swinburne University of Technology, Melbourne, 21 * Australia by David Hayes under sponsorship from the FreeBSD Foundation. 22 * 23 * Portions of this software were developed by Robert N. M. Watson under 24 * contract to Juniper Networks, Inc. 25 * 26 * Redistribution and use in source and binary forms, with or without 27 * modification, are permitted provided that the following conditions 28 * are met: 29 * 1. Redistributions of source code must retain the above copyright 30 * notice, this list of conditions and the following disclaimer. 31 * 2. Redistributions in binary form must reproduce the above copyright 32 * notice, this list of conditions and the following disclaimer in the 33 * documentation and/or other materials provided with the distribution. 34 * 3. Neither the name of the University nor the names of its contributors 35 * may be used to endorse or promote products derived from this software 36 * without specific prior written permission. 37 * 38 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 39 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 40 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 41 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 42 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 43 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 44 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 45 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 46 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 47 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 48 * SUCH DAMAGE. 49 * 50 * @(#)tcp_ecn.c 8.12 (Berkeley) 5/24/95 51 */ 52 53 /* 54 * Utility functions to deal with Explicit Congestion Notification in TCP 55 * implementing the essential parts of the Accurate ECN extension 56 * https://tools.ietf.org/html/draft-ietf-tcpm-accurate-ecn-09 57 */ 58 59 #include <sys/cdefs.h> 60 __FBSDID("$FreeBSD$"); 61 62 #include "opt_inet.h" 63 #include "opt_inet6.h" 64 65 #include <sys/param.h> 66 #include <sys/systm.h> 67 #include <sys/kernel.h> 68 #include <sys/sysctl.h> 69 #include <sys/malloc.h> 70 #include <sys/mbuf.h> 71 #include <sys/socket.h> 72 #include <sys/socketvar.h> 73 74 #include <machine/cpu.h> 75 76 #include <vm/uma.h> 77 78 #include <net/if.h> 79 #include <net/if_var.h> 80 #include <net/route.h> 81 #include <net/vnet.h> 82 83 #include <netinet/in.h> 84 #include <netinet/in_systm.h> 85 #include <netinet/ip.h> 86 #include <netinet/in_var.h> 87 #include <netinet/in_pcb.h> 88 #include <netinet/ip_var.h> 89 #include <netinet/ip6.h> 90 #include <netinet/icmp6.h> 91 #include <netinet6/nd6.h> 92 #include <netinet6/ip6_var.h> 93 #include <netinet6/in6_pcb.h> 94 #include <netinet/tcp.h> 95 #include <netinet/tcp_fsm.h> 96 #include <netinet/tcp_seq.h> 97 #include <netinet/tcp_var.h> 98 #include <netinet/tcp_syncache.h> 99 #include <netinet/tcp_timer.h> 100 #include <netinet/tcpip.h> 101 #include <netinet/tcp_ecn.h> 102 103 static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, ecn, 104 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 105 "TCP ECN"); 106 107 VNET_DEFINE(int, tcp_do_ecn) = 2; 108 SYSCTL_INT(_net_inet_tcp_ecn, OID_AUTO, enable, 109 CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_ecn), 0, 110 "TCP ECN support"); 111 112 VNET_DEFINE(int, tcp_ecn_maxretries) = 1; 113 SYSCTL_INT(_net_inet_tcp_ecn, OID_AUTO, maxretries, 114 CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_ecn_maxretries), 0, 115 "Max retries before giving up on ECN"); 116 117 /* 118 * Process incoming SYN,ACK packet 119 */ 120 void 121 tcp_ecn_input_syn_sent(struct tcpcb *tp, uint16_t thflags, int iptos) 122 { 123 124 if (V_tcp_do_ecn == 0) 125 return; 126 if ((V_tcp_do_ecn == 1) || 127 (V_tcp_do_ecn == 2)) { 128 /* RFC3168 ECN handling */ 129 if ((thflags & (TH_CWR | TH_ECE)) == (0 | TH_ECE)) { 130 tp->t_flags2 |= TF2_ECN_PERMIT; 131 tp->t_flags2 &= ~TF2_ACE_PERMIT; 132 TCPSTAT_INC(tcps_ecn_shs); 133 } 134 } else 135 /* decoding Accurate ECN according to table in section 3.1.1 */ 136 if ((V_tcp_do_ecn == 3) || 137 (V_tcp_do_ecn == 4)) { 138 /* 139 * on the SYN,ACK, process the AccECN 140 * flags indicating the state the SYN 141 * was delivered. 142 * Reactions to Path ECN mangling can 143 * come here. 144 */ 145 switch (thflags & (TH_AE | TH_CWR | TH_ECE)) { 146 /* RFC3168 SYN */ 147 case (0|0|TH_ECE): 148 tp->t_flags2 |= TF2_ECN_PERMIT; 149 tp->t_flags2 &= ~TF2_ACE_PERMIT; 150 TCPSTAT_INC(tcps_ecn_shs); 151 break; 152 /* non-ECT SYN */ 153 case (0|TH_CWR|0): 154 tp->t_flags2 |= TF2_ACE_PERMIT; 155 tp->t_flags2 &= ~TF2_ECN_PERMIT; 156 tp->t_scep = 5; 157 TCPSTAT_INC(tcps_ecn_shs); 158 TCPSTAT_INC(tcps_ace_nect); 159 break; 160 /* ECT0 SYN */ 161 case (TH_AE|0|0): 162 tp->t_flags2 |= TF2_ACE_PERMIT; 163 tp->t_flags2 &= ~TF2_ECN_PERMIT; 164 tp->t_scep = 5; 165 TCPSTAT_INC(tcps_ecn_shs); 166 TCPSTAT_INC(tcps_ace_ect0); 167 break; 168 /* ECT1 SYN */ 169 case (0|TH_CWR|TH_ECE): 170 tp->t_flags2 |= TF2_ACE_PERMIT; 171 tp->t_flags2 &= ~TF2_ECN_PERMIT; 172 tp->t_scep = 5; 173 TCPSTAT_INC(tcps_ecn_shs); 174 TCPSTAT_INC(tcps_ace_ect1); 175 break; 176 /* CE SYN */ 177 case (TH_AE|TH_CWR|0): 178 tp->t_flags2 |= TF2_ACE_PERMIT; 179 tp->t_flags2 &= ~TF2_ECN_PERMIT; 180 tp->t_scep = 6; 181 /* 182 * reduce the IW to 2 MSS (to 183 * account for delayed acks) if 184 * the SYN,ACK was CE marked 185 */ 186 tp->snd_cwnd = 2 * tcp_maxseg(tp); 187 TCPSTAT_INC(tcps_ecn_shs); 188 TCPSTAT_INC(tcps_ace_nect); 189 break; 190 default: 191 tp->t_flags2 &= ~(TF2_ECN_PERMIT | TF2_ACE_PERMIT); 192 break; 193 } 194 /* 195 * Set the AccECN Codepoints on 196 * the outgoing <ACK> to the ECN 197 * state of the <SYN,ACK> 198 * according to table 3 in the 199 * AccECN draft 200 */ 201 switch (iptos & IPTOS_ECN_MASK) { 202 case (IPTOS_ECN_NOTECT): 203 tp->t_rcep = 0b010; 204 break; 205 case (IPTOS_ECN_ECT0): 206 tp->t_rcep = 0b100; 207 break; 208 case (IPTOS_ECN_ECT1): 209 tp->t_rcep = 0b011; 210 break; 211 case (IPTOS_ECN_CE): 212 tp->t_rcep = 0b110; 213 break; 214 } 215 } 216 } 217 218 /* 219 * Handle parallel SYN for ECN 220 */ 221 void 222 tcp_ecn_input_parallel_syn(struct tcpcb *tp, uint16_t thflags, int iptos) 223 { 224 if (thflags & TH_ACK) 225 return; 226 if (V_tcp_do_ecn == 0) 227 return; 228 if ((V_tcp_do_ecn == 1) || 229 (V_tcp_do_ecn == 2)) { 230 /* RFC3168 ECN handling */ 231 if ((thflags & (TH_CWR | TH_ECE)) == (TH_CWR | TH_ECE)) { 232 tp->t_flags2 |= TF2_ECN_PERMIT; 233 tp->t_flags2 &= ~TF2_ACE_PERMIT; 234 tp->t_flags2 |= TF2_ECN_SND_ECE; 235 TCPSTAT_INC(tcps_ecn_shs); 236 } 237 } else 238 if ((V_tcp_do_ecn == 3) || 239 (V_tcp_do_ecn == 4)) { 240 /* AccECN handling */ 241 switch (thflags & (TH_AE | TH_CWR | TH_ECE)) { 242 default: 243 case (0|0|0): 244 tp->t_flags2 &= ~(TF2_ECN_PERMIT | TF2_ACE_PERMIT); 245 break; 246 case (0|TH_CWR|TH_ECE): 247 tp->t_flags2 |= TF2_ECN_PERMIT; 248 tp->t_flags2 &= ~TF2_ACE_PERMIT; 249 tp->t_flags2 |= TF2_ECN_SND_ECE; 250 TCPSTAT_INC(tcps_ecn_shs); 251 break; 252 case (TH_AE|TH_CWR|TH_ECE): 253 tp->t_flags2 |= TF2_ACE_PERMIT; 254 tp->t_flags2 &= ~TF2_ECN_PERMIT; 255 TCPSTAT_INC(tcps_ecn_shs); 256 /* 257 * Set the AccECN Codepoints on 258 * the outgoing <ACK> to the ECN 259 * state of the <SYN,ACK> 260 * according to table 3 in the 261 * AccECN draft 262 */ 263 switch (iptos & IPTOS_ECN_MASK) { 264 case (IPTOS_ECN_NOTECT): 265 tp->t_rcep = 0b010; 266 break; 267 case (IPTOS_ECN_ECT0): 268 tp->t_rcep = 0b100; 269 break; 270 case (IPTOS_ECN_ECT1): 271 tp->t_rcep = 0b011; 272 break; 273 case (IPTOS_ECN_CE): 274 tp->t_rcep = 0b110; 275 break; 276 } 277 break; 278 } 279 } 280 } 281 282 /* 283 * TCP ECN processing. 284 */ 285 int 286 tcp_ecn_input_segment(struct tcpcb *tp, uint16_t thflags, int tlen, int pkts, int iptos) 287 { 288 int delta_cep = 0; 289 290 switch (iptos & IPTOS_ECN_MASK) { 291 case IPTOS_ECN_CE: 292 TCPSTAT_INC(tcps_ecn_rcvce); 293 break; 294 case IPTOS_ECN_ECT0: 295 TCPSTAT_INC(tcps_ecn_rcvect0); 296 break; 297 case IPTOS_ECN_ECT1: 298 TCPSTAT_INC(tcps_ecn_rcvect1); 299 break; 300 } 301 302 if (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT)) { 303 if (tp->t_flags2 & TF2_ACE_PERMIT) { 304 if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) 305 tp->t_rcep += 1; 306 if (tp->t_flags2 & TF2_ECN_PERMIT) { 307 delta_cep = (tcp_ecn_get_ace(thflags) + 8 - 308 (tp->t_scep & 7)) & 7; 309 if (delta_cep < pkts) 310 delta_cep = pkts - 311 ((pkts - delta_cep) & 7); 312 tp->t_scep += delta_cep; 313 } else { 314 /* 315 * process the final ACK of the 3WHS 316 * see table 3 in draft-ietf-tcpm-accurate-ecn 317 */ 318 switch (tcp_ecn_get_ace(thflags)) { 319 case 0b010: 320 /* nonECT SYN or SYN,ACK */ 321 /* Fallthrough */ 322 case 0b011: 323 /* ECT1 SYN or SYN,ACK */ 324 /* Fallthrough */ 325 case 0b100: 326 /* ECT0 SYN or SYN,ACK */ 327 tp->t_scep = 5; 328 break; 329 case 0b110: 330 /* CE SYN or SYN,ACK */ 331 tp->t_scep = 6; 332 tp->snd_cwnd = 2 * tcp_maxseg(tp); 333 break; 334 default: 335 /* mangled AccECN handshake */ 336 tp->t_scep = 5; 337 break; 338 } 339 tp->t_flags2 |= TF2_ECN_PERMIT; 340 } 341 } else { 342 /* RFC3168 ECN handling */ 343 if ((thflags & (TH_SYN | TH_ECE)) == TH_ECE) { 344 delta_cep = 1; 345 tp->t_scep++; 346 } 347 if (thflags & TH_CWR) { 348 tp->t_flags2 &= ~TF2_ECN_SND_ECE; 349 tp->t_flags |= TF_ACKNOW; 350 } 351 if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) 352 tp->t_flags2 |= TF2_ECN_SND_ECE; 353 } 354 355 /* Process a packet differently from RFC3168. */ 356 cc_ecnpkt_handler_flags(tp, thflags, iptos); 357 } 358 359 return delta_cep; 360 } 361 362 /* 363 * Send ECN setup <SYN> packet header flags 364 */ 365 uint16_t 366 tcp_ecn_output_syn_sent(struct tcpcb *tp) 367 { 368 uint16_t thflags = 0; 369 370 if (V_tcp_do_ecn == 0) 371 return thflags; 372 if (V_tcp_do_ecn == 1) { 373 /* Send a RFC3168 ECN setup <SYN> packet */ 374 if (tp->t_rxtshift >= 1) { 375 if (tp->t_rxtshift <= V_tcp_ecn_maxretries) 376 thflags = TH_ECE|TH_CWR; 377 } else 378 thflags = TH_ECE|TH_CWR; 379 } else 380 if (V_tcp_do_ecn == 3) { 381 /* Send an Accurate ECN setup <SYN> packet */ 382 if (tp->t_rxtshift >= 1) { 383 if (tp->t_rxtshift <= V_tcp_ecn_maxretries) 384 thflags = TH_ECE|TH_CWR|TH_AE; 385 } else 386 thflags = TH_ECE|TH_CWR|TH_AE; 387 } 388 389 return thflags; 390 } 391 392 /* 393 * output processing of ECN feature 394 * returning IP ECN header codepoint 395 */ 396 int 397 tcp_ecn_output_established(struct tcpcb *tp, uint16_t *thflags, int len, bool rxmit) 398 { 399 int ipecn = IPTOS_ECN_NOTECT; 400 bool newdata; 401 402 /* 403 * If the peer has ECN, mark data packets with 404 * ECN capable transmission (ECT). 405 * Ignore pure control packets, retransmissions 406 * and window probes. 407 */ 408 newdata = (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) && 409 !rxmit && 410 !((tp->t_flags & TF_FORCEDATA) && len == 1)); 411 /* RFC3168 ECN marking, only new data segments */ 412 if (newdata) { 413 if (tp->t_flags2 & TF2_ECN_USE_ECT1) { 414 ipecn = IPTOS_ECN_ECT1; 415 TCPSTAT_INC(tcps_ecn_sndect1); 416 } else { 417 ipecn = IPTOS_ECN_ECT0; 418 TCPSTAT_INC(tcps_ecn_sndect0); 419 } 420 } 421 /* 422 * Reply with proper ECN notifications. 423 */ 424 if (tp->t_flags2 & TF2_ACE_PERMIT) { 425 *thflags &= ~(TH_AE|TH_CWR|TH_ECE); 426 if (tp->t_rcep & 0x01) 427 *thflags |= TH_ECE; 428 if (tp->t_rcep & 0x02) 429 *thflags |= TH_CWR; 430 if (tp->t_rcep & 0x04) 431 *thflags |= TH_AE; 432 if (!(tp->t_flags2 & TF2_ECN_PERMIT)) { 433 /* 434 * here we process the final 435 * ACK of the 3WHS 436 */ 437 if (tp->t_rcep == 0b110) { 438 tp->t_rcep = 6; 439 } else { 440 tp->t_rcep = 5; 441 } 442 tp->t_flags2 |= TF2_ECN_PERMIT; 443 } 444 } else { 445 if (newdata && 446 (tp->t_flags2 & TF2_ECN_SND_CWR)) { 447 *thflags |= TH_CWR; 448 tp->t_flags2 &= ~TF2_ECN_SND_CWR; 449 } 450 if (tp->t_flags2 & TF2_ECN_SND_ECE) 451 *thflags |= TH_ECE; 452 } 453 454 return ipecn; 455 } 456 457 /* 458 * Set up the ECN related tcpcb fields from 459 * a syncache entry 460 */ 461 void 462 tcp_ecn_syncache_socket(struct tcpcb *tp, struct syncache *sc) 463 { 464 if (sc->sc_flags & SCF_ECN_MASK) { 465 switch (sc->sc_flags & SCF_ECN_MASK) { 466 case SCF_ECN: 467 tp->t_flags2 |= TF2_ECN_PERMIT; 468 break; 469 case SCF_ACE_N: 470 /* Fallthrough */ 471 case SCF_ACE_0: 472 /* Fallthrough */ 473 case SCF_ACE_1: 474 tp->t_flags2 |= TF2_ACE_PERMIT; 475 tp->t_scep = 5; 476 tp->t_rcep = 5; 477 break; 478 case SCF_ACE_CE: 479 tp->t_flags2 |= TF2_ACE_PERMIT; 480 tp->t_scep = 6; 481 tp->t_rcep = 6; 482 break; 483 /* undefined SCF codepoint */ 484 default: 485 break; 486 } 487 } 488 } 489 490 /* 491 * Process a <SYN> packets ECN information, and provide the 492 * syncache with the relevant information. 493 */ 494 int 495 tcp_ecn_syncache_add(uint16_t thflags, int iptos) 496 { 497 int scflags = 0; 498 499 switch (iptos & IPTOS_ECN_MASK) { 500 case IPTOS_ECN_CE: 501 TCPSTAT_INC(tcps_ecn_rcvce); 502 break; 503 case IPTOS_ECN_ECT0: 504 TCPSTAT_INC(tcps_ecn_rcvect0); 505 break; 506 case IPTOS_ECN_ECT1: 507 TCPSTAT_INC(tcps_ecn_rcvect1); 508 break; 509 } 510 511 switch (thflags & (TH_AE|TH_CWR|TH_ECE)) { 512 /* no ECN */ 513 case (0|0|0): 514 break; 515 /* legacy ECN */ 516 case (0|TH_CWR|TH_ECE): 517 scflags = SCF_ECN; 518 break; 519 /* Accurate ECN */ 520 case (TH_AE|TH_CWR|TH_ECE): 521 if ((V_tcp_do_ecn == 3) || 522 (V_tcp_do_ecn == 4)) { 523 switch (iptos & IPTOS_ECN_MASK) { 524 case IPTOS_ECN_CE: 525 scflags = SCF_ACE_CE; 526 break; 527 case IPTOS_ECN_ECT0: 528 scflags = SCF_ACE_0; 529 break; 530 case IPTOS_ECN_ECT1: 531 scflags = SCF_ACE_1; 532 break; 533 case IPTOS_ECN_NOTECT: 534 scflags = SCF_ACE_N; 535 break; 536 } 537 } else 538 scflags = SCF_ECN; 539 break; 540 /* Default Case (section 3.1.2) */ 541 default: 542 if ((V_tcp_do_ecn == 3) || 543 (V_tcp_do_ecn == 4)) { 544 switch (iptos & IPTOS_ECN_MASK) { 545 case IPTOS_ECN_CE: 546 scflags = SCF_ACE_CE; 547 break; 548 case IPTOS_ECN_ECT0: 549 scflags = SCF_ACE_0; 550 break; 551 case IPTOS_ECN_ECT1: 552 scflags = SCF_ACE_1; 553 break; 554 case IPTOS_ECN_NOTECT: 555 scflags = SCF_ACE_N; 556 break; 557 } 558 } 559 break; 560 } 561 return scflags; 562 } 563 564 /* 565 * Set up the ECN information for the <SYN,ACK> from 566 * syncache information. 567 */ 568 uint16_t 569 tcp_ecn_syncache_respond(uint16_t thflags, struct syncache *sc) 570 { 571 if ((thflags & TH_SYN) && 572 (sc->sc_flags & SCF_ECN_MASK)) { 573 switch (sc->sc_flags & SCF_ECN_MASK) { 574 case SCF_ECN: 575 thflags |= (0 | 0 | TH_ECE); 576 TCPSTAT_INC(tcps_ecn_shs); 577 break; 578 case SCF_ACE_N: 579 thflags |= (0 | TH_CWR | 0); 580 TCPSTAT_INC(tcps_ecn_shs); 581 TCPSTAT_INC(tcps_ace_nect); 582 break; 583 case SCF_ACE_0: 584 thflags |= (TH_AE | 0 | 0); 585 TCPSTAT_INC(tcps_ecn_shs); 586 TCPSTAT_INC(tcps_ace_ect0); 587 break; 588 case SCF_ACE_1: 589 thflags |= (0 | TH_ECE | TH_CWR); 590 TCPSTAT_INC(tcps_ecn_shs); 591 TCPSTAT_INC(tcps_ace_ect1); 592 break; 593 case SCF_ACE_CE: 594 thflags |= (TH_AE | TH_CWR | 0); 595 TCPSTAT_INC(tcps_ecn_shs); 596 TCPSTAT_INC(tcps_ace_ce); 597 break; 598 /* undefined SCF codepoint */ 599 default: 600 break; 601 } 602 } 603 return thflags; 604 } 605 606 int 607 tcp_ecn_get_ace(uint16_t thflags) 608 { 609 int ace = 0; 610 611 if (thflags & TH_ECE) 612 ace += 1; 613 if (thflags & TH_CWR) 614 ace += 2; 615 if (thflags & TH_AE) 616 ace += 4; 617 return ace; 618 } 619