1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 5 * The Regents of the University of California. All rights reserved. 6 * Copyright (c) 2007-2008,2010 7 * Swinburne University of Technology, Melbourne, Australia. 8 * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org> 9 * Copyright (c) 2010 The FreeBSD Foundation 10 * Copyright (c) 2010-2011 Juniper Networks, Inc. 11 * Copyright (c) 2019 Richard Scheffenegger <srichard@netapp.com> 12 * All rights reserved. 13 * 14 * Portions of this software were developed at the Centre for Advanced Internet 15 * Architectures, Swinburne University of Technology, by Lawrence Stewart, 16 * James Healy and David Hayes, made possible in part by a grant from the Cisco 17 * University Research Program Fund at Community Foundation Silicon Valley. 18 * 19 * Portions of this software were developed at the Centre for Advanced 20 * Internet Architectures, Swinburne University of Technology, Melbourne, 21 * Australia by David Hayes under sponsorship from the FreeBSD Foundation. 22 * 23 * Portions of this software were developed by Robert N. M. Watson under 24 * contract to Juniper Networks, Inc. 25 * 26 * Redistribution and use in source and binary forms, with or without 27 * modification, are permitted provided that the following conditions 28 * are met: 29 * 1. Redistributions of source code must retain the above copyright 30 * notice, this list of conditions and the following disclaimer. 31 * 2. Redistributions in binary form must reproduce the above copyright 32 * notice, this list of conditions and the following disclaimer in the 33 * documentation and/or other materials provided with the distribution. 34 * 3. Neither the name of the University nor the names of its contributors 35 * may be used to endorse or promote products derived from this software 36 * without specific prior written permission. 
37 * 38 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 39 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 40 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 41 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 42 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 43 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 44 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 45 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 46 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 47 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 48 * SUCH DAMAGE. 49 * 50 * @(#)tcp_ecn.c 8.12 (Berkeley) 5/24/95 51 */ 52 53 /* 54 * Utility functions to deal with Explicit Congestion Notification in TCP 55 * implementing the essential parts of the Accurate ECN extension 56 * https://tools.ietf.org/html/draft-ietf-tcpm-accurate-ecn-09 57 */ 58 59 #include <sys/cdefs.h> 60 __FBSDID("$FreeBSD$"); 61 62 #include "opt_inet.h" 63 #include "opt_inet6.h" 64 #include "opt_tcpdebug.h" 65 66 #include <sys/param.h> 67 #include <sys/systm.h> 68 #include <sys/kernel.h> 69 #include <sys/sysctl.h> 70 #include <sys/malloc.h> 71 #include <sys/mbuf.h> 72 #include <sys/socket.h> 73 #include <sys/socketvar.h> 74 75 #include <machine/cpu.h> 76 77 #include <vm/uma.h> 78 79 #include <net/if.h> 80 #include <net/if_var.h> 81 #include <net/route.h> 82 #include <net/vnet.h> 83 84 #include <netinet/in.h> 85 #include <netinet/in_systm.h> 86 #include <netinet/ip.h> 87 #include <netinet/in_var.h> 88 #include <netinet/in_pcb.h> 89 #include <netinet/ip_var.h> 90 #include <netinet/ip6.h> 91 #include <netinet/icmp6.h> 92 #include <netinet6/nd6.h> 93 #include <netinet6/ip6_var.h> 94 #include <netinet6/in6_pcb.h> 95 #include <netinet/tcp.h> 96 #include <netinet/tcp_fsm.h> 97 
#include <netinet/tcp_seq.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_syncache.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_ecn.h>

/* Root of the net.inet.tcp.ecn sysctl subtree. */
static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, ecn,
    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "TCP ECN");

/*
 * ECN operating mode.  As used by the functions in this file:
 *   0     ECN processing is skipped,
 *   1, 2  RFC3168 handling of received handshake segments,
 *   3, 4  Accurate ECN (AccECN) handling of received handshake segments.
 * Only modes 1 and 3 make tcp_ecn_output_syn_sent() request ECN on an
 * outgoing <SYN>.
 */
VNET_DEFINE(int, tcp_do_ecn) = 2;
SYSCTL_INT(_net_inet_tcp_ecn, OID_AUTO, enable,
    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_ecn), 0,
    "TCP ECN support");

/*
 * Highest retransmit shift (t_rxtshift) for which a retransmitted <SYN>
 * still carries the ECN setup flags.
 */
VNET_DEFINE(int, tcp_ecn_maxretries) = 1;
SYSCTL_INT(_net_inet_tcp_ecn, OID_AUTO, maxretries,
    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_ecn_maxretries), 0,
    "Max retries before giving up on ECN");

/*
 * Process incoming SYN,ACK packet
 *
 * tp:      connection control block to update.
 * thflags: TCP header flags of the received segment.
 * iptos:   IP TOS / traffic class byte of the received segment.
 *
 * In RFC3168 mode a segment with ECE set and CWR clear enables classic
 * ECN (TF2_ECN_PERMIT).  In AccECN mode the AE/CWR/ECE combination is
 * decoded: it either falls back to classic ECN, enables AccECN
 * (TF2_ACE_PERMIT) and seeds t_scep, or clears both permit flags.
 * In AccECN mode t_rcep is additionally seeded from the IP ECN field of
 * the received segment, whatever the flag combination was.
 */
void
tcp_ecn_input_syn_sent(struct tcpcb *tp, uint16_t thflags, int iptos)
{

	if (V_tcp_do_ecn == 0)
		return;
	if ((V_tcp_do_ecn == 1) ||
	    (V_tcp_do_ecn == 2)) {
		/* RFC3168 ECN handling */
		if ((thflags & (TH_CWR | TH_ECE)) == (0 | TH_ECE)) {
			tp->t_flags2 |= TF2_ECN_PERMIT;
			tp->t_flags2 &= ~TF2_ACE_PERMIT;
			TCPSTAT_INC(tcps_ecn_shs);
		}
	} else if ((V_tcp_do_ecn == 3) ||
	    (V_tcp_do_ecn == 4)) {
		/*
		 * Decoding Accurate ECN according to table in
		 * section 3.1.1.
		 *
		 * On the SYN,ACK, process the AccECN flags
		 * indicating the state the SYN was delivered.
		 * Reactions to Path ECN mangling can come here.
		 */
		switch (thflags & (TH_AE | TH_CWR | TH_ECE)) {
		/* RFC3168 SYN */
		case (0|0|TH_ECE):
			tp->t_flags2 |= TF2_ECN_PERMIT;
			tp->t_flags2 &= ~TF2_ACE_PERMIT;
			TCPSTAT_INC(tcps_ecn_shs);
			break;
		/* non-ECT SYN */
		case (0|TH_CWR|0):
			tp->t_flags2 |= TF2_ACE_PERMIT;
			tp->t_flags2 &= ~TF2_ECN_PERMIT;
			tp->t_scep = 5;
			TCPSTAT_INC(tcps_ecn_shs);
			TCPSTAT_INC(tcps_ace_nect);
			break;
		/* ECT0 SYN */
		case (TH_AE|0|0):
			tp->t_flags2 |= TF2_ACE_PERMIT;
			tp->t_flags2 &= ~TF2_ECN_PERMIT;
			tp->t_scep = 5;
			TCPSTAT_INC(tcps_ecn_shs);
			TCPSTAT_INC(tcps_ace_ect0);
			break;
		/* ECT1 SYN */
		case (0|TH_CWR|TH_ECE):
			tp->t_flags2 |= TF2_ACE_PERMIT;
			tp->t_flags2 &= ~TF2_ECN_PERMIT;
			tp->t_scep = 5;
			TCPSTAT_INC(tcps_ecn_shs);
			TCPSTAT_INC(tcps_ace_ect1);
			break;
		/* CE SYN */
		case (TH_AE|TH_CWR|0):
			tp->t_flags2 |= TF2_ACE_PERMIT;
			tp->t_flags2 &= ~TF2_ECN_PERMIT;
			tp->t_scep = 6;
			/*
			 * reduce the IW to 2 MSS (to
			 * account for delayed acks) if
			 * the SYN,ACK was CE marked
			 */
			tp->snd_cwnd = 2 * tcp_maxseg(tp);
			TCPSTAT_INC(tcps_ecn_shs);
			/*
			 * Account this handshake in the CE counter,
			 * the same one tcp_ecn_syncache_respond()
			 * uses for this codepoint, rather than in
			 * the non-ECT counter.
			 */
			TCPSTAT_INC(tcps_ace_ce);
			break;
		default:
			tp->t_flags2 &= ~(TF2_ECN_PERMIT | TF2_ACE_PERMIT);
			break;
		}
		/*
		 * Set the AccECN Codepoints on
		 * the outgoing <ACK> to the ECN
		 * state of the <SYN,ACK>
		 * according to table 3 in the
		 * AccECN draft
		 */
		switch (iptos & IPTOS_ECN_MASK) {
		case (IPTOS_ECN_NOTECT):
			tp->t_rcep = 0b010;
			break;
		case (IPTOS_ECN_ECT0):
			tp->t_rcep = 0b100;
			break;
		case (IPTOS_ECN_ECT1):
			tp->t_rcep = 0b011;
			break;
		case (IPTOS_ECN_CE):
			tp->t_rcep = 0b110;
			break;
		}
	}
}

/*
 * Handle parallel SYN for ECN
 *
 * tp:      connection control block to update.
 * thflags: TCP header flags of the received segment; segments carrying
 *          TH_ACK are ignored here.
 * iptos:   IP TOS / traffic class byte of the received segment.
 *
 * In RFC3168 mode a <SYN> with both CWR and ECE set enables classic ECN
 * and arms TF2_ECN_SND_ECE.  In AccECN mode the same combination still
 * selects classic ECN, AE|CWR|ECE selects AccECN, and anything else
 * clears both permit flags.
 */
void
tcp_ecn_input_parallel_syn(struct tcpcb *tp, uint16_t thflags, int iptos)
{
	if (thflags & TH_ACK)
		return;
	if (V_tcp_do_ecn == 0)
		return;
	if ((V_tcp_do_ecn == 1) ||
	    (V_tcp_do_ecn == 2)) {
		/* RFC3168 ECN handling */
		if ((thflags & (TH_CWR | TH_ECE)) == (TH_CWR | TH_ECE)) {
			tp->t_flags2 |= TF2_ECN_PERMIT;
			tp->t_flags2 &= ~TF2_ACE_PERMIT;
			tp->t_flags2 |= TF2_ECN_SND_ECE;
			TCPSTAT_INC(tcps_ecn_shs);
		}
	} else if ((V_tcp_do_ecn == 3) ||
	    (V_tcp_do_ecn == 4)) {
		/* AccECN handling */
		switch (thflags & (TH_AE | TH_CWR | TH_ECE)) {
		default:
		case (0|0|0):
			tp->t_flags2 &= ~(TF2_ECN_PERMIT | TF2_ACE_PERMIT);
			break;
		case (0|TH_CWR|TH_ECE):
			tp->t_flags2 |= TF2_ECN_PERMIT;
			tp->t_flags2 &= ~TF2_ACE_PERMIT;
			tp->t_flags2 |= TF2_ECN_SND_ECE;
			TCPSTAT_INC(tcps_ecn_shs);
			break;
		case (TH_AE|TH_CWR|TH_ECE):
			tp->t_flags2 |= TF2_ACE_PERMIT;
			tp->t_flags2 &= ~TF2_ECN_PERMIT;
			TCPSTAT_INC(tcps_ecn_shs);
			/*
			 * Set the AccECN Codepoints on
			 * the outgoing <ACK> to the ECN
			 * state of the received <SYN>
			 * (segments with ACK set never
			 * get here) according to table 3
			 * in the AccECN draft
			 */
			switch (iptos & IPTOS_ECN_MASK) {
			case (IPTOS_ECN_NOTECT):
				tp->t_rcep = 0b010;
				break;
			case (IPTOS_ECN_ECT0):
				tp->t_rcep = 0b100;
				break;
			case (IPTOS_ECN_ECT1):
				tp->t_rcep = 0b011;
				break;
			case (IPTOS_ECN_CE):
				tp->t_rcep = 0b110;
				break;
			}
			break;
		}
	}
}

/*
 * TCP ECN processing.
285 */ 286 int 287 tcp_ecn_input_segment(struct tcpcb *tp, uint16_t thflags, int tlen, int pkts, int iptos) 288 { 289 int delta_cep = 0; 290 291 switch (iptos & IPTOS_ECN_MASK) { 292 case IPTOS_ECN_CE: 293 TCPSTAT_INC(tcps_ecn_rcvce); 294 break; 295 case IPTOS_ECN_ECT0: 296 TCPSTAT_INC(tcps_ecn_rcvect0); 297 break; 298 case IPTOS_ECN_ECT1: 299 TCPSTAT_INC(tcps_ecn_rcvect1); 300 break; 301 } 302 303 if (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT)) { 304 if (tp->t_flags2 & TF2_ACE_PERMIT) { 305 if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) 306 tp->t_rcep += 1; 307 if (tp->t_flags2 & TF2_ECN_PERMIT) { 308 delta_cep = (tcp_ecn_get_ace(thflags) + 8 - 309 (tp->t_scep & 7)) & 7; 310 if (delta_cep < pkts) 311 delta_cep = pkts - 312 ((pkts - delta_cep) & 7); 313 tp->t_scep += delta_cep; 314 } else { 315 /* 316 * process the final ACK of the 3WHS 317 * see table 3 in draft-ietf-tcpm-accurate-ecn 318 */ 319 switch (tcp_ecn_get_ace(thflags)) { 320 case 0b010: 321 /* nonECT SYN or SYN,ACK */ 322 /* Fallthrough */ 323 case 0b011: 324 /* ECT1 SYN or SYN,ACK */ 325 /* Fallthrough */ 326 case 0b100: 327 /* ECT0 SYN or SYN,ACK */ 328 tp->t_scep = 5; 329 break; 330 case 0b110: 331 /* CE SYN or SYN,ACK */ 332 tp->t_scep = 6; 333 tp->snd_cwnd = 2 * tcp_maxseg(tp); 334 break; 335 default: 336 /* mangled AccECN handshake */ 337 tp->t_scep = 5; 338 break; 339 } 340 tp->t_flags2 |= TF2_ECN_PERMIT; 341 } 342 } else { 343 /* RFC3168 ECN handling */ 344 if ((thflags & (TH_SYN | TH_ECE)) == TH_ECE) { 345 delta_cep = 1; 346 tp->t_scep++; 347 } 348 if (thflags & TH_CWR) { 349 tp->t_flags2 &= ~TF2_ECN_SND_ECE; 350 tp->t_flags |= TF_ACKNOW; 351 } 352 if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) 353 tp->t_flags2 |= TF2_ECN_SND_ECE; 354 } 355 356 /* Process a packet differently from RFC3168. 
*/ 357 cc_ecnpkt_handler_flags(tp, thflags, iptos); 358 } 359 360 return delta_cep; 361 } 362 363 /* 364 * Send ECN setup <SYN> packet header flags 365 */ 366 uint16_t 367 tcp_ecn_output_syn_sent(struct tcpcb *tp) 368 { 369 uint16_t thflags = 0; 370 371 if (V_tcp_do_ecn == 0) 372 return thflags; 373 if (V_tcp_do_ecn == 1) { 374 /* Send a RFC3168 ECN setup <SYN> packet */ 375 if (tp->t_rxtshift >= 1) { 376 if (tp->t_rxtshift <= V_tcp_ecn_maxretries) 377 thflags = TH_ECE|TH_CWR; 378 } else 379 thflags = TH_ECE|TH_CWR; 380 } else 381 if (V_tcp_do_ecn == 3) { 382 /* Send an Accurate ECN setup <SYN> packet */ 383 if (tp->t_rxtshift >= 1) { 384 if (tp->t_rxtshift <= V_tcp_ecn_maxretries) 385 thflags = TH_ECE|TH_CWR|TH_AE; 386 } else 387 thflags = TH_ECE|TH_CWR|TH_AE; 388 } 389 390 return thflags; 391 } 392 393 /* 394 * output processing of ECN feature 395 * returning IP ECN header codepoint 396 */ 397 int 398 tcp_ecn_output_established(struct tcpcb *tp, uint16_t *thflags, int len, bool rxmit) 399 { 400 int ipecn = IPTOS_ECN_NOTECT; 401 bool newdata; 402 403 /* 404 * If the peer has ECN, mark data packets with 405 * ECN capable transmission (ECT). 406 * Ignore pure control packets, retransmissions 407 * and window probes. 408 */ 409 newdata = (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) && 410 !rxmit && 411 !((tp->t_flags & TF_FORCEDATA) && len == 1)); 412 /* RFC3168 ECN marking, only new data segments */ 413 if (newdata) { 414 if (tp->t_flags2 & TF2_ECN_USE_ECT1) { 415 ipecn = IPTOS_ECN_ECT1; 416 TCPSTAT_INC(tcps_ecn_sndect1); 417 } else { 418 ipecn = IPTOS_ECN_ECT0; 419 TCPSTAT_INC(tcps_ecn_sndect0); 420 } 421 } 422 /* 423 * Reply with proper ECN notifications. 
424 */ 425 if (tp->t_flags2 & TF2_ACE_PERMIT) { 426 *thflags &= ~(TH_AE|TH_CWR|TH_ECE); 427 if (tp->t_rcep & 0x01) 428 *thflags |= TH_ECE; 429 if (tp->t_rcep & 0x02) 430 *thflags |= TH_CWR; 431 if (tp->t_rcep & 0x04) 432 *thflags |= TH_AE; 433 if (!(tp->t_flags2 & TF2_ECN_PERMIT)) { 434 /* 435 * here we process the final 436 * ACK of the 3WHS 437 */ 438 if (tp->t_rcep == 0b110) { 439 tp->t_rcep = 6; 440 } else { 441 tp->t_rcep = 5; 442 } 443 tp->t_flags2 |= TF2_ECN_PERMIT; 444 } 445 } else { 446 if (newdata && 447 (tp->t_flags2 & TF2_ECN_SND_CWR)) { 448 *thflags |= TH_CWR; 449 tp->t_flags2 &= ~TF2_ECN_SND_CWR; 450 } 451 if (tp->t_flags2 & TF2_ECN_SND_ECE) 452 *thflags |= TH_ECE; 453 } 454 455 return ipecn; 456 } 457 458 /* 459 * Set up the ECN related tcpcb fields from 460 * a syncache entry 461 */ 462 void 463 tcp_ecn_syncache_socket(struct tcpcb *tp, struct syncache *sc) 464 { 465 if (sc->sc_flags & SCF_ECN_MASK) { 466 switch (sc->sc_flags & SCF_ECN_MASK) { 467 case SCF_ECN: 468 tp->t_flags2 |= TF2_ECN_PERMIT; 469 break; 470 case SCF_ACE_N: 471 /* Fallthrough */ 472 case SCF_ACE_0: 473 /* Fallthrough */ 474 case SCF_ACE_1: 475 tp->t_flags2 |= TF2_ACE_PERMIT; 476 tp->t_scep = 5; 477 tp->t_rcep = 5; 478 break; 479 case SCF_ACE_CE: 480 tp->t_flags2 |= TF2_ACE_PERMIT; 481 tp->t_scep = 6; 482 tp->t_rcep = 6; 483 break; 484 /* undefined SCF codepoint */ 485 default: 486 break; 487 } 488 } 489 } 490 491 /* 492 * Process a <SYN> packets ECN information, and provide the 493 * syncache with the relevant information. 
494 */ 495 int 496 tcp_ecn_syncache_add(uint16_t thflags, int iptos) 497 { 498 int scflags = 0; 499 500 switch (iptos & IPTOS_ECN_MASK) { 501 case IPTOS_ECN_CE: 502 TCPSTAT_INC(tcps_ecn_rcvce); 503 break; 504 case IPTOS_ECN_ECT0: 505 TCPSTAT_INC(tcps_ecn_rcvect0); 506 break; 507 case IPTOS_ECN_ECT1: 508 TCPSTAT_INC(tcps_ecn_rcvect1); 509 break; 510 } 511 512 switch (thflags & (TH_AE|TH_CWR|TH_ECE)) { 513 /* no ECN */ 514 case (0|0|0): 515 break; 516 /* legacy ECN */ 517 case (0|TH_CWR|TH_ECE): 518 scflags = SCF_ECN; 519 break; 520 /* Accurate ECN */ 521 case (TH_AE|TH_CWR|TH_ECE): 522 if ((V_tcp_do_ecn == 3) || 523 (V_tcp_do_ecn == 4)) { 524 switch (iptos & IPTOS_ECN_MASK) { 525 case IPTOS_ECN_CE: 526 scflags = SCF_ACE_CE; 527 break; 528 case IPTOS_ECN_ECT0: 529 scflags = SCF_ACE_0; 530 break; 531 case IPTOS_ECN_ECT1: 532 scflags = SCF_ACE_1; 533 break; 534 case IPTOS_ECN_NOTECT: 535 scflags = SCF_ACE_N; 536 break; 537 } 538 } else 539 scflags = SCF_ECN; 540 break; 541 /* Default Case (section 3.1.2) */ 542 default: 543 if ((V_tcp_do_ecn == 3) || 544 (V_tcp_do_ecn == 4)) { 545 switch (iptos & IPTOS_ECN_MASK) { 546 case IPTOS_ECN_CE: 547 scflags = SCF_ACE_CE; 548 break; 549 case IPTOS_ECN_ECT0: 550 scflags = SCF_ACE_0; 551 break; 552 case IPTOS_ECN_ECT1: 553 scflags = SCF_ACE_1; 554 break; 555 case IPTOS_ECN_NOTECT: 556 scflags = SCF_ACE_N; 557 break; 558 } 559 } 560 break; 561 } 562 return scflags; 563 } 564 565 /* 566 * Set up the ECN information for the <SYN,ACK> from 567 * syncache information. 
568 */ 569 uint16_t 570 tcp_ecn_syncache_respond(uint16_t thflags, struct syncache *sc) 571 { 572 if ((thflags & TH_SYN) && 573 (sc->sc_flags & SCF_ECN_MASK)) { 574 switch (sc->sc_flags & SCF_ECN_MASK) { 575 case SCF_ECN: 576 thflags |= (0 | 0 | TH_ECE); 577 TCPSTAT_INC(tcps_ecn_shs); 578 break; 579 case SCF_ACE_N: 580 thflags |= (0 | TH_CWR | 0); 581 TCPSTAT_INC(tcps_ecn_shs); 582 TCPSTAT_INC(tcps_ace_nect); 583 break; 584 case SCF_ACE_0: 585 thflags |= (TH_AE | 0 | 0); 586 TCPSTAT_INC(tcps_ecn_shs); 587 TCPSTAT_INC(tcps_ace_ect0); 588 break; 589 case SCF_ACE_1: 590 thflags |= (0 | TH_ECE | TH_CWR); 591 TCPSTAT_INC(tcps_ecn_shs); 592 TCPSTAT_INC(tcps_ace_ect1); 593 break; 594 case SCF_ACE_CE: 595 thflags |= (TH_AE | TH_CWR | 0); 596 TCPSTAT_INC(tcps_ecn_shs); 597 TCPSTAT_INC(tcps_ace_ce); 598 break; 599 /* undefined SCF codepoint */ 600 default: 601 break; 602 } 603 } 604 return thflags; 605 } 606 607 int 608 tcp_ecn_get_ace(uint16_t thflags) 609 { 610 int ace = 0; 611 612 if (thflags & TH_ECE) 613 ace += 1; 614 if (thflags & TH_CWR) 615 ace += 2; 616 if (thflags & TH_AE) 617 ace += 4; 618 return ace; 619 } 620