1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 5 * The Regents of the University of California. All rights reserved. 6 * Copyright (c) 2007-2008,2010 7 * Swinburne University of Technology, Melbourne, Australia. 8 * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org> 9 * Copyright (c) 2010 The FreeBSD Foundation 10 * Copyright (c) 2010-2011 Juniper Networks, Inc. 11 * Copyright (c) 2019 Richard Scheffenegger <srichard@netapp.com> 12 * All rights reserved. 13 * 14 * Portions of this software were developed at the Centre for Advanced Internet 15 * Architectures, Swinburne University of Technology, by Lawrence Stewart, 16 * James Healy and David Hayes, made possible in part by a grant from the Cisco 17 * University Research Program Fund at Community Foundation Silicon Valley. 18 * 19 * Portions of this software were developed at the Centre for Advanced 20 * Internet Architectures, Swinburne University of Technology, Melbourne, 21 * Australia by David Hayes under sponsorship from the FreeBSD Foundation. 22 * 23 * Portions of this software were developed by Robert N. M. Watson under 24 * contract to Juniper Networks, Inc. 25 * 26 * Redistribution and use in source and binary forms, with or without 27 * modification, are permitted provided that the following conditions 28 * are met: 29 * 1. Redistributions of source code must retain the above copyright 30 * notice, this list of conditions and the following disclaimer. 31 * 2. Redistributions in binary form must reproduce the above copyright 32 * notice, this list of conditions and the following disclaimer in the 33 * documentation and/or other materials provided with the distribution. 34 * 3. Neither the name of the University nor the names of its contributors 35 * may be used to endorse or promote products derived from this software 36 * without specific prior written permission. 37 * 38 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 39 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 40 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 41 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 42 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 43 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 44 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 45 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 46 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 47 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 48 * SUCH DAMAGE. 49 */ 50 51 /* 52 * Utility functions to deal with Explicit Congestion Notification in TCP 53 * implementing the essential parts of the Accurate ECN extension 54 * https://tools.ietf.org/html/draft-ietf-tcpm-accurate-ecn-09 55 */ 56 57 #include <sys/cdefs.h> 58 #include "opt_inet.h" 59 #include "opt_inet6.h" 60 61 #include <sys/param.h> 62 #include <sys/systm.h> 63 #include <sys/kernel.h> 64 #include <sys/sysctl.h> 65 #include <sys/malloc.h> 66 #include <sys/mbuf.h> 67 #include <sys/socket.h> 68 #include <sys/socketvar.h> 69 70 #include <machine/cpu.h> 71 72 #include <vm/uma.h> 73 74 #include <net/if.h> 75 #include <net/if_var.h> 76 #include <net/route.h> 77 #include <net/vnet.h> 78 79 #include <netinet/in.h> 80 #include <netinet/in_systm.h> 81 #include <netinet/ip.h> 82 #include <netinet/in_var.h> 83 #include <netinet/in_pcb.h> 84 #include <netinet/ip_var.h> 85 #include <netinet/ip6.h> 86 #include <netinet/icmp6.h> 87 #include <netinet6/nd6.h> 88 #include <netinet6/ip6_var.h> 89 #include <netinet6/in6_pcb.h> 90 #include <netinet/tcp.h> 91 #include <netinet/tcp_fsm.h> 92 #include <netinet/tcp_seq.h> 93 #include <netinet/tcp_var.h> 94 #include <netinet/tcp_syncache.h> 95 #include <netinet/tcp_timer.h> 96 #include <netinet/tcpip.h> 97 #include <netinet/tcp_ecn.h> 98 99 static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, ecn, 100 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 101 "TCP ECN"); 102 103 VNET_DEFINE(int, tcp_do_ecn) = 2; 104 SYSCTL_INT(_net_inet_tcp_ecn, OID_AUTO, enable, 105 CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_ecn), 0, 106 "TCP ECN support"); 107 108 VNET_DEFINE(int, tcp_ecn_maxretries) = 1; 109 SYSCTL_INT(_net_inet_tcp_ecn, OID_AUTO, maxretries, 110 CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_ecn_maxretries), 0, 111 "Max retries before giving up on ECN"); 112 113 /* 114 * Process incoming SYN,ACK packet 115 */ 116 void 117 tcp_ecn_input_syn_sent(struct tcpcb *tp, uint16_t thflags, int iptos) 118 { 119 120 if (V_tcp_do_ecn == 0) 121 return; 122 if ((V_tcp_do_ecn == 1) || 123 (V_tcp_do_ecn == 2)) { 124 /* RFC3168 ECN handling */ 125 if ((thflags & (TH_CWR | TH_ECE)) == (0 | TH_ECE)) { 126 tp->t_flags2 |= TF2_ECN_PERMIT; 127 tp->t_flags2 &= ~TF2_ACE_PERMIT; 128 TCPSTAT_INC(tcps_ecn_shs); 129 } 130 } else 131 /* decoding Accurate ECN according to table in section 3.1.1 */ 132 if ((V_tcp_do_ecn == 3) || 133 (V_tcp_do_ecn == 4)) { 134 /* 135 * on the SYN,ACK, process the AccECN 136 * flags indicating the state the SYN 137 * was delivered. 138 * Reactions to Path ECN mangling can 139 * come here. 140 */ 141 switch (thflags & (TH_AE | TH_CWR | TH_ECE)) { 142 /* RFC3168 SYN */ 143 case (0|0|TH_ECE): 144 tp->t_flags2 |= TF2_ECN_PERMIT; 145 tp->t_flags2 &= ~TF2_ACE_PERMIT; 146 TCPSTAT_INC(tcps_ecn_shs); 147 break; 148 /* non-ECT SYN */ 149 case (0|TH_CWR|0): 150 tp->t_flags2 |= TF2_ACE_PERMIT; 151 tp->t_flags2 &= ~TF2_ECN_PERMIT; 152 tp->t_scep = 5; 153 TCPSTAT_INC(tcps_ecn_shs); 154 TCPSTAT_INC(tcps_ace_nect); 155 break; 156 /* ECT0 SYN */ 157 case (TH_AE|0|0): 158 tp->t_flags2 |= TF2_ACE_PERMIT; 159 tp->t_flags2 &= ~TF2_ECN_PERMIT; 160 tp->t_scep = 5; 161 TCPSTAT_INC(tcps_ecn_shs); 162 TCPSTAT_INC(tcps_ace_ect0); 163 break; 164 /* ECT1 SYN */ 165 case (0|TH_CWR|TH_ECE): 166 tp->t_flags2 |= TF2_ACE_PERMIT; 167 tp->t_flags2 &= ~TF2_ECN_PERMIT; 168 tp->t_scep = 5; 169 TCPSTAT_INC(tcps_ecn_shs); 170 TCPSTAT_INC(tcps_ace_ect1); 171 break; 172 /* CE SYN */ 173 case (TH_AE|TH_CWR|0): 174 tp->t_flags2 |= TF2_ACE_PERMIT; 175 tp->t_flags2 &= ~TF2_ECN_PERMIT; 176 tp->t_scep = 6; 177 /* 178 * reduce the IW to 2 MSS (to 179 * account for delayed acks) if 180 * the SYN,ACK was CE marked 181 */ 182 tp->snd_cwnd = 2 * tcp_maxseg(tp); 183 TCPSTAT_INC(tcps_ecn_shs); 184 TCPSTAT_INC(tcps_ace_nect); 185 break; 186 default: 187 tp->t_flags2 &= ~(TF2_ECN_PERMIT | TF2_ACE_PERMIT); 188 break; 189 } 190 /* 191 * Set the AccECN Codepoints on 192 * the outgoing <ACK> to the ECN 193 * state of the <SYN,ACK> 194 * according to table 3 in the 195 * AccECN draft 196 */ 197 switch (iptos & IPTOS_ECN_MASK) { 198 case (IPTOS_ECN_NOTECT): 199 tp->t_rcep = 0b010; 200 break; 201 case (IPTOS_ECN_ECT0): 202 tp->t_rcep = 0b100; 203 break; 204 case (IPTOS_ECN_ECT1): 205 tp->t_rcep = 0b011; 206 break; 207 case (IPTOS_ECN_CE): 208 tp->t_rcep = 0b110; 209 break; 210 } 211 } 212 } 213 214 /* 215 * Handle parallel SYN for ECN 216 */ 217 void 218 tcp_ecn_input_parallel_syn(struct tcpcb *tp, uint16_t thflags, int iptos) 219 { 220 if (thflags & TH_ACK) 221 return; 222 if (V_tcp_do_ecn == 0) 223 return; 224 if ((V_tcp_do_ecn == 1) || 225 (V_tcp_do_ecn == 2)) { 226 /* RFC3168 ECN handling */ 227 if ((thflags & (TH_CWR | TH_ECE)) == (TH_CWR | TH_ECE)) { 228 tp->t_flags2 |= TF2_ECN_PERMIT; 229 tp->t_flags2 &= ~TF2_ACE_PERMIT; 230 tp->t_flags2 |= TF2_ECN_SND_ECE; 231 TCPSTAT_INC(tcps_ecn_shs); 232 } 233 } else 234 if ((V_tcp_do_ecn == 3) || 235 (V_tcp_do_ecn == 4)) { 236 /* AccECN handling */ 237 switch (thflags & (TH_AE | TH_CWR | TH_ECE)) { 238 default: 239 case (0|0|0): 240 tp->t_flags2 &= ~(TF2_ECN_PERMIT | TF2_ACE_PERMIT); 241 break; 242 case (0|TH_CWR|TH_ECE): 243 tp->t_flags2 |= TF2_ECN_PERMIT; 244 tp->t_flags2 &= ~TF2_ACE_PERMIT; 245 tp->t_flags2 |= TF2_ECN_SND_ECE; 246 TCPSTAT_INC(tcps_ecn_shs); 247 break; 248 case (TH_AE|TH_CWR|TH_ECE): 249 tp->t_flags2 |= TF2_ACE_PERMIT; 250 tp->t_flags2 &= ~TF2_ECN_PERMIT; 251 TCPSTAT_INC(tcps_ecn_shs); 252 /* 253 * Set the AccECN Codepoints on 254 * the outgoing <ACK> to the ECN 255 * state of the <SYN,ACK> 256 * according to table 3 in the 257 * AccECN draft 258 */ 259 switch (iptos & IPTOS_ECN_MASK) { 260 case (IPTOS_ECN_NOTECT): 261 tp->t_rcep = 0b010; 262 break; 263 case (IPTOS_ECN_ECT0): 264 tp->t_rcep = 0b100; 265 break; 266 case (IPTOS_ECN_ECT1): 267 tp->t_rcep = 0b011; 268 break; 269 case (IPTOS_ECN_CE): 270 tp->t_rcep = 0b110; 271 break; 272 } 273 break; 274 } 275 } 276 } 277 278 /* 279 * TCP ECN processing. 280 */ 281 int 282 tcp_ecn_input_segment(struct tcpcb *tp, uint16_t thflags, int tlen, int pkts, int iptos) 283 { 284 int delta_cep = 0; 285 286 switch (iptos & IPTOS_ECN_MASK) { 287 case IPTOS_ECN_CE: 288 TCPSTAT_INC(tcps_ecn_rcvce); 289 break; 290 case IPTOS_ECN_ECT0: 291 TCPSTAT_INC(tcps_ecn_rcvect0); 292 break; 293 case IPTOS_ECN_ECT1: 294 TCPSTAT_INC(tcps_ecn_rcvect1); 295 break; 296 } 297 298 if (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT)) { 299 if (tp->t_flags2 & TF2_ACE_PERMIT) { 300 if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) 301 tp->t_rcep += 1; 302 if (tp->t_flags2 & TF2_ECN_PERMIT) { 303 delta_cep = (tcp_ecn_get_ace(thflags) + 8 - 304 (tp->t_scep & 7)) & 7; 305 if (delta_cep < pkts) 306 delta_cep = pkts - 307 ((pkts - delta_cep) & 7); 308 tp->t_scep += delta_cep; 309 } else { 310 /* 311 * process the final ACK of the 3WHS 312 * see table 3 in draft-ietf-tcpm-accurate-ecn 313 */ 314 switch (tcp_ecn_get_ace(thflags)) { 315 case 0b010: 316 /* nonECT SYN or SYN,ACK */ 317 /* FALLTHROUGH */ 318 case 0b011: 319 /* ECT1 SYN or SYN,ACK */ 320 /* FALLTHROUGH */ 321 case 0b100: 322 /* ECT0 SYN or SYN,ACK */ 323 tp->t_scep = 5; 324 break; 325 case 0b110: 326 /* CE SYN or SYN,ACK */ 327 tp->t_scep = 6; 328 tp->snd_cwnd = 2 * tcp_maxseg(tp); 329 break; 330 default: 331 /* mangled AccECN handshake */ 332 tp->t_scep = 5; 333 break; 334 } 335 tp->t_flags2 |= TF2_ECN_PERMIT; 336 } 337 } else { 338 /* RFC3168 ECN handling */ 339 if ((thflags & (TH_SYN | TH_ECE)) == TH_ECE) { 340 delta_cep = 1; 341 tp->t_scep++; 342 } 343 if (thflags & TH_CWR) { 344 tp->t_flags2 &= ~TF2_ECN_SND_ECE; 345 tp->t_flags |= TF_ACKNOW; 346 } 347 if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) 348 tp->t_flags2 |= TF2_ECN_SND_ECE; 349 } 350 351 /* Process a packet differently from RFC3168. */ 352 cc_ecnpkt_handler_flags(tp, thflags, iptos); 353 } 354 355 return delta_cep; 356 } 357 358 /* 359 * Send ECN setup <SYN> packet header flags 360 */ 361 uint16_t 362 tcp_ecn_output_syn_sent(struct tcpcb *tp) 363 { 364 uint16_t thflags = 0; 365 366 if (V_tcp_do_ecn == 0) 367 return thflags; 368 if (V_tcp_do_ecn == 1) { 369 /* Send a RFC3168 ECN setup <SYN> packet */ 370 if (tp->t_rxtshift >= 1) { 371 if (tp->t_rxtshift <= V_tcp_ecn_maxretries) 372 thflags = TH_ECE|TH_CWR; 373 } else 374 thflags = TH_ECE|TH_CWR; 375 } else 376 if (V_tcp_do_ecn == 3) { 377 /* Send an Accurate ECN setup <SYN> packet */ 378 if (tp->t_rxtshift >= 1) { 379 if (tp->t_rxtshift <= V_tcp_ecn_maxretries) 380 thflags = TH_ECE|TH_CWR|TH_AE; 381 } else 382 thflags = TH_ECE|TH_CWR|TH_AE; 383 } 384 385 return thflags; 386 } 387 388 /* 389 * output processing of ECN feature 390 * returning IP ECN header codepoint 391 */ 392 int 393 tcp_ecn_output_established(struct tcpcb *tp, uint16_t *thflags, int len, bool rxmit) 394 { 395 int ipecn = IPTOS_ECN_NOTECT; 396 bool newdata; 397 398 /* 399 * If the peer has ECN, mark data packets with 400 * ECN capable transmission (ECT). 401 * Ignore pure control packets, retransmissions 402 * and window probes. 403 */ 404 newdata = (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) && 405 !rxmit && 406 !((tp->t_flags & TF_FORCEDATA) && len == 1)); 407 /* RFC3168 ECN marking, only new data segments */ 408 if (newdata) { 409 if (tp->t_flags2 & TF2_ECN_USE_ECT1) { 410 ipecn = IPTOS_ECN_ECT1; 411 TCPSTAT_INC(tcps_ecn_sndect1); 412 } else { 413 ipecn = IPTOS_ECN_ECT0; 414 TCPSTAT_INC(tcps_ecn_sndect0); 415 } 416 } 417 /* 418 * Reply with proper ECN notifications. 419 */ 420 if (tp->t_flags2 & TF2_ACE_PERMIT) { 421 *thflags &= ~(TH_AE|TH_CWR|TH_ECE); 422 if (tp->t_rcep & 0x01) 423 *thflags |= TH_ECE; 424 if (tp->t_rcep & 0x02) 425 *thflags |= TH_CWR; 426 if (tp->t_rcep & 0x04) 427 *thflags |= TH_AE; 428 if (!(tp->t_flags2 & TF2_ECN_PERMIT)) { 429 /* 430 * here we process the final 431 * ACK of the 3WHS 432 */ 433 if (tp->t_rcep == 0b110) { 434 tp->t_rcep = 6; 435 } else { 436 tp->t_rcep = 5; 437 } 438 tp->t_flags2 |= TF2_ECN_PERMIT; 439 } 440 } else { 441 if (newdata && 442 (tp->t_flags2 & TF2_ECN_SND_CWR)) { 443 *thflags |= TH_CWR; 444 tp->t_flags2 &= ~TF2_ECN_SND_CWR; 445 } 446 if (tp->t_flags2 & TF2_ECN_SND_ECE) 447 *thflags |= TH_ECE; 448 } 449 450 return ipecn; 451 } 452 453 /* 454 * Set up the ECN related tcpcb fields from 455 * a syncache entry 456 */ 457 void 458 tcp_ecn_syncache_socket(struct tcpcb *tp, struct syncache *sc) 459 { 460 if (sc->sc_flags & SCF_ECN_MASK) { 461 switch (sc->sc_flags & SCF_ECN_MASK) { 462 case SCF_ECN: 463 tp->t_flags2 |= TF2_ECN_PERMIT; 464 break; 465 case SCF_ACE_N: 466 /* FALLTHROUGH */ 467 case SCF_ACE_0: 468 /* FALLTHROUGH */ 469 case SCF_ACE_1: 470 tp->t_flags2 |= TF2_ACE_PERMIT; 471 tp->t_scep = 5; 472 tp->t_rcep = 5; 473 break; 474 case SCF_ACE_CE: 475 tp->t_flags2 |= TF2_ACE_PERMIT; 476 tp->t_scep = 6; 477 tp->t_rcep = 6; 478 break; 479 /* undefined SCF codepoint */ 480 default: 481 break; 482 } 483 } 484 } 485 486 /* 487 * Process a <SYN> packets ECN information, and provide the 488 * syncache with the relevant information. 489 */ 490 int 491 tcp_ecn_syncache_add(uint16_t thflags, int iptos) 492 { 493 int scflags = 0; 494 495 switch (iptos & IPTOS_ECN_MASK) { 496 case IPTOS_ECN_CE: 497 TCPSTAT_INC(tcps_ecn_rcvce); 498 break; 499 case IPTOS_ECN_ECT0: 500 TCPSTAT_INC(tcps_ecn_rcvect0); 501 break; 502 case IPTOS_ECN_ECT1: 503 TCPSTAT_INC(tcps_ecn_rcvect1); 504 break; 505 } 506 507 switch (thflags & (TH_AE|TH_CWR|TH_ECE)) { 508 /* no ECN */ 509 case (0|0|0): 510 break; 511 /* legacy ECN */ 512 case (0|TH_CWR|TH_ECE): 513 scflags = SCF_ECN; 514 break; 515 /* Accurate ECN */ 516 case (TH_AE|TH_CWR|TH_ECE): 517 if ((V_tcp_do_ecn == 3) || 518 (V_tcp_do_ecn == 4)) { 519 switch (iptos & IPTOS_ECN_MASK) { 520 case IPTOS_ECN_CE: 521 scflags = SCF_ACE_CE; 522 break; 523 case IPTOS_ECN_ECT0: 524 scflags = SCF_ACE_0; 525 break; 526 case IPTOS_ECN_ECT1: 527 scflags = SCF_ACE_1; 528 break; 529 case IPTOS_ECN_NOTECT: 530 scflags = SCF_ACE_N; 531 break; 532 } 533 } else 534 scflags = SCF_ECN; 535 break; 536 /* Default Case (section 3.1.2) */ 537 default: 538 if ((V_tcp_do_ecn == 3) || 539 (V_tcp_do_ecn == 4)) { 540 switch (iptos & IPTOS_ECN_MASK) { 541 case IPTOS_ECN_CE: 542 scflags = SCF_ACE_CE; 543 break; 544 case IPTOS_ECN_ECT0: 545 scflags = SCF_ACE_0; 546 break; 547 case IPTOS_ECN_ECT1: 548 scflags = SCF_ACE_1; 549 break; 550 case IPTOS_ECN_NOTECT: 551 scflags = SCF_ACE_N; 552 break; 553 } 554 } 555 break; 556 } 557 return scflags; 558 } 559 560 /* 561 * Set up the ECN information for the <SYN,ACK> from 562 * syncache information. 563 */ 564 uint16_t 565 tcp_ecn_syncache_respond(uint16_t thflags, struct syncache *sc) 566 { 567 if ((thflags & TH_SYN) && 568 (sc->sc_flags & SCF_ECN_MASK)) { 569 switch (sc->sc_flags & SCF_ECN_MASK) { 570 case SCF_ECN: 571 thflags |= (0 | 0 | TH_ECE); 572 TCPSTAT_INC(tcps_ecn_shs); 573 break; 574 case SCF_ACE_N: 575 thflags |= (0 | TH_CWR | 0); 576 TCPSTAT_INC(tcps_ecn_shs); 577 TCPSTAT_INC(tcps_ace_nect); 578 break; 579 case SCF_ACE_0: 580 thflags |= (TH_AE | 0 | 0); 581 TCPSTAT_INC(tcps_ecn_shs); 582 TCPSTAT_INC(tcps_ace_ect0); 583 break; 584 case SCF_ACE_1: 585 thflags |= (0 | TH_ECE | TH_CWR); 586 TCPSTAT_INC(tcps_ecn_shs); 587 TCPSTAT_INC(tcps_ace_ect1); 588 break; 589 case SCF_ACE_CE: 590 thflags |= (TH_AE | TH_CWR | 0); 591 TCPSTAT_INC(tcps_ecn_shs); 592 TCPSTAT_INC(tcps_ace_ce); 593 break; 594 /* undefined SCF codepoint */ 595 default: 596 break; 597 } 598 } 599 return thflags; 600 } 601 602 int 603 tcp_ecn_get_ace(uint16_t thflags) 604 { 605 int ace = 0; 606 607 if (thflags & TH_ECE) 608 ace += 1; 609 if (thflags & TH_CWR) 610 ace += 2; 611 if (thflags & TH_AE) 612 ace += 4; 613 return ace; 614 } 615