1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 5 * The Regents of the University of California. All rights reserved. 6 * Copyright (c) 2007-2008,2010 7 * Swinburne University of Technology, Melbourne, Australia. 8 * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org> 9 * Copyright (c) 2010 The FreeBSD Foundation 10 * Copyright (c) 2010-2011 Juniper Networks, Inc. 11 * Copyright (c) 2019 Richard Scheffenegger <srichard@netapp.com> 12 * All rights reserved. 13 * 14 * Portions of this software were developed at the Centre for Advanced Internet 15 * Architectures, Swinburne University of Technology, by Lawrence Stewart, 16 * James Healy and David Hayes, made possible in part by a grant from the Cisco 17 * University Research Program Fund at Community Foundation Silicon Valley. 18 * 19 * Portions of this software were developed at the Centre for Advanced 20 * Internet Architectures, Swinburne University of Technology, Melbourne, 21 * Australia by David Hayes under sponsorship from the FreeBSD Foundation. 22 * 23 * Portions of this software were developed by Robert N. M. Watson under 24 * contract to Juniper Networks, Inc. 25 * 26 * Redistribution and use in source and binary forms, with or without 27 * modification, are permitted provided that the following conditions 28 * are met: 29 * 1. Redistributions of source code must retain the above copyright 30 * notice, this list of conditions and the following disclaimer. 31 * 2. Redistributions in binary form must reproduce the above copyright 32 * notice, this list of conditions and the following disclaimer in the 33 * documentation and/or other materials provided with the distribution. 34 * 3. Neither the name of the University nor the names of its contributors 35 * may be used to endorse or promote products derived from this software 36 * without specific prior written permission. 
37 * 38 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 39 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 40 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 41 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 42 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 43 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 44 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 45 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 46 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 47 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 48 * SUCH DAMAGE. 49 */ 50 51 /* 52 * Utility functions to deal with Explicit Congestion Notification in TCP 53 * implementing the essential parts of the Accurate ECN extension 54 * https://tools.ietf.org/html/draft-ietf-tcpm-accurate-ecn-09 55 */ 56 57 #include "opt_inet.h" 58 #include "opt_inet6.h" 59 60 #include <sys/param.h> 61 #include <sys/systm.h> 62 #include <sys/kernel.h> 63 #include <sys/sysctl.h> 64 #include <sys/malloc.h> 65 #include <sys/mbuf.h> 66 #include <sys/socket.h> 67 #include <sys/socketvar.h> 68 69 #include <machine/cpu.h> 70 71 #include <vm/uma.h> 72 73 #include <net/if.h> 74 #include <net/if_var.h> 75 #include <net/route.h> 76 #include <net/vnet.h> 77 78 #include <netinet/in.h> 79 #include <netinet/in_systm.h> 80 #include <netinet/ip.h> 81 #include <netinet/in_var.h> 82 #include <netinet/in_pcb.h> 83 #include <netinet/ip_var.h> 84 #include <netinet/ip6.h> 85 #include <netinet/icmp6.h> 86 #include <netinet6/nd6.h> 87 #include <netinet6/ip6_var.h> 88 #include <netinet6/in6_pcb.h> 89 #include <netinet/tcp.h> 90 #include <netinet/tcp_fsm.h> 91 #include <netinet/tcp_seq.h> 92 #include <netinet/tcp_var.h> 93 #include <netinet/tcp_syncache.h> 94 #include <netinet/tcp_timer.h> 95 
#include <netinet/tcpip.h>
#include <netinet/tcp_ecn.h>

static inline int  tcp_ecn_get_ace(uint16_t);
static inline void tcp_ecn_set_ace(uint16_t *, uint32_t);

static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, ecn,
    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "TCP ECN");

/*
 * net.inet.tcp.ecn.enable
 *
 * As used by the functions in this file:
 *   0    - no ECN processing at all
 *   1, 2 - RFC3168 ECN handling on input; only 1 sends an ECN setup <SYN>
 *   3, 4 - Accurate ECN handling on input; only 3 sends an AccECN
 *          setup <SYN>
 */
VNET_DEFINE(int, tcp_do_ecn) = 2;
SYSCTL_INT(_net_inet_tcp_ecn, OID_AUTO, enable,
    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_ecn), 0,
    "TCP ECN support");

/*
 * net.inet.tcp.ecn.maxretries
 *
 * A retransmitted <SYN> still carries the ECN setup flags only while
 * t_rxtshift does not exceed this value (see tcp_ecn_output_syn_sent()).
 */
VNET_DEFINE(int, tcp_ecn_maxretries) = 1;
SYSCTL_INT(_net_inet_tcp_ecn, OID_AUTO, maxretries,
    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_ecn_maxretries), 0,
    "Max retries before giving up on ECN");

/*
 * Process an incoming <SYN,ACK> while in SYN_SENT.
 *
 * tp:      connection whose t_flags2, t_scep, t_rcep and possibly snd_cwnd
 *          are updated
 * thflags: TCP header flags of the received segment
 * iptos:   IP TOS / traffic class byte of the received segment
 *
 * Depending on tcp_do_ecn, the ECE/CWR (and AE) flags are decoded either
 * as a RFC3168 or as an Accurate ECN handshake response, and exactly one
 * of TF2_ECN_PERMIT / TF2_ACE_PERMIT (or neither) is left set.
 */
void
tcp_ecn_input_syn_sent(struct tcpcb *tp, uint16_t thflags, int iptos)
{
	switch (V_tcp_do_ecn) {
	case 0:
		return;
	case 1:
		/* FALLTHROUGH */
	case 2:
		/* RFC3168 ECN handling: ECE set, CWR clear */
		if ((thflags & (TH_CWR | TH_ECE)) == (0 | TH_ECE)) {
			tp->t_flags2 |= TF2_ECN_PERMIT;
			tp->t_flags2 &= ~TF2_ACE_PERMIT;
			TCPSTAT_INC(tcps_ecn_shs);
		}
		break;
	case 3:
		/* FALLTHROUGH */
	case 4:
		/*
		 * Decoding Accurate ECN according to
		 * table in section 3.1.1
		 *
		 * On the SYN,ACK, process the AccECN
		 * flags indicating the state the SYN
		 * was delivered.
		 * Reactions to Path ECN mangling can
		 * come here.
		 */
		switch (thflags & (TH_AE | TH_CWR | TH_ECE)) {
		/* RFC3168 SYN */
		case (0|0|TH_ECE):
			tp->t_flags2 |= TF2_ECN_PERMIT;
			tp->t_flags2 &= ~TF2_ACE_PERMIT;
			TCPSTAT_INC(tcps_ecn_shs);
			break;
		/* non-ECT SYN */
		case (0|TH_CWR|0):
			tp->t_flags2 |= TF2_ACE_PERMIT;
			tp->t_flags2 &= ~TF2_ECN_PERMIT;
			tp->t_scep = 5;
			TCPSTAT_INC(tcps_ecn_shs);
			TCPSTAT_INC(tcps_ace_nect);
			break;
		/* ECT0 SYN */
		case (TH_AE|0|0):
			tp->t_flags2 |= TF2_ACE_PERMIT;
			tp->t_flags2 &= ~TF2_ECN_PERMIT;
			tp->t_scep = 5;
			TCPSTAT_INC(tcps_ecn_shs);
			TCPSTAT_INC(tcps_ace_ect0);
			break;
		/* ECT1 SYN */
		case (0|TH_CWR|TH_ECE):
			tp->t_flags2 |= TF2_ACE_PERMIT;
			tp->t_flags2 &= ~TF2_ECN_PERMIT;
			tp->t_scep = 5;
			TCPSTAT_INC(tcps_ecn_shs);
			TCPSTAT_INC(tcps_ace_ect1);
			break;
		/* CE SYN */
		case (TH_AE|TH_CWR|0):
			tp->t_flags2 |= TF2_ACE_PERMIT;
			tp->t_flags2 &= ~TF2_ECN_PERMIT;
			tp->t_scep = 6;
			/*
			 * reduce the IW to 2 MSS (to
			 * account for delayed acks) if
			 * the peer reported the SYN as
			 * CE marked
			 */
			tp->snd_cwnd = 2 * tcp_maxseg(tp);
			TCPSTAT_INC(tcps_ecn_shs);
			/*
			 * Account this handshake as CE, the same
			 * counter tcp_ecn_syncache_respond() uses
			 * for this codepoint (it used to be tallied
			 * as not-ECT).
			 */
			TCPSTAT_INC(tcps_ace_ce);
			break;
		default:
			/* Unrecognised combination: no ECN of either kind. */
			tp->t_flags2 &= ~(TF2_ECN_PERMIT | TF2_ACE_PERMIT);
			break;
		}
		/*
		 * Set the AccECN Codepoints on
		 * the outgoing <ACK> to the ECN
		 * state of the <SYN,ACK>
		 * according to table 3 in the
		 * AccECN draft
		 */
		switch (iptos & IPTOS_ECN_MASK) {
		case (IPTOS_ECN_NOTECT):
			tp->t_rcep = 0b010;
			break;
		case (IPTOS_ECN_ECT0):
			tp->t_rcep = 0b100;
			break;
		case (IPTOS_ECN_ECT1):
			tp->t_rcep = 0b011;
			break;
		case (IPTOS_ECN_CE):
			tp->t_rcep = 0b110;
			break;
		}
		break;
	}
}

/*
 * Handle parallel SYN for ECN
 *
 * Only segments without TH_ACK are processed; anything carrying an ACK
 * returns immediately.  For an ECN setup <SYN> (CWR and ECE set) RFC3168
 * ECN is enabled and TF2_ECN_SND_ECE is set; for an AccECN setup <SYN>
 * (AE, CWR and ECE set, tcp_do_ecn 3 or 4) AccECN is enabled and t_rcep
 * is primed from the IP ECN field of the received segment.
 */
void
tcp_ecn_input_parallel_syn(struct tcpcb *tp, uint16_t thflags, int iptos)
{
	if (thflags & TH_ACK)
		return;
	switch (V_tcp_do_ecn) {
	case 0:
		return;
	case 1:
		/* FALLTHROUGH */
	case 2:
		/* RFC3168 ECN handling */
		if ((thflags & (TH_CWR | TH_ECE)) == (TH_CWR | TH_ECE)) {
			tp->t_flags2 |= TF2_ECN_PERMIT;
			tp->t_flags2 &= ~TF2_ACE_PERMIT;
			tp->t_flags2 |= TF2_ECN_SND_ECE;
			TCPSTAT_INC(tcps_ecn_shs);
		}
		break;
	case 3:
		/* FALLTHROUGH */
	case 4:
		/* AccECN handling */
		switch (thflags & (TH_AE | TH_CWR | TH_ECE)) {
		default:
		case (0|0|0):
			tp->t_flags2 &= ~(TF2_ECN_PERMIT | TF2_ACE_PERMIT);
			break;
		case (0|TH_CWR|TH_ECE):
			tp->t_flags2 |= TF2_ECN_PERMIT;
			tp->t_flags2 &= ~TF2_ACE_PERMIT;
			tp->t_flags2 |= TF2_ECN_SND_ECE;
			TCPSTAT_INC(tcps_ecn_shs);
			break;
		case (TH_AE|TH_CWR|TH_ECE):
			tp->t_flags2 |= TF2_ACE_PERMIT;
			tp->t_flags2 &= ~TF2_ECN_PERMIT;
			TCPSTAT_INC(tcps_ecn_shs);
			/*
			 * Set the AccECN Codepoints on
			 * the outgoing <ACK> to the ECN
			 * state of the received <SYN>
			 * according to table 3 in the
			 * AccECN draft
			 */
			switch (iptos & IPTOS_ECN_MASK) {
			case (IPTOS_ECN_NOTECT):
				tp->t_rcep = 0b010;
				break;
			case (IPTOS_ECN_ECT0):
				tp->t_rcep = 0b100;
				break;
			case (IPTOS_ECN_ECT1):
				tp->t_rcep = 0b011;
				break;
			case (IPTOS_ECN_CE):
				tp->t_rcep = 0b110;
				break;
			}
			break;
		}
		break;
	}
}

/*
 * TCP ECN processing.
 *
 * Per-segment input processing once ECN or AccECN has been negotiated.
 *
 * tp:      connection state (t_flags, t_flags2, t_rcep, t_scep and
 *          possibly snd_cwnd are updated)
 * thflags: TCP header flags of the received segment
 * tlen:    not used by this function
 * pkts:    packet count compared against the ACE counter delta
 *          (NOTE(review): presumably the number of newly acked
 *          packets - confirm against the callers)
 * iptos:   IP TOS / traffic class byte of the received segment
 *
 * The receive statistics are updated unconditionally.  Returns the
 * amount by which t_scep was advanced for this segment; 0 when neither
 * ECN flavour is permitted or nothing was signalled.
 */
int
tcp_ecn_input_segment(struct tcpcb *tp, uint16_t thflags, int tlen, int pkts, int iptos)
{
	int delta_cep = 0;

	switch (iptos & IPTOS_ECN_MASK) {
	case IPTOS_ECN_CE:
		TCPSTAT_INC(tcps_ecn_rcvce);
		break;
	case IPTOS_ECN_ECT0:
		TCPSTAT_INC(tcps_ecn_rcvect0);
		break;
	case IPTOS_ECN_ECT1:
		TCPSTAT_INC(tcps_ecn_rcvect1);
		break;
	}

	if (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT)) {
		if (tp->t_flags2 & TF2_ACE_PERMIT) {
			/* Count received CE marks for later feedback. */
			if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
				tp->t_rcep += 1;
			if (tp->t_flags2 & TF2_ECN_PERMIT) {
				/*
				 * Both flags set: handshake is complete.
				 * Difference between the received 3-bit
				 * ACE field and our copy, modulo 8.
				 */
				delta_cep = (tcp_ecn_get_ace(thflags) + 8 -
				    (tp->t_scep & 7)) & 7;
				/*
				 * When the delta is smaller than pkts,
				 * replace it by the largest value not
				 * above pkts that is congruent to it
				 * modulo 8.
				 */
				if (delta_cep < pkts)
					delta_cep = pkts -
					    ((pkts - delta_cep) & 7);
				tp->t_scep += delta_cep;
			} else {
				/*
				 * process the final ACK of the 3WHS
				 * see table 3 in draft-ietf-tcpm-accurate-ecn
				 */
				switch (tcp_ecn_get_ace(thflags)) {
				case 0b010:
					/* nonECT SYN or SYN,ACK */
					/* FALLTHROUGH */
				case 0b011:
					/* ECT1 SYN or SYN,ACK */
					/* FALLTHROUGH */
				case 0b100:
					/* ECT0 SYN or SYN,ACK */
					tp->t_scep = 5;
					break;
				case 0b110:
					/* CE SYN or SYN,ACK */
					tp->t_scep = 6;
					tp->snd_cwnd = 2 * tcp_maxseg(tp);
					break;
				default:
					/* mangled AccECN handshake */
					tp->t_scep = 5;
					break;
				}
				/* From now on take the established path. */
				tp->t_flags2 |= TF2_ECN_PERMIT;
			}
		} else {
			/* RFC3168 ECN handling */
			if ((thflags & (TH_SYN | TH_ECE)) == TH_ECE) {
				delta_cep = 1;
				tp->t_scep++;
			}
			if (thflags & TH_CWR) {
				/* Peer reacted: stop echoing ECE. */
				tp->t_flags2 &= ~TF2_ECN_SND_ECE;
				if ((tp->t_state == TCPS_ESTABLISHED) ||
				    (tp->t_state == TCPS_FIN_WAIT_1) ||
				    (tp->t_state == TCPS_FIN_WAIT_2))
					tp->t_flags |= TF_ACKNOW;
			}
			if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
				tp->t_flags2 |= TF2_ECN_SND_ECE;
		}

		/* Process a packet differently from RFC3168. */
		cc_ecnpkt_handler_flags(tp, thflags, iptos);
	}

	return delta_cep;
}

/*
 * Send ECN setup <SYN> packet header flags
 *
 * Returns the ECN related TCP header flags for an outgoing <SYN>:
 * ECE|CWR when tcp_do_ecn is 1, ECE|CWR|AE when it is 3, and 0 for every
 * other setting.  On a retransmission (t_rxtshift >= 1) the flags are
 * only returned while t_rxtshift <= tcp_ecn_maxretries.
 */
uint16_t
tcp_ecn_output_syn_sent(struct tcpcb *tp)
{
	uint16_t thflags = 0;

	if (V_tcp_do_ecn == 0)
		return thflags;
	if (V_tcp_do_ecn == 1) {
		/* Send a RFC3168 ECN setup <SYN> packet */
		if (tp->t_rxtshift >= 1) {
			if (tp->t_rxtshift <= V_tcp_ecn_maxretries)
				thflags = TH_ECE|TH_CWR;
		} else
			thflags = TH_ECE|TH_CWR;
	} else if (V_tcp_do_ecn == 3) {
		/* Send an Accurate ECN setup <SYN> packet */
		if (tp->t_rxtshift >= 1) {
			if (tp->t_rxtshift <= V_tcp_ecn_maxretries)
				thflags = TH_ECE|TH_CWR|TH_AE;
		} else
			thflags = TH_ECE|TH_CWR|TH_AE;
	}

	return thflags;
}

/*
 * output processing of ECN feature
 * returning IP ECN header codepoint
 *
 * tp:      connection state
 * thflags: in/out TCP header flags of the segment being built; the ECN
 *          related bits are filled in here
 * len:     payload length of the segment
 * rxmit:   true when the segment is a retransmission
 *
 * Returns IPTOS_ECN_ECT0 or IPTOS_ECN_ECT1 (the latter when
 * TF2_ECN_USE_ECT1 is set) for new data segments and IPTOS_ECN_NOTECT
 * otherwise.
 */
int
tcp_ecn_output_established(struct tcpcb *tp, uint16_t *thflags, int len, bool rxmit)
{
	int ipecn = IPTOS_ECN_NOTECT;
	bool newdata;

	/*
	 * If the peer has ECN, mark data packets with
	 * ECN capable transmission (ECT).
	 * Ignore pure control packets, retransmissions
	 * and window probes.
	 */
	newdata = (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) &&
		    !rxmit &&
		    !((tp->t_flags & TF_FORCEDATA) && len == 1));
	/* RFC3168 ECN marking, only new data segments */
	if (newdata) {
		if (tp->t_flags2 & TF2_ECN_USE_ECT1) {
			ipecn = IPTOS_ECN_ECT1;
			TCPSTAT_INC(tcps_ecn_sndect1);
		} else {
			ipecn = IPTOS_ECN_ECT0;
			TCPSTAT_INC(tcps_ecn_sndect0);
		}
	}
	/*
	 * Reply with proper ECN notifications.
	 */
	if (tp->t_flags2 & TF2_ACE_PERMIT) {
		/* AccECN: the ACE field carries the low bits of t_rcep. */
		tcp_ecn_set_ace(thflags, tp->t_rcep);
		if (!(tp->t_flags2 & TF2_ECN_PERMIT)) {
			/*
			 * here we process the final
			 * ACK of the 3WHS
			 *
			 * t_rcep held the handshake codepoint up to
			 * now; switch it to its initial counter value.
			 */
			if (tp->t_rcep == 0b110) {
				tp->t_rcep = 6;
			} else {
				tp->t_rcep = 5;
			}
			tp->t_flags2 |= TF2_ECN_PERMIT;
		}
	} else {
		/* RFC3168: CWR goes out once, on a new data segment. */
		if (newdata &&
		    (tp->t_flags2 & TF2_ECN_SND_CWR)) {
			*thflags |= TH_CWR;
			tp->t_flags2 &= ~TF2_ECN_SND_CWR;
		}
		if (tp->t_flags2 & TF2_ECN_SND_ECE)
			*thflags |= TH_ECE;
	}

	return ipecn;
}

/*
 * Set up the ECN related tcpcb fields from
 * a syncache entry
 *
 * SCF_ECN enables RFC3168 ECN; any of the SCF_ACE_* values enables
 * AccECN and initialises both counters to 5, or to 6 for SCF_ACE_CE.
 * Nothing is touched when no ECN bit is recorded in sc_flags.
 */
void
tcp_ecn_syncache_socket(struct tcpcb *tp, struct syncache *sc)
{
	if (sc->sc_flags & SCF_ECN_MASK) {
		switch (sc->sc_flags & SCF_ECN_MASK) {
		case SCF_ECN:
			tp->t_flags2 |= TF2_ECN_PERMIT;
			break;
		case SCF_ACE_N:
			/* FALLTHROUGH */
		case SCF_ACE_0:
			/* FALLTHROUGH */
		case SCF_ACE_1:
			tp->t_flags2 |= TF2_ACE_PERMIT;
			tp->t_scep = 5;
			tp->t_rcep = 5;
			break;
		case SCF_ACE_CE:
			tp->t_flags2 |= TF2_ACE_PERMIT;
			tp->t_scep = 6;
			tp->t_rcep = 6;
			break;
		}
	}
}

/*
 * Process a <SYN> packets ECN information, and provide the
 * syncache with the relevant information.
 *
 * thflags: TCP header flags of the received <SYN>
 * iptos:   IP TOS / traffic class byte of the received <SYN>
 *
 * Returns the SCF_* ECN flag to record in the syncache entry, or 0 when
 * the <SYN> does not request ECN.  With tcp_do_ecn set to 3 or 4 an
 * AccECN <SYN> (or any other non-legacy flag combination) yields the
 * SCF_ACE_* value matching the IP ECN codepoint the <SYN> arrived with;
 * otherwise an AccECN <SYN> is downgraded to SCF_ECN.
 *
 * NOTE(review): tcp_do_ecn == 0 is not checked here - presumably the
 * caller does; confirm.
 */
int
tcp_ecn_syncache_add(uint16_t thflags, int iptos)
{
	int scflags = 0;

	switch (iptos & IPTOS_ECN_MASK) {
	case IPTOS_ECN_CE:
		TCPSTAT_INC(tcps_ecn_rcvce);
		break;
	case IPTOS_ECN_ECT0:
		TCPSTAT_INC(tcps_ecn_rcvect0);
		break;
	case IPTOS_ECN_ECT1:
		TCPSTAT_INC(tcps_ecn_rcvect1);
		break;
	}

	switch (thflags & (TH_AE|TH_CWR|TH_ECE)) {
	/* no ECN */
	case (0|0|0):
		break;
	/* legacy ECN */
	case (0|TH_CWR|TH_ECE):
		scflags = SCF_ECN;
		break;
	/* Accurate ECN */
	case (TH_AE|TH_CWR|TH_ECE):
		if ((V_tcp_do_ecn == 3) ||
		    (V_tcp_do_ecn == 4)) {
			switch (iptos & IPTOS_ECN_MASK) {
			case IPTOS_ECN_CE:
				scflags = SCF_ACE_CE;
				break;
			case IPTOS_ECN_ECT0:
				scflags = SCF_ACE_0;
				break;
			case IPTOS_ECN_ECT1:
				scflags = SCF_ACE_1;
				break;
			case IPTOS_ECN_NOTECT:
				scflags = SCF_ACE_N;
				break;
			}
		} else
			scflags = SCF_ECN;
		break;
	/* Default Case (section 3.1.2) */
	default:
		if ((V_tcp_do_ecn == 3) ||
		    (V_tcp_do_ecn == 4)) {
			switch (iptos & IPTOS_ECN_MASK) {
			case IPTOS_ECN_CE:
				scflags = SCF_ACE_CE;
				break;
			case IPTOS_ECN_ECT0:
				scflags = SCF_ACE_0;
				break;
			case IPTOS_ECN_ECT1:
				scflags = SCF_ACE_1;
				break;
			case IPTOS_ECN_NOTECT:
				scflags = SCF_ACE_N;
				break;
			}
		}
		break;
	}
	return scflags;
}

/*
 * Set up the ECN information for the <SYN,ACK> from
 * syncache information.
 *
 * thflags: TCP header flags built so far for the response
 * sc:      syncache entry holding the SCF_* ECN state
 *
 * Returns thflags with the AE/CWR/ECE bits added that encode the recorded
 * state.  The flags are returned unchanged unless TH_SYN is set and the
 * entry has an ECN flag recorded.
 */
uint16_t
tcp_ecn_syncache_respond(uint16_t thflags, struct syncache *sc)
{
	if ((thflags & TH_SYN) &&
	    (sc->sc_flags & SCF_ECN_MASK)) {
		switch (sc->sc_flags & SCF_ECN_MASK) {
		case SCF_ECN:
			thflags |= (0 | 0 | TH_ECE);
			TCPSTAT_INC(tcps_ecn_shs);
			break;
		case SCF_ACE_N:
			thflags |= (0 | TH_CWR | 0);
			TCPSTAT_INC(tcps_ecn_shs);
			TCPSTAT_INC(tcps_ace_nect);
			break;
		case SCF_ACE_0:
			thflags |= (TH_AE | 0 | 0);
			TCPSTAT_INC(tcps_ecn_shs);
			TCPSTAT_INC(tcps_ace_ect0);
			break;
		case SCF_ACE_1:
			thflags |= (0 | TH_ECE | TH_CWR);
			TCPSTAT_INC(tcps_ecn_shs);
			TCPSTAT_INC(tcps_ace_ect1);
			break;
		case SCF_ACE_CE:
			thflags |= (TH_AE | TH_CWR | 0);
			TCPSTAT_INC(tcps_ecn_shs);
			TCPSTAT_INC(tcps_ace_ce);
			break;
		}
	}
	return thflags;
}

/*
 * Extract the ACE field (the AE, CWR and ECE bits) from the TCP header
 * flags, shifted down by TH_ACE_SHIFT.
 */
static inline int
tcp_ecn_get_ace(uint16_t thflags)
{
	return ((thflags & (TH_AE|TH_CWR|TH_ECE)) >> TH_ACE_SHIFT);
}

/*
 * Overwrite the ACE field (the AE, CWR and ECE bits) in *thflags with
 * the bits of t_rcep that fit the field; all other header flags are
 * preserved.
 */
static inline void
tcp_ecn_set_ace(uint16_t *thflags, uint32_t t_rcep)
{
	*thflags &= ~(TH_AE|TH_CWR|TH_ECE);
	*thflags |= ((t_rcep << TH_ACE_SHIFT) & (TH_AE|TH_CWR|TH_ECE));
}