1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 5 * The Regents of the University of California. All rights reserved. 6 * Copyright (c) 2007-2008,2010 7 * Swinburne University of Technology, Melbourne, Australia. 8 * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org> 9 * Copyright (c) 2010 The FreeBSD Foundation 10 * Copyright (c) 2010-2011 Juniper Networks, Inc. 11 * Copyright (c) 2019 Richard Scheffenegger <srichard@netapp.com> 12 * All rights reserved. 13 * 14 * Portions of this software were developed at the Centre for Advanced Internet 15 * Architectures, Swinburne University of Technology, by Lawrence Stewart, 16 * James Healy and David Hayes, made possible in part by a grant from the Cisco 17 * University Research Program Fund at Community Foundation Silicon Valley. 18 * 19 * Portions of this software were developed at the Centre for Advanced 20 * Internet Architectures, Swinburne University of Technology, Melbourne, 21 * Australia by David Hayes under sponsorship from the FreeBSD Foundation. 22 * 23 * Portions of this software were developed by Robert N. M. Watson under 24 * contract to Juniper Networks, Inc. 25 * 26 * Redistribution and use in source and binary forms, with or without 27 * modification, are permitted provided that the following conditions 28 * are met: 29 * 1. Redistributions of source code must retain the above copyright 30 * notice, this list of conditions and the following disclaimer. 31 * 2. Redistributions in binary form must reproduce the above copyright 32 * notice, this list of conditions and the following disclaimer in the 33 * documentation and/or other materials provided with the distribution. 34 * 3. Neither the name of the University nor the names of its contributors 35 * may be used to endorse or promote products derived from this software 36 * without specific prior written permission. 
37 * 38 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 39 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 40 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 41 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 42 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 43 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 44 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 45 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 46 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 47 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 48 * SUCH DAMAGE. 49 * 50 * @(#)tcp_ecn.c 8.12 (Berkeley) 5/24/95 51 */ 52 53 /* 54 * Utility functions to deal with Explicit Congestion Notification in TCP 55 * implementing the essential parts of the Accurate ECN extension 56 * https://tools.ietf.org/html/draft-ietf-tcpm-accurate-ecn-09 57 */ 58 59 #include <sys/cdefs.h> 60 __FBSDID("$FreeBSD$"); 61 62 #include "opt_inet.h" 63 #include "opt_inet6.h" 64 #include "opt_tcpdebug.h" 65 66 #include <sys/param.h> 67 #include <sys/systm.h> 68 #include <sys/kernel.h> 69 #include <sys/sysctl.h> 70 #include <sys/malloc.h> 71 #include <sys/mbuf.h> 72 #include <sys/socket.h> 73 #include <sys/socketvar.h> 74 75 #include <machine/cpu.h> 76 77 #include <vm/uma.h> 78 79 #include <net/if.h> 80 #include <net/if_var.h> 81 #include <net/route.h> 82 #include <net/vnet.h> 83 84 #include <netinet/in.h> 85 #include <netinet/in_systm.h> 86 #include <netinet/ip.h> 87 #include <netinet/in_var.h> 88 #include <netinet/in_pcb.h> 89 #include <netinet/ip_var.h> 90 #include <netinet/ip6.h> 91 #include <netinet/icmp6.h> 92 #include <netinet6/nd6.h> 93 #include <netinet6/ip6_var.h> 94 #include <netinet6/in6_pcb.h> 95 #include <netinet/tcp.h> 96 #include <netinet/tcp_fsm.h> 97 
#include <netinet/tcp_seq.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_syncache.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_ecn.h>

/*
 * Store in t_rcep the AccECN handshake codepoint that corresponds to the
 * IP ECN field of a received handshake segment (table 3 in the AccECN
 * draft).  tcp_ecn_output_established() copies the three low bits of
 * t_rcep into the AE/CWR/ECE flags of the next outgoing segment.
 */
static void
tcp_ecn_set_handshake_rcep(struct tcpcb *tp, int iptos)
{

	switch (iptos & IPTOS_ECN_MASK) {
	case IPTOS_ECN_NOTECT:
		tp->t_rcep = 0b010;
		break;
	case IPTOS_ECN_ECT0:
		tp->t_rcep = 0b100;
		break;
	case IPTOS_ECN_ECT1:
		tp->t_rcep = 0b011;
		break;
	case IPTOS_ECN_CE:
		tp->t_rcep = 0b110;
		break;
	default:
		break;
	}
}

/*
 * Translate the IP ECN field of a received <SYN> into the syncache flag
 * that records how an AccECN <SYN> arrived.  Returns 0 if the masked
 * value matches none of the four codepoints.
 */
static int
tcp_ecn_accecn_scflags(int iptos)
{
	int scflags = 0;

	switch (iptos & IPTOS_ECN_MASK) {
	case IPTOS_ECN_CE:
		scflags = SCF_ACE_CE;
		break;
	case IPTOS_ECN_ECT0:
		scflags = SCF_ACE_0;
		break;
	case IPTOS_ECN_ECT1:
		scflags = SCF_ACE_1;
		break;
	case IPTOS_ECN_NOTECT:
		scflags = SCF_ACE_N;
		break;
	default:
		break;
	}
	return (scflags);
}

/*
 * Process incoming SYN,ACK packet
 *
 * tp      - connection control block, updated in place
 * thflags - TCP header flags of the received <SYN,ACK>
 * iptos   - IP TOS byte of the received <SYN,ACK>
 *
 * With V_tcp_do_ecn 1 or 2 only a classic RFC3168 reply (ECE without CWR)
 * is accepted.  With V_tcp_do_ecn 3 or 4 the AE/CWR/ECE combination is
 * decoded as an AccECN reply, with RFC3168 as a fallback.
 */
void
tcp_ecn_input_syn_sent(struct tcpcb *tp, uint16_t thflags, int iptos)
{

	if (V_tcp_do_ecn == 0)
		return;
	if ((V_tcp_do_ecn == 1) || (V_tcp_do_ecn == 2)) {
		/* RFC3168 ECN handling */
		if ((thflags & (TH_CWR | TH_ECE)) == (0 | TH_ECE)) {
			tp->t_flags2 |= TF2_ECN_PERMIT;
			tp->t_flags2 &= ~TF2_ACE_PERMIT;
			TCPSTAT_INC(tcps_ecn_shs);
		}
	} else if ((V_tcp_do_ecn == 3) || (V_tcp_do_ecn == 4)) {
		/*
		 * Decode Accurate ECN according to the table in
		 * section 3.1.1.  On the SYN,ACK the AccECN flags
		 * indicate the state in which the SYN was delivered.
		 * Reactions to path ECN mangling can come here.
		 */
		switch (thflags & (TH_AE | TH_CWR | TH_ECE)) {
		/* RFC3168 SYN */
		case (0|0|TH_ECE):
			tp->t_flags2 |= TF2_ECN_PERMIT;
			tp->t_flags2 &= ~TF2_ACE_PERMIT;
			TCPSTAT_INC(tcps_ecn_shs);
			break;
		/* non-ECT SYN */
		case (0|TH_CWR|0):
			tp->t_flags2 |= TF2_ACE_PERMIT;
			tp->t_flags2 &= ~TF2_ECN_PERMIT;
			tp->t_scep = 5;
			TCPSTAT_INC(tcps_ecn_shs);
			TCPSTAT_INC(tcps_ace_nect);
			break;
		/* ECT0 SYN */
		case (TH_AE|0|0):
			tp->t_flags2 |= TF2_ACE_PERMIT;
			tp->t_flags2 &= ~TF2_ECN_PERMIT;
			tp->t_scep = 5;
			TCPSTAT_INC(tcps_ecn_shs);
			TCPSTAT_INC(tcps_ace_ect0);
			break;
		/* ECT1 SYN */
		case (0|TH_CWR|TH_ECE):
			tp->t_flags2 |= TF2_ACE_PERMIT;
			tp->t_flags2 &= ~TF2_ECN_PERMIT;
			tp->t_scep = 5;
			TCPSTAT_INC(tcps_ecn_shs);
			TCPSTAT_INC(tcps_ace_ect1);
			break;
		/* CE SYN */
		case (TH_AE|TH_CWR|0):
			tp->t_flags2 |= TF2_ACE_PERMIT;
			tp->t_flags2 &= ~TF2_ECN_PERMIT;
			tp->t_scep = 6;
			/*
			 * reduce the IW to 2 MSS (to
			 * account for delayed acks) if
			 * the handshake reports a CE mark
			 */
			tp->snd_cwnd = 2 * tcp_maxseg(tp);
			TCPSTAT_INC(tcps_ecn_shs);
			/* Count as CE, matching tcp_ecn_syncache_respond(). */
			TCPSTAT_INC(tcps_ace_ce);
			break;
		default:
			tp->t_flags2 &= ~(TF2_ECN_PERMIT | TF2_ACE_PERMIT);
			break;
		}
		/*
		 * Set the AccECN Codepoints on
		 * the outgoing <ACK> to the ECN
		 * state of the <SYN,ACK>
		 * according to table 3 in the
		 * AccECN draft
		 */
		tcp_ecn_set_handshake_rcep(tp, iptos);
	}
}

/*
 * Handle parallel SYN for ECN
 *
 * tp      - connection control block, updated in place
 * thflags - TCP header flags of the received segment
 * iptos   - IP TOS byte of the received segment
 *
 * Segments carrying ACK are ignored here.  An RFC3168 setup <SYN>
 * (CWR and ECE) enables classic ECN and requests that ECE be sent; an
 * AccECN setup <SYN> (AE, CWR and ECE) enables AccECN when V_tcp_do_ecn
 * is 3 or 4.
 */
void
tcp_ecn_input_parallel_syn(struct tcpcb *tp, uint16_t thflags, int iptos)
{

	if (thflags & TH_ACK)
		return;
	if (V_tcp_do_ecn == 0)
		return;
	if ((V_tcp_do_ecn == 1) || (V_tcp_do_ecn == 2)) {
		/* RFC3168 ECN handling */
		if ((thflags & (TH_CWR | TH_ECE)) == (TH_CWR | TH_ECE)) {
			tp->t_flags2 |= TF2_ECN_PERMIT;
			tp->t_flags2 &= ~TF2_ACE_PERMIT;
			tp->t_flags2 |= TF2_ECN_SND_ECE;
			TCPSTAT_INC(tcps_ecn_shs);
		}
	} else if ((V_tcp_do_ecn == 3) || (V_tcp_do_ecn == 4)) {
		/* AccECN handling */
		switch (thflags & (TH_AE | TH_CWR | TH_ECE)) {
		default:
		case (0|0|0):
			tp->t_flags2 &= ~(TF2_ECN_PERMIT | TF2_ACE_PERMIT);
			break;
		case (0|TH_CWR|TH_ECE):
			tp->t_flags2 |= TF2_ECN_PERMIT;
			tp->t_flags2 &= ~TF2_ACE_PERMIT;
			tp->t_flags2 |= TF2_ECN_SND_ECE;
			TCPSTAT_INC(tcps_ecn_shs);
			break;
		case (TH_AE|TH_CWR|TH_ECE):
			tp->t_flags2 |= TF2_ACE_PERMIT;
			tp->t_flags2 &= ~TF2_ECN_PERMIT;
			TCPSTAT_INC(tcps_ecn_shs);
			/*
			 * Set the AccECN Codepoints on
			 * the outgoing segment to the ECN
			 * state of the received <SYN>
			 * according to table 3 in the
			 * AccECN draft
			 */
			tcp_ecn_set_handshake_rcep(tp, iptos);
			break;
		}
	}
}

/*
 * TCP ECN processing.
 *
 * tp      - connection control block, updated in place
 * thflags - TCP header flags of the received segment
 * iptos   - IP TOS byte of the received segment
 *
 * Does nothing unless ECN or AccECN was negotiated on the connection.
 *
 * Returns the number of newly signalled congestion marks: for AccECN the
 * modulo-8 distance between the received ACE field and t_scep, for
 * RFC3168 1 when ECE is set on a non-SYN segment, otherwise 0.
 */
int
tcp_ecn_input_segment(struct tcpcb *tp, uint16_t thflags, int iptos)
{
	int delta_ace = 0;

	if (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT)) {
		/* Account for the IP codepoint the segment arrived with. */
		switch (iptos & IPTOS_ECN_MASK) {
		case IPTOS_ECN_CE:
			TCPSTAT_INC(tcps_ecn_ce);
			break;
		case IPTOS_ECN_ECT0:
			TCPSTAT_INC(tcps_ecn_ect0);
			break;
		case IPTOS_ECN_ECT1:
			TCPSTAT_INC(tcps_ecn_ect1);
			break;
		}

		if (tp->t_flags2 & TF2_ACE_PERMIT) {
			if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
				tp->t_rcep += 1;
			if (tp->t_flags2 & TF2_ECN_PERMIT) {
				/*
				 * Both flags set: handshake is complete.
				 * The ACE field is a 3-bit counter, so take
				 * the difference modulo 8.
				 */
				delta_ace = (tcp_ecn_get_ace(thflags) + 8 -
				    (tp->t_scep & 0x07)) & 0x07;
				tp->t_scep += delta_ace;
			} else {
				/*
				 * process the final ACK of the 3WHS
				 * see table 3 in draft-ietf-tcpm-accurate-ecn
				 */
				switch (tcp_ecn_get_ace(thflags)) {
				case 0b010:
					/* nonECT SYN or SYN,ACK */
					/* Fallthrough */
				case 0b011:
					/* ECT1 SYN or SYN,ACK */
					/* Fallthrough */
				case 0b100:
					/* ECT0 SYN or SYN,ACK */
					tp->t_scep = 5;
					break;
				case 0b110:
					/* CE SYN or SYN,ACK */
					tp->t_scep = 6;
					tp->snd_cwnd = 2 * tcp_maxseg(tp);
					break;
				default:
					/* mangled AccECN handshake */
					tp->t_scep = 5;
					break;
				}
				tp->t_flags2 |= TF2_ECN_PERMIT;
			}
		} else {
			/* RFC3168 ECN handling */
			if ((thflags & (TH_SYN | TH_ECE)) == TH_ECE)
				delta_ace = 1;
			if (thflags & TH_CWR) {
				tp->t_flags2 &= ~TF2_ECN_SND_ECE;
				tp->t_flags |= TF_ACKNOW;
			}
			if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
				tp->t_flags2 |= TF2_ECN_SND_ECE;
		}

		/* Process a packet differently from RFC3168. */
		cc_ecnpkt_handler_flags(tp, thflags, iptos);
	}

	return (delta_ace);
}

/*
 * Send ECN setup <SYN> packet header flags
 *
 * Returns ECE|CWR when V_tcp_do_ecn is 1 and ECE|CWR|AE when it is 3,
 * unless the <SYN> has already been retransmitted more than
 * V_tcp_ecn_maxretries times.  Returns 0 in every other case.
 */
uint16_t
tcp_ecn_output_syn_sent(struct tcpcb *tp)
{
	uint16_t thflags;

	if (V_tcp_do_ecn == 1) {
		/* Send a RFC3168 ECN setup <SYN> packet */
		thflags = TH_ECE | TH_CWR;
	} else if (V_tcp_do_ecn == 3) {
		/* Send an Accurate ECN setup <SYN> packet */
		thflags = TH_ECE | TH_CWR | TH_AE;
	} else
		return (0);

	/* Stop requesting ECN once the retransmit limit is exceeded. */
	if (tp->t_rxtshift >= 1 && tp->t_rxtshift > V_tcp_ecn_maxretries)
		thflags = 0;

	return (thflags);
}

/*
 * output processing of ECN feature
 * returning IP ECN header codepoint
 *
 * tp      - connection control block, updated in place
 * thflags - TCP header flags of the outgoing segment, updated in place
 * len     - payload length of the outgoing segment
 * rxmit   - true if the segment is a retransmission
 *
 * Returns IPTOS_ECN_ECT0 for segments carrying new data and
 * IPTOS_ECN_NOTECT otherwise.
 */
int
tcp_ecn_output_established(struct tcpcb *tp, uint16_t *thflags, int len, bool rxmit)
{
	int ipecn = IPTOS_ECN_NOTECT;
	bool newdata;

	/*
	 * If the peer has ECN, mark data packets with
	 * ECN capable transmission (ECT).
	 * Ignore pure control packets, retransmissions
	 * and window probes.
	 */
	newdata = (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) &&
	    !rxmit &&
	    !((tp->t_flags & TF_FORCEDATA) && len == 1));
	/* RFC3168 ECN marking, only new data segments */
	if (newdata) {
		ipecn = IPTOS_ECN_ECT0;
		TCPSTAT_INC(tcps_ecn_ect0);
	}
	/*
	 * Reply with proper ECN notifications.
	 */
	if (tp->t_flags2 & TF2_ACE_PERMIT) {
		/* Encode the three low bits of t_rcep as AE/CWR/ECE. */
		*thflags &= ~(TH_AE|TH_CWR|TH_ECE);
		if (tp->t_rcep & 0x01)
			*thflags |= TH_ECE;
		if (tp->t_rcep & 0x02)
			*thflags |= TH_CWR;
		if (tp->t_rcep & 0x04)
			*thflags |= TH_AE;
		if (!(tp->t_flags2 & TF2_ECN_PERMIT)) {
			/*
			 * here we process the final
			 * ACK of the 3WHS
			 */
			if (tp->t_rcep == 0b110) {
				tp->t_rcep = 6;
			} else {
				tp->t_rcep = 5;
			}
			tp->t_flags2 |= TF2_ECN_PERMIT;
		}
	} else {
		if (newdata &&
		    (tp->t_flags2 & TF2_ECN_SND_CWR)) {
			*thflags |= TH_CWR;
			tp->t_flags2 &= ~TF2_ECN_SND_CWR;
		}
		if (tp->t_flags2 & TF2_ECN_SND_ECE)
			*thflags |= TH_ECE;
	}

	return (ipecn);
}

/*
 * Set up the ECN related tcpcb fields from
 * a syncache entry
 *
 * SCF_ECN enables RFC3168 ECN.  Any SCF_ACE_* value enables AccECN and
 * seeds t_scep and t_rcep with 5, or with 6 when the <SYN> was recorded
 * as CE marked.
 */
void
tcp_ecn_syncache_socket(struct tcpcb *tp, struct syncache *sc)
{

	if (sc->sc_flags & SCF_ECN_MASK) {
		switch (sc->sc_flags & SCF_ECN_MASK) {
		case SCF_ECN:
			tp->t_flags2 |= TF2_ECN_PERMIT;
			break;
		case SCF_ACE_N:
			/* Fallthrough */
		case SCF_ACE_0:
			/* Fallthrough */
		case SCF_ACE_1:
			tp->t_flags2 |= TF2_ACE_PERMIT;
			tp->t_scep = 5;
			tp->t_rcep = 5;
			break;
		case SCF_ACE_CE:
			tp->t_flags2 |= TF2_ACE_PERMIT;
			tp->t_scep = 6;
			tp->t_rcep = 6;
			break;
		/* undefined SCF codepoint */
		default:
			break;
		}
	}
}

/*
 * Process a <SYN> packet's ECN information, and provide the
 * syncache with the relevant information.
 *
 * thflags - TCP header flags of the received <SYN>
 * iptos   - IP TOS byte of the received <SYN>
 *
 * Returns the SCF_* ECN flag to store in the syncache entry, or 0 when
 * no ECN variant applies.
 */
int
tcp_ecn_syncache_add(uint16_t thflags, int iptos)
{
	int scflags = 0;
	bool accecn = (V_tcp_do_ecn == 3) || (V_tcp_do_ecn == 4);

	switch (thflags & (TH_AE|TH_CWR|TH_ECE)) {
	/* no ECN */
	case (0|0|0):
		break;
	/* legacy ECN */
	case (0|TH_CWR|TH_ECE):
		scflags = SCF_ECN;
		break;
	/* Accurate ECN */
	case (TH_AE|TH_CWR|TH_ECE):
		if (accecn)
			scflags = tcp_ecn_accecn_scflags(iptos);
		else
			scflags = SCF_ECN;
		break;
	/* Default Case (section 3.1.2) */
	default:
		if (accecn)
			scflags = tcp_ecn_accecn_scflags(iptos);
		break;
	}
	return (scflags);
}

/*
 * Set up the ECN information for the <SYN,ACK> from
 * syncache information.
 *
 * thflags - TCP header flags being built for the reply
 * sc      - syncache entry holding the SCF_* ECN flag
 *
 * Returns thflags with the ECN handshake flags added.  Nothing is added
 * unless TH_SYN is set and the entry carries an ECN flag.
 */
uint16_t
tcp_ecn_syncache_respond(uint16_t thflags, struct syncache *sc)
{

	if ((thflags & TH_SYN) &&
	    (sc->sc_flags & SCF_ECN_MASK)) {
		switch (sc->sc_flags & SCF_ECN_MASK) {
		case SCF_ECN:
			thflags |= (0 | 0 | TH_ECE);
			TCPSTAT_INC(tcps_ecn_shs);
			break;
		case SCF_ACE_N:
			thflags |= (0 | TH_CWR | 0);
			TCPSTAT_INC(tcps_ecn_shs);
			TCPSTAT_INC(tcps_ace_nect);
			break;
		case SCF_ACE_0:
			thflags |= (TH_AE | 0 | 0);
			TCPSTAT_INC(tcps_ecn_shs);
			TCPSTAT_INC(tcps_ace_ect0);
			break;
		case SCF_ACE_1:
			thflags |= (0 | TH_ECE | TH_CWR);
			TCPSTAT_INC(tcps_ecn_shs);
			TCPSTAT_INC(tcps_ace_ect1);
			break;
		case SCF_ACE_CE:
			thflags |= (TH_AE | TH_CWR | 0);
			TCPSTAT_INC(tcps_ecn_shs);
			TCPSTAT_INC(tcps_ace_ce);
			break;
		/* undefined SCF codepoint */
		default:
			break;
		}
	}
	return (thflags);
}

/*
 * Assemble the 3-bit ACE value from the TCP header flags:
 * ECE is bit 0, CWR is bit 1 and AE is bit 2.
 */
int
tcp_ecn_get_ace(uint16_t thflags)
{
	int ace = 0;

	if (thflags & TH_ECE)
		ace += 1;
	if (thflags & TH_CWR)
		ace += 2;
	if (thflags & TH_AE)
		ace += 4;
	return (ace);
}