1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 5 * The Regents of the University of California. All rights reserved. 6 * Copyright (c) 2007-2008,2010 7 * Swinburne University of Technology, Melbourne, Australia. 8 * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org> 9 * Copyright (c) 2010 The FreeBSD Foundation 10 * Copyright (c) 2010-2011 Juniper Networks, Inc. 11 * Copyright (c) 2019 Richard Scheffenegger <srichard@netapp.com> 12 * All rights reserved. 13 * 14 * Portions of this software were developed at the Centre for Advanced Internet 15 * Architectures, Swinburne University of Technology, by Lawrence Stewart, 16 * James Healy and David Hayes, made possible in part by a grant from the Cisco 17 * University Research Program Fund at Community Foundation Silicon Valley. 18 * 19 * Portions of this software were developed at the Centre for Advanced 20 * Internet Architectures, Swinburne University of Technology, Melbourne, 21 * Australia by David Hayes under sponsorship from the FreeBSD Foundation. 22 * 23 * Portions of this software were developed by Robert N. M. Watson under 24 * contract to Juniper Networks, Inc. 25 * 26 * Redistribution and use in source and binary forms, with or without 27 * modification, are permitted provided that the following conditions 28 * are met: 29 * 1. Redistributions of source code must retain the above copyright 30 * notice, this list of conditions and the following disclaimer. 31 * 2. Redistributions in binary form must reproduce the above copyright 32 * notice, this list of conditions and the following disclaimer in the 33 * documentation and/or other materials provided with the distribution. 34 * 3. Neither the name of the University nor the names of its contributors 35 * may be used to endorse or promote products derived from this software 36 * without specific prior written permission. 
37 * 38 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 39 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 40 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 41 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 42 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 43 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 44 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 45 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 46 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 47 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 48 * SUCH DAMAGE. 49 * 50 * @(#)tcp_ecn.c 8.12 (Berkeley) 5/24/95 51 */ 52 53 /* 54 * Utility functions to deal with Explicit Congestion Notification in TCP 55 * implementing the essential parts of the Accurate ECN extension 56 * https://tools.ietf.org/html/draft-ietf-tcpm-accurate-ecn-09 57 */ 58 59 #include <sys/cdefs.h> 60 __FBSDID("$FreeBSD$"); 61 62 #include "opt_inet.h" 63 #include "opt_inet6.h" 64 #include "opt_tcpdebug.h" 65 66 #include <sys/param.h> 67 #include <sys/systm.h> 68 #include <sys/kernel.h> 69 #include <sys/sysctl.h> 70 #include <sys/malloc.h> 71 #include <sys/mbuf.h> 72 #include <sys/socket.h> 73 #include <sys/socketvar.h> 74 75 #include <machine/cpu.h> 76 77 #include <vm/uma.h> 78 79 #include <net/if.h> 80 #include <net/if_var.h> 81 #include <net/route.h> 82 #include <net/vnet.h> 83 84 #include <netinet/in.h> 85 #include <netinet/in_systm.h> 86 #include <netinet/ip.h> 87 #include <netinet/in_var.h> 88 #include <netinet/in_pcb.h> 89 #include <netinet/ip_var.h> 90 #include <netinet/ip6.h> 91 #include <netinet/icmp6.h> 92 #include <netinet6/nd6.h> 93 #include <netinet6/ip6_var.h> 94 #include <netinet6/in6_pcb.h> 95 #include <netinet/tcp.h> 96 #include <netinet/tcp_fsm.h> 97 
#include <netinet/tcp_seq.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_syncache.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_ecn.h>

/*
 * Map the IP ECN codepoint a handshake segment arrived with to the
 * value stored in t_rcep, which is later emitted as the ACE flags of
 * the next outgoing segment (handshake encoding, table 3 of the
 * AccECN draft).
 */
static int
ecn_iptos_to_handshake_rcep(int iptos)
{
	switch (iptos & IPTOS_ECN_MASK) {
	case IPTOS_ECN_ECT0:
		return 0b100;
	case IPTOS_ECN_ECT1:
		return 0b011;
	case IPTOS_ECN_CE:
		return 0b110;
	case IPTOS_ECN_NOTECT:
	default:
		return 0b010;
	}
}

/*
 * Map the IP ECN codepoint of a received <SYN> to the syncache flag
 * that records it for an Accurate ECN connection.
 */
static int
ecn_iptos_to_scflags(int iptos)
{
	switch (iptos & IPTOS_ECN_MASK) {
	case IPTOS_ECN_CE:
		return SCF_ACE_CE;
	case IPTOS_ECN_ECT0:
		return SCF_ACE_0;
	case IPTOS_ECN_ECT1:
		return SCF_ACE_1;
	case IPTOS_ECN_NOTECT:
	default:
		return SCF_ACE_N;
	}
}

/*
 * Process the ECN flags of an incoming <SYN,ACK>.
 *
 * tp      - connection control block; t_flags2, t_scep, t_rcep and
 *           possibly snd_cwnd are updated.
 * thflags - TCP header flags of the received segment.
 * iptos   - IP TOS byte of the received segment.
 *
 * Does nothing when ECN is disabled (V_tcp_do_ecn == 0).  With
 * V_tcp_do_ecn 1 or 2 only the RFC3168 reply (ECE without CWR) is
 * accepted.  With V_tcp_do_ecn 3 or 4 the AE/CWR/ECE combination is
 * decoded as AccECN feedback about the state our <SYN> was delivered in.
 */
void
tcp_ecn_input_syn_sent(struct tcpcb *tp, uint16_t thflags, int iptos)
{

	if (V_tcp_do_ecn == 0)
		return;
	if ((V_tcp_do_ecn == 1) ||
	    (V_tcp_do_ecn == 2)) {
		/* RFC3168 ECN handling */
		if ((thflags & (TH_CWR | TH_ECE)) == (0 | TH_ECE)) {
			tp->t_flags2 |= TF2_ECN_PERMIT;
			TCPSTAT_INC(tcps_ecn_shs);
		}
	} else if ((V_tcp_do_ecn == 3) ||
	    (V_tcp_do_ecn == 4)) {
		/*
		 * Decode Accurate ECN according to the table in
		 * section 3.1.1: on the SYN,ACK the AccECN flags
		 * indicate the state the SYN was delivered in.
		 * Reactions to path ECN mangling can come here.
		 */
		switch (thflags & (TH_AE | TH_CWR | TH_ECE)) {
		/* RFC3168 SYN */
		case (0|0|TH_ECE):
			tp->t_flags2 |= TF2_ECN_PERMIT;
			TCPSTAT_INC(tcps_ecn_shs);
			break;
		/* non-ECT SYN */
		case (0|TH_CWR|0):
			tp->t_flags2 |= TF2_ACE_PERMIT;
			tp->t_scep = 5;
			TCPSTAT_INC(tcps_ecn_shs);
			TCPSTAT_INC(tcps_ace_nect);
			break;
		/* ECT0 SYN */
		case (TH_AE|0|0):
			tp->t_flags2 |= TF2_ACE_PERMIT;
			tp->t_scep = 5;
			TCPSTAT_INC(tcps_ecn_shs);
			TCPSTAT_INC(tcps_ace_ect0);
			break;
		/* ECT1 SYN */
		case (0|TH_CWR|TH_ECE):
			tp->t_flags2 |= TF2_ACE_PERMIT;
			tp->t_scep = 5;
			TCPSTAT_INC(tcps_ecn_shs);
			TCPSTAT_INC(tcps_ace_ect1);
			break;
		/* CE SYN */
		case (TH_AE|TH_CWR|0):
			tp->t_flags2 |= TF2_ACE_PERMIT;
			tp->t_scep = 6;
			/*
			 * reduce the IW to 2 MSS (to
			 * account for delayed acks) if
			 * the SYN,ACK was CE marked
			 */
			tp->snd_cwnd = 2 * tcp_maxseg(tp);
			TCPSTAT_INC(tcps_ecn_shs);
			/* Count it as CE, like the other cases count theirs. */
			TCPSTAT_INC(tcps_ace_ce);
			break;
		default:
			break;
		}
		/*
		 * Set the AccECN codepoints on the outgoing <ACK> to
		 * the ECN state of the <SYN,ACK> according to table 3
		 * in the AccECN draft.  This is done for every flag
		 * combination, including the ones not handled above.
		 */
		tp->t_rcep = ecn_iptos_to_handshake_rcep(iptos);
	}
}

/*
 * Handle a parallel (simultaneous) SYN for ECN.
 *
 * tp      - connection control block; t_flags2 and t_rcep are updated.
 * thflags - TCP header flags of the received segment.
 * iptos   - IP TOS byte of the received segment.
 *
 * Segments carrying ACK are ignored, as is everything when ECN is
 * disabled (V_tcp_do_ecn == 0).
 */
void
tcp_ecn_input_parallel_syn(struct tcpcb *tp, uint16_t thflags, int iptos)
{
	if (thflags & TH_ACK)
		return;
	if (V_tcp_do_ecn == 0)
		return;
	if ((V_tcp_do_ecn == 1) ||
	    (V_tcp_do_ecn == 2)) {
		/* RFC3168 ECN handling */
		if ((thflags & (TH_CWR | TH_ECE)) == (TH_CWR | TH_ECE)) {
			tp->t_flags2 |= TF2_ECN_PERMIT;
			tp->t_flags2 |= TF2_ECN_SND_ECE;
			TCPSTAT_INC(tcps_ecn_shs);
		}
	} else if ((V_tcp_do_ecn == 3) ||
	    (V_tcp_do_ecn == 4)) {
		/* AccECN handling */
		switch (thflags & (TH_AE | TH_CWR | TH_ECE)) {
		default:
		case (0|0|0):
			break;
		/* RFC3168 setup flags */
		case (0|TH_CWR|TH_ECE):
			tp->t_flags2 |= TF2_ECN_PERMIT;
			tp->t_flags2 |= TF2_ECN_SND_ECE;
			TCPSTAT_INC(tcps_ecn_shs);
			break;
		/* AccECN setup flags */
		case (TH_AE|TH_CWR|TH_ECE):
			tp->t_flags2 |= TF2_ACE_PERMIT;
			TCPSTAT_INC(tcps_ecn_shs);
			/*
			 * Set the AccECN codepoints on the outgoing
			 * segment to the ECN state of the received
			 * segment according to table 3 in the AccECN
			 * draft.
			 */
			tp->t_rcep = ecn_iptos_to_handshake_rcep(iptos);
			break;
		}
	}
}

/*
 * TCP ECN processing of a received segment.
 *
 * tp      - connection control block.
 * thflags - TCP header flags of the received segment.
 * iptos   - IP TOS byte of the received segment.
 *
 * Nothing is done unless ECN or AccECN was negotiated for tp.
 *
 * Returns the congestion feedback carried by the segment:
 *  - AccECN, handshake completed: the modulo-8 difference between the
 *    segment's ACE field and the low three bits of t_scep (t_scep is
 *    advanced by that amount);
 *  - RFC3168: 1 if ECE is set and SYN is not;
 *  - otherwise 0.
 */
int
tcp_ecn_input_segment(struct tcpcb *tp, uint16_t thflags, int iptos)
{
	int delta_ace = 0;

	if (!(tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT)))
		return delta_ace;

	/* Account for the IP ECN codepoint the segment arrived with. */
	switch (iptos & IPTOS_ECN_MASK) {
	case IPTOS_ECN_CE:
		TCPSTAT_INC(tcps_ecn_ce);
		break;
	case IPTOS_ECN_ECT0:
		TCPSTAT_INC(tcps_ecn_ect0);
		break;
	case IPTOS_ECN_ECT1:
		TCPSTAT_INC(tcps_ecn_ect1);
		break;
	default:
		/* not-ECT is not counted */
		break;
	}

	if (tp->t_flags2 & TF2_ACE_PERMIT) {
		/* Count received CE marks for feedback to the peer. */
		if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
			tp->t_rcep += 1;
		if (tp->t_flags2 & TF2_ECN_PERMIT) {
			/*
			 * ACE is a 3-bit counter; adding 8 before the
			 * subtraction keeps the intermediate value
			 * non-negative across a wrap.
			 */
			delta_ace = (tcp_ecn_get_ace(thflags) + 8 -
			    (tp->t_scep & 0x07)) & 0x07;
			tp->t_scep += delta_ace;
		} else {
			/*
			 * process the final ACK of the 3WHS
			 * see table 3 in draft-ietf-tcpm-accurate-ecn
			 */
			switch (tcp_ecn_get_ace(thflags)) {
			case 0b010:
				/* nonECT SYN or SYN,ACK */
				/* FALLTHROUGH */
			case 0b011:
				/* ECT1 SYN or SYN,ACK */
				/* FALLTHROUGH */
			case 0b100:
				/* ECT0 SYN or SYN,ACK */
				tp->t_scep = 5;
				break;
			case 0b110:
				/* CE SYN or SYN,ACK */
				tp->t_scep = 6;
				tp->snd_cwnd = 2 * tcp_maxseg(tp);
				break;
			default:
				/* mangled AccECN handshake */
				tp->t_scep = 5;
				break;
			}
			/* TF2_ECN_PERMIT marks the AccECN handshake as done. */
			tp->t_flags2 |= TF2_ECN_PERMIT;
		}
	} else {
		/* RFC3168 ECN handling */
		if ((thflags & (TH_SYN | TH_ECE)) == TH_ECE)
			delta_ace = 1;
		if (thflags & TH_CWR) {
			tp->t_flags2 &= ~TF2_ECN_SND_ECE;
			tp->t_flags |= TF_ACKNOW;
		}
		if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
			tp->t_flags2 |= TF2_ECN_SND_ECE;
	}

	/* Process a packet differently from RFC3168. */
	cc_ecnpkt_handler_flags(tp, thflags, iptos);

	return delta_ace;
}

/*
 * Send ECN setup <SYN> packet header flags.
 *
 * Returns the ECN related TCP header flags for an outgoing <SYN>:
 * ECE|CWR when V_tcp_do_ecn is 1, AE|ECE|CWR when it is 3, and 0 for
 * every other setting.  On retransmissions (t_rxtshift >= 1) the flags
 * are only set while t_rxtshift does not exceed V_tcp_ecn_maxretries.
 */
uint16_t
tcp_ecn_output_syn_sent(struct tcpcb *tp)
{
	uint16_t thflags = 0;
	bool attempt;

	if (V_tcp_do_ecn == 0)
		return thflags;

	/* The initial <SYN> always tries; retransmissions are limited. */
	attempt = (tp->t_rxtshift < 1 ||
	    tp->t_rxtshift <= V_tcp_ecn_maxretries);

	if (V_tcp_do_ecn == 1) {
		/* Send a RFC3168 ECN setup <SYN> packet */
		if (attempt)
			thflags = TH_ECE|TH_CWR;
	} else if (V_tcp_do_ecn == 3) {
		/* Send an Accurate ECN setup <SYN> packet */
		if (attempt)
			thflags = TH_ECE|TH_CWR|TH_AE;
	}

	return thflags;
}

/*
 * Output processing of the ECN feature.
 *
 * tp      - connection control block.
 * thflags - in/out: TCP header flags of the segment being built; the
 *           ECN related flags are updated in place.
 * len     - payload length of the segment.
 * rxmit   - true if the segment is a retransmission.
 *
 * Returns the IP ECN header codepoint to send the segment with.
 */
int
tcp_ecn_output_established(struct tcpcb *tp, uint16_t *thflags, int len, bool rxmit)
{
	int ipecn = IPTOS_ECN_NOTECT;
	bool newdata;

	/*
	 * If the peer has ECN, mark data packets with
	 * ECN capable transmission (ECT).
	 * Ignore pure control packets, retransmissions
	 * and window probes.
	 */
	newdata = (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) &&
	    !rxmit &&
	    !((tp->t_flags & TF_FORCEDATA) && len == 1));
	/* RFC3168 ECN marking, only new data segments */
	if (newdata) {
		ipecn = IPTOS_ECN_ECT0;
		TCPSTAT_INC(tcps_ecn_ect0);
	}
	/*
	 * Reply with proper ECN notifications.
	 */
	if (tp->t_flags2 & TF2_ACE_PERMIT) {
		/* Encode the low three bits of t_rcep into AE/CWR/ECE. */
		*thflags &= ~(TH_AE|TH_CWR|TH_ECE);
		if (tp->t_rcep & 0x01)
			*thflags |= TH_ECE;
		if (tp->t_rcep & 0x02)
			*thflags |= TH_CWR;
		if (tp->t_rcep & 0x04)
			*thflags |= TH_AE;
		if (!(tp->t_flags2 & TF2_ECN_PERMIT)) {
			/*
			 * here we process the final
			 * ACK of the 3WHS: the handshake
			 * encoding has been sent above, now
			 * switch t_rcep to its initial counter
			 * value (6 after a CE mark, else 5).
			 */
			if (tp->t_rcep == 0b110) {
				tp->t_rcep = 6;
			} else {
				tp->t_rcep = 5;
			}
			tp->t_flags2 |= TF2_ECN_PERMIT;
		}
	} else {
		/* CWR is only sent, and cleared, on a new data segment. */
		if (newdata &&
		    (tp->t_flags2 & TF2_ECN_SND_CWR)) {
			*thflags |= TH_CWR;
			tp->t_flags2 &= ~TF2_ECN_SND_CWR;
		}
		if (tp->t_flags2 & TF2_ECN_SND_ECE)
			*thflags |= TH_ECE;
	}

	return ipecn;
}

/*
 * Set up the ECN related tcpcb fields from
 * a syncache entry.
 *
 * SCF_ECN enables RFC3168 ECN.  The SCF_ACE_* values enable AccECN and
 * initialise both counters to 5, or to 6 when the <SYN> was CE marked.
 */
void
tcp_ecn_syncache_socket(struct tcpcb *tp, struct syncache *sc)
{
	if (sc->sc_flags & SCF_ECN_MASK) {
		switch (sc->sc_flags & SCF_ECN_MASK) {
		case SCF_ECN:
			tp->t_flags2 |= TF2_ECN_PERMIT;
			break;
		case SCF_ACE_N:
			/* FALLTHROUGH */
		case SCF_ACE_0:
			/* FALLTHROUGH */
		case SCF_ACE_1:
			tp->t_flags2 |= TF2_ACE_PERMIT;
			tp->t_scep = 5;
			tp->t_rcep = 5;
			break;
		case SCF_ACE_CE:
			tp->t_flags2 |= TF2_ACE_PERMIT;
			tp->t_scep = 6;
			tp->t_rcep = 6;
			break;
		/* undefined SCF codepoint */
		default:
			break;
		}
	}
}

/*
 * Process a <SYN> packet's ECN information, and provide the
 * syncache with the relevant information.
 *
 * thflags - TCP header flags of the received <SYN>.
 * iptos   - IP TOS byte of the received <SYN>.
 *
 * Returns the SCF_* ECN flag to store in the syncache entry, or 0 if
 * ECN is not to be used.
 */
int
tcp_ecn_syncache_add(uint16_t thflags, int iptos)
{
	int scflags = 0;

	switch (thflags & (TH_AE|TH_CWR|TH_ECE)) {
	/* no ECN */
	case (0|0|0):
		break;
	/* legacy ECN */
	case (0|TH_CWR|TH_ECE):
		scflags = SCF_ECN;
		break;
	/* Accurate ECN */
	case (TH_AE|TH_CWR|TH_ECE):
		if ((V_tcp_do_ecn == 3) ||
		    (V_tcp_do_ecn == 4))
			scflags = ecn_iptos_to_scflags(iptos);
		else
			/* AccECN not enabled locally: answer as RFC3168 */
			scflags = SCF_ECN;
		break;
	/* Default Case (section 3.1.2) */
	default:
		if ((V_tcp_do_ecn == 3) ||
		    (V_tcp_do_ecn == 4))
			scflags = ecn_iptos_to_scflags(iptos);
		break;
	}
	return scflags;
}

/*
 * Set up the ECN information for the <SYN,ACK> from
 * syncache information.
 *
 * thflags - TCP header flags of the segment being built.
 * sc      - syncache entry of the connection.
 *
 * Returns thflags, with the ECN handshake flags added when the segment
 * is a SYN and the syncache entry has an ECN flag recorded.
 */
uint16_t
tcp_ecn_syncache_respond(uint16_t thflags, struct syncache *sc)
{
	if ((thflags & TH_SYN) &&
	    (sc->sc_flags & SCF_ECN_MASK)) {
		switch (sc->sc_flags & SCF_ECN_MASK) {
		case SCF_ECN:
			thflags |= (0 | 0 | TH_ECE);
			TCPSTAT_INC(tcps_ecn_shs);
			break;
		case SCF_ACE_N:
			thflags |= (0 | TH_CWR | 0);
			TCPSTAT_INC(tcps_ecn_shs);
			TCPSTAT_INC(tcps_ace_nect);
			break;
		case SCF_ACE_0:
			thflags |= (TH_AE | 0 | 0);
			TCPSTAT_INC(tcps_ecn_shs);
			TCPSTAT_INC(tcps_ace_ect0);
			break;
		case SCF_ACE_1:
			thflags |= (0 | TH_ECE | TH_CWR);
			TCPSTAT_INC(tcps_ecn_shs);
			TCPSTAT_INC(tcps_ace_ect1);
			break;
		case SCF_ACE_CE:
			thflags |= (TH_AE | TH_CWR | 0);
			TCPSTAT_INC(tcps_ecn_shs);
			TCPSTAT_INC(tcps_ace_ce);
			break;
		/* undefined SCF codepoint */
		default:
			break;
		}
	}
	return thflags;
}

/*
 * Extract the 3-bit ACE field from the TCP header flags:
 * ECE is bit 0, CWR is bit 1 and AE is bit 2 of the result.
 */
int
tcp_ecn_get_ace(uint16_t thflags)
{
	int ace = 0;

	if (thflags & TH_ECE)
		ace += 1;
	if (thflags & TH_CWR)
		ace += 2;
	if (thflags & TH_AE)
		ace += 4;
	return ace;
}