/*-
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)tcp_input.c	8.12 (Berkeley) 5/24/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ipfw.h"		/* for ipfw_fwd */
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_mac.h"
#include "opt_tcpdebug.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/proc.h>		/* for proc0 declaration */
#include <sys/protosw.h>
#include <sys/signalvar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/systm.h>
#include <sys/vimage.h>

#include <machine/cpu.h>	/* before tcp_seq.h, for tcp_random18() */

#include <vm/uma.h>

#include <net/if.h>
#include <net/route.h>

#define TCPSTATES		/* for logging */

#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>	/* required for icmp_var.h */
#include <netinet/icmp_var.h>	/* for ICMP_BANDLIM */
#include <netinet/ip_var.h>
#include <netinet/ip_options.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
#include <netinet6/nd6.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet6/tcp6_var.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_syncache.h>
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif /* TCPDEBUG */
#include <netinet/vinet.h>

#ifdef INET6
#include <netinet6/vinet6.h>
#endif

#ifdef IPSEC
#include <netipsec/ipsec.h>
#include <netipsec/ipsec6.h>
#endif /*IPSEC*/

#include <machine/in_cksum.h>

#include <security/mac/mac_framework.h>

static const int tcprexmtthresh = 3;

#ifdef VIMAGE_GLOBALS
struct tcpstat tcpstat;
int blackhole;
int tcp_delack_enabled;
int drop_synfin;
int tcp_do_rfc3042;
int tcp_do_rfc3390;
int tcp_do_ecn;
int tcp_ecn_maxretries;
int tcp_insecure_rst;
int tcp_do_autorcvbuf;
int tcp_autorcvbuf_inc;
int tcp_autorcvbuf_max;
#endif

SYSCTL_V_STRUCT(V_NET, vnet_inet, _net_inet_tcp, TCPCTL_STATS, stats,
    CTLFLAG_RW, tcpstat, tcpstat,
    "TCP statistics (struct tcpstat, netinet/tcp_var.h)");

int tcp_log_in_vain = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW,
    &tcp_log_in_vain, 0, "Log all incoming TCP segments to closed ports");

SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW,
    blackhole, 0, "Do not send RST on segments to closed ports");

SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, delayed_ack,
    CTLFLAG_RW, tcp_delack_enabled, 0,
    "Delay ACK to try and piggyback it onto a data packet");

SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, drop_synfin,
    CTLFLAG_RW, drop_synfin, 0, "Drop TCP packets with SYN+FIN set");

SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_RW,
    tcp_do_rfc3042, 0, "Enable RFC 3042 (Limited Transmit)");

SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_RW,
    tcp_do_rfc3390, 0,
    "Enable RFC 3390 (Increasing TCP's Initial Congestion Window)");

SYSCTL_NODE(_net_inet_tcp, OID_AUTO, ecn, CTLFLAG_RW, 0, "TCP ECN");
SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_ecn, OID_AUTO, enable,
    CTLFLAG_RW, tcp_do_ecn, 0, "TCP ECN support");
SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_ecn, OID_AUTO, maxretries,
    CTLFLAG_RW, tcp_ecn_maxretries, 0, "Max retries before giving up on ECN");

SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, insecure_rst,
    CTLFLAG_RW, tcp_insecure_rst, 0,
    "Follow the old (insecure) criteria for accepting RST packets");

SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, recvbuf_auto,
    CTLFLAG_RW, tcp_do_autorcvbuf, 0,
    "Enable automatic receive buffer sizing");

SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, recvbuf_inc,
    CTLFLAG_RW, tcp_autorcvbuf_inc, 0,
    "Incrementor step size of automatic receive buffer");

SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, recvbuf_max,
    CTLFLAG_RW, tcp_autorcvbuf_max, 0,
    "Max size of automatic receive buffer");

int	tcp_read_locking = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, read_locking, CTLFLAG_RW,
    &tcp_read_locking, 0, "Enable read locking strategy");

int	tcp_rlock_atfirst;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, rlock_atfirst, CTLFLAG_RD,
    &tcp_rlock_atfirst, 0, "");

int	tcp_wlock_atfirst;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_wlock_atfirst, CTLFLAG_RD,
    &tcp_wlock_atfirst, 0, "");

int	tcp_wlock_upgraded;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, wlock_upgraded, CTLFLAG_RD,
    &tcp_wlock_upgraded, 0, "");

int	tcp_wlock_relocked;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, wlock_relocked, CTLFLAG_RD,
    &tcp_wlock_relocked, 0, "");

int	tcp_wlock_looped;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, wlock_looped, CTLFLAG_RD,
    &tcp_wlock_looped, 0, "");

#ifdef VIMAGE_GLOBALS
struct inpcbhead tcb;
struct inpcbinfo tcbinfo;
#endif
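
/*
 * tcb is the list head for the active TCP inpcbs; tcbinfo holds the locks
 * and hash tables protecting the TCP pcb database.  Under VIMAGE these are
 * virtualized per network stack and are accessed through the V_ macros
 * (e.g., V_tcbinfo) used throughout this file.
 */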
#define	tcb6	tcb  /* for KAME src sync over BSD*'s */

static void	 tcp_dooptions(struct tcpopt *, u_char *, int, int);
static void	 tcp_do_segment(struct mbuf *, struct tcphdr *,
		     struct socket *, struct tcpcb *, int, int, uint8_t,
		     int);
static void	 tcp_dropwithreset(struct mbuf *, struct tcphdr *,
		     struct tcpcb *, int, int);
static void	 tcp_pulloutofband(struct socket *,
		     struct tcphdr *, struct mbuf *, int);
static void	 tcp_xmit_timer(struct tcpcb *, int);
static void	 tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *);
static void inline
		 tcp_congestion_exp(struct tcpcb *);

static void inline
tcp_congestion_exp(struct tcpcb *tp)
{
	u_int win;

	win = min(tp->snd_wnd, tp->snd_cwnd) /
	    2 / tp->t_maxseg;
	if (win < 2)
		win = 2;
	tp->snd_ssthresh = win * tp->t_maxseg;
	ENTER_FASTRECOVERY(tp);
	tp->snd_recover = tp->snd_max;
	if (tp->t_flags & TF_ECN_PERMIT)
		tp->t_flags |= TF_ECN_SND_CWR;
}

/* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. */
#ifdef INET6
#define ND6_HINT(tp) \
do { \
	if ((tp) && (tp)->t_inpcb && \
	    ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0) \
		nd6_nud_hint(NULL, NULL, 0); \
} while (0)
#else
#define ND6_HINT(tp)
#endif

/*
 * Indicate whether this ack should be delayed.  We can delay the ack if
 *	- there is no delayed ack timer in progress and
 *	- our last ack wasn't a 0-sized window.  We never want to delay
 *	  the ack that opens up a 0-sized window and
 *		- delayed acks are enabled or
 *		- this is a half-synchronized T/TCP connection.
 */
#define DELAY_ACK(tp)							\
	((!tcp_timer_active(tp, TT_DELACK) &&				\
	    (tp->t_flags & TF_RXWIN0SENT) == 0) &&			\
	    (V_tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN)))

/*
 * TCP input handling is split into multiple parts:
 *   tcp6_input is a thin wrapper around tcp_input for the extended
 *	ip6_protox[] call format in ip6_input
 *   tcp_input handles primary segment validation, inpcb lookup and
 *	SYN processing on listen sockets
 *   tcp_do_segment processes the ACK and text of the segment for
 *	establishing, established and closing connections
 */
#ifdef INET6
int
tcp6_input(struct mbuf **mp, int *offp, int proto)
{
	INIT_VNET_INET6(curvnet);
	struct mbuf *m = *mp;
	struct in6_ifaddr *ia6;

	IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), IPPROTO_DONE);

	/*
	 * draft-itojun-ipv6-tcp-to-anycast
	 * better place to put this in?
	 */
	ia6 = ip6_getdstifaddr(m);
	if (ia6 && (ia6->ia6_flags & IN6_IFF_ANYCAST)) {
		struct ip6_hdr *ip6;

		ip6 = mtod(m, struct ip6_hdr *);
		icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR,
		    (caddr_t)&ip6->ip6_dst - (caddr_t)ip6);
		return IPPROTO_DONE;
	}

	tcp_input(m, *offp);
	return IPPROTO_DONE;
}
#endif

void
tcp_input(struct mbuf *m, int off0)
{
	INIT_VNET_INET(curvnet);
#ifdef INET6
	INIT_VNET_INET6(curvnet);
#endif
#ifdef IPSEC
	INIT_VNET_IPSEC(curvnet);
#endif
	struct tcphdr *th;
	struct ip *ip = NULL;
	struct ipovly *ipov;
	struct inpcb *inp = NULL;
	struct tcpcb *tp = NULL;
	struct socket *so = NULL;
	u_char *optp = NULL;
	int optlen = 0;
	int len, tlen, off;
	int drop_hdrlen;
	int thflags;
	int rstreason = 0;	/* For badport_bandlim accounting purposes */
	uint8_t iptos;
#ifdef IPFIREWALL_FORWARD
	struct m_tag *fwd_tag;
#endif
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
	int isipv6;
#else
	const void *ip6 = NULL;
	const int isipv6 = 0;
#endif
	struct tcpopt to;		/* options in this segment */
	char *s = NULL;			/* address and port logging */
	int ti_locked;
#define	TI_UNLOCKED	1
#define	TI_RLOCKED	2
#define	TI_WLOCKED	3

#ifdef TCPDEBUG
	/*
	 * The size of tcp_saveipgen must be the size of the max ip header,
	 * now IPv6.
	 */
	u_char tcp_saveipgen[IP6_HDR_LEN];
	struct tcphdr tcp_savetcp;
	short ostate = 0;
#endif

#ifdef INET6
	isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0;
#endif

	to.to_flags = 0;
	V_tcpstat.tcps_rcvtotal++;

	if (isipv6) {
#ifdef INET6
		/* IP6_EXTHDR_CHECK() is already done at tcp6_input(). */
		ip6 = mtod(m, struct ip6_hdr *);
		tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0;
		if (in6_cksum(m, IPPROTO_TCP, off0, tlen)) {
			V_tcpstat.tcps_rcvbadsum++;
			goto drop;
		}
		th = (struct tcphdr *)((caddr_t)ip6 + off0);

		/*
		 * Be proactive about unspecified IPv6 address in source.
		 * As we use all-zero to indicate unbounded/unconnected pcb,
		 * unspecified IPv6 address can be used to confuse us.
		 *
		 * Note that packets with an unspecified IPv6 destination
		 * are already dropped in ip6_input.
		 */
		if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
			/* XXX stat */
			goto drop;
		}
#else
		th = NULL;		/* XXX: Avoid compiler warning. */
#endif
	} else {
		/*
		 * Get IP and TCP header together in first mbuf.
		 * Note: IP leaves IP header in first mbuf.
		 */
		if (off0 > sizeof (struct ip)) {
			ip_stripoptions(m, (struct mbuf *)0);
			off0 = sizeof(struct ip);
		}
		if (m->m_len < sizeof (struct tcpiphdr)) {
			if ((m = m_pullup(m, sizeof (struct tcpiphdr)))
			    == NULL) {
				V_tcpstat.tcps_rcvshort++;
				return;
			}
		}
		ip = mtod(m, struct ip *);
		ipov = (struct ipovly *)ip;
		th = (struct tcphdr *)((caddr_t)ip + off0);
		tlen = ip->ip_len;

		if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
			if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
				th->th_sum = m->m_pkthdr.csum_data;
			else
				th->th_sum = in_pseudo(ip->ip_src.s_addr,
				    ip->ip_dst.s_addr,
				    htonl(m->m_pkthdr.csum_data +
					ip->ip_len +
					IPPROTO_TCP));
			th->th_sum ^= 0xffff;
#ifdef TCPDEBUG
			ipov->ih_len = (u_short)tlen;
			ipov->ih_len = htons(ipov->ih_len);
#endif
		} else {
			/*
			 * Checksum extended TCP header and data.
			 */
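			/*
			 * Software fallback: the IP header is reused as a
			 * struct ipovly to build the pseudo-header in place
			 * (the overlaid bytes are zeroed and the TCP length
			 * stored), then in_cksum() runs over pseudo-header
			 * plus segment.  A valid checksum leaves th_sum
			 * equal to zero, which is tested below.
			 */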
			len = sizeof (struct ip) + tlen;
			bzero(ipov->ih_x1, sizeof(ipov->ih_x1));
			ipov->ih_len = (u_short)tlen;
			ipov->ih_len = htons(ipov->ih_len);
			th->th_sum = in_cksum(m, len);
		}
		if (th->th_sum) {
			V_tcpstat.tcps_rcvbadsum++;
			goto drop;
		}
		/* Re-initialization for later version check */
		ip->ip_v = IPVERSION;
	}

#ifdef INET6
	if (isipv6)
		iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
	else
#endif
		iptos = ip->ip_tos;

	/*
	 * Check that TCP offset makes sense,
	 * pull out TCP options and adjust length.		XXX
	 */
	off = th->th_off << 2;
	if (off < sizeof (struct tcphdr) || off > tlen) {
		V_tcpstat.tcps_rcvbadoff++;
		goto drop;
	}
	tlen -= off;	/* tlen is used instead of ti->ti_len */
	if (off > sizeof (struct tcphdr)) {
		if (isipv6) {
#ifdef INET6
			IP6_EXTHDR_CHECK(m, off0, off, );
			ip6 = mtod(m, struct ip6_hdr *);
			th = (struct tcphdr *)((caddr_t)ip6 + off0);
#endif
		} else {
			if (m->m_len < sizeof(struct ip) + off) {
				if ((m = m_pullup(m, sizeof (struct ip) + off))
				    == NULL) {
					V_tcpstat.tcps_rcvshort++;
					return;
				}
				ip = mtod(m, struct ip *);
				ipov = (struct ipovly *)ip;
				th = (struct tcphdr *)((caddr_t)ip + off0);
			}
		}
		optlen = off - sizeof (struct tcphdr);
		optp = (u_char *)(th + 1);
	}
	thflags = th->th_flags;

	/*
	 * Convert TCP protocol specific fields to host format.
	 */
	th->th_seq = ntohl(th->th_seq);
	th->th_ack = ntohl(th->th_ack);
	th->th_win = ntohs(th->th_win);
	th->th_urp = ntohs(th->th_urp);

	/*
	 * Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options.
	 */
	drop_hdrlen = off0 + off;

	/*
	 * Locate pcb for segment, which requires a lock on tcbinfo.
	 * Optimistically acquire a global read lock rather than a write lock
	 * unless header flags necessarily imply a state change.  There are
	 * two cases where we might discover later we need a write lock
	 * despite the flags: ACKs moving a connection out of the syncache,
	 * and ACKs for a connection in TIMEWAIT.
	 */
	if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 ||
	    tcp_read_locking == 0) {
		INP_INFO_WLOCK(&V_tcbinfo);
		ti_locked = TI_WLOCKED;
		tcp_wlock_atfirst++;
	} else {
		INP_INFO_RLOCK(&V_tcbinfo);
		ti_locked = TI_RLOCKED;
		tcp_rlock_atfirst++;
	}

findpcb:
#ifdef INVARIANTS
	if (ti_locked == TI_RLOCKED)
		INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
	else if (ti_locked == TI_WLOCKED)
		INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
	else
		panic("%s: findpcb ti_locked %d\n", __func__, ti_locked);
#endif

#ifdef IPFIREWALL_FORWARD
	/*
	 * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain.
	 */
	fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL);

	if (fwd_tag != NULL && isipv6 == 0) {	/* IPv6 support is not yet */
		struct sockaddr_in *next_hop;

		next_hop = (struct sockaddr_in *)(fwd_tag+1);
		/*
		 * Transparently forwarded. Pretend to be the destination.
		 * already got one like this?
		 */
		inp = in_pcblookup_hash(&V_tcbinfo,
		    ip->ip_src, th->th_sport,
		    ip->ip_dst, th->th_dport,
		    0, m->m_pkthdr.rcvif);
		if (!inp) {
			/* It's new.  Try to find the ambushing socket. */
			inp = in_pcblookup_hash(&V_tcbinfo,
			    ip->ip_src, th->th_sport,
			    next_hop->sin_addr,
			    next_hop->sin_port ?
			    ntohs(next_hop->sin_port) :
			    th->th_dport,
			    INPLOOKUP_WILDCARD,
			    m->m_pkthdr.rcvif);
		}
		/* Remove the tag from the packet.  We don't need it anymore. */
		m_tag_delete(m, fwd_tag);
	} else
#endif /* IPFIREWALL_FORWARD */
	{
		if (isipv6) {
#ifdef INET6
			inp = in6_pcblookup_hash(&V_tcbinfo,
			    &ip6->ip6_src, th->th_sport,
			    &ip6->ip6_dst, th->th_dport,
			    INPLOOKUP_WILDCARD,
			    m->m_pkthdr.rcvif);
#endif
		} else
			inp = in_pcblookup_hash(&V_tcbinfo,
			    ip->ip_src, th->th_sport,
			    ip->ip_dst, th->th_dport,
			    INPLOOKUP_WILDCARD,
			    m->m_pkthdr.rcvif);
	}

	/*
	 * If the INPCB does not exist then all data in the incoming
	 * segment is discarded and an appropriate RST is sent back.
	 * XXX MRT Send RST using which routing table?
	 */
	if (inp == NULL) {
		/*
		 * Log communication attempts to ports that are not
		 * in use.
		 */
		if ((tcp_log_in_vain == 1 && (thflags & TH_SYN)) ||
		    tcp_log_in_vain == 2) {
			if ((s = tcp_log_addrs(NULL, th, (void *)ip, ip6)))
				log(LOG_INFO, "%s; %s: Connection attempt "
				    "to closed port\n", s, __func__);
		}
		/*
		 * When blackholing do not respond with a RST but
		 * completely ignore the segment and drop it.
		 */
		if ((V_blackhole == 1 && (thflags & TH_SYN)) ||
		    V_blackhole == 2)
			goto dropunlock;

		rstreason = BANDLIM_RST_CLOSEDPORT;
		goto dropwithreset;
	}
	INP_WLOCK(inp);

#ifdef IPSEC
#ifdef INET6
	if (isipv6 && ipsec6_in_reject(m, inp)) {
		V_ipsec6stat.in_polvio++;
		goto dropunlock;
	} else
#endif /* INET6 */
	if (ipsec4_in_reject(m, inp) != 0) {
		V_ipsec4stat.in_polvio++;
		goto dropunlock;
	}
#endif /* IPSEC */

	/*
	 * Check the minimum TTL for socket.
	 */
	if (inp->inp_ip_minttl != 0) {
#ifdef INET6
		if (isipv6 && inp->inp_ip_minttl > ip6->ip6_hlim)
			goto dropunlock;
		else
#endif
		if (inp->inp_ip_minttl > ip->ip_ttl)
			goto dropunlock;
	}

	/*
	 * A previous connection in TIMEWAIT state is supposed to catch stray
	 * or duplicate segments arriving late.  If this segment was a
	 * legitimate new connection attempt the old INPCB gets removed and
	 * we can try again to find a listening socket.
	 *
	 * At this point, due to earlier optimism, we may hold a read lock on
	 * the inpcbinfo, rather than a write lock.  If so, we need to
	 * upgrade, or if that fails, acquire a reference on the inpcb, drop
	 * all locks, acquire a global write lock, and then re-acquire the
	 * inpcb lock.  We may at that point discover that another thread has
	 * tried to free the inpcb, in which case we need to loop back and
	 * try to find a new inpcb to deliver to.
	 */
	if (inp->inp_vflag & INP_TIMEWAIT) {
		KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED,
		    ("%s: INP_TIMEWAIT ti_locked %d", __func__, ti_locked));

		if (ti_locked == TI_RLOCKED) {
			if (rw_try_upgrade(&V_tcbinfo.ipi_lock) == 0) {
				in_pcbref(inp);
				INP_WUNLOCK(inp);
				INP_INFO_RUNLOCK(&V_tcbinfo);
				INP_INFO_WLOCK(&V_tcbinfo);
				ti_locked = TI_WLOCKED;
				INP_WLOCK(inp);
				if (in_pcbrele(inp)) {
					tcp_wlock_looped++;
					inp = NULL;
					goto findpcb;
				}
				tcp_wlock_relocked++;
			} else {
				ti_locked = TI_WLOCKED;
				tcp_wlock_upgraded++;
			}
		}
		INP_INFO_WLOCK_ASSERT(&V_tcbinfo);

		if (thflags & TH_SYN)
			tcp_dooptions(&to, optp, optlen, TO_SYN);
		/*
		 * NB: tcp_twcheck unlocks the INP and frees the mbuf.
		 */
		if (tcp_twcheck(inp, &to, th, m, tlen))
			goto findpcb;
		INP_INFO_WUNLOCK(&V_tcbinfo);
		return;
	}
	/*
	 * The TCPCB may no longer exist if the connection is winding
	 * down or it is in the CLOSED state.  Either way we drop the
	 * segment and send an appropriate response.
	 */
	tp = intotcpcb(inp);
	if (tp == NULL || tp->t_state == TCPS_CLOSED) {
		rstreason = BANDLIM_RST_CLOSEDPORT;
		goto dropwithreset;
	}

	/*
	 * We've identified a valid inpcb, but it could be that we need an
	 * inpcbinfo write lock and have only a read lock.  In this case,
	 * attempt to upgrade/relock using the same strategy as the TIMEWAIT
	 * case above.
	 */
	if (tp->t_state != TCPS_ESTABLISHED ||
	    (thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 ||
	    tcp_read_locking == 0) {
		KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED,
		    ("%s: upgrade check ti_locked %d", __func__, ti_locked));

		if (ti_locked == TI_RLOCKED) {
			if (rw_try_upgrade(&V_tcbinfo.ipi_lock) == 0) {
				in_pcbref(inp);
				INP_WUNLOCK(inp);
				INP_INFO_RUNLOCK(&V_tcbinfo);
				INP_INFO_WLOCK(&V_tcbinfo);
				ti_locked = TI_WLOCKED;
				INP_WLOCK(inp);
				if (in_pcbrele(inp)) {
					tcp_wlock_looped++;
					inp = NULL;
					goto findpcb;
				}
				tcp_wlock_relocked++;
			} else {
				ti_locked = TI_WLOCKED;
				tcp_wlock_upgraded++;
			}
		}
		INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
	}

#ifdef MAC
	INP_WLOCK_ASSERT(inp);
	if (mac_inpcb_check_deliver(inp, m))
		goto dropunlock;
#endif
	so = inp->inp_socket;
	KASSERT(so != NULL, ("%s: so == NULL", __func__));
#ifdef TCPDEBUG
	if (so->so_options & SO_DEBUG) {
		ostate = tp->t_state;
		if (isipv6) {
#ifdef INET6
			bcopy((char *)ip6, (char *)tcp_saveipgen, sizeof(*ip6));
#endif
		} else
			bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip));
		tcp_savetcp = *th;
	}
#endif
	/*
	 * When the socket is accepting connections (the INPCB is in LISTEN
	 * state) we look into the SYN cache if this is a new connection
	 * attempt or the completion of a previous one.
	 */
	if (so->so_options & SO_ACCEPTCONN) {
		struct in_conninfo inc;

		KASSERT(tp->t_state == TCPS_LISTEN, ("%s: so accepting but "
		    "tp not listening", __func__));

		bzero(&inc, sizeof(inc));
		inc.inc_isipv6 = isipv6;
#ifdef INET6
		if (isipv6) {
			inc.inc6_faddr = ip6->ip6_src;
			inc.inc6_laddr = ip6->ip6_dst;
		} else
#endif
		{
			inc.inc_faddr = ip->ip_src;
			inc.inc_laddr = ip->ip_dst;
		}
		inc.inc_fport = th->th_sport;
		inc.inc_lport = th->th_dport;

		/*
		 * Check for an existing connection attempt in syncache if
		 * the flag is only ACK.  A successful lookup creates a new
		 * socket appended to the listen queue in SYN_RECEIVED state.
		 */
		if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) {
			/*
			 * Parse the TCP options here because
			 * syncookies need access to the reflected
			 * timestamp.
			 */
			tcp_dooptions(&to, optp, optlen, 0);
			/*
			 * NB: syncache_expand() doesn't unlock
			 * inp and tcpinfo locks.
			 */
			if (!syncache_expand(&inc, &to, th, &so, m)) {
				/*
				 * No syncache entry or ACK was not
				 * for our SYN/ACK.  Send a RST.
				 * NB: syncache did its own logging
				 * of the failure cause.
				 */
				rstreason = BANDLIM_RST_OPENPORT;
				goto dropwithreset;
			}
			if (so == NULL) {
				/*
				 * We completed the 3-way handshake
				 * but could not allocate a socket
				 * either due to memory shortage,
				 * listen queue length limits or
				 * global socket limits.  Send RST
				 * or wait and have the remote end
				 * retransmit the ACK for another
				 * try.
				 */
				if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
					log(LOG_DEBUG, "%s; %s: Listen socket: "
					    "Socket allocation failed due to "
					    "limits or memory shortage, %s\n",
					    s, __func__,
					    V_tcp_sc_rst_sock_fail ?
					    "sending RST" : "try again");
				if (V_tcp_sc_rst_sock_fail) {
					rstreason = BANDLIM_UNLIMITED;
					goto dropwithreset;
				} else
					goto dropunlock;
			}
			/*
			 * Socket is created in state SYN_RECEIVED.
			 * Unlock the listen socket, lock the newly
			 * created socket and update the tp variable.
			 */
			INP_WUNLOCK(inp);	/* listen socket */
			inp = sotoinpcb(so);
			INP_WLOCK(inp);		/* new connection */
			tp = intotcpcb(inp);
			KASSERT(tp->t_state == TCPS_SYN_RECEIVED,
			    ("%s: ", __func__));
			/*
			 * Process the segment and the data it
			 * contains.  tcp_do_segment() consumes
			 * the mbuf chain and unlocks the inpcb.
			 */
			tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen,
			    iptos, ti_locked);
			INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
			return;
		}
		/*
		 * Segment flag validation for new connection attempts:
		 *
		 * Our (SYN|ACK) response was rejected.
		 * Check with syncache and remove entry to prevent
		 * retransmits.
		 *
		 * NB: syncache_chkrst does its own logging of failure
		 * causes.
		 */
		if (thflags & TH_RST) {
			syncache_chkrst(&inc, th);
			goto dropunlock;
		}
		/*
		 * We can't do anything without SYN.
		 */
		if ((thflags & TH_SYN) == 0) {
			if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
				log(LOG_DEBUG, "%s; %s: Listen socket: "
				    "SYN is missing, segment ignored\n",
				    s, __func__);
			V_tcpstat.tcps_badsyn++;
			goto dropunlock;
		}
		/*
		 * (SYN|ACK) is bogus on a listen socket.
		 */
		if (thflags & TH_ACK) {
			if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
				log(LOG_DEBUG, "%s; %s: Listen socket: "
				    "SYN|ACK invalid, segment rejected\n",
				    s, __func__);
			syncache_badack(&inc);	/* XXX: Not needed! */
			V_tcpstat.tcps_badsyn++;
			rstreason = BANDLIM_RST_OPENPORT;
			goto dropwithreset;
		}
		/*
		 * If the drop_synfin option is enabled, drop all
		 * segments with both the SYN and FIN bits set.
		 * This prevents e.g. nmap from identifying the
		 * TCP/IP stack.
		 * XXX: Poor reasoning.  nmap has other methods
		 * and is constantly refining its stack detection
		 * strategies.
		 * XXX: This is a violation of the TCP specification
		 * and was used by RFC1644.
		 */
		if ((thflags & TH_FIN) && V_drop_synfin) {
			if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
				log(LOG_DEBUG, "%s; %s: Listen socket: "
				    "SYN|FIN segment ignored (based on "
				    "sysctl setting)\n", s, __func__);
			V_tcpstat.tcps_badsyn++;
			goto dropunlock;
		}
		/*
		 * Segment's flags are (SYN) or (SYN|FIN).
		 *
		 * TH_PUSH, TH_URG, TH_ECE, TH_CWR are ignored
		 * as they do not affect the state of the TCP FSM.
		 * The data pointed to by TH_URG and th_urp is ignored.
		 */
		KASSERT((thflags & (TH_RST|TH_ACK)) == 0,
		    ("%s: Listen socket: TH_RST or TH_ACK set", __func__));
		KASSERT(thflags & (TH_SYN),
		    ("%s: Listen socket: TH_SYN not set", __func__));
#ifdef INET6
		/*
		 * If deprecated address is forbidden,
		 * we do not accept SYN to deprecated interface
		 * address to prevent any new inbound connection from
		 * getting established.
		 * When we do not accept SYN, we send a TCP RST,
		 * with deprecated source address (instead of dropping
		 * it).  We compromise it as it is much better for peer
		 * to send a RST, and RST will be the final packet
		 * for the exchange.
		 *
		 * If we do not forbid deprecated addresses, we accept
		 * the SYN packet.  RFC2462 does not suggest dropping
		 * SYN in this case.
		 * If we decipher RFC2462 5.5.4, it says like this:
		 * 1. use of deprecated addr with existing
		 *    communication is okay - "SHOULD continue to be
		 *    used"
		 * 2. use of it with new communication:
		 *   (2a) "SHOULD NOT be used if alternate address
		 *        with sufficient scope is available"
		 *   (2b) nothing mentioned otherwise.
		 * Here we fall into (2b) case as we have no choice in
		 * our source address selection - we must obey the peer.
		 *
		 * The wording in RFC2462 is confusing, and there are
		 * multiple descriptions of deprecated address
		 * handling - worse, they are not exactly the same.
		 * I believe 5.5.4 is the best one, so we follow 5.5.4.
		 */
		if (isipv6 && !V_ip6_use_deprecated) {
			struct in6_ifaddr *ia6;

			if ((ia6 = ip6_getdstifaddr(m)) &&
			    (ia6->ia6_flags & IN6_IFF_DEPRECATED)) {
				if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
				    log(LOG_DEBUG, "%s; %s: Listen socket: "
					"Connection attempt to deprecated "
					"IPv6 address rejected\n",
					s, __func__);
				rstreason = BANDLIM_RST_OPENPORT;
				goto dropwithreset;
			}
		}
#endif
		/*
		 * Basic sanity checks on incoming SYN requests:
		 *   Don't respond if the destination is a link layer
		 *	broadcast according to RFC1122 4.2.3.10, p. 104.
		 *   If it is from this socket it must be forged.
		 *   Don't respond if the source or destination is a
		 *	global or subnet broad- or multicast address.
		 *   Note that it is quite possible to receive unicast
		 *	link-layer packets with a broadcast IP address.  Use
		 *	in_broadcast() to find them.
		 */
		if (m->m_flags & (M_BCAST|M_MCAST)) {
			if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
			    log(LOG_DEBUG, "%s; %s: Listen socket: "
				"Connection attempt from broad- or multicast "
				"link layer address ignored\n", s, __func__);
			goto dropunlock;
		}
		if (isipv6) {
#ifdef INET6
			if (th->th_dport == th->th_sport &&
			    IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) {
				if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
				    log(LOG_DEBUG, "%s; %s: Listen socket: "
					"Connection attempt to/from self "
					"ignored\n", s, __func__);
				goto dropunlock;
			}
			if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
			    IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) {
				if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
				    log(LOG_DEBUG, "%s; %s: Listen socket: "
					"Connection attempt from/to multicast "
					"address ignored\n", s, __func__);
				goto dropunlock;
			}
#endif
		} else {
			if (th->th_dport == th->th_sport &&
			    ip->ip_dst.s_addr == ip->ip_src.s_addr) {
				if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
				    log(LOG_DEBUG, "%s; %s: Listen socket: "
					"Connection attempt from/to self "
					"ignored\n", s, __func__);
				goto dropunlock;
			}
			if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
			    IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
			    ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
			    in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) {
				if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
				    log(LOG_DEBUG, "%s; %s: Listen socket: "
					"Connection attempt from/to broad- "
					"or multicast address ignored\n",
					s, __func__);
				goto dropunlock;
			}
		}
		/*
		 * SYN appears to be valid.  Create compressed TCP state
		 * for syncache.
		 */
#ifdef TCPDEBUG
		if (so->so_options & SO_DEBUG)
			tcp_trace(TA_INPUT, ostate, tp,
			    (void *)tcp_saveipgen, &tcp_savetcp, 0);
#endif
		tcp_dooptions(&to, optp, optlen, TO_SYN);
		syncache_add(&inc, &to, th, inp, &so, m);
		/*
		 * Entry added to syncache and mbuf consumed.
		 * Everything already unlocked by syncache_add().
		 */
		INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
		return;
	}

	/*
	 * Segment belongs to a connection in SYN_SENT, ESTABLISHED or later
	 * state.  tcp_do_segment() always consumes the mbuf chain, unlocks
	 * the inpcb, and unlocks pcbinfo.
	 */
	tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos, ti_locked);
	INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
	return;

dropwithreset:
	if (ti_locked == TI_RLOCKED)
		INP_INFO_RUNLOCK(&V_tcbinfo);
	else if (ti_locked == TI_WLOCKED)
		INP_INFO_WUNLOCK(&V_tcbinfo);
	else
		panic("%s: dropwithreset ti_locked %d", __func__, ti_locked);
	ti_locked = TI_UNLOCKED;

	if (inp != NULL) {
		tcp_dropwithreset(m, th, tp, tlen, rstreason);
		INP_WUNLOCK(inp);
	} else
		tcp_dropwithreset(m, th, NULL, tlen, rstreason);
	m = NULL;	/* mbuf chain got consumed. */
	goto drop;

dropunlock:
	if (ti_locked == TI_RLOCKED)
		INP_INFO_RUNLOCK(&V_tcbinfo);
	else if (ti_locked == TI_WLOCKED)
		INP_INFO_WUNLOCK(&V_tcbinfo);
	else
		panic("%s: dropunlock ti_locked %d", __func__, ti_locked);
	ti_locked = TI_UNLOCKED;

	if (inp != NULL)
		INP_WUNLOCK(inp);

drop:
	INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
	if (s != NULL)
		free(s, M_TCPLOG);
	if (m != NULL)
		m_freem(m);
}

static void
tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
    struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos,
    int ti_locked)
{
	INIT_VNET_INET(tp->t_vnet);
	int thflags, acked, ourfinisacked, needoutput = 0;
	int rstreason, todrop, win;
	u_long tiwin;
	struct tcpopt to;

#ifdef TCPDEBUG
	/*
	 * The size of tcp_saveipgen must be the size of the max ip header,
	 * now IPv6.
	 */
	u_char tcp_saveipgen[IP6_HDR_LEN];
	struct tcphdr tcp_savetcp;
	short ostate = 0;
#endif
	thflags = th->th_flags;

	/*
	 * If this is either a state-changing packet or current state isn't
	 * established, we require a write lock on tcbinfo.  Otherwise, we
	 * allow either a read lock or a write lock, as we may have acquired
	 * a write lock due to a race.
	 *
	 * Require a global write lock for SYN/FIN/RST segments or
	 * non-established connections; otherwise accept either a read or
	 * write lock, as we may have conservatively acquired a write lock in
	 * certain cases in tcp_input() (is this still true?).  Currently we
	 * will never enter with no lock, so we try to drop it quickly in the
	 * common pure ack/pure data cases.
	 */
	if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 ||
	    tp->t_state != TCPS_ESTABLISHED) {
		KASSERT(ti_locked == TI_WLOCKED, ("%s ti_locked %d for "
		    "SYN/FIN/RST/!EST", __func__, ti_locked));
		INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
	} else {
#ifdef INVARIANTS
		if (ti_locked == TI_RLOCKED)
			INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
		else if (ti_locked == TI_WLOCKED)
			INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
		else
			panic("%s: ti_locked %d for EST", __func__,
			    ti_locked);
#endif
	}
	INP_WLOCK_ASSERT(tp->t_inpcb);
	KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
	    __func__));
	KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
	    __func__));

	/*
	 * Segment received on connection.
	 * Reset idle time and keep-alive timer.
	 * XXX: This should be done after segment
	 * validation to ignore broken/spoofed segs.
	 */
	tp->t_rcvtime = ticks;
	if (TCPS_HAVEESTABLISHED(tp->t_state))
		tcp_timer_activate(tp, TT_KEEP, tcp_keepidle);

	/*
	 * Unscale the window into a 32-bit value.
	 * For the SYN_SENT state the scale is zero.
	 */
	tiwin = th->th_win << tp->snd_scale;

	/*
	 * TCP ECN processing.
	 */
	if (tp->t_flags & TF_ECN_PERMIT) {
		switch (iptos & IPTOS_ECN_MASK) {
		case IPTOS_ECN_CE:
			tp->t_flags |= TF_ECN_SND_ECE;
			V_tcpstat.tcps_ecn_ce++;
			break;
		case IPTOS_ECN_ECT0:
			V_tcpstat.tcps_ecn_ect0++;
			break;
		case IPTOS_ECN_ECT1:
			V_tcpstat.tcps_ecn_ect1++;
			break;
		}

		if (thflags & TH_CWR)
			tp->t_flags &= ~TF_ECN_SND_ECE;

		/*
		 * Congestion experienced.
		 * Ignore if we are already trying to recover.
		 */
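		/*
		 * tcp_congestion_exp() applies the same multiplicative
		 * decrease as a packet loss: ssthresh drops to half of
		 * min(snd_wnd, snd_cwnd) and fast recovery is entered,
		 * which is the intended RFC 3168 reaction to ECE.
		 */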
		if ((thflags & TH_ECE) &&
		    SEQ_LEQ(th->th_ack, tp->snd_recover)) {
			V_tcpstat.tcps_ecn_rcwnd++;
			tcp_congestion_exp(tp);
		}
	}

	/*
	 * Parse options on any incoming segment.
	 */
	tcp_dooptions(&to, (u_char *)(th + 1),
	    (th->th_off << 2) - sizeof(struct tcphdr),
	    (thflags & TH_SYN) ? TO_SYN : 0);

	/*
	 * If echoed timestamp is later than the current time,
	 * fall back to non RFC1323 RTT calculation.  Normalize
	 * timestamp if syncookies were used when this connection
	 * was established.
	 */
	if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
		to.to_tsecr -= tp->ts_offset;
		if (TSTMP_GT(to.to_tsecr, ticks))
			to.to_tsecr = 0;
	}

	/*
	 * Process options only when we get SYN/ACK back. The SYN case
	 * for incoming connections is handled in tcp_syncache.
	 * According to RFC1323 the window field in a SYN (i.e., a <SYN>
	 * or <SYN,ACK>) segment itself is never scaled.
	 * XXX this is traditional behavior, may need to be cleaned up.
	 */
	if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
		if ((to.to_flags & TOF_SCALE) &&
		    (tp->t_flags & TF_REQ_SCALE)) {
			tp->t_flags |= TF_RCVD_SCALE;
			tp->snd_scale = to.to_wscale;
		}
		/*
		 * Initial send window.  It will be updated with
		 * the next incoming segment to the scaled value.
		 */
		tp->snd_wnd = th->th_win;
		if (to.to_flags & TOF_TS) {
			tp->t_flags |= TF_RCVD_TSTMP;
			tp->ts_recent = to.to_tsval;
			tp->ts_recent_age = ticks;
		}
		if (to.to_flags & TOF_MSS)
			tcp_mss(tp, to.to_mss);
		if ((tp->t_flags & TF_SACK_PERMIT) &&
		    (to.to_flags & TOF_SACKPERM) == 0)
			tp->t_flags &= ~TF_SACK_PERMIT;
	}

	/*
	 * Header prediction: check for the two common cases
	 * of a uni-directional data xfer.  If the packet has
	 * no control flags, is in-sequence, the window didn't
	 * change and we're not retransmitting, it's a
	 * candidate.  If the length is zero and the ack moved
	 * forward, we're the sender side of the xfer.  Just
	 * free the data acked & wake any higher level process
	 * that was blocked waiting for space.  If the length
	 * is non-zero and the ack didn't move, we're the
	 * receiver side.  If we're getting packets in-order
	 * (the reassembly queue is empty), add the data to
	 * the socket buffer and note that we need a delayed ack.
	 * Make sure that the hidden state-flags are also off.
	 * Since we check for TCPS_ESTABLISHED first, it can only
	 * be TH_NEEDSYN.
	 */
	if (tp->t_state == TCPS_ESTABLISHED &&
	    th->th_seq == tp->rcv_nxt &&
	    (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
	    tp->snd_nxt == tp->snd_max &&
	    tiwin && tiwin == tp->snd_wnd &&
	    ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
	    LIST_EMPTY(&tp->t_segq) &&
	    ((to.to_flags & TOF_TS) == 0 ||
	     TSTMP_GEQ(to.to_tsval, tp->ts_recent)) ) {

		/*
		 * If last ACK falls within this segment's sequence numbers,
		 * record the timestamp.
		 * NOTE that the test is modified according to the latest
		 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
		 */
		if ((to.to_flags & TOF_TS) != 0 &&
		    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
			tp->ts_recent_age = ticks;
			tp->ts_recent = to.to_tsval;
		}

		if (tlen == 0) {
			if (SEQ_GT(th->th_ack, tp->snd_una) &&
			    SEQ_LEQ(th->th_ack, tp->snd_max) &&
			    tp->snd_cwnd >= tp->snd_wnd &&
			    ((!V_tcp_do_newreno &&
			      !(tp->t_flags & TF_SACK_PERMIT) &&
			      tp->t_dupacks < tcprexmtthresh) ||
			     ((V_tcp_do_newreno ||
			       (tp->t_flags & TF_SACK_PERMIT)) &&
			      !IN_FASTRECOVERY(tp) &&
			      (to.to_flags & TOF_SACK) == 0 &&
			      TAILQ_EMPTY(&tp->snd_holes)))) {
				/*
				 * This is a pure ack for outstanding data.
				 */
				if (ti_locked == TI_RLOCKED)
					INP_INFO_RUNLOCK(&V_tcbinfo);
				else if (ti_locked == TI_WLOCKED)
					INP_INFO_WUNLOCK(&V_tcbinfo);
				else
					panic("%s: ti_locked %d on pure ACK",
					    __func__, ti_locked);
				ti_locked = TI_UNLOCKED;

				++V_tcpstat.tcps_predack;

				/*
				 * "bad retransmit" recovery.
				 */
				if (tp->t_rxtshift == 1 &&
				    ticks < tp->t_badrxtwin) {
					++V_tcpstat.tcps_sndrexmitbad;
					tp->snd_cwnd = tp->snd_cwnd_prev;
					tp->snd_ssthresh =
					    tp->snd_ssthresh_prev;
					tp->snd_recover = tp->snd_recover_prev;
					if (tp->t_flags & TF_WASFRECOVERY)
						ENTER_FASTRECOVERY(tp);
					tp->snd_nxt = tp->snd_max;
					tp->t_badrxtwin = 0;
				}

				/*
				 * Recalculate the transmit timer / rtt.
				 *
				 * Some boxes send broken timestamp replies
				 * during the SYN+ACK phase, ignore
				 * timestamps of 0 or we could calculate a
				 * huge RTT and blow up the retransmit timer.
				 */
				if ((to.to_flags & TOF_TS) != 0 &&
				    to.to_tsecr) {
					if (!tp->t_rttlow ||
					    tp->t_rttlow > ticks - to.to_tsecr)
						tp->t_rttlow = ticks - to.to_tsecr;
					tcp_xmit_timer(tp,
					    ticks - to.to_tsecr + 1);
				} else if (tp->t_rtttime &&
				    SEQ_GT(th->th_ack, tp->t_rtseq)) {
					if (!tp->t_rttlow ||
					    tp->t_rttlow > ticks - tp->t_rtttime)
						tp->t_rttlow = ticks - tp->t_rtttime;
					tcp_xmit_timer(tp,
					    ticks - tp->t_rtttime);
				}
				tcp_xmit_bandwidth_limit(tp, th->th_ack);
				acked = th->th_ack - tp->snd_una;
				V_tcpstat.tcps_rcvackpack++;
				V_tcpstat.tcps_rcvackbyte += acked;
				sbdrop(&so->so_snd, acked);
				if (SEQ_GT(tp->snd_una, tp->snd_recover) &&
				    SEQ_LEQ(th->th_ack, tp->snd_recover))
					tp->snd_recover = th->th_ack - 1;
				tp->snd_una = th->th_ack;
				/*
				 * Pull snd_wl2 up to prevent seq wrap relative
				 * to th_ack.
				 */
				tp->snd_wl2 = th->th_ack;
				tp->t_dupacks = 0;
				m_freem(m);
				ND6_HINT(tp); /* Some progress has been made. */

				/*
				 * If all outstanding data are acked, stop
				 * retransmit timer, otherwise restart timer
				 * using current (possibly backed-off) value.
				 * If process is waiting for space,
				 * wakeup/selwakeup/signal.  If data
				 * are ready to send, let tcp_output
				 * decide between more output or persist.
				 */
#ifdef TCPDEBUG
				if (so->so_options & SO_DEBUG)
					tcp_trace(TA_INPUT, ostate, tp,
					    (void *)tcp_saveipgen,
					    &tcp_savetcp, 0);
#endif
				if (tp->snd_una == tp->snd_max)
					tcp_timer_activate(tp, TT_REXMT, 0);
				else if (!tcp_timer_active(tp, TT_PERSIST))
					tcp_timer_activate(tp, TT_REXMT,
					    tp->t_rxtcur);
				sowwakeup(so);
				if (so->so_snd.sb_cc)
					(void) tcp_output(tp);
				goto check_delack;
			}
		} else if (th->th_ack == tp->snd_una &&
		    tlen <= sbspace(&so->so_rcv)) {
			int newsize = 0;	/* automatic sockbuf scaling */

			/*
			 * This is a pure, in-sequence data packet with
			 * nothing on the reassembly queue and we have enough
			 * buffer space to take it.
			 */
			if (ti_locked == TI_RLOCKED)
				INP_INFO_RUNLOCK(&V_tcbinfo);
			else if (ti_locked == TI_WLOCKED)
				INP_INFO_WUNLOCK(&V_tcbinfo);
			else
				panic("%s: ti_locked %d on pure data "
				    "segment", __func__, ti_locked);
			ti_locked = TI_UNLOCKED;

			/* Clean receiver SACK report if present */
			if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks)
				tcp_clean_sackreport(tp);
			++V_tcpstat.tcps_preddat;
			tp->rcv_nxt += tlen;
			/*
			 * Pull snd_wl1 up to prevent seq wrap relative to
			 * th_seq.
			 */
			tp->snd_wl1 = th->th_seq;
			/*
			 * Pull rcv_up up to prevent seq wrap relative to
			 * rcv_nxt.
			 */
			tp->rcv_up = tp->rcv_nxt;
			V_tcpstat.tcps_rcvpack++;
			V_tcpstat.tcps_rcvbyte += tlen;
			ND6_HINT(tp);	/* Some progress has been made */
#ifdef TCPDEBUG
			if (so->so_options & SO_DEBUG)
				tcp_trace(TA_INPUT, ostate, tp,
				    (void *)tcp_saveipgen, &tcp_savetcp, 0);
#endif
			/*
			 * Automatic sizing of receive socket buffer.  Often the send
			 * buffer size is not optimally adjusted to the actual network
			 * conditions at hand (delay bandwidth product).  Setting the
			 * buffer size too small limits throughput on links with high
			 * bandwidth and high delay (e.g. trans-continental/oceanic links).
			 *
			 * On the receive side the socket buffer memory is only rarely
			 * used to any significant extent.  This allows us to be much
			 * more aggressive in scaling the receive socket buffer.  For
			 * the case that the buffer space is actually used to a large
			 * extent and we run out of kernel memory we can simply drop
			 * the new segments; TCP on the sender will just retransmit them
			 * later.  Setting the buffer size too big may only consume too
			 * much kernel memory if the application doesn't read() from
			 * the socket or packet loss or reordering makes use of the
			 * reassembly queue.
			 *
			 * The criteria to step up the receive buffer one notch are:
			 *  1. the number of bytes received during the time it takes
			 *     one timestamp to be reflected back to us (the RTT);
			 *  2. received bytes per RTT is within seven eighth of the
			 *     current socket buffer size;
			 *  3. receive buffer size has not hit maximal automatic size;
			 *
			 * This algorithm does one step per RTT at most and only if
			 * we receive a bulk stream w/o packet losses or reorderings.
			 * Shrinking the buffer during idle times is not necessary as
			 * it doesn't consume any memory when idle.
			 *
			 * TODO: Only step up if the application is actually serving
			 * the buffer to better manage the socket buffer resources.
			 */
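			/*
			 * Illustrative numbers: with sb_hiwat at 64kB, more
			 * than 56kB (7/8 of the buffer) arriving within one
			 * reflected-timestamp window grows the buffer by
			 * tcp_autorcvbuf_inc, capped at tcp_autorcvbuf_max.
			 */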
			if (V_tcp_do_autorcvbuf &&
			    to.to_tsecr &&
			    (so->so_rcv.sb_flags & SB_AUTOSIZE)) {
				if (to.to_tsecr > tp->rfbuf_ts &&
				    to.to_tsecr - tp->rfbuf_ts < hz) {
					if (tp->rfbuf_cnt >
					    (so->so_rcv.sb_hiwat / 8 * 7) &&
					    so->so_rcv.sb_hiwat <
					    V_tcp_autorcvbuf_max) {
						newsize =
						    min(so->so_rcv.sb_hiwat +
						    V_tcp_autorcvbuf_inc,
						    V_tcp_autorcvbuf_max);
					}
					/* Start over with next RTT. */
					tp->rfbuf_ts = 0;
					tp->rfbuf_cnt = 0;
				} else
					tp->rfbuf_cnt += tlen;	/* add up */
			}

			/* Add data to socket buffer. */
			SOCKBUF_LOCK(&so->so_rcv);
			if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
				m_freem(m);
			} else {
				/*
				 * Set new socket buffer size.
				 * Give up when limit is reached.
				 */
				if (newsize)
					if (!sbreserve_locked(&so->so_rcv,
					    newsize, so, NULL))
						so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
				m_adj(m, drop_hdrlen);	/* delayed header drop */
				sbappendstream_locked(&so->so_rcv, m);
			}
			/* NB: sorwakeup_locked() does an implicit unlock. */
			sorwakeup_locked(so);
			if (DELAY_ACK(tp)) {
				tp->t_flags |= TF_DELACK;
			} else {
				tp->t_flags |= TF_ACKNOW;
				tcp_output(tp);
			}
			goto check_delack;
		}
	}

	/*
	 * Calculate amount of space in receive window,
	 * and then do TCP input processing.
	 * Receive window is amount of space in rcv queue,
	 * but not less than advertised window.
	 */
	win = sbspace(&so->so_rcv);
	if (win < 0)
		win = 0;
	tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));

	/* Reset receive buffer auto scaling when not in bulk receive mode. */
	tp->rfbuf_ts = 0;
	tp->rfbuf_cnt = 0;

	switch (tp->t_state) {

	/*
	 * If the state is SYN_RECEIVED:
	 *	if seg contains an ACK, but not for our SYN/ACK, send a RST.
	 */
	case TCPS_SYN_RECEIVED:
		if ((thflags & TH_ACK) &&
		    (SEQ_LEQ(th->th_ack, tp->snd_una) ||
		     SEQ_GT(th->th_ack, tp->snd_max))) {
			rstreason = BANDLIM_RST_OPENPORT;
			goto dropwithreset;
		}
		break;

	/*
	 * If the state is SYN_SENT:
	 *	if seg contains an ACK, but not for our SYN, drop the input.
	 *	if seg contains a RST, then drop the connection.
	 *	if seg does not contain SYN, then drop it.
	 * Otherwise this is an acceptable SYN segment
	 *	initialize tp->rcv_nxt and tp->irs
	 *	if seg contains ack then advance tp->snd_una
	 *	if seg contains an ECE and ECN support is enabled, the stream
	 *	    is ECN capable.
	 *	if SYN has been acked change to ESTABLISHED else SYN_RCVD state
	 *	arrange for segment to be acked (eventually)
	 *	continue processing rest of data/controls, beginning with URG
	 */
	case TCPS_SYN_SENT:
		if ((thflags & TH_ACK) &&
		    (SEQ_LEQ(th->th_ack, tp->iss) ||
		     SEQ_GT(th->th_ack, tp->snd_max))) {
			rstreason = BANDLIM_UNLIMITED;
			goto dropwithreset;
		}
		if ((thflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST))
			tp = tcp_drop(tp, ECONNREFUSED);
		if (thflags & TH_RST)
			goto drop;
		if (!(thflags & TH_SYN))
			goto drop;

		tp->irs = th->th_seq;
		tcp_rcvseqinit(tp);
		if (thflags & TH_ACK) {
			V_tcpstat.tcps_connects++;
			soisconnected(so);
#ifdef MAC
			SOCK_LOCK(so);
			mac_socketpeer_set_from_mbuf(m, so);
			SOCK_UNLOCK(so);
#endif
			/* Do window scaling on this connection? */
			if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
			    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
				tp->rcv_scale = tp->request_r_scale;
			}
			tp->rcv_adv += tp->rcv_wnd;
			tp->snd_una++;		/* SYN is acked */
			/*
			 * If there's data, delay ACK; if there's also a FIN
			 * ACKNOW will be turned on later.
			 */
			if (DELAY_ACK(tp) && tlen != 0)
				tcp_timer_activate(tp, TT_DELACK,
				    tcp_delacktime);
			else
				tp->t_flags |= TF_ACKNOW;

			if ((thflags & TH_ECE) && V_tcp_do_ecn) {
				tp->t_flags |= TF_ECN_PERMIT;
				V_tcpstat.tcps_ecn_shs++;
			}

			/*
			 * Received <SYN,ACK> in SYN_SENT[*] state.
			 * Transitions:
			 *	SYN_SENT  --> ESTABLISHED
			 *	SYN_SENT* --> FIN_WAIT_1
			 */
			tp->t_starttime = ticks;
			if (tp->t_flags & TF_NEEDFIN) {
				tp->t_state = TCPS_FIN_WAIT_1;
				tp->t_flags &= ~TF_NEEDFIN;
				thflags &= ~TH_SYN;
			} else {
				tp->t_state = TCPS_ESTABLISHED;
				tcp_timer_activate(tp, TT_KEEP, tcp_keepidle);
			}
		} else {
			/*
			 * Received initial SYN in SYN-SENT[*] state =>
			 * simultaneous open.  If segment contains CC option
			 * and there is a cached CC, apply TAO test.
			 * If it succeeds, connection is half-synchronized.
			 * Otherwise, do 3-way handshake:
			 *        SYN-SENT -> SYN-RECEIVED
			 *        SYN-SENT* -> SYN-RECEIVED*
			 * If there was no CC option, clear cached CC value.
			 */
			tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN);
			tcp_timer_activate(tp, TT_REXMT, 0);
			tp->t_state = TCPS_SYN_RECEIVED;
		}

		KASSERT(ti_locked == TI_WLOCKED, ("%s: trimthenstep6: "
		    "ti_locked %d", __func__, ti_locked));
		INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
		INP_WLOCK_ASSERT(tp->t_inpcb);

		/*
		 * Advance th->th_seq to correspond to first data byte.
		 * If data, trim to stay within window,
		 * dropping FIN if necessary.
		 */
		th->th_seq++;
		if (tlen > tp->rcv_wnd) {
			todrop = tlen - tp->rcv_wnd;
			m_adj(m, -todrop);
			tlen = tp->rcv_wnd;
			thflags &= ~TH_FIN;
			V_tcpstat.tcps_rcvpackafterwin++;
			V_tcpstat.tcps_rcvbyteafterwin += todrop;
		}
		tp->snd_wl1 = th->th_seq - 1;
		tp->rcv_up = th->th_seq;
		/*
		 * Client side of transaction: already sent SYN and data.
		 * If the remote host used T/TCP to validate the SYN,
		 * our data will be ACK'd; if so, enter normal data segment
		 * processing in the middle of step 5, ack processing.
		 * Otherwise, goto step 6.
		 */
		if (thflags & TH_ACK)
			goto process_ACK;

		goto step6;

	/*
	 * If the state is LAST_ACK or CLOSING or TIME_WAIT:
	 *	do normal processing.
	 *
	 * NB: Leftover from RFC1644 T/TCP.  Cases to be reused later.
	 */
	case TCPS_LAST_ACK:
	case TCPS_CLOSING:
		break;  /* continue normal processing */
	}

	/*
	 * States other than LISTEN or SYN_SENT.
	 * First check the RST flag and sequence number since reset segments
	 * are exempt from the timestamp and connection count tests.  This
	 * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix
	 * below which allowed reset segments in half the sequence space
	 * to fall through and be processed (which gives forged reset
	 * segments with a random sequence number a 50 percent chance of
	 * killing a connection).
	 * Then check timestamp, if present.
	 * Then check the connection count, if present.
	 * Then check that at least some bytes of segment are within
	 * receive window.  If segment begins before rcv_nxt,
	 * drop leading data (and SYN); if nothing left, just ack.
	 *
	 *
	 * If the RST bit is set, check the sequence number to see
	 * if this is a valid reset segment.
	 * RFC 793 page 37:
	 *   In all states except SYN-SENT, all reset (RST) segments
	 *   are validated by checking their SEQ-fields.  A reset is
	 *   valid if its sequence number is in the window.
	 * Note: this does not take into account delayed ACKs, so
	 *   we should test against last_ack_sent instead of rcv_nxt.
	 *   The sequence number in the reset segment is normally an
	 *   echo of our outgoing acknowledgement numbers, but some hosts
	 *   send a reset with the sequence number at the rightmost edge
	 *   of our receive window, and we have to handle this case.
	 * Note 2: Paul Watson's paper "Slipping in the Window" has shown
	 *   that brute force RST attacks are possible.  To combat this,
	 *   we use a much stricter check while in the ESTABLISHED state,
	 *   only accepting RSTs where the sequence number is equal to
	 *   last_ack_sent.  In all other states (the states in which a
	 *   RST is more likely), the more permissive check is used.
	 * If we have multiple segments in flight, the initial reset
	 * segment sequence numbers will be to the left of last_ack_sent,
	 * but they will eventually catch up.
	 * In any case, it never made sense to trim reset segments to
	 * fit the receive window since RFC 1122 says:
	 *   4.2.2.12  RST Segment: RFC-793 Section 3.4
	 *
	 *    A TCP SHOULD allow a received RST segment to include data.
	 *
	 *    DISCUSSION
	 *         It has been suggested that a RST segment could contain
	 *         ASCII text that encoded and explained the cause of the
	 *         RST.  No standard has yet been established for such
	 *         data.
	 *
	 * If the reset segment passes the sequence number test examine
	 * the state:
	 *    SYN_RECEIVED STATE:
	 *	If passive open, return to LISTEN state.
	 *	If active open, inform user that connection was refused.
	 *    ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT STATES:
	 *	Inform user that connection was reset, and close tcb.
	 *    CLOSING, LAST_ACK STATES:
	 *	Close the tcb.
	 *    TIME_WAIT STATE:
	 *	Drop the segment - see Stevens, vol. 2, p. 964 and
	 *	RFC 1337.
	 */
	if (thflags & TH_RST) {
		if (SEQ_GEQ(th->th_seq, tp->last_ack_sent - 1) &&
		    SEQ_LEQ(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
			switch (tp->t_state) {

			case TCPS_SYN_RECEIVED:
				so->so_error = ECONNREFUSED;
				goto close;

			case TCPS_ESTABLISHED:
				if (V_tcp_insecure_rst == 0 &&
				    !(SEQ_GEQ(th->th_seq, tp->rcv_nxt - 1) &&
				    SEQ_LEQ(th->th_seq, tp->rcv_nxt + 1)) &&
				    !(SEQ_GEQ(th->th_seq, tp->last_ack_sent - 1) &&
				    SEQ_LEQ(th->th_seq, tp->last_ack_sent + 1))) {
					V_tcpstat.tcps_badrst++;
					goto drop;
				}
				/* FALLTHROUGH */
			case TCPS_FIN_WAIT_1:
			case TCPS_FIN_WAIT_2:
			case TCPS_CLOSE_WAIT:
				so->so_error = ECONNRESET;
			close:
				KASSERT(ti_locked == TI_WLOCKED,
				    ("tcp_do_segment: TH_RST 1 ti_locked %d",
				    ti_locked));
				INP_INFO_WLOCK_ASSERT(&V_tcbinfo);

				tp->t_state = TCPS_CLOSED;
				V_tcpstat.tcps_drops++;
				tp = tcp_close(tp);
				break;

			case TCPS_CLOSING:
			case TCPS_LAST_ACK:
				KASSERT(ti_locked == TI_WLOCKED,
				    ("tcp_do_segment: TH_RST 2 ti_locked %d",
				    ti_locked));
				INP_INFO_WLOCK_ASSERT(&V_tcbinfo);

				tp = tcp_close(tp);
				break;
			}
		}
		goto drop;
	}

	/*
	 * RFC 1323 PAWS: If we have a timestamp reply on this segment
	 * and it's less than ts_recent, drop it.
	 */
	if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent &&
	    TSTMP_LT(to.to_tsval, tp->ts_recent)) {

		/* Check to see if ts_recent is over 24 days old.  */
		if ((int)(ticks - tp->ts_recent_age) > TCP_PAWS_IDLE) {
			/*
			 * Invalidate ts_recent.  If this segment updates
			 * ts_recent, the age will be reset later and ts_recent
			 * will get a valid value.  If it does not, setting
			 * ts_recent to zero will at least satisfy the
			 * requirement that zero be placed in the timestamp
			 * echo reply when ts_recent isn't valid.  The
			 * age isn't reset until we get a valid ts_recent
			 * because we don't want out-of-order segments to be
			 * dropped when ts_recent is old.
			 */
			tp->ts_recent = 0;
		} else {
			V_tcpstat.tcps_rcvduppack++;
			V_tcpstat.tcps_rcvdupbyte += tlen;
			V_tcpstat.tcps_pawsdrop++;
			if (tlen)
				goto dropafterack;
			goto drop;
		}
	}

	/*
	 * In the SYN-RECEIVED state, validate that the packet belongs to
	 * this connection before trimming the data to fit the receive
	 * window.  Check the sequence number versus IRS since we know
	 * the sequence numbers haven't wrapped.  This is a partial fix
	 * for the "LAND" DoS attack.
	 */
	if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) {
		rstreason = BANDLIM_RST_OPENPORT;
		goto dropwithreset;
	}

	todrop = tp->rcv_nxt - th->th_seq;
	if (todrop > 0) {
		if (thflags & TH_SYN) {
			thflags &= ~TH_SYN;
			th->th_seq++;
			if (th->th_urp > 1)
				th->th_urp--;
			else
				thflags &= ~TH_URG;
			todrop--;
		}
		/*
		 * Following if statement from Stevens, vol. 2, p. 960.
		 */
		if (todrop > tlen
		    || (todrop == tlen && (thflags & TH_FIN) == 0)) {
			/*
			 * Any valid FIN must be to the left of the window.
			 * At this point the FIN must be a duplicate or out
			 * of sequence; drop it.
			 */
			thflags &= ~TH_FIN;

			/*
			 * Send an ACK to resynchronize and drop any data.
			 * But keep on processing for RST or ACK.
1829 */ 1830 tp->t_flags |= TF_ACKNOW; 1831 todrop = tlen; 1832 V_tcpstat.tcps_rcvduppack++; 1833 V_tcpstat.tcps_rcvdupbyte += todrop; 1834 } else { 1835 V_tcpstat.tcps_rcvpartduppack++; 1836 V_tcpstat.tcps_rcvpartdupbyte += todrop; 1837 } 1838 drop_hdrlen += todrop; /* drop from the top afterwards */ 1839 th->th_seq += todrop; 1840 tlen -= todrop; 1841 if (th->th_urp > todrop) 1842 th->th_urp -= todrop; 1843 else { 1844 thflags &= ~TH_URG; 1845 th->th_urp = 0; 1846 } 1847 } 1848 1849 /* 1850 * If new data are received on a connection after the 1851 * user processes are gone, then RST the other end. 1852 */ 1853 if ((so->so_state & SS_NOFDREF) && 1854 tp->t_state > TCPS_CLOSE_WAIT && tlen) { 1855 char *s; 1856 1857 KASSERT(ti_locked == TI_WLOCKED, ("%s: SS_NOFDEREF && " 1858 "CLOSE_WAIT && tlen ti_locked %d", __func__, ti_locked)); 1859 INP_INFO_WLOCK_ASSERT(&V_tcbinfo); 1860 1861 if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL, NULL))) { 1862 log(LOG_DEBUG, "%s; %s: %s: Received %d bytes of data after socket " 1863 "was closed, sending RST and removing tcpcb\n", 1864 s, __func__, tcpstates[tp->t_state], tlen); 1865 free(s, M_TCPLOG); 1866 } 1867 tp = tcp_close(tp); 1868 V_tcpstat.tcps_rcvafterclose++; 1869 rstreason = BANDLIM_UNLIMITED; 1870 goto dropwithreset; 1871 } 1872 1873 /* 1874 * If segment ends after window, drop trailing data 1875 * (and PUSH and FIN); if nothing left, just ACK. 1876 */ 1877 todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd); 1878 if (todrop > 0) { 1879 V_tcpstat.tcps_rcvpackafterwin++; 1880 if (todrop >= tlen) { 1881 V_tcpstat.tcps_rcvbyteafterwin += tlen; 1882 /* 1883 * If window is closed can only take segments at 1884 * window edge, and have to drop data and PUSH from 1885 * incoming segments. Continue processing, but 1886 * remember to ack. Otherwise, drop segment 1887 * and ack. 1888 */ 1889 if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { 1890 tp->t_flags |= TF_ACKNOW; 1891 V_tcpstat.tcps_rcvwinprobe++; 1892 } else 1893 goto dropafterack; 1894 } else 1895 V_tcpstat.tcps_rcvbyteafterwin += todrop; 1896 m_adj(m, -todrop); 1897 tlen -= todrop; 1898 thflags &= ~(TH_PUSH|TH_FIN); 1899 } 1900 1901 /* 1902 * If last ACK falls within this segment's sequence numbers, 1903 * record its timestamp. 1904 * NOTE: 1905 * 1) That the test incorporates suggestions from the latest 1906 * proposal of the tcplw@cray.com list (Braden 1993/04/26). 1907 * 2) That updating only on newer timestamps interferes with 1908 * our earlier PAWS tests, so this check should be solely 1909 * predicated on the sequence space of this segment. 1910 * 3) That we modify the segment boundary check to be 1911 * Last.ACK.Sent <= SEG.SEQ + SEG.Len 1912 * instead of RFC1323's 1913 * Last.ACK.Sent < SEG.SEQ + SEG.Len, 1914 * This modified check allows us to overcome RFC1323's 1915 * limitations as described in Stevens TCP/IP Illustrated 1916 * Vol. 2 p.869. In such cases, we can still calculate the 1917 * RTT correctly when RCV.NXT == Last.ACK.Sent. 1918 */ 1919 if ((to.to_flags & TOF_TS) != 0 && 1920 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 1921 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 1922 ((thflags & (TH_SYN|TH_FIN)) != 0))) { 1923 tp->ts_recent_age = ticks; 1924 tp->ts_recent = to.to_tsval; 1925 } 1926 1927 /* 1928 * If a SYN is in the window, then this is an 1929 * error and we send an RST and drop the connection. 
1930 */ 1931 if (thflags & TH_SYN) { 1932 KASSERT(ti_locked == TI_WLOCKED, 1933 ("tcp_do_segment: TH_SYN ti_locked %d", ti_locked)); 1934 INP_INFO_WLOCK_ASSERT(&V_tcbinfo); 1935 1936 tp = tcp_drop(tp, ECONNRESET); 1937 rstreason = BANDLIM_UNLIMITED; 1938 goto drop; 1939 } 1940 1941 /* 1942 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN 1943 * flag is on (half-synchronized state), then queue data for 1944 * later processing; else drop segment and return. 1945 */ 1946 if ((thflags & TH_ACK) == 0) { 1947 if (tp->t_state == TCPS_SYN_RECEIVED || 1948 (tp->t_flags & TF_NEEDSYN)) 1949 goto step6; 1950 else if (tp->t_flags & TF_ACKNOW) 1951 goto dropafterack; 1952 else 1953 goto drop; 1954 } 1955 1956 /* 1957 * Ack processing. 1958 */ 1959 switch (tp->t_state) { 1960 1961 /* 1962 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter 1963 * ESTABLISHED state and continue processing. 1964 * The ACK was checked above. 1965 */ 1966 case TCPS_SYN_RECEIVED: 1967 1968 V_tcpstat.tcps_connects++; 1969 soisconnected(so); 1970 /* Do window scaling? */ 1971 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 1972 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 1973 tp->rcv_scale = tp->request_r_scale; 1974 tp->snd_wnd = tiwin; 1975 } 1976 /* 1977 * Make transitions: 1978 * SYN-RECEIVED -> ESTABLISHED 1979 * SYN-RECEIVED* -> FIN-WAIT-1 1980 */ 1981 tp->t_starttime = ticks; 1982 if (tp->t_flags & TF_NEEDFIN) { 1983 tp->t_state = TCPS_FIN_WAIT_1; 1984 tp->t_flags &= ~TF_NEEDFIN; 1985 } else { 1986 tp->t_state = TCPS_ESTABLISHED; 1987 tcp_timer_activate(tp, TT_KEEP, tcp_keepidle); 1988 } 1989 /* 1990 * If segment contains data or ACK, will call tcp_reass() 1991 * later; if not, do so now to pass queued data to user. 1992 */ 1993 if (tlen == 0 && (thflags & TH_FIN) == 0) 1994 (void) tcp_reass(tp, (struct tcphdr *)0, 0, 1995 (struct mbuf *)0); 1996 tp->snd_wl1 = th->th_seq - 1; 1997 /* FALLTHROUGH */ 1998 1999 /* 2000 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range 2001 * ACKs. If the ack is in the range 2002 * tp->snd_una < th->th_ack <= tp->snd_max 2003 * then advance tp->snd_una to th->th_ack and drop 2004 * data from the retransmission queue. If this ACK reflects 2005 * more up to date window information we update our window information. 2006 */ 2007 case TCPS_ESTABLISHED: 2008 case TCPS_FIN_WAIT_1: 2009 case TCPS_FIN_WAIT_2: 2010 case TCPS_CLOSE_WAIT: 2011 case TCPS_CLOSING: 2012 case TCPS_LAST_ACK: 2013 if (SEQ_GT(th->th_ack, tp->snd_max)) { 2014 V_tcpstat.tcps_rcvacktoomuch++; 2015 goto dropafterack; 2016 } 2017 if ((tp->t_flags & TF_SACK_PERMIT) && 2018 ((to.to_flags & TOF_SACK) || 2019 !TAILQ_EMPTY(&tp->snd_holes))) 2020 tcp_sack_doack(tp, &to, th->th_ack); 2021 if (SEQ_LEQ(th->th_ack, tp->snd_una)) { 2022 if (tlen == 0 && tiwin == tp->snd_wnd) { 2023 V_tcpstat.tcps_rcvdupack++; 2024 /* 2025 * If we have outstanding data (other than 2026 * a window probe), this is a completely 2027 * duplicate ack (ie, window info didn't 2028 * change), the ack is the biggest we've 2029 * seen and we've seen exactly our rexmt 2030 * threshhold of them, assume a packet 2031 * has been dropped and retransmit it. 2032 * Kludge snd_nxt & the congestion 2033 * window so we send only this one 2034 * packet. 2035 * 2036 * We know we're losing at the current 2037 * window size so do congestion avoidance 2038 * (set ssthresh to half the current window 2039 * and pull our congestion window back to 2040 * the new ssthresh). 
2041 * 2042 * Dup acks mean that packets have left the 2043 * network (they're now cached at the receiver) 2044 * so bump cwnd by the amount in the receiver 2045 * to keep a constant cwnd packets in the 2046 * network. 2047 * 2048 * When using TCP ECN, notify the peer that 2049 * we reduced the cwnd. 2050 */ 2051 if (!tcp_timer_active(tp, TT_REXMT) || 2052 th->th_ack != tp->snd_una) 2053 tp->t_dupacks = 0; 2054 else if (++tp->t_dupacks > tcprexmtthresh || 2055 ((V_tcp_do_newreno || 2056 (tp->t_flags & TF_SACK_PERMIT)) && 2057 IN_FASTRECOVERY(tp))) { 2058 if ((tp->t_flags & TF_SACK_PERMIT) && 2059 IN_FASTRECOVERY(tp)) { 2060 int awnd; 2061 2062 /* 2063 * Compute the amount of data in flight first. 2064 * We can inject new data into the pipe iff 2065 * we have less than 1/2 the original window's 2066 * worth of data in flight. 2067 */ 2068 awnd = (tp->snd_nxt - tp->snd_fack) + 2069 tp->sackhint.sack_bytes_rexmit; 2070 if (awnd < tp->snd_ssthresh) { 2071 tp->snd_cwnd += tp->t_maxseg; 2072 if (tp->snd_cwnd > tp->snd_ssthresh) 2073 tp->snd_cwnd = tp->snd_ssthresh; 2074 } 2075 } else 2076 tp->snd_cwnd += tp->t_maxseg; 2077 (void) tcp_output(tp); 2078 goto drop; 2079 } else if (tp->t_dupacks == tcprexmtthresh) { 2080 tcp_seq onxt = tp->snd_nxt; 2081 2082 /* 2083 * If we're doing sack, check to 2084 * see if we're already in sack 2085 * recovery. If we're not doing sack, 2086 * check to see if we're in newreno 2087 * recovery. 2088 */ 2089 if (tp->t_flags & TF_SACK_PERMIT) { 2090 if (IN_FASTRECOVERY(tp)) { 2091 tp->t_dupacks = 0; 2092 break; 2093 } 2094 } else if (V_tcp_do_newreno || 2095 V_tcp_do_ecn) { 2096 if (SEQ_LEQ(th->th_ack, 2097 tp->snd_recover)) { 2098 tp->t_dupacks = 0; 2099 break; 2100 } 2101 } 2102 tcp_congestion_exp(tp); 2103 tcp_timer_activate(tp, TT_REXMT, 0); 2104 tp->t_rtttime = 0; 2105 if (tp->t_flags & TF_SACK_PERMIT) { 2106 V_tcpstat.tcps_sack_recovery_episode++; 2107 tp->sack_newdata = tp->snd_nxt; 2108 tp->snd_cwnd = tp->t_maxseg; 2109 (void) tcp_output(tp); 2110 goto drop; 2111 } 2112 tp->snd_nxt = th->th_ack; 2113 tp->snd_cwnd = tp->t_maxseg; 2114 (void) tcp_output(tp); 2115 KASSERT(tp->snd_limited <= 2, 2116 ("%s: tp->snd_limited too big", 2117 __func__)); 2118 tp->snd_cwnd = tp->snd_ssthresh + 2119 tp->t_maxseg * 2120 (tp->t_dupacks - tp->snd_limited); 2121 if (SEQ_GT(onxt, tp->snd_nxt)) 2122 tp->snd_nxt = onxt; 2123 goto drop; 2124 } else if (V_tcp_do_rfc3042) { 2125 u_long oldcwnd = tp->snd_cwnd; 2126 tcp_seq oldsndmax = tp->snd_max; 2127 u_int sent; 2128 2129 KASSERT(tp->t_dupacks == 1 || 2130 tp->t_dupacks == 2, 2131 ("%s: dupacks not 1 or 2", 2132 __func__)); 2133 if (tp->t_dupacks == 1) 2134 tp->snd_limited = 0; 2135 tp->snd_cwnd = 2136 (tp->snd_nxt - tp->snd_una) + 2137 (tp->t_dupacks - tp->snd_limited) * 2138 tp->t_maxseg; 2139 (void) tcp_output(tp); 2140 sent = tp->snd_max - oldsndmax; 2141 if (sent > tp->t_maxseg) { 2142 KASSERT((tp->t_dupacks == 2 && 2143 tp->snd_limited == 0) || 2144 (sent == tp->t_maxseg + 1 && 2145 tp->t_flags & TF_SENTFIN), 2146 ("%s: sent too much", 2147 __func__)); 2148 tp->snd_limited = 2; 2149 } else if (sent > 0) 2150 ++tp->snd_limited; 2151 tp->snd_cwnd = oldcwnd; 2152 goto drop; 2153 } 2154 } else 2155 tp->t_dupacks = 0; 2156 break; 2157 } 2158 2159 KASSERT(SEQ_GT(th->th_ack, tp->snd_una), 2160 ("%s: th_ack <= snd_una", __func__)); 2161 2162 /* 2163 * If the congestion window was inflated to account 2164 * for the other side's cached packets, retract it. 
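*
* Worked example (illustrative): if snd_ssthresh = 8760 but the
* ACK taking us out of recovery leaves only snd_max - th_ack =
* 2920 bytes outstanding, cwnd is set to 2920 + t_maxseg rather
* than to the full ssthresh, so leaving recovery cannot release
* a burst.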
2165 */ 2166 if (V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) { 2167 if (IN_FASTRECOVERY(tp)) { 2168 if (SEQ_LT(th->th_ack, tp->snd_recover)) { 2169 if (tp->t_flags & TF_SACK_PERMIT) 2170 tcp_sack_partialack(tp, th); 2171 else 2172 tcp_newreno_partial_ack(tp, th); 2173 } else { 2174 /* 2175 * Out of fast recovery. 2176 * Window inflation should have left us 2177 * with approximately snd_ssthresh 2178 * outstanding data. 2179 * But in case we would be inclined to 2180 * send a burst, better to do it via 2181 * the slow start mechanism. 2182 */ 2183 if (SEQ_GT(th->th_ack + 2184 tp->snd_ssthresh, 2185 tp->snd_max)) 2186 tp->snd_cwnd = tp->snd_max - 2187 th->th_ack + 2188 tp->t_maxseg; 2189 else 2190 tp->snd_cwnd = tp->snd_ssthresh; 2191 } 2192 } 2193 } else { 2194 if (tp->t_dupacks >= tcprexmtthresh && 2195 tp->snd_cwnd > tp->snd_ssthresh) 2196 tp->snd_cwnd = tp->snd_ssthresh; 2197 } 2198 tp->t_dupacks = 0; 2199 /* 2200 * If we reach this point, ACK is not a duplicate, 2201 * i.e., it ACKs something we sent. 2202 */ 2203 if (tp->t_flags & TF_NEEDSYN) { 2204 /* 2205 * T/TCP: Connection was half-synchronized, and our 2206 * SYN has been ACK'd (so connection is now fully 2207 * synchronized). Go to non-starred state, 2208 * increment snd_una for ACK of SYN, and check if 2209 * we can do window scaling. 2210 */ 2211 tp->t_flags &= ~TF_NEEDSYN; 2212 tp->snd_una++; 2213 /* Do window scaling? */ 2214 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 2215 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 2216 tp->rcv_scale = tp->request_r_scale; 2217 /* Send window already scaled. */ 2218 } 2219 } 2220 2221 process_ACK: 2222 INP_INFO_LOCK_ASSERT(&V_tcbinfo); 2223 KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED, 2224 ("tcp_input: process_ACK ti_locked %d", ti_locked)); 2225 INP_WLOCK_ASSERT(tp->t_inpcb); 2226 2227 acked = th->th_ack - tp->snd_una; 2228 V_tcpstat.tcps_rcvackpack++; 2229 V_tcpstat.tcps_rcvackbyte += acked; 2230 2231 /* 2232 * If we just performed our first retransmit, and the ACK 2233 * arrives within our recovery window, then it was a mistake 2234 * to do the retransmit in the first place. Recover our 2235 * original cwnd and ssthresh, and proceed to transmit where 2236 * we left off. 2237 */ 2238 if (tp->t_rxtshift == 1 && ticks < tp->t_badrxtwin) { 2239 ++V_tcpstat.tcps_sndrexmitbad; 2240 tp->snd_cwnd = tp->snd_cwnd_prev; 2241 tp->snd_ssthresh = tp->snd_ssthresh_prev; 2242 tp->snd_recover = tp->snd_recover_prev; 2243 if (tp->t_flags & TF_WASFRECOVERY) 2244 ENTER_FASTRECOVERY(tp); 2245 tp->snd_nxt = tp->snd_max; 2246 tp->t_badrxtwin = 0; /* XXX probably not required */ 2247 } 2248 2249 /* 2250 * If we have a timestamp reply, update smoothed 2251 * round trip time. If no timestamp is present but 2252 * transmit timer is running and timed sequence 2253 * number was acked, update smoothed round trip time. 2254 * Since we now have an rtt measurement, cancel the 2255 * timer backoff (cf., Phil Karn's retransmit alg.). 2256 * Recompute the initial retransmit timer. 2257 * 2258 * Some boxes send broken timestamp replies 2259 * during the SYN+ACK phase, ignore 2260 * timestamps of 0 or we could calculate a 2261 * huge RTT and blow up the retransmit timer. 
2262 */ 2263 if ((to.to_flags & TOF_TS) != 0 && 2264 to.to_tsecr) { 2265 if (!tp->t_rttlow || tp->t_rttlow > ticks - to.to_tsecr) 2266 tp->t_rttlow = ticks - to.to_tsecr; 2267 tcp_xmit_timer(tp, ticks - to.to_tsecr + 1); 2268 } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { 2269 if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime) 2270 tp->t_rttlow = ticks - tp->t_rtttime; 2271 tcp_xmit_timer(tp, ticks - tp->t_rtttime); 2272 } 2273 tcp_xmit_bandwidth_limit(tp, th->th_ack); 2274 2275 /* 2276 * If all outstanding data is acked, stop retransmit 2277 * timer and remember to restart (more output or persist). 2278 * If there is more data to be acked, restart retransmit 2279 * timer, using current (possibly backed-off) value. 2280 */ 2281 if (th->th_ack == tp->snd_max) { 2282 tcp_timer_activate(tp, TT_REXMT, 0); 2283 needoutput = 1; 2284 } else if (!tcp_timer_active(tp, TT_PERSIST)) 2285 tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); 2286 2287 /* 2288 * If no data (only SYN) was ACK'd, 2289 * skip rest of ACK processing. 2290 */ 2291 if (acked == 0) 2292 goto step6; 2293 2294 /* 2295 * When new data is acked, open the congestion window. 2296 * If the window gives us less than ssthresh packets 2297 * in flight, open exponentially (maxseg per packet). 2298 * Otherwise open linearly: maxseg per window 2299 * (maxseg^2 / cwnd per packet). 2300 * If cwnd > maxseg^2, fix the cwnd increment at 1 byte 2301 * to avoid capping cwnd (as suggested in RFC2581). 2302 */ 2303 if ((!V_tcp_do_newreno && !(tp->t_flags & TF_SACK_PERMIT)) || 2304 !IN_FASTRECOVERY(tp)) { 2305 u_int cw = tp->snd_cwnd; 2306 u_int incr = tp->t_maxseg; 2307 if (cw > tp->snd_ssthresh) 2308 incr = max((incr * incr / cw), 1); 2309 tp->snd_cwnd = min(cw+incr, TCP_MAXWIN<<tp->snd_scale); 2310 } 2311 SOCKBUF_LOCK(&so->so_snd); 2312 if (acked > so->so_snd.sb_cc) { 2313 tp->snd_wnd -= so->so_snd.sb_cc; 2314 sbdrop_locked(&so->so_snd, (int)so->so_snd.sb_cc); 2315 ourfinisacked = 1; 2316 } else { 2317 sbdrop_locked(&so->so_snd, acked); 2318 tp->snd_wnd -= acked; 2319 ourfinisacked = 0; 2320 } 2321 /* NB: sowwakeup_locked() does an implicit unlock. */ 2322 sowwakeup_locked(so); 2323 /* Detect una wraparound. */ 2324 if ((V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) && 2325 !IN_FASTRECOVERY(tp) && 2326 SEQ_GT(tp->snd_una, tp->snd_recover) && 2327 SEQ_LEQ(th->th_ack, tp->snd_recover)) 2328 tp->snd_recover = th->th_ack - 1; 2329 if ((V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) && 2330 IN_FASTRECOVERY(tp) && 2331 SEQ_GEQ(th->th_ack, tp->snd_recover)) 2332 EXIT_FASTRECOVERY(tp); 2333 tp->snd_una = th->th_ack; 2334 if (tp->t_flags & TF_SACK_PERMIT) { 2335 if (SEQ_GT(tp->snd_una, tp->snd_recover)) 2336 tp->snd_recover = tp->snd_una; 2337 } 2338 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) 2339 tp->snd_nxt = tp->snd_una; 2340 2341 switch (tp->t_state) { 2342 2343 /* 2344 * In FIN_WAIT_1 STATE in addition to the processing 2345 * for the ESTABLISHED state if our FIN is now acknowledged 2346 * then enter FIN_WAIT_2. 2347 */ 2348 case TCPS_FIN_WAIT_1: 2349 if (ourfinisacked) { 2350 /* 2351 * If we can't receive any more 2352 * data, then closing user can proceed. 2353 * Starting the timer is contrary to the 2354 * specification, but if we don't get a FIN 2355 * we'll hang forever. 2356 * 2357 * XXXjl: 2358 * we should release the tp also, and use a 2359 * compressed state. 2360 */ 2361 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 2362 int timeout; 2363 2364 soisdisconnected(so); 2365 timeout = (tcp_fast_finwait2_recycle) ? 
2366 tcp_finwait2_timeout : tcp_maxidle;
2367 tcp_timer_activate(tp, TT_2MSL, timeout);
2368 }
2369 tp->t_state = TCPS_FIN_WAIT_2;
2370 }
2371 break;
2372
2373 /*
2374 * In CLOSING STATE in addition to the processing for
2375 * the ESTABLISHED state if the ACK acknowledges our FIN
2376 * then enter the TIME-WAIT state, otherwise ignore
2377 * the segment.
2378 */
2379 case TCPS_CLOSING:
2380 if (ourfinisacked) {
2381 INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
2382 tcp_twstart(tp);
2383 INP_INFO_WUNLOCK(&V_tcbinfo);
2384 m_freem(m);
2385 return;
2386 }
2387 break;
2388
2389 /*
2390 * In LAST_ACK, we may still be waiting for data to drain
2391 * and/or to be acked, as well as for the ack of our FIN.
2392 * If our FIN is now acknowledged, delete the TCB,
2393 * enter the closed state and return.
2394 */
2395 case TCPS_LAST_ACK:
2396 if (ourfinisacked) {
2397 INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
2398 tp = tcp_close(tp);
2399 goto drop;
2400 }
2401 break;
2402 }
2403 }
2404
2405 step6:
2406 INP_INFO_LOCK_ASSERT(&V_tcbinfo);
2407 KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED,
2408 ("tcp_do_segment: step6 ti_locked %d", ti_locked));
2409 INP_WLOCK_ASSERT(tp->t_inpcb);
2410
2411 /*
2412 * Update window information.
2413 * Don't look at window if no ACK: TACs send garbage on first SYN.
2414 */
2415 if ((thflags & TH_ACK) &&
2416 (SEQ_LT(tp->snd_wl1, th->th_seq) ||
2417 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
2418 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
2419 /* keep track of pure window updates */
2420 if (tlen == 0 &&
2421 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
2422 V_tcpstat.tcps_rcvwinupd++;
2423 tp->snd_wnd = tiwin;
2424 tp->snd_wl1 = th->th_seq;
2425 tp->snd_wl2 = th->th_ack;
2426 if (tp->snd_wnd > tp->max_sndwnd)
2427 tp->max_sndwnd = tp->snd_wnd;
2428 needoutput = 1;
2429 }
2430
2431 /*
2432 * Process segments with URG.
2433 */
2434 if ((thflags & TH_URG) && th->th_urp &&
2435 TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2436 /*
2437 * This is a kludge, but if we receive and accept
2438 * random urgent pointers, we'll crash in
2439 * soreceive. It's hard to imagine someone
2440 * actually wanting to send this much urgent data.
2441 */
2442 SOCKBUF_LOCK(&so->so_rcv);
2443 if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
2444 th->th_urp = 0; /* XXX */
2445 thflags &= ~TH_URG; /* XXX */
2446 SOCKBUF_UNLOCK(&so->so_rcv); /* XXX */
2447 goto dodata; /* XXX */
2448 }
2449 /*
2450 * If this segment advances the known urgent pointer,
2451 * then mark the data stream. This should not happen
2452 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
2453 * a FIN has been received from the remote side.
2454 * In these states we ignore the URG.
2455 *
2456 * According to RFC961 (Assigned Protocols),
2457 * the urgent pointer points to the last octet
2458 * of urgent data. We continue, however,
2459 * to consider it to indicate the first octet
2460 * of data past the urgent section as the original
2461 * spec states (in one of two places).
2462 */
2463 if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
2464 tp->rcv_up = th->th_seq + th->th_urp;
2465 so->so_oobmark = so->so_rcv.sb_cc +
2466 (tp->rcv_up - tp->rcv_nxt) - 1;
2467 if (so->so_oobmark == 0)
2468 so->so_rcv.sb_state |= SBS_RCVATMARK;
2469 sohasoutofband(so);
2470 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
2471 }
2472 SOCKBUF_UNLOCK(&so->so_rcv);
2473 /*
2474 * Remove out of band data so it doesn't get presented to user.
* This can happen independent of advancing the URG pointer,
2476 * but if two URG's are pending at once, some out-of-band
2477 * data may creep in... ick.
2478 */
2479 if (th->th_urp <= (u_long)tlen &&
2480 !(so->so_options & SO_OOBINLINE)) {
2481 /* hdr drop is delayed */
2482 tcp_pulloutofband(so, th, m, drop_hdrlen);
2483 }
2484 } else {
2485 /*
2486 * If no out of band data is expected,
2487 * pull receive urgent pointer along
2488 * with the receive window.
2489 */
2490 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
2491 tp->rcv_up = tp->rcv_nxt;
2492 }
2493 dodata: /* XXX */
2494 INP_INFO_LOCK_ASSERT(&V_tcbinfo);
2495 KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED,
2496 ("tcp_do_segment: dodata ti_locked %d", ti_locked));
2497 INP_WLOCK_ASSERT(tp->t_inpcb);
2498
2499 /*
2500 * Process the segment text, merging it into the TCP sequencing queue,
2501 * and arranging for acknowledgment of receipt if necessary.
2502 * This process logically involves adjusting tp->rcv_wnd as data
2503 * is presented to the user (this happens in tcp_usrreq.c,
2504 * case PRU_RCVD). If a FIN has already been received on this
2505 * connection then we just ignore the text.
2506 */
2507 if ((tlen || (thflags & TH_FIN)) &&
2508 TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2509 tcp_seq save_start = th->th_seq;
2510 m_adj(m, drop_hdrlen); /* delayed header drop */
2511 /*
2512 * Insert segment which includes th into TCP reassembly queue
2513 * with control block tp. Set thflags to whether reassembly now
2514 * includes a segment with FIN. This handles the common case
2515 * inline (segment is the next to be received on an established
2516 * connection, and the queue is empty), avoiding linkage into
2517 * and removal from the queue and repetition of various
2518 * conversions.
2519 * Set DELACK for segments received in order, but ack
2520 * immediately when segments are out of order (so
2521 * fast retransmit can work).
2522 */
2523 if (th->th_seq == tp->rcv_nxt &&
2524 LIST_EMPTY(&tp->t_segq) &&
2525 TCPS_HAVEESTABLISHED(tp->t_state)) {
2526 if (DELAY_ACK(tp))
2527 tp->t_flags |= TF_DELACK;
2528 else
2529 tp->t_flags |= TF_ACKNOW;
2530 tp->rcv_nxt += tlen;
2531 thflags = th->th_flags & TH_FIN;
2532 V_tcpstat.tcps_rcvpack++;
2533 V_tcpstat.tcps_rcvbyte += tlen;
2534 ND6_HINT(tp);
2535 SOCKBUF_LOCK(&so->so_rcv);
2536 if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
2537 m_freem(m);
2538 else
2539 sbappendstream_locked(&so->so_rcv, m);
2540 /* NB: sorwakeup_locked() does an implicit unlock. */
2541 sorwakeup_locked(so);
2542 } else {
2543 /*
2544 * XXX: Due to the header drop above, "th" is
2545 * theoretically invalid by now. Fortunately
2546 * m_adj() doesn't actually free any mbufs
2547 * when trimming from the head.
2548 */
2549 thflags = tcp_reass(tp, th, &tlen, m);
2550 tp->t_flags |= TF_ACKNOW;
2551 }
2552 if (tlen > 0 && (tp->t_flags & TF_SACK_PERMIT))
2553 tcp_update_sack_list(tp, save_start, save_start + tlen);
2554 #if 0
2555 /*
2556 * Note the amount of data that peer has sent into
2557 * our window, in order to estimate the sender's
2558 * buffer size.
2559 * XXX: Unused.
2560 */
2561 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
2562 #endif
2563 } else {
2564 m_freem(m);
2565 thflags &= ~TH_FIN;
2566 }
2567
2568 /*
2569 * If FIN is received ACK the FIN and let the user know
2570 * that the connection is closing.
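*
* Note (illustrative): the FIN itself occupies one unit of
* sequence space, which is why the code below advances rcv_nxt
* by exactly one after arranging to ACK it; a retransmitted FIN
* from the peer is then matched against that advanced value.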
2571 */ 2572 if (thflags & TH_FIN) { 2573 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2574 socantrcvmore(so); 2575 /* 2576 * If connection is half-synchronized 2577 * (ie NEEDSYN flag on) then delay ACK, 2578 * so it may be piggybacked when SYN is sent. 2579 * Otherwise, since we received a FIN then no 2580 * more input can be expected, send ACK now. 2581 */ 2582 if (tp->t_flags & TF_NEEDSYN) 2583 tp->t_flags |= TF_DELACK; 2584 else 2585 tp->t_flags |= TF_ACKNOW; 2586 tp->rcv_nxt++; 2587 } 2588 switch (tp->t_state) { 2589 2590 /* 2591 * In SYN_RECEIVED and ESTABLISHED STATES 2592 * enter the CLOSE_WAIT state. 2593 */ 2594 case TCPS_SYN_RECEIVED: 2595 tp->t_starttime = ticks; 2596 /* FALLTHROUGH */ 2597 case TCPS_ESTABLISHED: 2598 tp->t_state = TCPS_CLOSE_WAIT; 2599 break; 2600 2601 /* 2602 * If still in FIN_WAIT_1 STATE FIN has not been acked so 2603 * enter the CLOSING state. 2604 */ 2605 case TCPS_FIN_WAIT_1: 2606 tp->t_state = TCPS_CLOSING; 2607 break; 2608 2609 /* 2610 * In FIN_WAIT_2 state enter the TIME_WAIT state, 2611 * starting the time-wait timer, turning off the other 2612 * standard timers. 2613 */ 2614 case TCPS_FIN_WAIT_2: 2615 INP_INFO_WLOCK_ASSERT(&V_tcbinfo); 2616 KASSERT(ti_locked == TI_WLOCKED, ("%s: dodata " 2617 "TCP_FIN_WAIT_2 ti_locked: %d", __func__, 2618 ti_locked)); 2619 2620 tcp_twstart(tp); 2621 INP_INFO_WUNLOCK(&V_tcbinfo); 2622 return; 2623 } 2624 } 2625 if (ti_locked == TI_RLOCKED) 2626 INP_INFO_RUNLOCK(&V_tcbinfo); 2627 else if (ti_locked == TI_WLOCKED) 2628 INP_INFO_WUNLOCK(&V_tcbinfo); 2629 else 2630 panic("%s: dodata epilogue ti_locked %d", __func__, 2631 ti_locked); 2632 ti_locked = TI_UNLOCKED; 2633 2634 #ifdef TCPDEBUG 2635 if (so->so_options & SO_DEBUG) 2636 tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, 2637 &tcp_savetcp, 0); 2638 #endif 2639 2640 /* 2641 * Return any desired output. 2642 */ 2643 if (needoutput || (tp->t_flags & TF_ACKNOW)) 2644 (void) tcp_output(tp); 2645 2646 check_delack: 2647 KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d", 2648 __func__, ti_locked)); 2649 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); 2650 INP_WLOCK_ASSERT(tp->t_inpcb); 2651 2652 if (tp->t_flags & TF_DELACK) { 2653 tp->t_flags &= ~TF_DELACK; 2654 tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); 2655 } 2656 INP_WUNLOCK(tp->t_inpcb); 2657 return; 2658 2659 dropafterack: 2660 KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED, 2661 ("tcp_do_segment: dropafterack ti_locked %d", ti_locked)); 2662 2663 /* 2664 * Generate an ACK dropping incoming segment if it occupies 2665 * sequence space, where the ACK reflects our state. 2666 * 2667 * We can now skip the test for the RST flag since all 2668 * paths to this code happen after packets containing 2669 * RST have been dropped. 2670 * 2671 * In the SYN-RECEIVED state, don't send an ACK unless the 2672 * segment we received passes the SYN-RECEIVED ACK test. 2673 * If it fails send a RST. This breaks the loop in the 2674 * "LAND" DoS attack, and also prevents an ACK storm 2675 * between two listening ports that have been sent forged 2676 * SYN segments, each with the source address of the other. 
2677 */ 2678 if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && 2679 (SEQ_GT(tp->snd_una, th->th_ack) || 2680 SEQ_GT(th->th_ack, tp->snd_max)) ) { 2681 rstreason = BANDLIM_RST_OPENPORT; 2682 goto dropwithreset; 2683 } 2684 #ifdef TCPDEBUG 2685 if (so->so_options & SO_DEBUG) 2686 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, 2687 &tcp_savetcp, 0); 2688 #endif 2689 if (ti_locked == TI_RLOCKED) 2690 INP_INFO_RUNLOCK(&V_tcbinfo); 2691 else if (ti_locked == TI_WLOCKED) 2692 INP_INFO_WUNLOCK(&V_tcbinfo); 2693 else 2694 panic("%s: dropafterack epilogue ti_locked %d", __func__, 2695 ti_locked); 2696 ti_locked = TI_UNLOCKED; 2697 2698 tp->t_flags |= TF_ACKNOW; 2699 (void) tcp_output(tp); 2700 INP_WUNLOCK(tp->t_inpcb); 2701 m_freem(m); 2702 return; 2703 2704 dropwithreset: 2705 if (ti_locked == TI_RLOCKED) 2706 INP_INFO_RUNLOCK(&V_tcbinfo); 2707 else if (ti_locked == TI_WLOCKED) 2708 INP_INFO_WUNLOCK(&V_tcbinfo); 2709 else 2710 panic("%s: dropwithreset ti_locked %d", __func__, ti_locked); 2711 ti_locked = TI_UNLOCKED; 2712 2713 if (tp != NULL) { 2714 tcp_dropwithreset(m, th, tp, tlen, rstreason); 2715 INP_WUNLOCK(tp->t_inpcb); 2716 } else 2717 tcp_dropwithreset(m, th, NULL, tlen, rstreason); 2718 return; 2719 2720 drop: 2721 if (ti_locked == TI_RLOCKED) 2722 INP_INFO_RUNLOCK(&V_tcbinfo); 2723 else if (ti_locked == TI_WLOCKED) 2724 INP_INFO_WUNLOCK(&V_tcbinfo); 2725 #ifdef INVARIANTS 2726 else 2727 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); 2728 #endif 2729 ti_locked = TI_UNLOCKED; 2730 2731 /* 2732 * Drop space held by incoming segment and return. 2733 */ 2734 #ifdef TCPDEBUG 2735 if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 2736 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, 2737 &tcp_savetcp, 0); 2738 #endif 2739 if (tp != NULL) 2740 INP_WUNLOCK(tp->t_inpcb); 2741 m_freem(m); 2742 } 2743 2744 /* 2745 * Issue RST and make ACK acceptable to originator of segment. 2746 * The mbuf must still include the original packet header. 2747 * tp may be NULL. 2748 */ 2749 static void 2750 tcp_dropwithreset(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, 2751 int tlen, int rstreason) 2752 { 2753 struct ip *ip; 2754 #ifdef INET6 2755 struct ip6_hdr *ip6; 2756 #endif 2757 2758 if (tp != NULL) { 2759 INP_WLOCK_ASSERT(tp->t_inpcb); 2760 } 2761 2762 /* Don't bother if destination was broadcast/multicast. */ 2763 if ((th->th_flags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST)) 2764 goto drop; 2765 #ifdef INET6 2766 if (mtod(m, struct ip *)->ip_v == 6) { 2767 ip6 = mtod(m, struct ip6_hdr *); 2768 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || 2769 IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) 2770 goto drop; 2771 /* IPv6 anycast check is done at tcp6_input() */ 2772 } else 2773 #endif 2774 { 2775 ip = mtod(m, struct ip *); 2776 if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || 2777 IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || 2778 ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || 2779 in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) 2780 goto drop; 2781 } 2782 2783 /* Perform bandwidth limiting. */ 2784 if (badport_bandlim(rstreason) < 0) 2785 goto drop; 2786 2787 /* tcp_respond consumes the mbuf chain. */ 2788 if (th->th_flags & TH_ACK) { 2789 tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, 2790 th->th_ack, TH_RST); 2791 } else { 2792 if (th->th_flags & TH_SYN) 2793 tlen++; 2794 tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen, 2795 (tcp_seq)0, TH_RST|TH_ACK); 2796 } 2797 return; 2798 drop: 2799 m_freem(m); 2800 } 2801 2802 /* 2803 * Parse TCP options and place in tcpopt. 
2804 */ 2805 static void 2806 tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, int flags) 2807 { 2808 INIT_VNET_INET(curvnet); 2809 int opt, optlen; 2810 2811 to->to_flags = 0; 2812 for (; cnt > 0; cnt -= optlen, cp += optlen) { 2813 opt = cp[0]; 2814 if (opt == TCPOPT_EOL) 2815 break; 2816 if (opt == TCPOPT_NOP) 2817 optlen = 1; 2818 else { 2819 if (cnt < 2) 2820 break; 2821 optlen = cp[1]; 2822 if (optlen < 2 || optlen > cnt) 2823 break; 2824 } 2825 switch (opt) { 2826 case TCPOPT_MAXSEG: 2827 if (optlen != TCPOLEN_MAXSEG) 2828 continue; 2829 if (!(flags & TO_SYN)) 2830 continue; 2831 to->to_flags |= TOF_MSS; 2832 bcopy((char *)cp + 2, 2833 (char *)&to->to_mss, sizeof(to->to_mss)); 2834 to->to_mss = ntohs(to->to_mss); 2835 break; 2836 case TCPOPT_WINDOW: 2837 if (optlen != TCPOLEN_WINDOW) 2838 continue; 2839 if (!(flags & TO_SYN)) 2840 continue; 2841 to->to_flags |= TOF_SCALE; 2842 to->to_wscale = min(cp[2], TCP_MAX_WINSHIFT); 2843 break; 2844 case TCPOPT_TIMESTAMP: 2845 if (optlen != TCPOLEN_TIMESTAMP) 2846 continue; 2847 to->to_flags |= TOF_TS; 2848 bcopy((char *)cp + 2, 2849 (char *)&to->to_tsval, sizeof(to->to_tsval)); 2850 to->to_tsval = ntohl(to->to_tsval); 2851 bcopy((char *)cp + 6, 2852 (char *)&to->to_tsecr, sizeof(to->to_tsecr)); 2853 to->to_tsecr = ntohl(to->to_tsecr); 2854 break; 2855 #ifdef TCP_SIGNATURE 2856 /* 2857 * XXX In order to reply to a host which has set the 2858 * TCP_SIGNATURE option in its initial SYN, we have to 2859 * record the fact that the option was observed here 2860 * for the syncache code to perform the correct response. 2861 */ 2862 case TCPOPT_SIGNATURE: 2863 if (optlen != TCPOLEN_SIGNATURE) 2864 continue; 2865 to->to_flags |= TOF_SIGNATURE; 2866 to->to_signature = cp + 2; 2867 break; 2868 #endif 2869 case TCPOPT_SACK_PERMITTED: 2870 if (optlen != TCPOLEN_SACK_PERMITTED) 2871 continue; 2872 if (!(flags & TO_SYN)) 2873 continue; 2874 if (!V_tcp_do_sack) 2875 continue; 2876 to->to_flags |= TOF_SACKPERM; 2877 break; 2878 case TCPOPT_SACK: 2879 if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0) 2880 continue; 2881 if (flags & TO_SYN) 2882 continue; 2883 to->to_flags |= TOF_SACK; 2884 to->to_nsacks = (optlen - 2) / TCPOLEN_SACK; 2885 to->to_sacks = cp + 2; 2886 V_tcpstat.tcps_sack_rcv_blocks++; 2887 break; 2888 default: 2889 continue; 2890 } 2891 } 2892 } 2893 2894 /* 2895 * Pull out of band byte out of a segment so 2896 * it doesn't appear in the user's data queue. 2897 * It is still reflected in the segment length for 2898 * sequencing purposes. 2899 */ 2900 static void 2901 tcp_pulloutofband(struct socket *so, struct tcphdr *th, struct mbuf *m, 2902 int off) 2903 { 2904 int cnt = off + th->th_urp - 1; 2905 2906 while (cnt >= 0) { 2907 if (m->m_len > cnt) { 2908 char *cp = mtod(m, caddr_t) + cnt; 2909 struct tcpcb *tp = sototcpcb(so); 2910 2911 INP_WLOCK_ASSERT(tp->t_inpcb); 2912 2913 tp->t_iobc = *cp; 2914 tp->t_oobflags |= TCPOOB_HAVEDATA; 2915 bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1)); 2916 m->m_len--; 2917 if (m->m_flags & M_PKTHDR) 2918 m->m_pkthdr.len--; 2919 return; 2920 } 2921 cnt -= m->m_len; 2922 m = m->m_next; 2923 if (m == NULL) 2924 break; 2925 } 2926 panic("tcp_pulloutofband"); 2927 } 2928 2929 /* 2930 * Collect new round-trip time estimate 2931 * and update averages and current timeout. 
2932 */ 2933 static void 2934 tcp_xmit_timer(struct tcpcb *tp, int rtt) 2935 { 2936 INIT_VNET_INET(tp->t_inpcb->inp_vnet); 2937 int delta; 2938 2939 INP_WLOCK_ASSERT(tp->t_inpcb); 2940 2941 V_tcpstat.tcps_rttupdated++; 2942 tp->t_rttupdated++; 2943 if (tp->t_srtt != 0) { 2944 /* 2945 * srtt is stored as fixed point with 5 bits after the 2946 * binary point (i.e., scaled by 8). The following magic 2947 * is equivalent to the smoothing algorithm in rfc793 with 2948 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed 2949 * point). Adjust rtt to origin 0. 2950 */ 2951 delta = ((rtt - 1) << TCP_DELTA_SHIFT) 2952 - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); 2953 2954 if ((tp->t_srtt += delta) <= 0) 2955 tp->t_srtt = 1; 2956 2957 /* 2958 * We accumulate a smoothed rtt variance (actually, a 2959 * smoothed mean difference), then set the retransmit 2960 * timer to smoothed rtt + 4 times the smoothed variance. 2961 * rttvar is stored as fixed point with 4 bits after the 2962 * binary point (scaled by 16). The following is 2963 * equivalent to rfc793 smoothing with an alpha of .75 2964 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces 2965 * rfc793's wired-in beta. 2966 */ 2967 if (delta < 0) 2968 delta = -delta; 2969 delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT); 2970 if ((tp->t_rttvar += delta) <= 0) 2971 tp->t_rttvar = 1; 2972 if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar) 2973 tp->t_rttbest = tp->t_srtt + tp->t_rttvar; 2974 } else { 2975 /* 2976 * No rtt measurement yet - use the unsmoothed rtt. 2977 * Set the variance to half the rtt (so our first 2978 * retransmit happens at 3*rtt). 2979 */ 2980 tp->t_srtt = rtt << TCP_RTT_SHIFT; 2981 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1); 2982 tp->t_rttbest = tp->t_srtt + tp->t_rttvar; 2983 } 2984 tp->t_rtttime = 0; 2985 tp->t_rxtshift = 0; 2986 2987 /* 2988 * the retransmit should happen at rtt + 4 * rttvar. 2989 * Because of the way we do the smoothing, srtt and rttvar 2990 * will each average +1/2 tick of bias. When we compute 2991 * the retransmit timer, we want 1/2 tick of rounding and 2992 * 1 extra tick because of +-1/2 tick uncertainty in the 2993 * firing of the timer. The bias will give us exactly the 2994 * 1.5 tick we need. But, because the bias is 2995 * statistical, we have to test that we don't drop below 2996 * the minimum feasible timer (which is 2 ticks). 2997 */ 2998 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), 2999 max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX); 3000 3001 /* 3002 * We received an ack for a packet that wasn't retransmitted; 3003 * it is probably safe to discard any error indications we've 3004 * received recently. This isn't quite right, but close enough 3005 * for now (a route might have failed after we sent a segment, 3006 * and the return path might not be symmetrical). 3007 */ 3008 tp->t_softerror = 0; 3009 } 3010 3011 /* 3012 * Determine a reasonable value for maxseg size. 3013 * If the route is known, check route for mtu. 3014 * If none, use an mss that can be handled on the outgoing 3015 * interface without forcing IP to fragment; if bigger than 3016 * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES 3017 * to utilize large mbufs. If no route is found, route has no mtu, 3018 * or the destination isn't local, use a default, hopefully conservative 3019 * size (usually 512 or the default IP max size, but no more than the mtu 3020 * of the interface), as we can't discover anything about intervening 3021 * gateways or networks. 
We also initialize the congestion/slow start
3022 * window to be a single segment if the destination isn't local.
3023 * While looking at the routing entry, we also initialize other path-dependent
3024 * parameters from pre-set or cached values in the routing entry.
3025 *
3026 * Also take into account the space needed for options that we
3027 * send regularly. Make maxseg shorter by that amount to assure
3028 * that we can send maxseg amount of data even when the options
3029 * are present. Store the upper limit of the length of options plus
3030 * data in maxopd.
3031 *
3032 * In case of T/TCP, we call this routine during implicit connection
3033 * setup as well (offer = -1), to initialize maxseg from the cached
3034 * MSS of our peer.
3035 *
3036 * NOTE that this routine is only called when we process an incoming
3037 * segment. Outgoing SYN/ACK MSS settings are handled in tcp_mssopt().
3038 */
3039 void
3040 tcp_mss_update(struct tcpcb *tp, int offer,
3041 struct hc_metrics_lite *metricptr, int *mtuflags)
3042 {
3043 INIT_VNET_INET(tp->t_inpcb->inp_vnet);
3044 int mss;
3045 u_long maxmtu;
3046 struct inpcb *inp = tp->t_inpcb;
3047 struct hc_metrics_lite metrics;
3048 int origoffer = offer;
3049 #ifdef INET6
3050 int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
3051 size_t min_protoh = isipv6 ?
3052 sizeof (struct ip6_hdr) + sizeof (struct tcphdr) :
3053 sizeof (struct tcpiphdr);
3054 #else
3055 const size_t min_protoh = sizeof(struct tcpiphdr);
3056 #endif
3057
3058 INP_WLOCK_ASSERT(tp->t_inpcb);
3059
3060 /* Initialize. */
3061 #ifdef INET6
3062 if (isipv6) {
3063 maxmtu = tcp_maxmtu6(&inp->inp_inc, mtuflags);
3064 tp->t_maxopd = tp->t_maxseg = V_tcp_v6mssdflt;
3065 } else
3066 #endif
3067 {
3068 maxmtu = tcp_maxmtu(&inp->inp_inc, mtuflags);
3069 tp->t_maxopd = tp->t_maxseg = V_tcp_mssdflt;
3070 }
3071
3072 /*
3073 * No route to sender, stay with default mss and return.
3074 */
3075 if (maxmtu == 0) {
3076 /*
3077 * In case we return early we need to initialize metrics
3078 * to a defined state as tcp_hc_get() would do for us
3079 * if there was no cache hit.
3080 */
3081 if (metricptr != NULL)
3082 bzero(metricptr, sizeof(struct hc_metrics_lite));
3083 return;
3084 }
3085
3086 /* What have we got? */
3087 switch (offer) {
3088 case 0:
3089 /*
3090 * Offer == 0 means that there was no MSS on the SYN
3091 * segment; in this case we use tcp_mssdflt as
3092 * already assigned to t_maxopd above.
3093 */
3094 offer = tp->t_maxopd;
3095 break;
3096
3097 case -1:
3098 /*
3099 * Offer == -1 means that we didn't receive SYN yet.
3100 */
3101 /* FALLTHROUGH */
3102
3103 default:
3104 /*
3105 * Prevent DoS attack with too small MSS. Round up
3106 * to at least minmss.
3107 */
3108 offer = max(offer, V_tcp_minmss);
3109 }
3110
3111 /*
3112 * rmx information is now retrieved from tcp_hostcache.
3113 */
3114 tcp_hc_get(&inp->inp_inc, &metrics);
3115 if (metricptr != NULL)
3116 bcopy(&metrics, metricptr, sizeof(struct hc_metrics_lite));
3117
3118 /*
3119 * If there's a discovered mtu in the tcp hostcache, use it;
3120 * else, use the link mtu.
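*
* Worked example (illustrative): for IPv4 with no hostcache mtu
* and maxmtu = 1500, min_protoh is 40 bytes (sizeof(struct
* tcpiphdr)), so mss starts out as 1460 before being clamped by
* the peer's offer below.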
3121 */ 3122 if (metrics.rmx_mtu) 3123 mss = min(metrics.rmx_mtu, maxmtu) - min_protoh; 3124 else { 3125 #ifdef INET6 3126 if (isipv6) { 3127 mss = maxmtu - min_protoh; 3128 if (!V_path_mtu_discovery && 3129 !in6_localaddr(&inp->in6p_faddr)) 3130 mss = min(mss, V_tcp_v6mssdflt); 3131 } else 3132 #endif 3133 { 3134 mss = maxmtu - min_protoh; 3135 if (!V_path_mtu_discovery && 3136 !in_localaddr(inp->inp_faddr)) 3137 mss = min(mss, V_tcp_mssdflt); 3138 } 3139 /* 3140 * XXX - The above conditional (mss = maxmtu - min_protoh) 3141 * probably violates the TCP spec. 3142 * The problem is that, since we don't know the 3143 * other end's MSS, we are supposed to use a conservative 3144 * default. But, if we do that, then MTU discovery will 3145 * never actually take place, because the conservative 3146 * default is much less than the MTUs typically seen 3147 * on the Internet today. For the moment, we'll sweep 3148 * this under the carpet. 3149 * 3150 * The conservative default might not actually be a problem 3151 * if the only case this occurs is when sending an initial 3152 * SYN with options and data to a host we've never talked 3153 * to before. Then, they will reply with an MSS value which 3154 * will get recorded and the new parameters should get 3155 * recomputed. For Further Study. 3156 */ 3157 } 3158 mss = min(mss, offer); 3159 3160 /* 3161 * Sanity check: make sure that maxopd will be large 3162 * enough to allow some data on segments even if the 3163 * all the option space is used (40bytes). Otherwise 3164 * funny things may happen in tcp_output. 3165 */ 3166 mss = max(mss, 64); 3167 3168 /* 3169 * maxopd stores the maximum length of data AND options 3170 * in a segment; maxseg is the amount of data in a normal 3171 * segment. We need to store this value (maxopd) apart 3172 * from maxseg, because now every segment carries options 3173 * and thus we normally have somewhat less data in segments. 3174 */ 3175 tp->t_maxopd = mss; 3176 3177 /* 3178 * origoffer==-1 indicates that no segments were received yet. 3179 * In this case we just guess. 3180 */ 3181 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 3182 (origoffer == -1 || 3183 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)) 3184 mss -= TCPOLEN_TSTAMP_APPA; 3185 3186 #if (MCLBYTES & (MCLBYTES - 1)) == 0 3187 if (mss > MCLBYTES) 3188 mss &= ~(MCLBYTES-1); 3189 #else 3190 if (mss > MCLBYTES) 3191 mss = mss / MCLBYTES * MCLBYTES; 3192 #endif 3193 tp->t_maxseg = mss; 3194 } 3195 3196 void 3197 tcp_mss(struct tcpcb *tp, int offer) 3198 { 3199 int rtt, mss; 3200 u_long bufsize; 3201 struct inpcb *inp; 3202 struct socket *so; 3203 struct hc_metrics_lite metrics; 3204 int mtuflags = 0; 3205 #ifdef INET6 3206 int isipv6; 3207 #endif 3208 KASSERT(tp != NULL, ("%s: tp == NULL", __func__)); 3209 INIT_VNET_INET(tp->t_vnet); 3210 3211 tcp_mss_update(tp, offer, &metrics, &mtuflags); 3212 3213 mss = tp->t_maxseg; 3214 inp = tp->t_inpcb; 3215 #ifdef INET6 3216 isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0; 3217 #endif 3218 3219 /* 3220 * If there's a pipesize, change the socket buffer to that size, 3221 * don't change if sb_hiwat is different than default (then it 3222 * has been changed on purpose with setsockopt). 3223 * Make the socket buffers an integral number of mss units; 3224 * if the mss is larger than the socket buffer, decrease the mss. 
3225 */ 3226 so = inp->inp_socket; 3227 SOCKBUF_LOCK(&so->so_snd); 3228 if ((so->so_snd.sb_hiwat == tcp_sendspace) && metrics.rmx_sendpipe) 3229 bufsize = metrics.rmx_sendpipe; 3230 else 3231 bufsize = so->so_snd.sb_hiwat; 3232 if (bufsize < mss) 3233 mss = bufsize; 3234 else { 3235 bufsize = roundup(bufsize, mss); 3236 if (bufsize > sb_max) 3237 bufsize = sb_max; 3238 if (bufsize > so->so_snd.sb_hiwat) 3239 (void)sbreserve_locked(&so->so_snd, bufsize, so, NULL); 3240 } 3241 SOCKBUF_UNLOCK(&so->so_snd); 3242 tp->t_maxseg = mss; 3243 3244 SOCKBUF_LOCK(&so->so_rcv); 3245 if ((so->so_rcv.sb_hiwat == tcp_recvspace) && metrics.rmx_recvpipe) 3246 bufsize = metrics.rmx_recvpipe; 3247 else 3248 bufsize = so->so_rcv.sb_hiwat; 3249 if (bufsize > mss) { 3250 bufsize = roundup(bufsize, mss); 3251 if (bufsize > sb_max) 3252 bufsize = sb_max; 3253 if (bufsize > so->so_rcv.sb_hiwat) 3254 (void)sbreserve_locked(&so->so_rcv, bufsize, so, NULL); 3255 } 3256 SOCKBUF_UNLOCK(&so->so_rcv); 3257 /* 3258 * While we're here, check the others too. 3259 */ 3260 if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) { 3261 tp->t_srtt = rtt; 3262 tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE; 3263 V_tcpstat.tcps_usedrtt++; 3264 if (metrics.rmx_rttvar) { 3265 tp->t_rttvar = metrics.rmx_rttvar; 3266 V_tcpstat.tcps_usedrttvar++; 3267 } else { 3268 /* default variation is +- 1 rtt */ 3269 tp->t_rttvar = 3270 tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE; 3271 } 3272 TCPT_RANGESET(tp->t_rxtcur, 3273 ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1, 3274 tp->t_rttmin, TCPTV_REXMTMAX); 3275 } 3276 if (metrics.rmx_ssthresh) { 3277 /* 3278 * There's some sort of gateway or interface 3279 * buffer limit on the path. Use this to set 3280 * the slow start threshhold, but set the 3281 * threshold to no less than 2*mss. 3282 */ 3283 tp->snd_ssthresh = max(2 * mss, metrics.rmx_ssthresh); 3284 V_tcpstat.tcps_usedssthresh++; 3285 } 3286 if (metrics.rmx_bandwidth) 3287 tp->snd_bandwidth = metrics.rmx_bandwidth; 3288 3289 /* 3290 * Set the slow-start flight size depending on whether this 3291 * is a local network or not. 3292 * 3293 * Extend this so we cache the cwnd too and retrieve it here. 3294 * Make cwnd even bigger than RFC3390 suggests but only if we 3295 * have previous experience with the remote host. Be careful 3296 * not make cwnd bigger than remote receive window or our own 3297 * send socket buffer. Maybe put some additional upper bound 3298 * on the retrieved cwnd. Should do incremental updates to 3299 * hostcache when cwnd collapses so next connection doesn't 3300 * overloads the path again. 3301 * 3302 * RFC3390 says only do this if SYN or SYN/ACK didn't got lost. 3303 * We currently check only in syncache_socket for that. 3304 */ 3305 #define TCP_METRICS_CWND 3306 #ifdef TCP_METRICS_CWND 3307 if (metrics.rmx_cwnd) 3308 tp->snd_cwnd = max(mss, 3309 min(metrics.rmx_cwnd / 2, 3310 min(tp->snd_wnd, so->so_snd.sb_hiwat))); 3311 else 3312 #endif 3313 if (V_tcp_do_rfc3390) 3314 tp->snd_cwnd = min(4 * mss, max(2 * mss, 4380)); 3315 #ifdef INET6 3316 else if ((isipv6 && in6_localaddr(&inp->in6p_faddr)) || 3317 (!isipv6 && in_localaddr(inp->inp_faddr))) 3318 #else 3319 else if (in_localaddr(inp->inp_faddr)) 3320 #endif 3321 tp->snd_cwnd = mss * V_ss_fltsz_local; 3322 else 3323 tp->snd_cwnd = mss * V_ss_fltsz; 3324 3325 /* Check the interface for TSO capabilities. */ 3326 if (mtuflags & CSUM_TSO) 3327 tp->t_flags |= TF_TSO; 3328 } 3329 3330 /* 3331 * Determine the MSS option to send on an outgoing SYN. 
3332 */ 3333 int 3334 tcp_mssopt(struct in_conninfo *inc) 3335 { 3336 INIT_VNET_INET(curvnet); 3337 int mss = 0; 3338 u_long maxmtu = 0; 3339 u_long thcmtu = 0; 3340 size_t min_protoh; 3341 #ifdef INET6 3342 int isipv6 = inc->inc_isipv6 ? 1 : 0; 3343 #endif 3344 3345 KASSERT(inc != NULL, ("tcp_mssopt with NULL in_conninfo pointer")); 3346 3347 #ifdef INET6 3348 if (isipv6) { 3349 mss = V_tcp_v6mssdflt; 3350 maxmtu = tcp_maxmtu6(inc, NULL); 3351 thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */ 3352 min_protoh = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 3353 } else 3354 #endif 3355 { 3356 mss = V_tcp_mssdflt; 3357 maxmtu = tcp_maxmtu(inc, NULL); 3358 thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */ 3359 min_protoh = sizeof(struct tcpiphdr); 3360 } 3361 if (maxmtu && thcmtu) 3362 mss = min(maxmtu, thcmtu) - min_protoh; 3363 else if (maxmtu || thcmtu) 3364 mss = max(maxmtu, thcmtu) - min_protoh; 3365 3366 return (mss); 3367 } 3368 3369 3370 /* 3371 * On a partial ack arrives, force the retransmission of the 3372 * next unacknowledged segment. Do not clear tp->t_dupacks. 3373 * By setting snd_nxt to ti_ack, this forces retransmission timer to 3374 * be started again. 3375 */ 3376 static void 3377 tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th) 3378 { 3379 tcp_seq onxt = tp->snd_nxt; 3380 u_long ocwnd = tp->snd_cwnd; 3381 3382 INP_WLOCK_ASSERT(tp->t_inpcb); 3383 3384 tcp_timer_activate(tp, TT_REXMT, 0); 3385 tp->t_rtttime = 0; 3386 tp->snd_nxt = th->th_ack; 3387 /* 3388 * Set snd_cwnd to one segment beyond acknowledged offset. 3389 * (tp->snd_una has not yet been updated when this function is called.) 3390 */ 3391 tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una); 3392 tp->t_flags |= TF_ACKNOW; 3393 (void) tcp_output(tp); 3394 tp->snd_cwnd = ocwnd; 3395 if (SEQ_GT(onxt, tp->snd_nxt)) 3396 tp->snd_nxt = onxt; 3397 /* 3398 * Partial window deflation. Relies on fact that tp->snd_una 3399 * not updated yet. 3400 */ 3401 if (tp->snd_cwnd > th->th_ack - tp->snd_una) 3402 tp->snd_cwnd -= th->th_ack - tp->snd_una; 3403 else 3404 tp->snd_cwnd = 0; 3405 tp->snd_cwnd += tp->t_maxseg; 3406 } 3407