/*-
 * Copyright (c) 2001 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Jonathan Lemon
 * and NAI Labs, the Security Research Division of Network Associates, Inc.
 * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
 * DARPA CHATS research program.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

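/*
 * TCP SYN cache.  Embryonic connections are tracked in this small,
 * fixed-size cache rather than in full-blown control blocks until the
 * three-way handshake completes, bounding the memory that a flood of
 * SYNs can tie up.  See the IMPORTANT NOTE above syncache_add() below.
 */
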
#include "opt_inet6.h"
#include "opt_ipsec.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/md5.h>
#include <sys/proc.h>		/* for proc0 declaration */
#include <sys/random.h>
#include <sys/socket.h>
#include <sys/socketvar.h>

#include <net/if.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_var.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#ifdef INET6
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet6/nd6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_pcb.h>
#endif
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif

#ifdef IPSEC
#include <netinet6/ipsec.h>
#ifdef INET6
#include <netinet6/ipsec6.h>
#endif
#include <netkey/key.h>
#endif /*IPSEC*/

#include <machine/in_cksum.h>
#include <vm/vm_zone.h>

static void	 syncache_drop(struct syncache *, struct syncache_head *);
static void	 syncache_free(struct syncache *);
static int	 syncache_insert(struct syncache *, struct syncache_head *);
struct syncache *syncache_lookup(struct in_conninfo *, struct syncache_head **);
static int	 syncache_respond(struct syncache *, struct mbuf *);
static struct socket *syncache_socket(struct syncache *, struct socket *);
static void	 syncache_timer(void *);

/*
 * Transmit the SYN,ACK fewer times than TCP_MAXRXTSHIFT specifies.
 * 3 retransmits corresponds to a timeout of (1 + 2 + 4 + 8 == 15) seconds;
 * the odds are that the user has given up attempting to connect by then.
 */
#define SYNCACHE_MAXREXMTS		3

/* Arbitrary values */
#define TCP_SYNCACHE_HASHSIZE		512
#define TCP_SYNCACHE_BUCKETLIMIT	30

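/*
 * With the default sizes above, the cache as a whole is limited to
 * 512 * 30 == 15360 entries.  The hash size and both limits can be
 * overridden at boot time through the loader tunables fetched in
 * syncache_init() below.
 */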
struct tcp_syncache {
	struct	syncache_head *hashbase;
	struct	vm_zone *zone;
	u_int	hashsize;
	u_int	hashmask;
	u_int	bucket_limit;
	u_int	cache_count;
	u_int	cache_limit;
	u_int	rexmt_limit;
	u_int	hash_secret;
	u_int	next_reseed;
	TAILQ_HEAD(, syncache) timerq[SYNCACHE_MAXREXMTS + 1];
	struct	callout tt_timerq[SYNCACHE_MAXREXMTS + 1];
};
static struct tcp_syncache tcp_syncache;

SYSCTL_NODE(_net_inet_tcp, OID_AUTO, syncache, CTLFLAG_RW, 0, "TCP SYN cache");

SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, bucketlimit, CTLFLAG_RD,
    &tcp_syncache.bucket_limit, 0, "Per-bucket hash limit for syncache");

SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, cachelimit, CTLFLAG_RD,
    &tcp_syncache.cache_limit, 0, "Overall entry limit for syncache");

SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, count, CTLFLAG_RD,
    &tcp_syncache.cache_count, 0, "Current number of entries in syncache");

SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, hashsize, CTLFLAG_RD,
    &tcp_syncache.hashsize, 0, "Size of TCP syncache hashtable");

SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, rexmtlimit, CTLFLAG_RW,
    &tcp_syncache.rexmt_limit, 0, "Limit on SYN/ACK retransmissions");

static MALLOC_DEFINE(M_SYNCACHE, "syncache", "TCP syncache");

#define SYNCACHE_HASH(inc, mask)					\
	((tcp_syncache.hash_secret ^					\
	  (inc)->inc_faddr.s_addr ^					\
	  ((inc)->inc_faddr.s_addr >> 16) ^				\
	  (inc)->inc_fport ^ (inc)->inc_lport) & mask)

#define SYNCACHE_HASH6(inc, mask)					\
	((tcp_syncache.hash_secret ^					\
	  (inc)->inc6_faddr.s6_addr32[0] ^				\
	  (inc)->inc6_faddr.s6_addr32[3] ^				\
	  (inc)->inc_fport ^ (inc)->inc_lport) & mask)

#define ENDPTS_EQ(a, b) (						\
	(a)->ie_fport == (b)->ie_fport &&				\
	(a)->ie_lport == (b)->ie_lport &&				\
	(a)->ie_faddr.s_addr == (b)->ie_faddr.s_addr &&			\
	(a)->ie_laddr.s_addr == (b)->ie_laddr.s_addr			\
)

#define ENDPTS6_EQ(a, b) (memcmp(a, b, sizeof(*a)) == 0)

#define SYNCACHE_TIMEOUT(sc, slot) do {					\
	sc->sc_rxtslot = slot;						\
	sc->sc_rxttime = ticks + TCPTV_RTOBASE * tcp_backoff[slot];	\
	TAILQ_INSERT_TAIL(&tcp_syncache.timerq[slot], sc, sc_timerq);	\
	if (!callout_active(&tcp_syncache.tt_timerq[slot]))		\
		callout_reset(&tcp_syncache.tt_timerq[slot],		\
		    TCPTV_RTOBASE * tcp_backoff[slot],			\
		    syncache_timer, (void *)((int)slot));		\
} while (0)

static void
syncache_free(struct syncache *sc)
{
	struct rtentry *rt;

	if (sc->sc_ipopts)
		(void) m_free(sc->sc_ipopts);
#ifdef INET6
	if (sc->sc_inc.inc_isipv6)
		rt = sc->sc_route6.ro_rt;
	else
#endif
		rt = sc->sc_route.ro_rt;
	if (rt != NULL) {
		/*
		 * If this is the only reference to a protocol-cloned
		 * route, remove it immediately.
		 */
		if (rt->rt_flags & RTF_WASCLONED &&
		    (sc->sc_flags & SCF_KEEPROUTE) == 0 &&
		    rt->rt_refcnt == 1)
			rtrequest(RTM_DELETE, rt_key(rt),
			    rt->rt_gateway, rt_mask(rt),
			    rt->rt_flags, NULL);
		RTFREE(rt);
	}
	zfree(tcp_syncache.zone, sc);
}

void
syncache_init(void)
{
	int i;

	tcp_syncache.cache_count = 0;
	tcp_syncache.hashsize = TCP_SYNCACHE_HASHSIZE;
	tcp_syncache.bucket_limit = TCP_SYNCACHE_BUCKETLIMIT;
	tcp_syncache.cache_limit =
	    tcp_syncache.hashsize * tcp_syncache.bucket_limit;
	tcp_syncache.rexmt_limit = SYNCACHE_MAXREXMTS;
	tcp_syncache.next_reseed = 0;
	tcp_syncache.hash_secret = arc4random();

	TUNABLE_INT_FETCH("net.inet.tcp.syncache.hashsize",
	    &tcp_syncache.hashsize);
	TUNABLE_INT_FETCH("net.inet.tcp.syncache.cachelimit",
	    &tcp_syncache.cache_limit);
	TUNABLE_INT_FETCH("net.inet.tcp.syncache.bucketlimit",
	    &tcp_syncache.bucket_limit);
	if (!powerof2(tcp_syncache.hashsize)) {
		printf("WARNING: syncache hash size is not a power of 2.\n");
		tcp_syncache.hashsize = 512;	/* safe default */
	}
	tcp_syncache.hashmask = tcp_syncache.hashsize - 1;

	/* Allocate the hash table. */
	MALLOC(tcp_syncache.hashbase, struct syncache_head *,
	    tcp_syncache.hashsize * sizeof(struct syncache_head),
	    M_SYNCACHE, M_WAITOK | M_ZERO);

	/* Initialize the hash buckets. */
	for (i = 0; i < tcp_syncache.hashsize; i++) {
		TAILQ_INIT(&tcp_syncache.hashbase[i].sch_bucket);
		tcp_syncache.hashbase[i].sch_length = 0;
	}

	/* Initialize the timer queues. */
	for (i = 0; i <= SYNCACHE_MAXREXMTS; i++) {
		TAILQ_INIT(&tcp_syncache.timerq[i]);
		callout_init(&tcp_syncache.tt_timerq[i], 0);
	}

	/*
	 * Allocate the syncache entries.  Allow the zone to allocate one
	 * more entry than cache limit, so a new entry can bump out an
	 * older one.
	 */
	tcp_syncache.cache_limit -= 1;
	tcp_syncache.zone = zinit("syncache", sizeof(struct syncache),
	    tcp_syncache.cache_limit, ZONE_INTERRUPT, 0);
}

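/*
 * Insert an entry into its hash bucket, making room first if either
 * the bucket or the cache as a whole is full.  Entries are appended
 * to the per-slot timer queues, so each queue stays ordered by
 * retransmit deadline and the oldest entry in the cache is found at
 * the front of the highest-numbered non-empty queue.
 */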
static int
syncache_insert(sc, sch)
	struct syncache *sc;
	struct syncache_head *sch;
{
	struct syncache *sc2;
	int s, i;

	/*
	 * Make sure that we don't overflow the per-bucket
	 * limit or the total cache size limit.
	 */
	s = splnet();
	if (sch->sch_length >= tcp_syncache.bucket_limit) {
		/*
		 * The bucket is full, toss the oldest element.
		 */
		sc2 = TAILQ_FIRST(&sch->sch_bucket);
		syncache_drop(sc2, sch);
		tcpstat.tcps_sc_bucketoverflow++;
	} else if (tcp_syncache.cache_count >= tcp_syncache.cache_limit) {
		/*
		 * The cache is full.  Toss the oldest entry in the
		 * entire cache.  This is the front entry in the
		 * first non-empty timer queue with the largest
		 * timeout value.
		 */
		for (i = SYNCACHE_MAXREXMTS; i >= 0; i--) {
			sc2 = TAILQ_FIRST(&tcp_syncache.timerq[i]);
			if (sc2 != NULL)
				break;
		}
		syncache_drop(sc2, NULL);
		tcpstat.tcps_sc_cacheoverflow++;
	}

	/* Initialize the entry's timer. */
	SYNCACHE_TIMEOUT(sc, 0);

	/* Put it into the bucket. */
	TAILQ_INSERT_TAIL(&sch->sch_bucket, sc, sc_hash);
	sch->sch_length++;
	tcp_syncache.cache_count++;
	tcpstat.tcps_sc_added++;
	splx(s);
	return (1);
}

static void
syncache_drop(sc, sch)
	struct syncache *sc;
	struct syncache_head *sch;
{
	int s;

	if (sch == NULL) {
#ifdef INET6
		if (sc->sc_inc.inc_isipv6) {
			sch = &tcp_syncache.hashbase[
			    SYNCACHE_HASH6(&sc->sc_inc, tcp_syncache.hashmask)];
		} else
#endif
		{
			sch = &tcp_syncache.hashbase[
			    SYNCACHE_HASH(&sc->sc_inc, tcp_syncache.hashmask)];
		}
	}

	s = splnet();

	TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash);
	sch->sch_length--;
	tcp_syncache.cache_count--;

	TAILQ_REMOVE(&tcp_syncache.timerq[sc->sc_rxtslot], sc, sc_timerq);
	if (TAILQ_EMPTY(&tcp_syncache.timerq[sc->sc_rxtslot]))
		callout_stop(&tcp_syncache.tt_timerq[sc->sc_rxtslot]);
	splx(s);

	syncache_free(sc);
}

/*
 * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted.
 * If we have retransmitted an entry the maximum number of times, expire it.
 */
static void
syncache_timer(xslot)
	void *xslot;
{
	int slot = (int)xslot;
	struct syncache *sc, *nsc;
	struct inpcb *inp;
	int s;

	s = splnet();
	if (callout_pending(&tcp_syncache.tt_timerq[slot]) ||
	    !callout_active(&tcp_syncache.tt_timerq[slot])) {
		splx(s);
		return;
	}
	callout_deactivate(&tcp_syncache.tt_timerq[slot]);

	nsc = TAILQ_FIRST(&tcp_syncache.timerq[slot]);
	while (nsc != NULL) {
		if (ticks < nsc->sc_rxttime)
			break;
		sc = nsc;
		nsc = TAILQ_NEXT(sc, sc_timerq);
		inp = sc->sc_tp->t_inpcb;
		if (slot == SYNCACHE_MAXREXMTS ||
		    slot >= tcp_syncache.rexmt_limit ||
		    inp->inp_gencnt != sc->sc_inp_gencnt) {
			syncache_drop(sc, NULL);
			tcpstat.tcps_sc_stale++;
			continue;
		}
		(void) syncache_respond(sc, NULL);
		tcpstat.tcps_sc_retransmitted++;
		TAILQ_REMOVE(&tcp_syncache.timerq[slot], sc, sc_timerq);
		SYNCACHE_TIMEOUT(sc, slot + 1);
	}
	if (nsc != NULL)
		callout_reset(&tcp_syncache.tt_timerq[slot],
		    nsc->sc_rxttime - ticks, syncache_timer, (void *)(slot));
	splx(s);
}

/*
 * Find an entry in the syncache.
 */
struct syncache *
syncache_lookup(inc, schp)
	struct in_conninfo *inc;
	struct syncache_head **schp;
{
	struct syncache *sc;
	struct syncache_head *sch;
	int s;

#ifdef INET6
	if (inc->inc_isipv6) {
		sch = &tcp_syncache.hashbase[
		    SYNCACHE_HASH6(inc, tcp_syncache.hashmask)];
		*schp = sch;
		s = splnet();
		TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) {
			if (ENDPTS6_EQ(&inc->inc_ie, &sc->sc_inc.inc_ie)) {
				splx(s);
				return (sc);
			}
		}
		splx(s);
	} else
#endif
	{
		sch = &tcp_syncache.hashbase[
		    SYNCACHE_HASH(inc, tcp_syncache.hashmask)];
		*schp = sch;
		s = splnet();
		TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) {
#ifdef INET6
			if (sc->sc_inc.inc_isipv6)
				continue;
#endif
			if (ENDPTS_EQ(&inc->inc_ie, &sc->sc_inc.inc_ie)) {
				splx(s);
				return (sc);
			}
		}
		splx(s);
	}
	return (NULL);
}

/*
 * This function is called when we get a RST for a
 * non-existent connection, so that we can see if the
 * connection is in the syn cache.  If it is, zap it.
 */
void
syncache_chkrst(inc, th)
	struct in_conninfo *inc;
	struct tcphdr *th;
{
	struct syncache *sc;
	struct syncache_head *sch;

	sc = syncache_lookup(inc, &sch);
	if (sc == NULL)
		return;
	/*
	 * If the RST bit is set, check the sequence number to see
	 * if this is a valid reset segment.
	 * RFC 793 page 37:
	 *   In all states except SYN-SENT, all reset (RST) segments
	 *   are validated by checking their SEQ-fields.  A reset is
	 *   valid if its sequence number is in the window.
	 *
	 * The sequence number in the reset segment is normally an
	 * echo of our outgoing acknowledgement numbers, but some hosts
	 * send a reset with the sequence number at the rightmost edge
	 * of our receive window, and we have to handle this case.
	 */
	if (SEQ_GEQ(th->th_seq, sc->sc_irs) &&
	    SEQ_LEQ(th->th_seq, sc->sc_irs + sc->sc_wnd)) {
		syncache_drop(sc, sch);
		tcpstat.tcps_sc_reset++;
	}
}

void
syncache_badack(inc)
	struct in_conninfo *inc;
{
	struct syncache *sc;
	struct syncache_head *sch;

	sc = syncache_lookup(inc, &sch);
	if (sc != NULL) {
		syncache_drop(sc, sch);
		tcpstat.tcps_sc_badack++;
	}
}

void
syncache_unreach(inc, th)
	struct in_conninfo *inc;
	struct tcphdr *th;
{
	struct syncache *sc;
	struct syncache_head *sch;

	/* we are called at splnet() here */
	sc = syncache_lookup(inc, &sch);
	if (sc == NULL)
		return;

	/* If the sequence number != sc_iss, then it's a bogus ICMP msg */
	if (ntohl(th->th_seq) != sc->sc_iss)
		return;

	/*
	 * If we've retransmitted 3 times and this is our second error,
	 * we remove the entry.  Otherwise, we allow it to continue on.
	 * This prevents us from incorrectly nuking an entry during a
	 * spurious network outage.
	 *
	 * See tcp_notify().
	 */
	if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtslot < 3) {
		sc->sc_flags |= SCF_UNREACH;
		return;
	}
	syncache_drop(sc, sch);
	tcpstat.tcps_sc_unreach++;
}

/*
 * Build a new TCP socket structure from a syncache entry.
 */
static struct socket *
syncache_socket(sc, lso)
	struct syncache *sc;
	struct socket *lso;
{
	struct inpcb *inp = NULL;
	struct socket *so;
	struct tcpcb *tp;

	/*
	 * Ok, create the full blown connection, and set things up
	 * as they would have been set up if we had created the
	 * connection when the SYN arrived.  If we can't create
	 * the connection, abort it.
	 */
	so = sonewconn(lso, SS_ISCONNECTED);
	if (so == NULL) {
		/*
		 * Drop the connection; we will send a RST if the peer
		 * retransmits the ACK.
		 */
		tcpstat.tcps_listendrop++;
		goto abort;
	}

	inp = sotoinpcb(so);

	/*
	 * Insert new socket into hash list.
	 */
#ifdef INET6
	if (sc->sc_inc.inc_isipv6) {
		inp->in6p_laddr = sc->sc_inc.inc6_laddr;
	} else {
		inp->inp_vflag &= ~INP_IPV6;
		inp->inp_vflag |= INP_IPV4;
#endif
		inp->inp_laddr = sc->sc_inc.inc_laddr;
#ifdef INET6
	}
#endif
	inp->inp_lport = sc->sc_inc.inc_lport;
	if (in_pcbinshash(inp) != 0) {
		/*
		 * Undo the assignments above if we failed to
		 * put the PCB on the hash lists.
		 */
#ifdef INET6
		if (sc->sc_inc.inc_isipv6)
			inp->in6p_laddr = in6addr_any;
		else
#endif
			inp->inp_laddr.s_addr = INADDR_ANY;
		inp->inp_lport = 0;
		goto abort;
	}
#ifdef IPSEC
	/* copy old policy into new socket's */
	if (ipsec_copy_policy(sotoinpcb(lso)->inp_sp, inp->inp_sp))
		printf("syncache_expand: could not copy policy\n");
#endif
#ifdef INET6
	if (sc->sc_inc.inc_isipv6) {
		struct inpcb *oinp = sotoinpcb(lso);
		struct in6_addr laddr6;
		struct sockaddr_in6 *sin6;
		/*
		 * Inherit socket options from the listening socket.
		 * Note that in6p_inputopts are not (and should not be)
		 * copied, since it stores previously received options and is
		 * used to detect if each new option is different from the
		 * previous one and hence should be passed to a user.
		 * If we copied in6p_inputopts, a user would not be able to
		 * receive options just after calling the accept system call.
		 */
		inp->inp_flags |= oinp->inp_flags & INP_CONTROLOPTS;
		if (oinp->in6p_outputopts)
			inp->in6p_outputopts =
			    ip6_copypktopts(oinp->in6p_outputopts, M_NOWAIT);
		inp->in6p_route = sc->sc_route6;
		sc->sc_route6.ro_rt = NULL;

		MALLOC(sin6, struct sockaddr_in6 *, sizeof *sin6,
		    M_SONAME, M_NOWAIT | M_ZERO);
		if (sin6 == NULL)
			goto abort;
		sin6->sin6_family = AF_INET6;
		sin6->sin6_len = sizeof(*sin6);
		sin6->sin6_addr = sc->sc_inc.inc6_faddr;
		sin6->sin6_port = sc->sc_inc.inc_fport;
		laddr6 = inp->in6p_laddr;
		if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
			inp->in6p_laddr = sc->sc_inc.inc6_laddr;
		if (in6_pcbconnect(inp, (struct sockaddr *)sin6, &thread0)) {
			inp->in6p_laddr = laddr6;
			FREE(sin6, M_SONAME);
			goto abort;
		}
		FREE(sin6, M_SONAME);
	} else
#endif
	{
		struct in_addr laddr;
		struct sockaddr_in *sin;

		inp->inp_options = ip_srcroute();
		if (inp->inp_options == NULL) {
			inp->inp_options = sc->sc_ipopts;
			sc->sc_ipopts = NULL;
		}
		inp->inp_route = sc->sc_route;
		sc->sc_route.ro_rt = NULL;

		MALLOC(sin, struct sockaddr_in *, sizeof *sin,
		    M_SONAME, M_NOWAIT | M_ZERO);
		if (sin == NULL)
			goto abort;
		sin->sin_family = AF_INET;
		sin->sin_len = sizeof(*sin);
		sin->sin_addr = sc->sc_inc.inc_faddr;
		sin->sin_port = sc->sc_inc.inc_fport;
		bzero((caddr_t)sin->sin_zero, sizeof(sin->sin_zero));
		laddr = inp->inp_laddr;
		if (inp->inp_laddr.s_addr == INADDR_ANY)
			inp->inp_laddr = sc->sc_inc.inc_laddr;
		if (in_pcbconnect(inp, (struct sockaddr *)sin, &thread0)) {
			inp->inp_laddr = laddr;
			FREE(sin, M_SONAME);
			goto abort;
		}
		FREE(sin, M_SONAME);
	}

	tp = intotcpcb(inp);
	tp->t_state = TCPS_SYN_RECEIVED;
	tp->iss = sc->sc_iss;
	tp->irs = sc->sc_irs;
	tcp_rcvseqinit(tp);
	tcp_sendseqinit(tp);
	tp->snd_wl1 = sc->sc_irs;
	tp->rcv_up = sc->sc_irs + 1;
	tp->rcv_wnd = sc->sc_wnd;
	tp->rcv_adv += tp->rcv_wnd;

	tp->t_flags = sc->sc_tp->t_flags & TF_NOPUSH;
	if (sc->sc_flags & SCF_NOOPT)
		tp->t_flags |= TF_NOOPT;
	if (sc->sc_flags & SCF_WINSCALE) {
		tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE;
		tp->requested_s_scale = sc->sc_requested_s_scale;
		tp->request_r_scale = sc->sc_request_r_scale;
	}
	if (sc->sc_flags & SCF_TIMESTAMP) {
		tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP;
		tp->ts_recent = sc->sc_tsrecent;
		tp->ts_recent_age = ticks;
	}
	if (sc->sc_flags & SCF_CC) {
		/*
		 * Initialization of the tcpcb for transaction;
		 *   set SND.WND = SEG.WND,
		 *   initialize CCsend and CCrecv.
		 */
		tp->t_flags |= TF_REQ_CC|TF_RCVD_CC;
		tp->cc_send = sc->sc_cc_send;
		tp->cc_recv = sc->sc_cc_recv;
	}

	tcp_mss(tp, sc->sc_peer_mss);

	/*
	 * If the SYN,ACK was retransmitted, reset cwnd to 1 segment.
	 */
	if (sc->sc_rxtslot != 0)
		tp->snd_cwnd = tp->t_maxseg;
	callout_reset(tp->tt_keep, tcp_keepinit, tcp_timer_keep, tp);

	tcpstat.tcps_accepts++;
	return (so);

abort:
	if (so != NULL)
		(void) soabort(so);
	return (NULL);
}

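/*
 * A rough sketch of how the expected caller, tcp_input(), drives the
 * two halves of the cache (arguments abbreviated): a SYN arriving for
 * a socket in the LISTEN state goes through
 *
 *	(void) syncache_add(&inc, &to, th, &so, m);
 *
 * which replies with a SYN,ACK and normally leaves so NULL (the TAO
 * acceptance path below is the exception), while the ACK completing
 * the handshake goes through
 *
 *	if (syncache_expand(&inc, th, &so, m))
 *		... so is the new connection's socket ...
 *
 * so only handshakes that complete ever allocate a full socket.
 */
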
/*
 * This function gets called when we receive an ACK for a
 * socket in the LISTEN state.  We look up the connection
 * in the syncache, and if it is there, we pull it out of
 * the cache and turn it into a full-blown connection in
 * the SYN-RECEIVED state.
 */
int
syncache_expand(inc, th, sop, m)
	struct in_conninfo *inc;
	struct tcphdr *th;
	struct socket **sop;
	struct mbuf *m;
{
	struct syncache *sc;
	struct syncache_head *sch;
	struct socket *so;

	sc = syncache_lookup(inc, &sch);
	if (sc == NULL)
		return (0);

	/*
	 * If seg contains an ACK, but not for our SYN/ACK, send a RST.
	 */
	if (th->th_ack != sc->sc_iss + 1)
		return (0);

	so = syncache_socket(sc, *sop);
	if (so == NULL) {
#if 0
resetandabort:
		/* XXXjlemon check this - is this correct? */
		(void) tcp_respond(NULL, m, m, th,
		    th->th_seq + tlen, (tcp_seq)0, TH_RST|TH_ACK);
#endif
		m_freem(m);			/* XXX only needed for above */
		tcpstat.tcps_sc_aborted++;
	} else {
		sc->sc_flags |= SCF_KEEPROUTE;
		tcpstat.tcps_sc_completed++;
	}
	if (sch == NULL)
		syncache_free(sc);
	else
		syncache_drop(sc, sch);
	*sop = so;
	return (1);
}

/*
 * Given a LISTEN socket and an inbound SYN request, add
 * this to the syn cache, and send back a segment:
 *	<SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
 * to the source.
 *
 * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN.
 * Doing so would require that we hold onto the data and deliver it
 * to the application.  However, if we are the target of a SYN-flood
 * DoS attack, an attacker could send data which would eventually
 * consume all available buffer space if it were ACKed.  By not ACKing
 * the data, we avoid this DoS scenario.
 */
int
syncache_add(inc, to, th, sop, m)
	struct in_conninfo *inc;
	struct tcpopt *to;
	struct tcphdr *th;
	struct socket **sop;
	struct mbuf *m;
{
	struct tcpcb *tp;
	struct socket *so;
	struct syncache *sc = NULL;
	struct syncache_head *sch;
	struct mbuf *ipopts = NULL;
	struct rmxp_tao *taop;
	int i, s, win;

	so = *sop;
	tp = sototcpcb(so);

	/*
	 * Remember the IP options, if any.
	 */
#ifdef INET6
	if (!inc->inc_isipv6)
#endif
		ipopts = ip_srcroute();

	/*
	 * See if we already have an entry for this connection.
	 * If we do, resend the SYN,ACK, and reset the retransmit timer.
	 *
	 * XXX
	 * Should the syncache be re-initialized with the contents
	 * of the new SYN here (which may have different options)?
	 */
	sc = syncache_lookup(inc, &sch);
	if (sc != NULL) {
		tcpstat.tcps_sc_dupsyn++;
		if (ipopts) {
			/*
			 * If we were remembering a previous source route,
			 * forget it and use the new one we've been given.
			 */
			if (sc->sc_ipopts)
				(void) m_free(sc->sc_ipopts);
			sc->sc_ipopts = ipopts;
		}
		/*
		 * Update timestamp if present.
		 */
		if (sc->sc_flags & SCF_TIMESTAMP)
			sc->sc_tsrecent = to->to_tsval;
		if (syncache_respond(sc, m) == 0) {
			s = splnet();
			TAILQ_REMOVE(&tcp_syncache.timerq[sc->sc_rxtslot],
			    sc, sc_timerq);
			SYNCACHE_TIMEOUT(sc, sc->sc_rxtslot);
			splx(s);
			tcpstat.tcps_sndacks++;
			tcpstat.tcps_sndtotal++;
		}
		*sop = NULL;
		return (1);
	}

	sc = zalloc(tcp_syncache.zone);
	if (sc == NULL) {
		/*
		 * The zone allocator couldn't provide more entries.
		 * Treat this as if the cache was full; drop the oldest
		 * entry and insert the new one.
		 */
		s = splnet();
		for (i = SYNCACHE_MAXREXMTS; i >= 0; i--) {
			sc = TAILQ_FIRST(&tcp_syncache.timerq[i]);
			if (sc != NULL)
				break;
		}
		syncache_drop(sc, NULL);
		splx(s);
		tcpstat.tcps_sc_zonefail++;
		sc = zalloc(tcp_syncache.zone);
		if (sc == NULL) {
			if (ipopts)
				(void) m_free(ipopts);
			return (0);
		}
	}

	/*
	 * Fill in the syncache values.
	 */
	bzero(sc, sizeof(*sc));
	sc->sc_tp = tp;
	sc->sc_inp_gencnt = tp->t_inpcb->inp_gencnt;
	sc->sc_ipopts = ipopts;
	sc->sc_inc.inc_fport = inc->inc_fport;
	sc->sc_inc.inc_lport = inc->inc_lport;
#ifdef INET6
	sc->sc_inc.inc_isipv6 = inc->inc_isipv6;
	if (inc->inc_isipv6) {
		sc->sc_inc.inc6_faddr = inc->inc6_faddr;
		sc->sc_inc.inc6_laddr = inc->inc6_laddr;
		sc->sc_route6.ro_rt = NULL;
	} else
#endif
	{
		sc->sc_inc.inc_faddr = inc->inc_faddr;
		sc->sc_inc.inc_laddr = inc->inc_laddr;
		sc->sc_route.ro_rt = NULL;
	}
	sc->sc_irs = th->th_seq;

	/* Initial receive window: clip sbspace to [0 .. TCP_MAXWIN] */
	win = sbspace(&so->so_rcv);
	win = imax(win, 0);
	win = imin(win, TCP_MAXWIN);
	sc->sc_wnd = win;

	sc->sc_flags = 0;
	sc->sc_peer_mss = to->to_flags & TOF_MSS ? to->to_mss : 0;
	if (tcp_do_rfc1323) {
		/*
		 * A timestamp received in a SYN makes
		 * it ok to send timestamp requests and replies.
		 */
		if (to->to_flags & TOF_TS) {
			sc->sc_tsrecent = to->to_tsval;
			sc->sc_flags |= SCF_TIMESTAMP;
		}
		if (to->to_flags & TOF_SCALE) {
			int wscale = 0;

			/* Compute proper scaling value from buffer space */
			while (wscale < TCP_MAX_WINSHIFT &&
			    (TCP_MAXWIN << wscale) < so->so_rcv.sb_hiwat)
				wscale++;
			sc->sc_request_r_scale = wscale;
			sc->sc_requested_s_scale = to->to_requested_s_scale;
			sc->sc_flags |= SCF_WINSCALE;
		}
	}
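	/*
	 * Example of the scaling computation above: a 256 kB receive
	 * buffer yields wscale == 3, since 65535 << 2 == 262140 is
	 * still smaller than the buffer while 65535 << 3 is not.
	 */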
	if (tcp_do_rfc1644) {
		/*
		 * A CC or CC.new option received in a SYN makes
		 * it ok to send CC in subsequent segments.
		 */
		if (to->to_flags & (TOF_CC|TOF_CCNEW)) {
			sc->sc_cc_recv = to->to_cc;
			sc->sc_cc_send = CC_INC(tcp_ccgen);
			sc->sc_flags |= SCF_CC;
		}
	}
	if (tp->t_flags & TF_NOOPT)
		sc->sc_flags = SCF_NOOPT;

	/*
	 * XXX
	 * We have the option here of not doing TAO (even if the segment
	 * qualifies) and instead fall back to a normal 3WHS via the syncache.
	 * This allows us to apply synflood protection to TAO-qualifying SYNs
	 * also.  However, there should be a heuristic to determine when to
	 * do this, and none is present at the moment.
	 */

	/*
	 * Perform TAO test on incoming CC (SEG.CC) option, if any.
	 * - compare SEG.CC against cached CC from the same host, if any.
	 * - if SEG.CC > cached value, SYN must be new and is accepted
	 *	immediately: save new CC in the cache, mark the socket
	 *	connected, enter ESTABLISHED state, turn on flag to
	 *	send a SYN in the next segment.
	 *	A virtual advertised window is set in rcv_adv to
	 *	initialize SWS prevention.  Then enter normal segment
	 *	processing: drop SYN, process data and FIN.
	 * - otherwise do a normal 3-way handshake.
	 */
	taop = tcp_gettaocache(&sc->sc_inc);
	if ((to->to_flags & TOF_CC) != 0) {
		if (((tp->t_flags & TF_NOPUSH) != 0) &&
		    sc->sc_flags & SCF_CC &&
		    taop != NULL && taop->tao_cc != 0 &&
		    CC_GT(to->to_cc, taop->tao_cc)) {
			sc->sc_rxtslot = 0;
			so = syncache_socket(sc, *sop);
			if (so != NULL) {
				sc->sc_flags |= SCF_KEEPROUTE;
				taop->tao_cc = to->to_cc;
				*sop = so;
			}
			syncache_free(sc);
			return (so != NULL);
		}
	} else {
		/*
		 * No CC option, but maybe CC.NEW: invalidate cached value.
		 */
		if (taop != NULL)
			taop->tao_cc = 0;
	}
	/*
	 * TAO test failed or there was no CC option,
	 * do a standard 3-way handshake.
	 */
	sc->sc_iss = arc4random();
	if (syncache_insert(sc, sch)) {
		if (syncache_respond(sc, m) == 0) {
			tcpstat.tcps_sndacks++;
			tcpstat.tcps_sndtotal++;
		} else {
			syncache_drop(sc, sch);
			tcpstat.tcps_sc_dropped++;
		}
	} else {
		syncache_free(sc);
	}
	*sop = NULL;
	return (1);
}

static int
syncache_respond(sc, m)
	struct syncache *sc;
	struct mbuf *m;
{
	u_int8_t *optp;
	int optlen, error;
	u_int16_t tlen, hlen, mssopt;
	struct ip *ip = NULL;
	struct rtentry *rt;
	struct tcphdr *th;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
#endif

#ifdef INET6
	if (sc->sc_inc.inc_isipv6) {
		rt = tcp_rtlookup6(&sc->sc_inc);
		if (rt != NULL)
			mssopt = rt->rt_ifp->if_mtu -
			    (sizeof(struct ip6_hdr) + sizeof(struct tcphdr));
		else
			mssopt = tcp_v6mssdflt;
		hlen = sizeof(struct ip6_hdr);
	} else
#endif
	{
		rt = tcp_rtlookup(&sc->sc_inc);
		if (rt != NULL)
			mssopt = rt->rt_ifp->if_mtu -
			    (sizeof(struct ip) + sizeof(struct tcphdr));
		else
			mssopt = tcp_mssdflt;
		hlen = sizeof(struct ip);
	}

	/* Compute the size of the TCP options. */
	if (sc->sc_flags & SCF_NOOPT) {
		optlen = 0;
	} else {
		optlen = TCPOLEN_MAXSEG +
		    ((sc->sc_flags & SCF_WINSCALE) ? 4 : 0) +
		    ((sc->sc_flags & SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0) +
		    ((sc->sc_flags & SCF_CC) ? TCPOLEN_CC_APPA * 2 : 0);
	}
	tlen = hlen + sizeof(struct tcphdr) + optlen;

	/*
	 * XXX
	 * assume that the entire packet will fit in a header mbuf
	 */
	KASSERT(max_linkhdr + tlen <= MHLEN, ("syncache: mbuf too small"));

	/*
	 * XXX shouldn't this reuse the mbuf if possible?
	 * Create the IP+TCP header from scratch.
	 */
	if (m)
		m_freem(m);

	m = m_gethdr(M_DONTWAIT, MT_HEADER);
	if (m == NULL)
		return (ENOBUFS);
	m->m_data += max_linkhdr;
	m->m_len = tlen;
	m->m_pkthdr.len = tlen;
	m->m_pkthdr.rcvif = NULL;

#ifdef IPSEC
	/* use IPsec policy on listening socket to send SYN,ACK */
	if (ipsec_setsocket(m, sc->sc_tp->t_inpcb->inp_socket) != 0) {
		m_freem(m);
		return (ENOBUFS);
	}
#endif

#ifdef INET6
	if (sc->sc_inc.inc_isipv6) {
		ip6 = mtod(m, struct ip6_hdr *);
		ip6->ip6_vfc = IPV6_VERSION;
		ip6->ip6_nxt = IPPROTO_TCP;
		ip6->ip6_src = sc->sc_inc.inc6_laddr;
		ip6->ip6_dst = sc->sc_inc.inc6_faddr;
		ip6->ip6_plen = htons(tlen - hlen);
		/* ip6_hlim is set after checksum */
		/* ip6_flow = ??? */

		th = (struct tcphdr *)(ip6 + 1);
	} else
#endif
	{
		ip = mtod(m, struct ip *);
		ip->ip_v = IPVERSION;
		ip->ip_hl = sizeof(struct ip) >> 2;
		ip->ip_tos = 0;
		ip->ip_len = tlen;
		ip->ip_id = 0;
		ip->ip_off = 0;
		ip->ip_ttl = ip_defttl;
		ip->ip_sum = 0;
		ip->ip_p = IPPROTO_TCP;
		ip->ip_src = sc->sc_inc.inc_laddr;
		ip->ip_dst = sc->sc_inc.inc_faddr;

		th = (struct tcphdr *)(ip + 1);
	}
	th->th_sport = sc->sc_inc.inc_lport;
	th->th_dport = sc->sc_inc.inc_fport;

	th->th_seq = htonl(sc->sc_iss);
	th->th_ack = htonl(sc->sc_irs + 1);
	th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
	th->th_x2 = 0;
	th->th_flags = TH_SYN|TH_ACK;
	th->th_win = htons(sc->sc_wnd);
	th->th_urp = 0;

	/* Tack on the TCP options. */
	if (optlen == 0)
		goto no_options;
	optp = (u_int8_t *)(th + 1);
	*optp++ = TCPOPT_MAXSEG;
	*optp++ = TCPOLEN_MAXSEG;
	*optp++ = (mssopt >> 8) & 0xff;
	*optp++ = mssopt & 0xff;

	if (sc->sc_flags & SCF_WINSCALE) {
		*((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 |
		    TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 |
		    sc->sc_request_r_scale);
		optp += 4;
	}

	if (sc->sc_flags & SCF_TIMESTAMP) {
		u_int32_t *lp = (u_int32_t *)(optp);

		/* Form timestamp option as shown in appendix A of RFC 1323. */
		*lp++ = htonl(TCPOPT_TSTAMP_HDR);
		*lp++ = htonl(ticks);
		*lp = htonl(sc->sc_tsrecent);
		optp += TCPOLEN_TSTAMP_APPA;
	}

	/*
	 * Send CC and CC.echo if we received CC from our peer.
	 */
	if (sc->sc_flags & SCF_CC) {
		u_int32_t *lp = (u_int32_t *)(optp);

		*lp++ = htonl(TCPOPT_CC_HDR(TCPOPT_CC));
		*lp++ = htonl(sc->sc_cc_send);
		*lp++ = htonl(TCPOPT_CC_HDR(TCPOPT_CCECHO));
		*lp = htonl(sc->sc_cc_recv);
		optp += TCPOLEN_CC_APPA * 2;
	}
no_options:

#ifdef INET6
	if (sc->sc_inc.inc_isipv6) {
		struct route_in6 *ro6 = &sc->sc_route6;

		th->th_sum = 0;
		th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
		ip6->ip6_hlim = in6_selecthlim(NULL,
		    ro6->ro_rt ? ro6->ro_rt->rt_ifp : NULL);
		error = ip6_output(m, NULL, ro6, 0, NULL, NULL);
	} else
#endif
	{
		th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
		    htons(tlen - hlen + IPPROTO_TCP));
		m->m_pkthdr.csum_flags = CSUM_TCP;
		m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
		error = ip_output(m, sc->sc_ipopts, &sc->sc_route, 0, NULL);
	}
	return (error);
}