1 /* 2 * INET An implementation of the TCP/IP protocol suite for the LINUX 3 * operating system. INET is implemented using the BSD Socket 4 * interface as the means of communication with the user level. 5 * 6 * Implementation of the Transmission Control Protocol(TCP). 7 * 8 * IPv4 specific functions 9 * 10 * 11 * code split from: 12 * linux/ipv4/tcp.c 13 * linux/ipv4/tcp_input.c 14 * linux/ipv4/tcp_output.c 15 * 16 * See tcp.c for author information 17 * 18 * This program is free software; you can redistribute it and/or 19 * modify it under the terms of the GNU General Public License 20 * as published by the Free Software Foundation; either version 21 * 2 of the License, or (at your option) any later version. 22 */ 23 24 /* 25 * Changes: 26 * David S. Miller : New socket lookup architecture. 27 * This code is dedicated to John Dyson. 28 * David S. Miller : Change semantics of established hash, 29 * half is devoted to TIME_WAIT sockets 30 * and the rest go in the other half. 31 * Andi Kleen : Add support for syncookies and fixed 32 * some bugs: ip options weren't passed to 33 * the TCP layer, missed a check for an 34 * ACK bit. 35 * Andi Kleen : Implemented fast path mtu discovery. 36 * Fixed many serious bugs in the 37 * request_sock handling and moved 38 * most of it into the af independent code. 39 * Added tail drop and some other bugfixes. 40 * Added new listen semantics. 41 * Mike McLagan : Routing by source 42 * Juan Jose Ciarlante: ip_dynaddr bits 43 * Andi Kleen: various fixes. 44 * Vitaly E. Lavrov : Transparent proxy revived after year 45 * coma. 46 * Andi Kleen : Fix new listen. 47 * Andi Kleen : Fix accept error reporting. 48 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which 49 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind 50 * a single port at the same time. 51 */ 52 53 #define pr_fmt(fmt) "TCP: " fmt 54 55 #include <linux/bottom_half.h> 56 #include <linux/types.h> 57 #include <linux/fcntl.h> 58 #include <linux/module.h> 59 #include <linux/random.h> 60 #include <linux/cache.h> 61 #include <linux/jhash.h> 62 #include <linux/init.h> 63 #include <linux/times.h> 64 #include <linux/slab.h> 65 66 #include <net/net_namespace.h> 67 #include <net/icmp.h> 68 #include <net/inet_hashtables.h> 69 #include <net/tcp.h> 70 #include <net/transp_v6.h> 71 #include <net/ipv6.h> 72 #include <net/inet_common.h> 73 #include <net/timewait_sock.h> 74 #include <net/xfrm.h> 75 #include <net/secure_seq.h> 76 #include <net/busy_poll.h> 77 78 #include <linux/inet.h> 79 #include <linux/ipv6.h> 80 #include <linux/stddef.h> 81 #include <linux/proc_fs.h> 82 #include <linux/seq_file.h> 83 84 #include <crypto/hash.h> 85 #include <linux/scatterlist.h> 86 87 int sysctl_tcp_low_latency __read_mostly; 88 89 #ifdef CONFIG_TCP_MD5SIG 90 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, 91 __be32 daddr, __be32 saddr, const struct tcphdr *th); 92 #endif 93 94 struct inet_hashinfo tcp_hashinfo; 95 EXPORT_SYMBOL(tcp_hashinfo); 96 97 static u32 tcp_v4_init_sequence(const struct sk_buff *skb, u32 *tsoff) 98 { 99 return secure_tcp_sequence_number(ip_hdr(skb)->daddr, 100 ip_hdr(skb)->saddr, 101 tcp_hdr(skb)->dest, 102 tcp_hdr(skb)->source, tsoff); 103 } 104 105 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) 106 { 107 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw); 108 struct tcp_sock *tp = tcp_sk(sk); 109 110 /* With PAWS, it is safe from the viewpoint 111 of data integrity. 
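	   (Rough arithmetic behind that figure: the 32-bit sequence space
	   holds 2^32 bytes; at 80 Mbit/sec, i.e. roughly 10^7 bytes/sec,
	   wrapping it takes 2^32 / 10^7 ~= 429 seconds, comfortably longer
	   than the 60 second TCP_TIMEWAIT_LEN, so a new connection whose
	   write_seq is seeded just past tw_snd_nxt cannot be confused with
	   old duplicates of the previous incarnation.)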
Even without PAWS it is safe provided sequence 112 spaces do not overlap i.e. at data rates <= 80Mbit/sec. 113 114 Actually, the idea is close to VJ's one, only timestamp cache is 115 held not per host, but per port pair and TW bucket is used as state 116 holder. 117 118 If TW bucket has been already destroyed we fall back to VJ's scheme 119 and use initial timestamp retrieved from peer table. 120 */ 121 if (tcptw->tw_ts_recent_stamp && 122 (!twp || (sock_net(sk)->ipv4.sysctl_tcp_tw_reuse && 123 get_seconds() - tcptw->tw_ts_recent_stamp > 1))) { 124 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2; 125 if (tp->write_seq == 0) 126 tp->write_seq = 1; 127 tp->rx_opt.ts_recent = tcptw->tw_ts_recent; 128 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; 129 sock_hold(sktw); 130 return 1; 131 } 132 133 return 0; 134 } 135 EXPORT_SYMBOL_GPL(tcp_twsk_unique); 136 137 /* This will initiate an outgoing connection. */ 138 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) 139 { 140 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; 141 struct inet_sock *inet = inet_sk(sk); 142 struct tcp_sock *tp = tcp_sk(sk); 143 __be16 orig_sport, orig_dport; 144 __be32 daddr, nexthop; 145 struct flowi4 *fl4; 146 struct rtable *rt; 147 int err; 148 struct ip_options_rcu *inet_opt; 149 struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; 150 151 if (addr_len < sizeof(struct sockaddr_in)) 152 return -EINVAL; 153 154 if (usin->sin_family != AF_INET) 155 return -EAFNOSUPPORT; 156 157 nexthop = daddr = usin->sin_addr.s_addr; 158 inet_opt = rcu_dereference_protected(inet->inet_opt, 159 lockdep_sock_is_held(sk)); 160 if (inet_opt && inet_opt->opt.srr) { 161 if (!daddr) 162 return -EINVAL; 163 nexthop = inet_opt->opt.faddr; 164 } 165 166 orig_sport = inet->inet_sport; 167 orig_dport = usin->sin_port; 168 fl4 = &inet->cork.fl.u.ip4; 169 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr, 170 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, 171 IPPROTO_TCP, 172 orig_sport, orig_dport, sk); 173 if (IS_ERR(rt)) { 174 err = PTR_ERR(rt); 175 if (err == -ENETUNREACH) 176 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); 177 return err; 178 } 179 180 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { 181 ip_rt_put(rt); 182 return -ENETUNREACH; 183 } 184 185 if (!inet_opt || !inet_opt->opt.srr) 186 daddr = fl4->daddr; 187 188 if (!inet->inet_saddr) 189 inet->inet_saddr = fl4->saddr; 190 sk_rcv_saddr_set(sk, inet->inet_saddr); 191 192 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) { 193 /* Reset inherited state */ 194 tp->rx_opt.ts_recent = 0; 195 tp->rx_opt.ts_recent_stamp = 0; 196 if (likely(!tp->repair)) 197 tp->write_seq = 0; 198 } 199 200 if (tcp_death_row->sysctl_tw_recycle && 201 !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) 202 tcp_fetch_timewait_stamp(sk, &rt->dst); 203 204 inet->inet_dport = usin->sin_port; 205 sk_daddr_set(sk, daddr); 206 207 inet_csk(sk)->icsk_ext_hdr_len = 0; 208 if (inet_opt) 209 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 210 211 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT; 212 213 /* Socket identity is still unknown (sport may be zero). 214 * However we set state to SYN-SENT and not releasing socket 215 * lock select source port, enter ourselves into the hash tables and 216 * complete initialization after this. 
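 *
 * (A minimal userspace sketch, assuming <sys/socket.h>, <netinet/in.h>
 *  and <arpa/inet.h>: a plain connect() on an AF_INET stream socket is
 *  what ends up here --
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *	};
 *
 *	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *	connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 *
 *  and if the application did not bind() a local port itself,
 *  inet_hash_connect() below picks one while the socket is already
 *  marked SYN-SENT.)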
217 */ 218 tcp_set_state(sk, TCP_SYN_SENT); 219 err = inet_hash_connect(tcp_death_row, sk); 220 if (err) 221 goto failure; 222 223 sk_set_txhash(sk); 224 225 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport, 226 inet->inet_sport, inet->inet_dport, sk); 227 if (IS_ERR(rt)) { 228 err = PTR_ERR(rt); 229 rt = NULL; 230 goto failure; 231 } 232 /* OK, now commit destination to socket. */ 233 sk->sk_gso_type = SKB_GSO_TCPV4; 234 sk_setup_caps(sk, &rt->dst); 235 rt = NULL; 236 237 if (!tp->write_seq && likely(!tp->repair)) 238 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr, 239 inet->inet_daddr, 240 inet->inet_sport, 241 usin->sin_port, 242 &tp->tsoffset); 243 244 inet->inet_id = tp->write_seq ^ jiffies; 245 246 if (tcp_fastopen_defer_connect(sk, &err)) 247 return err; 248 if (err) 249 goto failure; 250 251 err = tcp_connect(sk); 252 253 if (err) 254 goto failure; 255 256 return 0; 257 258 failure: 259 /* 260 * This unhashes the socket and releases the local port, 261 * if necessary. 262 */ 263 tcp_set_state(sk, TCP_CLOSE); 264 ip_rt_put(rt); 265 sk->sk_route_caps = 0; 266 inet->inet_dport = 0; 267 return err; 268 } 269 EXPORT_SYMBOL(tcp_v4_connect); 270 271 /* 272 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191. 273 * It can be called through tcp_release_cb() if socket was owned by user 274 * at the time tcp_v4_err() was called to handle ICMP message. 275 */ 276 void tcp_v4_mtu_reduced(struct sock *sk) 277 { 278 struct dst_entry *dst; 279 struct inet_sock *inet = inet_sk(sk); 280 u32 mtu = tcp_sk(sk)->mtu_info; 281 282 dst = inet_csk_update_pmtu(sk, mtu); 283 if (!dst) 284 return; 285 286 /* Something is about to be wrong... Remember soft error 287 * for the case, if this connection will not able to recover. 288 */ 289 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst)) 290 sk->sk_err_soft = EMSGSIZE; 291 292 mtu = dst_mtu(dst); 293 294 if (inet->pmtudisc != IP_PMTUDISC_DONT && 295 ip_sk_accept_pmtu(sk) && 296 inet_csk(sk)->icsk_pmtu_cookie > mtu) { 297 tcp_sync_mss(sk, mtu); 298 299 /* Resend the TCP packet because it's 300 * clear that the old packet has been 301 * dropped. This is the new "fast" path mtu 302 * discovery. 303 */ 304 tcp_simple_retransmit(sk); 305 } /* else let the usual retransmit timer handle it */ 306 } 307 EXPORT_SYMBOL(tcp_v4_mtu_reduced); 308 309 static void do_redirect(struct sk_buff *skb, struct sock *sk) 310 { 311 struct dst_entry *dst = __sk_dst_check(sk, 0); 312 313 if (dst) 314 dst->ops->redirect(dst, sk, skb); 315 } 316 317 318 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */ 319 void tcp_req_err(struct sock *sk, u32 seq, bool abort) 320 { 321 struct request_sock *req = inet_reqsk(sk); 322 struct net *net = sock_net(sk); 323 324 /* ICMPs are not backlogged, hence we cannot get 325 * an established socket here. 326 */ 327 if (seq != tcp_rsk(req)->snt_isn) { 328 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); 329 } else if (abort) { 330 /* 331 * Still in SYN_RECV, just remove it silently. 332 * There is no good way to pass the error to the newly 333 * created socket, and POSIX does not want network 334 * errors returned from accept(). 335 */ 336 inet_csk_reqsk_queue_drop(req->rsk_listener, req); 337 tcp_listendrop(req->rsk_listener); 338 } 339 reqsk_put(req); 340 } 341 EXPORT_SYMBOL(tcp_req_err); 342 343 /* 344 * This routine is called by the ICMP module when it gets some 345 * sort of error condition. If err < 0 then the socket should 346 * be closed and the error returned to the user. 
If err > 0 347 * it's just the icmp type << 8 | icmp code. After adjustment 348 * header points to the first 8 bytes of the tcp header. We need 349 * to find the appropriate port. 350 * 351 * The locking strategy used here is very "optimistic". When 352 * someone else accesses the socket the ICMP is just dropped 353 * and for some paths there is no check at all. 354 * A more general error queue to queue errors for later handling 355 * is probably better. 356 * 357 */ 358 359 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) 360 { 361 const struct iphdr *iph = (const struct iphdr *)icmp_skb->data; 362 struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2)); 363 struct inet_connection_sock *icsk; 364 struct tcp_sock *tp; 365 struct inet_sock *inet; 366 const int type = icmp_hdr(icmp_skb)->type; 367 const int code = icmp_hdr(icmp_skb)->code; 368 struct sock *sk; 369 struct sk_buff *skb; 370 struct request_sock *fastopen; 371 __u32 seq, snd_una; 372 __u32 remaining; 373 int err; 374 struct net *net = dev_net(icmp_skb->dev); 375 376 sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr, 377 th->dest, iph->saddr, ntohs(th->source), 378 inet_iif(icmp_skb)); 379 if (!sk) { 380 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); 381 return; 382 } 383 if (sk->sk_state == TCP_TIME_WAIT) { 384 inet_twsk_put(inet_twsk(sk)); 385 return; 386 } 387 seq = ntohl(th->seq); 388 if (sk->sk_state == TCP_NEW_SYN_RECV) 389 return tcp_req_err(sk, seq, 390 type == ICMP_PARAMETERPROB || 391 type == ICMP_TIME_EXCEEDED || 392 (type == ICMP_DEST_UNREACH && 393 (code == ICMP_NET_UNREACH || 394 code == ICMP_HOST_UNREACH))); 395 396 bh_lock_sock(sk); 397 /* If too many ICMPs get dropped on busy 398 * servers this needs to be solved differently. 399 * We do take care of PMTU discovery (RFC1191) special case : 400 * we can receive locally generated ICMP messages while socket is held. 401 */ 402 if (sock_owned_by_user(sk)) { 403 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)) 404 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS); 405 } 406 if (sk->sk_state == TCP_CLOSE) 407 goto out; 408 409 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) { 410 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 411 goto out; 412 } 413 414 icsk = inet_csk(sk); 415 tp = tcp_sk(sk); 416 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ 417 fastopen = tp->fastopen_rsk; 418 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una; 419 if (sk->sk_state != TCP_LISTEN && 420 !between(seq, snd_una, tp->snd_nxt)) { 421 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); 422 goto out; 423 } 424 425 switch (type) { 426 case ICMP_REDIRECT: 427 do_redirect(icmp_skb, sk); 428 goto out; 429 case ICMP_SOURCE_QUENCH: 430 /* Just silently ignore these. */ 431 goto out; 432 case ICMP_PARAMETERPROB: 433 err = EPROTO; 434 break; 435 case ICMP_DEST_UNREACH: 436 if (code > NR_ICMP_UNREACH) 437 goto out; 438 439 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ 440 /* We are not interested in TCP_LISTEN and open_requests 441 * (SYN-ACKs send out by Linux are always <576bytes so 442 * they should go through unfragmented). 
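			 *
			 * (For ICMP_FRAG_NEEDED the "info" argument carries
			 *  the next-hop MTU reported by the router; a 1400
			 *  byte tunnel MTU, for example, arrives as
			 *  info == 1400, is stashed in tp->mtu_info below
			 *  and acted on by tcp_v4_mtu_reduced()/tcp_sync_mss(),
			 *  either right away or deferred through
			 *  tcp_release_cb() when the socket is owned by the
			 *  user.)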
443 */ 444 if (sk->sk_state == TCP_LISTEN) 445 goto out; 446 447 tp->mtu_info = info; 448 if (!sock_owned_by_user(sk)) { 449 tcp_v4_mtu_reduced(sk); 450 } else { 451 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags)) 452 sock_hold(sk); 453 } 454 goto out; 455 } 456 457 err = icmp_err_convert[code].errno; 458 /* check if icmp_skb allows revert of backoff 459 * (see draft-zimmermann-tcp-lcd) */ 460 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH) 461 break; 462 if (seq != tp->snd_una || !icsk->icsk_retransmits || 463 !icsk->icsk_backoff || fastopen) 464 break; 465 466 if (sock_owned_by_user(sk)) 467 break; 468 469 icsk->icsk_backoff--; 470 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : 471 TCP_TIMEOUT_INIT; 472 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); 473 474 skb = tcp_write_queue_head(sk); 475 BUG_ON(!skb); 476 477 remaining = icsk->icsk_rto - 478 min(icsk->icsk_rto, 479 tcp_time_stamp - tcp_skb_timestamp(skb)); 480 481 if (remaining) { 482 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, 483 remaining, TCP_RTO_MAX); 484 } else { 485 /* RTO revert clocked out retransmission. 486 * Will retransmit now */ 487 tcp_retransmit_timer(sk); 488 } 489 490 break; 491 case ICMP_TIME_EXCEEDED: 492 err = EHOSTUNREACH; 493 break; 494 default: 495 goto out; 496 } 497 498 switch (sk->sk_state) { 499 case TCP_SYN_SENT: 500 case TCP_SYN_RECV: 501 /* Only in fast or simultaneous open. If a fast open socket is 502 * is already accepted it is treated as a connected one below. 503 */ 504 if (fastopen && !fastopen->sk) 505 break; 506 507 if (!sock_owned_by_user(sk)) { 508 sk->sk_err = err; 509 510 sk->sk_error_report(sk); 511 512 tcp_done(sk); 513 } else { 514 sk->sk_err_soft = err; 515 } 516 goto out; 517 } 518 519 /* If we've already connected we will keep trying 520 * until we time out, or the user gives up. 521 * 522 * rfc1122 4.2.3.9 allows to consider as hard errors 523 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too, 524 * but it is obsoleted by pmtu discovery). 525 * 526 * Note, that in modern internet, where routing is unreliable 527 * and in each dark corner broken firewalls sit, sending random 528 * errors ordered by their masters even this two messages finally lose 529 * their original sense (even Linux sends invalid PORT_UNREACHs) 530 * 531 * Now we are in compliance with RFCs. 532 * --ANK (980905) 533 */ 534 535 inet = inet_sk(sk); 536 if (!sock_owned_by_user(sk) && inet->recverr) { 537 sk->sk_err = err; 538 sk->sk_error_report(sk); 539 } else { /* Only an error on timeout */ 540 sk->sk_err_soft = err; 541 } 542 543 out: 544 bh_unlock_sock(sk); 545 sock_put(sk); 546 } 547 548 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr) 549 { 550 struct tcphdr *th = tcp_hdr(skb); 551 552 if (skb->ip_summed == CHECKSUM_PARTIAL) { 553 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0); 554 skb->csum_start = skb_transport_header(skb) - skb->head; 555 skb->csum_offset = offsetof(struct tcphdr, check); 556 } else { 557 th->check = tcp_v4_check(skb->len, saddr, daddr, 558 csum_partial(th, 559 th->doff << 2, 560 skb->csum)); 561 } 562 } 563 564 /* This routine computes an IPv4 TCP checksum. */ 565 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb) 566 { 567 const struct inet_sock *inet = inet_sk(sk); 568 569 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr); 570 } 571 EXPORT_SYMBOL(tcp_v4_send_check); 572 573 /* 574 * This routine will send an RST to the other tcp. 
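 *
 *	(Worked example of the sequence rules applied below: if the
 *	offending segment carried an ACK, the RST is sent with
 *	seq = that ack_seq and no ACK flag of its own; otherwise the RST
 *	carries ack_seq = seq + SYN + FIN + payload length, so a segment
 *	without ACK carrying 100 payload bytes at seq 1000 is answered
 *	with ack_seq 1100.)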
575 * 576 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.) 577 * for reset. 578 * Answer: if a packet caused RST, it is not for a socket 579 * existing in our system, if it is matched to a socket, 580 * it is just duplicate segment or bug in other side's TCP. 581 * So that we build reply only basing on parameters 582 * arrived with segment. 583 * Exception: precedence violation. We do not implement it in any case. 584 */ 585 586 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) 587 { 588 const struct tcphdr *th = tcp_hdr(skb); 589 struct { 590 struct tcphdr th; 591 #ifdef CONFIG_TCP_MD5SIG 592 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)]; 593 #endif 594 } rep; 595 struct ip_reply_arg arg; 596 #ifdef CONFIG_TCP_MD5SIG 597 struct tcp_md5sig_key *key = NULL; 598 const __u8 *hash_location = NULL; 599 unsigned char newhash[16]; 600 int genhash; 601 struct sock *sk1 = NULL; 602 #endif 603 struct net *net; 604 605 /* Never send a reset in response to a reset. */ 606 if (th->rst) 607 return; 608 609 /* If sk not NULL, it means we did a successful lookup and incoming 610 * route had to be correct. prequeue might have dropped our dst. 611 */ 612 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL) 613 return; 614 615 /* Swap the send and the receive. */ 616 memset(&rep, 0, sizeof(rep)); 617 rep.th.dest = th->source; 618 rep.th.source = th->dest; 619 rep.th.doff = sizeof(struct tcphdr) / 4; 620 rep.th.rst = 1; 621 622 if (th->ack) { 623 rep.th.seq = th->ack_seq; 624 } else { 625 rep.th.ack = 1; 626 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin + 627 skb->len - (th->doff << 2)); 628 } 629 630 memset(&arg, 0, sizeof(arg)); 631 arg.iov[0].iov_base = (unsigned char *)&rep; 632 arg.iov[0].iov_len = sizeof(rep.th); 633 634 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev); 635 #ifdef CONFIG_TCP_MD5SIG 636 rcu_read_lock(); 637 hash_location = tcp_parse_md5sig_option(th); 638 if (sk && sk_fullsock(sk)) { 639 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *) 640 &ip_hdr(skb)->saddr, AF_INET); 641 } else if (hash_location) { 642 /* 643 * active side is lost. Try to find listening socket through 644 * source port, and then find md5 key through listening socket. 645 * we are not loose security here: 646 * Incoming packet is checked with md5 hash with finding key, 647 * no RST generated if md5 hash doesn't match. 
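		 *
		 * (A rough usage sketch, assuming struct tcp_md5sig from
		 *  <linux/tcp.h>: the keys consulted here are installed
		 *  from userspace with the TCP_MD5SIG socket option --
		 *
		 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
		 *	struct sockaddr_in *peer =
		 *		(struct sockaddr_in *)&md5.tcpm_addr;
		 *
		 *	peer->sin_family = AF_INET;
		 *	inet_pton(AF_INET, "192.0.2.1", &peer->sin_addr);
		 *	memcpy(md5.tcpm_key, "secret", 6);
		 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG,
		 *		   &md5, sizeof(md5));
		 *
		 *  which lands in tcp_v4_parse_md5_keys()/tcp_md5_do_add()
		 *  further down in this file.)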
648 */ 649 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0, 650 ip_hdr(skb)->saddr, 651 th->source, ip_hdr(skb)->daddr, 652 ntohs(th->source), inet_iif(skb)); 653 /* don't send rst if it can't find key */ 654 if (!sk1) 655 goto out; 656 657 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *) 658 &ip_hdr(skb)->saddr, AF_INET); 659 if (!key) 660 goto out; 661 662 663 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb); 664 if (genhash || memcmp(hash_location, newhash, 16) != 0) 665 goto out; 666 667 } 668 669 if (key) { 670 rep.opt[0] = htonl((TCPOPT_NOP << 24) | 671 (TCPOPT_NOP << 16) | 672 (TCPOPT_MD5SIG << 8) | 673 TCPOLEN_MD5SIG); 674 /* Update length and the length the header thinks exists */ 675 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; 676 rep.th.doff = arg.iov[0].iov_len / 4; 677 678 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1], 679 key, ip_hdr(skb)->saddr, 680 ip_hdr(skb)->daddr, &rep.th); 681 } 682 #endif 683 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 684 ip_hdr(skb)->saddr, /* XXX */ 685 arg.iov[0].iov_len, IPPROTO_TCP, 0); 686 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 687 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0; 688 689 /* When socket is gone, all binding information is lost. 690 * routing might fail in this case. No choice here, if we choose to force 691 * input interface, we will misroute in case of asymmetric route. 692 */ 693 if (sk) 694 arg.bound_dev_if = sk->sk_bound_dev_if; 695 696 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) != 697 offsetof(struct inet_timewait_sock, tw_bound_dev_if)); 698 699 arg.tos = ip_hdr(skb)->tos; 700 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); 701 local_bh_disable(); 702 ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk), 703 skb, &TCP_SKB_CB(skb)->header.h4.opt, 704 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 705 &arg, arg.iov[0].iov_len); 706 707 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); 708 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS); 709 local_bh_enable(); 710 711 #ifdef CONFIG_TCP_MD5SIG 712 out: 713 rcu_read_unlock(); 714 #endif 715 } 716 717 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states 718 outside socket context is ugly, certainly. What can I do? 719 */ 720 721 static void tcp_v4_send_ack(const struct sock *sk, 722 struct sk_buff *skb, u32 seq, u32 ack, 723 u32 win, u32 tsval, u32 tsecr, int oif, 724 struct tcp_md5sig_key *key, 725 int reply_flags, u8 tos) 726 { 727 const struct tcphdr *th = tcp_hdr(skb); 728 struct { 729 struct tcphdr th; 730 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2) 731 #ifdef CONFIG_TCP_MD5SIG 732 + (TCPOLEN_MD5SIG_ALIGNED >> 2) 733 #endif 734 ]; 735 } rep; 736 struct net *net = sock_net(sk); 737 struct ip_reply_arg arg; 738 739 memset(&rep.th, 0, sizeof(struct tcphdr)); 740 memset(&arg, 0, sizeof(arg)); 741 742 arg.iov[0].iov_base = (unsigned char *)&rep; 743 arg.iov[0].iov_len = sizeof(rep.th); 744 if (tsecr) { 745 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | 746 (TCPOPT_TIMESTAMP << 8) | 747 TCPOLEN_TIMESTAMP); 748 rep.opt[1] = htonl(tsval); 749 rep.opt[2] = htonl(tsecr); 750 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED; 751 } 752 753 /* Swap the send and the receive. */ 754 rep.th.dest = th->source; 755 rep.th.source = th->dest; 756 rep.th.doff = arg.iov[0].iov_len / 4; 757 rep.th.seq = htonl(seq); 758 rep.th.ack_seq = htonl(ack); 759 rep.th.ack = 1; 760 rep.th.window = htons(win); 761 762 #ifdef CONFIG_TCP_MD5SIG 763 if (key) { 764 int offset = (tsecr) ? 
3 : 0; 765 766 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) | 767 (TCPOPT_NOP << 16) | 768 (TCPOPT_MD5SIG << 8) | 769 TCPOLEN_MD5SIG); 770 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; 771 rep.th.doff = arg.iov[0].iov_len/4; 772 773 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset], 774 key, ip_hdr(skb)->saddr, 775 ip_hdr(skb)->daddr, &rep.th); 776 } 777 #endif 778 arg.flags = reply_flags; 779 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 780 ip_hdr(skb)->saddr, /* XXX */ 781 arg.iov[0].iov_len, IPPROTO_TCP, 0); 782 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 783 if (oif) 784 arg.bound_dev_if = oif; 785 arg.tos = tos; 786 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL); 787 local_bh_disable(); 788 ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk), 789 skb, &TCP_SKB_CB(skb)->header.h4.opt, 790 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 791 &arg, arg.iov[0].iov_len); 792 793 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); 794 local_bh_enable(); 795 } 796 797 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) 798 { 799 struct inet_timewait_sock *tw = inet_twsk(sk); 800 struct tcp_timewait_sock *tcptw = tcp_twsk(sk); 801 802 tcp_v4_send_ack(sk, skb, 803 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, 804 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, 805 tcp_time_stamp + tcptw->tw_ts_offset, 806 tcptw->tw_ts_recent, 807 tw->tw_bound_dev_if, 808 tcp_twsk_md5_key(tcptw), 809 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0, 810 tw->tw_tos 811 ); 812 813 inet_twsk_put(tw); 814 } 815 816 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, 817 struct request_sock *req) 818 { 819 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV 820 * sk->sk_state == TCP_SYN_RECV -> for Fast Open. 821 */ 822 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 : 823 tcp_sk(sk)->snd_nxt; 824 825 /* RFC 7323 2.3 826 * The window field (SEG.WND) of every outgoing segment, with the 827 * exception of <SYN> segments, MUST be right-shifted by 828 * Rcv.Wind.Shift bits: 829 */ 830 tcp_v4_send_ack(sk, skb, seq, 831 tcp_rsk(req)->rcv_nxt, 832 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, 833 tcp_time_stamp + tcp_rsk(req)->ts_off, 834 req->ts_recent, 835 0, 836 tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr, 837 AF_INET), 838 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0, 839 ip_hdr(skb)->tos); 840 } 841 842 /* 843 * Send a SYN-ACK after having received a SYN. 844 * This still operates on a request_sock only, not on a big 845 * socket. 846 */ 847 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, 848 struct flowi *fl, 849 struct request_sock *req, 850 struct tcp_fastopen_cookie *foc, 851 enum tcp_synack_type synack_type) 852 { 853 const struct inet_request_sock *ireq = inet_rsk(req); 854 struct flowi4 fl4; 855 int err = -1; 856 struct sk_buff *skb; 857 858 /* First, grab a route. */ 859 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) 860 return -1; 861 862 skb = tcp_make_synack(sk, dst, req, foc, synack_type); 863 864 if (skb) { 865 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); 866 867 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, 868 ireq->ir_rmt_addr, 869 ireq->opt); 870 err = net_xmit_eval(err); 871 } 872 873 return err; 874 } 875 876 /* 877 * IPv4 request_sock destructor. 
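 *
 *	(For reference, the RFC 7323 Rcv.Wind.Shift rule quoted in
 *	tcp_v4_reqsk_send_ack() above means that a 262144 byte receive
 *	window with an rcv_wscale of 7 goes on the wire as
 *	262144 >> 7 = 2048.)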
878 */ 879 static void tcp_v4_reqsk_destructor(struct request_sock *req) 880 { 881 kfree(inet_rsk(req)->opt); 882 } 883 884 #ifdef CONFIG_TCP_MD5SIG 885 /* 886 * RFC2385 MD5 checksumming requires a mapping of 887 * IP address->MD5 Key. 888 * We need to maintain these in the sk structure. 889 */ 890 891 /* Find the Key structure for an address. */ 892 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk, 893 const union tcp_md5_addr *addr, 894 int family) 895 { 896 const struct tcp_sock *tp = tcp_sk(sk); 897 struct tcp_md5sig_key *key; 898 unsigned int size = sizeof(struct in_addr); 899 const struct tcp_md5sig_info *md5sig; 900 901 /* caller either holds rcu_read_lock() or socket lock */ 902 md5sig = rcu_dereference_check(tp->md5sig_info, 903 lockdep_sock_is_held(sk)); 904 if (!md5sig) 905 return NULL; 906 #if IS_ENABLED(CONFIG_IPV6) 907 if (family == AF_INET6) 908 size = sizeof(struct in6_addr); 909 #endif 910 hlist_for_each_entry_rcu(key, &md5sig->head, node) { 911 if (key->family != family) 912 continue; 913 if (!memcmp(&key->addr, addr, size)) 914 return key; 915 } 916 return NULL; 917 } 918 EXPORT_SYMBOL(tcp_md5_do_lookup); 919 920 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, 921 const struct sock *addr_sk) 922 { 923 const union tcp_md5_addr *addr; 924 925 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr; 926 return tcp_md5_do_lookup(sk, addr, AF_INET); 927 } 928 EXPORT_SYMBOL(tcp_v4_md5_lookup); 929 930 /* This can be called on a newly created socket, from other files */ 931 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, 932 int family, const u8 *newkey, u8 newkeylen, gfp_t gfp) 933 { 934 /* Add Key to the list */ 935 struct tcp_md5sig_key *key; 936 struct tcp_sock *tp = tcp_sk(sk); 937 struct tcp_md5sig_info *md5sig; 938 939 key = tcp_md5_do_lookup(sk, addr, family); 940 if (key) { 941 /* Pre-existing entry - just update that one. */ 942 memcpy(key->key, newkey, newkeylen); 943 key->keylen = newkeylen; 944 return 0; 945 } 946 947 md5sig = rcu_dereference_protected(tp->md5sig_info, 948 lockdep_sock_is_held(sk)); 949 if (!md5sig) { 950 md5sig = kmalloc(sizeof(*md5sig), gfp); 951 if (!md5sig) 952 return -ENOMEM; 953 954 sk_nocaps_add(sk, NETIF_F_GSO_MASK); 955 INIT_HLIST_HEAD(&md5sig->head); 956 rcu_assign_pointer(tp->md5sig_info, md5sig); 957 } 958 959 key = sock_kmalloc(sk, sizeof(*key), gfp); 960 if (!key) 961 return -ENOMEM; 962 if (!tcp_alloc_md5sig_pool()) { 963 sock_kfree_s(sk, key, sizeof(*key)); 964 return -ENOMEM; 965 } 966 967 memcpy(key->key, newkey, newkeylen); 968 key->keylen = newkeylen; 969 key->family = family; 970 memcpy(&key->addr, addr, 971 (family == AF_INET6) ? 
sizeof(struct in6_addr) : 972 sizeof(struct in_addr)); 973 hlist_add_head_rcu(&key->node, &md5sig->head); 974 return 0; 975 } 976 EXPORT_SYMBOL(tcp_md5_do_add); 977 978 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family) 979 { 980 struct tcp_md5sig_key *key; 981 982 key = tcp_md5_do_lookup(sk, addr, family); 983 if (!key) 984 return -ENOENT; 985 hlist_del_rcu(&key->node); 986 atomic_sub(sizeof(*key), &sk->sk_omem_alloc); 987 kfree_rcu(key, rcu); 988 return 0; 989 } 990 EXPORT_SYMBOL(tcp_md5_do_del); 991 992 static void tcp_clear_md5_list(struct sock *sk) 993 { 994 struct tcp_sock *tp = tcp_sk(sk); 995 struct tcp_md5sig_key *key; 996 struct hlist_node *n; 997 struct tcp_md5sig_info *md5sig; 998 999 md5sig = rcu_dereference_protected(tp->md5sig_info, 1); 1000 1001 hlist_for_each_entry_safe(key, n, &md5sig->head, node) { 1002 hlist_del_rcu(&key->node); 1003 atomic_sub(sizeof(*key), &sk->sk_omem_alloc); 1004 kfree_rcu(key, rcu); 1005 } 1006 } 1007 1008 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval, 1009 int optlen) 1010 { 1011 struct tcp_md5sig cmd; 1012 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr; 1013 1014 if (optlen < sizeof(cmd)) 1015 return -EINVAL; 1016 1017 if (copy_from_user(&cmd, optval, sizeof(cmd))) 1018 return -EFAULT; 1019 1020 if (sin->sin_family != AF_INET) 1021 return -EINVAL; 1022 1023 if (!cmd.tcpm_keylen) 1024 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr, 1025 AF_INET); 1026 1027 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) 1028 return -EINVAL; 1029 1030 return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr, 1031 AF_INET, cmd.tcpm_key, cmd.tcpm_keylen, 1032 GFP_KERNEL); 1033 } 1034 1035 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp, 1036 __be32 daddr, __be32 saddr, 1037 const struct tcphdr *th, int nbytes) 1038 { 1039 struct tcp4_pseudohdr *bp; 1040 struct scatterlist sg; 1041 struct tcphdr *_th; 1042 1043 bp = hp->scratch; 1044 bp->saddr = saddr; 1045 bp->daddr = daddr; 1046 bp->pad = 0; 1047 bp->protocol = IPPROTO_TCP; 1048 bp->len = cpu_to_be16(nbytes); 1049 1050 _th = (struct tcphdr *)(bp + 1); 1051 memcpy(_th, th, sizeof(*th)); 1052 _th->check = 0; 1053 1054 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th)); 1055 ahash_request_set_crypt(hp->md5_req, &sg, NULL, 1056 sizeof(*bp) + sizeof(*th)); 1057 return crypto_ahash_update(hp->md5_req); 1058 } 1059 1060 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, 1061 __be32 daddr, __be32 saddr, const struct tcphdr *th) 1062 { 1063 struct tcp_md5sig_pool *hp; 1064 struct ahash_request *req; 1065 1066 hp = tcp_get_md5sig_pool(); 1067 if (!hp) 1068 goto clear_hash_noput; 1069 req = hp->md5_req; 1070 1071 if (crypto_ahash_init(req)) 1072 goto clear_hash; 1073 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2)) 1074 goto clear_hash; 1075 if (tcp_md5_hash_key(hp, key)) 1076 goto clear_hash; 1077 ahash_request_set_crypt(req, NULL, md5_hash, 0); 1078 if (crypto_ahash_final(req)) 1079 goto clear_hash; 1080 1081 tcp_put_md5sig_pool(); 1082 return 0; 1083 1084 clear_hash: 1085 tcp_put_md5sig_pool(); 1086 clear_hash_noput: 1087 memset(md5_hash, 0, 16); 1088 return 1; 1089 } 1090 1091 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, 1092 const struct sock *sk, 1093 const struct sk_buff *skb) 1094 { 1095 struct tcp_md5sig_pool *hp; 1096 struct ahash_request *req; 1097 const struct tcphdr *th = tcp_hdr(skb); 1098 __be32 saddr, daddr; 1099 1100 if (sk) 
{ /* valid for establish/request sockets */ 1101 saddr = sk->sk_rcv_saddr; 1102 daddr = sk->sk_daddr; 1103 } else { 1104 const struct iphdr *iph = ip_hdr(skb); 1105 saddr = iph->saddr; 1106 daddr = iph->daddr; 1107 } 1108 1109 hp = tcp_get_md5sig_pool(); 1110 if (!hp) 1111 goto clear_hash_noput; 1112 req = hp->md5_req; 1113 1114 if (crypto_ahash_init(req)) 1115 goto clear_hash; 1116 1117 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len)) 1118 goto clear_hash; 1119 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2)) 1120 goto clear_hash; 1121 if (tcp_md5_hash_key(hp, key)) 1122 goto clear_hash; 1123 ahash_request_set_crypt(req, NULL, md5_hash, 0); 1124 if (crypto_ahash_final(req)) 1125 goto clear_hash; 1126 1127 tcp_put_md5sig_pool(); 1128 return 0; 1129 1130 clear_hash: 1131 tcp_put_md5sig_pool(); 1132 clear_hash_noput: 1133 memset(md5_hash, 0, 16); 1134 return 1; 1135 } 1136 EXPORT_SYMBOL(tcp_v4_md5_hash_skb); 1137 1138 #endif 1139 1140 /* Called with rcu_read_lock() */ 1141 static bool tcp_v4_inbound_md5_hash(const struct sock *sk, 1142 const struct sk_buff *skb) 1143 { 1144 #ifdef CONFIG_TCP_MD5SIG 1145 /* 1146 * This gets called for each TCP segment that arrives 1147 * so we want to be efficient. 1148 * We have 3 drop cases: 1149 * o No MD5 hash and one expected. 1150 * o MD5 hash and we're not expecting one. 1151 * o MD5 hash and its wrong. 1152 */ 1153 const __u8 *hash_location = NULL; 1154 struct tcp_md5sig_key *hash_expected; 1155 const struct iphdr *iph = ip_hdr(skb); 1156 const struct tcphdr *th = tcp_hdr(skb); 1157 int genhash; 1158 unsigned char newhash[16]; 1159 1160 hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr, 1161 AF_INET); 1162 hash_location = tcp_parse_md5sig_option(th); 1163 1164 /* We've parsed the options - do we have a hash? */ 1165 if (!hash_expected && !hash_location) 1166 return false; 1167 1168 if (hash_expected && !hash_location) { 1169 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND); 1170 return true; 1171 } 1172 1173 if (!hash_expected && hash_location) { 1174 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED); 1175 return true; 1176 } 1177 1178 /* Okay, so this is hash_expected and hash_location - 1179 * so we need to calculate the checksum. 1180 */ 1181 genhash = tcp_v4_md5_hash_skb(newhash, 1182 hash_expected, 1183 NULL, skb); 1184 1185 if (genhash || memcmp(hash_location, newhash, 16) != 0) { 1186 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE); 1187 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n", 1188 &iph->saddr, ntohs(th->source), 1189 &iph->daddr, ntohs(th->dest), 1190 genhash ? 
" tcp_v4_calc_md5_hash failed" 1191 : ""); 1192 return true; 1193 } 1194 return false; 1195 #endif 1196 return false; 1197 } 1198 1199 static void tcp_v4_init_req(struct request_sock *req, 1200 const struct sock *sk_listener, 1201 struct sk_buff *skb) 1202 { 1203 struct inet_request_sock *ireq = inet_rsk(req); 1204 1205 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); 1206 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); 1207 ireq->opt = tcp_v4_save_options(skb); 1208 } 1209 1210 static struct dst_entry *tcp_v4_route_req(const struct sock *sk, 1211 struct flowi *fl, 1212 const struct request_sock *req, 1213 bool *strict) 1214 { 1215 struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req); 1216 1217 if (strict) { 1218 if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr) 1219 *strict = true; 1220 else 1221 *strict = false; 1222 } 1223 1224 return dst; 1225 } 1226 1227 struct request_sock_ops tcp_request_sock_ops __read_mostly = { 1228 .family = PF_INET, 1229 .obj_size = sizeof(struct tcp_request_sock), 1230 .rtx_syn_ack = tcp_rtx_synack, 1231 .send_ack = tcp_v4_reqsk_send_ack, 1232 .destructor = tcp_v4_reqsk_destructor, 1233 .send_reset = tcp_v4_send_reset, 1234 .syn_ack_timeout = tcp_syn_ack_timeout, 1235 }; 1236 1237 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { 1238 .mss_clamp = TCP_MSS_DEFAULT, 1239 #ifdef CONFIG_TCP_MD5SIG 1240 .req_md5_lookup = tcp_v4_md5_lookup, 1241 .calc_md5_hash = tcp_v4_md5_hash_skb, 1242 #endif 1243 .init_req = tcp_v4_init_req, 1244 #ifdef CONFIG_SYN_COOKIES 1245 .cookie_init_seq = cookie_v4_init_sequence, 1246 #endif 1247 .route_req = tcp_v4_route_req, 1248 .init_seq = tcp_v4_init_sequence, 1249 .send_synack = tcp_v4_send_synack, 1250 }; 1251 1252 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) 1253 { 1254 /* Never answer to SYNs send to broadcast or multicast */ 1255 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) 1256 goto drop; 1257 1258 return tcp_conn_request(&tcp_request_sock_ops, 1259 &tcp_request_sock_ipv4_ops, sk, skb); 1260 1261 drop: 1262 tcp_listendrop(sk); 1263 return 0; 1264 } 1265 EXPORT_SYMBOL(tcp_v4_conn_request); 1266 1267 1268 /* 1269 * The three way handshake has completed - we got a valid synack - 1270 * now create the new socket. 
1271 */ 1272 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, 1273 struct request_sock *req, 1274 struct dst_entry *dst, 1275 struct request_sock *req_unhash, 1276 bool *own_req) 1277 { 1278 struct inet_request_sock *ireq; 1279 struct inet_sock *newinet; 1280 struct tcp_sock *newtp; 1281 struct sock *newsk; 1282 #ifdef CONFIG_TCP_MD5SIG 1283 struct tcp_md5sig_key *key; 1284 #endif 1285 struct ip_options_rcu *inet_opt; 1286 1287 if (sk_acceptq_is_full(sk)) 1288 goto exit_overflow; 1289 1290 newsk = tcp_create_openreq_child(sk, req, skb); 1291 if (!newsk) 1292 goto exit_nonewsk; 1293 1294 newsk->sk_gso_type = SKB_GSO_TCPV4; 1295 inet_sk_rx_dst_set(newsk, skb); 1296 1297 newtp = tcp_sk(newsk); 1298 newinet = inet_sk(newsk); 1299 ireq = inet_rsk(req); 1300 sk_daddr_set(newsk, ireq->ir_rmt_addr); 1301 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr); 1302 newsk->sk_bound_dev_if = ireq->ir_iif; 1303 newinet->inet_saddr = ireq->ir_loc_addr; 1304 inet_opt = ireq->opt; 1305 rcu_assign_pointer(newinet->inet_opt, inet_opt); 1306 ireq->opt = NULL; 1307 newinet->mc_index = inet_iif(skb); 1308 newinet->mc_ttl = ip_hdr(skb)->ttl; 1309 newinet->rcv_tos = ip_hdr(skb)->tos; 1310 inet_csk(newsk)->icsk_ext_hdr_len = 0; 1311 if (inet_opt) 1312 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 1313 newinet->inet_id = newtp->write_seq ^ jiffies; 1314 1315 if (!dst) { 1316 dst = inet_csk_route_child_sock(sk, newsk, req); 1317 if (!dst) 1318 goto put_and_exit; 1319 } else { 1320 /* syncookie case : see end of cookie_v4_check() */ 1321 } 1322 sk_setup_caps(newsk, dst); 1323 1324 tcp_ca_openreq_child(newsk, dst); 1325 1326 tcp_sync_mss(newsk, dst_mtu(dst)); 1327 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst)); 1328 1329 tcp_initialize_rcv_mss(newsk); 1330 1331 #ifdef CONFIG_TCP_MD5SIG 1332 /* Copy over the MD5 key from the original socket */ 1333 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr, 1334 AF_INET); 1335 if (key) { 1336 /* 1337 * We're using one, so create a matching key 1338 * on the newsk structure. If we fail to get 1339 * memory, then we end up not copying the key 1340 * across. Shucks. 1341 */ 1342 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr, 1343 AF_INET, key->key, key->keylen, GFP_ATOMIC); 1344 sk_nocaps_add(newsk, NETIF_F_GSO_MASK); 1345 } 1346 #endif 1347 1348 if (__inet_inherit_port(sk, newsk) < 0) 1349 goto put_and_exit; 1350 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash)); 1351 if (*own_req) 1352 tcp_move_syn(newtp, req); 1353 1354 return newsk; 1355 1356 exit_overflow: 1357 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); 1358 exit_nonewsk: 1359 dst_release(dst); 1360 exit: 1361 tcp_listendrop(sk); 1362 return NULL; 1363 put_and_exit: 1364 inet_csk_prepare_forced_close(newsk); 1365 tcp_done(newsk); 1366 goto exit; 1367 } 1368 EXPORT_SYMBOL(tcp_v4_syn_recv_sock); 1369 1370 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb) 1371 { 1372 #ifdef CONFIG_SYN_COOKIES 1373 const struct tcphdr *th = tcp_hdr(skb); 1374 1375 if (!th->syn) 1376 sk = cookie_v4_check(sk, skb); 1377 #endif 1378 return sk; 1379 } 1380 1381 /* The socket must have it's spinlock held when we get 1382 * here, unless it is a TCP_LISTEN socket. 1383 * 1384 * We have a potential double-lock case here, so even when 1385 * doing backlog processing we use the BH locking scheme. 1386 * This is because we cannot sleep with the original spinlock 1387 * held. 
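 *
 * (A condensed sketch of the caller's pattern; tcp_v4_rcv() further
 *  down does essentially this, with the prequeue check elided:
 *
 *	bh_lock_sock_nested(sk);
 *	if (!sock_owned_by_user(sk))
 *		ret = tcp_v4_do_rcv(sk, skb);
 *	else if (tcp_add_backlog(sk, skb))
 *		goto discard_and_relse;
 *	bh_unlock_sock(sk);
 *
 *  so we are reached either directly from BH context or later, when
 *  the backlog or prequeue is drained in process context.)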
1388 */ 1389 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) 1390 { 1391 struct sock *rsk; 1392 1393 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ 1394 struct dst_entry *dst = sk->sk_rx_dst; 1395 1396 sock_rps_save_rxhash(sk, skb); 1397 sk_mark_napi_id(sk, skb); 1398 if (dst) { 1399 if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif || 1400 !dst->ops->check(dst, 0)) { 1401 dst_release(dst); 1402 sk->sk_rx_dst = NULL; 1403 } 1404 } 1405 tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len); 1406 return 0; 1407 } 1408 1409 if (tcp_checksum_complete(skb)) 1410 goto csum_err; 1411 1412 if (sk->sk_state == TCP_LISTEN) { 1413 struct sock *nsk = tcp_v4_cookie_check(sk, skb); 1414 1415 if (!nsk) 1416 goto discard; 1417 if (nsk != sk) { 1418 sock_rps_save_rxhash(nsk, skb); 1419 sk_mark_napi_id(nsk, skb); 1420 if (tcp_child_process(sk, nsk, skb)) { 1421 rsk = nsk; 1422 goto reset; 1423 } 1424 return 0; 1425 } 1426 } else 1427 sock_rps_save_rxhash(sk, skb); 1428 1429 if (tcp_rcv_state_process(sk, skb)) { 1430 rsk = sk; 1431 goto reset; 1432 } 1433 return 0; 1434 1435 reset: 1436 tcp_v4_send_reset(rsk, skb); 1437 discard: 1438 kfree_skb(skb); 1439 /* Be careful here. If this function gets more complicated and 1440 * gcc suffers from register pressure on the x86, sk (in %ebx) 1441 * might be destroyed here. This current version compiles correctly, 1442 * but you have been warned. 1443 */ 1444 return 0; 1445 1446 csum_err: 1447 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 1448 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 1449 goto discard; 1450 } 1451 EXPORT_SYMBOL(tcp_v4_do_rcv); 1452 1453 void tcp_v4_early_demux(struct sk_buff *skb) 1454 { 1455 const struct iphdr *iph; 1456 const struct tcphdr *th; 1457 struct sock *sk; 1458 1459 if (skb->pkt_type != PACKET_HOST) 1460 return; 1461 1462 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) 1463 return; 1464 1465 iph = ip_hdr(skb); 1466 th = tcp_hdr(skb); 1467 1468 if (th->doff < sizeof(struct tcphdr) / 4) 1469 return; 1470 1471 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo, 1472 iph->saddr, th->source, 1473 iph->daddr, ntohs(th->dest), 1474 skb->skb_iif); 1475 if (sk) { 1476 skb->sk = sk; 1477 skb->destructor = sock_edemux; 1478 if (sk_fullsock(sk)) { 1479 struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst); 1480 1481 if (dst) 1482 dst = dst_check(dst, 0); 1483 if (dst && 1484 inet_sk(sk)->rx_dst_ifindex == skb->skb_iif) 1485 skb_dst_set_noref(skb, dst); 1486 } 1487 } 1488 } 1489 1490 /* Packet is added to VJ-style prequeue for processing in process 1491 * context, if a reader task is waiting. Apparently, this exciting 1492 * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93) 1493 * failed somewhere. Latency? Burstiness? Well, at least now we will 1494 * see, why it failed. 8)8) --ANK 1495 * 1496 */ 1497 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb) 1498 { 1499 struct tcp_sock *tp = tcp_sk(sk); 1500 1501 if (sysctl_tcp_low_latency || !tp->ucopy.task) 1502 return false; 1503 1504 if (skb->len <= tcp_hdrlen(skb) && 1505 skb_queue_len(&tp->ucopy.prequeue) == 0) 1506 return false; 1507 1508 /* Before escaping RCU protected region, we need to take care of skb 1509 * dst. Prequeue is only enabled for established sockets. 1510 * For such sockets, we might need the skb dst only to set sk->sk_rx_dst 1511 * Instead of doing full sk_rx_dst validity here, let's perform 1512 * an optimistic check. 
1513 */ 1514 if (likely(sk->sk_rx_dst)) 1515 skb_dst_drop(skb); 1516 else 1517 skb_dst_force_safe(skb); 1518 1519 __skb_queue_tail(&tp->ucopy.prequeue, skb); 1520 tp->ucopy.memory += skb->truesize; 1521 if (skb_queue_len(&tp->ucopy.prequeue) >= 32 || 1522 tp->ucopy.memory + atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) { 1523 struct sk_buff *skb1; 1524 1525 BUG_ON(sock_owned_by_user(sk)); 1526 __NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUEDROPPED, 1527 skb_queue_len(&tp->ucopy.prequeue)); 1528 1529 while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) 1530 sk_backlog_rcv(sk, skb1); 1531 1532 tp->ucopy.memory = 0; 1533 } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) { 1534 wake_up_interruptible_sync_poll(sk_sleep(sk), 1535 POLLIN | POLLRDNORM | POLLRDBAND); 1536 if (!inet_csk_ack_scheduled(sk)) 1537 inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, 1538 (3 * tcp_rto_min(sk)) / 4, 1539 TCP_RTO_MAX); 1540 } 1541 return true; 1542 } 1543 EXPORT_SYMBOL(tcp_prequeue); 1544 1545 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) 1546 { 1547 u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf; 1548 1549 /* Only socket owner can try to collapse/prune rx queues 1550 * to reduce memory overhead, so add a little headroom here. 1551 * Few sockets backlog are possibly concurrently non empty. 1552 */ 1553 limit += 64*1024; 1554 1555 /* In case all data was pulled from skb frags (in __pskb_pull_tail()), 1556 * we can fix skb->truesize to its real value to avoid future drops. 1557 * This is valid because skb is not yet charged to the socket. 1558 * It has been noticed pure SACK packets were sometimes dropped 1559 * (if cooked by drivers without copybreak feature). 1560 */ 1561 skb_condense(skb); 1562 1563 if (unlikely(sk_add_backlog(sk, skb, limit))) { 1564 bh_unlock_sock(sk); 1565 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP); 1566 return true; 1567 } 1568 return false; 1569 } 1570 EXPORT_SYMBOL(tcp_add_backlog); 1571 1572 int tcp_filter(struct sock *sk, struct sk_buff *skb) 1573 { 1574 struct tcphdr *th = (struct tcphdr *)skb->data; 1575 unsigned int eaten = skb->len; 1576 int err; 1577 1578 err = sk_filter_trim_cap(sk, skb, th->doff * 4); 1579 if (!err) { 1580 eaten -= skb->len; 1581 TCP_SKB_CB(skb)->end_seq -= eaten; 1582 } 1583 return err; 1584 } 1585 EXPORT_SYMBOL(tcp_filter); 1586 1587 /* 1588 * From tcp_input.c 1589 */ 1590 1591 int tcp_v4_rcv(struct sk_buff *skb) 1592 { 1593 struct net *net = dev_net(skb->dev); 1594 const struct iphdr *iph; 1595 const struct tcphdr *th; 1596 bool refcounted; 1597 struct sock *sk; 1598 int ret; 1599 1600 if (skb->pkt_type != PACKET_HOST) 1601 goto discard_it; 1602 1603 /* Count it even if it's bad */ 1604 __TCP_INC_STATS(net, TCP_MIB_INSEGS); 1605 1606 if (!pskb_may_pull(skb, sizeof(struct tcphdr))) 1607 goto discard_it; 1608 1609 th = (const struct tcphdr *)skb->data; 1610 1611 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) 1612 goto bad_packet; 1613 if (!pskb_may_pull(skb, th->doff * 4)) 1614 goto discard_it; 1615 1616 /* An explanation is required here, I think. 1617 * Packet length and doff are validated by header prediction, 1618 * provided case of th->doff==0 is eliminated. 1619 * So, we defer the checks. */ 1620 1621 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo)) 1622 goto csum_error; 1623 1624 th = (const struct tcphdr *)skb->data; 1625 iph = ip_hdr(skb); 1626 /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB() 1627 * barrier() makes sure compiler wont play fool^Waliasing games. 
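 *
 * (Worked example of the end_seq computation a few lines below: a
 *  segment with seq 1000, the SYN bit set and a 20 byte payload gets
 *  end_seq = 1000 + 1 + 0 + 20 = 1021; SYN and FIN each consume one
 *  unit of sequence space on top of the payload bytes.)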
1628 */ 1629 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb), 1630 sizeof(struct inet_skb_parm)); 1631 barrier(); 1632 1633 TCP_SKB_CB(skb)->seq = ntohl(th->seq); 1634 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + 1635 skb->len - th->doff * 4); 1636 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); 1637 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); 1638 TCP_SKB_CB(skb)->tcp_tw_isn = 0; 1639 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); 1640 TCP_SKB_CB(skb)->sacked = 0; 1641 1642 lookup: 1643 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source, 1644 th->dest, &refcounted); 1645 if (!sk) 1646 goto no_tcp_socket; 1647 1648 process: 1649 if (sk->sk_state == TCP_TIME_WAIT) 1650 goto do_time_wait; 1651 1652 if (sk->sk_state == TCP_NEW_SYN_RECV) { 1653 struct request_sock *req = inet_reqsk(sk); 1654 struct sock *nsk; 1655 1656 sk = req->rsk_listener; 1657 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) { 1658 sk_drops_add(sk, skb); 1659 reqsk_put(req); 1660 goto discard_it; 1661 } 1662 if (unlikely(sk->sk_state != TCP_LISTEN)) { 1663 inet_csk_reqsk_queue_drop_and_put(sk, req); 1664 goto lookup; 1665 } 1666 /* We own a reference on the listener, increase it again 1667 * as we might lose it too soon. 1668 */ 1669 sock_hold(sk); 1670 refcounted = true; 1671 nsk = tcp_check_req(sk, skb, req, false); 1672 if (!nsk) { 1673 reqsk_put(req); 1674 goto discard_and_relse; 1675 } 1676 if (nsk == sk) { 1677 reqsk_put(req); 1678 } else if (tcp_child_process(sk, nsk, skb)) { 1679 tcp_v4_send_reset(nsk, skb); 1680 goto discard_and_relse; 1681 } else { 1682 sock_put(sk); 1683 return 0; 1684 } 1685 } 1686 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) { 1687 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 1688 goto discard_and_relse; 1689 } 1690 1691 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) 1692 goto discard_and_relse; 1693 1694 if (tcp_v4_inbound_md5_hash(sk, skb)) 1695 goto discard_and_relse; 1696 1697 nf_reset(skb); 1698 1699 if (tcp_filter(sk, skb)) 1700 goto discard_and_relse; 1701 th = (const struct tcphdr *)skb->data; 1702 iph = ip_hdr(skb); 1703 1704 skb->dev = NULL; 1705 1706 if (sk->sk_state == TCP_LISTEN) { 1707 ret = tcp_v4_do_rcv(sk, skb); 1708 goto put_and_return; 1709 } 1710 1711 sk_incoming_cpu_update(sk); 1712 1713 bh_lock_sock_nested(sk); 1714 tcp_segs_in(tcp_sk(sk), skb); 1715 ret = 0; 1716 if (!sock_owned_by_user(sk)) { 1717 if (!tcp_prequeue(sk, skb)) 1718 ret = tcp_v4_do_rcv(sk, skb); 1719 } else if (tcp_add_backlog(sk, skb)) { 1720 goto discard_and_relse; 1721 } 1722 bh_unlock_sock(sk); 1723 1724 put_and_return: 1725 if (refcounted) 1726 sock_put(sk); 1727 1728 return ret; 1729 1730 no_tcp_socket: 1731 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) 1732 goto discard_it; 1733 1734 if (tcp_checksum_complete(skb)) { 1735 csum_error: 1736 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); 1737 bad_packet: 1738 __TCP_INC_STATS(net, TCP_MIB_INERRS); 1739 } else { 1740 tcp_v4_send_reset(NULL, skb); 1741 } 1742 1743 discard_it: 1744 /* Discard frame. 
*/ 1745 kfree_skb(skb); 1746 return 0; 1747 1748 discard_and_relse: 1749 sk_drops_add(sk, skb); 1750 if (refcounted) 1751 sock_put(sk); 1752 goto discard_it; 1753 1754 do_time_wait: 1755 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { 1756 inet_twsk_put(inet_twsk(sk)); 1757 goto discard_it; 1758 } 1759 1760 if (tcp_checksum_complete(skb)) { 1761 inet_twsk_put(inet_twsk(sk)); 1762 goto csum_error; 1763 } 1764 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) { 1765 case TCP_TW_SYN: { 1766 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev), 1767 &tcp_hashinfo, skb, 1768 __tcp_hdrlen(th), 1769 iph->saddr, th->source, 1770 iph->daddr, th->dest, 1771 inet_iif(skb)); 1772 if (sk2) { 1773 inet_twsk_deschedule_put(inet_twsk(sk)); 1774 sk = sk2; 1775 refcounted = false; 1776 goto process; 1777 } 1778 /* Fall through to ACK */ 1779 } 1780 case TCP_TW_ACK: 1781 tcp_v4_timewait_ack(sk, skb); 1782 break; 1783 case TCP_TW_RST: 1784 tcp_v4_send_reset(sk, skb); 1785 inet_twsk_deschedule_put(inet_twsk(sk)); 1786 goto discard_it; 1787 case TCP_TW_SUCCESS:; 1788 } 1789 goto discard_it; 1790 } 1791 1792 static struct timewait_sock_ops tcp_timewait_sock_ops = { 1793 .twsk_obj_size = sizeof(struct tcp_timewait_sock), 1794 .twsk_unique = tcp_twsk_unique, 1795 .twsk_destructor= tcp_twsk_destructor, 1796 }; 1797 1798 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) 1799 { 1800 struct dst_entry *dst = skb_dst(skb); 1801 1802 if (dst && dst_hold_safe(dst)) { 1803 sk->sk_rx_dst = dst; 1804 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; 1805 } 1806 } 1807 EXPORT_SYMBOL(inet_sk_rx_dst_set); 1808 1809 const struct inet_connection_sock_af_ops ipv4_specific = { 1810 .queue_xmit = ip_queue_xmit, 1811 .send_check = tcp_v4_send_check, 1812 .rebuild_header = inet_sk_rebuild_header, 1813 .sk_rx_dst_set = inet_sk_rx_dst_set, 1814 .conn_request = tcp_v4_conn_request, 1815 .syn_recv_sock = tcp_v4_syn_recv_sock, 1816 .net_header_len = sizeof(struct iphdr), 1817 .setsockopt = ip_setsockopt, 1818 .getsockopt = ip_getsockopt, 1819 .addr2sockaddr = inet_csk_addr2sockaddr, 1820 .sockaddr_len = sizeof(struct sockaddr_in), 1821 #ifdef CONFIG_COMPAT 1822 .compat_setsockopt = compat_ip_setsockopt, 1823 .compat_getsockopt = compat_ip_getsockopt, 1824 #endif 1825 .mtu_reduced = tcp_v4_mtu_reduced, 1826 }; 1827 EXPORT_SYMBOL(ipv4_specific); 1828 1829 #ifdef CONFIG_TCP_MD5SIG 1830 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { 1831 .md5_lookup = tcp_v4_md5_lookup, 1832 .calc_md5_hash = tcp_v4_md5_hash_skb, 1833 .md5_parse = tcp_v4_parse_md5_keys, 1834 }; 1835 #endif 1836 1837 /* NOTE: A lot of things set to zero explicitly by call to 1838 * sk_alloc() so need not be done here. 1839 */ 1840 static int tcp_v4_init_sock(struct sock *sk) 1841 { 1842 struct inet_connection_sock *icsk = inet_csk(sk); 1843 1844 tcp_init_sock(sk); 1845 1846 icsk->icsk_af_ops = &ipv4_specific; 1847 1848 #ifdef CONFIG_TCP_MD5SIG 1849 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific; 1850 #endif 1851 1852 return 0; 1853 } 1854 1855 void tcp_v4_destroy_sock(struct sock *sk) 1856 { 1857 struct tcp_sock *tp = tcp_sk(sk); 1858 1859 tcp_clear_xmit_timers(sk); 1860 1861 tcp_cleanup_congestion_control(sk); 1862 1863 /* Cleanup up the write buffer. */ 1864 tcp_write_queue_purge(sk); 1865 1866 /* Cleans up our, hopefully empty, out_of_order_queue. 
*/ 1867 skb_rbtree_purge(&tp->out_of_order_queue); 1868 1869 #ifdef CONFIG_TCP_MD5SIG 1870 /* Clean up the MD5 key list, if any */ 1871 if (tp->md5sig_info) { 1872 tcp_clear_md5_list(sk); 1873 kfree_rcu(tp->md5sig_info, rcu); 1874 tp->md5sig_info = NULL; 1875 } 1876 #endif 1877 1878 /* Clean prequeue, it must be empty really */ 1879 __skb_queue_purge(&tp->ucopy.prequeue); 1880 1881 /* Clean up a referenced TCP bind bucket. */ 1882 if (inet_csk(sk)->icsk_bind_hash) 1883 inet_put_port(sk); 1884 1885 BUG_ON(tp->fastopen_rsk); 1886 1887 /* If socket is aborted during connect operation */ 1888 tcp_free_fastopen_req(tp); 1889 tcp_saved_syn_free(tp); 1890 1891 sk_sockets_allocated_dec(sk); 1892 } 1893 EXPORT_SYMBOL(tcp_v4_destroy_sock); 1894 1895 #ifdef CONFIG_PROC_FS 1896 /* Proc filesystem TCP sock list dumping. */ 1897 1898 /* 1899 * Get next listener socket follow cur. If cur is NULL, get first socket 1900 * starting from bucket given in st->bucket; when st->bucket is zero the 1901 * very first socket in the hash table is returned. 1902 */ 1903 static void *listening_get_next(struct seq_file *seq, void *cur) 1904 { 1905 struct tcp_iter_state *st = seq->private; 1906 struct net *net = seq_file_net(seq); 1907 struct inet_listen_hashbucket *ilb; 1908 struct sock *sk = cur; 1909 1910 if (!sk) { 1911 get_head: 1912 ilb = &tcp_hashinfo.listening_hash[st->bucket]; 1913 spin_lock(&ilb->lock); 1914 sk = sk_head(&ilb->head); 1915 st->offset = 0; 1916 goto get_sk; 1917 } 1918 ilb = &tcp_hashinfo.listening_hash[st->bucket]; 1919 ++st->num; 1920 ++st->offset; 1921 1922 sk = sk_next(sk); 1923 get_sk: 1924 sk_for_each_from(sk) { 1925 if (!net_eq(sock_net(sk), net)) 1926 continue; 1927 if (sk->sk_family == st->family) 1928 return sk; 1929 } 1930 spin_unlock(&ilb->lock); 1931 st->offset = 0; 1932 if (++st->bucket < INET_LHTABLE_SIZE) 1933 goto get_head; 1934 return NULL; 1935 } 1936 1937 static void *listening_get_idx(struct seq_file *seq, loff_t *pos) 1938 { 1939 struct tcp_iter_state *st = seq->private; 1940 void *rc; 1941 1942 st->bucket = 0; 1943 st->offset = 0; 1944 rc = listening_get_next(seq, NULL); 1945 1946 while (rc && *pos) { 1947 rc = listening_get_next(seq, rc); 1948 --*pos; 1949 } 1950 return rc; 1951 } 1952 1953 static inline bool empty_bucket(const struct tcp_iter_state *st) 1954 { 1955 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain); 1956 } 1957 1958 /* 1959 * Get first established socket starting from bucket given in st->bucket. 1960 * If st->bucket is zero, the very first socket in the hash is returned. 
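 *
 * (A minimal consumer-side sketch, assuming <stdio.h>: these iterators
 *  back /proc/net/tcp, which userspace reads as plain text --
 *
 *	FILE *f = fopen("/proc/net/tcp", "r");
 *	char line[256];
 *
 *	while (f && fgets(line, sizeof(line), f))
 *		fputs(line, stdout);
 *	if (f)
 *		fclose(f);
 *
 *  one header row, then one row per socket, formatted by helpers such
 *  as get_openreq4() below.)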

static inline bool empty_bucket(const struct tcp_iter_state *st)
{
	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
}

/*
 * Get first established socket starting from bucket given in st->bucket.
 * If st->bucket is zero, the very first socket in the hash is returned.
 */
static void *established_get_first(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	void *rc = NULL;

	st->offset = 0;
	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
		struct sock *sk;
		struct hlist_nulls_node *node;
		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);

		/* Lockless fast path for the common case of empty buckets */
		if (empty_bucket(st))
			continue;

		spin_lock_bh(lock);
		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
			if (sk->sk_family != st->family ||
			    !net_eq(sock_net(sk), net)) {
				continue;
			}
			rc = sk;
			goto out;
		}
		spin_unlock_bh(lock);
	}
out:
	return rc;
}

static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct sock *sk = cur;
	struct hlist_nulls_node *node;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);

	sk_nulls_for_each_from(sk, node) {
		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
			return sk;
	}

	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
	++st->bucket;
	return established_get_first(seq);
}

static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}

static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state *st = seq->private;

	st->state = TCP_SEQ_STATE_LISTENING;
	rc = listening_get_idx(seq, &pos);

	if (!rc) {
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc = established_get_idx(seq, pos);
	}

	return rc;
}

static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket >= INET_LHTABLE_SIZE)
			break;
		st->state = TCP_SEQ_STATE_LISTENING;
		rc = listening_get_next(seq, NULL);
		while (offset-- && rc)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		/* Fallthrough */
	case TCP_SEQ_STATE_ESTABLISHED:
		if (st->bucket > tcp_hashinfo.ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc)
			rc = established_get_next(seq, rc);
	}

	st->num = orig_num;

	return rc;
}
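
/* tcp_seq_start/next/stop implement the seq_file iterator.  A sequential
 * read normally hits the fast path: when *pos matches st->last_pos the walk
 * is resumed via tcp_seek_last_pos() instead of being replayed from the
 * first bucket, which keeps large socket tables from turning every read()
 * into a full rescan.
 */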

static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	if (*pos && *pos == st->last_pos) {
		rc = tcp_seek_last_pos(seq);
		if (rc)
			goto out;
	}

	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	st->bucket = 0;
	st->offset = 0;
	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;

out:
	st->last_pos = *pos;
	return rc;
}

static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			rc = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	st->last_pos = *pos;
	return rc;
}

static void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		break;
	}
}

int tcp_seq_open(struct inode *inode, struct file *file)
{
	struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
	struct tcp_iter_state *s;
	int err;

	err = seq_open_net(inode, file, &afinfo->seq_ops,
			   sizeof(struct tcp_iter_state));
	if (err < 0)
		return err;

	s = ((struct seq_file *)file->private_data)->private;
	s->family	= afinfo->family;
	s->last_pos	= 0;
	return 0;
}
EXPORT_SYMBOL(tcp_seq_open);

int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	int rc = 0;
	struct proc_dir_entry *p;

	afinfo->seq_ops.start	= tcp_seq_start;
	afinfo->seq_ops.next	= tcp_seq_next;
	afinfo->seq_ops.stop	= tcp_seq_stop;

	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
			     afinfo->seq_fops, afinfo);
	if (!p)
		rc = -ENOMEM;
	return rc;
}
EXPORT_SYMBOL(tcp_proc_register);

void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	remove_proc_entry(afinfo->name, net->proc_net);
}
EXPORT_SYMBOL(tcp_proc_unregister);
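
/* tcp_proc_register()/tcp_proc_unregister() are also used by other address
 * families (e.g. the IPv6 code registers its own "tcp6" entry).  A minimal
 * sketch of a caller, with made-up names, would look like:
 *
 *	static struct tcp_seq_afinfo my_seq_afinfo = {
 *		.name		= "tcp_example",
 *		.family		= AF_INET,
 *		.seq_fops	= &my_afinfo_seq_fops,	(open = tcp_seq_open)
 *		.seq_ops	= {
 *			.show		= my_seq_show,
 *		},
 *	};
 *
 *	static int __net_init my_proc_init_net(struct net *net)
 *	{
 *		return tcp_proc_register(net, &my_seq_afinfo);
 *	}
 *
 * tcp4_seq_afinfo below is the real IPv4 instance of this pattern.
 */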

static void get_openreq4(const struct request_sock *req,
			 struct seq_file *f, int i)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	long delta = req->rsk_timer.expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
		i,
		ireq->ir_loc_addr,
		ireq->ir_num,
		ireq->ir_rmt_addr,
		ntohs(ireq->ir_rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_delta_to_clock_t(delta),
		req->num_timeout,
		from_kuid_munged(seq_user_ns(f),
				 sock_i_uid(req->rsk_listener)),
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		0,
		req);
}

static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
{
	int timer_active;
	unsigned long timer_expires;
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct inet_sock *inet = inet_sk(sk);
	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	int rx_queue;
	int state;

	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
		timer_active	= 1;
		timer_expires	= icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= icsk->icsk_timeout;
	} else if (timer_pending(&sk->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sk->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires	= jiffies;
	}

	state = sk_state_load(sk);
	if (state == TCP_LISTEN)
		rx_queue = sk->sk_ack_backlog;
	else
		/* Because we don't lock the socket,
		 * we might find a transient negative value.
		 */
		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
		i, src, srcp, dest, destp, state,
		tp->write_seq - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_delta_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
		icsk->icsk_probes_out,
		sock_i_ino(sk),
		atomic_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
		tp->snd_cwnd,
		state == TCP_LISTEN ?
		    fastopenq->max_qlen :
		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}

static void get_timewait4_sock(const struct inet_timewait_sock *tw,
			       struct seq_file *f, int i)
{
	long delta = tw->tw_timer.expires - jiffies;
	__be32 dest, src;
	__u16 destp, srcp;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
		atomic_read(&tw->tw_refcnt), tw);
}

#define TMPSZ 150

static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	struct sock *sk = v;

	seq_setwidth(seq, TMPSZ - 1);
	if (v == SEQ_START_TOKEN) {
		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	if (sk->sk_state == TCP_TIME_WAIT)
		get_timewait4_sock(v, seq, st->num);
	else if (sk->sk_state == TCP_NEW_SYN_RECV)
		get_openreq4(v, seq, st->num);
	else
		get_tcp4_sock(v, seq, st->num);
out:
	seq_pad(seq, '\n');
	return 0;
}

static const struct file_operations tcp_afinfo_seq_fops = {
	.owner   = THIS_MODULE,
	.open    = tcp_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_net
};

static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.name		= "tcp",
	.family		= AF_INET,
	.seq_fops	= &tcp_afinfo_seq_fops,
	.seq_ops	= {
		.show		= tcp4_seq_show,
	},
};

static int __net_init tcp4_proc_init_net(struct net *net)
{
	return tcp_proc_register(net, &tcp4_seq_afinfo);
}

static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	tcp_proc_unregister(net, &tcp4_seq_afinfo);
}

static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};

int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}

void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
#endif /* CONFIG_PROC_FS */

struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.keepalive		= tcp_set_keepalive,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.sendpage		= tcp_sendpage,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.stream_memory_free	= tcp_stream_memory_free,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem		= sysctl_tcp_wmem,
	.sysctl_rmem		= sysctl_tcp_rmem,
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_DESTROY_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
	.no_autobind		= true,
#ifdef CONFIG_COMPAT
	.compat_setsockopt	= compat_tcp_setsockopt,
	.compat_getsockopt	= compat_tcp_getsockopt,
#endif
	.diag_destroy		= tcp_abort,
};
EXPORT_SYMBOL(tcp_prot);
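
/* tcp_prot itself is registered from af_inet.c (proto_register() plus the
 * inetsw table); nothing in this file hooks it up.  The pernet operations
 * below handle the per-namespace pieces instead: tcp_sk_init() creates one
 * control socket per possible CPU (used for sending resets and ACKs that are
 * not attached to a full socket) and seeds the namespace's TCP sysctl
 * defaults, while tcp_sk_exit()/tcp_sk_exit_batch() tear that down and purge
 * any remaining TIME_WAIT sockets.
 */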

static void __net_exit tcp_sk_exit(struct net *net)
{
	int cpu;

	for_each_possible_cpu(cpu)
		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
	free_percpu(net->ipv4.tcp_sk);
}

static int __net_init tcp_sk_init(struct net *net)
{
	int res, cpu, cnt;

	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
	if (!net->ipv4.tcp_sk)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		struct sock *sk;

		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
					   IPPROTO_TCP, net);
		if (res)
			goto fail;
		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
	}

	net->ipv4.sysctl_tcp_ecn = 2;
	net->ipv4.sysctl_tcp_ecn_fallback = 1;

	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;

	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;

	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
	net->ipv4.sysctl_tcp_syncookies = 1;
	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
	net->ipv4.sysctl_tcp_orphan_retries = 0;
	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
	net->ipv4.sysctl_tcp_tw_reuse = 0;

	cnt = tcp_hashinfo.ehash_mask + 1;
	net->ipv4.tcp_death_row.sysctl_tw_recycle = 0;
	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;

	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);

	return 0;
fail:
	tcp_sk_exit(net);

	return res;
}

static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
	inet_twsk_purge(&tcp_hashinfo, AF_INET);
}

static struct pernet_operations __net_initdata tcp_sk_ops = {
	.init	    = tcp_sk_init,
	.exit	    = tcp_sk_exit,
	.exit_batch = tcp_sk_exit_batch,
};

void __init tcp_v4_init(void)
{
	if (register_pernet_subsys(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");
}