/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *	See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *	David S. Miller		:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *	David S. Miller		:	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *	Andi Kleen		:	Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *	Andi Kleen		:	Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *	Mike McLagan		:	Routing by source
 *	Juan Jose Ciarlante	:	ip_dynaddr bits
 *	Andi Kleen		:	various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/inetdevice.h>

#include <crypto/hash.h>
#include <linux/scatterlist.h>

#include <trace/events/tcp.h>

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

static u32 tcp_v4_init_seq(const struct sk_buff *skb)
{
	return secure_tcp_seq(ip_hdr(skb)->daddr,
			      ip_hdr(skb)->saddr,
			      tcp_hdr(skb)->dest,
			      tcp_hdr(skb)->source);
}

static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
{
	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
}

int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct inet_timewait_sock *tw = inet_twsk(sktw);
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);
	int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;

	if (reuse == 2) {
		/* Still does not detect *everything* that goes through
		 * lo, since we require a loopback src or dst address
		 * or direct binding to 'lo' interface.
		 */
		bool loopback = false;
		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
			loopback = true;
#if IS_ENABLED(CONFIG_IPV6)
		if (tw->tw_family == AF_INET6) {
			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
			    (ipv6_addr_v4mapped(&tw->tw_v6_daddr) &&
			     (tw->tw_v6_daddr.s6_addr[12] == 127)) ||
			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
			    (ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) &&
			     (tw->tw_v6_rcv_saddr.s6_addr[12] == 127)))
				loopback = true;
		} else
#endif
		{
			if (ipv4_is_loopback(tw->tw_daddr) ||
			    ipv4_is_loopback(tw->tw_rcv_saddr))
				loopback = true;
		}
		if (!loopback)
			reuse = 0;
	}

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
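
	   When reuse is allowed, the code below restarts write_seq at
	   tw_snd_nxt + 65535 + 2, i.e. one maximal unscaled window past
	   anything the old incarnation could have had in flight, plus 2 for
	   the sequence space consumed by SYN and FIN, so that the sequence
	   spaces of the two incarnations should not overlap.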
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (!twp || (reuse && get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
		if (tp->write_seq == 0)
			tp->write_seq = 1;
		tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		sock_hold(sktw);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);

static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
			      int addr_len)
{
	/* This check is replicated from tcp_v4_connect() and intended to
	 * prevent BPF program called below from accessing bytes that are out
	 * of the bound specified by user in addr_len.
	 */
	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	sock_owned_by_me(sk);

	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
}

/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;
	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP,
			      orig_sport, orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	sk_rcv_saddr_set(sk, inet->inet_saddr);

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			tp->write_seq = 0;
	}

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the socket
	 * lock, select a source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket. */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	rt = NULL;

	if (likely(!tp->repair)) {
		if (!tp->write_seq)
			tp->write_seq = secure_tcp_seq(inet->inet_saddr,
						       inet->inet_daddr,
						       inet->inet_sport,
						       usin->sin_port);
		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
						 inet->inet_saddr,
						 inet->inet_daddr);
	}

	inet->inet_id = tp->write_seq ^ jiffies;

	if (tcp_fastopen_defer_connect(sk, &err))
		return err;
	if (err)
		goto failure;

	err = tcp_connect(sk);

	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);

/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct dst_entry *dst;
	u32 mtu;

	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
		return;
	mtu = tcp_sk(sk)->mtu_info;
	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to go wrong... Remember the soft error
	 * in case this connection will not be able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);
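
/* Illustration (not part of the kernel build): from user space, the path MTU
 * behaviour that tcp_v4_mtu_reduced() reacts to is normally driven through
 * the IP_MTU_DISCOVER socket option (set the DF bit and honour incoming
 * ICMP_FRAG_NEEDED), and the kernel's current estimate can be read back with
 * IP_MTU once the socket is connected.  A minimal sketch, assuming a
 * connected TCP socket fd and the usual <sys/socket.h>/<netinet/in.h>
 * headers:
 *
 *	int mtu, pmtudisc = IP_PMTUDISC_DO;
 *	socklen_t len = sizeof(mtu);
 *
 *	setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER, &pmtudisc, sizeof(pmtudisc));
 *	getsockopt(fd, IPPROTO_IP, IP_MTU, &mtu, &len);
 */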

static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}


/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);

	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	 */
	if (seq != tcp_rsk(req)->snt_isn) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
	} else if (abort) {
		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		tcp_listendrop(req->rsk_listener);
	}
	reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
	struct inet_connection_sock *icsk;
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(icmp_skb)->type;
	const int code = icmp_hdr(icmp_skb)->code;
	struct sock *sk;
	struct sk_buff *skb;
	struct request_sock *fastopen;
	u32 seq, snd_una;
	s32 remaining;
	u32 delta_us;
	int err;
	struct net *net = dev_net(icmp_skb->dev);

	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
				       th->dest, iph->saddr, ntohs(th->source),
				       inet_iif(icmp_skb), 0);
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return;
	}
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV)
		return tcp_req_err(sk, seq,
				   type == ICMP_PARAMETERPROB ||
				   type == ICMP_TIME_EXCEEDED ||
				   (type == ICMP_DEST_UNREACH &&
				    (code == ICMP_NET_UNREACH ||
				     code == ICMP_HOST_UNREACH)));

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto out;
	}

	icsk = inet_csk(sk);
	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = tp->fastopen_rsk;
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		if (!sock_owned_by_user(sk))
			do_redirect(icmp_skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs sent out by Linux are always <576 bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			tp->mtu_info = info;
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if icmp_skb allows revert of backoff
		 * (see draft-zimmermann-tcp-lcd) */
		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
			break;
		if (seq != tp->snd_una || !icsk->icsk_retransmits ||
		    !icsk->icsk_backoff || fastopen)
			break;

		if (sock_owned_by_user(sk))
			break;

		icsk->icsk_backoff--;
		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
					       TCP_TIMEOUT_INIT;
		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

		skb = tcp_rtx_queue_head(sk);
		BUG_ON(!skb);

		tcp_mstamp_refresh(tp);
		delta_us = (u32)(tp->tcp_mstamp - skb->skb_mstamp);
		remaining = icsk->icsk_rto -
			    usecs_to_jiffies(delta_us);

		if (remaining > 0) {
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
						  remaining, TCP_RTO_MAX);
		} else {
			/* RTO revert clocked out retransmission.
			 * Will retransmit now */
			tcp_retransmit_timer(sk);
		}

		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket is
		 * already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows us to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even these two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 *						--ANK (980905)
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}

void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
	skb->csum_start = skb_transport_header(skb) - skb->head;
	skb->csum_offset = offsetof(struct tcphdr, check);
}

/* This routine computes an IPv4 TCP checksum.
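 *
 * For hardware checksum offload (CHECKSUM_PARTIAL), __tcp_v4_send_check()
 * above only primes th->check with the folded pseudo-header sum (addresses,
 * protocol and length) and records csum_start/csum_offset, so that the NIC,
 * or a software fallback, can finish the checksum over the TCP header and
 * payload later.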
 */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);

/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *		So that we build reply only basing on parameters
 *		arrived with segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	int genhash;
	struct sock *sk1 = NULL;
#endif
	struct net *net;
	struct sock *ctl_sk;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
	rcu_read_lock();
	hash_location = tcp_parse_md5sig_option(th);
	if (sk && sk_fullsock(sk)) {
		key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
	} else if (hash_location) {
		/*
		 * The active side is lost. Try to find the listening socket
		 * through the source port, and then find the md5 key through
		 * the listening socket. We do not lose security here:
		 * the incoming packet is checked with the md5 hash of the key
		 * we find; no RST is generated if the md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
					     ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), inet_iif(skb),
					     tcp_v4_sdif(skb));
		/* don't send rst if we can't find the key */
		if (!sk1)
			goto out;

		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
		if (!key)
			goto out;


		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
			goto out;

	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
	if (sk) {
		arg.bound_dev_if = sk->sk_bound_dev_if;
		if (sk_fullsock(sk))
			trace_tcp_send_reset(sk, skb);
	}

	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

	arg.tos = ip_hdr(skb)->tos;
	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
	if (sk)
		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_mark : sk->sk_mark;
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len);

	ctl_sk->sk_mark = 0;
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
	local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
	rcu_read_unlock();
#endif
}

/* The code following below, sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context, is ugly, certainly. What can I do?
 */

static void tcp_v4_send_ack(const struct sock *sk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct net *net = sock_net(sk);
	struct ip_reply_arg arg;
	struct sock *ctl_sk;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
	if (sk)
		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_mark : sk->sk_mark;
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len);

	ctl_sk->sk_mark = 0;
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	local_bh_enable();
}

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(sk, skb,
			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos
			);

	inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
						 tcp_sk(sk)->snd_nxt;

	/* RFC 7323 2.3
	 * The window field (SEG.WND) of every outgoing segment, with the
	 * exception of <SYN> segments, MUST be right-shifted by
	 * Rcv.Wind.Shift bits:
	 */
	tcp_v4_send_ack(sk, skb, seq,
			tcp_rsk(req)->rcv_nxt,
			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
			req->ts_recent,
			0,
			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
					  AF_INET),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos);
}
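
/* For instance (illustrative numbers): with req->rsk_rcv_wnd == 262144 and
 * rcv_wscale == 7, the 16-bit window field of the ACK built above carries
 * 262144 >> 7 == 2048, and the receiver of that ACK scales it back up by
 * << 7, as required by RFC 7323 window scaling.
 */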

/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
			      struct flowi *fl,
			      struct request_sock *req,
			      struct tcp_fastopen_cookie *foc,
			      enum tcp_synack_type synack_type)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, foc, synack_type);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
					    ireq_opt_deref(ireq));
		err = net_xmit_eval(err);
	}

	return err;
}

/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

/* Find the Key structure for an address. */
struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
					 const union tcp_md5_addr *addr,
					 int family)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	const struct tcp_md5sig_info *md5sig;
	__be32 mask;
	struct tcp_md5sig_key *best_match = NULL;
	bool match;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;

	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
		if (key->family != family)
			continue;

		if (family == AF_INET) {
			mask = inet_make_mask(key->prefixlen);
			match = (key->addr.a4.s_addr & mask) ==
				(addr->a4.s_addr & mask);
#if IS_ENABLED(CONFIG_IPV6)
		} else if (family == AF_INET6) {
			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
						  key->prefixlen);
#endif
		} else {
			match = false;
		}

		if (match && (!best_match ||
			      key->prefixlen > best_match->prefixlen))
			best_match = key;
	}
	return best_match;
}
EXPORT_SYMBOL(tcp_md5_do_lookup);
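
/* For example: with tcp_md5_do_lookup(), a key installed for 10.0.0.0 with
 * prefixlen 24 matches a peer such as 10.0.0.5, because both addresses are
 * masked with inet_make_mask(24) == 255.255.255.0 before comparison; when
 * several keys match, the longest prefix wins, so an exact per-address key
 * (prefixlen 32, the default set by tcp_v4_parse_md5_keys() below) takes
 * precedence over any prefix key.
 */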

static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
						      const union tcp_md5_addr *addr,
						      int family, u8 prefixlen)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	const struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
		if (key->family != family)
			continue;
		if (!memcmp(&key->addr, addr, size) &&
		    key->prefixlen == prefixlen)
			return key;
	}
	return NULL;
}

struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
					 const struct sock *addr_sk)
{
	const union tcp_md5_addr *addr;

	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
	return tcp_md5_do_lookup(sk, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
		   gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
	if (key) {
		/* Pre-existing entry - just update that one. */
		memcpy(key->key, newkey, newkeylen);
		key->keylen = newkeylen;
		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   lockdep_sock_is_held(sk));
	if (!md5sig) {
		md5sig = kmalloc(sizeof(*md5sig), gfp);
		if (!md5sig)
			return -ENOMEM;

		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
		INIT_HLIST_HEAD(&md5sig->head);
		rcu_assign_pointer(tp->md5sig_info, md5sig);
	}

	key = sock_kmalloc(sk, sizeof(*key), gfp);
	if (!key)
		return -ENOMEM;
	if (!tcp_alloc_md5sig_pool()) {
		sock_kfree_s(sk, key, sizeof(*key));
		return -ENOMEM;
	}

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	key->prefixlen = prefixlen;
	memcpy(&key->addr, addr,
	       (family == AF_INET6) ? sizeof(struct in6_addr) :
				      sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);

int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
		   u8 prefixlen)
{
	struct tcp_md5sig_key *key;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);

static void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
	}
}

static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
				 char __user *optval, int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
	u8 prefixlen = 32;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_user(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	if (optname == TCP_MD5SIG_EXT &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
		prefixlen = cmd.tcpm_prefixlen;
		if (prefixlen > 32)
			return -EINVAL;
	}

	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
				      AF_INET, prefixlen);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
			      AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
			      GFP_KERNEL);
}
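
/* Illustration (not part of the kernel build): tcp_v4_parse_md5_keys() above
 * is what runs when an application configures an RFC 2385 key from user
 * space.  A minimal sketch, assuming a TCP socket fd, <linux/tcp.h> and the
 * usual socket headers:
 *
 *	struct tcp_md5sig md5 = { 0 };
 *	struct sockaddr_in *peer = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	peer->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &peer->sin_addr);
 *	md5.tcpm_keylen = 6;
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * Passing tcpm_keylen == 0 deletes the key (the !cmd.tcpm_keylen branch
 * above), and TCP_MD5SIG_EXT with TCP_MD5SIG_FLAG_PREFIX installs one key
 * for a whole prefix instead of a single peer address.
 */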

static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
				   __be32 daddr, __be32 saddr,
				   const struct tcphdr *th, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;
	struct tcphdr *_th;

	bp = hp->scratch;
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	_th = (struct tcphdr *)(bp + 1);
	memcpy(_th, th, sizeof(*th));
	_th->check = 0;

	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
				sizeof(*bp) + sizeof(*th));
	return crypto_ahash_update(hp->md5_req);
}

static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;
	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}

int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
			const struct sock *sk,
			const struct sk_buff *skb)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;

	if (sk) { /* valid for establish/request sockets */
		saddr = sk->sk_rcv_saddr;
		daddr = sk->sk_daddr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;

	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
		goto clear_hash;
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

#endif

/* Called with rcu_read_lock() */
static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
				    const struct sk_buff *skb)
{
#ifdef CONFIG_TCP_MD5SIG
	/*
	 * This gets called for each TCP segment that arrives
	 * so we want to be efficient.
	 * We have 3 drop cases:
	 * o No MD5 hash and one expected.
	 * o MD5 hash and we're not expecting one.
	 * o MD5 hash and it's wrong.
	 */
	const __u8 *hash_location = NULL;
	struct tcp_md5sig_key *hash_expected;
	const struct iphdr *iph = ip_hdr(skb);
	const struct tcphdr *th = tcp_hdr(skb);
	int genhash;
	unsigned char newhash[16];

	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
					  AF_INET);
	hash_location = tcp_parse_md5sig_option(th);

	/* We've parsed the options - do we have a hash? */
	if (!hash_expected && !hash_location)
		return false;

	if (hash_expected && !hash_location) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
		return true;
	}

	if (!hash_expected && hash_location) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
		return true;
	}

	/* Okay, so this is hash_expected and hash_location -
	 * so we need to calculate the checksum.
	 */
	genhash = tcp_v4_md5_hash_skb(newhash,
				      hash_expected,
				      NULL, skb);

	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
				     &iph->saddr, ntohs(th->source),
				     &iph->daddr, ntohs(th->dest),
				     genhash ? " tcp_v4_calc_md5_hash failed"
					     : "");
		return true;
	}
	return false;
#endif
	return false;
}

static void tcp_v4_init_req(struct request_sock *req,
			    const struct sock *sk_listener,
			    struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	struct net *net = sock_net(sk_listener);

	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
}

static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
					  struct flowi *fl,
					  const struct request_sock *req)
{
	return inet_csk_route_req(sk, &fl->u.ip4, req);
}

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};

static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.req_md5_lookup	=	tcp_v4_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#endif
	.init_req	=	tcp_v4_init_req,
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
#endif
	.route_req	=	tcp_v4_route_req,
	.init_seq	=	tcp_v4_init_seq,
	.init_ts_off	=	tcp_v4_init_ts_off,
	.send_synack	=	tcp_v4_send_synack,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&tcp_request_sock_ops,
				&tcp_request_sock_ipv4_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);


/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp = tcp_sk(newsk);
	newinet = inet_sk(newsk);
	ireq = inet_rsk(req);
	sk_daddr_set(newsk, ireq->ir_rmt_addr);
	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
	newsk->sk_bound_dev_if = ireq->ir_iif;
	newinet->inet_saddr = ireq->ir_loc_addr;
	inet_opt = rcu_dereference(ireq->ireq_opt);
	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
	newinet->mc_index = inet_iif(skb);
	newinet->mc_ttl = ip_hdr(skb)->ttl;
	newinet->rcv_tos = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	newinet->inet_id = newtp->write_seq ^ jiffies;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	/* Copy over the MD5 key from the original socket */
	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
				AF_INET);
	if (key) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
			       AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
	if (likely(*own_req)) {
		tcp_move_syn(newtp, req);
		ireq->ireq_opt = NULL;
	} else {
		newinet->inet_opt = NULL;
	}
	return newsk;

exit_overflow:
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	tcp_listendrop(sk);
	return NULL;
put_and_exit:
	newinet->inet_opt = NULL;
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	const struct tcphdr *th = tcp_hdr(skb);

	if (!th->syn)
		sk = cookie_v4_check(sk, skb);
#endif
	return sk;
}

/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct sock *rsk;

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst = sk->sk_rx_dst;

		sock_rps_save_rxhash(sk, skb);
		sk_mark_napi_id(sk, skb);
		if (dst) {
			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
			    !dst->ops->check(dst, 0)) {
				dst_release(dst);
				sk->sk_rx_dst = NULL;
			}
		}
		tcp_rcv_established(sk, skb);
		return 0;
	}

	if (tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_cookie_check(sk, skb);

		if (!nsk)
			goto discard;
		if (nsk != sk) {
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	if (tcp_rcv_state_process(sk, skb)) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);

int tcp_v4_early_demux(struct sk_buff *skb)
{
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;

	if (skb->pkt_type != PACKET_HOST)
		return 0;

	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
		return 0;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	if (th->doff < sizeof(struct tcphdr) / 4)
		return 0;

	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
				       iph->saddr, th->source,
				       iph->daddr, ntohs(th->dest),
				       skb->skb_iif, inet_sdif(skb));
	if (sk) {
		skb->sk = sk;
		skb->destructor = sock_edemux;
		if (sk_fullsock(sk)) {
			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);

			if (dst)
				dst = dst_check(dst, 0);
			if (dst &&
			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
				skb_dst_set_noref(skb, dst);
		}
	}
	return 0;
}

bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
{
	u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;

	/* Only the socket owner can try to collapse/prune rx queues
	 * to reduce memory overhead, so add a little headroom here.
	 * Only a few sockets' backlogs are likely to be non-empty
	 * concurrently.
	 */
	limit += 64*1024;

	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
	 * we can fix skb->truesize to its real value to avoid future drops.
	 * This is valid because skb is not yet charged to the socket.
	 * It has been noticed that pure SACK packets were sometimes dropped
	 * (if cooked by drivers without a copybreak feature).
	 */
	skb_condense(skb);

	if (unlikely(sk_add_backlog(sk, skb, limit))) {
		bh_unlock_sock(sk);
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
		return true;
	}
	return false;
}
EXPORT_SYMBOL(tcp_add_backlog);

int tcp_filter(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = (struct tcphdr *)skb->data;
	unsigned int eaten = skb->len;
	int err;

	err = sk_filter_trim_cap(sk, skb, th->doff * 4);
	if (!err) {
		eaten -= skb->len;
		TCP_SKB_CB(skb)->end_seq -= eaten;
	}
	return err;
}
EXPORT_SYMBOL(tcp_filter);

static void tcp_v4_restore_cb(struct sk_buff *skb)
{
	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
		sizeof(struct inet_skb_parm));
}

static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
			   const struct tcphdr *th)
{
	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
	 * barrier() makes sure compiler won't play fool^Waliasing games.
	 */
	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
		sizeof(struct inet_skb_parm));
	barrier();

	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
	TCP_SKB_CB(skb)->sacked = 0;
	TCP_SKB_CB(skb)->has_rxtstamp =
			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
}
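
/* Worked example for tcp_v4_fill_cb() above (illustrative numbers): a segment
 * with seq == 1000 carrying 100 bytes of payload and no SYN/FIN gets
 * end_seq == 1100, while a pure SYN gets end_seq == seq + 1, since SYN and
 * FIN each consume one unit of sequence space.
 */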

/*
 *	From tcp_input.c
 */

int tcp_v4_rcv(struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);
	int sdif = inet_sdif(skb);
	const struct iphdr *iph;
	const struct tcphdr *th;
	bool refcounted;
	struct sock *sk;
	int ret;

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	__TCP_INC_STATS(net, TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = (const struct tcphdr *)skb->data;

	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */

	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
		goto csum_error;

	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
lookup:
	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
			       th->dest, sdif, &refcounted);
	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		struct request_sock *req = inet_reqsk(sk);
		bool req_stolen = false;
		struct sock *nsk;

		sk = req->rsk_listener;
		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
			sk_drops_add(sk, skb);
			reqsk_put(req);
			goto discard_it;
		}
		if (tcp_checksum_complete(skb)) {
			reqsk_put(req);
			goto csum_error;
		}
		if (unlikely(sk->sk_state != TCP_LISTEN)) {
			inet_csk_reqsk_queue_drop_and_put(sk, req);
			goto lookup;
		}
		/* We own a reference on the listener, increase it again
		 * as we might lose it too soon.
		 */
		sock_hold(sk);
		refcounted = true;
		nsk = NULL;
		if (!tcp_filter(sk, skb)) {
			th = (const struct tcphdr *)skb->data;
			iph = ip_hdr(skb);
			tcp_v4_fill_cb(skb, iph, th);
			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
		}
		if (!nsk) {
			reqsk_put(req);
			if (req_stolen) {
				/* Another cpu got exclusive access to req
				 * and created a full blown socket.
				 * Try to feed this packet to this socket
				 * instead of discarding it.
				 */
				tcp_v4_restore_cb(skb);
				sock_put(sk);
				goto lookup;
			}
			goto discard_and_relse;
		}
		if (nsk == sk) {
			reqsk_put(req);
			tcp_v4_restore_cb(skb);
		} else if (tcp_child_process(sk, nsk, skb)) {
			tcp_v4_send_reset(nsk, skb);
			goto discard_and_relse;
		} else {
			sock_put(sk);
			return 0;
		}
	}
	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto discard_and_relse;
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;

	if (tcp_v4_inbound_md5_hash(sk, skb))
		goto discard_and_relse;

	nf_reset(skb);

	if (tcp_filter(sk, skb))
		goto discard_and_relse;
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
	tcp_v4_fill_cb(skb, iph, th);

	skb->dev = NULL;

	if (sk->sk_state == TCP_LISTEN) {
		ret = tcp_v4_do_rcv(sk, skb);
		goto put_and_return;
	}

	sk_incoming_cpu_update(sk);

	bh_lock_sock_nested(sk);
	tcp_segs_in(tcp_sk(sk), skb);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
		ret = tcp_v4_do_rcv(sk, skb);
	} else if (tcp_add_backlog(sk, skb)) {
		goto discard_and_relse;
	}
	bh_unlock_sock(sk);

put_and_return:
	if (refcounted)
		sock_put(sk);

	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	tcp_v4_fill_cb(skb, iph, th);

	if (tcp_checksum_complete(skb)) {
csum_error:
		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
bad_packet:
		__TCP_INC_STATS(net, TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sk_drops_add(sk, skb);
	if (refcounted)
		sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	tcp_v4_fill_cb(skb, iph, th);

	if (tcp_checksum_complete(skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto csum_error;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
							&tcp_hashinfo, skb,
							__tcp_hdrlen(th),
							iph->saddr, th->source,
							iph->daddr, th->dest,
							inet_iif(skb),
							sdif);
		if (sk2) {
			inet_twsk_deschedule_put(inet_twsk(sk));
			sk = sk2;
			tcp_v4_restore_cb(skb);
			refcounted = false;
			goto process;
		}
	}
		/* to ACK */
		/* fall through */
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		tcp_v4_send_reset(sk, skb);
		inet_twsk_deschedule_put(inet_twsk(sk));
		goto discard_it;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}

static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
	.twsk_unique	= tcp_twsk_unique,
	.twsk_destructor= tcp_twsk_destructor,
};

void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	if (dst && dst_hold_safe(dst)) {
		sk->sk_rx_dst = dst;
		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
	}
}
EXPORT_SYMBOL(inet_sk_rx_dst_set);

const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.addr2sockaddr	   = inet_csk_addr2sockaddr,
	.sockaddr_len	   = sizeof(struct sockaddr_in),
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_ip_setsockopt,
	.compat_getsockopt = compat_ip_getsockopt,
#endif
	.mtu_reduced	   = tcp_v4_mtu_reduced,
};
EXPORT_SYMBOL(ipv4_specific);

#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup	= tcp_v4_md5_lookup,
	.calc_md5_hash	= tcp_v4_md5_hash_skb,
	.md5_parse	= tcp_v4_parse_md5_keys,
};
#endif

/* NOTE: A lot of things set to zero explicitly by call to
 *       sk_alloc() so need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	tcp_init_sock(sk);

	icsk->icsk_af_ops = &ipv4_specific;

#ifdef CONFIG_TCP_MD5SIG
	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
#endif

	return 0;
}

void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	trace_tcp_destroy_sock(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	tcp_cleanup_ulp(sk);

	/* Clean up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Check if we want to disable active TFO */
	tcp_fastopen_active_disable_ofo_check(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	skb_rbtree_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
	/* Clean up the MD5 key list, if any */
	if (tp->md5sig_info) {
		tcp_clear_md5_list(sk);
		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
		tp->md5sig_info = NULL;
	}
#endif

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	BUG_ON(tp->fastopen_rsk);

	/* If socket is aborted during connect operation */
	tcp_free_fastopen_req(tp);
	tcp_fastopen_destroy_cipher(sk);
	tcp_saved_syn_free(tp);

	sk_sockets_allocated_dec(sk);
}
EXPORT_SYMBOL(tcp_v4_destroy_sock);

#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

/*
 * Get the next listener socket following cur. If cur is NULL, get the first
 * socket, starting from the bucket given in st->bucket; when st->bucket is
 * zero, the very first socket in the hash table is returned.
 */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	struct inet_listen_hashbucket *ilb;
	struct sock *sk = cur;

	if (!sk) {
get_head:
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock(&ilb->lock);
		sk = sk_head(&ilb->head);
		st->offset = 0;
		goto get_sk;
	}
	ilb = &tcp_hashinfo.listening_hash[st->bucket];
	++st->num;
	++st->offset;

	sk = sk_next(sk);
get_sk:
	sk_for_each_from(sk) {
		if (!net_eq(sock_net(sk), net))
			continue;
		if (sk->sk_family == afinfo->family)
			return sk;
	}
	spin_unlock(&ilb->lock);
	st->offset = 0;
	if (++st->bucket < INET_LHTABLE_SIZE)
		goto get_head;
	return NULL;
}

static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	st->offset = 0;
	rc = listening_get_next(seq, NULL);

	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
		--*pos;
	}
	return rc;
}

static inline bool empty_bucket(const struct tcp_iter_state *st)
{
	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
}

/*
 * Get the first established socket, starting from the bucket given in
 * st->bucket. If st->bucket is zero, the very first socket in the hash
 * is returned.
 */
static void *established_get_first(struct seq_file *seq)
{
	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	void *rc = NULL;

	st->offset = 0;
	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
		struct sock *sk;
		struct hlist_nulls_node *node;
		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);

		/* Lockless fast path for the common case of empty buckets */
		if (empty_bucket(st))
			continue;

		spin_lock_bh(lock);
		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
			if (sk->sk_family != afinfo->family ||
			    !net_eq(sock_net(sk), net)) {
				continue;
			}
			rc = sk;
			goto out;
		}
		spin_unlock_bh(lock);
	}
out:
	return rc;
}

static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
	struct sock *sk = cur;
	struct hlist_nulls_node *node;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);

	sk_nulls_for_each_from(sk, node) {
		if (sk->sk_family == afinfo->family &&
		    net_eq(sock_net(sk), net))
			return sk;
	}

	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
	++st->bucket;
	return established_get_first(seq);
}

static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}

static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state *st = seq->private;

	st->state = TCP_SEQ_STATE_LISTENING;
	rc = listening_get_idx(seq, &pos);

	if (!rc) {
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc = established_get_idx(seq, pos);
	}

	return rc;
}

static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket >= INET_LHTABLE_SIZE)
			break;
		st->state = TCP_SEQ_STATE_LISTENING;
		rc = listening_get_next(seq, NULL);
		while (offset-- && rc)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		/* Fallthrough */
	case TCP_SEQ_STATE_ESTABLISHED:
		if (st->bucket > tcp_hashinfo.ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc)
			rc = established_get_next(seq, rc);
	}

	st->num = orig_num;

	return rc;
}

void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	if (*pos && *pos == st->last_pos) {
		rc = tcp_seek_last_pos(seq);
		if (rc)
			goto out;
	}

	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	st->bucket = 0;
	st->offset = 0;
	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;

out:
	st->last_pos = *pos;
	return rc;
}
EXPORT_SYMBOL(tcp_seq_start);

void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			rc = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	st->last_pos = *pos;
	return rc;
}
EXPORT_SYMBOL(tcp_seq_next);

void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		break;
	}
}
EXPORT_SYMBOL(tcp_seq_stop);

static void get_openreq4(const struct request_sock *req,
			 struct seq_file *f, int i)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	long delta = req->rsk_timer.expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
		i,
		ireq->ir_loc_addr,
		ireq->ir_num,
		ireq->ir_rmt_addr,
		ntohs(ireq->ir_rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_delta_to_clock_t(delta),
		req->num_timeout,
		from_kuid_munged(seq_user_ns(f),
				 sock_i_uid(req->rsk_listener)),
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		0,
		req);
}

static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
{
	int timer_active;
	unsigned long timer_expires;
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct inet_sock *inet = inet_sk(sk);
	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	int rx_queue;
	int state;

	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
		timer_active = 1;
		timer_expires = icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active = 4;
		timer_expires = icsk->icsk_timeout;
	} else if (timer_pending(&sk->sk_timer)) {
		timer_active = 2;
		timer_expires = sk->sk_timer.expires;
	} else {
		timer_active = 0;
		timer_expires = jiffies;
	}

	state = inet_sk_state_load(sk);
	if (state == TCP_LISTEN)
		rx_queue = sk->sk_ack_backlog;
	else
		/* Because we don't lock the socket,
		 * we might find a transient negative value.
		 */
		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
		i, src, srcp, dest, destp, state,
		tp->write_seq - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_delta_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
		icsk->icsk_probes_out,
		sock_i_ino(sk),
		refcount_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
		tp->snd_cwnd,
		state == TCP_LISTEN ?
		    fastopenq->max_qlen :
		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}

static void get_timewait4_sock(const struct inet_timewait_sock *tw,
			       struct seq_file *f, int i)
{
	long delta = tw->tw_timer.expires - jiffies;
	__be32 dest, src;
	__u16 destp, srcp;

	dest = tw->tw_daddr;
	src = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
		refcount_read(&tw->tw_refcnt), tw);
}

#define TMPSZ 150

static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	struct sock *sk = v;

	seq_setwidth(seq, TMPSZ - 1);
	if (v == SEQ_START_TOKEN) {
		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	if (sk->sk_state == TCP_TIME_WAIT)
		get_timewait4_sock(v, seq, st->num);
	else if (sk->sk_state == TCP_NEW_SYN_RECV)
		get_openreq4(v, seq, st->num);
	else
		get_tcp4_sock(v, seq, st->num);
out:
	seq_pad(seq, '\n');
	return 0;
}

static const struct seq_operations tcp4_seq_ops = {
	.show		= tcp4_seq_show,
	.start		= tcp_seq_start,
	.next		= tcp_seq_next,
	.stop		= tcp_seq_stop,
};

static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.family		= AF_INET,
};

static int __net_init tcp4_proc_init_net(struct net *net)
{
	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
		return -ENOMEM;
	return 0;
}

static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	remove_proc_entry("tcp", net->proc_net);
}

static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};

int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}

void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
#endif /* CONFIG_PROC_FS */

struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.pre_connect		= tcp_v4_pre_connect,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.keepalive		= tcp_set_keepalive,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.sendpage		= tcp_sendpage,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.leave_memory_pressure	= tcp_leave_memory_pressure,
	.stream_memory_free	= tcp_stream_memory_free,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
	.no_autobind		= true,
#ifdef CONFIG_COMPAT
	.compat_setsockopt	= compat_tcp_setsockopt,
	.compat_getsockopt	= compat_tcp_getsockopt,
#endif
	.diag_destroy		= tcp_abort,
};
EXPORT_SYMBOL(tcp_prot);

static void __net_exit tcp_sk_exit(struct net *net)
{
	int cpu;

	module_put(net->ipv4.tcp_congestion_control->owner);

	for_each_possible_cpu(cpu)
		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
	free_percpu(net->ipv4.tcp_sk);
}

static int __net_init tcp_sk_init(struct net *net)
{
	int res, cpu, cnt;

	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
	if (!net->ipv4.tcp_sk)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		struct sock *sk;

		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
					   IPPROTO_TCP, net);
		if (res)
			goto fail;
		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
	}

	net->ipv4.sysctl_tcp_ecn = 2;
	net->ipv4.sysctl_tcp_ecn_fallback = 1;

	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;

	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;

	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
	net->ipv4.sysctl_tcp_syncookies = 1;
	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
	net->ipv4.sysctl_tcp_orphan_retries = 0;
	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
	net->ipv4.sysctl_tcp_tw_reuse = 2;

	cnt = tcp_hashinfo.ehash_mask + 1;
	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;

	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
	net->ipv4.sysctl_tcp_sack = 1;
	net->ipv4.sysctl_tcp_window_scaling = 1;
	net->ipv4.sysctl_tcp_timestamps = 1;
	net->ipv4.sysctl_tcp_early_retrans = 3;
	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
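	/* Most of the sysctl_tcp_* fields initialized here are exposed per
	 * network namespace under /proc/sys/net/ipv4/, so the constants in
	 * this function are only boot-time defaults and are routinely
	 * overridden at run time.
	 */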
	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
	net->ipv4.sysctl_tcp_retrans_collapse = 1;
	net->ipv4.sysctl_tcp_max_reordering = 300;
	net->ipv4.sysctl_tcp_dsack = 1;
	net->ipv4.sysctl_tcp_app_win = 31;
	net->ipv4.sysctl_tcp_adv_win_scale = 1;
	net->ipv4.sysctl_tcp_frto = 2;
	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
	/* This limits the percentage of the congestion window which we
	 * will allow a single TSO frame to consume. Building TSO frames
	 * which are too large can cause TCP streams to be bursty.
	 */
	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
	/* Default TSQ limit of four TSO segments */
	net->ipv4.sysctl_tcp_limit_output_bytes = 262144;
	/* rfc5961 challenge ack rate limiting */
	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
	net->ipv4.sysctl_tcp_min_tso_segs = 2;
	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
	net->ipv4.sysctl_tcp_autocorking = 1;
	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
	if (net != &init_net) {
		memcpy(net->ipv4.sysctl_tcp_rmem,
		       init_net.ipv4.sysctl_tcp_rmem,
		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
		memcpy(net->ipv4.sysctl_tcp_wmem,
		       init_net.ipv4.sysctl_tcp_wmem,
		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
	}
	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
	spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
	atomic_set(&net->ipv4.tfo_active_disable_times, 0);

	/* Reno is always built in */
	if (!net_eq(net, &init_net) &&
	    try_module_get(init_net.ipv4.tcp_congestion_control->owner))
		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
	else
		net->ipv4.tcp_congestion_control = &tcp_reno;

	return 0;
fail:
	tcp_sk_exit(net);

	return res;
}

static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
	struct net *net;

	inet_twsk_purge(&tcp_hashinfo, AF_INET);

	list_for_each_entry(net, net_exit_list, exit_list)
		tcp_fastopen_ctx_destroy(net);
}

static struct pernet_operations __net_initdata tcp_sk_ops = {
	.init	    = tcp_sk_init,
	.exit	    = tcp_sk_exit,
	.exit_batch = tcp_sk_exit_batch,
};

void __init tcp_v4_init(void)
{
	if (register_pernet_subsys(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");
}
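/* Illustrative only: the per-netns defaults installed by tcp_sk_init() above
 * surface as ordinary sysctls; the values shown assume a kernel of this
 * vintage with no distribution overrides, e.g.
 *
 *	sysctl net.ipv4.tcp_syn_retries    -> 6  (TCP_SYN_RETRIES)
 *	sysctl net.ipv4.tcp_tw_reuse       -> 2  (loopback-only TIME-WAIT reuse)
 *	sysctl net.ipv4.tcp_fastopen       -> 1  (TFO_CLIENT_ENABLE)
 *
 * Creating a new network namespace runs tcp_sk_init() again and initializes
 * that namespace's copies of these values independently.
 */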