/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *	See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *	David S. Miller		:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *	David S. Miller		:	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *	Andi Kleen		:	Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *	Andi Kleen		:	Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *	Mike McLagan		:	Routing by source
 *	Juan Jose Ciarlante	:	ip_dynaddr bits
 *	Andi Kleen		:	various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
51 */ 52 53 #define pr_fmt(fmt) "TCP: " fmt 54 55 #include <linux/bottom_half.h> 56 #include <linux/types.h> 57 #include <linux/fcntl.h> 58 #include <linux/module.h> 59 #include <linux/random.h> 60 #include <linux/cache.h> 61 #include <linux/jhash.h> 62 #include <linux/init.h> 63 #include <linux/times.h> 64 #include <linux/slab.h> 65 66 #include <net/net_namespace.h> 67 #include <net/icmp.h> 68 #include <net/inet_hashtables.h> 69 #include <net/tcp.h> 70 #include <net/transp_v6.h> 71 #include <net/ipv6.h> 72 #include <net/inet_common.h> 73 #include <net/timewait_sock.h> 74 #include <net/xfrm.h> 75 #include <net/secure_seq.h> 76 #include <net/busy_poll.h> 77 78 #include <linux/inet.h> 79 #include <linux/ipv6.h> 80 #include <linux/stddef.h> 81 #include <linux/proc_fs.h> 82 #include <linux/seq_file.h> 83 #include <linux/inetdevice.h> 84 85 #include <crypto/hash.h> 86 #include <linux/scatterlist.h> 87 88 #include <trace/events/tcp.h> 89 90 #ifdef CONFIG_TCP_MD5SIG 91 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, 92 __be32 daddr, __be32 saddr, const struct tcphdr *th); 93 #endif 94 95 struct inet_hashinfo tcp_hashinfo; 96 EXPORT_SYMBOL(tcp_hashinfo); 97 98 static u32 tcp_v4_init_seq(const struct sk_buff *skb) 99 { 100 return secure_tcp_seq(ip_hdr(skb)->daddr, 101 ip_hdr(skb)->saddr, 102 tcp_hdr(skb)->dest, 103 tcp_hdr(skb)->source); 104 } 105 106 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb) 107 { 108 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr); 109 } 110 111 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) 112 { 113 const struct inet_timewait_sock *tw = inet_twsk(sktw); 114 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw); 115 struct tcp_sock *tp = tcp_sk(sk); 116 int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse; 117 118 if (reuse == 2) { 119 /* Still does not detect *everything* that goes through 120 * lo, since we require a loopback src or dst address 121 * or direct binding to 'lo' interface. 122 */ 123 bool loopback = false; 124 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX) 125 loopback = true; 126 #if IS_ENABLED(CONFIG_IPV6) 127 if (tw->tw_family == AF_INET6) { 128 if (ipv6_addr_loopback(&tw->tw_v6_daddr) || 129 (ipv6_addr_v4mapped(&tw->tw_v6_daddr) && 130 (tw->tw_v6_daddr.s6_addr[12] == 127)) || 131 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) || 132 (ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) && 133 (tw->tw_v6_rcv_saddr.s6_addr[12] == 127))) 134 loopback = true; 135 } else 136 #endif 137 { 138 if (ipv4_is_loopback(tw->tw_daddr) || 139 ipv4_is_loopback(tw->tw_rcv_saddr)) 140 loopback = true; 141 } 142 if (!loopback) 143 reuse = 0; 144 } 145 146 /* With PAWS, it is safe from the viewpoint 147 of data integrity. Even without PAWS it is safe provided sequence 148 spaces do not overlap i.e. at data rates <= 80Mbit/sec. 149 150 Actually, the idea is close to VJ's one, only timestamp cache is 151 held not per host, but per port pair and TW bucket is used as state 152 holder. 153 154 If TW bucket has been already destroyed we fall back to VJ's scheme 155 and use initial timestamp retrieved from peer table. 156 */ 157 if (tcptw->tw_ts_recent_stamp && 158 (!twp || (reuse && get_seconds() - tcptw->tw_ts_recent_stamp > 1))) { 159 /* In case of repair and re-using TIME-WAIT sockets we still 160 * want to be sure that it is safe as above but honor the 161 * sequence numbers and time stamps set as part of the repair 162 * process. 
163 * 164 * Without this check re-using a TIME-WAIT socket with TCP 165 * repair would accumulate a -1 on the repair assigned 166 * sequence number. The first time it is reused the sequence 167 * is -1, the second time -2, etc. This fixes that issue 168 * without appearing to create any others. 169 */ 170 if (likely(!tp->repair)) { 171 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2; 172 if (tp->write_seq == 0) 173 tp->write_seq = 1; 174 tp->rx_opt.ts_recent = tcptw->tw_ts_recent; 175 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; 176 } 177 sock_hold(sktw); 178 return 1; 179 } 180 181 return 0; 182 } 183 EXPORT_SYMBOL_GPL(tcp_twsk_unique); 184 185 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr, 186 int addr_len) 187 { 188 /* This check is replicated from tcp_v4_connect() and intended to 189 * prevent BPF program called below from accessing bytes that are out 190 * of the bound specified by user in addr_len. 191 */ 192 if (addr_len < sizeof(struct sockaddr_in)) 193 return -EINVAL; 194 195 sock_owned_by_me(sk); 196 197 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr); 198 } 199 200 /* This will initiate an outgoing connection. */ 201 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) 202 { 203 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; 204 struct inet_sock *inet = inet_sk(sk); 205 struct tcp_sock *tp = tcp_sk(sk); 206 __be16 orig_sport, orig_dport; 207 __be32 daddr, nexthop; 208 struct flowi4 *fl4; 209 struct rtable *rt; 210 int err; 211 struct ip_options_rcu *inet_opt; 212 struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; 213 214 if (addr_len < sizeof(struct sockaddr_in)) 215 return -EINVAL; 216 217 if (usin->sin_family != AF_INET) 218 return -EAFNOSUPPORT; 219 220 nexthop = daddr = usin->sin_addr.s_addr; 221 inet_opt = rcu_dereference_protected(inet->inet_opt, 222 lockdep_sock_is_held(sk)); 223 if (inet_opt && inet_opt->opt.srr) { 224 if (!daddr) 225 return -EINVAL; 226 nexthop = inet_opt->opt.faddr; 227 } 228 229 orig_sport = inet->inet_sport; 230 orig_dport = usin->sin_port; 231 fl4 = &inet->cork.fl.u.ip4; 232 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr, 233 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, 234 IPPROTO_TCP, 235 orig_sport, orig_dport, sk); 236 if (IS_ERR(rt)) { 237 err = PTR_ERR(rt); 238 if (err == -ENETUNREACH) 239 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); 240 return err; 241 } 242 243 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { 244 ip_rt_put(rt); 245 return -ENETUNREACH; 246 } 247 248 if (!inet_opt || !inet_opt->opt.srr) 249 daddr = fl4->daddr; 250 251 if (!inet->inet_saddr) 252 inet->inet_saddr = fl4->saddr; 253 sk_rcv_saddr_set(sk, inet->inet_saddr); 254 255 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) { 256 /* Reset inherited state */ 257 tp->rx_opt.ts_recent = 0; 258 tp->rx_opt.ts_recent_stamp = 0; 259 if (likely(!tp->repair)) 260 tp->write_seq = 0; 261 } 262 263 inet->inet_dport = usin->sin_port; 264 sk_daddr_set(sk, daddr); 265 266 inet_csk(sk)->icsk_ext_hdr_len = 0; 267 if (inet_opt) 268 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 269 270 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT; 271 272 /* Socket identity is still unknown (sport may be zero). 273 * However we set state to SYN-SENT and not releasing socket 274 * lock select source port, enter ourselves into the hash tables and 275 * complete initialization after this. 
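 *
 * Illustrative sketch (userspace, not part of this file; the destination
 * address below is a TEST-NET-1 placeholder and the helper name is made up):
 * because the source port is only chosen at this point, by
 * inet_hash_connect(), an application that never called bind() can observe
 * the selected ephemeral port with getsockname() once connect() has been
 * issued.
 *
 *	#include <arpa/inet.h>
 *	#include <stdio.h>
 *	#include <sys/socket.h>
 *	#include <unistd.h>
 *
 *	static int connect_and_show_sport(void)
 *	{
 *		struct sockaddr_in dst = { .sin_family = AF_INET,
 *					   .sin_port = htons(80) };
 *		struct sockaddr_in local;
 *		socklen_t len = sizeof(local);
 *		int fd = socket(AF_INET, SOCK_STREAM, 0);
 *
 *		if (fd < 0)
 *			return -1;
 *		inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *		if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0) {
 *			close(fd);
 *			return -1;
 *		}
 *		getsockname(fd, (struct sockaddr *)&local, &len);
 *		printf("ephemeral source port: %u\n", ntohs(local.sin_port));
 *		close(fd);
 *		return 0;
 *	}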
276 */ 277 tcp_set_state(sk, TCP_SYN_SENT); 278 err = inet_hash_connect(tcp_death_row, sk); 279 if (err) 280 goto failure; 281 282 sk_set_txhash(sk); 283 284 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport, 285 inet->inet_sport, inet->inet_dport, sk); 286 if (IS_ERR(rt)) { 287 err = PTR_ERR(rt); 288 rt = NULL; 289 goto failure; 290 } 291 /* OK, now commit destination to socket. */ 292 sk->sk_gso_type = SKB_GSO_TCPV4; 293 sk_setup_caps(sk, &rt->dst); 294 rt = NULL; 295 296 if (likely(!tp->repair)) { 297 if (!tp->write_seq) 298 tp->write_seq = secure_tcp_seq(inet->inet_saddr, 299 inet->inet_daddr, 300 inet->inet_sport, 301 usin->sin_port); 302 tp->tsoffset = secure_tcp_ts_off(sock_net(sk), 303 inet->inet_saddr, 304 inet->inet_daddr); 305 } 306 307 inet->inet_id = tp->write_seq ^ jiffies; 308 309 if (tcp_fastopen_defer_connect(sk, &err)) 310 return err; 311 if (err) 312 goto failure; 313 314 err = tcp_connect(sk); 315 316 if (err) 317 goto failure; 318 319 return 0; 320 321 failure: 322 /* 323 * This unhashes the socket and releases the local port, 324 * if necessary. 325 */ 326 tcp_set_state(sk, TCP_CLOSE); 327 ip_rt_put(rt); 328 sk->sk_route_caps = 0; 329 inet->inet_dport = 0; 330 return err; 331 } 332 EXPORT_SYMBOL(tcp_v4_connect); 333 334 /* 335 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191. 336 * It can be called through tcp_release_cb() if socket was owned by user 337 * at the time tcp_v4_err() was called to handle ICMP message. 338 */ 339 void tcp_v4_mtu_reduced(struct sock *sk) 340 { 341 struct inet_sock *inet = inet_sk(sk); 342 struct dst_entry *dst; 343 u32 mtu; 344 345 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) 346 return; 347 mtu = tcp_sk(sk)->mtu_info; 348 dst = inet_csk_update_pmtu(sk, mtu); 349 if (!dst) 350 return; 351 352 /* Something is about to be wrong... Remember soft error 353 * for the case, if this connection will not able to recover. 354 */ 355 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst)) 356 sk->sk_err_soft = EMSGSIZE; 357 358 mtu = dst_mtu(dst); 359 360 if (inet->pmtudisc != IP_PMTUDISC_DONT && 361 ip_sk_accept_pmtu(sk) && 362 inet_csk(sk)->icsk_pmtu_cookie > mtu) { 363 tcp_sync_mss(sk, mtu); 364 365 /* Resend the TCP packet because it's 366 * clear that the old packet has been 367 * dropped. This is the new "fast" path mtu 368 * discovery. 369 */ 370 tcp_simple_retransmit(sk); 371 } /* else let the usual retransmit timer handle it */ 372 } 373 EXPORT_SYMBOL(tcp_v4_mtu_reduced); 374 375 static void do_redirect(struct sk_buff *skb, struct sock *sk) 376 { 377 struct dst_entry *dst = __sk_dst_check(sk, 0); 378 379 if (dst) 380 dst->ops->redirect(dst, sk, skb); 381 } 382 383 384 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */ 385 void tcp_req_err(struct sock *sk, u32 seq, bool abort) 386 { 387 struct request_sock *req = inet_reqsk(sk); 388 struct net *net = sock_net(sk); 389 390 /* ICMPs are not backlogged, hence we cannot get 391 * an established socket here. 392 */ 393 if (seq != tcp_rsk(req)->snt_isn) { 394 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); 395 } else if (abort) { 396 /* 397 * Still in SYN_RECV, just remove it silently. 398 * There is no good way to pass the error to the newly 399 * created socket, and POSIX does not want network 400 * errors returned from accept(). 
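 *
 * Illustrative sketch (userspace, relating to tcp_v4_mtu_reduced() above
 * rather than to the request-socket handling here; the helper name is made
 * up): once an ICMP fragmentation-needed message has lowered the cached path
 * MTU, a connected socket can read the current value with the IP_MTU
 * getsockopt described in ip(7).
 *
 *	#include <netinet/in.h>
 *	#include <sys/socket.h>
 *
 *	static int current_path_mtu(int connected_fd)
 *	{
 *		int mtu = 0;
 *		socklen_t len = sizeof(mtu);
 *
 *		if (getsockopt(connected_fd, IPPROTO_IP, IP_MTU, &mtu, &len) < 0)
 *			return -1;
 *		return mtu;
 *	}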
401 */ 402 inet_csk_reqsk_queue_drop(req->rsk_listener, req); 403 tcp_listendrop(req->rsk_listener); 404 } 405 reqsk_put(req); 406 } 407 EXPORT_SYMBOL(tcp_req_err); 408 409 /* 410 * This routine is called by the ICMP module when it gets some 411 * sort of error condition. If err < 0 then the socket should 412 * be closed and the error returned to the user. If err > 0 413 * it's just the icmp type << 8 | icmp code. After adjustment 414 * header points to the first 8 bytes of the tcp header. We need 415 * to find the appropriate port. 416 * 417 * The locking strategy used here is very "optimistic". When 418 * someone else accesses the socket the ICMP is just dropped 419 * and for some paths there is no check at all. 420 * A more general error queue to queue errors for later handling 421 * is probably better. 422 * 423 */ 424 425 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) 426 { 427 const struct iphdr *iph = (const struct iphdr *)icmp_skb->data; 428 struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2)); 429 struct inet_connection_sock *icsk; 430 struct tcp_sock *tp; 431 struct inet_sock *inet; 432 const int type = icmp_hdr(icmp_skb)->type; 433 const int code = icmp_hdr(icmp_skb)->code; 434 struct sock *sk; 435 struct sk_buff *skb; 436 struct request_sock *fastopen; 437 u32 seq, snd_una; 438 s32 remaining; 439 u32 delta_us; 440 int err; 441 struct net *net = dev_net(icmp_skb->dev); 442 443 sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr, 444 th->dest, iph->saddr, ntohs(th->source), 445 inet_iif(icmp_skb), 0); 446 if (!sk) { 447 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); 448 return; 449 } 450 if (sk->sk_state == TCP_TIME_WAIT) { 451 inet_twsk_put(inet_twsk(sk)); 452 return; 453 } 454 seq = ntohl(th->seq); 455 if (sk->sk_state == TCP_NEW_SYN_RECV) 456 return tcp_req_err(sk, seq, 457 type == ICMP_PARAMETERPROB || 458 type == ICMP_TIME_EXCEEDED || 459 (type == ICMP_DEST_UNREACH && 460 (code == ICMP_NET_UNREACH || 461 code == ICMP_HOST_UNREACH))); 462 463 bh_lock_sock(sk); 464 /* If too many ICMPs get dropped on busy 465 * servers this needs to be solved differently. 466 * We do take care of PMTU discovery (RFC1191) special case : 467 * we can receive locally generated ICMP messages while socket is held. 468 */ 469 if (sock_owned_by_user(sk)) { 470 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)) 471 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS); 472 } 473 if (sk->sk_state == TCP_CLOSE) 474 goto out; 475 476 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) { 477 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 478 goto out; 479 } 480 481 icsk = inet_csk(sk); 482 tp = tcp_sk(sk); 483 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ 484 fastopen = tp->fastopen_rsk; 485 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una; 486 if (sk->sk_state != TCP_LISTEN && 487 !between(seq, snd_una, tp->snd_nxt)) { 488 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); 489 goto out; 490 } 491 492 switch (type) { 493 case ICMP_REDIRECT: 494 if (!sock_owned_by_user(sk)) 495 do_redirect(icmp_skb, sk); 496 goto out; 497 case ICMP_SOURCE_QUENCH: 498 /* Just silently ignore these. 
*/ 499 goto out; 500 case ICMP_PARAMETERPROB: 501 err = EPROTO; 502 break; 503 case ICMP_DEST_UNREACH: 504 if (code > NR_ICMP_UNREACH) 505 goto out; 506 507 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ 508 /* We are not interested in TCP_LISTEN and open_requests 509 * (SYN-ACKs send out by Linux are always <576bytes so 510 * they should go through unfragmented). 511 */ 512 if (sk->sk_state == TCP_LISTEN) 513 goto out; 514 515 tp->mtu_info = info; 516 if (!sock_owned_by_user(sk)) { 517 tcp_v4_mtu_reduced(sk); 518 } else { 519 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags)) 520 sock_hold(sk); 521 } 522 goto out; 523 } 524 525 err = icmp_err_convert[code].errno; 526 /* check if icmp_skb allows revert of backoff 527 * (see draft-zimmermann-tcp-lcd) */ 528 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH) 529 break; 530 if (seq != tp->snd_una || !icsk->icsk_retransmits || 531 !icsk->icsk_backoff || fastopen) 532 break; 533 534 if (sock_owned_by_user(sk)) 535 break; 536 537 icsk->icsk_backoff--; 538 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : 539 TCP_TIMEOUT_INIT; 540 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); 541 542 skb = tcp_rtx_queue_head(sk); 543 BUG_ON(!skb); 544 545 tcp_mstamp_refresh(tp); 546 delta_us = (u32)(tp->tcp_mstamp - skb->skb_mstamp); 547 remaining = icsk->icsk_rto - 548 usecs_to_jiffies(delta_us); 549 550 if (remaining > 0) { 551 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, 552 remaining, TCP_RTO_MAX); 553 } else { 554 /* RTO revert clocked out retransmission. 555 * Will retransmit now */ 556 tcp_retransmit_timer(sk); 557 } 558 559 break; 560 case ICMP_TIME_EXCEEDED: 561 err = EHOSTUNREACH; 562 break; 563 default: 564 goto out; 565 } 566 567 switch (sk->sk_state) { 568 case TCP_SYN_SENT: 569 case TCP_SYN_RECV: 570 /* Only in fast or simultaneous open. If a fast open socket is 571 * is already accepted it is treated as a connected one below. 572 */ 573 if (fastopen && !fastopen->sk) 574 break; 575 576 if (!sock_owned_by_user(sk)) { 577 sk->sk_err = err; 578 579 sk->sk_error_report(sk); 580 581 tcp_done(sk); 582 } else { 583 sk->sk_err_soft = err; 584 } 585 goto out; 586 } 587 588 /* If we've already connected we will keep trying 589 * until we time out, or the user gives up. 590 * 591 * rfc1122 4.2.3.9 allows to consider as hard errors 592 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too, 593 * but it is obsoleted by pmtu discovery). 594 * 595 * Note, that in modern internet, where routing is unreliable 596 * and in each dark corner broken firewalls sit, sending random 597 * errors ordered by their masters even this two messages finally lose 598 * their original sense (even Linux sends invalid PORT_UNREACHs) 599 * 600 * Now we are in compliance with RFCs. 601 * --ANK (980905) 602 */ 603 604 inet = inet_sk(sk); 605 if (!sock_owned_by_user(sk) && inet->recverr) { 606 sk->sk_err = err; 607 sk->sk_error_report(sk); 608 } else { /* Only an error on timeout */ 609 sk->sk_err_soft = err; 610 } 611 612 out: 613 bh_unlock_sock(sk); 614 sock_put(sk); 615 } 616 617 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr) 618 { 619 struct tcphdr *th = tcp_hdr(skb); 620 621 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0); 622 skb->csum_start = skb_transport_header(skb) - skb->head; 623 skb->csum_offset = offsetof(struct tcphdr, check); 624 } 625 626 /* This routine computes an IPv4 TCP checksum. 
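 *
 * Illustrative sketch (a self-contained restatement, not the csum helpers this
 * file actually uses; the function name is made up and linux/types.h integer
 * types are assumed): the checksum is the 16-bit one's-complement sum of a
 * pseudo-header (source address, destination address, a zero byte, the
 * protocol number and the TCP length) followed by the TCP header, with its
 * check field zeroed, and the payload (RFC 793, RFC 1071).
 *
 *	static u16 tcp_v4_csum_sketch(__be32 saddr, __be32 daddr,
 *				      const u8 *tcp, size_t len)
 *	{
 *		u32 sum = 0;
 *		size_t i;
 *
 *		sum += (ntohl(saddr) >> 16) + (ntohl(saddr) & 0xffff);
 *		sum += (ntohl(daddr) >> 16) + (ntohl(daddr) & 0xffff);
 *		sum += IPPROTO_TCP + len;
 *		for (i = 0; i + 1 < len; i += 2)
 *			sum += (tcp[i] << 8) | tcp[i + 1];
 *		if (len & 1)
 *			sum += tcp[len - 1] << 8;
 *		while (sum >> 16)
 *			sum = (sum & 0xffff) + (sum >> 16);
 *		return (u16)~sum;
 *	}
 *
 * The result is in host byte order; it would be stored into the header's
 * check field with htons().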
*/ 627 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb) 628 { 629 const struct inet_sock *inet = inet_sk(sk); 630 631 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr); 632 } 633 EXPORT_SYMBOL(tcp_v4_send_check); 634 635 /* 636 * This routine will send an RST to the other tcp. 637 * 638 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.) 639 * for reset. 640 * Answer: if a packet caused RST, it is not for a socket 641 * existing in our system, if it is matched to a socket, 642 * it is just duplicate segment or bug in other side's TCP. 643 * So that we build reply only basing on parameters 644 * arrived with segment. 645 * Exception: precedence violation. We do not implement it in any case. 646 */ 647 648 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) 649 { 650 const struct tcphdr *th = tcp_hdr(skb); 651 struct { 652 struct tcphdr th; 653 #ifdef CONFIG_TCP_MD5SIG 654 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)]; 655 #endif 656 } rep; 657 struct ip_reply_arg arg; 658 #ifdef CONFIG_TCP_MD5SIG 659 struct tcp_md5sig_key *key = NULL; 660 const __u8 *hash_location = NULL; 661 unsigned char newhash[16]; 662 int genhash; 663 struct sock *sk1 = NULL; 664 #endif 665 struct net *net; 666 struct sock *ctl_sk; 667 668 /* Never send a reset in response to a reset. */ 669 if (th->rst) 670 return; 671 672 /* If sk not NULL, it means we did a successful lookup and incoming 673 * route had to be correct. prequeue might have dropped our dst. 674 */ 675 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL) 676 return; 677 678 /* Swap the send and the receive. */ 679 memset(&rep, 0, sizeof(rep)); 680 rep.th.dest = th->source; 681 rep.th.source = th->dest; 682 rep.th.doff = sizeof(struct tcphdr) / 4; 683 rep.th.rst = 1; 684 685 if (th->ack) { 686 rep.th.seq = th->ack_seq; 687 } else { 688 rep.th.ack = 1; 689 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin + 690 skb->len - (th->doff << 2)); 691 } 692 693 memset(&arg, 0, sizeof(arg)); 694 arg.iov[0].iov_base = (unsigned char *)&rep; 695 arg.iov[0].iov_len = sizeof(rep.th); 696 697 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev); 698 #ifdef CONFIG_TCP_MD5SIG 699 rcu_read_lock(); 700 hash_location = tcp_parse_md5sig_option(th); 701 if (sk && sk_fullsock(sk)) { 702 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *) 703 &ip_hdr(skb)->saddr, AF_INET); 704 } else if (hash_location) { 705 /* 706 * active side is lost. Try to find listening socket through 707 * source port, and then find md5 key through listening socket. 708 * we are not loose security here: 709 * Incoming packet is checked with md5 hash with finding key, 710 * no RST generated if md5 hash doesn't match. 
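 *
 * Illustrative sketch of what tcp_parse_md5sig_option() has to do (a
 * simplified, hypothetical helper, not the in-tree parser): walk the TCP
 * option space and return a pointer to the 16-byte digest of the RFC 2385
 * signature option, kind TCPOPT_MD5SIG (19), length TCPOLEN_MD5SIG (18).
 *
 *	static const u8 *find_md5_digest(const struct tcphdr *th)
 *	{
 *		int len = (th->doff << 2) - sizeof(*th);
 *		const u8 *p = (const u8 *)(th + 1);
 *
 *		while (len > 0) {
 *			u8 kind = *p;
 *
 *			if (kind == TCPOPT_EOL)
 *				break;
 *			if (kind == TCPOPT_NOP) {
 *				p++;
 *				len--;
 *				continue;
 *			}
 *			if (len < 2 || p[1] < 2 || p[1] > len)
 *				break;
 *			if (kind == TCPOPT_MD5SIG && p[1] == TCPOLEN_MD5SIG)
 *				return p + 2;
 *			len -= p[1];
 *			p += p[1];
 *		}
 *		return NULL;
 *	}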
711 */ 712 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0, 713 ip_hdr(skb)->saddr, 714 th->source, ip_hdr(skb)->daddr, 715 ntohs(th->source), inet_iif(skb), 716 tcp_v4_sdif(skb)); 717 /* don't send rst if it can't find key */ 718 if (!sk1) 719 goto out; 720 721 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *) 722 &ip_hdr(skb)->saddr, AF_INET); 723 if (!key) 724 goto out; 725 726 727 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb); 728 if (genhash || memcmp(hash_location, newhash, 16) != 0) 729 goto out; 730 731 } 732 733 if (key) { 734 rep.opt[0] = htonl((TCPOPT_NOP << 24) | 735 (TCPOPT_NOP << 16) | 736 (TCPOPT_MD5SIG << 8) | 737 TCPOLEN_MD5SIG); 738 /* Update length and the length the header thinks exists */ 739 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; 740 rep.th.doff = arg.iov[0].iov_len / 4; 741 742 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1], 743 key, ip_hdr(skb)->saddr, 744 ip_hdr(skb)->daddr, &rep.th); 745 } 746 #endif 747 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 748 ip_hdr(skb)->saddr, /* XXX */ 749 arg.iov[0].iov_len, IPPROTO_TCP, 0); 750 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 751 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0; 752 753 /* When socket is gone, all binding information is lost. 754 * routing might fail in this case. No choice here, if we choose to force 755 * input interface, we will misroute in case of asymmetric route. 756 */ 757 if (sk) { 758 arg.bound_dev_if = sk->sk_bound_dev_if; 759 if (sk_fullsock(sk)) 760 trace_tcp_send_reset(sk, skb); 761 } 762 763 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) != 764 offsetof(struct inet_timewait_sock, tw_bound_dev_if)); 765 766 arg.tos = ip_hdr(skb)->tos; 767 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); 768 local_bh_disable(); 769 ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk); 770 if (sk) 771 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 772 inet_twsk(sk)->tw_mark : sk->sk_mark; 773 ip_send_unicast_reply(ctl_sk, 774 skb, &TCP_SKB_CB(skb)->header.h4.opt, 775 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 776 &arg, arg.iov[0].iov_len); 777 778 ctl_sk->sk_mark = 0; 779 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); 780 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS); 781 local_bh_enable(); 782 783 #ifdef CONFIG_TCP_MD5SIG 784 out: 785 rcu_read_unlock(); 786 #endif 787 } 788 789 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states 790 outside socket context is ugly, certainly. What can I do? 791 */ 792 793 static void tcp_v4_send_ack(const struct sock *sk, 794 struct sk_buff *skb, u32 seq, u32 ack, 795 u32 win, u32 tsval, u32 tsecr, int oif, 796 struct tcp_md5sig_key *key, 797 int reply_flags, u8 tos) 798 { 799 const struct tcphdr *th = tcp_hdr(skb); 800 struct { 801 struct tcphdr th; 802 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2) 803 #ifdef CONFIG_TCP_MD5SIG 804 + (TCPOLEN_MD5SIG_ALIGNED >> 2) 805 #endif 806 ]; 807 } rep; 808 struct net *net = sock_net(sk); 809 struct ip_reply_arg arg; 810 struct sock *ctl_sk; 811 812 memset(&rep.th, 0, sizeof(struct tcphdr)); 813 memset(&arg, 0, sizeof(arg)); 814 815 arg.iov[0].iov_base = (unsigned char *)&rep; 816 arg.iov[0].iov_len = sizeof(rep.th); 817 if (tsecr) { 818 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | 819 (TCPOPT_TIMESTAMP << 8) | 820 TCPOLEN_TIMESTAMP); 821 rep.opt[1] = htonl(tsval); 822 rep.opt[2] = htonl(tsecr); 823 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED; 824 } 825 826 /* Swap the send and the receive. 
*/ 827 rep.th.dest = th->source; 828 rep.th.source = th->dest; 829 rep.th.doff = arg.iov[0].iov_len / 4; 830 rep.th.seq = htonl(seq); 831 rep.th.ack_seq = htonl(ack); 832 rep.th.ack = 1; 833 rep.th.window = htons(win); 834 835 #ifdef CONFIG_TCP_MD5SIG 836 if (key) { 837 int offset = (tsecr) ? 3 : 0; 838 839 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) | 840 (TCPOPT_NOP << 16) | 841 (TCPOPT_MD5SIG << 8) | 842 TCPOLEN_MD5SIG); 843 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; 844 rep.th.doff = arg.iov[0].iov_len/4; 845 846 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset], 847 key, ip_hdr(skb)->saddr, 848 ip_hdr(skb)->daddr, &rep.th); 849 } 850 #endif 851 arg.flags = reply_flags; 852 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 853 ip_hdr(skb)->saddr, /* XXX */ 854 arg.iov[0].iov_len, IPPROTO_TCP, 0); 855 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 856 if (oif) 857 arg.bound_dev_if = oif; 858 arg.tos = tos; 859 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL); 860 local_bh_disable(); 861 ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk); 862 if (sk) 863 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 864 inet_twsk(sk)->tw_mark : sk->sk_mark; 865 ip_send_unicast_reply(ctl_sk, 866 skb, &TCP_SKB_CB(skb)->header.h4.opt, 867 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 868 &arg, arg.iov[0].iov_len); 869 870 ctl_sk->sk_mark = 0; 871 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); 872 local_bh_enable(); 873 } 874 875 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) 876 { 877 struct inet_timewait_sock *tw = inet_twsk(sk); 878 struct tcp_timewait_sock *tcptw = tcp_twsk(sk); 879 880 tcp_v4_send_ack(sk, skb, 881 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, 882 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, 883 tcp_time_stamp_raw() + tcptw->tw_ts_offset, 884 tcptw->tw_ts_recent, 885 tw->tw_bound_dev_if, 886 tcp_twsk_md5_key(tcptw), 887 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0, 888 tw->tw_tos 889 ); 890 891 inet_twsk_put(tw); 892 } 893 894 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, 895 struct request_sock *req) 896 { 897 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV 898 * sk->sk_state == TCP_SYN_RECV -> for Fast Open. 899 */ 900 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 : 901 tcp_sk(sk)->snd_nxt; 902 903 /* RFC 7323 2.3 904 * The window field (SEG.WND) of every outgoing segment, with the 905 * exception of <SYN> segments, MUST be right-shifted by 906 * Rcv.Wind.Shift bits: 907 */ 908 tcp_v4_send_ack(sk, skb, seq, 909 tcp_rsk(req)->rcv_nxt, 910 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, 911 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off, 912 req->ts_recent, 913 0, 914 tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr, 915 AF_INET), 916 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0, 917 ip_hdr(skb)->tos); 918 } 919 920 /* 921 * Send a SYN-ACK after having received a SYN. 922 * This still operates on a request_sock only, not on a big 923 * socket. 924 */ 925 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, 926 struct flowi *fl, 927 struct request_sock *req, 928 struct tcp_fastopen_cookie *foc, 929 enum tcp_synack_type synack_type) 930 { 931 const struct inet_request_sock *ireq = inet_rsk(req); 932 struct flowi4 fl4; 933 int err = -1; 934 struct sk_buff *skb; 935 936 /* First, grab a route. 
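 *
 * Illustrative sketch (relating to the RFC 7323 note in
 * tcp_v4_reqsk_send_ack() above, not to the routing step below; the helper
 * names are made up and the kernel's min_t() macro is assumed): every non-SYN
 * segment advertises the receive window right-shifted by the negotiated scale
 * factor, and the peer undoes the shift, so the advertised window has a
 * granularity of (1 << wscale) bytes.
 *
 *	static u16 advertised_window(u32 rcv_wnd, u8 wscale)
 *	{
 *		return (u16)min_t(u32, rcv_wnd >> wscale, 65535U);
 *	}
 *
 *	static u32 effective_window(u16 seg_wnd, u8 wscale)
 *	{
 *		return (u32)seg_wnd << wscale;
 *	}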
*/ 937 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) 938 return -1; 939 940 skb = tcp_make_synack(sk, dst, req, foc, synack_type); 941 942 if (skb) { 943 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); 944 945 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, 946 ireq->ir_rmt_addr, 947 ireq_opt_deref(ireq)); 948 err = net_xmit_eval(err); 949 } 950 951 return err; 952 } 953 954 /* 955 * IPv4 request_sock destructor. 956 */ 957 static void tcp_v4_reqsk_destructor(struct request_sock *req) 958 { 959 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1)); 960 } 961 962 #ifdef CONFIG_TCP_MD5SIG 963 /* 964 * RFC2385 MD5 checksumming requires a mapping of 965 * IP address->MD5 Key. 966 * We need to maintain these in the sk structure. 967 */ 968 969 /* Find the Key structure for an address. */ 970 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk, 971 const union tcp_md5_addr *addr, 972 int family) 973 { 974 const struct tcp_sock *tp = tcp_sk(sk); 975 struct tcp_md5sig_key *key; 976 const struct tcp_md5sig_info *md5sig; 977 __be32 mask; 978 struct tcp_md5sig_key *best_match = NULL; 979 bool match; 980 981 /* caller either holds rcu_read_lock() or socket lock */ 982 md5sig = rcu_dereference_check(tp->md5sig_info, 983 lockdep_sock_is_held(sk)); 984 if (!md5sig) 985 return NULL; 986 987 hlist_for_each_entry_rcu(key, &md5sig->head, node) { 988 if (key->family != family) 989 continue; 990 991 if (family == AF_INET) { 992 mask = inet_make_mask(key->prefixlen); 993 match = (key->addr.a4.s_addr & mask) == 994 (addr->a4.s_addr & mask); 995 #if IS_ENABLED(CONFIG_IPV6) 996 } else if (family == AF_INET6) { 997 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6, 998 key->prefixlen); 999 #endif 1000 } else { 1001 match = false; 1002 } 1003 1004 if (match && (!best_match || 1005 key->prefixlen > best_match->prefixlen)) 1006 best_match = key; 1007 } 1008 return best_match; 1009 } 1010 EXPORT_SYMBOL(tcp_md5_do_lookup); 1011 1012 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk, 1013 const union tcp_md5_addr *addr, 1014 int family, u8 prefixlen) 1015 { 1016 const struct tcp_sock *tp = tcp_sk(sk); 1017 struct tcp_md5sig_key *key; 1018 unsigned int size = sizeof(struct in_addr); 1019 const struct tcp_md5sig_info *md5sig; 1020 1021 /* caller either holds rcu_read_lock() or socket lock */ 1022 md5sig = rcu_dereference_check(tp->md5sig_info, 1023 lockdep_sock_is_held(sk)); 1024 if (!md5sig) 1025 return NULL; 1026 #if IS_ENABLED(CONFIG_IPV6) 1027 if (family == AF_INET6) 1028 size = sizeof(struct in6_addr); 1029 #endif 1030 hlist_for_each_entry_rcu(key, &md5sig->head, node) { 1031 if (key->family != family) 1032 continue; 1033 if (!memcmp(&key->addr, addr, size) && 1034 key->prefixlen == prefixlen) 1035 return key; 1036 } 1037 return NULL; 1038 } 1039 1040 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, 1041 const struct sock *addr_sk) 1042 { 1043 const union tcp_md5_addr *addr; 1044 1045 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr; 1046 return tcp_md5_do_lookup(sk, addr, AF_INET); 1047 } 1048 EXPORT_SYMBOL(tcp_v4_md5_lookup); 1049 1050 /* This can be called on a newly created socket, from other files */ 1051 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, 1052 int family, u8 prefixlen, const u8 *newkey, u8 newkeylen, 1053 gfp_t gfp) 1054 { 1055 /* Add Key to the list */ 1056 struct tcp_md5sig_key *key; 1057 struct tcp_sock *tp = tcp_sk(sk); 1058 struct tcp_md5sig_info *md5sig; 1059 1060 key = 
tcp_md5_do_lookup_exact(sk, addr, family, prefixlen); 1061 if (key) { 1062 /* Pre-existing entry - just update that one. */ 1063 memcpy(key->key, newkey, newkeylen); 1064 key->keylen = newkeylen; 1065 return 0; 1066 } 1067 1068 md5sig = rcu_dereference_protected(tp->md5sig_info, 1069 lockdep_sock_is_held(sk)); 1070 if (!md5sig) { 1071 md5sig = kmalloc(sizeof(*md5sig), gfp); 1072 if (!md5sig) 1073 return -ENOMEM; 1074 1075 sk_nocaps_add(sk, NETIF_F_GSO_MASK); 1076 INIT_HLIST_HEAD(&md5sig->head); 1077 rcu_assign_pointer(tp->md5sig_info, md5sig); 1078 } 1079 1080 key = sock_kmalloc(sk, sizeof(*key), gfp); 1081 if (!key) 1082 return -ENOMEM; 1083 if (!tcp_alloc_md5sig_pool()) { 1084 sock_kfree_s(sk, key, sizeof(*key)); 1085 return -ENOMEM; 1086 } 1087 1088 memcpy(key->key, newkey, newkeylen); 1089 key->keylen = newkeylen; 1090 key->family = family; 1091 key->prefixlen = prefixlen; 1092 memcpy(&key->addr, addr, 1093 (family == AF_INET6) ? sizeof(struct in6_addr) : 1094 sizeof(struct in_addr)); 1095 hlist_add_head_rcu(&key->node, &md5sig->head); 1096 return 0; 1097 } 1098 EXPORT_SYMBOL(tcp_md5_do_add); 1099 1100 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family, 1101 u8 prefixlen) 1102 { 1103 struct tcp_md5sig_key *key; 1104 1105 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen); 1106 if (!key) 1107 return -ENOENT; 1108 hlist_del_rcu(&key->node); 1109 atomic_sub(sizeof(*key), &sk->sk_omem_alloc); 1110 kfree_rcu(key, rcu); 1111 return 0; 1112 } 1113 EXPORT_SYMBOL(tcp_md5_do_del); 1114 1115 static void tcp_clear_md5_list(struct sock *sk) 1116 { 1117 struct tcp_sock *tp = tcp_sk(sk); 1118 struct tcp_md5sig_key *key; 1119 struct hlist_node *n; 1120 struct tcp_md5sig_info *md5sig; 1121 1122 md5sig = rcu_dereference_protected(tp->md5sig_info, 1); 1123 1124 hlist_for_each_entry_safe(key, n, &md5sig->head, node) { 1125 hlist_del_rcu(&key->node); 1126 atomic_sub(sizeof(*key), &sk->sk_omem_alloc); 1127 kfree_rcu(key, rcu); 1128 } 1129 } 1130 1131 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, 1132 char __user *optval, int optlen) 1133 { 1134 struct tcp_md5sig cmd; 1135 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr; 1136 u8 prefixlen = 32; 1137 1138 if (optlen < sizeof(cmd)) 1139 return -EINVAL; 1140 1141 if (copy_from_user(&cmd, optval, sizeof(cmd))) 1142 return -EFAULT; 1143 1144 if (sin->sin_family != AF_INET) 1145 return -EINVAL; 1146 1147 if (optname == TCP_MD5SIG_EXT && 1148 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) { 1149 prefixlen = cmd.tcpm_prefixlen; 1150 if (prefixlen > 32) 1151 return -EINVAL; 1152 } 1153 1154 if (!cmd.tcpm_keylen) 1155 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr, 1156 AF_INET, prefixlen); 1157 1158 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) 1159 return -EINVAL; 1160 1161 return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr, 1162 AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen, 1163 GFP_KERNEL); 1164 } 1165 1166 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp, 1167 __be32 daddr, __be32 saddr, 1168 const struct tcphdr *th, int nbytes) 1169 { 1170 struct tcp4_pseudohdr *bp; 1171 struct scatterlist sg; 1172 struct tcphdr *_th; 1173 1174 bp = hp->scratch; 1175 bp->saddr = saddr; 1176 bp->daddr = daddr; 1177 bp->pad = 0; 1178 bp->protocol = IPPROTO_TCP; 1179 bp->len = cpu_to_be16(nbytes); 1180 1181 _th = (struct tcphdr *)(bp + 1); 1182 memcpy(_th, th, sizeof(*th)); 1183 _th->check = 0; 1184 1185 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th)); 1186 
ahash_request_set_crypt(hp->md5_req, &sg, NULL, 1187 sizeof(*bp) + sizeof(*th)); 1188 return crypto_ahash_update(hp->md5_req); 1189 } 1190 1191 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, 1192 __be32 daddr, __be32 saddr, const struct tcphdr *th) 1193 { 1194 struct tcp_md5sig_pool *hp; 1195 struct ahash_request *req; 1196 1197 hp = tcp_get_md5sig_pool(); 1198 if (!hp) 1199 goto clear_hash_noput; 1200 req = hp->md5_req; 1201 1202 if (crypto_ahash_init(req)) 1203 goto clear_hash; 1204 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2)) 1205 goto clear_hash; 1206 if (tcp_md5_hash_key(hp, key)) 1207 goto clear_hash; 1208 ahash_request_set_crypt(req, NULL, md5_hash, 0); 1209 if (crypto_ahash_final(req)) 1210 goto clear_hash; 1211 1212 tcp_put_md5sig_pool(); 1213 return 0; 1214 1215 clear_hash: 1216 tcp_put_md5sig_pool(); 1217 clear_hash_noput: 1218 memset(md5_hash, 0, 16); 1219 return 1; 1220 } 1221 1222 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, 1223 const struct sock *sk, 1224 const struct sk_buff *skb) 1225 { 1226 struct tcp_md5sig_pool *hp; 1227 struct ahash_request *req; 1228 const struct tcphdr *th = tcp_hdr(skb); 1229 __be32 saddr, daddr; 1230 1231 if (sk) { /* valid for establish/request sockets */ 1232 saddr = sk->sk_rcv_saddr; 1233 daddr = sk->sk_daddr; 1234 } else { 1235 const struct iphdr *iph = ip_hdr(skb); 1236 saddr = iph->saddr; 1237 daddr = iph->daddr; 1238 } 1239 1240 hp = tcp_get_md5sig_pool(); 1241 if (!hp) 1242 goto clear_hash_noput; 1243 req = hp->md5_req; 1244 1245 if (crypto_ahash_init(req)) 1246 goto clear_hash; 1247 1248 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len)) 1249 goto clear_hash; 1250 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2)) 1251 goto clear_hash; 1252 if (tcp_md5_hash_key(hp, key)) 1253 goto clear_hash; 1254 ahash_request_set_crypt(req, NULL, md5_hash, 0); 1255 if (crypto_ahash_final(req)) 1256 goto clear_hash; 1257 1258 tcp_put_md5sig_pool(); 1259 return 0; 1260 1261 clear_hash: 1262 tcp_put_md5sig_pool(); 1263 clear_hash_noput: 1264 memset(md5_hash, 0, 16); 1265 return 1; 1266 } 1267 EXPORT_SYMBOL(tcp_v4_md5_hash_skb); 1268 1269 #endif 1270 1271 /* Called with rcu_read_lock() */ 1272 static bool tcp_v4_inbound_md5_hash(const struct sock *sk, 1273 const struct sk_buff *skb) 1274 { 1275 #ifdef CONFIG_TCP_MD5SIG 1276 /* 1277 * This gets called for each TCP segment that arrives 1278 * so we want to be efficient. 1279 * We have 3 drop cases: 1280 * o No MD5 hash and one expected. 1281 * o MD5 hash and we're not expecting one. 1282 * o MD5 hash and its wrong. 1283 */ 1284 const __u8 *hash_location = NULL; 1285 struct tcp_md5sig_key *hash_expected; 1286 const struct iphdr *iph = ip_hdr(skb); 1287 const struct tcphdr *th = tcp_hdr(skb); 1288 int genhash; 1289 unsigned char newhash[16]; 1290 1291 hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr, 1292 AF_INET); 1293 hash_location = tcp_parse_md5sig_option(th); 1294 1295 /* We've parsed the options - do we have a hash? */ 1296 if (!hash_expected && !hash_location) 1297 return false; 1298 1299 if (hash_expected && !hash_location) { 1300 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND); 1301 return true; 1302 } 1303 1304 if (!hash_expected && hash_location) { 1305 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED); 1306 return true; 1307 } 1308 1309 /* Okay, so this is hash_expected and hash_location - 1310 * so we need to calculate the checksum. 
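 *
 * Illustrative sketch (the userspace side of tcp_v4_parse_md5_keys() above;
 * the peer address and key are placeholders and the helper name is made up):
 * both endpoints have to install the same key for segments to pass the checks
 * performed here.  It assumes a libc that exposes struct tcp_md5sig and
 * TCP_MD5SIG through netinet/tcp.h.
 *
 *	#include <netinet/in.h>
 *	#include <netinet/tcp.h>
 *	#include <string.h>
 *	#include <sys/socket.h>
 *
 *	static int install_md5_key(int fd, const struct sockaddr_in *peer,
 *				   const void *key, int keylen)
 *	{
 *		struct tcp_md5sig md5;
 *
 *		if (keylen > TCP_MD5SIG_MAXKEYLEN)
 *			return -1;
 *		memset(&md5, 0, sizeof(md5));
 *		memcpy(&md5.tcpm_addr, peer, sizeof(*peer));
 *		md5.tcpm_keylen = keylen;
 *		memcpy(md5.tcpm_key, key, keylen);
 *		return setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG,
 *				  &md5, sizeof(md5));
 *	}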
	 */
	genhash = tcp_v4_md5_hash_skb(newhash,
				      hash_expected,
				      NULL, skb);

	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
				     &iph->saddr, ntohs(th->source),
				     &iph->daddr, ntohs(th->dest),
				     genhash ? " tcp_v4_calc_md5_hash failed"
				     : "");
		return true;
	}
	return false;
#endif
	return false;
}

static void tcp_v4_init_req(struct request_sock *req,
			    const struct sock *sk_listener,
			    struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	struct net *net = sock_net(sk_listener);

	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
}

static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
					  struct flowi *fl,
					  const struct request_sock *req)
{
	return inet_csk_route_req(sk, &fl->u.ip4, req);
}

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};

static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.req_md5_lookup	=	tcp_v4_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#endif
	.init_req	=	tcp_v4_init_req,
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
#endif
	.route_req	=	tcp_v4_route_req,
	.init_seq	=	tcp_v4_init_seq,
	.init_ts_off	=	tcp_v4_init_ts_off,
	.send_synack	=	tcp_v4_send_synack,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	/* Never answer SYNs sent to broadcast or multicast addresses. */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&tcp_request_sock_ops,
				&tcp_request_sock_ipv4_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);


/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
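 *
 * Illustrative sketch (userspace, placeholders only; the helper name is made
 * up): the sk_acceptq_is_full() test below fails when the accept queue, sized
 * by the listen() backlog argument and clamped by net.core.somaxconn, is not
 * drained fast enough; such drops are counted as LINUX_MIB_LISTENOVERFLOWS.
 *
 *	#include <netinet/in.h>
 *	#include <stdint.h>
 *	#include <sys/socket.h>
 *	#include <unistd.h>
 *
 *	static int make_listener(uint16_t port, int backlog)
 *	{
 *		struct sockaddr_in addr = { .sin_family = AF_INET,
 *					    .sin_port = htons(port),
 *					    .sin_addr.s_addr = htonl(INADDR_ANY) };
 *		int fd = socket(AF_INET, SOCK_STREAM, 0);
 *
 *		if (fd < 0)
 *			return -1;
 *		if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
 *		    listen(fd, backlog) < 0) {
 *			close(fd);
 *			return -1;
 *		}
 *		return fd;
 *	}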
1394 */ 1395 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, 1396 struct request_sock *req, 1397 struct dst_entry *dst, 1398 struct request_sock *req_unhash, 1399 bool *own_req) 1400 { 1401 struct inet_request_sock *ireq; 1402 struct inet_sock *newinet; 1403 struct tcp_sock *newtp; 1404 struct sock *newsk; 1405 #ifdef CONFIG_TCP_MD5SIG 1406 struct tcp_md5sig_key *key; 1407 #endif 1408 struct ip_options_rcu *inet_opt; 1409 1410 if (sk_acceptq_is_full(sk)) 1411 goto exit_overflow; 1412 1413 newsk = tcp_create_openreq_child(sk, req, skb); 1414 if (!newsk) 1415 goto exit_nonewsk; 1416 1417 newsk->sk_gso_type = SKB_GSO_TCPV4; 1418 inet_sk_rx_dst_set(newsk, skb); 1419 1420 newtp = tcp_sk(newsk); 1421 newinet = inet_sk(newsk); 1422 ireq = inet_rsk(req); 1423 sk_daddr_set(newsk, ireq->ir_rmt_addr); 1424 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr); 1425 newsk->sk_bound_dev_if = ireq->ir_iif; 1426 newinet->inet_saddr = ireq->ir_loc_addr; 1427 inet_opt = rcu_dereference(ireq->ireq_opt); 1428 RCU_INIT_POINTER(newinet->inet_opt, inet_opt); 1429 newinet->mc_index = inet_iif(skb); 1430 newinet->mc_ttl = ip_hdr(skb)->ttl; 1431 newinet->rcv_tos = ip_hdr(skb)->tos; 1432 inet_csk(newsk)->icsk_ext_hdr_len = 0; 1433 if (inet_opt) 1434 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 1435 newinet->inet_id = newtp->write_seq ^ jiffies; 1436 1437 if (!dst) { 1438 dst = inet_csk_route_child_sock(sk, newsk, req); 1439 if (!dst) 1440 goto put_and_exit; 1441 } else { 1442 /* syncookie case : see end of cookie_v4_check() */ 1443 } 1444 sk_setup_caps(newsk, dst); 1445 1446 tcp_ca_openreq_child(newsk, dst); 1447 1448 tcp_sync_mss(newsk, dst_mtu(dst)); 1449 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst)); 1450 1451 tcp_initialize_rcv_mss(newsk); 1452 1453 #ifdef CONFIG_TCP_MD5SIG 1454 /* Copy over the MD5 key from the original socket */ 1455 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr, 1456 AF_INET); 1457 if (key) { 1458 /* 1459 * We're using one, so create a matching key 1460 * on the newsk structure. If we fail to get 1461 * memory, then we end up not copying the key 1462 * across. Shucks. 1463 */ 1464 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr, 1465 AF_INET, 32, key->key, key->keylen, GFP_ATOMIC); 1466 sk_nocaps_add(newsk, NETIF_F_GSO_MASK); 1467 } 1468 #endif 1469 1470 if (__inet_inherit_port(sk, newsk) < 0) 1471 goto put_and_exit; 1472 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash)); 1473 if (likely(*own_req)) { 1474 tcp_move_syn(newtp, req); 1475 ireq->ireq_opt = NULL; 1476 } else { 1477 newinet->inet_opt = NULL; 1478 } 1479 return newsk; 1480 1481 exit_overflow: 1482 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); 1483 exit_nonewsk: 1484 dst_release(dst); 1485 exit: 1486 tcp_listendrop(sk); 1487 return NULL; 1488 put_and_exit: 1489 newinet->inet_opt = NULL; 1490 inet_csk_prepare_forced_close(newsk); 1491 tcp_done(newsk); 1492 goto exit; 1493 } 1494 EXPORT_SYMBOL(tcp_v4_syn_recv_sock); 1495 1496 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb) 1497 { 1498 #ifdef CONFIG_SYN_COOKIES 1499 const struct tcphdr *th = tcp_hdr(skb); 1500 1501 if (!th->syn) 1502 sk = cookie_v4_check(sk, skb); 1503 #endif 1504 return sk; 1505 } 1506 1507 /* The socket must have it's spinlock held when we get 1508 * here, unless it is a TCP_LISTEN socket. 1509 * 1510 * We have a potential double-lock case here, so even when 1511 * doing backlog processing we use the BH locking scheme. 
1512 * This is because we cannot sleep with the original spinlock 1513 * held. 1514 */ 1515 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) 1516 { 1517 struct sock *rsk; 1518 1519 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ 1520 struct dst_entry *dst = sk->sk_rx_dst; 1521 1522 sock_rps_save_rxhash(sk, skb); 1523 sk_mark_napi_id(sk, skb); 1524 if (dst) { 1525 if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif || 1526 !dst->ops->check(dst, 0)) { 1527 dst_release(dst); 1528 sk->sk_rx_dst = NULL; 1529 } 1530 } 1531 tcp_rcv_established(sk, skb); 1532 return 0; 1533 } 1534 1535 if (tcp_checksum_complete(skb)) 1536 goto csum_err; 1537 1538 if (sk->sk_state == TCP_LISTEN) { 1539 struct sock *nsk = tcp_v4_cookie_check(sk, skb); 1540 1541 if (!nsk) 1542 goto discard; 1543 if (nsk != sk) { 1544 if (tcp_child_process(sk, nsk, skb)) { 1545 rsk = nsk; 1546 goto reset; 1547 } 1548 return 0; 1549 } 1550 } else 1551 sock_rps_save_rxhash(sk, skb); 1552 1553 if (tcp_rcv_state_process(sk, skb)) { 1554 rsk = sk; 1555 goto reset; 1556 } 1557 return 0; 1558 1559 reset: 1560 tcp_v4_send_reset(rsk, skb); 1561 discard: 1562 kfree_skb(skb); 1563 /* Be careful here. If this function gets more complicated and 1564 * gcc suffers from register pressure on the x86, sk (in %ebx) 1565 * might be destroyed here. This current version compiles correctly, 1566 * but you have been warned. 1567 */ 1568 return 0; 1569 1570 csum_err: 1571 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 1572 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 1573 goto discard; 1574 } 1575 EXPORT_SYMBOL(tcp_v4_do_rcv); 1576 1577 int tcp_v4_early_demux(struct sk_buff *skb) 1578 { 1579 const struct iphdr *iph; 1580 const struct tcphdr *th; 1581 struct sock *sk; 1582 1583 if (skb->pkt_type != PACKET_HOST) 1584 return 0; 1585 1586 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) 1587 return 0; 1588 1589 iph = ip_hdr(skb); 1590 th = tcp_hdr(skb); 1591 1592 if (th->doff < sizeof(struct tcphdr) / 4) 1593 return 0; 1594 1595 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo, 1596 iph->saddr, th->source, 1597 iph->daddr, ntohs(th->dest), 1598 skb->skb_iif, inet_sdif(skb)); 1599 if (sk) { 1600 skb->sk = sk; 1601 skb->destructor = sock_edemux; 1602 if (sk_fullsock(sk)) { 1603 struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst); 1604 1605 if (dst) 1606 dst = dst_check(dst, 0); 1607 if (dst && 1608 inet_sk(sk)->rx_dst_ifindex == skb->skb_iif) 1609 skb_dst_set_noref(skb, dst); 1610 } 1611 } 1612 return 0; 1613 } 1614 1615 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) 1616 { 1617 u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf; 1618 1619 /* Only socket owner can try to collapse/prune rx queues 1620 * to reduce memory overhead, so add a little headroom here. 1621 * Few sockets backlog are possibly concurrently non empty. 1622 */ 1623 limit += 64*1024; 1624 1625 /* In case all data was pulled from skb frags (in __pskb_pull_tail()), 1626 * we can fix skb->truesize to its real value to avoid future drops. 1627 * This is valid because skb is not yet charged to the socket. 1628 * It has been noticed pure SACK packets were sometimes dropped 1629 * (if cooked by drivers without copybreak feature). 
1630 */ 1631 skb_condense(skb); 1632 1633 if (unlikely(sk_add_backlog(sk, skb, limit))) { 1634 bh_unlock_sock(sk); 1635 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP); 1636 return true; 1637 } 1638 return false; 1639 } 1640 EXPORT_SYMBOL(tcp_add_backlog); 1641 1642 int tcp_filter(struct sock *sk, struct sk_buff *skb) 1643 { 1644 struct tcphdr *th = (struct tcphdr *)skb->data; 1645 unsigned int eaten = skb->len; 1646 int err; 1647 1648 err = sk_filter_trim_cap(sk, skb, th->doff * 4); 1649 if (!err) { 1650 eaten -= skb->len; 1651 TCP_SKB_CB(skb)->end_seq -= eaten; 1652 } 1653 return err; 1654 } 1655 EXPORT_SYMBOL(tcp_filter); 1656 1657 static void tcp_v4_restore_cb(struct sk_buff *skb) 1658 { 1659 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4, 1660 sizeof(struct inet_skb_parm)); 1661 } 1662 1663 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, 1664 const struct tcphdr *th) 1665 { 1666 /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB() 1667 * barrier() makes sure compiler wont play fool^Waliasing games. 1668 */ 1669 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb), 1670 sizeof(struct inet_skb_parm)); 1671 barrier(); 1672 1673 TCP_SKB_CB(skb)->seq = ntohl(th->seq); 1674 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + 1675 skb->len - th->doff * 4); 1676 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); 1677 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); 1678 TCP_SKB_CB(skb)->tcp_tw_isn = 0; 1679 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); 1680 TCP_SKB_CB(skb)->sacked = 0; 1681 TCP_SKB_CB(skb)->has_rxtstamp = 1682 skb->tstamp || skb_hwtstamps(skb)->hwtstamp; 1683 } 1684 1685 /* 1686 * From tcp_input.c 1687 */ 1688 1689 int tcp_v4_rcv(struct sk_buff *skb) 1690 { 1691 struct net *net = dev_net(skb->dev); 1692 int sdif = inet_sdif(skb); 1693 const struct iphdr *iph; 1694 const struct tcphdr *th; 1695 bool refcounted; 1696 struct sock *sk; 1697 int ret; 1698 1699 if (skb->pkt_type != PACKET_HOST) 1700 goto discard_it; 1701 1702 /* Count it even if it's bad */ 1703 __TCP_INC_STATS(net, TCP_MIB_INSEGS); 1704 1705 if (!pskb_may_pull(skb, sizeof(struct tcphdr))) 1706 goto discard_it; 1707 1708 th = (const struct tcphdr *)skb->data; 1709 1710 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) 1711 goto bad_packet; 1712 if (!pskb_may_pull(skb, th->doff * 4)) 1713 goto discard_it; 1714 1715 /* An explanation is required here, I think. 1716 * Packet length and doff are validated by header prediction, 1717 * provided case of th->doff==0 is eliminated. 1718 * So, we defer the checks. 
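 *
 * Illustrative sketch (restating the tcp_v4_fill_cb() arithmetic above; the
 * helper name is made up): SYN and FIN each occupy one unit of sequence
 * space, so the sequence range covered by a segment ends at
 *
 *	static u32 segment_end_seq(const struct tcphdr *th, u32 skb_len)
 *	{
 *		u32 payload = skb_len - th->doff * 4;
 *
 *		return ntohl(th->seq) + th->syn + th->fin + payload;
 *	}
 *
 * which is exactly the value stored in TCP_SKB_CB(skb)->end_seq.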
*/ 1719 1720 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo)) 1721 goto csum_error; 1722 1723 th = (const struct tcphdr *)skb->data; 1724 iph = ip_hdr(skb); 1725 lookup: 1726 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source, 1727 th->dest, sdif, &refcounted); 1728 if (!sk) 1729 goto no_tcp_socket; 1730 1731 process: 1732 if (sk->sk_state == TCP_TIME_WAIT) 1733 goto do_time_wait; 1734 1735 if (sk->sk_state == TCP_NEW_SYN_RECV) { 1736 struct request_sock *req = inet_reqsk(sk); 1737 bool req_stolen = false; 1738 struct sock *nsk; 1739 1740 sk = req->rsk_listener; 1741 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) { 1742 sk_drops_add(sk, skb); 1743 reqsk_put(req); 1744 goto discard_it; 1745 } 1746 if (tcp_checksum_complete(skb)) { 1747 reqsk_put(req); 1748 goto csum_error; 1749 } 1750 if (unlikely(sk->sk_state != TCP_LISTEN)) { 1751 inet_csk_reqsk_queue_drop_and_put(sk, req); 1752 goto lookup; 1753 } 1754 /* We own a reference on the listener, increase it again 1755 * as we might lose it too soon. 1756 */ 1757 sock_hold(sk); 1758 refcounted = true; 1759 nsk = NULL; 1760 if (!tcp_filter(sk, skb)) { 1761 th = (const struct tcphdr *)skb->data; 1762 iph = ip_hdr(skb); 1763 tcp_v4_fill_cb(skb, iph, th); 1764 nsk = tcp_check_req(sk, skb, req, false, &req_stolen); 1765 } 1766 if (!nsk) { 1767 reqsk_put(req); 1768 if (req_stolen) { 1769 /* Another cpu got exclusive access to req 1770 * and created a full blown socket. 1771 * Try to feed this packet to this socket 1772 * instead of discarding it. 1773 */ 1774 tcp_v4_restore_cb(skb); 1775 sock_put(sk); 1776 goto lookup; 1777 } 1778 goto discard_and_relse; 1779 } 1780 if (nsk == sk) { 1781 reqsk_put(req); 1782 tcp_v4_restore_cb(skb); 1783 } else if (tcp_child_process(sk, nsk, skb)) { 1784 tcp_v4_send_reset(nsk, skb); 1785 goto discard_and_relse; 1786 } else { 1787 sock_put(sk); 1788 return 0; 1789 } 1790 } 1791 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) { 1792 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 1793 goto discard_and_relse; 1794 } 1795 1796 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) 1797 goto discard_and_relse; 1798 1799 if (tcp_v4_inbound_md5_hash(sk, skb)) 1800 goto discard_and_relse; 1801 1802 nf_reset(skb); 1803 1804 if (tcp_filter(sk, skb)) 1805 goto discard_and_relse; 1806 th = (const struct tcphdr *)skb->data; 1807 iph = ip_hdr(skb); 1808 tcp_v4_fill_cb(skb, iph, th); 1809 1810 skb->dev = NULL; 1811 1812 if (sk->sk_state == TCP_LISTEN) { 1813 ret = tcp_v4_do_rcv(sk, skb); 1814 goto put_and_return; 1815 } 1816 1817 sk_incoming_cpu_update(sk); 1818 1819 bh_lock_sock_nested(sk); 1820 tcp_segs_in(tcp_sk(sk), skb); 1821 ret = 0; 1822 if (!sock_owned_by_user(sk)) { 1823 ret = tcp_v4_do_rcv(sk, skb); 1824 } else if (tcp_add_backlog(sk, skb)) { 1825 goto discard_and_relse; 1826 } 1827 bh_unlock_sock(sk); 1828 1829 put_and_return: 1830 if (refcounted) 1831 sock_put(sk); 1832 1833 return ret; 1834 1835 no_tcp_socket: 1836 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) 1837 goto discard_it; 1838 1839 tcp_v4_fill_cb(skb, iph, th); 1840 1841 if (tcp_checksum_complete(skb)) { 1842 csum_error: 1843 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); 1844 bad_packet: 1845 __TCP_INC_STATS(net, TCP_MIB_INERRS); 1846 } else { 1847 tcp_v4_send_reset(NULL, skb); 1848 } 1849 1850 discard_it: 1851 /* Discard frame. 
*/ 1852 kfree_skb(skb); 1853 return 0; 1854 1855 discard_and_relse: 1856 sk_drops_add(sk, skb); 1857 if (refcounted) 1858 sock_put(sk); 1859 goto discard_it; 1860 1861 do_time_wait: 1862 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { 1863 inet_twsk_put(inet_twsk(sk)); 1864 goto discard_it; 1865 } 1866 1867 tcp_v4_fill_cb(skb, iph, th); 1868 1869 if (tcp_checksum_complete(skb)) { 1870 inet_twsk_put(inet_twsk(sk)); 1871 goto csum_error; 1872 } 1873 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) { 1874 case TCP_TW_SYN: { 1875 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev), 1876 &tcp_hashinfo, skb, 1877 __tcp_hdrlen(th), 1878 iph->saddr, th->source, 1879 iph->daddr, th->dest, 1880 inet_iif(skb), 1881 sdif); 1882 if (sk2) { 1883 inet_twsk_deschedule_put(inet_twsk(sk)); 1884 sk = sk2; 1885 tcp_v4_restore_cb(skb); 1886 refcounted = false; 1887 goto process; 1888 } 1889 } 1890 /* to ACK */ 1891 /* fall through */ 1892 case TCP_TW_ACK: 1893 tcp_v4_timewait_ack(sk, skb); 1894 break; 1895 case TCP_TW_RST: 1896 tcp_v4_send_reset(sk, skb); 1897 inet_twsk_deschedule_put(inet_twsk(sk)); 1898 goto discard_it; 1899 case TCP_TW_SUCCESS:; 1900 } 1901 goto discard_it; 1902 } 1903 1904 static struct timewait_sock_ops tcp_timewait_sock_ops = { 1905 .twsk_obj_size = sizeof(struct tcp_timewait_sock), 1906 .twsk_unique = tcp_twsk_unique, 1907 .twsk_destructor= tcp_twsk_destructor, 1908 }; 1909 1910 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) 1911 { 1912 struct dst_entry *dst = skb_dst(skb); 1913 1914 if (dst && dst_hold_safe(dst)) { 1915 sk->sk_rx_dst = dst; 1916 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; 1917 } 1918 } 1919 EXPORT_SYMBOL(inet_sk_rx_dst_set); 1920 1921 const struct inet_connection_sock_af_ops ipv4_specific = { 1922 .queue_xmit = ip_queue_xmit, 1923 .send_check = tcp_v4_send_check, 1924 .rebuild_header = inet_sk_rebuild_header, 1925 .sk_rx_dst_set = inet_sk_rx_dst_set, 1926 .conn_request = tcp_v4_conn_request, 1927 .syn_recv_sock = tcp_v4_syn_recv_sock, 1928 .net_header_len = sizeof(struct iphdr), 1929 .setsockopt = ip_setsockopt, 1930 .getsockopt = ip_getsockopt, 1931 .addr2sockaddr = inet_csk_addr2sockaddr, 1932 .sockaddr_len = sizeof(struct sockaddr_in), 1933 #ifdef CONFIG_COMPAT 1934 .compat_setsockopt = compat_ip_setsockopt, 1935 .compat_getsockopt = compat_ip_getsockopt, 1936 #endif 1937 .mtu_reduced = tcp_v4_mtu_reduced, 1938 }; 1939 EXPORT_SYMBOL(ipv4_specific); 1940 1941 #ifdef CONFIG_TCP_MD5SIG 1942 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { 1943 .md5_lookup = tcp_v4_md5_lookup, 1944 .calc_md5_hash = tcp_v4_md5_hash_skb, 1945 .md5_parse = tcp_v4_parse_md5_keys, 1946 }; 1947 #endif 1948 1949 /* NOTE: A lot of things set to zero explicitly by call to 1950 * sk_alloc() so need not be done here. 1951 */ 1952 static int tcp_v4_init_sock(struct sock *sk) 1953 { 1954 struct inet_connection_sock *icsk = inet_csk(sk); 1955 1956 tcp_init_sock(sk); 1957 1958 icsk->icsk_af_ops = &ipv4_specific; 1959 1960 #ifdef CONFIG_TCP_MD5SIG 1961 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific; 1962 #endif 1963 1964 return 0; 1965 } 1966 1967 void tcp_v4_destroy_sock(struct sock *sk) 1968 { 1969 struct tcp_sock *tp = tcp_sk(sk); 1970 1971 trace_tcp_destroy_sock(sk); 1972 1973 tcp_clear_xmit_timers(sk); 1974 1975 tcp_cleanup_congestion_control(sk); 1976 1977 tcp_cleanup_ulp(sk); 1978 1979 /* Cleanup up the write buffer. 
	tcp_write_queue_purge(sk);

	/* Check if we want to disable active TFO */
	tcp_fastopen_active_disable_ofo_check(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	skb_rbtree_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
	/* Clean up the MD5 key list, if any */
	if (tp->md5sig_info) {
		tcp_clear_md5_list(sk);
		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
		tp->md5sig_info = NULL;
	}
#endif

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	BUG_ON(tp->fastopen_rsk);

	/* If socket is aborted during connect operation */
	tcp_free_fastopen_req(tp);
	tcp_fastopen_destroy_cipher(sk);
	tcp_saved_syn_free(tp);

	sk_sockets_allocated_dec(sk);
}
EXPORT_SYMBOL(tcp_v4_destroy_sock);

#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

/*
 * Get the next listener socket after cur.  If cur is NULL, get the first
 * socket starting from the bucket given in st->bucket; when st->bucket is
 * zero the very first socket in the hash table is returned.
 */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	struct inet_listen_hashbucket *ilb;
	struct sock *sk = cur;

	if (!sk) {
get_head:
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock(&ilb->lock);
		sk = sk_head(&ilb->head);
		st->offset = 0;
		goto get_sk;
	}
	ilb = &tcp_hashinfo.listening_hash[st->bucket];
	++st->num;
	++st->offset;

	sk = sk_next(sk);
get_sk:
	sk_for_each_from(sk) {
		if (!net_eq(sock_net(sk), net))
			continue;
		if (sk->sk_family == afinfo->family)
			return sk;
	}
	spin_unlock(&ilb->lock);
	st->offset = 0;
	if (++st->bucket < INET_LHTABLE_SIZE)
		goto get_head;
	return NULL;
}

static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	st->offset = 0;
	rc = listening_get_next(seq, NULL);

	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
		--*pos;
	}
	return rc;
}

static inline bool empty_bucket(const struct tcp_iter_state *st)
{
	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
}

/*
 * Get first established socket starting from bucket given in st->bucket.
 * If st->bucket is zero, the very first socket in the hash is returned.
 */
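/* Editorial note on locking: when these iterators return a socket they leave
 * the corresponding hash bucket lock held (the listening ilb->lock or the
 * ehash bucket lock); the lock is released either when the walk moves on to
 * the next bucket or in tcp_seq_stop().
 */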
static void *established_get_first(struct seq_file *seq)
{
	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	void *rc = NULL;

	st->offset = 0;
	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
		struct sock *sk;
		struct hlist_nulls_node *node;
		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);

		/* Lockless fast path for the common case of empty buckets */
		if (empty_bucket(st))
			continue;

		spin_lock_bh(lock);
		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
			if (sk->sk_family != afinfo->family ||
			    !net_eq(sock_net(sk), net)) {
				continue;
			}
			rc = sk;
			goto out;
		}
		spin_unlock_bh(lock);
	}
out:
	return rc;
}

static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
	struct sock *sk = cur;
	struct hlist_nulls_node *node;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);

	sk_nulls_for_each_from(sk, node) {
		if (sk->sk_family == afinfo->family &&
		    net_eq(sock_net(sk), net))
			return sk;
	}

	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
	++st->bucket;
	return established_get_first(seq);
}

static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}

static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state *st = seq->private;

	st->state = TCP_SEQ_STATE_LISTENING;
	rc = listening_get_idx(seq, &pos);

	if (!rc) {
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc = established_get_idx(seq, pos);
	}

	return rc;
}

static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket >= INET_LHTABLE_SIZE)
			break;
		st->state = TCP_SEQ_STATE_LISTENING;
		rc = listening_get_next(seq, NULL);
		while (offset-- && rc)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		/* Fallthrough */
	case TCP_SEQ_STATE_ESTABLISHED:
		if (st->bucket > tcp_hashinfo.ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc)
			rc = established_get_next(seq, rc);
	}

	st->num = orig_num;

	return rc;
}

void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	if (*pos && *pos == st->last_pos) {
		rc = tcp_seek_last_pos(seq);
		if (rc)
			goto out;
	}

	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	st->bucket = 0;
	st->offset = 0;
	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
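	/* SEQ_START_TOKEN asks tcp4_seq_show() to emit the header line
	 * before any sockets are dumped.
	 */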

out:
	st->last_pos = *pos;
	return rc;
}
EXPORT_SYMBOL(tcp_seq_start);

void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			rc = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	st->last_pos = *pos;
	return rc;
}
EXPORT_SYMBOL(tcp_seq_next);

void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		break;
	}
}
EXPORT_SYMBOL(tcp_seq_stop);

static void get_openreq4(const struct request_sock *req,
			 struct seq_file *f, int i)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	long delta = req->rsk_timer.expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
		i,
		ireq->ir_loc_addr,
		ireq->ir_num,
		ireq->ir_rmt_addr,
		ntohs(ireq->ir_rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_delta_to_clock_t(delta),
		req->num_timeout,
		from_kuid_munged(seq_user_ns(f),
				 sock_i_uid(req->rsk_listener)),
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		0,
		req);
}

static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
{
	int timer_active;
	unsigned long timer_expires;
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct inet_sock *inet = inet_sk(sk);
	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	int rx_queue;
	int state;

	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
		timer_active = 1;
		timer_expires = icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active = 4;
		timer_expires = icsk->icsk_timeout;
	} else if (timer_pending(&sk->sk_timer)) {
		timer_active = 2;
		timer_expires = sk->sk_timer.expires;
	} else {
		timer_active = 0;
		timer_expires = jiffies;
	}

	state = inet_sk_state_load(sk);
	if (state == TCP_LISTEN)
		rx_queue = sk->sk_ack_backlog;
	else
		/* Because we don't lock the socket,
		 * we might find a transient negative value.
		 */
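		/* rcv_nxt - copied_seq is the amount of received data the
		 * application has not read yet, clamped to zero below.
		 */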
2334 */ 2335 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0); 2336 2337 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " 2338 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d", 2339 i, src, srcp, dest, destp, state, 2340 tp->write_seq - tp->snd_una, 2341 rx_queue, 2342 timer_active, 2343 jiffies_delta_to_clock_t(timer_expires - jiffies), 2344 icsk->icsk_retransmits, 2345 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)), 2346 icsk->icsk_probes_out, 2347 sock_i_ino(sk), 2348 refcount_read(&sk->sk_refcnt), sk, 2349 jiffies_to_clock_t(icsk->icsk_rto), 2350 jiffies_to_clock_t(icsk->icsk_ack.ato), 2351 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong, 2352 tp->snd_cwnd, 2353 state == TCP_LISTEN ? 2354 fastopenq->max_qlen : 2355 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh)); 2356 } 2357 2358 static void get_timewait4_sock(const struct inet_timewait_sock *tw, 2359 struct seq_file *f, int i) 2360 { 2361 long delta = tw->tw_timer.expires - jiffies; 2362 __be32 dest, src; 2363 __u16 destp, srcp; 2364 2365 dest = tw->tw_daddr; 2366 src = tw->tw_rcv_saddr; 2367 destp = ntohs(tw->tw_dport); 2368 srcp = ntohs(tw->tw_sport); 2369 2370 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2371 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK", 2372 i, src, srcp, dest, destp, tw->tw_substate, 0, 0, 2373 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, 2374 refcount_read(&tw->tw_refcnt), tw); 2375 } 2376 2377 #define TMPSZ 150 2378 2379 static int tcp4_seq_show(struct seq_file *seq, void *v) 2380 { 2381 struct tcp_iter_state *st; 2382 struct sock *sk = v; 2383 2384 seq_setwidth(seq, TMPSZ - 1); 2385 if (v == SEQ_START_TOKEN) { 2386 seq_puts(seq, " sl local_address rem_address st tx_queue " 2387 "rx_queue tr tm->when retrnsmt uid timeout " 2388 "inode"); 2389 goto out; 2390 } 2391 st = seq->private; 2392 2393 if (sk->sk_state == TCP_TIME_WAIT) 2394 get_timewait4_sock(v, seq, st->num); 2395 else if (sk->sk_state == TCP_NEW_SYN_RECV) 2396 get_openreq4(v, seq, st->num); 2397 else 2398 get_tcp4_sock(v, seq, st->num); 2399 out: 2400 seq_pad(seq, '\n'); 2401 return 0; 2402 } 2403 2404 static const struct seq_operations tcp4_seq_ops = { 2405 .show = tcp4_seq_show, 2406 .start = tcp_seq_start, 2407 .next = tcp_seq_next, 2408 .stop = tcp_seq_stop, 2409 }; 2410 2411 static struct tcp_seq_afinfo tcp4_seq_afinfo = { 2412 .family = AF_INET, 2413 }; 2414 2415 static int __net_init tcp4_proc_init_net(struct net *net) 2416 { 2417 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops, 2418 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo)) 2419 return -ENOMEM; 2420 return 0; 2421 } 2422 2423 static void __net_exit tcp4_proc_exit_net(struct net *net) 2424 { 2425 remove_proc_entry("tcp", net->proc_net); 2426 } 2427 2428 static struct pernet_operations tcp4_net_ops = { 2429 .init = tcp4_proc_init_net, 2430 .exit = tcp4_proc_exit_net, 2431 }; 2432 2433 int __init tcp4_proc_init(void) 2434 { 2435 return register_pernet_subsys(&tcp4_net_ops); 2436 } 2437 2438 void tcp4_proc_exit(void) 2439 { 2440 unregister_pernet_subsys(&tcp4_net_ops); 2441 } 2442 #endif /* CONFIG_PROC_FS */ 2443 2444 struct proto tcp_prot = { 2445 .name = "TCP", 2446 .owner = THIS_MODULE, 2447 .close = tcp_close, 2448 .pre_connect = tcp_v4_pre_connect, 2449 .connect = tcp_v4_connect, 2450 .disconnect = tcp_disconnect, 2451 .accept = inet_csk_accept, 2452 .ioctl = tcp_ioctl, 2453 .init = tcp_v4_init_sock, 2454 .destroy = tcp_v4_destroy_sock, 2455 .shutdown = tcp_shutdown, 2456 .setsockopt = tcp_setsockopt, 2457 .getsockopt = 

struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.pre_connect		= tcp_v4_pre_connect,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.keepalive		= tcp_set_keepalive,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.sendpage		= tcp_sendpage,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.leave_memory_pressure	= tcp_leave_memory_pressure,
	.stream_memory_free	= tcp_stream_memory_free,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
	.no_autobind		= true,
#ifdef CONFIG_COMPAT
	.compat_setsockopt	= compat_tcp_setsockopt,
	.compat_getsockopt	= compat_tcp_getsockopt,
#endif
	.diag_destroy		= tcp_abort,
};
EXPORT_SYMBOL(tcp_prot);

static void __net_exit tcp_sk_exit(struct net *net)
{
	int cpu;

	module_put(net->ipv4.tcp_congestion_control->owner);

	for_each_possible_cpu(cpu)
		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
	free_percpu(net->ipv4.tcp_sk);
}

static int __net_init tcp_sk_init(struct net *net)
{
	int res, cpu, cnt;

	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
	if (!net->ipv4.tcp_sk)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		struct sock *sk;

		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
					   IPPROTO_TCP, net);
		if (res)
			goto fail;
		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
	}

	net->ipv4.sysctl_tcp_ecn = 2;
	net->ipv4.sysctl_tcp_ecn_fallback = 1;

	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;

	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;

	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
	net->ipv4.sysctl_tcp_syncookies = 1;
	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
	net->ipv4.sysctl_tcp_orphan_retries = 0;
	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
	net->ipv4.sysctl_tcp_tw_reuse = 2;

	cnt = tcp_hashinfo.ehash_mask + 1;
	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;

	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
	net->ipv4.sysctl_tcp_sack = 1;
	net->ipv4.sysctl_tcp_window_scaling = 1;
	net->ipv4.sysctl_tcp_timestamps = 1;
	net->ipv4.sysctl_tcp_early_retrans = 3;
	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
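	/* TCP_RACK_LOSS_DETECTION turns on RACK-based loss detection by
	 * default for every new namespace; like the other defaults set in
	 * this function it is per-netns and exposed through sysctl.
	 */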
	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
	net->ipv4.sysctl_tcp_retrans_collapse = 1;
	net->ipv4.sysctl_tcp_max_reordering = 300;
	net->ipv4.sysctl_tcp_dsack = 1;
	net->ipv4.sysctl_tcp_app_win = 31;
	net->ipv4.sysctl_tcp_adv_win_scale = 1;
	net->ipv4.sysctl_tcp_frto = 2;
	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
	/* This limits the percentage of the congestion window which we
	 * will allow a single TSO frame to consume.  Building TSO frames
	 * which are too large can cause TCP streams to be bursty.
	 */
	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
	/* Default TSQ limit of four TSO segments */
	net->ipv4.sysctl_tcp_limit_output_bytes = 262144;
	/* rfc5961 challenge ack rate limiting */
	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
	net->ipv4.sysctl_tcp_min_tso_segs = 2;
	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
	net->ipv4.sysctl_tcp_autocorking = 1;
	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
	if (net != &init_net) {
		memcpy(net->ipv4.sysctl_tcp_rmem,
		       init_net.ipv4.sysctl_tcp_rmem,
		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
		memcpy(net->ipv4.sysctl_tcp_wmem,
		       init_net.ipv4.sysctl_tcp_wmem,
		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
	}
	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
	spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
	atomic_set(&net->ipv4.tfo_active_disable_times, 0);

	/* Reno is always built in */
	if (!net_eq(net, &init_net) &&
	    try_module_get(init_net.ipv4.tcp_congestion_control->owner))
		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
	else
		net->ipv4.tcp_congestion_control = &tcp_reno;

	return 0;
fail:
	tcp_sk_exit(net);

	return res;
}

static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
	struct net *net;

	inet_twsk_purge(&tcp_hashinfo, AF_INET);

	list_for_each_entry(net, net_exit_list, exit_list)
		tcp_fastopen_ctx_destroy(net);
}

static struct pernet_operations __net_initdata tcp_sk_ops = {
	.init	    = tcp_sk_init,
	.exit	    = tcp_sk_exit,
	.exit_batch = tcp_sk_exit_batch,
};

void __init tcp_v4_init(void)
{
	if (register_pernet_subsys(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");
}
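
/* Illustrative only, not part of the kernel sources: the per-netns defaults
 * initialised in tcp_sk_init() above are exposed through sysctl, so one way
 * to inspect or override them from userspace would be, for example:
 *
 *	sysctl net.ipv4.tcp_syncookies			(reads back 1, as set above)
 *	sysctl -w net.ipv4.tcp_slow_start_after_idle=0
 *
 * A new network namespace receives its own copy of these values when
 * tcp_sk_init() runs for it, inheriting init_net's congestion control
 * module when a reference can be taken and falling back to reno otherwise.
 */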