// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/inetdevice.h>

#include <crypto/hash.h>
#include <linux/scatterlist.h>

#include <trace/events/tcp.h>

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

static u32 tcp_v4_init_seq(const struct sk_buff *skb)
{
	return secure_tcp_seq(ip_hdr(skb)->daddr,
			      ip_hdr(skb)->saddr,
			      tcp_hdr(skb)->dest,
			      tcp_hdr(skb)->source);
}

static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
{
	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
}

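/* tcp_twsk_unique() below decides whether a TIME-WAIT socket may be reused
 * for a new outgoing connection.  The sysctl net.ipv4.tcp_tw_reuse selects
 * the policy: 0 disables reuse, 1 allows it whenever it is safe from a
 * timestamp (PAWS) point of view, and 2 additionally requires the flow to
 * be loopback-only, which is what the reuse == 2 check implements.  See
 * Documentation/networking/ip-sysctl.txt for the authoritative description.
 */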
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct inet_timewait_sock *tw = inet_twsk(sktw);
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);
	int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;

	if (reuse == 2) {
		/* Still does not detect *everything* that goes through
		 * lo, since we require a loopback src or dst address
		 * or direct binding to 'lo' interface.
		 */
		bool loopback = false;
		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
			loopback = true;
#if IS_ENABLED(CONFIG_IPV6)
		if (tw->tw_family == AF_INET6) {
			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
			    (ipv6_addr_v4mapped(&tw->tw_v6_daddr) &&
			     (tw->tw_v6_daddr.s6_addr[12] == 127)) ||
			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
			    (ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) &&
			     (tw->tw_v6_rcv_saddr.s6_addr[12] == 127)))
				loopback = true;
		} else
#endif
		{
			if (ipv4_is_loopback(tw->tw_daddr) ||
			    ipv4_is_loopback(tw->tw_rcv_saddr))
				loopback = true;
		}
		if (!loopback)
			reuse = 0;
	}

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (!twp || (reuse && time_after32(ktime_get_seconds(),
					    tcptw->tw_ts_recent_stamp)))) {
		/* In case of repair and re-using TIME-WAIT sockets we still
		 * want to be sure that it is safe as above but honor the
		 * sequence numbers and time stamps set as part of the repair
		 * process.
		 *
		 * Without this check re-using a TIME-WAIT socket with TCP
		 * repair would accumulate a -1 on the repair assigned
		 * sequence number. The first time it is reused the sequence
		 * is -1, the second time -2, etc. This fixes that issue
		 * without appearing to create any others.
		 */
		if (likely(!tp->repair)) {
			tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
			if (tp->write_seq == 0)
				tp->write_seq = 1;
			tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		}
		sock_hold(sktw);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);

static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
			      int addr_len)
{
	/* This check is replicated from tcp_v4_connect() and intended to
	 * prevent BPF program called below from accessing bytes that are out
	 * of the bound specified by user in addr_len.
	 */
	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	sock_owned_by_me(sk);

	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
}

/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;
	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP,
			      orig_sport, orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	sk_rcv_saddr_set(sk, inet->inet_saddr);

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			tp->write_seq	   = 0;
	}

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the socket
	 * lock, select a source port, enter ourselves into the hash tables
	 * and complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	rt = NULL;

	if (likely(!tp->repair)) {
		if (!tp->write_seq)
			tp->write_seq = secure_tcp_seq(inet->inet_saddr,
						       inet->inet_daddr,
						       inet->inet_sport,
						       usin->sin_port);
		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
						 inet->inet_saddr,
						 inet->inet_daddr);
	}

	inet->inet_id = tp->write_seq ^ jiffies;

	if (tcp_fastopen_defer_connect(sk, &err))
		return err;
	if (err)
		goto failure;

	err = tcp_connect(sk);

	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);

/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct dst_entry *dst;
	u32 mtu;

	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
		return;
	mtu = tcp_sk(sk)->mtu_info;
	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to be wrong... Remember the soft error
	 * in case this connection is not able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);

static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}


/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);

	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	 */
	if (seq != tcp_rsk(req)->snt_isn) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
	} else if (abort) {
		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		tcp_listendrop(req->rsk_listener);
	}
	reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

int tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
	struct inet_connection_sock *icsk;
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(icmp_skb)->type;
	const int code = icmp_hdr(icmp_skb)->code;
	struct sock *sk;
	struct sk_buff *skb;
	struct request_sock *fastopen;
	u32 seq, snd_una;
	s32 remaining;
	u32 delta_us;
	int err;
	struct net *net = dev_net(icmp_skb->dev);

	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
				       th->dest, iph->saddr, ntohs(th->source),
				       inet_iif(icmp_skb), 0);
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return -ENOENT;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return 0;
	}
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
				     type == ICMP_TIME_EXCEEDED ||
				     (type == ICMP_DEST_UNREACH &&
				      (code == ICMP_NET_UNREACH ||
				       code == ICMP_HOST_UNREACH)));
		return 0;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto out;
	}

	icsk = inet_csk(sk);
	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = tp->fastopen_rsk;
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		if (!sock_owned_by_user(sk))
			do_redirect(icmp_skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs sent out by Linux are always <576 bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			tp->mtu_info = info;
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if icmp_skb allows revert of backoff
		 * (see draft-zimmermann-tcp-lcd) */
		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
			break;
		if (seq != tp->snd_una || !icsk->icsk_retransmits ||
		    !icsk->icsk_backoff || fastopen)
			break;

		if (sock_owned_by_user(sk))
			break;

		skb = tcp_rtx_queue_head(sk);
		if (WARN_ON_ONCE(!skb))
			break;

		icsk->icsk_backoff--;
		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
					       TCP_TIMEOUT_INIT;
		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);


		tcp_mstamp_refresh(tp);
		delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
		remaining = icsk->icsk_rto -
			    usecs_to_jiffies(delta_us);

		if (remaining > 0) {
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
						  remaining, TCP_RTO_MAX);
		} else {
			/* RTO revert clocked out retransmission.
			 * Will retransmit now */
			tcp_retransmit_timer(sk);
		}

		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket is
		 * already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even these two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 * --ANK (980905)
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
	return 0;
}

void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
	skb->csum_start = skb_transport_header(skb) - skb->head;
	skb->csum_offset = offsetof(struct tcphdr, check);
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);

/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *	So that we build reply only basing on parameters
 *	arrived with segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	int genhash;
	struct sock *sk1 = NULL;
#endif
	struct net *net;
	struct sock *ctl_sk;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
	rcu_read_lock();
	hash_location = tcp_parse_md5sig_option(th);
	if (sk && sk_fullsock(sk)) {
		key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
	} else if (hash_location) {
		/*
		 * active side is lost. Try to find listening socket through
		 * source port, and then find md5 key through listening socket.
		 * we do not lose security here:
		 * Incoming packet is checked with md5 hash with finding key,
		 * no RST generated if md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
					     ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), inet_iif(skb),
					     tcp_v4_sdif(skb));
		/* don't send rst if it can't find key */
		if (!sk1)
			goto out;

		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
		if (!key)
			goto out;


		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
			goto out;

	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				     key, ip_hdr(skb)->saddr,
				     ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
	if (sk) {
		arg.bound_dev_if = sk->sk_bound_dev_if;
		if (sk_fullsock(sk))
			trace_tcp_send_reset(sk, skb);
	}

	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

	arg.tos = ip_hdr(skb)->tos;
	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
	if (sk)
		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_mark : sk->sk_mark;
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len);

	ctl_sk->sk_mark = 0;
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
	local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
	rcu_read_unlock();
#endif
}

/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
 */

static void tcp_v4_send_ack(const struct sock *sk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct net *net = sock_net(sk);
	struct ip_reply_arg arg;
	struct sock *ctl_sk;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
	if (sk)
		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_mark : sk->sk_mark;
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len);

	ctl_sk->sk_mark = 0;
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	local_bh_enable();
}

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(sk, skb,
			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos
			);

	inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
					     tcp_sk(sk)->snd_nxt;

	/* RFC 7323 2.3
	 * The window field (SEG.WND) of every outgoing segment, with the
	 * exception of <SYN> segments, MUST be right-shifted by
	 * Rcv.Wind.Shift bits:
	 */
	tcp_v4_send_ack(sk, skb, seq,
			tcp_rsk(req)->rcv_nxt,
			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
			req->ts_recent,
			0,
			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
					  AF_INET),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos);
}

/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
			      struct flowi *fl,
			      struct request_sock *req,
			      struct tcp_fastopen_cookie *foc,
			      enum tcp_synack_type synack_type)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, foc, synack_type);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

		rcu_read_lock();
		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
					    rcu_dereference(ireq->ireq_opt));
		rcu_read_unlock();
		err = net_xmit_eval(err);
	}

	return err;
}

/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
EXPORT_SYMBOL(tcp_md5_needed);

/* Find the Key structure for an address.  */
struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk,
					   const union tcp_md5_addr *addr,
					   int family)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	const struct tcp_md5sig_info *md5sig;
	__be32 mask;
	struct tcp_md5sig_key *best_match = NULL;
	bool match;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;

	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
		if (key->family != family)
			continue;

		if (family == AF_INET) {
			mask = inet_make_mask(key->prefixlen);
			match = (key->addr.a4.s_addr & mask) ==
				(addr->a4.s_addr & mask);
#if IS_ENABLED(CONFIG_IPV6)
		} else if (family == AF_INET6) {
			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
						  key->prefixlen);
#endif
		} else {
			match = false;
		}

		if (match && (!best_match ||
			      key->prefixlen > best_match->prefixlen))
			best_match = key;
	}
	return best_match;
}
EXPORT_SYMBOL(__tcp_md5_do_lookup);

static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
						      const union tcp_md5_addr *addr,
						      int family, u8 prefixlen)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	const struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
		if (key->family != family)
			continue;
		if (!memcmp(&key->addr, addr, size) &&
		    key->prefixlen == prefixlen)
			return key;
	}
	return NULL;
}

struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
					 const struct sock *addr_sk)
{
	const union tcp_md5_addr *addr;

	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
	return tcp_md5_do_lookup(sk, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
		   gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
	if (key) {
		/* Pre-existing entry - just update that one. */
		memcpy(key->key, newkey, newkeylen);
		key->keylen = newkeylen;
		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   lockdep_sock_is_held(sk));
	if (!md5sig) {
		md5sig = kmalloc(sizeof(*md5sig), gfp);
		if (!md5sig)
			return -ENOMEM;

		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
		INIT_HLIST_HEAD(&md5sig->head);
		rcu_assign_pointer(tp->md5sig_info, md5sig);
	}

	key = sock_kmalloc(sk, sizeof(*key), gfp);
	if (!key)
		return -ENOMEM;
	if (!tcp_alloc_md5sig_pool()) {
		sock_kfree_s(sk, key, sizeof(*key));
		return -ENOMEM;
	}

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	key->prefixlen = prefixlen;
	memcpy(&key->addr, addr,
	       (family == AF_INET6) ? sizeof(struct in6_addr) :
				      sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);

int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
		   u8 prefixlen)
{
	struct tcp_md5sig_key *key;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);

static void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
	}
}

static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
				 char __user *optval, int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
	u8 prefixlen = 32;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_user(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	if (optname == TCP_MD5SIG_EXT &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
		prefixlen = cmd.tcpm_prefixlen;
		if (prefixlen > 32)
			return -EINVAL;
	}

	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
				      AF_INET, prefixlen);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
			      AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
			      GFP_KERNEL);
}
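
/* Keys are installed from user space with setsockopt(TCP_MD5SIG) (or
 * TCP_MD5SIG_EXT when a prefix length is supplied), which lands in
 * tcp_v4_parse_md5_keys() above.  A minimal, illustrative user-space
 * sketch (not part of this file; addresses and key are examples only):
 *
 *	struct tcp_md5sig md5 = {};
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	sin->sin_family = AF_INET;
 *	sin->sin_addr.s_addr = inet_addr("192.0.2.1");
 *	md5.tcpm_keylen = 6;
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */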

static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
				   __be32 daddr, __be32 saddr,
				   const struct tcphdr *th, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;
	struct tcphdr *_th;

	bp = hp->scratch;
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	_th = (struct tcphdr *)(bp + 1);
	memcpy(_th, th, sizeof(*th));
	_th->check = 0;

	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
				sizeof(*bp) + sizeof(*th));
	return crypto_ahash_update(hp->md5_req);
}

static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;
	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}

int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
			const struct sock *sk,
			const struct sk_buff *skb)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;

	if (sk) { /* valid for establish/request sockets */
		saddr = sk->sk_rcv_saddr;
		daddr = sk->sk_daddr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;

	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
		goto clear_hash;
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

#endif

/* Called with rcu_read_lock() */
static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
				    const struct sk_buff *skb)
{
#ifdef CONFIG_TCP_MD5SIG
	/*
	 * This gets called for each TCP segment that arrives
	 * so we want to be efficient.
	 * We have 3 drop cases:
	 * o No MD5 hash and one expected.
	 * o MD5 hash and we're not expecting one.
	 * o MD5 hash and it's wrong.
	 */
	const __u8 *hash_location = NULL;
	struct tcp_md5sig_key *hash_expected;
	const struct iphdr *iph = ip_hdr(skb);
	const struct tcphdr *th = tcp_hdr(skb);
	int genhash;
	unsigned char newhash[16];

	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
					  AF_INET);
	hash_location = tcp_parse_md5sig_option(th);

	/* We've parsed the options - do we have a hash? */
	if (!hash_expected && !hash_location)
		return false;

	if (hash_expected && !hash_location) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
		return true;
	}

	if (!hash_expected && hash_location) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
		return true;
	}

	/* Okay, so this is hash_expected and hash_location -
	 * so we need to calculate the checksum.
	 */
	genhash = tcp_v4_md5_hash_skb(newhash,
				      hash_expected,
				      NULL, skb);

	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
				     &iph->saddr, ntohs(th->source),
				     &iph->daddr, ntohs(th->dest),
				     genhash ? " tcp_v4_calc_md5_hash failed"
				     : "");
		return true;
	}
	return false;
#endif
	return false;
}

static void tcp_v4_init_req(struct request_sock *req,
			    const struct sock *sk_listener,
			    struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	struct net *net = sock_net(sk_listener);

	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
}

static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
					  struct flowi *fl,
					  const struct request_sock *req)
{
	return inet_csk_route_req(sk, &fl->u.ip4, req);
}

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};

static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.req_md5_lookup	=	tcp_v4_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#endif
	.init_req	=	tcp_v4_init_req,
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
#endif
	.route_req	=	tcp_v4_route_req,
	.init_seq	=	tcp_v4_init_seq,
	.init_ts_off	=	tcp_v4_init_ts_off,
	.send_synack	=	tcp_v4_send_synack,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&tcp_request_sock_ops,
				&tcp_request_sock_ipv4_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);


/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	sk_daddr_set(newsk, ireq->ir_rmt_addr);
	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
	newsk->sk_bound_dev_if = ireq->ir_iif;
	newinet->inet_saddr   = ireq->ir_loc_addr;
	inet_opt	      = rcu_dereference(ireq->ireq_opt);
	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	newinet->rcv_tos      = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	newinet->inet_id = newtp->write_seq ^ jiffies;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	/* Copy over the MD5 key from the original socket */
	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
				AF_INET);
	if (key) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
			       AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
	if (likely(*own_req)) {
		tcp_move_syn(newtp, req);
		ireq->ireq_opt = NULL;
	} else {
		newinet->inet_opt = NULL;
	}
	return newsk;

exit_overflow:
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	tcp_listendrop(sk);
	return NULL;
put_and_exit:
	newinet->inet_opt = NULL;
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	const struct tcphdr *th = tcp_hdr(skb);

	if (!th->syn)
		sk = cookie_v4_check(sk, skb);
#endif
	return sk;
}

/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct sock *rsk;

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst = sk->sk_rx_dst;

		sock_rps_save_rxhash(sk, skb);
		sk_mark_napi_id(sk, skb);
		if (dst) {
			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
			    !dst->ops->check(dst, 0)) {
				dst_release(dst);
				sk->sk_rx_dst = NULL;
			}
		}
		tcp_rcv_established(sk, skb);
		return 0;
	}

	if (tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_cookie_check(sk, skb);

		if (!nsk)
			goto discard;
		if (nsk != sk) {
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	if (tcp_rcv_state_process(sk, skb)) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);

int tcp_v4_early_demux(struct sk_buff *skb)
{
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;

	if (skb->pkt_type != PACKET_HOST)
		return 0;

	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
		return 0;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	if (th->doff < sizeof(struct tcphdr) / 4)
		return 0;

	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
				       iph->saddr, th->source,
				       iph->daddr, ntohs(th->dest),
				       skb->skb_iif, inet_sdif(skb));
	if (sk) {
		skb->sk = sk;
		skb->destructor = sock_edemux;
		if (sk_fullsock(sk)) {
			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);

			if (dst)
				dst = dst_check(dst, 0);
			if (dst &&
			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
				skb_dst_set_noref(skb, dst);
		}
	}
	return 0;
}

bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
{
	u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
	struct skb_shared_info *shinfo;
	const struct tcphdr *th;
	struct tcphdr *thtail;
	struct sk_buff *tail;
	unsigned int hdrlen;
	bool fragstolen;
	u32 gso_segs;
	int delta;

	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
	 * we can fix skb->truesize to its real value to avoid future drops.
	 * This is valid because skb is not yet charged to the socket.
	 * It has been noticed pure SACK packets were sometimes dropped
	 * (if cooked by drivers without copybreak feature).
	 */
	skb_condense(skb);

	skb_dst_drop(skb);

	if (unlikely(tcp_checksum_complete(skb))) {
		bh_unlock_sock(sk);
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
		return true;
	}

	/* Attempt coalescing to last skb in backlog, even if we are
	 * above the limits.
	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
	 */
	th = (const struct tcphdr *)skb->data;
	hdrlen = th->doff * 4;
	shinfo = skb_shinfo(skb);

	if (!shinfo->gso_size)
		shinfo->gso_size = skb->len - hdrlen;

	if (!shinfo->gso_segs)
		shinfo->gso_segs = 1;

	tail = sk->sk_backlog.tail;
	if (!tail)
		goto no_coalesce;
	thtail = (struct tcphdr *)tail->data;

	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
	    ((TCP_SKB_CB(tail)->tcp_flags |
	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
	    !((TCP_SKB_CB(tail)->tcp_flags &
	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
	    ((TCP_SKB_CB(tail)->tcp_flags ^
	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
#ifdef CONFIG_TLS_DEVICE
	    tail->decrypted != skb->decrypted ||
#endif
	    thtail->doff != th->doff ||
	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
		goto no_coalesce;

	__skb_pull(skb, hdrlen);
	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
		thtail->window = th->window;

		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;

		if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))
			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;

		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
		 * thtail->fin, so that the fast path in tcp_rcv_established()
		 * is not entered if we append a packet with a FIN.
		 * SYN, RST, URG are not present.
		 * ACK is set on both packets.
		 * PSH : we do not really care in TCP stack,
		 *       at least for 'GRO' packets.
		 */
		thtail->fin |= th->fin;
		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;

		if (TCP_SKB_CB(skb)->has_rxtstamp) {
			TCP_SKB_CB(tail)->has_rxtstamp = true;
			tail->tstamp = skb->tstamp;
			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
		}

		/* Not as strict as GRO. We only need to carry mss max value */
		skb_shinfo(tail)->gso_size = max(shinfo->gso_size,
						 skb_shinfo(tail)->gso_size);

		gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs;
		skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF);

		sk->sk_backlog.len += delta;
		__NET_INC_STATS(sock_net(sk),
				LINUX_MIB_TCPBACKLOGCOALESCE);
		kfree_skb_partial(skb, fragstolen);
		return false;
	}
	__skb_push(skb, hdrlen);

no_coalesce:
	/* Only socket owner can try to collapse/prune rx queues
	 * to reduce memory overhead, so add a little headroom here.
	 * Only a few socket backlogs are likely to be non-empty concurrently.
	 */
	limit += 64*1024;

	if (unlikely(sk_add_backlog(sk, skb, limit))) {
		bh_unlock_sock(sk);
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
		return true;
	}
	return false;
}
EXPORT_SYMBOL(tcp_add_backlog);

int tcp_filter(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = (struct tcphdr *)skb->data;

	return sk_filter_trim_cap(sk, skb, th->doff * 4);
}
EXPORT_SYMBOL(tcp_filter);

static void tcp_v4_restore_cb(struct sk_buff *skb)
{
	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
		sizeof(struct inet_skb_parm));
}

static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
			   const struct tcphdr *th)
{
	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
	 * barrier() makes sure compiler wont play fool^Waliasing games.
	 */
	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
		sizeof(struct inet_skb_parm));
	barrier();

	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
	TCP_SKB_CB(skb)->sacked	 = 0;
	TCP_SKB_CB(skb)->has_rxtstamp =
			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
}

/*
 *	From tcp_input.c
 */

int tcp_v4_rcv(struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);
	struct sk_buff *skb_to_free;
	int sdif = inet_sdif(skb);
	const struct iphdr *iph;
	const struct tcphdr *th;
	bool refcounted;
	struct sock *sk;
	int ret;

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	__TCP_INC_STATS(net, TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = (const struct tcphdr *)skb->data;

	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */

	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
		goto csum_error;

	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
lookup:
	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
			       th->dest, sdif, &refcounted);
	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		struct request_sock *req = inet_reqsk(sk);
		bool req_stolen = false;
		struct sock *nsk;

		sk = req->rsk_listener;
		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
			sk_drops_add(sk, skb);
			reqsk_put(req);
			goto discard_it;
		}
		if (tcp_checksum_complete(skb)) {
			reqsk_put(req);
			goto csum_error;
		}
		if (unlikely(sk->sk_state != TCP_LISTEN)) {
			inet_csk_reqsk_queue_drop_and_put(sk, req);
			goto lookup;
		}
		/* We own a reference on the listener, increase it again
		 * as we might lose it too soon.
		 */
		sock_hold(sk);
		refcounted = true;
		nsk = NULL;
		if (!tcp_filter(sk, skb)) {
			th = (const struct tcphdr *)skb->data;
			iph = ip_hdr(skb);
			tcp_v4_fill_cb(skb, iph, th);
			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
		}
		if (!nsk) {
			reqsk_put(req);
			if (req_stolen) {
				/* Another cpu got exclusive access to req
				 * and created a full blown socket.
				 * Try to feed this packet to this socket
				 * instead of discarding it.
				 */
				tcp_v4_restore_cb(skb);
				sock_put(sk);
				goto lookup;
			}
			goto discard_and_relse;
		}
		if (nsk == sk) {
			reqsk_put(req);
			tcp_v4_restore_cb(skb);
		} else if (tcp_child_process(sk, nsk, skb)) {
			tcp_v4_send_reset(nsk, skb);
			goto discard_and_relse;
		} else {
			sock_put(sk);
			return 0;
		}
	}
	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto discard_and_relse;
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;

	if (tcp_v4_inbound_md5_hash(sk, skb))
		goto discard_and_relse;

	nf_reset(skb);

	if (tcp_filter(sk, skb))
		goto discard_and_relse;
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
	tcp_v4_fill_cb(skb, iph, th);

	skb->dev = NULL;

	if (sk->sk_state == TCP_LISTEN) {
		ret = tcp_v4_do_rcv(sk, skb);
		goto put_and_return;
	}

	sk_incoming_cpu_update(sk);

	bh_lock_sock_nested(sk);
	tcp_segs_in(tcp_sk(sk), skb);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
		skb_to_free = sk->sk_rx_skb_cache;
		sk->sk_rx_skb_cache = NULL;
		ret = tcp_v4_do_rcv(sk, skb);
	} else {
		if (tcp_add_backlog(sk, skb))
			goto discard_and_relse;
		skb_to_free = NULL;
	}
	bh_unlock_sock(sk);
	if (skb_to_free)
		__kfree_skb(skb_to_free);

put_and_return:
	if (refcounted)
		sock_put(sk);

	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	tcp_v4_fill_cb(skb, iph, th);

	if (tcp_checksum_complete(skb)) {
csum_error:
		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
bad_packet:
		__TCP_INC_STATS(net, TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	/* Discard frame. */
*/ 1950 kfree_skb(skb); 1951 return 0; 1952 1953 discard_and_relse: 1954 sk_drops_add(sk, skb); 1955 if (refcounted) 1956 sock_put(sk); 1957 goto discard_it; 1958 1959 do_time_wait: 1960 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { 1961 inet_twsk_put(inet_twsk(sk)); 1962 goto discard_it; 1963 } 1964 1965 tcp_v4_fill_cb(skb, iph, th); 1966 1967 if (tcp_checksum_complete(skb)) { 1968 inet_twsk_put(inet_twsk(sk)); 1969 goto csum_error; 1970 } 1971 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) { 1972 case TCP_TW_SYN: { 1973 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev), 1974 &tcp_hashinfo, skb, 1975 __tcp_hdrlen(th), 1976 iph->saddr, th->source, 1977 iph->daddr, th->dest, 1978 inet_iif(skb), 1979 sdif); 1980 if (sk2) { 1981 inet_twsk_deschedule_put(inet_twsk(sk)); 1982 sk = sk2; 1983 tcp_v4_restore_cb(skb); 1984 refcounted = false; 1985 goto process; 1986 } 1987 } 1988 /* to ACK */ 1989 /* fall through */ 1990 case TCP_TW_ACK: 1991 tcp_v4_timewait_ack(sk, skb); 1992 break; 1993 case TCP_TW_RST: 1994 tcp_v4_send_reset(sk, skb); 1995 inet_twsk_deschedule_put(inet_twsk(sk)); 1996 goto discard_it; 1997 case TCP_TW_SUCCESS:; 1998 } 1999 goto discard_it; 2000 } 2001 2002 static struct timewait_sock_ops tcp_timewait_sock_ops = { 2003 .twsk_obj_size = sizeof(struct tcp_timewait_sock), 2004 .twsk_unique = tcp_twsk_unique, 2005 .twsk_destructor= tcp_twsk_destructor, 2006 }; 2007 2008 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) 2009 { 2010 struct dst_entry *dst = skb_dst(skb); 2011 2012 if (dst && dst_hold_safe(dst)) { 2013 sk->sk_rx_dst = dst; 2014 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; 2015 } 2016 } 2017 EXPORT_SYMBOL(inet_sk_rx_dst_set); 2018 2019 const struct inet_connection_sock_af_ops ipv4_specific = { 2020 .queue_xmit = ip_queue_xmit, 2021 .send_check = tcp_v4_send_check, 2022 .rebuild_header = inet_sk_rebuild_header, 2023 .sk_rx_dst_set = inet_sk_rx_dst_set, 2024 .conn_request = tcp_v4_conn_request, 2025 .syn_recv_sock = tcp_v4_syn_recv_sock, 2026 .net_header_len = sizeof(struct iphdr), 2027 .setsockopt = ip_setsockopt, 2028 .getsockopt = ip_getsockopt, 2029 .addr2sockaddr = inet_csk_addr2sockaddr, 2030 .sockaddr_len = sizeof(struct sockaddr_in), 2031 #ifdef CONFIG_COMPAT 2032 .compat_setsockopt = compat_ip_setsockopt, 2033 .compat_getsockopt = compat_ip_getsockopt, 2034 #endif 2035 .mtu_reduced = tcp_v4_mtu_reduced, 2036 }; 2037 EXPORT_SYMBOL(ipv4_specific); 2038 2039 #ifdef CONFIG_TCP_MD5SIG 2040 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { 2041 .md5_lookup = tcp_v4_md5_lookup, 2042 .calc_md5_hash = tcp_v4_md5_hash_skb, 2043 .md5_parse = tcp_v4_parse_md5_keys, 2044 }; 2045 #endif 2046 2047 /* NOTE: A lot of things set to zero explicitly by call to 2048 * sk_alloc() so need not be done here. 2049 */ 2050 static int tcp_v4_init_sock(struct sock *sk) 2051 { 2052 struct inet_connection_sock *icsk = inet_csk(sk); 2053 2054 tcp_init_sock(sk); 2055 2056 icsk->icsk_af_ops = &ipv4_specific; 2057 2058 #ifdef CONFIG_TCP_MD5SIG 2059 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific; 2060 #endif 2061 2062 return 0; 2063 } 2064 2065 void tcp_v4_destroy_sock(struct sock *sk) 2066 { 2067 struct tcp_sock *tp = tcp_sk(sk); 2068 2069 trace_tcp_destroy_sock(sk); 2070 2071 tcp_clear_xmit_timers(sk); 2072 2073 tcp_cleanup_congestion_control(sk); 2074 2075 tcp_cleanup_ulp(sk); 2076 2077 /* Cleanup up the write buffer. 
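* tcp_write_queue_purge() below frees any skbs still queued for (re)transmission.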
*/ 2078 tcp_write_queue_purge(sk); 2079 2080 /* Check if we want to disable active TFO */ 2081 tcp_fastopen_active_disable_ofo_check(sk); 2082 2083 /* Cleans up our, hopefully empty, out_of_order_queue. */ 2084 skb_rbtree_purge(&tp->out_of_order_queue); 2085 2086 #ifdef CONFIG_TCP_MD5SIG 2087 /* Clean up the MD5 key list, if any */ 2088 if (tp->md5sig_info) { 2089 tcp_clear_md5_list(sk); 2090 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu); 2091 tp->md5sig_info = NULL; 2092 } 2093 #endif 2094 2095 /* Clean up a referenced TCP bind bucket. */ 2096 if (inet_csk(sk)->icsk_bind_hash) 2097 inet_put_port(sk); 2098 2099 BUG_ON(tp->fastopen_rsk); 2100 2101 /* If socket is aborted during connect operation */ 2102 tcp_free_fastopen_req(tp); 2103 tcp_fastopen_destroy_cipher(sk); 2104 tcp_saved_syn_free(tp); 2105 2106 sk_sockets_allocated_dec(sk); 2107 } 2108 EXPORT_SYMBOL(tcp_v4_destroy_sock); 2109 2110 #ifdef CONFIG_PROC_FS 2111 /* Proc filesystem TCP sock list dumping. */ 2112 2113 /* 2114 * Get next listener socket follow cur. If cur is NULL, get first socket 2115 * starting from bucket given in st->bucket; when st->bucket is zero the 2116 * very first socket in the hash table is returned. 2117 */ 2118 static void *listening_get_next(struct seq_file *seq, void *cur) 2119 { 2120 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file)); 2121 struct tcp_iter_state *st = seq->private; 2122 struct net *net = seq_file_net(seq); 2123 struct inet_listen_hashbucket *ilb; 2124 struct sock *sk = cur; 2125 2126 if (!sk) { 2127 get_head: 2128 ilb = &tcp_hashinfo.listening_hash[st->bucket]; 2129 spin_lock(&ilb->lock); 2130 sk = sk_head(&ilb->head); 2131 st->offset = 0; 2132 goto get_sk; 2133 } 2134 ilb = &tcp_hashinfo.listening_hash[st->bucket]; 2135 ++st->num; 2136 ++st->offset; 2137 2138 sk = sk_next(sk); 2139 get_sk: 2140 sk_for_each_from(sk) { 2141 if (!net_eq(sock_net(sk), net)) 2142 continue; 2143 if (sk->sk_family == afinfo->family) 2144 return sk; 2145 } 2146 spin_unlock(&ilb->lock); 2147 st->offset = 0; 2148 if (++st->bucket < INET_LHTABLE_SIZE) 2149 goto get_head; 2150 return NULL; 2151 } 2152 2153 static void *listening_get_idx(struct seq_file *seq, loff_t *pos) 2154 { 2155 struct tcp_iter_state *st = seq->private; 2156 void *rc; 2157 2158 st->bucket = 0; 2159 st->offset = 0; 2160 rc = listening_get_next(seq, NULL); 2161 2162 while (rc && *pos) { 2163 rc = listening_get_next(seq, rc); 2164 --*pos; 2165 } 2166 return rc; 2167 } 2168 2169 static inline bool empty_bucket(const struct tcp_iter_state *st) 2170 { 2171 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain); 2172 } 2173 2174 /* 2175 * Get first established socket starting from bucket given in st->bucket. 2176 * If st->bucket is zero, the very first socket in the hash is returned. 
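* The bucket lock taken here is left held when a socket is returned; it is released either in established_get_next() when moving on to the next bucket, or in tcp_seq_stop().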
2177 */ 2178 static void *established_get_first(struct seq_file *seq) 2179 { 2180 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file)); 2181 struct tcp_iter_state *st = seq->private; 2182 struct net *net = seq_file_net(seq); 2183 void *rc = NULL; 2184 2185 st->offset = 0; 2186 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) { 2187 struct sock *sk; 2188 struct hlist_nulls_node *node; 2189 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket); 2190 2191 /* Lockless fast path for the common case of empty buckets */ 2192 if (empty_bucket(st)) 2193 continue; 2194 2195 spin_lock_bh(lock); 2196 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { 2197 if (sk->sk_family != afinfo->family || 2198 !net_eq(sock_net(sk), net)) { 2199 continue; 2200 } 2201 rc = sk; 2202 goto out; 2203 } 2204 spin_unlock_bh(lock); 2205 } 2206 out: 2207 return rc; 2208 } 2209 2210 static void *established_get_next(struct seq_file *seq, void *cur) 2211 { 2212 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file)); 2213 struct sock *sk = cur; 2214 struct hlist_nulls_node *node; 2215 struct tcp_iter_state *st = seq->private; 2216 struct net *net = seq_file_net(seq); 2217 2218 ++st->num; 2219 ++st->offset; 2220 2221 sk = sk_nulls_next(sk); 2222 2223 sk_nulls_for_each_from(sk, node) { 2224 if (sk->sk_family == afinfo->family && 2225 net_eq(sock_net(sk), net)) 2226 return sk; 2227 } 2228 2229 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); 2230 ++st->bucket; 2231 return established_get_first(seq); 2232 } 2233 2234 static void *established_get_idx(struct seq_file *seq, loff_t pos) 2235 { 2236 struct tcp_iter_state *st = seq->private; 2237 void *rc; 2238 2239 st->bucket = 0; 2240 rc = established_get_first(seq); 2241 2242 while (rc && pos) { 2243 rc = established_get_next(seq, rc); 2244 --pos; 2245 } 2246 return rc; 2247 } 2248 2249 static void *tcp_get_idx(struct seq_file *seq, loff_t pos) 2250 { 2251 void *rc; 2252 struct tcp_iter_state *st = seq->private; 2253 2254 st->state = TCP_SEQ_STATE_LISTENING; 2255 rc = listening_get_idx(seq, &pos); 2256 2257 if (!rc) { 2258 st->state = TCP_SEQ_STATE_ESTABLISHED; 2259 rc = established_get_idx(seq, pos); 2260 } 2261 2262 return rc; 2263 } 2264 2265 static void *tcp_seek_last_pos(struct seq_file *seq) 2266 { 2267 struct tcp_iter_state *st = seq->private; 2268 int offset = st->offset; 2269 int orig_num = st->num; 2270 void *rc = NULL; 2271 2272 switch (st->state) { 2273 case TCP_SEQ_STATE_LISTENING: 2274 if (st->bucket >= INET_LHTABLE_SIZE) 2275 break; 2276 st->state = TCP_SEQ_STATE_LISTENING; 2277 rc = listening_get_next(seq, NULL); 2278 while (offset-- && rc) 2279 rc = listening_get_next(seq, rc); 2280 if (rc) 2281 break; 2282 st->bucket = 0; 2283 st->state = TCP_SEQ_STATE_ESTABLISHED; 2284 /* Fallthrough */ 2285 case TCP_SEQ_STATE_ESTABLISHED: 2286 if (st->bucket > tcp_hashinfo.ehash_mask) 2287 break; 2288 rc = established_get_first(seq); 2289 while (offset-- && rc) 2290 rc = established_get_next(seq, rc); 2291 } 2292 2293 st->num = orig_num; 2294 2295 return rc; 2296 } 2297 2298 void *tcp_seq_start(struct seq_file *seq, loff_t *pos) 2299 { 2300 struct tcp_iter_state *st = seq->private; 2301 void *rc; 2302 2303 if (*pos && *pos == st->last_pos) { 2304 rc = tcp_seek_last_pos(seq); 2305 if (rc) 2306 goto out; 2307 } 2308 2309 st->state = TCP_SEQ_STATE_LISTENING; 2310 st->num = 0; 2311 st->bucket = 0; 2312 st->offset = 0; 2313 rc = *pos ? 
tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; 2314 2315 out: 2316 st->last_pos = *pos; 2317 return rc; 2318 } 2319 EXPORT_SYMBOL(tcp_seq_start); 2320 2321 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2322 { 2323 struct tcp_iter_state *st = seq->private; 2324 void *rc = NULL; 2325 2326 if (v == SEQ_START_TOKEN) { 2327 rc = tcp_get_idx(seq, 0); 2328 goto out; 2329 } 2330 2331 switch (st->state) { 2332 case TCP_SEQ_STATE_LISTENING: 2333 rc = listening_get_next(seq, v); 2334 if (!rc) { 2335 st->state = TCP_SEQ_STATE_ESTABLISHED; 2336 st->bucket = 0; 2337 st->offset = 0; 2338 rc = established_get_first(seq); 2339 } 2340 break; 2341 case TCP_SEQ_STATE_ESTABLISHED: 2342 rc = established_get_next(seq, v); 2343 break; 2344 } 2345 out: 2346 ++*pos; 2347 st->last_pos = *pos; 2348 return rc; 2349 } 2350 EXPORT_SYMBOL(tcp_seq_next); 2351 2352 void tcp_seq_stop(struct seq_file *seq, void *v) 2353 { 2354 struct tcp_iter_state *st = seq->private; 2355 2356 switch (st->state) { 2357 case TCP_SEQ_STATE_LISTENING: 2358 if (v != SEQ_START_TOKEN) 2359 spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock); 2360 break; 2361 case TCP_SEQ_STATE_ESTABLISHED: 2362 if (v) 2363 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); 2364 break; 2365 } 2366 } 2367 EXPORT_SYMBOL(tcp_seq_stop); 2368 2369 static void get_openreq4(const struct request_sock *req, 2370 struct seq_file *f, int i) 2371 { 2372 const struct inet_request_sock *ireq = inet_rsk(req); 2373 long delta = req->rsk_timer.expires - jiffies; 2374 2375 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2376 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK", 2377 i, 2378 ireq->ir_loc_addr, 2379 ireq->ir_num, 2380 ireq->ir_rmt_addr, 2381 ntohs(ireq->ir_rmt_port), 2382 TCP_SYN_RECV, 2383 0, 0, /* could print option size, but that is af dependent. */ 2384 1, /* timers active (only the expire timer) */ 2385 jiffies_delta_to_clock_t(delta), 2386 req->num_timeout, 2387 from_kuid_munged(seq_user_ns(f), 2388 sock_i_uid(req->rsk_listener)), 2389 0, /* non standard timer */ 2390 0, /* open_requests have no inode */ 2391 0, 2392 req); 2393 } 2394 2395 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) 2396 { 2397 int timer_active; 2398 unsigned long timer_expires; 2399 const struct tcp_sock *tp = tcp_sk(sk); 2400 const struct inet_connection_sock *icsk = inet_csk(sk); 2401 const struct inet_sock *inet = inet_sk(sk); 2402 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; 2403 __be32 dest = inet->inet_daddr; 2404 __be32 src = inet->inet_rcv_saddr; 2405 __u16 destp = ntohs(inet->inet_dport); 2406 __u16 srcp = ntohs(inet->inet_sport); 2407 int rx_queue; 2408 int state; 2409 2410 if (icsk->icsk_pending == ICSK_TIME_RETRANS || 2411 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || 2412 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { 2413 timer_active = 1; 2414 timer_expires = icsk->icsk_timeout; 2415 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { 2416 timer_active = 4; 2417 timer_expires = icsk->icsk_timeout; 2418 } else if (timer_pending(&sk->sk_timer)) { 2419 timer_active = 2; 2420 timer_expires = sk->sk_timer.expires; 2421 } else { 2422 timer_active = 0; 2423 timer_expires = jiffies; 2424 } 2425 2426 state = inet_sk_state_load(sk); 2427 if (state == TCP_LISTEN) 2428 rx_queue = sk->sk_ack_backlog; 2429 else 2430 /* Because we don't lock the socket, 2431 * we might find a transient negative value. 
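* The max_t() below clamps it to zero so the rx_queue column does not report a negative difference as a huge value.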
2432 */ 2433 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0); 2434 2435 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " 2436 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d", 2437 i, src, srcp, dest, destp, state, 2438 tp->write_seq - tp->snd_una, 2439 rx_queue, 2440 timer_active, 2441 jiffies_delta_to_clock_t(timer_expires - jiffies), 2442 icsk->icsk_retransmits, 2443 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)), 2444 icsk->icsk_probes_out, 2445 sock_i_ino(sk), 2446 refcount_read(&sk->sk_refcnt), sk, 2447 jiffies_to_clock_t(icsk->icsk_rto), 2448 jiffies_to_clock_t(icsk->icsk_ack.ato), 2449 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk), 2450 tp->snd_cwnd, 2451 state == TCP_LISTEN ? 2452 fastopenq->max_qlen : 2453 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh)); 2454 } 2455 2456 static void get_timewait4_sock(const struct inet_timewait_sock *tw, 2457 struct seq_file *f, int i) 2458 { 2459 long delta = tw->tw_timer.expires - jiffies; 2460 __be32 dest, src; 2461 __u16 destp, srcp; 2462 2463 dest = tw->tw_daddr; 2464 src = tw->tw_rcv_saddr; 2465 destp = ntohs(tw->tw_dport); 2466 srcp = ntohs(tw->tw_sport); 2467 2468 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2469 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK", 2470 i, src, srcp, dest, destp, tw->tw_substate, 0, 0, 2471 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, 2472 refcount_read(&tw->tw_refcnt), tw); 2473 } 2474 2475 #define TMPSZ 150 2476 2477 static int tcp4_seq_show(struct seq_file *seq, void *v) 2478 { 2479 struct tcp_iter_state *st; 2480 struct sock *sk = v; 2481 2482 seq_setwidth(seq, TMPSZ - 1); 2483 if (v == SEQ_START_TOKEN) { 2484 seq_puts(seq, " sl local_address rem_address st tx_queue " 2485 "rx_queue tr tm->when retrnsmt uid timeout " 2486 "inode"); 2487 goto out; 2488 } 2489 st = seq->private; 2490 2491 if (sk->sk_state == TCP_TIME_WAIT) 2492 get_timewait4_sock(v, seq, st->num); 2493 else if (sk->sk_state == TCP_NEW_SYN_RECV) 2494 get_openreq4(v, seq, st->num); 2495 else 2496 get_tcp4_sock(v, seq, st->num); 2497 out: 2498 seq_pad(seq, '\n'); 2499 return 0; 2500 } 2501 2502 static const struct seq_operations tcp4_seq_ops = { 2503 .show = tcp4_seq_show, 2504 .start = tcp_seq_start, 2505 .next = tcp_seq_next, 2506 .stop = tcp_seq_stop, 2507 }; 2508 2509 static struct tcp_seq_afinfo tcp4_seq_afinfo = { 2510 .family = AF_INET, 2511 }; 2512 2513 static int __net_init tcp4_proc_init_net(struct net *net) 2514 { 2515 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops, 2516 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo)) 2517 return -ENOMEM; 2518 return 0; 2519 } 2520 2521 static void __net_exit tcp4_proc_exit_net(struct net *net) 2522 { 2523 remove_proc_entry("tcp", net->proc_net); 2524 } 2525 2526 static struct pernet_operations tcp4_net_ops = { 2527 .init = tcp4_proc_init_net, 2528 .exit = tcp4_proc_exit_net, 2529 }; 2530 2531 int __init tcp4_proc_init(void) 2532 { 2533 return register_pernet_subsys(&tcp4_net_ops); 2534 } 2535 2536 void tcp4_proc_exit(void) 2537 { 2538 unregister_pernet_subsys(&tcp4_net_ops); 2539 } 2540 #endif /* CONFIG_PROC_FS */ 2541 2542 struct proto tcp_prot = { 2543 .name = "TCP", 2544 .owner = THIS_MODULE, 2545 .close = tcp_close, 2546 .pre_connect = tcp_v4_pre_connect, 2547 .connect = tcp_v4_connect, 2548 .disconnect = tcp_disconnect, 2549 .accept = inet_csk_accept, 2550 .ioctl = tcp_ioctl, 2551 .init = tcp_v4_init_sock, 2552 .destroy = tcp_v4_destroy_sock, 2553 .shutdown = tcp_shutdown, 2554 .setsockopt = tcp_setsockopt, 2555 .getsockopt 
= tcp_getsockopt, 2556 .keepalive = tcp_set_keepalive, 2557 .recvmsg = tcp_recvmsg, 2558 .sendmsg = tcp_sendmsg, 2559 .sendpage = tcp_sendpage, 2560 .backlog_rcv = tcp_v4_do_rcv, 2561 .release_cb = tcp_release_cb, 2562 .hash = inet_hash, 2563 .unhash = inet_unhash, 2564 .get_port = inet_csk_get_port, 2565 .enter_memory_pressure = tcp_enter_memory_pressure, 2566 .leave_memory_pressure = tcp_leave_memory_pressure, 2567 .stream_memory_free = tcp_stream_memory_free, 2568 .sockets_allocated = &tcp_sockets_allocated, 2569 .orphan_count = &tcp_orphan_count, 2570 .memory_allocated = &tcp_memory_allocated, 2571 .memory_pressure = &tcp_memory_pressure, 2572 .sysctl_mem = sysctl_tcp_mem, 2573 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), 2574 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), 2575 .max_header = MAX_TCP_HEADER, 2576 .obj_size = sizeof(struct tcp_sock), 2577 .slab_flags = SLAB_TYPESAFE_BY_RCU, 2578 .twsk_prot = &tcp_timewait_sock_ops, 2579 .rsk_prot = &tcp_request_sock_ops, 2580 .h.hashinfo = &tcp_hashinfo, 2581 .no_autobind = true, 2582 #ifdef CONFIG_COMPAT 2583 .compat_setsockopt = compat_tcp_setsockopt, 2584 .compat_getsockopt = compat_tcp_getsockopt, 2585 #endif 2586 .diag_destroy = tcp_abort, 2587 }; 2588 EXPORT_SYMBOL(tcp_prot); 2589 2590 static void __net_exit tcp_sk_exit(struct net *net) 2591 { 2592 int cpu; 2593 2594 if (net->ipv4.tcp_congestion_control) 2595 module_put(net->ipv4.tcp_congestion_control->owner); 2596 2597 for_each_possible_cpu(cpu) 2598 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu)); 2599 free_percpu(net->ipv4.tcp_sk); 2600 } 2601 2602 static int __net_init tcp_sk_init(struct net *net) 2603 { 2604 int res, cpu, cnt; 2605 2606 net->ipv4.tcp_sk = alloc_percpu(struct sock *); 2607 if (!net->ipv4.tcp_sk) 2608 return -ENOMEM; 2609 2610 for_each_possible_cpu(cpu) { 2611 struct sock *sk; 2612 2613 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW, 2614 IPPROTO_TCP, net); 2615 if (res) 2616 goto fail; 2617 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); 2618 2619 /* Please enforce IP_DF and IPID==0 for RST and 2620 * ACK sent in SYN-RECV and TIME-WAIT state. 
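* IP_PMTUDISC_DO below gets DF set on those control packets, and with DF set this unconnected control socket ends up with a zero IP ID in the IPv4 output path.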
2621 */ 2622 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO; 2623 2624 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk; 2625 } 2626 2627 net->ipv4.sysctl_tcp_ecn = 2; 2628 net->ipv4.sysctl_tcp_ecn_fallback = 1; 2629 2630 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; 2631 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; 2632 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL; 2633 2634 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME; 2635 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES; 2636 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL; 2637 2638 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES; 2639 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES; 2640 net->ipv4.sysctl_tcp_syncookies = 1; 2641 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH; 2642 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1; 2643 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2; 2644 net->ipv4.sysctl_tcp_orphan_retries = 0; 2645 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; 2646 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; 2647 net->ipv4.sysctl_tcp_tw_reuse = 2; 2648 2649 cnt = tcp_hashinfo.ehash_mask + 1; 2650 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2; 2651 net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo; 2652 2653 net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256); 2654 net->ipv4.sysctl_tcp_sack = 1; 2655 net->ipv4.sysctl_tcp_window_scaling = 1; 2656 net->ipv4.sysctl_tcp_timestamps = 1; 2657 net->ipv4.sysctl_tcp_early_retrans = 3; 2658 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION; 2659 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */ 2660 net->ipv4.sysctl_tcp_retrans_collapse = 1; 2661 net->ipv4.sysctl_tcp_max_reordering = 300; 2662 net->ipv4.sysctl_tcp_dsack = 1; 2663 net->ipv4.sysctl_tcp_app_win = 31; 2664 net->ipv4.sysctl_tcp_adv_win_scale = 1; 2665 net->ipv4.sysctl_tcp_frto = 2; 2666 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1; 2667 /* This limits the percentage of the congestion window which we 2668 * will allow a single TSO frame to consume. Building TSO frames 2669 * which are too large can cause TCP streams to be bursty. 
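* With the default divisor of 3 set below, a single TSO frame is limited to roughly one third of the current congestion window.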
2670 */ 2671 net->ipv4.sysctl_tcp_tso_win_divisor = 3; 2672 /* Default TSQ limit of 16 TSO segments */ 2673 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536; 2674 /* rfc5961 challenge ack rate limiting */ 2675 net->ipv4.sysctl_tcp_challenge_ack_limit = 1000; 2676 net->ipv4.sysctl_tcp_min_tso_segs = 2; 2677 net->ipv4.sysctl_tcp_min_rtt_wlen = 300; 2678 net->ipv4.sysctl_tcp_autocorking = 1; 2679 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; 2680 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; 2681 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120; 2682 if (net != &init_net) { 2683 memcpy(net->ipv4.sysctl_tcp_rmem, 2684 init_net.ipv4.sysctl_tcp_rmem, 2685 sizeof(init_net.ipv4.sysctl_tcp_rmem)); 2686 memcpy(net->ipv4.sysctl_tcp_wmem, 2687 init_net.ipv4.sysctl_tcp_wmem, 2688 sizeof(init_net.ipv4.sysctl_tcp_wmem)); 2689 } 2690 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC; 2691 net->ipv4.sysctl_tcp_comp_sack_nr = 44; 2692 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; 2693 spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); 2694 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60; 2695 atomic_set(&net->ipv4.tfo_active_disable_times, 0); 2696 2697 /* Reno is always built in */ 2698 if (!net_eq(net, &init_net) && 2699 try_module_get(init_net.ipv4.tcp_congestion_control->owner)) 2700 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control; 2701 else 2702 net->ipv4.tcp_congestion_control = &tcp_reno; 2703 2704 return 0; 2705 fail: 2706 tcp_sk_exit(net); 2707 2708 return res; 2709 } 2710 2711 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) 2712 { 2713 struct net *net; 2714 2715 inet_twsk_purge(&tcp_hashinfo, AF_INET); 2716 2717 list_for_each_entry(net, net_exit_list, exit_list) 2718 tcp_fastopen_ctx_destroy(net); 2719 } 2720 2721 static struct pernet_operations __net_initdata tcp_sk_ops = { 2722 .init = tcp_sk_init, 2723 .exit = tcp_sk_exit, 2724 .exit_batch = tcp_sk_exit_batch, 2725 }; 2726 2727 void __init tcp_v4_init(void) 2728 { 2729 if (register_pernet_subsys(&tcp_sk_ops)) 2730 panic("Failed to create the TCP control socket.\n"); 2731 } 2732