// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
46 */ 47 48 #define pr_fmt(fmt) "TCP: " fmt 49 50 #include <linux/bottom_half.h> 51 #include <linux/types.h> 52 #include <linux/fcntl.h> 53 #include <linux/module.h> 54 #include <linux/random.h> 55 #include <linux/cache.h> 56 #include <linux/jhash.h> 57 #include <linux/init.h> 58 #include <linux/times.h> 59 #include <linux/slab.h> 60 61 #include <net/net_namespace.h> 62 #include <net/icmp.h> 63 #include <net/inet_hashtables.h> 64 #include <net/tcp.h> 65 #include <net/transp_v6.h> 66 #include <net/ipv6.h> 67 #include <net/inet_common.h> 68 #include <net/timewait_sock.h> 69 #include <net/xfrm.h> 70 #include <net/secure_seq.h> 71 #include <net/busy_poll.h> 72 73 #include <linux/inet.h> 74 #include <linux/ipv6.h> 75 #include <linux/stddef.h> 76 #include <linux/proc_fs.h> 77 #include <linux/seq_file.h> 78 #include <linux/inetdevice.h> 79 #include <linux/btf_ids.h> 80 81 #include <crypto/hash.h> 82 #include <linux/scatterlist.h> 83 84 #include <trace/events/tcp.h> 85 86 #ifdef CONFIG_TCP_MD5SIG 87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, 88 __be32 daddr, __be32 saddr, const struct tcphdr *th); 89 #endif 90 91 struct inet_hashinfo tcp_hashinfo; 92 EXPORT_SYMBOL(tcp_hashinfo); 93 94 static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk); 95 96 static u32 tcp_v4_init_seq(const struct sk_buff *skb) 97 { 98 return secure_tcp_seq(ip_hdr(skb)->daddr, 99 ip_hdr(skb)->saddr, 100 tcp_hdr(skb)->dest, 101 tcp_hdr(skb)->source); 102 } 103 104 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb) 105 { 106 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr); 107 } 108 109 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) 110 { 111 const struct inet_timewait_sock *tw = inet_twsk(sktw); 112 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw); 113 struct tcp_sock *tp = tcp_sk(sk); 114 int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse; 115 116 if (reuse == 2) { 117 /* Still does not detect 
*everything* that goes through 118 * lo, since we require a loopback src or dst address 119 * or direct binding to 'lo' interface. 120 */ 121 bool loopback = false; 122 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX) 123 loopback = true; 124 #if IS_ENABLED(CONFIG_IPV6) 125 if (tw->tw_family == AF_INET6) { 126 if (ipv6_addr_loopback(&tw->tw_v6_daddr) || 127 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) || 128 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) || 129 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr)) 130 loopback = true; 131 } else 132 #endif 133 { 134 if (ipv4_is_loopback(tw->tw_daddr) || 135 ipv4_is_loopback(tw->tw_rcv_saddr)) 136 loopback = true; 137 } 138 if (!loopback) 139 reuse = 0; 140 } 141 142 /* With PAWS, it is safe from the viewpoint 143 of data integrity. Even without PAWS it is safe provided sequence 144 spaces do not overlap i.e. at data rates <= 80Mbit/sec. 145 146 Actually, the idea is close to VJ's one, only timestamp cache is 147 held not per host, but per port pair and TW bucket is used as state 148 holder. 149 150 If TW bucket has been already destroyed we fall back to VJ's scheme 151 and use initial timestamp retrieved from peer table. 152 */ 153 if (tcptw->tw_ts_recent_stamp && 154 (!twp || (reuse && time_after32(ktime_get_seconds(), 155 tcptw->tw_ts_recent_stamp)))) { 156 /* In case of repair and re-using TIME-WAIT sockets we still 157 * want to be sure that it is safe as above but honor the 158 * sequence numbers and time stamps set as part of the repair 159 * process. 160 * 161 * Without this check re-using a TIME-WAIT socket with TCP 162 * repair would accumulate a -1 on the repair assigned 163 * sequence number. The first time it is reused the sequence 164 * is -1, the second time -2, etc. This fixes that issue 165 * without appearing to create any others. 
166 */ 167 if (likely(!tp->repair)) { 168 u32 seq = tcptw->tw_snd_nxt + 65535 + 2; 169 170 if (!seq) 171 seq = 1; 172 WRITE_ONCE(tp->write_seq, seq); 173 tp->rx_opt.ts_recent = tcptw->tw_ts_recent; 174 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; 175 } 176 sock_hold(sktw); 177 return 1; 178 } 179 180 return 0; 181 } 182 EXPORT_SYMBOL_GPL(tcp_twsk_unique); 183 184 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr, 185 int addr_len) 186 { 187 /* This check is replicated from tcp_v4_connect() and intended to 188 * prevent BPF program called below from accessing bytes that are out 189 * of the bound specified by user in addr_len. 190 */ 191 if (addr_len < sizeof(struct sockaddr_in)) 192 return -EINVAL; 193 194 sock_owned_by_me(sk); 195 196 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr); 197 } 198 199 /* This will initiate an outgoing connection. */ 200 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) 201 { 202 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; 203 struct inet_sock *inet = inet_sk(sk); 204 struct tcp_sock *tp = tcp_sk(sk); 205 __be16 orig_sport, orig_dport; 206 __be32 daddr, nexthop; 207 struct flowi4 *fl4; 208 struct rtable *rt; 209 int err; 210 struct ip_options_rcu *inet_opt; 211 struct inet_timewait_death_row *tcp_death_row = sock_net(sk)->ipv4.tcp_death_row; 212 213 if (addr_len < sizeof(struct sockaddr_in)) 214 return -EINVAL; 215 216 if (usin->sin_family != AF_INET) 217 return -EAFNOSUPPORT; 218 219 nexthop = daddr = usin->sin_addr.s_addr; 220 inet_opt = rcu_dereference_protected(inet->inet_opt, 221 lockdep_sock_is_held(sk)); 222 if (inet_opt && inet_opt->opt.srr) { 223 if (!daddr) 224 return -EINVAL; 225 nexthop = inet_opt->opt.faddr; 226 } 227 228 orig_sport = inet->inet_sport; 229 orig_dport = usin->sin_port; 230 fl4 = &inet->cork.fl.u.ip4; 231 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr, 232 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, 233 IPPROTO_TCP, 234 orig_sport, 
orig_dport, sk); 235 if (IS_ERR(rt)) { 236 err = PTR_ERR(rt); 237 if (err == -ENETUNREACH) 238 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); 239 return err; 240 } 241 242 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { 243 ip_rt_put(rt); 244 return -ENETUNREACH; 245 } 246 247 if (!inet_opt || !inet_opt->opt.srr) 248 daddr = fl4->daddr; 249 250 if (!inet->inet_saddr) 251 inet->inet_saddr = fl4->saddr; 252 sk_rcv_saddr_set(sk, inet->inet_saddr); 253 254 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) { 255 /* Reset inherited state */ 256 tp->rx_opt.ts_recent = 0; 257 tp->rx_opt.ts_recent_stamp = 0; 258 if (likely(!tp->repair)) 259 WRITE_ONCE(tp->write_seq, 0); 260 } 261 262 inet->inet_dport = usin->sin_port; 263 sk_daddr_set(sk, daddr); 264 265 inet_csk(sk)->icsk_ext_hdr_len = 0; 266 if (inet_opt) 267 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 268 269 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT; 270 271 /* Socket identity is still unknown (sport may be zero). 272 * However we set state to SYN-SENT and not releasing socket 273 * lock select source port, enter ourselves into the hash tables and 274 * complete initialization after this. 275 */ 276 tcp_set_state(sk, TCP_SYN_SENT); 277 err = inet_hash_connect(tcp_death_row, sk); 278 if (err) 279 goto failure; 280 281 sk_set_txhash(sk); 282 283 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport, 284 inet->inet_sport, inet->inet_dport, sk); 285 if (IS_ERR(rt)) { 286 err = PTR_ERR(rt); 287 rt = NULL; 288 goto failure; 289 } 290 /* OK, now commit destination to socket. 
*/ 291 sk->sk_gso_type = SKB_GSO_TCPV4; 292 sk_setup_caps(sk, &rt->dst); 293 rt = NULL; 294 295 if (likely(!tp->repair)) { 296 if (!tp->write_seq) 297 WRITE_ONCE(tp->write_seq, 298 secure_tcp_seq(inet->inet_saddr, 299 inet->inet_daddr, 300 inet->inet_sport, 301 usin->sin_port)); 302 tp->tsoffset = secure_tcp_ts_off(sock_net(sk), 303 inet->inet_saddr, 304 inet->inet_daddr); 305 } 306 307 inet->inet_id = prandom_u32(); 308 309 if (tcp_fastopen_defer_connect(sk, &err)) 310 return err; 311 if (err) 312 goto failure; 313 314 err = tcp_connect(sk); 315 316 if (err) 317 goto failure; 318 319 return 0; 320 321 failure: 322 /* 323 * This unhashes the socket and releases the local port, 324 * if necessary. 325 */ 326 tcp_set_state(sk, TCP_CLOSE); 327 ip_rt_put(rt); 328 sk->sk_route_caps = 0; 329 inet->inet_dport = 0; 330 return err; 331 } 332 EXPORT_SYMBOL(tcp_v4_connect); 333 334 /* 335 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191. 336 * It can be called through tcp_release_cb() if socket was owned by user 337 * at the time tcp_v4_err() was called to handle ICMP message. 338 */ 339 void tcp_v4_mtu_reduced(struct sock *sk) 340 { 341 struct inet_sock *inet = inet_sk(sk); 342 struct dst_entry *dst; 343 u32 mtu; 344 345 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) 346 return; 347 mtu = READ_ONCE(tcp_sk(sk)->mtu_info); 348 dst = inet_csk_update_pmtu(sk, mtu); 349 if (!dst) 350 return; 351 352 /* Something is about to be wrong... Remember soft error 353 * for the case, if this connection will not able to recover. 354 */ 355 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst)) 356 sk->sk_err_soft = EMSGSIZE; 357 358 mtu = dst_mtu(dst); 359 360 if (inet->pmtudisc != IP_PMTUDISC_DONT && 361 ip_sk_accept_pmtu(sk) && 362 inet_csk(sk)->icsk_pmtu_cookie > mtu) { 363 tcp_sync_mss(sk, mtu); 364 365 /* Resend the TCP packet because it's 366 * clear that the old packet has been 367 * dropped. This is the new "fast" path mtu 368 * discovery. 
369 */ 370 tcp_simple_retransmit(sk); 371 } /* else let the usual retransmit timer handle it */ 372 } 373 EXPORT_SYMBOL(tcp_v4_mtu_reduced); 374 375 static void do_redirect(struct sk_buff *skb, struct sock *sk) 376 { 377 struct dst_entry *dst = __sk_dst_check(sk, 0); 378 379 if (dst) 380 dst->ops->redirect(dst, sk, skb); 381 } 382 383 384 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */ 385 void tcp_req_err(struct sock *sk, u32 seq, bool abort) 386 { 387 struct request_sock *req = inet_reqsk(sk); 388 struct net *net = sock_net(sk); 389 390 /* ICMPs are not backlogged, hence we cannot get 391 * an established socket here. 392 */ 393 if (seq != tcp_rsk(req)->snt_isn) { 394 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); 395 } else if (abort) { 396 /* 397 * Still in SYN_RECV, just remove it silently. 398 * There is no good way to pass the error to the newly 399 * created socket, and POSIX does not want network 400 * errors returned from accept(). 401 */ 402 inet_csk_reqsk_queue_drop(req->rsk_listener, req); 403 tcp_listendrop(req->rsk_listener); 404 } 405 reqsk_put(req); 406 } 407 EXPORT_SYMBOL(tcp_req_err); 408 409 /* TCP-LD (RFC 6069) logic */ 410 void tcp_ld_RTO_revert(struct sock *sk, u32 seq) 411 { 412 struct inet_connection_sock *icsk = inet_csk(sk); 413 struct tcp_sock *tp = tcp_sk(sk); 414 struct sk_buff *skb; 415 s32 remaining; 416 u32 delta_us; 417 418 if (sock_owned_by_user(sk)) 419 return; 420 421 if (seq != tp->snd_una || !icsk->icsk_retransmits || 422 !icsk->icsk_backoff) 423 return; 424 425 skb = tcp_rtx_queue_head(sk); 426 if (WARN_ON_ONCE(!skb)) 427 return; 428 429 icsk->icsk_backoff--; 430 icsk->icsk_rto = tp->srtt_us ? 
__tcp_set_rto(tp) : TCP_TIMEOUT_INIT; 431 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); 432 433 tcp_mstamp_refresh(tp); 434 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb)); 435 remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us); 436 437 if (remaining > 0) { 438 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, 439 remaining, TCP_RTO_MAX); 440 } else { 441 /* RTO revert clocked out retransmission. 442 * Will retransmit now. 443 */ 444 tcp_retransmit_timer(sk); 445 } 446 } 447 EXPORT_SYMBOL(tcp_ld_RTO_revert); 448 449 /* 450 * This routine is called by the ICMP module when it gets some 451 * sort of error condition. If err < 0 then the socket should 452 * be closed and the error returned to the user. If err > 0 453 * it's just the icmp type << 8 | icmp code. After adjustment 454 * header points to the first 8 bytes of the tcp header. We need 455 * to find the appropriate port. 456 * 457 * The locking strategy used here is very "optimistic". When 458 * someone else accesses the socket the ICMP is just dropped 459 * and for some paths there is no check at all. 460 * A more general error queue to queue errors for later handling 461 * is probably better. 
462 * 463 */ 464 465 int tcp_v4_err(struct sk_buff *skb, u32 info) 466 { 467 const struct iphdr *iph = (const struct iphdr *)skb->data; 468 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2)); 469 struct tcp_sock *tp; 470 struct inet_sock *inet; 471 const int type = icmp_hdr(skb)->type; 472 const int code = icmp_hdr(skb)->code; 473 struct sock *sk; 474 struct request_sock *fastopen; 475 u32 seq, snd_una; 476 int err; 477 struct net *net = dev_net(skb->dev); 478 479 sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr, 480 th->dest, iph->saddr, ntohs(th->source), 481 inet_iif(skb), 0); 482 if (!sk) { 483 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); 484 return -ENOENT; 485 } 486 if (sk->sk_state == TCP_TIME_WAIT) { 487 inet_twsk_put(inet_twsk(sk)); 488 return 0; 489 } 490 seq = ntohl(th->seq); 491 if (sk->sk_state == TCP_NEW_SYN_RECV) { 492 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB || 493 type == ICMP_TIME_EXCEEDED || 494 (type == ICMP_DEST_UNREACH && 495 (code == ICMP_NET_UNREACH || 496 code == ICMP_HOST_UNREACH))); 497 return 0; 498 } 499 500 bh_lock_sock(sk); 501 /* If too many ICMPs get dropped on busy 502 * servers this needs to be solved differently. 503 * We do take care of PMTU discovery (RFC1191) special case : 504 * we can receive locally generated ICMP messages while socket is held. 505 */ 506 if (sock_owned_by_user(sk)) { 507 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)) 508 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS); 509 } 510 if (sk->sk_state == TCP_CLOSE) 511 goto out; 512 513 if (static_branch_unlikely(&ip4_min_ttl)) { 514 /* min_ttl can be changed concurrently from do_ip_setsockopt() */ 515 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) { 516 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 517 goto out; 518 } 519 } 520 521 tp = tcp_sk(sk); 522 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ 523 fastopen = rcu_dereference(tp->fastopen_rsk); 524 snd_una = fastopen ? 
tcp_rsk(fastopen)->snt_isn : tp->snd_una; 525 if (sk->sk_state != TCP_LISTEN && 526 !between(seq, snd_una, tp->snd_nxt)) { 527 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); 528 goto out; 529 } 530 531 switch (type) { 532 case ICMP_REDIRECT: 533 if (!sock_owned_by_user(sk)) 534 do_redirect(skb, sk); 535 goto out; 536 case ICMP_SOURCE_QUENCH: 537 /* Just silently ignore these. */ 538 goto out; 539 case ICMP_PARAMETERPROB: 540 err = EPROTO; 541 break; 542 case ICMP_DEST_UNREACH: 543 if (code > NR_ICMP_UNREACH) 544 goto out; 545 546 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ 547 /* We are not interested in TCP_LISTEN and open_requests 548 * (SYN-ACKs send out by Linux are always <576bytes so 549 * they should go through unfragmented). 550 */ 551 if (sk->sk_state == TCP_LISTEN) 552 goto out; 553 554 WRITE_ONCE(tp->mtu_info, info); 555 if (!sock_owned_by_user(sk)) { 556 tcp_v4_mtu_reduced(sk); 557 } else { 558 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags)) 559 sock_hold(sk); 560 } 561 goto out; 562 } 563 564 err = icmp_err_convert[code].errno; 565 /* check if this ICMP message allows revert of backoff. 566 * (see RFC 6069) 567 */ 568 if (!fastopen && 569 (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH)) 570 tcp_ld_RTO_revert(sk, seq); 571 break; 572 case ICMP_TIME_EXCEEDED: 573 err = EHOSTUNREACH; 574 break; 575 default: 576 goto out; 577 } 578 579 switch (sk->sk_state) { 580 case TCP_SYN_SENT: 581 case TCP_SYN_RECV: 582 /* Only in fast or simultaneous open. If a fast open socket is 583 * already accepted it is treated as a connected one below. 
584 */ 585 if (fastopen && !fastopen->sk) 586 break; 587 588 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th); 589 590 if (!sock_owned_by_user(sk)) { 591 sk->sk_err = err; 592 593 sk_error_report(sk); 594 595 tcp_done(sk); 596 } else { 597 sk->sk_err_soft = err; 598 } 599 goto out; 600 } 601 602 /* If we've already connected we will keep trying 603 * until we time out, or the user gives up. 604 * 605 * rfc1122 4.2.3.9 allows to consider as hard errors 606 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too, 607 * but it is obsoleted by pmtu discovery). 608 * 609 * Note, that in modern internet, where routing is unreliable 610 * and in each dark corner broken firewalls sit, sending random 611 * errors ordered by their masters even this two messages finally lose 612 * their original sense (even Linux sends invalid PORT_UNREACHs) 613 * 614 * Now we are in compliance with RFCs. 615 * --ANK (980905) 616 */ 617 618 inet = inet_sk(sk); 619 if (!sock_owned_by_user(sk) && inet->recverr) { 620 sk->sk_err = err; 621 sk_error_report(sk); 622 } else { /* Only an error on timeout */ 623 sk->sk_err_soft = err; 624 } 625 626 out: 627 bh_unlock_sock(sk); 628 sock_put(sk); 629 return 0; 630 } 631 632 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr) 633 { 634 struct tcphdr *th = tcp_hdr(skb); 635 636 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0); 637 skb->csum_start = skb_transport_header(skb) - skb->head; 638 skb->csum_offset = offsetof(struct tcphdr, check); 639 } 640 641 /* This routine computes an IPv4 TCP checksum. */ 642 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb) 643 { 644 const struct inet_sock *inet = inet_sk(sk); 645 646 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr); 647 } 648 EXPORT_SYMBOL(tcp_v4_send_check); 649 650 /* 651 * This routine will send an RST to the other tcp. 652 * 653 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.) 654 * for reset. 
655 * Answer: if a packet caused RST, it is not for a socket 656 * existing in our system, if it is matched to a socket, 657 * it is just duplicate segment or bug in other side's TCP. 658 * So that we build reply only basing on parameters 659 * arrived with segment. 660 * Exception: precedence violation. We do not implement it in any case. 661 */ 662 663 #ifdef CONFIG_TCP_MD5SIG 664 #define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED 665 #else 666 #define OPTION_BYTES sizeof(__be32) 667 #endif 668 669 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) 670 { 671 const struct tcphdr *th = tcp_hdr(skb); 672 struct { 673 struct tcphdr th; 674 __be32 opt[OPTION_BYTES / sizeof(__be32)]; 675 } rep; 676 struct ip_reply_arg arg; 677 #ifdef CONFIG_TCP_MD5SIG 678 struct tcp_md5sig_key *key = NULL; 679 const __u8 *hash_location = NULL; 680 unsigned char newhash[16]; 681 int genhash; 682 struct sock *sk1 = NULL; 683 #endif 684 u64 transmit_time = 0; 685 struct sock *ctl_sk; 686 struct net *net; 687 688 /* Never send a reset in response to a reset. */ 689 if (th->rst) 690 return; 691 692 /* If sk not NULL, it means we did a successful lookup and incoming 693 * route had to be correct. prequeue might have dropped our dst. 694 */ 695 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL) 696 return; 697 698 /* Swap the send and the receive. */ 699 memset(&rep, 0, sizeof(rep)); 700 rep.th.dest = th->source; 701 rep.th.source = th->dest; 702 rep.th.doff = sizeof(struct tcphdr) / 4; 703 rep.th.rst = 1; 704 705 if (th->ack) { 706 rep.th.seq = th->ack_seq; 707 } else { 708 rep.th.ack = 1; 709 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin + 710 skb->len - (th->doff << 2)); 711 } 712 713 memset(&arg, 0, sizeof(arg)); 714 arg.iov[0].iov_base = (unsigned char *)&rep; 715 arg.iov[0].iov_len = sizeof(rep.th); 716 717 net = sk ? 
sock_net(sk) : dev_net(skb_dst(skb)->dev); 718 #ifdef CONFIG_TCP_MD5SIG 719 rcu_read_lock(); 720 hash_location = tcp_parse_md5sig_option(th); 721 if (sk && sk_fullsock(sk)) { 722 const union tcp_md5_addr *addr; 723 int l3index; 724 725 /* sdif set, means packet ingressed via a device 726 * in an L3 domain and inet_iif is set to it. 727 */ 728 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; 729 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 730 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 731 } else if (hash_location) { 732 const union tcp_md5_addr *addr; 733 int sdif = tcp_v4_sdif(skb); 734 int dif = inet_iif(skb); 735 int l3index; 736 737 /* 738 * active side is lost. Try to find listening socket through 739 * source port, and then find md5 key through listening socket. 740 * we are not loose security here: 741 * Incoming packet is checked with md5 hash with finding key, 742 * no RST generated if md5 hash doesn't match. 743 */ 744 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0, 745 ip_hdr(skb)->saddr, 746 th->source, ip_hdr(skb)->daddr, 747 ntohs(th->source), dif, sdif); 748 /* don't send rst if it can't find key */ 749 if (!sk1) 750 goto out; 751 752 /* sdif set, means packet ingressed via a device 753 * in an L3 domain and dif is set to it. 754 */ 755 l3index = sdif ? 
dif : 0; 756 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 757 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET); 758 if (!key) 759 goto out; 760 761 762 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb); 763 if (genhash || memcmp(hash_location, newhash, 16) != 0) 764 goto out; 765 766 } 767 768 if (key) { 769 rep.opt[0] = htonl((TCPOPT_NOP << 24) | 770 (TCPOPT_NOP << 16) | 771 (TCPOPT_MD5SIG << 8) | 772 TCPOLEN_MD5SIG); 773 /* Update length and the length the header thinks exists */ 774 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; 775 rep.th.doff = arg.iov[0].iov_len / 4; 776 777 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1], 778 key, ip_hdr(skb)->saddr, 779 ip_hdr(skb)->daddr, &rep.th); 780 } 781 #endif 782 /* Can't co-exist with TCPMD5, hence check rep.opt[0] */ 783 if (rep.opt[0] == 0) { 784 __be32 mrst = mptcp_reset_option(skb); 785 786 if (mrst) { 787 rep.opt[0] = mrst; 788 arg.iov[0].iov_len += sizeof(mrst); 789 rep.th.doff = arg.iov[0].iov_len / 4; 790 } 791 } 792 793 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 794 ip_hdr(skb)->saddr, /* XXX */ 795 arg.iov[0].iov_len, IPPROTO_TCP, 0); 796 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 797 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0; 798 799 /* When socket is gone, all binding information is lost. 800 * routing might fail in this case. No choice here, if we choose to force 801 * input interface, we will misroute in case of asymmetric route. 802 */ 803 if (sk) { 804 arg.bound_dev_if = sk->sk_bound_dev_if; 805 if (sk_fullsock(sk)) 806 trace_tcp_send_reset(sk, skb); 807 } 808 809 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) != 810 offsetof(struct inet_timewait_sock, tw_bound_dev_if)); 811 812 arg.tos = ip_hdr(skb)->tos; 813 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); 814 local_bh_disable(); 815 ctl_sk = this_cpu_read(ipv4_tcp_sk); 816 sock_net_set(ctl_sk, net); 817 if (sk) { 818 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 
819 inet_twsk(sk)->tw_mark : sk->sk_mark; 820 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? 821 inet_twsk(sk)->tw_priority : sk->sk_priority; 822 transmit_time = tcp_transmit_time(sk); 823 } 824 ip_send_unicast_reply(ctl_sk, 825 skb, &TCP_SKB_CB(skb)->header.h4.opt, 826 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 827 &arg, arg.iov[0].iov_len, 828 transmit_time); 829 830 ctl_sk->sk_mark = 0; 831 sock_net_set(ctl_sk, &init_net); 832 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); 833 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS); 834 local_bh_enable(); 835 836 #ifdef CONFIG_TCP_MD5SIG 837 out: 838 rcu_read_unlock(); 839 #endif 840 } 841 842 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states 843 outside socket context is ugly, certainly. What can I do? 844 */ 845 846 static void tcp_v4_send_ack(const struct sock *sk, 847 struct sk_buff *skb, u32 seq, u32 ack, 848 u32 win, u32 tsval, u32 tsecr, int oif, 849 struct tcp_md5sig_key *key, 850 int reply_flags, u8 tos) 851 { 852 const struct tcphdr *th = tcp_hdr(skb); 853 struct { 854 struct tcphdr th; 855 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2) 856 #ifdef CONFIG_TCP_MD5SIG 857 + (TCPOLEN_MD5SIG_ALIGNED >> 2) 858 #endif 859 ]; 860 } rep; 861 struct net *net = sock_net(sk); 862 struct ip_reply_arg arg; 863 struct sock *ctl_sk; 864 u64 transmit_time; 865 866 memset(&rep.th, 0, sizeof(struct tcphdr)); 867 memset(&arg, 0, sizeof(arg)); 868 869 arg.iov[0].iov_base = (unsigned char *)&rep; 870 arg.iov[0].iov_len = sizeof(rep.th); 871 if (tsecr) { 872 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | 873 (TCPOPT_TIMESTAMP << 8) | 874 TCPOLEN_TIMESTAMP); 875 rep.opt[1] = htonl(tsval); 876 rep.opt[2] = htonl(tsecr); 877 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED; 878 } 879 880 /* Swap the send and the receive. 
*/ 881 rep.th.dest = th->source; 882 rep.th.source = th->dest; 883 rep.th.doff = arg.iov[0].iov_len / 4; 884 rep.th.seq = htonl(seq); 885 rep.th.ack_seq = htonl(ack); 886 rep.th.ack = 1; 887 rep.th.window = htons(win); 888 889 #ifdef CONFIG_TCP_MD5SIG 890 if (key) { 891 int offset = (tsecr) ? 3 : 0; 892 893 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) | 894 (TCPOPT_NOP << 16) | 895 (TCPOPT_MD5SIG << 8) | 896 TCPOLEN_MD5SIG); 897 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; 898 rep.th.doff = arg.iov[0].iov_len/4; 899 900 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset], 901 key, ip_hdr(skb)->saddr, 902 ip_hdr(skb)->daddr, &rep.th); 903 } 904 #endif 905 arg.flags = reply_flags; 906 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 907 ip_hdr(skb)->saddr, /* XXX */ 908 arg.iov[0].iov_len, IPPROTO_TCP, 0); 909 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 910 if (oif) 911 arg.bound_dev_if = oif; 912 arg.tos = tos; 913 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL); 914 local_bh_disable(); 915 ctl_sk = this_cpu_read(ipv4_tcp_sk); 916 sock_net_set(ctl_sk, net); 917 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 918 inet_twsk(sk)->tw_mark : sk->sk_mark; 919 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? 
920 inet_twsk(sk)->tw_priority : sk->sk_priority; 921 transmit_time = tcp_transmit_time(sk); 922 ip_send_unicast_reply(ctl_sk, 923 skb, &TCP_SKB_CB(skb)->header.h4.opt, 924 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 925 &arg, arg.iov[0].iov_len, 926 transmit_time); 927 928 ctl_sk->sk_mark = 0; 929 sock_net_set(ctl_sk, &init_net); 930 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); 931 local_bh_enable(); 932 } 933 934 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) 935 { 936 struct inet_timewait_sock *tw = inet_twsk(sk); 937 struct tcp_timewait_sock *tcptw = tcp_twsk(sk); 938 939 tcp_v4_send_ack(sk, skb, 940 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, 941 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, 942 tcp_time_stamp_raw() + tcptw->tw_ts_offset, 943 tcptw->tw_ts_recent, 944 tw->tw_bound_dev_if, 945 tcp_twsk_md5_key(tcptw), 946 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0, 947 tw->tw_tos 948 ); 949 950 inet_twsk_put(tw); 951 } 952 953 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, 954 struct request_sock *req) 955 { 956 const union tcp_md5_addr *addr; 957 int l3index; 958 959 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV 960 * sk->sk_state == TCP_SYN_RECV -> for Fast Open. 961 */ 962 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 : 963 tcp_sk(sk)->snd_nxt; 964 965 /* RFC 7323 2.3 966 * The window field (SEG.WND) of every outgoing segment, with the 967 * exception of <SYN> segments, MUST be right-shifted by 968 * Rcv.Wind.Shift bits: 969 */ 970 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 971 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; 972 tcp_v4_send_ack(sk, skb, seq, 973 tcp_rsk(req)->rcv_nxt, 974 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, 975 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off, 976 req->ts_recent, 977 0, 978 tcp_md5_do_lookup(sk, l3index, addr, AF_INET), 979 inet_rsk(req)->no_srccheck ? 
IP_REPLY_ARG_NOSRCCHECK : 0, 980 ip_hdr(skb)->tos); 981 } 982 983 /* 984 * Send a SYN-ACK after having received a SYN. 985 * This still operates on a request_sock only, not on a big 986 * socket. 987 */ 988 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, 989 struct flowi *fl, 990 struct request_sock *req, 991 struct tcp_fastopen_cookie *foc, 992 enum tcp_synack_type synack_type, 993 struct sk_buff *syn_skb) 994 { 995 const struct inet_request_sock *ireq = inet_rsk(req); 996 struct flowi4 fl4; 997 int err = -1; 998 struct sk_buff *skb; 999 u8 tos; 1000 1001 /* First, grab a route. */ 1002 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) 1003 return -1; 1004 1005 skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb); 1006 1007 if (skb) { 1008 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); 1009 1010 tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ? 1011 (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) | 1012 (inet_sk(sk)->tos & INET_ECN_MASK) : 1013 inet_sk(sk)->tos; 1014 1015 if (!INET_ECN_is_capable(tos) && 1016 tcp_bpf_ca_needs_ecn((struct sock *)req)) 1017 tos |= INET_ECN_ECT_0; 1018 1019 rcu_read_lock(); 1020 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, 1021 ireq->ir_rmt_addr, 1022 rcu_dereference(ireq->ireq_opt), 1023 tos); 1024 rcu_read_unlock(); 1025 err = net_xmit_eval(err); 1026 } 1027 1028 return err; 1029 } 1030 1031 /* 1032 * IPv4 request_sock destructor. 1033 */ 1034 static void tcp_v4_reqsk_destructor(struct request_sock *req) 1035 { 1036 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1)); 1037 } 1038 1039 #ifdef CONFIG_TCP_MD5SIG 1040 /* 1041 * RFC2385 MD5 checksumming requires a mapping of 1042 * IP address->MD5 Key. 1043 * We need to maintain these in the sk structure. 
1044 */ 1045 1046 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed); 1047 EXPORT_SYMBOL(tcp_md5_needed); 1048 1049 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new) 1050 { 1051 if (!old) 1052 return true; 1053 1054 /* l3index always overrides non-l3index */ 1055 if (old->l3index && new->l3index == 0) 1056 return false; 1057 if (old->l3index == 0 && new->l3index) 1058 return true; 1059 1060 return old->prefixlen < new->prefixlen; 1061 } 1062 1063 /* Find the Key structure for an address. */ 1064 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index, 1065 const union tcp_md5_addr *addr, 1066 int family) 1067 { 1068 const struct tcp_sock *tp = tcp_sk(sk); 1069 struct tcp_md5sig_key *key; 1070 const struct tcp_md5sig_info *md5sig; 1071 __be32 mask; 1072 struct tcp_md5sig_key *best_match = NULL; 1073 bool match; 1074 1075 /* caller either holds rcu_read_lock() or socket lock */ 1076 md5sig = rcu_dereference_check(tp->md5sig_info, 1077 lockdep_sock_is_held(sk)); 1078 if (!md5sig) 1079 return NULL; 1080 1081 hlist_for_each_entry_rcu(key, &md5sig->head, node, 1082 lockdep_sock_is_held(sk)) { 1083 if (key->family != family) 1084 continue; 1085 if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index) 1086 continue; 1087 if (family == AF_INET) { 1088 mask = inet_make_mask(key->prefixlen); 1089 match = (key->addr.a4.s_addr & mask) == 1090 (addr->a4.s_addr & mask); 1091 #if IS_ENABLED(CONFIG_IPV6) 1092 } else if (family == AF_INET6) { 1093 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6, 1094 key->prefixlen); 1095 #endif 1096 } else { 1097 match = false; 1098 } 1099 1100 if (match && better_md5_match(best_match, key)) 1101 best_match = key; 1102 } 1103 return best_match; 1104 } 1105 EXPORT_SYMBOL(__tcp_md5_do_lookup); 1106 1107 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk, 1108 const union tcp_md5_addr *addr, 1109 int family, u8 prefixlen, 1110 int l3index, u8 flags) 1111 { 1112 const 
/* Install an MD5 key on @sk, or refresh an existing one.
 *
 * @addr/@family/@prefixlen/@l3index/@flags identify the entry; @newkey and
 * @newkeylen are the secret, and @gfp is used for every allocation made here.
 *
 * If an entry with exactly the same identity already exists, its key bytes
 * and length are overwritten in place.  Otherwise the per-socket
 * tcp_md5sig_info is created on first use (which also disables GSO on the
 * socket), a zeroed key is allocated, filled in and linked at the head of
 * the RCU list.
 *
 * Returns 0 on success or -ENOMEM if the md5sig_info, the key, or the md5sig
 * pool cannot be allocated.  A md5sig_info published by this call is not
 * rolled back when a later allocation fails.
 *
 * The lockdep annotations below expect the socket lock to be held.
 *
 * This can be called on a newly created socket, from other files
 */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, u8 prefixlen, int l3index, u8 flags,
		   const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
	if (key) {
		/* Pre-existing entry - just update that one.
		 * Note that the key might be used concurrently.
		 * data_race() tells KCSAN that we do not care about
		 * key mismatches, since changing the MD5 key on live flows
		 * can lead to packet drops anyway.
		 */
		data_race(memcpy(key->key, newkey, newkeylen));

		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
		 * Also note that a reader could catch the new key->keylen
		 * value but the old key->key[]; this is the reason we use
		 * __GFP_ZERO at sock_kmalloc() time below these lines.
		 */
		WRITE_ONCE(key->keylen, newkeylen);

		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   lockdep_sock_is_held(sk));
	if (!md5sig) {
		/* First key on this socket: create the list container. */
		md5sig = kmalloc(sizeof(*md5sig), gfp);
		if (!md5sig)
			return -ENOMEM;

		sk_gso_disable(sk);
		INIT_HLIST_HEAD(&md5sig->head);
		/* Publish only after the list head is initialised. */
		rcu_assign_pointer(tp->md5sig_info, md5sig);
	}

	/* Zeroed so that a short key is always followed by zero bytes. */
	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
	if (!key)
		return -ENOMEM;
	if (!tcp_alloc_md5sig_pool()) {
		sock_kfree_s(sk, key, sizeof(*key));
		return -ENOMEM;
	}

	/* Fill in every field before the entry becomes visible to readers. */
	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	key->prefixlen = prefixlen;
	key->l3index = l3index;
	key->flags = flags;
	/* Copy only as many address bytes as @family uses. */
	memcpy(&key->addr, addr,
	       (family == AF_INET6) ? sizeof(struct in6_addr) :
				      sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);
rcu_read_lock(); 1282 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex); 1283 if (dev && netif_is_l3_master(dev)) 1284 l3index = dev->ifindex; 1285 1286 rcu_read_unlock(); 1287 1288 /* ok to reference set/not set outside of rcu; 1289 * right now device MUST be an L3 master 1290 */ 1291 if (!dev || !l3index) 1292 return -EINVAL; 1293 } 1294 1295 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr; 1296 1297 if (!cmd.tcpm_keylen) 1298 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags); 1299 1300 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) 1301 return -EINVAL; 1302 1303 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags, 1304 cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL); 1305 } 1306 1307 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp, 1308 __be32 daddr, __be32 saddr, 1309 const struct tcphdr *th, int nbytes) 1310 { 1311 struct tcp4_pseudohdr *bp; 1312 struct scatterlist sg; 1313 struct tcphdr *_th; 1314 1315 bp = hp->scratch; 1316 bp->saddr = saddr; 1317 bp->daddr = daddr; 1318 bp->pad = 0; 1319 bp->protocol = IPPROTO_TCP; 1320 bp->len = cpu_to_be16(nbytes); 1321 1322 _th = (struct tcphdr *)(bp + 1); 1323 memcpy(_th, th, sizeof(*th)); 1324 _th->check = 0; 1325 1326 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th)); 1327 ahash_request_set_crypt(hp->md5_req, &sg, NULL, 1328 sizeof(*bp) + sizeof(*th)); 1329 return crypto_ahash_update(hp->md5_req); 1330 } 1331 1332 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, 1333 __be32 daddr, __be32 saddr, const struct tcphdr *th) 1334 { 1335 struct tcp_md5sig_pool *hp; 1336 struct ahash_request *req; 1337 1338 hp = tcp_get_md5sig_pool(); 1339 if (!hp) 1340 goto clear_hash_noput; 1341 req = hp->md5_req; 1342 1343 if (crypto_ahash_init(req)) 1344 goto clear_hash; 1345 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2)) 1346 goto clear_hash; 1347 if (tcp_md5_hash_key(hp, key)) 1348 goto clear_hash; 1349 ahash_request_set_crypt(req, NULL, 
md5_hash, 0); 1350 if (crypto_ahash_final(req)) 1351 goto clear_hash; 1352 1353 tcp_put_md5sig_pool(); 1354 return 0; 1355 1356 clear_hash: 1357 tcp_put_md5sig_pool(); 1358 clear_hash_noput: 1359 memset(md5_hash, 0, 16); 1360 return 1; 1361 } 1362 1363 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, 1364 const struct sock *sk, 1365 const struct sk_buff *skb) 1366 { 1367 struct tcp_md5sig_pool *hp; 1368 struct ahash_request *req; 1369 const struct tcphdr *th = tcp_hdr(skb); 1370 __be32 saddr, daddr; 1371 1372 if (sk) { /* valid for establish/request sockets */ 1373 saddr = sk->sk_rcv_saddr; 1374 daddr = sk->sk_daddr; 1375 } else { 1376 const struct iphdr *iph = ip_hdr(skb); 1377 saddr = iph->saddr; 1378 daddr = iph->daddr; 1379 } 1380 1381 hp = tcp_get_md5sig_pool(); 1382 if (!hp) 1383 goto clear_hash_noput; 1384 req = hp->md5_req; 1385 1386 if (crypto_ahash_init(req)) 1387 goto clear_hash; 1388 1389 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len)) 1390 goto clear_hash; 1391 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2)) 1392 goto clear_hash; 1393 if (tcp_md5_hash_key(hp, key)) 1394 goto clear_hash; 1395 ahash_request_set_crypt(req, NULL, md5_hash, 0); 1396 if (crypto_ahash_final(req)) 1397 goto clear_hash; 1398 1399 tcp_put_md5sig_pool(); 1400 return 0; 1401 1402 clear_hash: 1403 tcp_put_md5sig_pool(); 1404 clear_hash_noput: 1405 memset(md5_hash, 0, 16); 1406 return 1; 1407 } 1408 EXPORT_SYMBOL(tcp_v4_md5_hash_skb); 1409 1410 #endif 1411 1412 static void tcp_v4_init_req(struct request_sock *req, 1413 const struct sock *sk_listener, 1414 struct sk_buff *skb) 1415 { 1416 struct inet_request_sock *ireq = inet_rsk(req); 1417 struct net *net = sock_net(sk_listener); 1418 1419 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); 1420 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); 1421 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb)); 1422 } 1423 1424 static struct dst_entry *tcp_v4_route_req(const struct sock 
*sk, 1425 struct sk_buff *skb, 1426 struct flowi *fl, 1427 struct request_sock *req) 1428 { 1429 tcp_v4_init_req(req, sk, skb); 1430 1431 if (security_inet_conn_request(sk, skb, req)) 1432 return NULL; 1433 1434 return inet_csk_route_req(sk, &fl->u.ip4, req); 1435 } 1436 1437 struct request_sock_ops tcp_request_sock_ops __read_mostly = { 1438 .family = PF_INET, 1439 .obj_size = sizeof(struct tcp_request_sock), 1440 .rtx_syn_ack = tcp_rtx_synack, 1441 .send_ack = tcp_v4_reqsk_send_ack, 1442 .destructor = tcp_v4_reqsk_destructor, 1443 .send_reset = tcp_v4_send_reset, 1444 .syn_ack_timeout = tcp_syn_ack_timeout, 1445 }; 1446 1447 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { 1448 .mss_clamp = TCP_MSS_DEFAULT, 1449 #ifdef CONFIG_TCP_MD5SIG 1450 .req_md5_lookup = tcp_v4_md5_lookup, 1451 .calc_md5_hash = tcp_v4_md5_hash_skb, 1452 #endif 1453 #ifdef CONFIG_SYN_COOKIES 1454 .cookie_init_seq = cookie_v4_init_sequence, 1455 #endif 1456 .route_req = tcp_v4_route_req, 1457 .init_seq = tcp_v4_init_seq, 1458 .init_ts_off = tcp_v4_init_ts_off, 1459 .send_synack = tcp_v4_send_synack, 1460 }; 1461 1462 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) 1463 { 1464 /* Never answer to SYNs send to broadcast or multicast */ 1465 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) 1466 goto drop; 1467 1468 return tcp_conn_request(&tcp_request_sock_ops, 1469 &tcp_request_sock_ipv4_ops, sk, skb); 1470 1471 drop: 1472 tcp_listendrop(sk); 1473 return 0; 1474 } 1475 EXPORT_SYMBOL(tcp_v4_conn_request); 1476 1477 1478 /* 1479 * The three way handshake has completed - we got a valid synack - 1480 * now create the new socket. 
/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 *
 * @sk:         listening socket the request belongs to
 * @skb:        packet that triggered creation of the child
 * @req:        request sock describing the connection
 * @dst:        route to use, or NULL to have one looked up here
 * @req_unhash: request to replace in the established hash (passed to
 *              inet_ehash_nolisten()); NULL is treated below as the
 *              syncookie case
 * @own_req:    set to the result of inet_ehash_nolisten()
 *
 * Returns the new child socket, or NULL on failure.  Every failure path
 * accounts a listen drop on @sk; a full accept queue additionally bumps
 * LINUX_MIB_LISTENOVERFLOWS.  On the two failure paths taken before a child
 * exists, @dst is released here.
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req)
{
	struct inet_request_sock *ireq;
	bool found_dup_sk = false;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	const union tcp_md5_addr *addr;
	struct tcp_md5sig_key *key;
	int l3index;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	/* Seed the child's addressing and IP options from the request. */
	newtp = tcp_sk(newsk);
	newinet = inet_sk(newsk);
	ireq = inet_rsk(req);
	sk_daddr_set(newsk, ireq->ir_rmt_addr);
	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
	newsk->sk_bound_dev_if = ireq->ir_iif;
	newinet->inet_saddr = ireq->ir_loc_addr;
	inet_opt = rcu_dereference(ireq->ireq_opt);
	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
	newinet->mc_index = inet_iif(skb);
	newinet->mc_ttl = ip_hdr(skb)->ttl;
	newinet->rcv_tos = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	newinet->inet_id = prandom_u32();

	/* Set ToS of the new socket based upon the value of incoming SYN.
	 * ECT bits are set later in tcp_init_transfer().
	 */
	if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)
		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
	/* Copy over the MD5 key from the original socket */
	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	if (key) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 *
		 * The child always gets a /32 entry for the peer address.
		 * The return value is deliberately ignored.
		 */
		tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags,
			       key->key, key->keylen, GFP_ATOMIC);
		sk_gso_disable(newsk);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
				       &found_dup_sk);
	if (likely(*own_req)) {
		tcp_move_syn(newtp, req);
		/* The options pointer now belongs to the child only. */
		ireq->ireq_opt = NULL;
	} else {
		/* Not ours: the options pointer stays with the request. */
		newinet->inet_opt = NULL;

		if (!req_unhash && found_dup_sk) {
			/* This code path should only be executed in the
			 * syncookie case
			 */
			bh_unlock_sock(newsk);
			sock_put(newsk);
			newsk = NULL;
		}
	}
	return newsk;

exit_overflow:
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	tcp_listendrop(sk);
	return NULL;
put_and_exit:
	/* Detach the options pointer before the child is torn down. */
	newinet->inet_opt = NULL;
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1633 * 1634 * We have a potential double-lock case here, so even when 1635 * doing backlog processing we use the BH locking scheme. 1636 * This is because we cannot sleep with the original spinlock 1637 * held. 1638 */ 1639 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) 1640 { 1641 enum skb_drop_reason reason; 1642 struct sock *rsk; 1643 1644 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ 1645 struct dst_entry *dst; 1646 1647 dst = rcu_dereference_protected(sk->sk_rx_dst, 1648 lockdep_sock_is_held(sk)); 1649 1650 sock_rps_save_rxhash(sk, skb); 1651 sk_mark_napi_id(sk, skb); 1652 if (dst) { 1653 if (sk->sk_rx_dst_ifindex != skb->skb_iif || 1654 !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check, 1655 dst, 0)) { 1656 RCU_INIT_POINTER(sk->sk_rx_dst, NULL); 1657 dst_release(dst); 1658 } 1659 } 1660 tcp_rcv_established(sk, skb); 1661 return 0; 1662 } 1663 1664 reason = SKB_DROP_REASON_NOT_SPECIFIED; 1665 if (tcp_checksum_complete(skb)) 1666 goto csum_err; 1667 1668 if (sk->sk_state == TCP_LISTEN) { 1669 struct sock *nsk = tcp_v4_cookie_check(sk, skb); 1670 1671 if (!nsk) 1672 goto discard; 1673 if (nsk != sk) { 1674 if (tcp_child_process(sk, nsk, skb)) { 1675 rsk = nsk; 1676 goto reset; 1677 } 1678 return 0; 1679 } 1680 } else 1681 sock_rps_save_rxhash(sk, skb); 1682 1683 if (tcp_rcv_state_process(sk, skb)) { 1684 rsk = sk; 1685 goto reset; 1686 } 1687 return 0; 1688 1689 reset: 1690 tcp_v4_send_reset(rsk, skb); 1691 discard: 1692 kfree_skb_reason(skb, reason); 1693 /* Be careful here. If this function gets more complicated and 1694 * gcc suffers from register pressure on the x86, sk (in %ebx) 1695 * might be destroyed here. This current version compiles correctly, 1696 * but you have been warned. 
1697 */ 1698 return 0; 1699 1700 csum_err: 1701 reason = SKB_DROP_REASON_TCP_CSUM; 1702 trace_tcp_bad_csum(skb); 1703 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 1704 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 1705 goto discard; 1706 } 1707 EXPORT_SYMBOL(tcp_v4_do_rcv); 1708 1709 int tcp_v4_early_demux(struct sk_buff *skb) 1710 { 1711 const struct iphdr *iph; 1712 const struct tcphdr *th; 1713 struct sock *sk; 1714 1715 if (skb->pkt_type != PACKET_HOST) 1716 return 0; 1717 1718 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) 1719 return 0; 1720 1721 iph = ip_hdr(skb); 1722 th = tcp_hdr(skb); 1723 1724 if (th->doff < sizeof(struct tcphdr) / 4) 1725 return 0; 1726 1727 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo, 1728 iph->saddr, th->source, 1729 iph->daddr, ntohs(th->dest), 1730 skb->skb_iif, inet_sdif(skb)); 1731 if (sk) { 1732 skb->sk = sk; 1733 skb->destructor = sock_edemux; 1734 if (sk_fullsock(sk)) { 1735 struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst); 1736 1737 if (dst) 1738 dst = dst_check(dst, 0); 1739 if (dst && 1740 sk->sk_rx_dst_ifindex == skb->skb_iif) 1741 skb_dst_set_noref(skb, dst); 1742 } 1743 } 1744 return 0; 1745 } 1746 1747 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, 1748 enum skb_drop_reason *reason) 1749 { 1750 u32 limit, tail_gso_size, tail_gso_segs; 1751 struct skb_shared_info *shinfo; 1752 const struct tcphdr *th; 1753 struct tcphdr *thtail; 1754 struct sk_buff *tail; 1755 unsigned int hdrlen; 1756 bool fragstolen; 1757 u32 gso_segs; 1758 u32 gso_size; 1759 int delta; 1760 1761 /* In case all data was pulled from skb frags (in __pskb_pull_tail()), 1762 * we can fix skb->truesize to its real value to avoid future drops. 1763 * This is valid because skb is not yet charged to the socket. 1764 * It has been noticed pure SACK packets were sometimes dropped 1765 * (if cooked by drivers without copybreak feature). 
1766 */ 1767 skb_condense(skb); 1768 1769 skb_dst_drop(skb); 1770 1771 if (unlikely(tcp_checksum_complete(skb))) { 1772 bh_unlock_sock(sk); 1773 trace_tcp_bad_csum(skb); 1774 *reason = SKB_DROP_REASON_TCP_CSUM; 1775 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 1776 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 1777 return true; 1778 } 1779 1780 /* Attempt coalescing to last skb in backlog, even if we are 1781 * above the limits. 1782 * This is okay because skb capacity is limited to MAX_SKB_FRAGS. 1783 */ 1784 th = (const struct tcphdr *)skb->data; 1785 hdrlen = th->doff * 4; 1786 1787 tail = sk->sk_backlog.tail; 1788 if (!tail) 1789 goto no_coalesce; 1790 thtail = (struct tcphdr *)tail->data; 1791 1792 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq || 1793 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield || 1794 ((TCP_SKB_CB(tail)->tcp_flags | 1795 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) || 1796 !((TCP_SKB_CB(tail)->tcp_flags & 1797 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) || 1798 ((TCP_SKB_CB(tail)->tcp_flags ^ 1799 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) || 1800 #ifdef CONFIG_TLS_DEVICE 1801 tail->decrypted != skb->decrypted || 1802 #endif 1803 thtail->doff != th->doff || 1804 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th))) 1805 goto no_coalesce; 1806 1807 __skb_pull(skb, hdrlen); 1808 1809 shinfo = skb_shinfo(skb); 1810 gso_size = shinfo->gso_size ?: skb->len; 1811 gso_segs = shinfo->gso_segs ?: 1; 1812 1813 shinfo = skb_shinfo(tail); 1814 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen); 1815 tail_gso_segs = shinfo->gso_segs ?: 1; 1816 1817 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) { 1818 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq; 1819 1820 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) { 1821 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq; 1822 thtail->window = th->window; 1823 } 1824 1825 /* We have to update both 
TCP_SKB_CB(tail)->tcp_flags and 1826 * thtail->fin, so that the fast path in tcp_rcv_established() 1827 * is not entered if we append a packet with a FIN. 1828 * SYN, RST, URG are not present. 1829 * ACK is set on both packets. 1830 * PSH : we do not really care in TCP stack, 1831 * at least for 'GRO' packets. 1832 */ 1833 thtail->fin |= th->fin; 1834 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; 1835 1836 if (TCP_SKB_CB(skb)->has_rxtstamp) { 1837 TCP_SKB_CB(tail)->has_rxtstamp = true; 1838 tail->tstamp = skb->tstamp; 1839 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp; 1840 } 1841 1842 /* Not as strict as GRO. We only need to carry mss max value */ 1843 shinfo->gso_size = max(gso_size, tail_gso_size); 1844 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF); 1845 1846 sk->sk_backlog.len += delta; 1847 __NET_INC_STATS(sock_net(sk), 1848 LINUX_MIB_TCPBACKLOGCOALESCE); 1849 kfree_skb_partial(skb, fragstolen); 1850 return false; 1851 } 1852 __skb_push(skb, hdrlen); 1853 1854 no_coalesce: 1855 /* Only socket owner can try to collapse/prune rx queues 1856 * to reduce memory overhead, so add a little headroom here. 1857 * Few sockets backlog are possibly concurrently non empty. 
1858 */ 1859 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf) + 64*1024; 1860 1861 if (unlikely(sk_add_backlog(sk, skb, limit))) { 1862 bh_unlock_sock(sk); 1863 *reason = SKB_DROP_REASON_SOCKET_BACKLOG; 1864 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP); 1865 return true; 1866 } 1867 return false; 1868 } 1869 EXPORT_SYMBOL(tcp_add_backlog); 1870 1871 int tcp_filter(struct sock *sk, struct sk_buff *skb) 1872 { 1873 struct tcphdr *th = (struct tcphdr *)skb->data; 1874 1875 return sk_filter_trim_cap(sk, skb, th->doff * 4); 1876 } 1877 EXPORT_SYMBOL(tcp_filter); 1878 1879 static void tcp_v4_restore_cb(struct sk_buff *skb) 1880 { 1881 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4, 1882 sizeof(struct inet_skb_parm)); 1883 } 1884 1885 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, 1886 const struct tcphdr *th) 1887 { 1888 /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB() 1889 * barrier() makes sure compiler wont play fool^Waliasing games. 
/*
 *	From tcp_input.c
 */

/* IPv4 TCP receive entry point for one incoming @skb.
 *
 * Stages, in order:
 *  - sanity checks on packet type, header length and data offset, plus
 *    checksum initialisation;
 *  - socket lookup (label "lookup");
 *  - special handling for TIME_WAIT sockets (label "do_time_wait") and for
 *    request socks in TCP_NEW_SYN_RECV, which are resolved to their listener
 *    or to a freshly created child;
 *  - per-socket checks: minimum TTL, xfrm policy, MD5 signature, socket
 *    filter;
 *  - delivery: directly via tcp_v4_do_rcv() for listeners and for sockets
 *    not owned by user context, otherwise via tcp_add_backlog().
 *
 * "refcounted" records whether a reference on sk must be dropped before
 * leaving.  Returns the value of tcp_v4_do_rcv() when the segment was
 * processed directly, and 0 on every other path (including all drops).
 */
int tcp_v4_rcv(struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);
	enum skb_drop_reason drop_reason;
	int sdif = inet_sdif(skb);
	int dif = inet_iif(skb);
	const struct iphdr *iph;
	const struct tcphdr *th;
	bool refcounted;
	struct sock *sk;
	int ret;

	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	__TCP_INC_STATS(net, TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = (const struct tcphdr *)skb->data;

	if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
		drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
		goto bad_packet;
	}
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */

	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
		goto csum_error;

	/* Reload the header pointers after the pskb_may_pull() calls above. */
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
lookup:
	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
			       th->dest, sdif, &refcounted);
	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		struct request_sock *req = inet_reqsk(sk);
		bool req_stolen = false;
		struct sock *nsk;

		/* From here on sk is the listener, req the request sock. */
		sk = req->rsk_listener;
		if (unlikely(tcp_inbound_md5_hash(sk, skb, &drop_reason,
						  &iph->saddr, &iph->daddr,
						  AF_INET, dif, sdif))) {
			sk_drops_add(sk, skb);
			reqsk_put(req);
			goto discard_it;
		}
		if (tcp_checksum_complete(skb)) {
			reqsk_put(req);
			goto csum_error;
		}
		if (unlikely(sk->sk_state != TCP_LISTEN)) {
			nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
			if (!nsk) {
				inet_csk_reqsk_queue_drop_and_put(sk, req);
				goto lookup;
			}
			sk = nsk;
			/* reuseport_migrate_sock() has already held one sk_refcnt
			 * before returning.
			 */
		} else {
			/* We own a reference on the listener, increase it again
			 * as we might lose it too soon.
			 */
			sock_hold(sk);
		}
		refcounted = true;
		nsk = NULL;
		if (!tcp_filter(sk, skb)) {
			/* Reload header pointers after the socket filter ran. */
			th = (const struct tcphdr *)skb->data;
			iph = ip_hdr(skb);
			tcp_v4_fill_cb(skb, iph, th);
			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
		} else {
			drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
		}
		if (!nsk) {
			reqsk_put(req);
			if (req_stolen) {
				/* Another cpu got exclusive access to req
				 * and created a full blown socket.
				 * Try to feed this packet to this socket
				 * instead of discarding it.
				 */
				tcp_v4_restore_cb(skb);
				sock_put(sk);
				goto lookup;
			}
			goto discard_and_relse;
		}
		if (nsk == sk) {
			/* Same socket returned: continue processing on it
			 * below, with the IP control block restored.
			 */
			reqsk_put(req);
			tcp_v4_restore_cb(skb);
		} else if (tcp_child_process(sk, nsk, skb)) {
			tcp_v4_send_reset(nsk, skb);
			goto discard_and_relse;
		} else {
			sock_put(sk);
			return 0;
		}
	}

	if (static_branch_unlikely(&ip4_min_ttl)) {
		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
			goto discard_and_relse;
		}
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
		goto discard_and_relse;
	}

	if (tcp_inbound_md5_hash(sk, skb, &drop_reason, &iph->saddr,
				 &iph->daddr, AF_INET, dif, sdif))
		goto discard_and_relse;

	nf_reset_ct(skb);

	if (tcp_filter(sk, skb)) {
		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
		goto discard_and_relse;
	}
	/* Reload header pointers after the socket filter ran. */
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
	tcp_v4_fill_cb(skb, iph, th);

	skb->dev = NULL;

	if (sk->sk_state == TCP_LISTEN) {
		/* Listeners are processed without taking the socket lock. */
		ret = tcp_v4_do_rcv(sk, skb);
		goto put_and_return;
	}

	sk_incoming_cpu_update(sk);

	sk_defer_free_flush(sk);
	bh_lock_sock_nested(sk);
	tcp_segs_in(tcp_sk(sk), skb);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
		ret = tcp_v4_do_rcv(sk, skb);
	} else {
		/* On failure tcp_add_backlog() has already unlocked sk. */
		if (tcp_add_backlog(sk, skb, &drop_reason))
			goto discard_and_relse;
	}
	bh_unlock_sock(sk);

put_and_return:
	if (refcounted)
		sock_put(sk);

	return ret;

no_tcp_socket:
	drop_reason = SKB_DROP_REASON_NO_SOCKET;
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	tcp_v4_fill_cb(skb, iph, th);

	/* csum_error and bad_packet are jump targets inside this branch:
	 * csum_error falls through into bad_packet, so a checksum error
	 * bumps both counters while a bad packet bumps only INERRS.
	 * A segment with a good checksum and no socket is answered with a
	 * reset instead.
	 */
	if (tcp_checksum_complete(skb)) {
csum_error:
		drop_reason = SKB_DROP_REASON_TCP_CSUM;
		trace_tcp_bad_csum(skb);
		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
bad_packet:
		__TCP_INC_STATS(net, TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb_reason(skb, drop_reason);
	return 0;

discard_and_relse:
	sk_drops_add(sk, skb);
	if (refcounted)
		sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	tcp_v4_fill_cb(skb, iph, th);

	if (tcp_checksum_complete(skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto csum_error;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		/* If a listener exists for this SYN, retire the timewait
		 * socket and restart processing on the listener.
		 */
		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
							&tcp_hashinfo, skb,
							__tcp_hdrlen(th),
							iph->saddr, th->source,
							iph->daddr, th->dest,
							inet_iif(skb),
							sdif);
		if (sk2) {
			inet_twsk_deschedule_put(inet_twsk(sk));
			sk = sk2;
			tcp_v4_restore_cb(skb);
			refcounted = false;
			goto process;
		}
	}
		/* to ACK */
		fallthrough;
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		tcp_v4_send_reset(sk, skb);
		inet_twsk_deschedule_put(inet_twsk(sk));
		goto discard_it;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}
/* Release the TCP-level resources attached to @sk when it is destroyed.
 *
 * In order: transmit timers, congestion control and ULP state, the write
 * queue, the out-of-order queue, any MD5 keys (CONFIG_TCP_MD5SIG), the bind
 * bucket reference, and fastopen / saved-SYN state.  Finally the
 * allocated-sockets accounting is decremented.
 */
void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	trace_tcp_destroy_sock(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	tcp_cleanup_ulp(sk);

	/* Clean up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Check if we want to disable active TFO */
	tcp_fastopen_active_disable_ofo_check(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	skb_rbtree_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
	/* Clean up the MD5 key list, if any */
	if (tp->md5sig_info) {
		/* Unlink and free every key, then free the container after
		 * an RCU grace period.
		 */
		tcp_clear_md5_list(sk);
		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
		tp->md5sig_info = NULL;
	}
#endif

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	/* A fastopen request sock must no longer be attached at this point. */
	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));

	/* If socket is aborted during connect operation */
	tcp_free_fastopen_req(tp);
	tcp_fastopen_destroy_cipher(sk);
	tcp_saved_syn_free(tp);

	sk_sockets_allocated_dec(sk);
}
EXPORT_SYMBOL(tcp_v4_destroy_sock);
2278 */ 2279 static void *listening_get_first(struct seq_file *seq) 2280 { 2281 struct tcp_iter_state *st = seq->private; 2282 2283 st->offset = 0; 2284 for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) { 2285 struct inet_listen_hashbucket *ilb2; 2286 struct inet_connection_sock *icsk; 2287 struct sock *sk; 2288 2289 ilb2 = &tcp_hashinfo.lhash2[st->bucket]; 2290 if (hlist_empty(&ilb2->head)) 2291 continue; 2292 2293 spin_lock(&ilb2->lock); 2294 inet_lhash2_for_each_icsk(icsk, &ilb2->head) { 2295 sk = (struct sock *)icsk; 2296 if (seq_sk_match(seq, sk)) 2297 return sk; 2298 } 2299 spin_unlock(&ilb2->lock); 2300 } 2301 2302 return NULL; 2303 } 2304 2305 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket). 2306 * If "cur" is the last one in the st->bucket, 2307 * call listening_get_first() to return the first sk of the next 2308 * non empty bucket. 2309 */ 2310 static void *listening_get_next(struct seq_file *seq, void *cur) 2311 { 2312 struct tcp_iter_state *st = seq->private; 2313 struct inet_listen_hashbucket *ilb2; 2314 struct inet_connection_sock *icsk; 2315 struct sock *sk = cur; 2316 2317 ++st->num; 2318 ++st->offset; 2319 2320 icsk = inet_csk(sk); 2321 inet_lhash2_for_each_icsk_continue(icsk) { 2322 sk = (struct sock *)icsk; 2323 if (seq_sk_match(seq, sk)) 2324 return sk; 2325 } 2326 2327 ilb2 = &tcp_hashinfo.lhash2[st->bucket]; 2328 spin_unlock(&ilb2->lock); 2329 ++st->bucket; 2330 return listening_get_first(seq); 2331 } 2332 2333 static void *listening_get_idx(struct seq_file *seq, loff_t *pos) 2334 { 2335 struct tcp_iter_state *st = seq->private; 2336 void *rc; 2337 2338 st->bucket = 0; 2339 st->offset = 0; 2340 rc = listening_get_first(seq); 2341 2342 while (rc && *pos) { 2343 rc = listening_get_next(seq, rc); 2344 --*pos; 2345 } 2346 return rc; 2347 } 2348 2349 static inline bool empty_bucket(const struct tcp_iter_state *st) 2350 { 2351 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain); 2352 } 2353 2354 /* 2355 * 
Get first established socket starting from bucket given in st->bucket. 2356 * If st->bucket is zero, the very first socket in the hash is returned. 2357 */ 2358 static void *established_get_first(struct seq_file *seq) 2359 { 2360 struct tcp_iter_state *st = seq->private; 2361 2362 st->offset = 0; 2363 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) { 2364 struct sock *sk; 2365 struct hlist_nulls_node *node; 2366 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket); 2367 2368 /* Lockless fast path for the common case of empty buckets */ 2369 if (empty_bucket(st)) 2370 continue; 2371 2372 spin_lock_bh(lock); 2373 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { 2374 if (seq_sk_match(seq, sk)) 2375 return sk; 2376 } 2377 spin_unlock_bh(lock); 2378 } 2379 2380 return NULL; 2381 } 2382 2383 static void *established_get_next(struct seq_file *seq, void *cur) 2384 { 2385 struct sock *sk = cur; 2386 struct hlist_nulls_node *node; 2387 struct tcp_iter_state *st = seq->private; 2388 2389 ++st->num; 2390 ++st->offset; 2391 2392 sk = sk_nulls_next(sk); 2393 2394 sk_nulls_for_each_from(sk, node) { 2395 if (seq_sk_match(seq, sk)) 2396 return sk; 2397 } 2398 2399 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); 2400 ++st->bucket; 2401 return established_get_first(seq); 2402 } 2403 2404 static void *established_get_idx(struct seq_file *seq, loff_t pos) 2405 { 2406 struct tcp_iter_state *st = seq->private; 2407 void *rc; 2408 2409 st->bucket = 0; 2410 rc = established_get_first(seq); 2411 2412 while (rc && pos) { 2413 rc = established_get_next(seq, rc); 2414 --pos; 2415 } 2416 return rc; 2417 } 2418 2419 static void *tcp_get_idx(struct seq_file *seq, loff_t pos) 2420 { 2421 void *rc; 2422 struct tcp_iter_state *st = seq->private; 2423 2424 st->state = TCP_SEQ_STATE_LISTENING; 2425 rc = listening_get_idx(seq, &pos); 2426 2427 if (!rc) { 2428 st->state = TCP_SEQ_STATE_ESTABLISHED; 2429 rc = established_get_idx(seq, pos); 2430 } 
2431 2432 return rc; 2433 } 2434 2435 static void *tcp_seek_last_pos(struct seq_file *seq) 2436 { 2437 struct tcp_iter_state *st = seq->private; 2438 int bucket = st->bucket; 2439 int offset = st->offset; 2440 int orig_num = st->num; 2441 void *rc = NULL; 2442 2443 switch (st->state) { 2444 case TCP_SEQ_STATE_LISTENING: 2445 if (st->bucket > tcp_hashinfo.lhash2_mask) 2446 break; 2447 st->state = TCP_SEQ_STATE_LISTENING; 2448 rc = listening_get_first(seq); 2449 while (offset-- && rc && bucket == st->bucket) 2450 rc = listening_get_next(seq, rc); 2451 if (rc) 2452 break; 2453 st->bucket = 0; 2454 st->state = TCP_SEQ_STATE_ESTABLISHED; 2455 fallthrough; 2456 case TCP_SEQ_STATE_ESTABLISHED: 2457 if (st->bucket > tcp_hashinfo.ehash_mask) 2458 break; 2459 rc = established_get_first(seq); 2460 while (offset-- && rc && bucket == st->bucket) 2461 rc = established_get_next(seq, rc); 2462 } 2463 2464 st->num = orig_num; 2465 2466 return rc; 2467 } 2468 2469 void *tcp_seq_start(struct seq_file *seq, loff_t *pos) 2470 { 2471 struct tcp_iter_state *st = seq->private; 2472 void *rc; 2473 2474 if (*pos && *pos == st->last_pos) { 2475 rc = tcp_seek_last_pos(seq); 2476 if (rc) 2477 goto out; 2478 } 2479 2480 st->state = TCP_SEQ_STATE_LISTENING; 2481 st->num = 0; 2482 st->bucket = 0; 2483 st->offset = 0; 2484 rc = *pos ? 
tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; 2485 2486 out: 2487 st->last_pos = *pos; 2488 return rc; 2489 } 2490 EXPORT_SYMBOL(tcp_seq_start); 2491 2492 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2493 { 2494 struct tcp_iter_state *st = seq->private; 2495 void *rc = NULL; 2496 2497 if (v == SEQ_START_TOKEN) { 2498 rc = tcp_get_idx(seq, 0); 2499 goto out; 2500 } 2501 2502 switch (st->state) { 2503 case TCP_SEQ_STATE_LISTENING: 2504 rc = listening_get_next(seq, v); 2505 if (!rc) { 2506 st->state = TCP_SEQ_STATE_ESTABLISHED; 2507 st->bucket = 0; 2508 st->offset = 0; 2509 rc = established_get_first(seq); 2510 } 2511 break; 2512 case TCP_SEQ_STATE_ESTABLISHED: 2513 rc = established_get_next(seq, v); 2514 break; 2515 } 2516 out: 2517 ++*pos; 2518 st->last_pos = *pos; 2519 return rc; 2520 } 2521 EXPORT_SYMBOL(tcp_seq_next); 2522 2523 void tcp_seq_stop(struct seq_file *seq, void *v) 2524 { 2525 struct tcp_iter_state *st = seq->private; 2526 2527 switch (st->state) { 2528 case TCP_SEQ_STATE_LISTENING: 2529 if (v != SEQ_START_TOKEN) 2530 spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock); 2531 break; 2532 case TCP_SEQ_STATE_ESTABLISHED: 2533 if (v) 2534 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); 2535 break; 2536 } 2537 } 2538 EXPORT_SYMBOL(tcp_seq_stop); 2539 2540 static void get_openreq4(const struct request_sock *req, 2541 struct seq_file *f, int i) 2542 { 2543 const struct inet_request_sock *ireq = inet_rsk(req); 2544 long delta = req->rsk_timer.expires - jiffies; 2545 2546 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2547 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK", 2548 i, 2549 ireq->ir_loc_addr, 2550 ireq->ir_num, 2551 ireq->ir_rmt_addr, 2552 ntohs(ireq->ir_rmt_port), 2553 TCP_SYN_RECV, 2554 0, 0, /* could print option size, but that is af dependent. 
*/ 2555 1, /* timers active (only the expire timer) */ 2556 jiffies_delta_to_clock_t(delta), 2557 req->num_timeout, 2558 from_kuid_munged(seq_user_ns(f), 2559 sock_i_uid(req->rsk_listener)), 2560 0, /* non standard timer */ 2561 0, /* open_requests have no inode */ 2562 0, 2563 req); 2564 } 2565 2566 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) 2567 { 2568 int timer_active; 2569 unsigned long timer_expires; 2570 const struct tcp_sock *tp = tcp_sk(sk); 2571 const struct inet_connection_sock *icsk = inet_csk(sk); 2572 const struct inet_sock *inet = inet_sk(sk); 2573 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; 2574 __be32 dest = inet->inet_daddr; 2575 __be32 src = inet->inet_rcv_saddr; 2576 __u16 destp = ntohs(inet->inet_dport); 2577 __u16 srcp = ntohs(inet->inet_sport); 2578 int rx_queue; 2579 int state; 2580 2581 if (icsk->icsk_pending == ICSK_TIME_RETRANS || 2582 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || 2583 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { 2584 timer_active = 1; 2585 timer_expires = icsk->icsk_timeout; 2586 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { 2587 timer_active = 4; 2588 timer_expires = icsk->icsk_timeout; 2589 } else if (timer_pending(&sk->sk_timer)) { 2590 timer_active = 2; 2591 timer_expires = sk->sk_timer.expires; 2592 } else { 2593 timer_active = 0; 2594 timer_expires = jiffies; 2595 } 2596 2597 state = inet_sk_state_load(sk); 2598 if (state == TCP_LISTEN) 2599 rx_queue = READ_ONCE(sk->sk_ack_backlog); 2600 else 2601 /* Because we don't lock the socket, 2602 * we might find a transient negative value. 
2603 */ 2604 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) - 2605 READ_ONCE(tp->copied_seq), 0); 2606 2607 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " 2608 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d", 2609 i, src, srcp, dest, destp, state, 2610 READ_ONCE(tp->write_seq) - tp->snd_una, 2611 rx_queue, 2612 timer_active, 2613 jiffies_delta_to_clock_t(timer_expires - jiffies), 2614 icsk->icsk_retransmits, 2615 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)), 2616 icsk->icsk_probes_out, 2617 sock_i_ino(sk), 2618 refcount_read(&sk->sk_refcnt), sk, 2619 jiffies_to_clock_t(icsk->icsk_rto), 2620 jiffies_to_clock_t(icsk->icsk_ack.ato), 2621 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk), 2622 tp->snd_cwnd, 2623 state == TCP_LISTEN ? 2624 fastopenq->max_qlen : 2625 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh)); 2626 } 2627 2628 static void get_timewait4_sock(const struct inet_timewait_sock *tw, 2629 struct seq_file *f, int i) 2630 { 2631 long delta = tw->tw_timer.expires - jiffies; 2632 __be32 dest, src; 2633 __u16 destp, srcp; 2634 2635 dest = tw->tw_daddr; 2636 src = tw->tw_rcv_saddr; 2637 destp = ntohs(tw->tw_dport); 2638 srcp = ntohs(tw->tw_sport); 2639 2640 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2641 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK", 2642 i, src, srcp, dest, destp, tw->tw_substate, 0, 0, 2643 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, 2644 refcount_read(&tw->tw_refcnt), tw); 2645 } 2646 2647 #define TMPSZ 150 2648 2649 static int tcp4_seq_show(struct seq_file *seq, void *v) 2650 { 2651 struct tcp_iter_state *st; 2652 struct sock *sk = v; 2653 2654 seq_setwidth(seq, TMPSZ - 1); 2655 if (v == SEQ_START_TOKEN) { 2656 seq_puts(seq, " sl local_address rem_address st tx_queue " 2657 "rx_queue tr tm->when retrnsmt uid timeout " 2658 "inode"); 2659 goto out; 2660 } 2661 st = seq->private; 2662 2663 if (sk->sk_state == TCP_TIME_WAIT) 2664 get_timewait4_sock(v, seq, st->num); 2665 else if (sk->sk_state == 
TCP_NEW_SYN_RECV) 2666 get_openreq4(v, seq, st->num); 2667 else 2668 get_tcp4_sock(v, seq, st->num); 2669 out: 2670 seq_pad(seq, '\n'); 2671 return 0; 2672 } 2673 2674 #ifdef CONFIG_BPF_SYSCALL 2675 struct bpf_tcp_iter_state { 2676 struct tcp_iter_state state; 2677 unsigned int cur_sk; 2678 unsigned int end_sk; 2679 unsigned int max_sk; 2680 struct sock **batch; 2681 bool st_bucket_done; 2682 }; 2683 2684 struct bpf_iter__tcp { 2685 __bpf_md_ptr(struct bpf_iter_meta *, meta); 2686 __bpf_md_ptr(struct sock_common *, sk_common); 2687 uid_t uid __aligned(8); 2688 }; 2689 2690 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, 2691 struct sock_common *sk_common, uid_t uid) 2692 { 2693 struct bpf_iter__tcp ctx; 2694 2695 meta->seq_num--; /* skip SEQ_START_TOKEN */ 2696 ctx.meta = meta; 2697 ctx.sk_common = sk_common; 2698 ctx.uid = uid; 2699 return bpf_iter_run_prog(prog, &ctx); 2700 } 2701 2702 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter) 2703 { 2704 while (iter->cur_sk < iter->end_sk) 2705 sock_put(iter->batch[iter->cur_sk++]); 2706 } 2707 2708 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter, 2709 unsigned int new_batch_sz) 2710 { 2711 struct sock **new_batch; 2712 2713 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, 2714 GFP_USER | __GFP_NOWARN); 2715 if (!new_batch) 2716 return -ENOMEM; 2717 2718 bpf_iter_tcp_put_batch(iter); 2719 kvfree(iter->batch); 2720 iter->batch = new_batch; 2721 iter->max_sk = new_batch_sz; 2722 2723 return 0; 2724 } 2725 2726 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq, 2727 struct sock *start_sk) 2728 { 2729 struct bpf_tcp_iter_state *iter = seq->private; 2730 struct tcp_iter_state *st = &iter->state; 2731 struct inet_connection_sock *icsk; 2732 unsigned int expected = 1; 2733 struct sock *sk; 2734 2735 sock_hold(start_sk); 2736 iter->batch[iter->end_sk++] = start_sk; 2737 2738 icsk = inet_csk(start_sk); 2739 
inet_lhash2_for_each_icsk_continue(icsk) { 2740 sk = (struct sock *)icsk; 2741 if (seq_sk_match(seq, sk)) { 2742 if (iter->end_sk < iter->max_sk) { 2743 sock_hold(sk); 2744 iter->batch[iter->end_sk++] = sk; 2745 } 2746 expected++; 2747 } 2748 } 2749 spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock); 2750 2751 return expected; 2752 } 2753 2754 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq, 2755 struct sock *start_sk) 2756 { 2757 struct bpf_tcp_iter_state *iter = seq->private; 2758 struct tcp_iter_state *st = &iter->state; 2759 struct hlist_nulls_node *node; 2760 unsigned int expected = 1; 2761 struct sock *sk; 2762 2763 sock_hold(start_sk); 2764 iter->batch[iter->end_sk++] = start_sk; 2765 2766 sk = sk_nulls_next(start_sk); 2767 sk_nulls_for_each_from(sk, node) { 2768 if (seq_sk_match(seq, sk)) { 2769 if (iter->end_sk < iter->max_sk) { 2770 sock_hold(sk); 2771 iter->batch[iter->end_sk++] = sk; 2772 } 2773 expected++; 2774 } 2775 } 2776 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); 2777 2778 return expected; 2779 } 2780 2781 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq) 2782 { 2783 struct bpf_tcp_iter_state *iter = seq->private; 2784 struct tcp_iter_state *st = &iter->state; 2785 unsigned int expected; 2786 bool resized = false; 2787 struct sock *sk; 2788 2789 /* The st->bucket is done. Directly advance to the next 2790 * bucket instead of having the tcp_seek_last_pos() to skip 2791 * one by one in the current bucket and eventually find out 2792 * it has to advance to the next bucket. 
2793 */ 2794 if (iter->st_bucket_done) { 2795 st->offset = 0; 2796 st->bucket++; 2797 if (st->state == TCP_SEQ_STATE_LISTENING && 2798 st->bucket > tcp_hashinfo.lhash2_mask) { 2799 st->state = TCP_SEQ_STATE_ESTABLISHED; 2800 st->bucket = 0; 2801 } 2802 } 2803 2804 again: 2805 /* Get a new batch */ 2806 iter->cur_sk = 0; 2807 iter->end_sk = 0; 2808 iter->st_bucket_done = false; 2809 2810 sk = tcp_seek_last_pos(seq); 2811 if (!sk) 2812 return NULL; /* Done */ 2813 2814 if (st->state == TCP_SEQ_STATE_LISTENING) 2815 expected = bpf_iter_tcp_listening_batch(seq, sk); 2816 else 2817 expected = bpf_iter_tcp_established_batch(seq, sk); 2818 2819 if (iter->end_sk == expected) { 2820 iter->st_bucket_done = true; 2821 return sk; 2822 } 2823 2824 if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) { 2825 resized = true; 2826 goto again; 2827 } 2828 2829 return sk; 2830 } 2831 2832 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos) 2833 { 2834 /* bpf iter does not support lseek, so it always 2835 * continue from where it was stop()-ped. 2836 */ 2837 if (*pos) 2838 return bpf_iter_tcp_batch(seq); 2839 2840 return SEQ_START_TOKEN; 2841 } 2842 2843 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2844 { 2845 struct bpf_tcp_iter_state *iter = seq->private; 2846 struct tcp_iter_state *st = &iter->state; 2847 struct sock *sk; 2848 2849 /* Whenever seq_next() is called, the iter->cur_sk is 2850 * done with seq_show(), so advance to the next sk in 2851 * the batch. 2852 */ 2853 if (iter->cur_sk < iter->end_sk) { 2854 /* Keeping st->num consistent in tcp_iter_state. 2855 * bpf_iter_tcp does not use st->num. 2856 * meta.seq_num is used instead. 2857 */ 2858 st->num++; 2859 /* Move st->offset to the next sk in the bucket such that 2860 * the future start() will resume at st->offset in 2861 * st->bucket. See tcp_seek_last_pos(). 
2862 */ 2863 st->offset++; 2864 sock_put(iter->batch[iter->cur_sk++]); 2865 } 2866 2867 if (iter->cur_sk < iter->end_sk) 2868 sk = iter->batch[iter->cur_sk]; 2869 else 2870 sk = bpf_iter_tcp_batch(seq); 2871 2872 ++*pos; 2873 /* Keeping st->last_pos consistent in tcp_iter_state. 2874 * bpf iter does not do lseek, so st->last_pos always equals to *pos. 2875 */ 2876 st->last_pos = *pos; 2877 return sk; 2878 } 2879 2880 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v) 2881 { 2882 struct bpf_iter_meta meta; 2883 struct bpf_prog *prog; 2884 struct sock *sk = v; 2885 bool slow; 2886 uid_t uid; 2887 int ret; 2888 2889 if (v == SEQ_START_TOKEN) 2890 return 0; 2891 2892 if (sk_fullsock(sk)) 2893 slow = lock_sock_fast(sk); 2894 2895 if (unlikely(sk_unhashed(sk))) { 2896 ret = SEQ_SKIP; 2897 goto unlock; 2898 } 2899 2900 if (sk->sk_state == TCP_TIME_WAIT) { 2901 uid = 0; 2902 } else if (sk->sk_state == TCP_NEW_SYN_RECV) { 2903 const struct request_sock *req = v; 2904 2905 uid = from_kuid_munged(seq_user_ns(seq), 2906 sock_i_uid(req->rsk_listener)); 2907 } else { 2908 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); 2909 } 2910 2911 meta.seq = seq; 2912 prog = bpf_iter_get_info(&meta, false); 2913 ret = tcp_prog_seq_show(prog, &meta, v, uid); 2914 2915 unlock: 2916 if (sk_fullsock(sk)) 2917 unlock_sock_fast(sk, slow); 2918 return ret; 2919 2920 } 2921 2922 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v) 2923 { 2924 struct bpf_tcp_iter_state *iter = seq->private; 2925 struct bpf_iter_meta meta; 2926 struct bpf_prog *prog; 2927 2928 if (!v) { 2929 meta.seq = seq; 2930 prog = bpf_iter_get_info(&meta, true); 2931 if (prog) 2932 (void)tcp_prog_seq_show(prog, &meta, v, 0); 2933 } 2934 2935 if (iter->cur_sk < iter->end_sk) { 2936 bpf_iter_tcp_put_batch(iter); 2937 iter->st_bucket_done = false; 2938 } 2939 } 2940 2941 static const struct seq_operations bpf_iter_tcp_seq_ops = { 2942 .show = bpf_iter_tcp_seq_show, 2943 .start = 
bpf_iter_tcp_seq_start, 2944 .next = bpf_iter_tcp_seq_next, 2945 .stop = bpf_iter_tcp_seq_stop, 2946 }; 2947 #endif 2948 static unsigned short seq_file_family(const struct seq_file *seq) 2949 { 2950 const struct tcp_seq_afinfo *afinfo; 2951 2952 #ifdef CONFIG_BPF_SYSCALL 2953 /* Iterated from bpf_iter. Let the bpf prog to filter instead. */ 2954 if (seq->op == &bpf_iter_tcp_seq_ops) 2955 return AF_UNSPEC; 2956 #endif 2957 2958 /* Iterated from proc fs */ 2959 afinfo = pde_data(file_inode(seq->file)); 2960 return afinfo->family; 2961 } 2962 2963 static const struct seq_operations tcp4_seq_ops = { 2964 .show = tcp4_seq_show, 2965 .start = tcp_seq_start, 2966 .next = tcp_seq_next, 2967 .stop = tcp_seq_stop, 2968 }; 2969 2970 static struct tcp_seq_afinfo tcp4_seq_afinfo = { 2971 .family = AF_INET, 2972 }; 2973 2974 static int __net_init tcp4_proc_init_net(struct net *net) 2975 { 2976 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops, 2977 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo)) 2978 return -ENOMEM; 2979 return 0; 2980 } 2981 2982 static void __net_exit tcp4_proc_exit_net(struct net *net) 2983 { 2984 remove_proc_entry("tcp", net->proc_net); 2985 } 2986 2987 static struct pernet_operations tcp4_net_ops = { 2988 .init = tcp4_proc_init_net, 2989 .exit = tcp4_proc_exit_net, 2990 }; 2991 2992 int __init tcp4_proc_init(void) 2993 { 2994 return register_pernet_subsys(&tcp4_net_ops); 2995 } 2996 2997 void tcp4_proc_exit(void) 2998 { 2999 unregister_pernet_subsys(&tcp4_net_ops); 3000 } 3001 #endif /* CONFIG_PROC_FS */ 3002 3003 /* @wake is one when sk_stream_write_space() calls us. 3004 * This sends EPOLLOUT only if notsent_bytes is half the limit. 3005 * This mimics the strategy used in sock_def_write_space(). 
3006 */ 3007 bool tcp_stream_memory_free(const struct sock *sk, int wake) 3008 { 3009 const struct tcp_sock *tp = tcp_sk(sk); 3010 u32 notsent_bytes = READ_ONCE(tp->write_seq) - 3011 READ_ONCE(tp->snd_nxt); 3012 3013 return (notsent_bytes << wake) < tcp_notsent_lowat(tp); 3014 } 3015 EXPORT_SYMBOL(tcp_stream_memory_free); 3016 3017 struct proto tcp_prot = { 3018 .name = "TCP", 3019 .owner = THIS_MODULE, 3020 .close = tcp_close, 3021 .pre_connect = tcp_v4_pre_connect, 3022 .connect = tcp_v4_connect, 3023 .disconnect = tcp_disconnect, 3024 .accept = inet_csk_accept, 3025 .ioctl = tcp_ioctl, 3026 .init = tcp_v4_init_sock, 3027 .destroy = tcp_v4_destroy_sock, 3028 .shutdown = tcp_shutdown, 3029 .setsockopt = tcp_setsockopt, 3030 .getsockopt = tcp_getsockopt, 3031 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt, 3032 .keepalive = tcp_set_keepalive, 3033 .recvmsg = tcp_recvmsg, 3034 .sendmsg = tcp_sendmsg, 3035 .sendpage = tcp_sendpage, 3036 .backlog_rcv = tcp_v4_do_rcv, 3037 .release_cb = tcp_release_cb, 3038 .hash = inet_hash, 3039 .unhash = inet_unhash, 3040 .get_port = inet_csk_get_port, 3041 .put_port = inet_put_port, 3042 #ifdef CONFIG_BPF_SYSCALL 3043 .psock_update_sk_prot = tcp_bpf_update_proto, 3044 #endif 3045 .enter_memory_pressure = tcp_enter_memory_pressure, 3046 .leave_memory_pressure = tcp_leave_memory_pressure, 3047 .stream_memory_free = tcp_stream_memory_free, 3048 .sockets_allocated = &tcp_sockets_allocated, 3049 .orphan_count = &tcp_orphan_count, 3050 .memory_allocated = &tcp_memory_allocated, 3051 .memory_pressure = &tcp_memory_pressure, 3052 .sysctl_mem = sysctl_tcp_mem, 3053 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), 3054 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), 3055 .max_header = MAX_TCP_HEADER, 3056 .obj_size = sizeof(struct tcp_sock), 3057 .slab_flags = SLAB_TYPESAFE_BY_RCU, 3058 .twsk_prot = &tcp_timewait_sock_ops, 3059 .rsk_prot = &tcp_request_sock_ops, 3060 .h.hashinfo = &tcp_hashinfo, 3061 
.no_autobind = true, 3062 .diag_destroy = tcp_abort, 3063 }; 3064 EXPORT_SYMBOL(tcp_prot); 3065 3066 static void __net_exit tcp_sk_exit(struct net *net) 3067 { 3068 struct inet_timewait_death_row *tcp_death_row = net->ipv4.tcp_death_row; 3069 3070 if (net->ipv4.tcp_congestion_control) 3071 bpf_module_put(net->ipv4.tcp_congestion_control, 3072 net->ipv4.tcp_congestion_control->owner); 3073 if (refcount_dec_and_test(&tcp_death_row->tw_refcount)) 3074 kfree(tcp_death_row); 3075 } 3076 3077 static int __net_init tcp_sk_init(struct net *net) 3078 { 3079 int cnt; 3080 3081 net->ipv4.sysctl_tcp_ecn = 2; 3082 net->ipv4.sysctl_tcp_ecn_fallback = 1; 3083 3084 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; 3085 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS; 3086 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; 3087 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL; 3088 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS; 3089 3090 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME; 3091 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES; 3092 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL; 3093 3094 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES; 3095 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES; 3096 net->ipv4.sysctl_tcp_syncookies = 1; 3097 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH; 3098 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1; 3099 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2; 3100 net->ipv4.sysctl_tcp_orphan_retries = 0; 3101 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; 3102 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; 3103 net->ipv4.sysctl_tcp_tw_reuse = 2; 3104 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1; 3105 3106 net->ipv4.tcp_death_row = kzalloc(sizeof(struct inet_timewait_death_row), GFP_KERNEL); 3107 if (!net->ipv4.tcp_death_row) 3108 return -ENOMEM; 3109 refcount_set(&net->ipv4.tcp_death_row->tw_refcount, 1); 3110 cnt = tcp_hashinfo.ehash_mask + 1; 3111 
net->ipv4.tcp_death_row->sysctl_max_tw_buckets = cnt / 2; 3112 net->ipv4.tcp_death_row->hashinfo = &tcp_hashinfo; 3113 3114 net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128); 3115 net->ipv4.sysctl_tcp_sack = 1; 3116 net->ipv4.sysctl_tcp_window_scaling = 1; 3117 net->ipv4.sysctl_tcp_timestamps = 1; 3118 net->ipv4.sysctl_tcp_early_retrans = 3; 3119 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION; 3120 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */ 3121 net->ipv4.sysctl_tcp_retrans_collapse = 1; 3122 net->ipv4.sysctl_tcp_max_reordering = 300; 3123 net->ipv4.sysctl_tcp_dsack = 1; 3124 net->ipv4.sysctl_tcp_app_win = 31; 3125 net->ipv4.sysctl_tcp_adv_win_scale = 1; 3126 net->ipv4.sysctl_tcp_frto = 2; 3127 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1; 3128 /* This limits the percentage of the congestion window which we 3129 * will allow a single TSO frame to consume. Building TSO frames 3130 * which are too large can cause TCP streams to be bursty. 
3131 */ 3132 net->ipv4.sysctl_tcp_tso_win_divisor = 3; 3133 /* Default TSQ limit of 16 TSO segments */ 3134 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536; 3135 /* rfc5961 challenge ack rate limiting */ 3136 net->ipv4.sysctl_tcp_challenge_ack_limit = 1000; 3137 net->ipv4.sysctl_tcp_min_tso_segs = 2; 3138 net->ipv4.sysctl_tcp_min_rtt_wlen = 300; 3139 net->ipv4.sysctl_tcp_autocorking = 1; 3140 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; 3141 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; 3142 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120; 3143 if (net != &init_net) { 3144 memcpy(net->ipv4.sysctl_tcp_rmem, 3145 init_net.ipv4.sysctl_tcp_rmem, 3146 sizeof(init_net.ipv4.sysctl_tcp_rmem)); 3147 memcpy(net->ipv4.sysctl_tcp_wmem, 3148 init_net.ipv4.sysctl_tcp_wmem, 3149 sizeof(init_net.ipv4.sysctl_tcp_wmem)); 3150 } 3151 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC; 3152 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC; 3153 net->ipv4.sysctl_tcp_comp_sack_nr = 44; 3154 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; 3155 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0; 3156 atomic_set(&net->ipv4.tfo_active_disable_times, 0); 3157 3158 /* Reno is always built in */ 3159 if (!net_eq(net, &init_net) && 3160 bpf_try_module_get(init_net.ipv4.tcp_congestion_control, 3161 init_net.ipv4.tcp_congestion_control->owner)) 3162 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control; 3163 else 3164 net->ipv4.tcp_congestion_control = &tcp_reno; 3165 3166 return 0; 3167 } 3168 3169 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) 3170 { 3171 struct net *net; 3172 3173 list_for_each_entry(net, net_exit_list, exit_list) 3174 tcp_fastopen_ctx_destroy(net); 3175 } 3176 3177 static struct pernet_operations __net_initdata tcp_sk_ops = { 3178 .init = tcp_sk_init, 3179 .exit = tcp_sk_exit, 3180 .exit_batch = tcp_sk_exit_batch, 3181 }; 3182 3183 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3184 
DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta, 3185 struct sock_common *sk_common, uid_t uid) 3186 3187 #define INIT_BATCH_SZ 16 3188 3189 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux) 3190 { 3191 struct bpf_tcp_iter_state *iter = priv_data; 3192 int err; 3193 3194 err = bpf_iter_init_seq_net(priv_data, aux); 3195 if (err) 3196 return err; 3197 3198 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ); 3199 if (err) { 3200 bpf_iter_fini_seq_net(priv_data); 3201 return err; 3202 } 3203 3204 return 0; 3205 } 3206 3207 static void bpf_iter_fini_tcp(void *priv_data) 3208 { 3209 struct bpf_tcp_iter_state *iter = priv_data; 3210 3211 bpf_iter_fini_seq_net(priv_data); 3212 kvfree(iter->batch); 3213 } 3214 3215 static const struct bpf_iter_seq_info tcp_seq_info = { 3216 .seq_ops = &bpf_iter_tcp_seq_ops, 3217 .init_seq_private = bpf_iter_init_tcp, 3218 .fini_seq_private = bpf_iter_fini_tcp, 3219 .seq_priv_size = sizeof(struct bpf_tcp_iter_state), 3220 }; 3221 3222 static const struct bpf_func_proto * 3223 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id, 3224 const struct bpf_prog *prog) 3225 { 3226 switch (func_id) { 3227 case BPF_FUNC_setsockopt: 3228 return &bpf_sk_setsockopt_proto; 3229 case BPF_FUNC_getsockopt: 3230 return &bpf_sk_getsockopt_proto; 3231 default: 3232 return NULL; 3233 } 3234 } 3235 3236 static struct bpf_iter_reg tcp_reg_info = { 3237 .target = "tcp", 3238 .ctx_arg_info_size = 1, 3239 .ctx_arg_info = { 3240 { offsetof(struct bpf_iter__tcp, sk_common), 3241 PTR_TO_BTF_ID_OR_NULL }, 3242 }, 3243 .get_func_proto = bpf_iter_tcp_get_func_proto, 3244 .seq_info = &tcp_seq_info, 3245 }; 3246 3247 static void __init bpf_iter_register(void) 3248 { 3249 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON]; 3250 if (bpf_iter_reg_target(&tcp_reg_info)) 3251 pr_warn("Warning: could not register bpf iterator tcp\n"); 3252 } 3253 3254 #endif 3255 3256 void __init tcp_v4_init(void) 3257 { 3258 int cpu, 
res; 3259 3260 for_each_possible_cpu(cpu) { 3261 struct sock *sk; 3262 3263 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW, 3264 IPPROTO_TCP, &init_net); 3265 if (res) 3266 panic("Failed to create the TCP control socket.\n"); 3267 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); 3268 3269 /* Please enforce IP_DF and IPID==0 for RST and 3270 * ACK sent in SYN-RECV and TIME-WAIT state. 3271 */ 3272 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO; 3273 3274 per_cpu(ipv4_tcp_sk, cpu) = sk; 3275 } 3276 if (register_pernet_subsys(&tcp_sk_ops)) 3277 panic("Failed to create the TCP control socket.\n"); 3278 3279 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3280 bpf_iter_register(); 3281 #endif 3282 } 3283