// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/fips.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sock_diag.h>

#include <net/aligned_data.h>
#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/tcp_ecn.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/inet_ecn.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>
#include <net/rstreason.h>
#include <net/psp.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/inetdevice.h>
#include <linux/btf_ids.h>
#include <linux/skbuff_ref.h>

#include <crypto/md5.h>
#include <crypto/utils.h>

#include <trace/events/tcp.h>

#ifdef CONFIG_TCP_MD5SIG
static void tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
				__be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;

static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = {
	.bh_lock = INIT_LOCAL_LOCK(bh_lock),
};

static DEFINE_MUTEX(tcp_exit_batch_mutex);

static union tcp_seq_and_ts_off
tcp_v4_init_seq_and_ts_off(const struct net *net, const struct sk_buff *skb)
{
	return secure_tcp_seq_and_ts_off(net,
					 ip_hdr(skb)->daddr,
					 ip_hdr(skb)->saddr,
					 tcp_hdr(skb)->dest,
					 tcp_hdr(skb)->source);
}
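
/* Decide whether a TIME-WAIT socket can be reused for a new outgoing
 * connection to the same destination.  Returns 1 when reuse is allowed
 * (a reference on the timewait socket has then been taken), 0 otherwise.
 */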
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
	const struct inet_timewait_sock *tw = inet_twsk(sktw);
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);
	int ts_recent_stamp;
	u32 reuse_thresh;

	if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2)
		reuse = 0;

	if (reuse == 2) {
		/* Still does not detect *everything* that goes through
		 * lo, since we require a loopback src or dst address
		 * or direct binding to 'lo' interface.
		 */
		bool loopback = false;
		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
			loopback = true;
#if IS_ENABLED(CONFIG_IPV6)
		if (tw->tw_family == AF_INET6) {
			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
				loopback = true;
		} else
#endif
		{
			if (ipv4_is_loopback(tw->tw_daddr) ||
			    ipv4_is_loopback(tw->tw_rcv_saddr))
				loopback = true;
		}
		if (!loopback)
			reuse = 0;
	}

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp);
	reuse_thresh = READ_ONCE(tw->tw_entry_stamp) +
		       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse_delay);
	if (ts_recent_stamp &&
	    (!twp || (reuse && time_after32(tcp_clock_ms(), reuse_thresh)))) {
		/* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk
		 * and releasing the bucket lock.
		 */
		if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
			return 0;

		/* In case of repair and re-using TIME-WAIT sockets we still
		 * want to be sure that it is safe as above but honor the
		 * sequence numbers and time stamps set as part of the repair
		 * process.
		 *
		 * Without this check re-using a TIME-WAIT socket with TCP
		 * repair would accumulate a -1 on the repair assigned
		 * sequence number. The first time it is reused the sequence
		 * is -1, the second time -2, etc. This fixes that issue
		 * without appearing to create any others.
		 */
		if (likely(!tp->repair)) {
			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;

			if (!seq)
				seq = 1;
			WRITE_ONCE(tp->write_seq, seq);
			tp->rx_opt.ts_recent	   = READ_ONCE(tcptw->tw_ts_recent);
			tp->rx_opt.ts_recent_stamp = ts_recent_stamp;
		}

		return 1;
	}

	return 0;
}
EXPORT_IPV6_MOD_GPL(tcp_twsk_unique);

static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
			      int addr_len)
{
	/* This check is replicated from tcp_v4_connect() and intended to
	 * prevent BPF program called below from accessing bytes that are out
	 * of the bound specified by user in addr_len.
	 */
	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	sock_owned_by_me(sk);

	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
}

/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_timewait_death_row *tcp_death_row;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct ip_options_rcu *inet_opt;
	struct net *net = sock_net(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
			      orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

	if (!inet->inet_saddr) {
		err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET);
		if (err) {
			ip_rt_put(rt);
			return err;
		}
	} else {
		sk_rcv_saddr_set(sk, inet->inet_saddr);
	}

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			WRITE_ONCE(tp->write_seq, 0);
	}

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = psp_sk_overhead(sk);
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len += inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the socket
	 * lock, select a source port, enter ourselves into the hash tables
	 * and complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst);
	/* OK, now commit destination to socket. */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	rt = NULL;

	if (likely(!tp->repair)) {
		union tcp_seq_and_ts_off st;

		st = secure_tcp_seq_and_ts_off(net,
					       inet->inet_saddr,
					       inet->inet_daddr,
					       inet->inet_sport,
					       usin->sin_port);
		if (!tp->write_seq)
			WRITE_ONCE(tp->write_seq, st.seq);
		WRITE_ONCE(tp->tsoffset, st.ts_off);
	}

	atomic_set(&inet->inet_id, get_random_u16());

	if (tcp_fastopen_defer_connect(sk, &err))
		return err;
	if (err)
		goto failure;

	err = tcp_connect(sk);

	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	inet_bhash2_reset_saddr(sk);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_IPV6_MOD(tcp_v4_connect);

/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct dst_entry *dst;
	u32 mtu, dmtu;

	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
		return;
	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to go wrong... Remember the soft error
	 * in case this connection will not be able to recover.
	 */
	dmtu = dst4_mtu(dst);
	if (mtu < dmtu && ip_dont_fragment(sk, dst))
		WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > dmtu) {
		tcp_sync_mss(sk, dmtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_IPV6_MOD(tcp_v4_mtu_reduced);
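
/* An ICMP redirect was received for this flow: let the route layer update
 * the cached destination entry, if one is still attached to the socket.
 */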
static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}


/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);

	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	 */
	if (seq != tcp_rsk(req)->snt_isn) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
	} else if (abort) {
		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		tcp_listendrop(req->rsk_listener);
	}
	reqsk_put(req);
}
EXPORT_IPV6_MOD(tcp_req_err);

/* TCP-LD (RFC 6069) logic */
void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	s32 remaining;
	u32 delta_us;

	if (sock_owned_by_user(sk))
		return;

	if (seq != tp->snd_una || !icsk->icsk_retransmits ||
	    !icsk->icsk_backoff)
		return;

	skb = tcp_rtx_queue_head(sk);
	if (WARN_ON_ONCE(!skb))
		return;

	icsk->icsk_backoff--;
	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
	icsk->icsk_rto = inet_csk_rto_backoff(icsk, tcp_rto_max(sk));

	tcp_mstamp_refresh(tp);
	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);

	if (remaining > 0) {
		tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, remaining, false);
	} else {
		/* RTO revert clocked out retransmission.
		 * Will retransmit now.
		 */
		tcp_retransmit_timer(sk);
	}
}
EXPORT_IPV6_MOD(tcp_ld_RTO_revert);

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

int tcp_v4_err(struct sk_buff *skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct net *net = dev_net_rcu(skb->dev);
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct request_sock *fastopen;
	struct tcp_sock *tp;
	u32 seq, snd_una;
	struct sock *sk;
	int err;

	sk = __inet_lookup_established(net, iph->daddr, th->dest, iph->saddr,
				       ntohs(th->source), inet_iif(skb), 0);
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return -ENOENT;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		/* To increase the counter of ignored icmps for TCP-AO */
		tcp_ao_ignore_icmp(sk, AF_INET, type, code);
		inet_twsk_put(inet_twsk(sk));
		return 0;
	}
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
				     type == ICMP_TIME_EXCEEDED ||
				     (type == ICMP_DEST_UNREACH &&
				      (code == ICMP_NET_UNREACH ||
				       code == ICMP_HOST_UNREACH)));
		return 0;
	}

	if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) {
		sock_put(sk);
		return 0;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (static_branch_unlikely(&ip4_min_ttl)) {
		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
			goto out;
		}
	}

	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = rcu_dereference(tp->fastopen_rsk);
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		if (!sock_owned_by_user(sk))
			do_redirect(skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs sent out by Linux are always <576bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			WRITE_ONCE(tp->mtu_info, info);
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if this ICMP message allows revert of backoff.
		 * (see RFC 6069)
		 */
		if (!fastopen &&
		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
			tcp_ld_RTO_revert(sk, seq);
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket is
		 * already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);

		if (!sock_owned_by_user(sk))
			tcp_done_with_error(sk, err);
		else
			WRITE_ONCE(sk->sk_err_soft, err);
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even these two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 * --ANK (980905)
	 */

	if (!sock_owned_by_user(sk) &&
	    inet_test_bit(RECVERR, sk)) {
		WRITE_ONCE(sk->sk_err, err);
		sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		WRITE_ONCE(sk->sk_err_soft, err);
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
	return 0;
}

#define REPLY_OPTIONS_LEN	(MAX_TCP_OPTION_SPACE / sizeof(__be32))

static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb,
				 const struct tcp_ao_hdr *aoh,
				 struct ip_reply_arg *arg, struct tcphdr *reply,
				 __be32 reply_options[REPLY_OPTIONS_LEN])
{
#ifdef CONFIG_TCP_AO
	int sdif = tcp_v4_sdif(skb);
	int dif = inet_iif(skb);
	int l3index = sdif ? dif : 0;
	bool allocated_traffic_key;
	struct tcp_ao_key *key;
	char *traffic_key;
	bool drop = true;
	u32 ao_sne = 0;
	u8 keyid;

	rcu_read_lock();
	if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq),
				 &key, &traffic_key, &allocated_traffic_key,
				 &keyid, &ao_sne))
		goto out;

	reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) |
				 (aoh->rnext_keyid << 8) | keyid);
	arg->iov[0].iov_len += tcp_ao_len_aligned(key);
	reply->doff = arg->iov[0].iov_len / 4;

	if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1],
			    key, traffic_key,
			    (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
			    (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
			    reply, ao_sne))
		goto out;
	drop = false;
out:
	rcu_read_unlock();
	if (allocated_traffic_key)
		kfree(traffic_key);
	return drop;
#else
	return true;
#endif
}

/*
 * This routine will send an RST to the other tcp.
 *
 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 * Answer: if a packet caused RST, it is not for a socket
 *	   existing in our system, if it is matched to a socket,
 *	   it is just duplicate segment or bug in other side's TCP.
 *	   So that we build reply only based on parameters
 *	   arrived with segment.
 * Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
			      enum sk_rst_reason reason)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[REPLY_OPTIONS_LEN];
	} rep;
	const __u8 *md5_hash_location = NULL;
	const struct tcp_ao_hdr *aoh;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	unsigned char newhash[16];
	struct sock *sk1 = NULL;
#endif
	u64 transmit_time = 0;
	struct sock *ctl_sk;
	struct net *net;
	u32 txhash = 0;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

	net = sk ? sock_net(sk) : skb_dst_dev_net_rcu(skb);

	/* Invalid TCP option size or twice included auth */
	if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh))
		return;

	if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt))
		return;

#ifdef CONFIG_TCP_MD5SIG
	rcu_read_lock();
	if (sk && sk_fullsock(sk)) {
		const union tcp_md5_addr *addr;
		int l3index;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and inet_iif is set to it.
		 */
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	} else if (md5_hash_location) {
		const union tcp_md5_addr *addr;
		int sdif = tcp_v4_sdif(skb);
		int dif = inet_iif(skb);
		int l3index;

		/*
		 * active side is lost. Try to find listening socket through
		 * source port, and then find md5 key through listening socket.
		 * We do not lose security here:
		 * Incoming packet is checked with md5 hash with finding key,
		 * no RST generated if md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(net, NULL, 0, ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), dif, sdif);
		/* don't send rst if it can't find key */
		if (!sk1)
			goto out;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and dif is set to it.
		 */
		l3index = sdif ? dif : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
		if (!key)
			goto out;

		tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (crypto_memneq(md5_hash_location, newhash, 16))
			goto out;
	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
	if (rep.opt[0] == 0) {
		__be32 mrst = mptcp_reset_option(skb);

		if (mrst) {
			rep.opt[0] = mrst;
			arg.iov[0].iov_len += sizeof(mrst);
			rep.th.doff = arg.iov[0].iov_len / 4;
		}
	}

	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
	if (sk)
		arg.bound_dev_if = sk->sk_bound_dev_if;

	trace_tcp_send_reset(sk, skb, reason);

	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

	/* ECN bits of TW reset are cleared */
	arg.tos = ip_hdr(skb)->tos & ~INET_ECN_MASK;
	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);

	sock_net_set(ctl_sk, net);
	if (sk) {
		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
		transmit_time = tcp_transmit_time(sk);
		xfrm_sk_clone_policy(ctl_sk, sk);
		txhash = (sk->sk_state == TCP_TIME_WAIT) ?
			 inet_twsk(sk)->tw_txhash : sk->sk_txhash;
	} else {
		ctl_sk->sk_mark = 0;
		ctl_sk->sk_priority = 0;
	}
	ip_send_unicast_reply(ctl_sk, sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time, txhash);

	xfrm_sk_free_policy(ctl_sk);
	sock_net_set(ctl_sk, &init_net);
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
	local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
	rcu_read_unlock();
#endif
}

/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
   outside socket context, is certainly ugly. What can I do?
 */

static void tcp_v4_send_ack(const struct sock *sk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_key *key,
			    int reply_flags, u8 tos, u32 txhash)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(MAX_TCP_OPTION_SPACE >> 2)];
	} rep;
	struct net *net = sock_net(sk);
	struct ip_reply_arg arg;
	struct sock *ctl_sk;
	u64 transmit_time;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (tcp_key_is_md5(key)) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key->md5_key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
#ifdef CONFIG_TCP_AO
	if (tcp_key_is_ao(key)) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_AO << 24) |
					  (tcp_ao_len(key->ao_key) << 16) |
					  (key->ao_key->sndid << 8) |
					  key->rcv_next);
		arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key);
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset],
				key->ao_key, key->traffic_key,
				(union tcp_ao_addr *)&ip_hdr(skb)->saddr,
				(union tcp_ao_addr *)&ip_hdr(skb)->daddr,
				&rep.th, key->sne);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
	sock_net_set(ctl_sk, net);
	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
	transmit_time = tcp_transmit_time(sk);
	ip_send_unicast_reply(ctl_sk, sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time, txhash);

	sock_net_set(ctl_sk, &init_net);
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
	local_bh_enable();
}
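
/* Send an ACK on behalf of a TIME-WAIT socket, using the sequence numbers,
 * window and timestamps cached in the timewait bucket.
 */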
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb,
				enum tcp_tw_status tw_status)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
	struct tcp_key key = {};
	u8 tos = tw->tw_tos;

	/* Clean only the ECN bits of TW ACKs for oow data or paws_reject,
	 * while leaving the ECN bits of other TW ACKs intact, so that those
	 * ACKs are not placed in a different service queue (Classic rather
	 * than L4S).
	 */
	if (tw_status == TCP_TW_ACK_OOW)
		tos &= ~INET_ECN_MASK;

#ifdef CONFIG_TCP_AO
	struct tcp_ao_info *ao_info;

	if (static_branch_unlikely(&tcp_ao_needed.key)) {
		/* FIXME: the segment to-be-acked is not verified yet */
		ao_info = rcu_dereference(tcptw->ao_info);
		if (ao_info) {
			const struct tcp_ao_hdr *aoh;

			if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) {
				inet_twsk_put(tw);
				return;
			}

			if (aoh)
				key.ao_key = tcp_ao_established_key(sk, ao_info,
								    aoh->rnext_keyid, -1);
		}
	}
	if (key.ao_key) {
		struct tcp_ao_key *rnext_key;

		key.traffic_key = snd_other_key(key.ao_key);
		key.sne = READ_ONCE(ao_info->snd_sne);
		rnext_key = READ_ONCE(ao_info->rnext_key);
		key.rcv_next = rnext_key->rcvid;
		key.type = TCP_KEY_AO;
#else
	if (0) {
#endif
	} else if (static_branch_tcp_md5()) {
		key.md5_key = tcp_twsk_md5_key(tcptw);
		if (key.md5_key)
			key.type = TCP_KEY_MD5;
	}

	tcp_v4_send_ack(sk, skb,
			tcptw->tw_snd_nxt, READ_ONCE(tcptw->tw_rcv_nxt),
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_tw_tsval(tcptw),
			READ_ONCE(tcptw->tw_ts_recent),
			tw->tw_bound_dev_if, &key,
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tos,
			tw->tw_txhash);

	inet_twsk_put(tw);
}
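
/* Send an ACK on behalf of a request socket (SYN-RECV), signed with TCP-AO
 * or TCP-MD5 when the peer expects authenticated segments.
 */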
static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	struct tcp_key key = {};

	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
					     tcp_sk(sk)->snd_nxt;

#ifdef CONFIG_TCP_AO
	if (static_branch_unlikely(&tcp_ao_needed.key) &&
	    tcp_rsk_used_ao(req)) {
		const union tcp_md5_addr *addr;
		const struct tcp_ao_hdr *aoh;
		int l3index;

		/* Invalid TCP option size or twice included auth */
		if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
			return;
		if (!aoh)
			return;

		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET,
					      aoh->rnext_keyid, -1);
		if (unlikely(!key.ao_key)) {
			/* Send ACK with any matching MKT for the peer */
			key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1);
			/* Matching key disappeared (user removed the key?)
			 * let the handshake timeout.
			 */
			if (!key.ao_key) {
				net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n",
						     addr,
						     ntohs(tcp_hdr(skb)->source),
						     &ip_hdr(skb)->daddr,
						     ntohs(tcp_hdr(skb)->dest));
				return;
			}
		}
		key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC);
		if (!key.traffic_key)
			return;

		key.type = TCP_KEY_AO;
		key.rcv_next = aoh->keyid;
		tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req);
#else
	if (0) {
#endif
	} else if (static_branch_tcp_md5()) {
		const union tcp_md5_addr *addr;
		int l3index;

		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
		if (key.md5_key)
			key.type = TCP_KEY_MD5;
	}

	/* Clean the ECN bits of ACKs for oow data or paws_reject */
	tcp_v4_send_ack(sk, skb, seq,
			tcp_rsk(req)->rcv_nxt,
			tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale,
			tcp_rsk_tsval(tcp_rsk(req)),
			req->ts_recent,
			0, &key,
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos & ~INET_ECN_MASK,
			READ_ONCE(tcp_rsk(req)->txhash));
	if (tcp_key_is_ao(&key))
		kfree(key.traffic_key);
}

/*
 * Send a SYN-ACK after having received a SYN.
 * This still operates on a request_sock only, not on a big
 * socket.
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
			      struct flowi *fl,
			      struct request_sock *req,
			      struct tcp_fastopen_cookie *foc,
			      enum tcp_synack_type synack_type,
			      struct sk_buff *syn_skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;
	u8 tos;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);

	if (skb) {
		tcp_rsk(req)->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK;
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

		tos = READ_ONCE(inet_sk(sk)->tos);

		if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
			tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
			      (tos & INET_ECN_MASK);

		if (!INET_ECN_is_capable(tos) &&
		    tcp_bpf_ca_needs_ecn((struct sock *)req))
			tos |= INET_ECN_ECT_0;

		rcu_read_lock();
		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
					    rcu_dereference(ireq->ireq_opt),
					    tos);
		rcu_read_unlock();
		err = net_xmit_eval(err);
	}

	return err;
}

/*
 * IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
EXPORT_IPV6_MOD(tcp_md5_needed);
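
/* Order two matching keys: a key bound to an L3 domain always beats one
 * that is not, then the longer prefix wins.
 */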
static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
{
	if (!old)
		return true;

	/* l3index always overrides non-l3index */
	if (old->l3index && new->l3index == 0)
		return false;
	if (old->l3index == 0 && new->l3index)
		return true;

	return old->prefixlen < new->prefixlen;
}

/* Find the Key structure for an address.  */
struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
					   const union tcp_md5_addr *addr,
					   int family, bool any_l3index)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	const struct tcp_md5sig_info *md5sig;
	__be32 mask;
	struct tcp_md5sig_key *best_match = NULL;
	bool match;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;

	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX &&
		    key->l3index != l3index)
			continue;
		if (family == AF_INET) {
			mask = inet_make_mask(key->prefixlen);
			match = (key->addr.a4.s_addr & mask) ==
				(addr->a4.s_addr & mask);
#if IS_ENABLED(CONFIG_IPV6)
		} else if (family == AF_INET6) {
			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
						  key->prefixlen);
#endif
		} else {
			match = false;
		}

		if (match && better_md5_match(best_match, key))
			best_match = key;
	}
	return best_match;
}
EXPORT_IPV6_MOD(__tcp_md5_do_lookup);

static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
						      const union tcp_md5_addr *addr,
						      int family, u8 prefixlen,
						      int l3index, u8 flags)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	const struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
			continue;
		if (key->l3index != l3index)
			continue;
		if (!memcmp(&key->addr, addr, size) &&
		    key->prefixlen == prefixlen)
			return key;
	}
	return NULL;
}

struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
					 const struct sock *addr_sk)
{
	const union tcp_md5_addr *addr;
	int l3index;

	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
						 addr_sk->sk_bound_dev_if);
	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
}
EXPORT_IPV6_MOD(tcp_v4_md5_lookup);
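
/* Allocate the per-socket MD5 key list on first use.  GSO is disabled
 * here, as segmentation offload cannot compute the per-segment MD5
 * signatures.
 */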
static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	md5sig = kmalloc_obj(*md5sig, gfp);
	if (!md5sig)
		return -ENOMEM;

	sk_gso_disable(sk);
	INIT_HLIST_HEAD(&md5sig->head);
	rcu_assign_pointer(tp->md5sig_info, md5sig);
	return 0;
}

/* This can be called on a newly created socket, from other files */
static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
			    int family, u8 prefixlen, int l3index, u8 flags,
			    const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
	if (key) {
		/* Pre-existing entry - just update that one.
		 * Note that the key might be used concurrently.
		 * data_race() is telling kcsan that we do not care about
		 * key mismatches, since changing MD5 key on live flows
		 * can lead to packet drops.
		 */
		data_race(memcpy(key->key, newkey, newkeylen));

		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
		 * Also note that a reader could catch new key->keylen value
		 * but old key->key[], this is the reason we use __GFP_ZERO
		 * at sock_kmalloc() time below these lines.
		 */
		WRITE_ONCE(key->keylen, newkeylen);

		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   lockdep_sock_is_held(sk));

	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
	if (!key)
		return -ENOMEM;

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	key->prefixlen = prefixlen;
	key->l3index = l3index;
	key->flags = flags;
	memcpy(&key->addr, addr,
	       (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
								 sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}

int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, u8 prefixlen, int l3index, u8 flags,
		   const u8 *newkey, u8 newkeylen)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
		if (fips_enabled) {
			pr_warn_once("TCP-MD5 support is disabled due to FIPS\n");
			return -EOPNOTSUPP;
		}

		if (tcp_md5sig_info_add(sk, GFP_KERNEL))
			return -ENOMEM;

		if (!static_branch_inc(&tcp_md5_needed.key)) {
			struct tcp_md5sig_info *md5sig;

			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
			rcu_assign_pointer(tp->md5sig_info, NULL);
			kfree_rcu(md5sig, rcu);
			return -EUSERS;
		}
	}

	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
				newkey, newkeylen, GFP_KERNEL);
}
EXPORT_IPV6_MOD(tcp_md5_do_add);

int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
		     int family, u8 prefixlen, int l3index,
		     struct tcp_md5sig_key *key)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
		if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC)))
			return -ENOMEM;

		if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
			struct tcp_md5sig_info *md5sig;

			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
			net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
			rcu_assign_pointer(tp->md5sig_info, NULL);
			kfree_rcu(md5sig, rcu);
			return -EUSERS;
		}
	}

	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
				key->flags, key->key, key->keylen,
				sk_gfp_mask(sk, GFP_ATOMIC));
}
EXPORT_IPV6_MOD(tcp_md5_key_copy);

int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
		   u8 prefixlen, int l3index, u8 flags)
{
	struct tcp_md5sig_key *key;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	return 0;
}
EXPORT_IPV6_MOD(tcp_md5_do_del);

void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree(key);
	}
}
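
/* setsockopt(TCP_MD5SIG/TCP_MD5SIG_EXT) handler: validate the user's
 * request and add or (when tcpm_keylen is zero) delete the matching key.
 */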
static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
				 sockptr_t optval, int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
	const union tcp_md5_addr *addr;
	u8 prefixlen = 32;
	int l3index = 0;
	bool l3flag;
	u8 flags;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
	l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;

	if (optname == TCP_MD5SIG_EXT &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
		prefixlen = cmd.tcpm_prefixlen;
		if (prefixlen > 32)
			return -EINVAL;
	}

	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
		if (dev && netif_is_l3_master(dev))
			l3index = dev->ifindex;

		rcu_read_unlock();

		/* ok to reference set/not set outside of rcu;
		 * right now device MUST be an L3 master
		 */
		if (!dev || !l3index)
			return -EINVAL;
	}

	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;

	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	/* Don't allow keys for peers that have a matching TCP-AO key.
	 * See the comment in tcp_ao_add_cmd()
	 */
	if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false))
		return -EKEYREJECTED;

	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
			      cmd.tcpm_key, cmd.tcpm_keylen);
}

static void tcp_v4_md5_hash_headers(struct md5_ctx *ctx,
				    __be32 daddr, __be32 saddr,
				    const struct tcphdr *th, int nbytes)
{
	struct {
		struct tcp4_pseudohdr ip;
		struct tcphdr tcp;
	} h;

	h.ip.saddr = saddr;
	h.ip.daddr = daddr;
	h.ip.pad = 0;
	h.ip.protocol = IPPROTO_TCP;
	h.ip.len = cpu_to_be16(nbytes);
	h.tcp = *th;
	h.tcp.check = 0;
	md5_update(ctx, (const u8 *)&h, sizeof(h.ip) + sizeof(h.tcp));
}

static noinline_for_stack void
tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
		    __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct md5_ctx ctx;

	md5_init(&ctx);
	tcp_v4_md5_hash_headers(&ctx, daddr, saddr, th, th->doff << 2);
	tcp_md5_hash_key(&ctx, key);
	md5_final(&ctx, md5_hash);
}

noinline_for_stack void
tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
		    const struct sock *sk, const struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;
	struct md5_ctx ctx;

	if (sk) { /* valid for establish/request sockets */
		saddr = sk->sk_rcv_saddr;
		daddr = sk->sk_daddr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	md5_init(&ctx);
	tcp_v4_md5_hash_headers(&ctx, daddr, saddr, th, skb->len);
	tcp_md5_hash_skb_data(&ctx, skb, th->doff << 2);
	tcp_md5_hash_key(&ctx, key);
	md5_final(&ctx, md5_hash);
}
EXPORT_IPV6_MOD(tcp_v4_md5_hash_skb);

#endif
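
/* Initialize the request socket from the incoming SYN: record the address
 * pair and stash the IP options for the future child socket.
 */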
static void tcp_v4_init_req(struct request_sock *req,
			    const struct sock *sk_listener,
			    struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	struct net *net = sock_net(sk_listener);

	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
}

static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
					  struct sk_buff *skb,
					  struct flowi *fl,
					  struct request_sock *req,
					  u32 tw_isn)
{
	tcp_v4_init_req(req, sk, skb);

	if (security_inet_conn_request(sk, skb, req))
		return NULL;

	return inet_csk_route_req(sk, &fl->u.ip4, req);
}

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
};

const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.req_md5_lookup	=	tcp_v4_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#endif
#ifdef CONFIG_TCP_AO
	.ao_lookup	=	tcp_v4_ao_lookup_rsk,
	.ao_calc_key	=	tcp_v4_ao_calc_key_rsk,
	.ao_synack_hash	=	tcp_v4_ao_synack_hash,
#endif
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
#endif
	.route_req	=	tcp_v4_route_req,
	.init_seq_and_ts_off =	tcp_v4_init_seq_and_ts_off,
	.send_synack	=	tcp_v4_send_synack,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&tcp_request_sock_ops,
				&tcp_request_sock_ipv4_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0;
}
EXPORT_IPV6_MOD(tcp_v4_conn_request);


/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req,
				  void (*opt_child_init)(struct sock *newsk,
							 const struct sock *sk))
{
	struct inet_request_sock *ireq;
	bool found_dup_sk = false;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	const union tcp_md5_addr *addr;
	struct tcp_md5sig_key *key;
	int l3index;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	inet_opt	      = rcu_dereference(ireq->ireq_opt);
	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	newinet->rcv_tos      = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	atomic_set(&newinet->inet_id, get_random_u16());

	/* Set ToS of the new socket based upon the value of incoming SYN.
	 * ECT bits are set later in tcp_init_transfer().
	 */
	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

#if IS_ENABLED(CONFIG_IPV6)
	if (opt_child_init)
		opt_child_init(newsk, sk);
#endif
	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst4_mtu(dst));
	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
	/* Copy over the MD5 key from the original socket */
	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	if (key && !tcp_rsk_used_ao(req)) {
		if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
			goto put_and_exit;
		sk_gso_disable(newsk);
	}
#endif
#ifdef CONFIG_TCP_AO
	if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET))
		goto put_and_exit; /* OOM, release back memory */
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
				       &found_dup_sk);
	if (likely(*own_req)) {
		tcp_move_syn(newtp, req);
		ireq->ireq_opt = NULL;
	} else {
		newinet->inet_opt = NULL;

		if (!req_unhash && found_dup_sk) {
			/* This code path should only be executed in the
			 * syncookie case
			 */
			bh_unlock_sock(newsk);
			sock_put(newsk);
			newsk = NULL;
		}
	}
	return newsk;

exit_overflow:
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	tcp_listendrop(sk);
	return NULL;
put_and_exit:
	newinet->inet_opt = NULL;
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_IPV6_MOD(tcp_v4_syn_recv_sock);
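
/* A non-SYN segment arrived on a listener: with syncookies enabled, try to
 * validate it as a syncookie ACK and recreate the connection it encodes.
 */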
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	const struct tcphdr *th = tcp_hdr(skb);

	if (!th->syn)
		sk = cookie_v4_check(sk, skb);
#endif
	return sk;
}

u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
			 struct tcphdr *th, u32 *cookie)
{
	u16 mss = 0;
#ifdef CONFIG_SYN_COOKIES
	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
				    &tcp_request_sock_ipv4_ops, sk, th);
	if (mss) {
		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
		tcp_synq_overflow(sk);
	}
#endif
	return mss;
}

INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
							   u32));
/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	enum skb_drop_reason reason;
	struct sock *rsk;

	reason = psp_sk_rx_policy_check(sk, skb);
	if (reason)
		goto err_discard;

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst;

		dst = rcu_dereference_protected(sk->sk_rx_dst,
						lockdep_sock_is_held(sk));

		sock_rps_save_rxhash(sk, skb);
		sk_mark_napi_id(sk, skb);
		if (dst) {
			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
					     dst, 0)) {
				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
				dst_release(dst);
			}
		}
		tcp_rcv_established(sk, skb);
		return 0;
	}

	if (tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_cookie_check(sk, skb);

		if (!nsk)
			return 0;
		if (nsk != sk) {
			reason = tcp_child_process(sk, nsk, skb);
			if (reason) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	reason = tcp_rcv_state_process(sk, skb);
	if (reason) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason));
discard:
	sk_skb_reason_drop(sk, skb, reason);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	reason = SKB_DROP_REASON_TCP_CSUM;
	trace_tcp_bad_csum(skb);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
err_discard:
	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);
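
/* Early demux: look up an established socket at IP layer entry so that the
 * socket's cached rx dst can be attached to the skb before full processing.
 */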
1974 * It has been noticed pure SACK packets were sometimes dropped 1975 * (if cooked by drivers without copybreak feature). 1976 */ 1977 skb_condense(skb); 1978 1979 tcp_cleanup_skb(skb); 1980 1981 if (unlikely(tcp_checksum_complete(skb))) { 1982 bh_unlock_sock(sk); 1983 trace_tcp_bad_csum(skb); 1984 *reason = SKB_DROP_REASON_TCP_CSUM; 1985 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 1986 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 1987 return true; 1988 } 1989 1990 /* Attempt coalescing to last skb in backlog, even if we are 1991 * above the limits. 1992 * This is okay because skb capacity is limited to MAX_SKB_FRAGS. 1993 */ 1994 th = (const struct tcphdr *)skb->data; 1995 hdrlen = th->doff * 4; 1996 1997 tail = sk->sk_backlog.tail; 1998 if (!tail) 1999 goto no_coalesce; 2000 thtail = (struct tcphdr *)tail->data; 2001 2002 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq || 2003 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield || 2004 ((TCP_SKB_CB(tail)->tcp_flags | 2005 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) || 2006 !((TCP_SKB_CB(tail)->tcp_flags & 2007 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) || 2008 ((TCP_SKB_CB(tail)->tcp_flags ^ 2009 TCP_SKB_CB(skb)->tcp_flags) & 2010 (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE)) || 2011 !tcp_skb_can_collapse_rx(tail, skb) || 2012 thtail->doff != th->doff || 2013 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)) || 2014 /* prior to PSP Rx policy check, retain exact PSP metadata */ 2015 psp_skb_coalesce_diff(tail, skb)) 2016 goto no_coalesce; 2017 2018 __skb_pull(skb, hdrlen); 2019 2020 shinfo = skb_shinfo(skb); 2021 gso_size = shinfo->gso_size ?: skb->len; 2022 gso_segs = shinfo->gso_segs ?: 1; 2023 2024 shinfo = skb_shinfo(tail); 2025 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen); 2026 tail_gso_segs = shinfo->gso_segs ?: 1; 2027 2028 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) { 2029 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq; 2030 2031 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) { 2032 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq; 2033 thtail->window = th->window; 2034 } 2035 2036 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and 2037 * thtail->fin, so that the fast path in tcp_rcv_established() 2038 * is not entered if we append a packet with a FIN. 2039 * SYN, RST, URG are not present. 2040 * ACK is set on both packets. 2041 * PSH : we do not really care in TCP stack, 2042 * at least for 'GRO' packets. 2043 */ 2044 thtail->fin |= th->fin; 2045 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; 2046 2047 if (TCP_SKB_CB(skb)->has_rxtstamp) { 2048 TCP_SKB_CB(tail)->has_rxtstamp = true; 2049 tail->tstamp = skb->tstamp; 2050 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp; 2051 } 2052 2053 /* Not as strict as GRO. We only need to carry mss max value */ 2054 shinfo->gso_size = max(gso_size, tail_gso_size); 2055 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF); 2056 2057 sk->sk_backlog.len += delta; 2058 __NET_INC_STATS(sock_net(sk), 2059 LINUX_MIB_TCPBACKLOGCOALESCE); 2060 kfree_skb_partial(skb, fragstolen); 2061 return false; 2062 } 2063 __skb_push(skb, hdrlen); 2064 2065 no_coalesce: 2066 /* sk->sk_backlog.len is reset only at the end of __release_sock(). 2067 * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach 2068 * sk_rcvbuf in normal conditions. 
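 * The budget computed below is roughly 2 * sk_rcvbuf + sk_sndbuf / 2 + 64KB; for example, assuming the default tcp_rmem[1] = 131072 and tcp_wmem[1] = 16384, that is 262144 + 8192 + 65536 = 335872 bytes. Actual values depend on the socket's current buffer sizes.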
2069 */ 2070 limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1; 2071 2072 limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1; 2073 2074 /* Only socket owner can try to collapse/prune rx queues 2075 * to reduce memory overhead, so add a little headroom here. 2076 * Only a few socket backlogs are likely to be non-empty at the same time. 2077 */ 2078 limit += 64 * 1024; 2079 2080 limit = min_t(u64, limit, UINT_MAX); 2081 2082 err = sk_add_backlog(sk, skb, limit); 2083 if (unlikely(err)) { 2084 bh_unlock_sock(sk); 2085 if (err == -ENOMEM) { 2086 *reason = SKB_DROP_REASON_PFMEMALLOC; 2087 __NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP); 2088 } else { 2089 *reason = SKB_DROP_REASON_SOCKET_BACKLOG; 2090 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP); 2091 } 2092 return true; 2093 } 2094 return false; 2095 } 2096 EXPORT_IPV6_MOD(tcp_add_backlog); 2097 2098 static void tcp_v4_restore_cb(struct sk_buff *skb) 2099 { 2100 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4, 2101 sizeof(struct inet_skb_parm)); 2102 } 2103 2104 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, 2105 const struct tcphdr *th) 2106 { 2107 /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB(); 2108 * barrier() makes sure the compiler won't play fool^Waliasing games. 2109 */ 2110 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb), 2111 sizeof(struct inet_skb_parm)); 2112 barrier(); 2113 2114 TCP_SKB_CB(skb)->seq = ntohl(th->seq); 2115 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + 2116 skb->len - th->doff * 4); 2117 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); 2118 TCP_SKB_CB(skb)->tcp_flags = tcp_flags_ntohs(th); 2119 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); 2120 TCP_SKB_CB(skb)->sacked = 0; 2121 TCP_SKB_CB(skb)->has_rxtstamp = 2122 skb->tstamp || skb_hwtstamps(skb)->hwtstamp; 2123 } 2124 2125 /* 2126 * From tcp_input.c 2127 */ 2128 2129 int tcp_v4_rcv(struct sk_buff *skb) 2130 { 2131 struct net *net = dev_net_rcu(skb->dev); 2132 enum skb_drop_reason drop_reason; 2133 enum tcp_tw_status tw_status; 2134 int sdif = inet_sdif(skb); 2135 int dif = inet_iif(skb); 2136 const struct iphdr *iph; 2137 const struct tcphdr *th; 2138 struct sock *sk = NULL; 2139 bool refcounted; 2140 int ret; 2141 u32 isn; 2142 2143 drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; 2144 if (skb->pkt_type != PACKET_HOST) 2145 goto discard_it; 2146 2147 /* Count it even if it's bad */ 2148 __TCP_INC_STATS(net, TCP_MIB_INSEGS); 2149 2150 if (!pskb_may_pull(skb, sizeof(struct tcphdr))) 2151 goto discard_it; 2152 2153 th = (const struct tcphdr *)skb->data; 2154 2155 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) { 2156 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL; 2157 goto bad_packet; 2158 } 2159 if (!pskb_may_pull(skb, th->doff * 4)) 2160 goto discard_it; 2161 2162 /* An explanation is required here, I think. 2163 * Packet length and doff are validated by header prediction, 2164 * provided case of th->doff==0 is eliminated. 2165 * So, we defer the checks.
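 * (The pskb_may_pull() and minimum-doff checks above already guarantee the whole header is linear in skb->data; per-option validation happens later, once a socket is attached.)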
*/ 2166 2167 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo)) 2168 goto csum_error; 2169 2170 th = (const struct tcphdr *)skb->data; 2171 iph = ip_hdr(skb); 2172 lookup: 2173 sk = __inet_lookup_skb(skb, __tcp_hdrlen(th), th->source, 2174 th->dest, sdif, &refcounted); 2175 if (!sk) 2176 goto no_tcp_socket; 2177 2178 if (sk->sk_state == TCP_TIME_WAIT) 2179 goto do_time_wait; 2180 2181 if (sk->sk_state == TCP_NEW_SYN_RECV) { 2182 struct request_sock *req = inet_reqsk(sk); 2183 bool req_stolen = false; 2184 struct sock *nsk; 2185 2186 sk = req->rsk_listener; 2187 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) 2188 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2189 else 2190 drop_reason = tcp_inbound_hash(sk, req, skb, 2191 &iph->saddr, &iph->daddr, 2192 AF_INET, dif, sdif); 2193 if (unlikely(drop_reason)) { 2194 sk_drops_skbadd(sk, skb); 2195 reqsk_put(req); 2196 goto discard_it; 2197 } 2198 if (tcp_checksum_complete(skb)) { 2199 reqsk_put(req); 2200 goto csum_error; 2201 } 2202 if (unlikely(sk->sk_state != TCP_LISTEN)) { 2203 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb); 2204 if (!nsk) { 2205 inet_csk_reqsk_queue_drop_and_put(sk, req); 2206 goto lookup; 2207 } 2208 sk = nsk; 2209 /* reuseport_migrate_sock() has already held one sk_refcnt 2210 * before returning. 2211 */ 2212 } else { 2213 /* We own a reference on the listener, increase it again 2214 * as we might lose it too soon. 2215 */ 2216 sock_hold(sk); 2217 } 2218 refcounted = true; 2219 nsk = NULL; 2220 if (!tcp_filter(sk, skb, &drop_reason)) { 2221 th = (const struct tcphdr *)skb->data; 2222 iph = ip_hdr(skb); 2223 tcp_v4_fill_cb(skb, iph, th); 2224 nsk = tcp_check_req(sk, skb, req, false, &req_stolen, 2225 &drop_reason); 2226 } 2227 if (!nsk) { 2228 reqsk_put(req); 2229 if (req_stolen) { 2230 /* Another cpu got exclusive access to req 2231 * and created a full blown socket. 2232 * Try to feed this packet to this socket 2233 * instead of discarding it. 
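 * (tcp_v4_restore_cb() below undoes tcp_v4_fill_cb(), so the second pass through the "lookup" label sees a pristine IPCB again.)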
2234 */ 2235 tcp_v4_restore_cb(skb); 2236 sock_put(sk); 2237 goto lookup; 2238 } 2239 goto discard_and_relse; 2240 } 2241 nf_reset_ct(skb); 2242 if (nsk == sk) { 2243 reqsk_put(req); 2244 tcp_v4_restore_cb(skb); 2245 } else { 2246 drop_reason = tcp_child_process(sk, nsk, skb); 2247 if (drop_reason) { 2248 enum sk_rst_reason rst_reason; 2249 2250 rst_reason = sk_rst_convert_drop_reason(drop_reason); 2251 tcp_v4_send_reset(nsk, skb, rst_reason); 2252 goto discard_and_relse; 2253 } 2254 sock_put(sk); 2255 return 0; 2256 } 2257 } 2258 2259 process: 2260 if (static_branch_unlikely(&ip4_min_ttl)) { 2261 /* min_ttl can be changed concurrently from do_ip_setsockopt() */ 2262 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) { 2263 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 2264 drop_reason = SKB_DROP_REASON_TCP_MINTTL; 2265 goto discard_and_relse; 2266 } 2267 } 2268 2269 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { 2270 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2271 goto discard_and_relse; 2272 } 2273 2274 drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr, 2275 AF_INET, dif, sdif); 2276 if (drop_reason) 2277 goto discard_and_relse; 2278 2279 nf_reset_ct(skb); 2280 2281 if (tcp_filter(sk, skb, &drop_reason)) 2282 goto discard_and_relse; 2283 2284 th = (const struct tcphdr *)skb->data; 2285 iph = ip_hdr(skb); 2286 tcp_v4_fill_cb(skb, iph, th); 2287 2288 skb->dev = NULL; 2289 2290 if (sk->sk_state == TCP_LISTEN) { 2291 ret = tcp_v4_do_rcv(sk, skb); 2292 goto put_and_return; 2293 } 2294 2295 sk_incoming_cpu_update(sk); 2296 2297 bh_lock_sock_nested(sk); 2298 tcp_segs_in(tcp_sk(sk), skb); 2299 ret = 0; 2300 if (!sock_owned_by_user(sk)) { 2301 ret = tcp_v4_do_rcv(sk, skb); 2302 } else { 2303 if (tcp_add_backlog(sk, skb, &drop_reason)) 2304 goto discard_and_relse; 2305 } 2306 bh_unlock_sock(sk); 2307 2308 put_and_return: 2309 if (refcounted) 2310 sock_put(sk); 2311 2312 return ret; 2313 2314 no_tcp_socket: 2315 drop_reason = SKB_DROP_REASON_NO_SOCKET; 2316 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) 2317 goto discard_it; 2318 2319 tcp_v4_fill_cb(skb, iph, th); 2320 2321 if (tcp_checksum_complete(skb)) { 2322 csum_error: 2323 drop_reason = SKB_DROP_REASON_TCP_CSUM; 2324 trace_tcp_bad_csum(skb); 2325 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); 2326 bad_packet: 2327 __TCP_INC_STATS(net, TCP_MIB_INERRS); 2328 } else { 2329 tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason)); 2330 } 2331 2332 discard_it: 2333 SKB_DR_OR(drop_reason, NOT_SPECIFIED); 2334 /* Discard frame. 
*/ 2335 sk_skb_reason_drop(sk, skb, drop_reason); 2336 return 0; 2337 2338 discard_and_relse: 2339 sk_drops_skbadd(sk, skb); 2340 if (refcounted) 2341 sock_put(sk); 2342 goto discard_it; 2343 2344 do_time_wait: 2345 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { 2346 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2347 inet_twsk_put(inet_twsk(sk)); 2348 goto discard_it; 2349 } 2350 2351 tcp_v4_fill_cb(skb, iph, th); 2352 2353 if (tcp_checksum_complete(skb)) { 2354 inet_twsk_put(inet_twsk(sk)); 2355 goto csum_error; 2356 } 2357 2358 tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn, 2359 &drop_reason); 2360 switch (tw_status) { 2361 case TCP_TW_SYN: { 2362 struct sock *sk2 = inet_lookup_listener(net, skb, __tcp_hdrlen(th), 2363 iph->saddr, th->source, 2364 iph->daddr, th->dest, 2365 inet_iif(skb), 2366 sdif); 2367 if (sk2) { 2368 inet_twsk_deschedule_put(inet_twsk(sk)); 2369 sk = sk2; 2370 tcp_v4_restore_cb(skb); 2371 refcounted = false; 2372 __this_cpu_write(tcp_tw_isn, isn); 2373 goto process; 2374 } 2375 2376 drop_reason = psp_twsk_rx_policy_check(inet_twsk(sk), skb); 2377 if (drop_reason) 2378 break; 2379 } 2380 /* to ACK */ 2381 fallthrough; 2382 case TCP_TW_ACK: 2383 case TCP_TW_ACK_OOW: 2384 tcp_v4_timewait_ack(sk, skb, tw_status); 2385 break; 2386 case TCP_TW_RST: 2387 tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET); 2388 inet_twsk_deschedule_put(inet_twsk(sk)); 2389 goto discard_it; 2390 case TCP_TW_SUCCESS:; 2391 } 2392 goto discard_it; 2393 } 2394 2395 static struct timewait_sock_ops tcp_timewait_sock_ops = { 2396 .twsk_obj_size = sizeof(struct tcp_timewait_sock), 2397 }; 2398 2399 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) 2400 { 2401 struct dst_entry *dst = skb_dst(skb); 2402 2403 if (dst && dst_hold_safe(dst)) { 2404 rcu_assign_pointer(sk->sk_rx_dst, dst); 2405 sk->sk_rx_dst_ifindex = skb->skb_iif; 2406 } 2407 } 2408 EXPORT_IPV6_MOD(inet_sk_rx_dst_set); 2409 2410 const struct inet_connection_sock_af_ops ipv4_specific = { 2411 .queue_xmit = ip_queue_xmit, 2412 .rebuild_header = inet_sk_rebuild_header, 2413 .sk_rx_dst_set = inet_sk_rx_dst_set, 2414 .conn_request = tcp_v4_conn_request, 2415 .syn_recv_sock = tcp_v4_syn_recv_sock, 2416 .net_header_len = sizeof(struct iphdr), 2417 .setsockopt = ip_setsockopt, 2418 .getsockopt = ip_getsockopt, 2419 .mtu_reduced = tcp_v4_mtu_reduced, 2420 }; 2421 EXPORT_IPV6_MOD(ipv4_specific); 2422 2423 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) 2424 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { 2425 #ifdef CONFIG_TCP_MD5SIG 2426 .md5_lookup = tcp_v4_md5_lookup, 2427 .calc_md5_hash = tcp_v4_md5_hash_skb, 2428 .md5_parse = tcp_v4_parse_md5_keys, 2429 #endif 2430 #ifdef CONFIG_TCP_AO 2431 .ao_lookup = tcp_v4_ao_lookup, 2432 .calc_ao_hash = tcp_v4_ao_hash_skb, 2433 .ao_parse = tcp_v4_parse_ao, 2434 .ao_calc_key_sk = tcp_v4_ao_calc_key_sk, 2435 #endif 2436 }; 2437 2438 static void tcp4_destruct_sock(struct sock *sk) 2439 { 2440 tcp_md5_destruct_sock(sk); 2441 tcp_ao_destroy_sock(sk, false); 2442 inet_sock_destruct(sk); 2443 } 2444 #endif 2445 2446 /* NOTE: A lot of things set to zero explicitly by call to 2447 * sk_alloc() so need not be done here. 
2448 */ 2449 static int tcp_v4_init_sock(struct sock *sk) 2450 { 2451 struct inet_connection_sock *icsk = inet_csk(sk); 2452 2453 tcp_init_sock(sk); 2454 2455 icsk->icsk_af_ops = &ipv4_specific; 2456 2457 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) 2458 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific; 2459 sk->sk_destruct = tcp4_destruct_sock; 2460 #endif 2461 2462 return 0; 2463 } 2464 2465 static void tcp_release_user_frags(struct sock *sk) 2466 { 2467 #ifdef CONFIG_PAGE_POOL 2468 unsigned long index; 2469 void *netmem; 2470 2471 xa_for_each(&sk->sk_user_frags, index, netmem) 2472 WARN_ON_ONCE(!napi_pp_put_page((__force netmem_ref)netmem)); 2473 #endif 2474 } 2475 2476 void tcp_v4_destroy_sock(struct sock *sk) 2477 { 2478 struct tcp_sock *tp = tcp_sk(sk); 2479 2480 tcp_release_user_frags(sk); 2481 2482 xa_destroy(&sk->sk_user_frags); 2483 2484 trace_tcp_destroy_sock(sk); 2485 2486 tcp_clear_xmit_timers(sk); 2487 2488 tcp_cleanup_congestion_control(sk); 2489 2490 tcp_cleanup_ulp(sk); 2491 2492 /* Cleanup up the write buffer. */ 2493 tcp_write_queue_purge(sk); 2494 2495 /* Check if we want to disable active TFO */ 2496 tcp_fastopen_active_disable_ofo_check(sk); 2497 2498 /* Cleans up our, hopefully empty, out_of_order_queue. */ 2499 skb_rbtree_purge(&tp->out_of_order_queue); 2500 2501 /* Clean up a referenced TCP bind bucket. */ 2502 if (inet_csk(sk)->icsk_bind_hash) 2503 inet_put_port(sk); 2504 2505 BUG_ON(rcu_access_pointer(tp->fastopen_rsk)); 2506 2507 /* If socket is aborted during connect operation */ 2508 tcp_free_fastopen_req(tp); 2509 tcp_fastopen_destroy_cipher(sk); 2510 tcp_saved_syn_free(tp); 2511 2512 sk_sockets_allocated_dec(sk); 2513 } 2514 EXPORT_IPV6_MOD(tcp_v4_destroy_sock); 2515 2516 #ifdef CONFIG_PROC_FS 2517 /* Proc filesystem TCP sock list dumping. */ 2518 2519 static unsigned short seq_file_family(const struct seq_file *seq); 2520 2521 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk) 2522 { 2523 unsigned short family = seq_file_family(seq); 2524 2525 /* AF_UNSPEC is used as a match all */ 2526 return ((family == AF_UNSPEC || family == sk->sk_family) && 2527 net_eq(sock_net(sk), seq_file_net(seq))); 2528 } 2529 2530 /* Find a non empty bucket (starting from st->bucket) 2531 * and return the first sk from it. 2532 */ 2533 static void *listening_get_first(struct seq_file *seq) 2534 { 2535 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2536 struct tcp_iter_state *st = seq->private; 2537 2538 st->offset = 0; 2539 for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) { 2540 struct inet_listen_hashbucket *ilb2; 2541 struct hlist_nulls_node *node; 2542 struct sock *sk; 2543 2544 ilb2 = &hinfo->lhash2[st->bucket]; 2545 if (hlist_nulls_empty(&ilb2->nulls_head)) 2546 continue; 2547 2548 spin_lock(&ilb2->lock); 2549 sk_nulls_for_each(sk, node, &ilb2->nulls_head) { 2550 if (seq_sk_match(seq, sk)) 2551 return sk; 2552 } 2553 spin_unlock(&ilb2->lock); 2554 } 2555 2556 return NULL; 2557 } 2558 2559 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket). 2560 * If "cur" is the last one in the st->bucket, 2561 * call listening_get_first() to return the first sk of the next 2562 * non empty bucket. 
2563 */ 2564 static void *listening_get_next(struct seq_file *seq, void *cur) 2565 { 2566 struct tcp_iter_state *st = seq->private; 2567 struct inet_listen_hashbucket *ilb2; 2568 struct hlist_nulls_node *node; 2569 struct inet_hashinfo *hinfo; 2570 struct sock *sk = cur; 2571 2572 ++st->num; 2573 ++st->offset; 2574 2575 sk = sk_nulls_next(sk); 2576 sk_nulls_for_each_from(sk, node) { 2577 if (seq_sk_match(seq, sk)) 2578 return sk; 2579 } 2580 2581 hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2582 ilb2 = &hinfo->lhash2[st->bucket]; 2583 spin_unlock(&ilb2->lock); 2584 ++st->bucket; 2585 return listening_get_first(seq); 2586 } 2587 2588 static void *listening_get_idx(struct seq_file *seq, loff_t *pos) 2589 { 2590 struct tcp_iter_state *st = seq->private; 2591 void *rc; 2592 2593 st->bucket = 0; 2594 st->offset = 0; 2595 rc = listening_get_first(seq); 2596 2597 while (rc && *pos) { 2598 rc = listening_get_next(seq, rc); 2599 --*pos; 2600 } 2601 return rc; 2602 } 2603 2604 static inline bool empty_bucket(struct inet_hashinfo *hinfo, 2605 const struct tcp_iter_state *st) 2606 { 2607 return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain); 2608 } 2609 2610 /* 2611 * Get first established socket starting from bucket given in st->bucket. 2612 * If st->bucket is zero, the very first socket in the hash is returned. 2613 */ 2614 static void *established_get_first(struct seq_file *seq) 2615 { 2616 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2617 struct tcp_iter_state *st = seq->private; 2618 2619 st->offset = 0; 2620 for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) { 2621 struct sock *sk; 2622 struct hlist_nulls_node *node; 2623 spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket); 2624 2625 cond_resched(); 2626 2627 /* Lockless fast path for the common case of empty buckets */ 2628 if (empty_bucket(hinfo, st)) 2629 continue; 2630 2631 spin_lock_bh(lock); 2632 sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) { 2633 if (seq_sk_match(seq, sk)) 2634 return sk; 2635 } 2636 spin_unlock_bh(lock); 2637 } 2638 2639 return NULL; 2640 } 2641 2642 static void *established_get_next(struct seq_file *seq, void *cur) 2643 { 2644 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2645 struct tcp_iter_state *st = seq->private; 2646 struct hlist_nulls_node *node; 2647 struct sock *sk = cur; 2648 2649 ++st->num; 2650 ++st->offset; 2651 2652 sk = sk_nulls_next(sk); 2653 2654 sk_nulls_for_each_from(sk, node) { 2655 if (seq_sk_match(seq, sk)) 2656 return sk; 2657 } 2658 2659 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 2660 ++st->bucket; 2661 return established_get_first(seq); 2662 } 2663 2664 static void *established_get_idx(struct seq_file *seq, loff_t pos) 2665 { 2666 struct tcp_iter_state *st = seq->private; 2667 void *rc; 2668 2669 st->bucket = 0; 2670 rc = established_get_first(seq); 2671 2672 while (rc && pos) { 2673 rc = established_get_next(seq, rc); 2674 --pos; 2675 } 2676 return rc; 2677 } 2678 2679 static void *tcp_get_idx(struct seq_file *seq, loff_t pos) 2680 { 2681 void *rc; 2682 struct tcp_iter_state *st = seq->private; 2683 2684 st->state = TCP_SEQ_STATE_LISTENING; 2685 rc = listening_get_idx(seq, &pos); 2686 2687 if (!rc) { 2688 st->state = TCP_SEQ_STATE_ESTABLISHED; 2689 rc = established_get_idx(seq, pos); 2690 } 2691 2692 return rc; 2693 } 2694 2695 static void *tcp_seek_last_pos(struct seq_file *seq) 2696 { 2697 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2698 struct 
tcp_iter_state *st = seq->private; 2699 int bucket = st->bucket; 2700 int offset = st->offset; 2701 int orig_num = st->num; 2702 void *rc = NULL; 2703 2704 switch (st->state) { 2705 case TCP_SEQ_STATE_LISTENING: 2706 if (st->bucket > hinfo->lhash2_mask) 2707 break; 2708 rc = listening_get_first(seq); 2709 while (offset-- && rc && bucket == st->bucket) 2710 rc = listening_get_next(seq, rc); 2711 if (rc) 2712 break; 2713 st->bucket = 0; 2714 st->state = TCP_SEQ_STATE_ESTABLISHED; 2715 fallthrough; 2716 case TCP_SEQ_STATE_ESTABLISHED: 2717 if (st->bucket > hinfo->ehash_mask) 2718 break; 2719 rc = established_get_first(seq); 2720 while (offset-- && rc && bucket == st->bucket) 2721 rc = established_get_next(seq, rc); 2722 } 2723 2724 st->num = orig_num; 2725 2726 return rc; 2727 } 2728 2729 void *tcp_seq_start(struct seq_file *seq, loff_t *pos) 2730 { 2731 struct tcp_iter_state *st = seq->private; 2732 void *rc; 2733 2734 if (*pos && *pos == st->last_pos) { 2735 rc = tcp_seek_last_pos(seq); 2736 if (rc) 2737 goto out; 2738 } 2739 2740 st->state = TCP_SEQ_STATE_LISTENING; 2741 st->num = 0; 2742 st->bucket = 0; 2743 st->offset = 0; 2744 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; 2745 2746 out: 2747 st->last_pos = *pos; 2748 return rc; 2749 } 2750 EXPORT_IPV6_MOD(tcp_seq_start); 2751 2752 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2753 { 2754 struct tcp_iter_state *st = seq->private; 2755 void *rc = NULL; 2756 2757 if (v == SEQ_START_TOKEN) { 2758 rc = tcp_get_idx(seq, 0); 2759 goto out; 2760 } 2761 2762 switch (st->state) { 2763 case TCP_SEQ_STATE_LISTENING: 2764 rc = listening_get_next(seq, v); 2765 if (!rc) { 2766 st->state = TCP_SEQ_STATE_ESTABLISHED; 2767 st->bucket = 0; 2768 st->offset = 0; 2769 rc = established_get_first(seq); 2770 } 2771 break; 2772 case TCP_SEQ_STATE_ESTABLISHED: 2773 rc = established_get_next(seq, v); 2774 break; 2775 } 2776 out: 2777 ++*pos; 2778 st->last_pos = *pos; 2779 return rc; 2780 } 2781 EXPORT_IPV6_MOD(tcp_seq_next); 2782 2783 void tcp_seq_stop(struct seq_file *seq, void *v) 2784 { 2785 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2786 struct tcp_iter_state *st = seq->private; 2787 2788 switch (st->state) { 2789 case TCP_SEQ_STATE_LISTENING: 2790 if (v != SEQ_START_TOKEN) 2791 spin_unlock(&hinfo->lhash2[st->bucket].lock); 2792 break; 2793 case TCP_SEQ_STATE_ESTABLISHED: 2794 if (v) 2795 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 2796 break; 2797 } 2798 } 2799 EXPORT_IPV6_MOD(tcp_seq_stop); 2800 2801 static void get_openreq4(const struct request_sock *req, 2802 struct seq_file *f, int i) 2803 { 2804 const struct inet_request_sock *ireq = inet_rsk(req); 2805 long delta = req->rsk_timer.expires - jiffies; 2806 2807 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2808 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK", 2809 i, 2810 ireq->ir_loc_addr, 2811 ireq->ir_num, 2812 ireq->ir_rmt_addr, 2813 ntohs(ireq->ir_rmt_port), 2814 TCP_SYN_RECV, 2815 0, 0, /* could print option size, but that is af dependent. 
*/ 2816 1, /* timers active (only the expire timer) */ 2817 jiffies_delta_to_clock_t(delta), 2818 req->num_timeout, 2819 from_kuid_munged(seq_user_ns(f), 2820 sk_uid(req->rsk_listener)), 2821 0, /* non standard timer */ 2822 0, /* open_requests have no inode */ 2823 0, 2824 req); 2825 } 2826 2827 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) 2828 { 2829 int timer_active; 2830 unsigned long timer_expires; 2831 const struct tcp_sock *tp = tcp_sk(sk); 2832 const struct inet_connection_sock *icsk = inet_csk(sk); 2833 const struct inet_sock *inet = inet_sk(sk); 2834 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; 2835 __be32 dest = inet->inet_daddr; 2836 __be32 src = inet->inet_rcv_saddr; 2837 __u16 destp = ntohs(inet->inet_dport); 2838 __u16 srcp = ntohs(inet->inet_sport); 2839 u8 icsk_pending; 2840 int rx_queue; 2841 int state; 2842 2843 icsk_pending = smp_load_acquire(&icsk->icsk_pending); 2844 if (icsk_pending == ICSK_TIME_RETRANS || 2845 icsk_pending == ICSK_TIME_REO_TIMEOUT || 2846 icsk_pending == ICSK_TIME_LOSS_PROBE) { 2847 timer_active = 1; 2848 timer_expires = tcp_timeout_expires(sk); 2849 } else if (icsk_pending == ICSK_TIME_PROBE0) { 2850 timer_active = 4; 2851 timer_expires = tcp_timeout_expires(sk); 2852 } else if (timer_pending(&icsk->icsk_keepalive_timer)) { 2853 timer_active = 2; 2854 timer_expires = icsk->icsk_keepalive_timer.expires; 2855 } else { 2856 timer_active = 0; 2857 timer_expires = jiffies; 2858 } 2859 2860 state = inet_sk_state_load(sk); 2861 if (state == TCP_LISTEN) 2862 rx_queue = READ_ONCE(sk->sk_ack_backlog); 2863 else 2864 /* Because we don't lock the socket, 2865 * we might find a transient negative value. 2866 */ 2867 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) - 2868 READ_ONCE(tp->copied_seq), 0); 2869 2870 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " 2871 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d", 2872 i, src, srcp, dest, destp, state, 2873 READ_ONCE(tp->write_seq) - tp->snd_una, 2874 rx_queue, 2875 timer_active, 2876 jiffies_delta_to_clock_t(timer_expires - jiffies), 2877 READ_ONCE(icsk->icsk_retransmits), 2878 from_kuid_munged(seq_user_ns(f), sk_uid(sk)), 2879 READ_ONCE(icsk->icsk_probes_out), 2880 sock_i_ino(sk), 2881 refcount_read(&sk->sk_refcnt), sk, 2882 jiffies_to_clock_t(icsk->icsk_rto), 2883 jiffies_to_clock_t(icsk->icsk_ack.ato), 2884 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk), 2885 tcp_snd_cwnd(tp), 2886 state == TCP_LISTEN ? 2887 fastopenq->max_qlen : 2888 (tcp_in_initial_slowstart(tp) ? 
-1 : tp->snd_ssthresh)); 2889 } 2890 2891 static void get_timewait4_sock(const struct inet_timewait_sock *tw, 2892 struct seq_file *f, int i) 2893 { 2894 long delta = tw->tw_timer.expires - jiffies; 2895 __be32 dest, src; 2896 __u16 destp, srcp; 2897 2898 dest = tw->tw_daddr; 2899 src = tw->tw_rcv_saddr; 2900 destp = ntohs(tw->tw_dport); 2901 srcp = ntohs(tw->tw_sport); 2902 2903 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2904 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK", 2905 i, src, srcp, dest, destp, READ_ONCE(tw->tw_substate), 0, 0, 2906 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, 2907 refcount_read(&tw->tw_refcnt), tw); 2908 } 2909 2910 #define TMPSZ 150 2911 2912 static int tcp4_seq_show(struct seq_file *seq, void *v) 2913 { 2914 struct tcp_iter_state *st; 2915 struct sock *sk = v; 2916 2917 seq_setwidth(seq, TMPSZ - 1); 2918 if (v == SEQ_START_TOKEN) { 2919 seq_puts(seq, " sl local_address rem_address st tx_queue " 2920 "rx_queue tr tm->when retrnsmt uid timeout " 2921 "inode"); 2922 goto out; 2923 } 2924 st = seq->private; 2925 2926 if (sk->sk_state == TCP_TIME_WAIT) 2927 get_timewait4_sock(v, seq, st->num); 2928 else if (sk->sk_state == TCP_NEW_SYN_RECV) 2929 get_openreq4(v, seq, st->num); 2930 else 2931 get_tcp4_sock(v, seq, st->num); 2932 out: 2933 seq_pad(seq, '\n'); 2934 return 0; 2935 } 2936 2937 #ifdef CONFIG_BPF_SYSCALL 2938 union bpf_tcp_iter_batch_item { 2939 struct sock *sk; 2940 __u64 cookie; 2941 }; 2942 2943 struct bpf_tcp_iter_state { 2944 struct tcp_iter_state state; 2945 unsigned int cur_sk; 2946 unsigned int end_sk; 2947 unsigned int max_sk; 2948 union bpf_tcp_iter_batch_item *batch; 2949 }; 2950 2951 struct bpf_iter__tcp { 2952 __bpf_md_ptr(struct bpf_iter_meta *, meta); 2953 __bpf_md_ptr(struct sock_common *, sk_common); 2954 uid_t uid __aligned(8); 2955 }; 2956 2957 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, 2958 struct sock_common *sk_common, uid_t uid) 2959 { 2960 struct bpf_iter__tcp ctx; 2961 2962 meta->seq_num--; /* skip SEQ_START_TOKEN */ 2963 ctx.meta = meta; 2964 ctx.sk_common = sk_common; 2965 ctx.uid = uid; 2966 return bpf_iter_run_prog(prog, &ctx); 2967 } 2968 2969 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter) 2970 { 2971 union bpf_tcp_iter_batch_item *item; 2972 unsigned int cur_sk = iter->cur_sk; 2973 __u64 cookie; 2974 2975 /* Remember the cookies of the sockets we haven't seen yet, so we can 2976 * pick up where we left off next time around. 
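 * (Each batch slot is a union: once the socket reference is dropped, the slot is reused to hold the socket's cookie, which bpf_iter_tcp_resume_bucket() later compares against sk->sk_cookie to find our place in the bucket.)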
2977 */ 2978 while (cur_sk < iter->end_sk) { 2979 item = &iter->batch[cur_sk++]; 2980 cookie = sock_gen_cookie(item->sk); 2981 sock_gen_put(item->sk); 2982 item->cookie = cookie; 2983 } 2984 } 2985 2986 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter, 2987 unsigned int new_batch_sz, gfp_t flags) 2988 { 2989 union bpf_tcp_iter_batch_item *new_batch; 2990 2991 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, 2992 flags | __GFP_NOWARN); 2993 if (!new_batch) 2994 return -ENOMEM; 2995 2996 memcpy(new_batch, iter->batch, sizeof(*iter->batch) * iter->end_sk); 2997 kvfree(iter->batch); 2998 iter->batch = new_batch; 2999 iter->max_sk = new_batch_sz; 3000 3001 return 0; 3002 } 3003 3004 static struct sock *bpf_iter_tcp_resume_bucket(struct sock *first_sk, 3005 union bpf_tcp_iter_batch_item *cookies, 3006 int n_cookies) 3007 { 3008 struct hlist_nulls_node *node; 3009 struct sock *sk; 3010 int i; 3011 3012 for (i = 0; i < n_cookies; i++) { 3013 sk = first_sk; 3014 sk_nulls_for_each_from(sk, node) 3015 if (cookies[i].cookie == atomic64_read(&sk->sk_cookie)) 3016 return sk; 3017 } 3018 3019 return NULL; 3020 } 3021 3022 static struct sock *bpf_iter_tcp_resume_listening(struct seq_file *seq) 3023 { 3024 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 3025 struct bpf_tcp_iter_state *iter = seq->private; 3026 struct tcp_iter_state *st = &iter->state; 3027 unsigned int find_cookie = iter->cur_sk; 3028 unsigned int end_cookie = iter->end_sk; 3029 int resume_bucket = st->bucket; 3030 struct sock *sk; 3031 3032 if (end_cookie && find_cookie == end_cookie) 3033 ++st->bucket; 3034 3035 sk = listening_get_first(seq); 3036 iter->cur_sk = 0; 3037 iter->end_sk = 0; 3038 3039 if (sk && st->bucket == resume_bucket && end_cookie) { 3040 sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie], 3041 end_cookie - find_cookie); 3042 if (!sk) { 3043 spin_unlock(&hinfo->lhash2[st->bucket].lock); 3044 ++st->bucket; 3045 sk = listening_get_first(seq); 3046 } 3047 } 3048 3049 return sk; 3050 } 3051 3052 static struct sock *bpf_iter_tcp_resume_established(struct seq_file *seq) 3053 { 3054 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 3055 struct bpf_tcp_iter_state *iter = seq->private; 3056 struct tcp_iter_state *st = &iter->state; 3057 unsigned int find_cookie = iter->cur_sk; 3058 unsigned int end_cookie = iter->end_sk; 3059 int resume_bucket = st->bucket; 3060 struct sock *sk; 3061 3062 if (end_cookie && find_cookie == end_cookie) 3063 ++st->bucket; 3064 3065 sk = established_get_first(seq); 3066 iter->cur_sk = 0; 3067 iter->end_sk = 0; 3068 3069 if (sk && st->bucket == resume_bucket && end_cookie) { 3070 sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie], 3071 end_cookie - find_cookie); 3072 if (!sk) { 3073 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 3074 ++st->bucket; 3075 sk = established_get_first(seq); 3076 } 3077 } 3078 3079 return sk; 3080 } 3081 3082 static struct sock *bpf_iter_tcp_resume(struct seq_file *seq) 3083 { 3084 struct bpf_tcp_iter_state *iter = seq->private; 3085 struct tcp_iter_state *st = &iter->state; 3086 struct sock *sk = NULL; 3087 3088 switch (st->state) { 3089 case TCP_SEQ_STATE_LISTENING: 3090 sk = bpf_iter_tcp_resume_listening(seq); 3091 if (sk) 3092 break; 3093 st->bucket = 0; 3094 st->state = TCP_SEQ_STATE_ESTABLISHED; 3095 fallthrough; 3096 case TCP_SEQ_STATE_ESTABLISHED: 3097 sk = bpf_iter_tcp_resume_established(seq); 3098 break; 3099 } 3100 3101 return sk; 3102 } 3103 3104 
static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq, 3105 struct sock **start_sk) 3106 { 3107 struct bpf_tcp_iter_state *iter = seq->private; 3108 struct hlist_nulls_node *node; 3109 unsigned int expected = 1; 3110 struct sock *sk; 3111 3112 sock_hold(*start_sk); 3113 iter->batch[iter->end_sk++].sk = *start_sk; 3114 3115 sk = sk_nulls_next(*start_sk); 3116 *start_sk = NULL; 3117 sk_nulls_for_each_from(sk, node) { 3118 if (seq_sk_match(seq, sk)) { 3119 if (iter->end_sk < iter->max_sk) { 3120 sock_hold(sk); 3121 iter->batch[iter->end_sk++].sk = sk; 3122 } else if (!*start_sk) { 3123 /* Remember where we left off. */ 3124 *start_sk = sk; 3125 } 3126 expected++; 3127 } 3128 } 3129 3130 return expected; 3131 } 3132 3133 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq, 3134 struct sock **start_sk) 3135 { 3136 struct bpf_tcp_iter_state *iter = seq->private; 3137 struct hlist_nulls_node *node; 3138 unsigned int expected = 1; 3139 struct sock *sk; 3140 3141 sock_hold(*start_sk); 3142 iter->batch[iter->end_sk++].sk = *start_sk; 3143 3144 sk = sk_nulls_next(*start_sk); 3145 *start_sk = NULL; 3146 sk_nulls_for_each_from(sk, node) { 3147 if (seq_sk_match(seq, sk)) { 3148 if (iter->end_sk < iter->max_sk) { 3149 sock_hold(sk); 3150 iter->batch[iter->end_sk++].sk = sk; 3151 } else if (!*start_sk) { 3152 /* Remember where we left off. */ 3153 *start_sk = sk; 3154 } 3155 expected++; 3156 } 3157 } 3158 3159 return expected; 3160 } 3161 3162 static unsigned int bpf_iter_fill_batch(struct seq_file *seq, 3163 struct sock **start_sk) 3164 { 3165 struct bpf_tcp_iter_state *iter = seq->private; 3166 struct tcp_iter_state *st = &iter->state; 3167 3168 if (st->state == TCP_SEQ_STATE_LISTENING) 3169 return bpf_iter_tcp_listening_batch(seq, start_sk); 3170 else 3171 return bpf_iter_tcp_established_batch(seq, start_sk); 3172 } 3173 3174 static void bpf_iter_tcp_unlock_bucket(struct seq_file *seq) 3175 { 3176 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 3177 struct bpf_tcp_iter_state *iter = seq->private; 3178 struct tcp_iter_state *st = &iter->state; 3179 3180 if (st->state == TCP_SEQ_STATE_LISTENING) 3181 spin_unlock(&hinfo->lhash2[st->bucket].lock); 3182 else 3183 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 3184 } 3185 3186 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq) 3187 { 3188 struct bpf_tcp_iter_state *iter = seq->private; 3189 unsigned int expected; 3190 struct sock *sk; 3191 int err; 3192 3193 sk = bpf_iter_tcp_resume(seq); 3194 if (!sk) 3195 return NULL; /* Done */ 3196 3197 expected = bpf_iter_fill_batch(seq, &sk); 3198 if (likely(iter->end_sk == expected)) 3199 goto done; 3200 3201 /* Batch size was too small. */ 3202 bpf_iter_tcp_unlock_bucket(seq); 3203 bpf_iter_tcp_put_batch(iter); 3204 err = bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2, 3205 GFP_USER); 3206 if (err) 3207 return ERR_PTR(err); 3208 3209 sk = bpf_iter_tcp_resume(seq); 3210 if (!sk) 3211 return NULL; /* Done */ 3212 3213 expected = bpf_iter_fill_batch(seq, &sk); 3214 if (likely(iter->end_sk == expected)) 3215 goto done; 3216 3217 /* Batch size was still too small. Hold onto the lock while we try 3218 * again with a larger batch to make sure the current bucket's size 3219 * does not change in the meantime. 
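 * GFP_NOWAIT is used because we cannot sleep at this point: the bucket is still locked (plain spin_lock() for listening buckets, spin_lock_bh() for established ones).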
3220 */ 3221 err = bpf_iter_tcp_realloc_batch(iter, expected, GFP_NOWAIT); 3222 if (err) { 3223 bpf_iter_tcp_unlock_bucket(seq); 3224 return ERR_PTR(err); 3225 } 3226 3227 expected = bpf_iter_fill_batch(seq, &sk); 3228 WARN_ON_ONCE(iter->end_sk != expected); 3229 done: 3230 bpf_iter_tcp_unlock_bucket(seq); 3231 return iter->batch[0].sk; 3232 } 3233 3234 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos) 3235 { 3236 /* bpf iter does not support lseek, so it always 3237 * continue from where it was stop()-ped. 3238 */ 3239 if (*pos) 3240 return bpf_iter_tcp_batch(seq); 3241 3242 return SEQ_START_TOKEN; 3243 } 3244 3245 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3246 { 3247 struct bpf_tcp_iter_state *iter = seq->private; 3248 struct tcp_iter_state *st = &iter->state; 3249 struct sock *sk; 3250 3251 /* Whenever seq_next() is called, the iter->cur_sk is 3252 * done with seq_show(), so advance to the next sk in 3253 * the batch. 3254 */ 3255 if (iter->cur_sk < iter->end_sk) { 3256 /* Keeping st->num consistent in tcp_iter_state. 3257 * bpf_iter_tcp does not use st->num. 3258 * meta.seq_num is used instead. 3259 */ 3260 st->num++; 3261 sock_gen_put(iter->batch[iter->cur_sk++].sk); 3262 } 3263 3264 if (iter->cur_sk < iter->end_sk) 3265 sk = iter->batch[iter->cur_sk].sk; 3266 else 3267 sk = bpf_iter_tcp_batch(seq); 3268 3269 ++*pos; 3270 /* Keeping st->last_pos consistent in tcp_iter_state. 3271 * bpf iter does not do lseek, so st->last_pos always equals to *pos. 3272 */ 3273 st->last_pos = *pos; 3274 return sk; 3275 } 3276 3277 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v) 3278 { 3279 struct bpf_iter_meta meta; 3280 struct bpf_prog *prog; 3281 struct sock *sk = v; 3282 uid_t uid; 3283 int ret; 3284 3285 if (v == SEQ_START_TOKEN) 3286 return 0; 3287 3288 if (sk_fullsock(sk)) 3289 lock_sock(sk); 3290 3291 if (unlikely(sk_unhashed(sk))) { 3292 ret = SEQ_SKIP; 3293 goto unlock; 3294 } 3295 3296 if (sk->sk_state == TCP_TIME_WAIT) { 3297 uid = 0; 3298 } else if (sk->sk_state == TCP_NEW_SYN_RECV) { 3299 const struct request_sock *req = v; 3300 3301 uid = from_kuid_munged(seq_user_ns(seq), 3302 sk_uid(req->rsk_listener)); 3303 } else { 3304 uid = from_kuid_munged(seq_user_ns(seq), sk_uid(sk)); 3305 } 3306 3307 meta.seq = seq; 3308 prog = bpf_iter_get_info(&meta, false); 3309 ret = tcp_prog_seq_show(prog, &meta, v, uid); 3310 3311 unlock: 3312 if (sk_fullsock(sk)) 3313 release_sock(sk); 3314 return ret; 3315 3316 } 3317 3318 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v) 3319 { 3320 struct bpf_tcp_iter_state *iter = seq->private; 3321 struct bpf_iter_meta meta; 3322 struct bpf_prog *prog; 3323 3324 if (!v) { 3325 meta.seq = seq; 3326 prog = bpf_iter_get_info(&meta, true); 3327 if (prog) 3328 (void)tcp_prog_seq_show(prog, &meta, v, 0); 3329 } 3330 3331 if (iter->cur_sk < iter->end_sk) 3332 bpf_iter_tcp_put_batch(iter); 3333 } 3334 3335 static const struct seq_operations bpf_iter_tcp_seq_ops = { 3336 .show = bpf_iter_tcp_seq_show, 3337 .start = bpf_iter_tcp_seq_start, 3338 .next = bpf_iter_tcp_seq_next, 3339 .stop = bpf_iter_tcp_seq_stop, 3340 }; 3341 #endif 3342 static unsigned short seq_file_family(const struct seq_file *seq) 3343 { 3344 const struct tcp_seq_afinfo *afinfo; 3345 3346 #ifdef CONFIG_BPF_SYSCALL 3347 /* Iterated from bpf_iter. Let the bpf prog to filter instead. 
*/ 3348 if (seq->op == &bpf_iter_tcp_seq_ops) 3349 return AF_UNSPEC; 3350 #endif 3351 3352 /* Iterated from proc fs */ 3353 afinfo = pde_data(file_inode(seq->file)); 3354 return afinfo->family; 3355 } 3356 3357 static const struct seq_operations tcp4_seq_ops = { 3358 .show = tcp4_seq_show, 3359 .start = tcp_seq_start, 3360 .next = tcp_seq_next, 3361 .stop = tcp_seq_stop, 3362 }; 3363 3364 static struct tcp_seq_afinfo tcp4_seq_afinfo = { 3365 .family = AF_INET, 3366 }; 3367 3368 static int __net_init tcp4_proc_init_net(struct net *net) 3369 { 3370 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops, 3371 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo)) 3372 return -ENOMEM; 3373 return 0; 3374 } 3375 3376 static void __net_exit tcp4_proc_exit_net(struct net *net) 3377 { 3378 remove_proc_entry("tcp", net->proc_net); 3379 } 3380 3381 static struct pernet_operations tcp4_net_ops = { 3382 .init = tcp4_proc_init_net, 3383 .exit = tcp4_proc_exit_net, 3384 }; 3385 3386 int __init tcp4_proc_init(void) 3387 { 3388 return register_pernet_subsys(&tcp4_net_ops); 3389 } 3390 3391 void tcp4_proc_exit(void) 3392 { 3393 unregister_pernet_subsys(&tcp4_net_ops); 3394 } 3395 #endif /* CONFIG_PROC_FS */ 3396 3397 struct proto tcp_prot = { 3398 .name = "TCP", 3399 .owner = THIS_MODULE, 3400 .close = tcp_close, 3401 .pre_connect = tcp_v4_pre_connect, 3402 .connect = tcp_v4_connect, 3403 .disconnect = tcp_disconnect, 3404 .accept = inet_csk_accept, 3405 .ioctl = tcp_ioctl, 3406 .init = tcp_v4_init_sock, 3407 .destroy = tcp_v4_destroy_sock, 3408 .shutdown = tcp_shutdown, 3409 .setsockopt = tcp_setsockopt, 3410 .getsockopt = tcp_getsockopt, 3411 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt, 3412 .keepalive = tcp_set_keepalive, 3413 .recvmsg = tcp_recvmsg, 3414 .sendmsg = tcp_sendmsg, 3415 .splice_eof = tcp_splice_eof, 3416 .backlog_rcv = tcp_v4_do_rcv, 3417 .release_cb = tcp_release_cb, 3418 .hash = inet_hash, 3419 .unhash = inet_unhash, 3420 .get_port = inet_csk_get_port, 3421 .put_port = inet_put_port, 3422 #ifdef CONFIG_BPF_SYSCALL 3423 .psock_update_sk_prot = tcp_bpf_update_proto, 3424 #endif 3425 .enter_memory_pressure = tcp_enter_memory_pressure, 3426 .leave_memory_pressure = tcp_leave_memory_pressure, 3427 .stream_memory_free = tcp_stream_memory_free, 3428 .sockets_allocated = &tcp_sockets_allocated, 3429 3430 .memory_allocated = &net_aligned_data.tcp_memory_allocated, 3431 .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, 3432 3433 .memory_pressure = &tcp_memory_pressure, 3434 .sysctl_mem = sysctl_tcp_mem, 3435 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), 3436 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), 3437 .max_header = MAX_TCP_HEADER, 3438 .obj_size = sizeof(struct tcp_sock), 3439 .freeptr_offset = offsetof(struct tcp_sock, 3440 inet_conn.icsk_inet.sk.sk_freeptr), 3441 .slab_flags = SLAB_TYPESAFE_BY_RCU, 3442 .twsk_prot = &tcp_timewait_sock_ops, 3443 .rsk_prot = &tcp_request_sock_ops, 3444 .h.hashinfo = NULL, 3445 .no_autobind = true, 3446 .diag_destroy = tcp_abort, 3447 }; 3448 EXPORT_SYMBOL(tcp_prot); 3449 3450 static void __net_exit tcp_sk_exit(struct net *net) 3451 { 3452 if (net->ipv4.tcp_congestion_control) 3453 bpf_module_put(net->ipv4.tcp_congestion_control, 3454 net->ipv4.tcp_congestion_control->owner); 3455 } 3456 3457 static void __net_init tcp_set_hashinfo(struct net *net) 3458 { 3459 struct inet_hashinfo *hinfo; 3460 unsigned int ehash_entries; 3461 struct net *old_net; 3462 3463 if (net_eq(net, &init_net)) 3464 goto fallback; 
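/* A child netns takes its ehash size hint from the netns creating it (current->nsproxy->net_ns), rounded up below to a power of two. */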
3465 3466 old_net = current->nsproxy->net_ns; 3467 ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries); 3468 if (!ehash_entries) 3469 goto fallback; 3470 3471 ehash_entries = roundup_pow_of_two(ehash_entries); 3472 hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries); 3473 if (!hinfo) { 3474 pr_warn("Failed to allocate TCP ehash (entries: %u) " 3475 "for a netns, fallback to the global one\n", 3476 ehash_entries); 3477 fallback: 3478 hinfo = &tcp_hashinfo; 3479 ehash_entries = tcp_hashinfo.ehash_mask + 1; 3480 } 3481 3482 net->ipv4.tcp_death_row.hashinfo = hinfo; 3483 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2; 3484 net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128); 3485 } 3486 3487 static int __net_init tcp_sk_init(struct net *net) 3488 { 3489 net->ipv4.sysctl_tcp_ecn = TCP_ECN_IN_ECN_OUT_NOECN; 3490 net->ipv4.sysctl_tcp_ecn_option = TCP_ACCECN_OPTION_FULL; 3491 net->ipv4.sysctl_tcp_ecn_option_beacon = TCP_ACCECN_OPTION_BEACON; 3492 net->ipv4.sysctl_tcp_ecn_fallback = 1; 3493 3494 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; 3495 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS; 3496 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; 3497 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL; 3498 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS; 3499 3500 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME; 3501 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES; 3502 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL; 3503 3504 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES; 3505 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES; 3506 net->ipv4.sysctl_tcp_syncookies = 1; 3507 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH; 3508 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1; 3509 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2; 3510 net->ipv4.sysctl_tcp_orphan_retries = 0; 3511 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; 3512 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; 3513 net->ipv4.sysctl_tcp_tw_reuse = 2; 3514 net->ipv4.sysctl_tcp_tw_reuse_delay = 1 * MSEC_PER_SEC; 3515 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1; 3516 3517 refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1); 3518 tcp_set_hashinfo(net); 3519 3520 net->ipv4.sysctl_tcp_sack = 1; 3521 net->ipv4.sysctl_tcp_window_scaling = 1; 3522 net->ipv4.sysctl_tcp_timestamps = 1; 3523 net->ipv4.sysctl_tcp_early_retrans = 3; 3524 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION; 3525 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */ 3526 net->ipv4.sysctl_tcp_retrans_collapse = 1; 3527 net->ipv4.sysctl_tcp_max_reordering = 300; 3528 net->ipv4.sysctl_tcp_dsack = 1; 3529 net->ipv4.sysctl_tcp_app_win = 31; 3530 net->ipv4.sysctl_tcp_adv_win_scale = 1; 3531 net->ipv4.sysctl_tcp_frto = 2; 3532 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1; 3533 net->ipv4.sysctl_tcp_rcvbuf_low_rtt = USEC_PER_MSEC; 3534 /* This limits the percentage of the congestion window which we 3535 * will allow a single TSO frame to consume. Building TSO frames 3536 * which are too large can cause TCP streams to be bursty. 3537 */ 3538 net->ipv4.sysctl_tcp_tso_win_divisor = 3; 3539 /* Default TSQ limit of 4 MB */ 3540 net->ipv4.sysctl_tcp_limit_output_bytes = 4 << 20; 3541 3542 /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. 
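 * INT_MAX effectively means "unlimited"; writing a smaller value to the net.ipv4.tcp_challenge_ack_limit sysctl re-enables the limit.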
*/ 3543 net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX; 3544 3545 net->ipv4.sysctl_tcp_min_tso_segs = 2; 3546 net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */ 3547 net->ipv4.sysctl_tcp_min_rtt_wlen = 300; 3548 net->ipv4.sysctl_tcp_autocorking = 1; 3549 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; 3550 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; 3551 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120; 3552 if (net != &init_net) { 3553 memcpy(net->ipv4.sysctl_tcp_rmem, 3554 init_net.ipv4.sysctl_tcp_rmem, 3555 sizeof(init_net.ipv4.sysctl_tcp_rmem)); 3556 memcpy(net->ipv4.sysctl_tcp_wmem, 3557 init_net.ipv4.sysctl_tcp_wmem, 3558 sizeof(init_net.ipv4.sysctl_tcp_wmem)); 3559 } 3560 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC; 3561 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 10 * NSEC_PER_USEC; 3562 net->ipv4.sysctl_tcp_comp_sack_nr = 44; 3563 net->ipv4.sysctl_tcp_comp_sack_rtt_percent = 33; 3564 net->ipv4.sysctl_tcp_backlog_ack_defer = 1; 3565 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; 3566 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0; 3567 atomic_set(&net->ipv4.tfo_active_disable_times, 0); 3568 3569 /* Set default values for PLB */ 3570 net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */ 3571 net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3; 3572 net->ipv4.sysctl_tcp_plb_rehash_rounds = 12; 3573 net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60; 3574 /* Default congestion threshold for PLB to mark a round is 50% */ 3575 net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2; 3576 3577 /* Reno is always built in */ 3578 if (!net_eq(net, &init_net) && 3579 bpf_try_module_get(init_net.ipv4.tcp_congestion_control, 3580 init_net.ipv4.tcp_congestion_control->owner)) 3581 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control; 3582 else 3583 net->ipv4.tcp_congestion_control = &tcp_reno; 3584 3585 net->ipv4.sysctl_tcp_syn_linear_timeouts = 4; 3586 net->ipv4.sysctl_tcp_shrink_window = 0; 3587 3588 net->ipv4.sysctl_tcp_pingpong_thresh = 1; 3589 net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN); 3590 net->ipv4.sysctl_tcp_rto_max_ms = TCP_RTO_MAX_SEC * MSEC_PER_SEC; 3591 3592 return 0; 3593 } 3594 3595 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) 3596 { 3597 struct net *net; 3598 3599 /* Make sure concurrent calls to tcp_sk_exit_batch from net_cleanup_work 3600 * and failed setup_net error unwinding path are serialized. 3601 * 3602 * tcp_twsk_purge() handles twsk in any dead netns, not just those in 3603 * net_exit_list, so the thread that dismantles a particular twsk must 3604 * do so without another thread progressing to refcount_dec_and_test() of 3605 * tcp_death_row.tw_refcount.
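 * (Hence the single global tcp_exit_batch_mutex taken below, rather than any per-netns serialization.)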
3606 */ 3607 mutex_lock(&tcp_exit_batch_mutex); 3608 3609 tcp_twsk_purge(net_exit_list); 3610 3611 list_for_each_entry(net, net_exit_list, exit_list) { 3612 inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo); 3613 WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount)); 3614 tcp_fastopen_ctx_destroy(net); 3615 } 3616 3617 mutex_unlock(&tcp_exit_batch_mutex); 3618 } 3619 3620 static struct pernet_operations __net_initdata tcp_sk_ops = { 3621 .init = tcp_sk_init, 3622 .exit = tcp_sk_exit, 3623 .exit_batch = tcp_sk_exit_batch, 3624 }; 3625 3626 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3627 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta, 3628 struct sock_common *sk_common, uid_t uid) 3629 3630 #define INIT_BATCH_SZ 16 3631 3632 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux) 3633 { 3634 struct bpf_tcp_iter_state *iter = priv_data; 3635 int err; 3636 3637 err = bpf_iter_init_seq_net(priv_data, aux); 3638 if (err) 3639 return err; 3640 3641 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ, GFP_USER); 3642 if (err) { 3643 bpf_iter_fini_seq_net(priv_data); 3644 return err; 3645 } 3646 3647 return 0; 3648 } 3649 3650 static void bpf_iter_fini_tcp(void *priv_data) 3651 { 3652 struct bpf_tcp_iter_state *iter = priv_data; 3653 3654 bpf_iter_fini_seq_net(priv_data); 3655 kvfree(iter->batch); 3656 } 3657 3658 static const struct bpf_iter_seq_info tcp_seq_info = { 3659 .seq_ops = &bpf_iter_tcp_seq_ops, 3660 .init_seq_private = bpf_iter_init_tcp, 3661 .fini_seq_private = bpf_iter_fini_tcp, 3662 .seq_priv_size = sizeof(struct bpf_tcp_iter_state), 3663 }; 3664 3665 static const struct bpf_func_proto * 3666 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id, 3667 const struct bpf_prog *prog) 3668 { 3669 switch (func_id) { 3670 case BPF_FUNC_setsockopt: 3671 return &bpf_sk_setsockopt_proto; 3672 case BPF_FUNC_getsockopt: 3673 return &bpf_sk_getsockopt_proto; 3674 default: 3675 return NULL; 3676 } 3677 } 3678 3679 static struct bpf_iter_reg tcp_reg_info = { 3680 .target = "tcp", 3681 .ctx_arg_info_size = 1, 3682 .ctx_arg_info = { 3683 { offsetof(struct bpf_iter__tcp, sk_common), 3684 PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED }, 3685 }, 3686 .get_func_proto = bpf_iter_tcp_get_func_proto, 3687 .seq_info = &tcp_seq_info, 3688 }; 3689 3690 static void __init bpf_iter_register(void) 3691 { 3692 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON]; 3693 if (bpf_iter_reg_target(&tcp_reg_info)) 3694 pr_warn("Warning: could not register bpf iterator tcp\n"); 3695 } 3696 3697 #endif 3698 3699 void __init tcp_v4_init(void) 3700 { 3701 int cpu, res; 3702 3703 for_each_possible_cpu(cpu) { 3704 struct sock *sk; 3705 3706 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW, 3707 IPPROTO_TCP, &init_net); 3708 if (res) 3709 panic("Failed to create the TCP control socket.\n"); 3710 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); 3711 3712 /* Please enforce IP_DF and IPID==0 for RST and 3713 * ACK sent in SYN-RECV and TIME-WAIT state. 3714 */ 3715 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO; 3716 3717 sk->sk_clockid = CLOCK_MONOTONIC; 3718 3719 per_cpu(ipv4_tcp_sk.sock, cpu) = sk; 3720 } 3721 if (register_pernet_subsys(&tcp_sk_ops)) 3722 panic("Failed to create the TCP control socket.\n"); 3723 3724 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3725 bpf_iter_register(); 3726 #endif 3727 } 3728