1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * INET An implementation of the TCP/IP protocol suite for the LINUX 4 * operating system. INET is implemented using the BSD Socket 5 * interface as the means of communication with the user level. 6 * 7 * Implementation of the Transmission Control Protocol(TCP). 8 * 9 * IPv4 specific functions 10 * 11 * code split from: 12 * linux/ipv4/tcp.c 13 * linux/ipv4/tcp_input.c 14 * linux/ipv4/tcp_output.c 15 * 16 * See tcp.c for author information 17 */ 18 19 /* 20 * Changes: 21 * David S. Miller : New socket lookup architecture. 22 * This code is dedicated to John Dyson. 23 * David S. Miller : Change semantics of established hash, 24 * half is devoted to TIME_WAIT sockets 25 * and the rest go in the other half. 26 * Andi Kleen : Add support for syncookies and fixed 27 * some bugs: ip options weren't passed to 28 * the TCP layer, missed a check for an 29 * ACK bit. 30 * Andi Kleen : Implemented fast path mtu discovery. 31 * Fixed many serious bugs in the 32 * request_sock handling and moved 33 * most of it into the af independent code. 34 * Added tail drop and some other bugfixes. 35 * Added new listen semantics. 36 * Mike McLagan : Routing by source 37 * Juan Jose Ciarlante: ip_dynaddr bits 38 * Andi Kleen: various fixes. 39 * Vitaly E. Lavrov : Transparent proxy revived after year 40 * coma. 41 * Andi Kleen : Fix new listen. 42 * Andi Kleen : Fix accept error reporting. 43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which 44 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind 45 * a single port at the same time. 46 */ 47 48 #define pr_fmt(fmt) "TCP: " fmt 49 50 #include <linux/bottom_half.h> 51 #include <linux/types.h> 52 #include <linux/fcntl.h> 53 #include <linux/module.h> 54 #include <linux/random.h> 55 #include <linux/cache.h> 56 #include <linux/fips.h> 57 #include <linux/jhash.h> 58 #include <linux/init.h> 59 #include <linux/times.h> 60 #include <linux/slab.h> 61 #include <linux/sched.h> 62 #include <linux/sock_diag.h> 63 64 #include <net/aligned_data.h> 65 #include <net/net_namespace.h> 66 #include <net/icmp.h> 67 #include <net/inet_hashtables.h> 68 #include <net/tcp.h> 69 #include <net/tcp_ecn.h> 70 #include <net/transp_v6.h> 71 #include <net/ipv6.h> 72 #include <net/inet_common.h> 73 #include <net/inet_ecn.h> 74 #include <net/timewait_sock.h> 75 #include <net/xfrm.h> 76 #include <net/secure_seq.h> 77 #include <net/busy_poll.h> 78 #include <net/rstreason.h> 79 #include <net/psp.h> 80 81 #include <linux/inet.h> 82 #include <linux/ipv6.h> 83 #include <linux/stddef.h> 84 #include <linux/proc_fs.h> 85 #include <linux/seq_file.h> 86 #include <linux/inetdevice.h> 87 #include <linux/btf_ids.h> 88 #include <linux/skbuff_ref.h> 89 90 #include <crypto/md5.h> 91 92 #include <trace/events/tcp.h> 93 94 #ifdef CONFIG_TCP_MD5SIG 95 static void tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, 96 __be32 daddr, __be32 saddr, const struct tcphdr *th); 97 #endif 98 99 struct inet_hashinfo tcp_hashinfo; 100 101 static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = { 102 .bh_lock = INIT_LOCAL_LOCK(bh_lock), 103 }; 104 105 static DEFINE_MUTEX(tcp_exit_batch_mutex); 106 107 static u32 tcp_v4_init_seq(const struct sk_buff *skb) 108 { 109 return secure_tcp_seq(ip_hdr(skb)->daddr, 110 ip_hdr(skb)->saddr, 111 tcp_hdr(skb)->dest, 112 tcp_hdr(skb)->source); 113 } 114 115 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb) 116 { 117 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, 
ip_hdr(skb)->saddr); 118 } 119 120 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) 121 { 122 int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse); 123 const struct inet_timewait_sock *tw = inet_twsk(sktw); 124 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw); 125 struct tcp_sock *tp = tcp_sk(sk); 126 int ts_recent_stamp; 127 u32 reuse_thresh; 128 129 if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2) 130 reuse = 0; 131 132 if (reuse == 2) { 133 /* Still does not detect *everything* that goes through 134 * lo, since we require a loopback src or dst address 135 * or direct binding to 'lo' interface. 136 */ 137 bool loopback = false; 138 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX) 139 loopback = true; 140 #if IS_ENABLED(CONFIG_IPV6) 141 if (tw->tw_family == AF_INET6) { 142 if (ipv6_addr_loopback(&tw->tw_v6_daddr) || 143 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) || 144 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) || 145 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr)) 146 loopback = true; 147 } else 148 #endif 149 { 150 if (ipv4_is_loopback(tw->tw_daddr) || 151 ipv4_is_loopback(tw->tw_rcv_saddr)) 152 loopback = true; 153 } 154 if (!loopback) 155 reuse = 0; 156 } 157 158 /* With PAWS, it is safe from the viewpoint 159 of data integrity. Even without PAWS it is safe provided sequence 160 spaces do not overlap i.e. at data rates <= 80Mbit/sec. 161 162 Actually, the idea is close to VJ's one, only timestamp cache is 163 held not per host, but per port pair and TW bucket is used as state 164 holder. 165 166 If TW bucket has been already destroyed we fall back to VJ's scheme 167 and use initial timestamp retrieved from peer table. 168 */ 169 ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp); 170 reuse_thresh = READ_ONCE(tw->tw_entry_stamp) + 171 READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse_delay); 172 if (ts_recent_stamp && 173 (!twp || (reuse && time_after32(tcp_clock_ms(), reuse_thresh)))) { 174 /* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk 175 * and releasing the bucket lock. 176 */ 177 if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt))) 178 return 0; 179 180 /* In case of repair and re-using TIME-WAIT sockets we still 181 * want to be sure that it is safe as above but honor the 182 * sequence numbers and time stamps set as part of the repair 183 * process. 184 * 185 * Without this check re-using a TIME-WAIT socket with TCP 186 * repair would accumulate a -1 on the repair assigned 187 * sequence number. The first time it is reused the sequence 188 * is -1, the second time -2, etc. This fixes that issue 189 * without appearing to create any others. 190 */ 191 if (likely(!tp->repair)) { 192 u32 seq = tcptw->tw_snd_nxt + 65535 + 2; 193 194 if (!seq) 195 seq = 1; 196 WRITE_ONCE(tp->write_seq, seq); 197 tp->rx_opt.ts_recent = READ_ONCE(tcptw->tw_ts_recent); 198 tp->rx_opt.ts_recent_stamp = ts_recent_stamp; 199 } 200 201 return 1; 202 } 203 204 return 0; 205 } 206 EXPORT_IPV6_MOD_GPL(tcp_twsk_unique); 207 208 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr, 209 int addr_len) 210 { 211 /* This check is replicated from tcp_v4_connect() and intended to 212 * prevent BPF program called below from accessing bytes that are out 213 * of the bound specified by user in addr_len. 214 */ 215 if (addr_len < sizeof(struct sockaddr_in)) 216 return -EINVAL; 217 218 sock_owned_by_me(sk); 219 220 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len); 221 } 222 223 /* This will initiate an outgoing connection. 
*/ 224 int tcp_v4_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) 225 { 226 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; 227 struct inet_timewait_death_row *tcp_death_row; 228 struct inet_sock *inet = inet_sk(sk); 229 struct tcp_sock *tp = tcp_sk(sk); 230 struct ip_options_rcu *inet_opt; 231 struct net *net = sock_net(sk); 232 __be16 orig_sport, orig_dport; 233 __be32 daddr, nexthop; 234 struct flowi4 *fl4; 235 struct rtable *rt; 236 int err; 237 238 if (addr_len < sizeof(struct sockaddr_in)) 239 return -EINVAL; 240 241 if (usin->sin_family != AF_INET) 242 return -EAFNOSUPPORT; 243 244 nexthop = daddr = usin->sin_addr.s_addr; 245 inet_opt = rcu_dereference_protected(inet->inet_opt, 246 lockdep_sock_is_held(sk)); 247 if (inet_opt && inet_opt->opt.srr) { 248 if (!daddr) 249 return -EINVAL; 250 nexthop = inet_opt->opt.faddr; 251 } 252 253 orig_sport = inet->inet_sport; 254 orig_dport = usin->sin_port; 255 fl4 = &inet->cork.fl.u.ip4; 256 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr, 257 sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport, 258 orig_dport, sk); 259 if (IS_ERR(rt)) { 260 err = PTR_ERR(rt); 261 if (err == -ENETUNREACH) 262 IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); 263 return err; 264 } 265 266 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { 267 ip_rt_put(rt); 268 return -ENETUNREACH; 269 } 270 271 if (!inet_opt || !inet_opt->opt.srr) 272 daddr = fl4->daddr; 273 274 tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; 275 276 if (!inet->inet_saddr) { 277 err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET); 278 if (err) { 279 ip_rt_put(rt); 280 return err; 281 } 282 } else { 283 sk_rcv_saddr_set(sk, inet->inet_saddr); 284 } 285 286 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) { 287 /* Reset inherited state */ 288 tp->rx_opt.ts_recent = 0; 289 tp->rx_opt.ts_recent_stamp = 0; 290 if (likely(!tp->repair)) 291 WRITE_ONCE(tp->write_seq, 0); 292 } 293 294 inet->inet_dport = usin->sin_port; 295 sk_daddr_set(sk, daddr); 296 297 inet_csk(sk)->icsk_ext_hdr_len = psp_sk_overhead(sk); 298 if (inet_opt) 299 inet_csk(sk)->icsk_ext_hdr_len += inet_opt->opt.optlen; 300 301 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT; 302 303 /* Socket identity is still unknown (sport may be zero). 304 * However we set state to SYN-SENT and not releasing socket 305 * lock select source port, enter ourselves into the hash tables and 306 * complete initialization after this. 307 */ 308 tcp_set_state(sk, TCP_SYN_SENT); 309 err = inet_hash_connect(tcp_death_row, sk); 310 if (err) 311 goto failure; 312 313 sk_set_txhash(sk); 314 315 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport, 316 inet->inet_sport, inet->inet_dport, sk); 317 if (IS_ERR(rt)) { 318 err = PTR_ERR(rt); 319 rt = NULL; 320 goto failure; 321 } 322 tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst); 323 /* OK, now commit destination to socket. 
*/ 324 sk->sk_gso_type = SKB_GSO_TCPV4; 325 sk_setup_caps(sk, &rt->dst); 326 rt = NULL; 327 328 if (likely(!tp->repair)) { 329 if (!tp->write_seq) 330 WRITE_ONCE(tp->write_seq, 331 secure_tcp_seq(inet->inet_saddr, 332 inet->inet_daddr, 333 inet->inet_sport, 334 usin->sin_port)); 335 WRITE_ONCE(tp->tsoffset, 336 secure_tcp_ts_off(net, inet->inet_saddr, 337 inet->inet_daddr)); 338 } 339 340 atomic_set(&inet->inet_id, get_random_u16()); 341 342 if (tcp_fastopen_defer_connect(sk, &err)) 343 return err; 344 if (err) 345 goto failure; 346 347 err = tcp_connect(sk); 348 349 if (err) 350 goto failure; 351 352 return 0; 353 354 failure: 355 /* 356 * This unhashes the socket and releases the local port, 357 * if necessary. 358 */ 359 tcp_set_state(sk, TCP_CLOSE); 360 inet_bhash2_reset_saddr(sk); 361 ip_rt_put(rt); 362 sk->sk_route_caps = 0; 363 inet->inet_dport = 0; 364 return err; 365 } 366 EXPORT_IPV6_MOD(tcp_v4_connect); 367 368 /* 369 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191. 370 * It can be called through tcp_release_cb() if socket was owned by user 371 * at the time tcp_v4_err() was called to handle ICMP message. 372 */ 373 void tcp_v4_mtu_reduced(struct sock *sk) 374 { 375 struct inet_sock *inet = inet_sk(sk); 376 struct dst_entry *dst; 377 u32 mtu, dmtu; 378 379 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) 380 return; 381 mtu = READ_ONCE(tcp_sk(sk)->mtu_info); 382 dst = inet_csk_update_pmtu(sk, mtu); 383 if (!dst) 384 return; 385 386 /* Something is about to be wrong... Remember soft error 387 * for the case, if this connection will not able to recover. 388 */ 389 dmtu = dst4_mtu(dst); 390 if (mtu < dmtu && ip_dont_fragment(sk, dst)) 391 WRITE_ONCE(sk->sk_err_soft, EMSGSIZE); 392 393 if (inet->pmtudisc != IP_PMTUDISC_DONT && 394 ip_sk_accept_pmtu(sk) && 395 inet_csk(sk)->icsk_pmtu_cookie > dmtu) { 396 tcp_sync_mss(sk, dmtu); 397 398 /* Resend the TCP packet because it's 399 * clear that the old packet has been 400 * dropped. This is the new "fast" path mtu 401 * discovery. 402 */ 403 tcp_simple_retransmit(sk); 404 } /* else let the usual retransmit timer handle it */ 405 } 406 EXPORT_IPV6_MOD(tcp_v4_mtu_reduced); 407 408 static void do_redirect(struct sk_buff *skb, struct sock *sk) 409 { 410 struct dst_entry *dst = __sk_dst_check(sk, 0); 411 412 if (dst) 413 dst->ops->redirect(dst, sk, skb); 414 } 415 416 417 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */ 418 void tcp_req_err(struct sock *sk, u32 seq, bool abort) 419 { 420 struct request_sock *req = inet_reqsk(sk); 421 struct net *net = sock_net(sk); 422 423 /* ICMPs are not backlogged, hence we cannot get 424 * an established socket here. 425 */ 426 if (seq != tcp_rsk(req)->snt_isn) { 427 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); 428 } else if (abort) { 429 /* 430 * Still in SYN_RECV, just remove it silently. 431 * There is no good way to pass the error to the newly 432 * created socket, and POSIX does not want network 433 * errors returned from accept(). 
434 */ 435 inet_csk_reqsk_queue_drop(req->rsk_listener, req); 436 tcp_listendrop(req->rsk_listener); 437 } 438 reqsk_put(req); 439 } 440 EXPORT_IPV6_MOD(tcp_req_err); 441 442 /* TCP-LD (RFC 6069) logic */ 443 void tcp_ld_RTO_revert(struct sock *sk, u32 seq) 444 { 445 struct inet_connection_sock *icsk = inet_csk(sk); 446 struct tcp_sock *tp = tcp_sk(sk); 447 struct sk_buff *skb; 448 s32 remaining; 449 u32 delta_us; 450 451 if (sock_owned_by_user(sk)) 452 return; 453 454 if (seq != tp->snd_una || !icsk->icsk_retransmits || 455 !icsk->icsk_backoff) 456 return; 457 458 skb = tcp_rtx_queue_head(sk); 459 if (WARN_ON_ONCE(!skb)) 460 return; 461 462 icsk->icsk_backoff--; 463 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT; 464 icsk->icsk_rto = inet_csk_rto_backoff(icsk, tcp_rto_max(sk)); 465 466 tcp_mstamp_refresh(tp); 467 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb)); 468 remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us); 469 470 if (remaining > 0) { 471 tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, remaining, false); 472 } else { 473 /* RTO revert clocked out retransmission. 474 * Will retransmit now. 475 */ 476 tcp_retransmit_timer(sk); 477 } 478 } 479 EXPORT_IPV6_MOD(tcp_ld_RTO_revert); 480 481 /* 482 * This routine is called by the ICMP module when it gets some 483 * sort of error condition. If err < 0 then the socket should 484 * be closed and the error returned to the user. If err > 0 485 * it's just the icmp type << 8 | icmp code. After adjustment 486 * header points to the first 8 bytes of the tcp header. We need 487 * to find the appropriate port. 488 * 489 * The locking strategy used here is very "optimistic". When 490 * someone else accesses the socket the ICMP is just dropped 491 * and for some paths there is no check at all. 492 * A more general error queue to queue errors for later handling 493 * is probably better. 494 * 495 */ 496 497 int tcp_v4_err(struct sk_buff *skb, u32 info) 498 { 499 const struct iphdr *iph = (const struct iphdr *)skb->data; 500 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2)); 501 struct net *net = dev_net_rcu(skb->dev); 502 const int type = icmp_hdr(skb)->type; 503 const int code = icmp_hdr(skb)->code; 504 struct request_sock *fastopen; 505 struct tcp_sock *tp; 506 u32 seq, snd_una; 507 struct sock *sk; 508 int err; 509 510 sk = __inet_lookup_established(net, iph->daddr, th->dest, iph->saddr, 511 ntohs(th->source), inet_iif(skb), 0); 512 if (!sk) { 513 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); 514 return -ENOENT; 515 } 516 if (sk->sk_state == TCP_TIME_WAIT) { 517 /* To increase the counter of ignored icmps for TCP-AO */ 518 tcp_ao_ignore_icmp(sk, AF_INET, type, code); 519 inet_twsk_put(inet_twsk(sk)); 520 return 0; 521 } 522 seq = ntohl(th->seq); 523 if (sk->sk_state == TCP_NEW_SYN_RECV) { 524 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB || 525 type == ICMP_TIME_EXCEEDED || 526 (type == ICMP_DEST_UNREACH && 527 (code == ICMP_NET_UNREACH || 528 code == ICMP_HOST_UNREACH))); 529 return 0; 530 } 531 532 if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) { 533 sock_put(sk); 534 return 0; 535 } 536 537 bh_lock_sock(sk); 538 /* If too many ICMPs get dropped on busy 539 * servers this needs to be solved differently. 540 * We do take care of PMTU discovery (RFC1191) special case : 541 * we can receive locally generated ICMP messages while socket is held. 
542 */ 543 if (sock_owned_by_user(sk)) { 544 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)) 545 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS); 546 } 547 if (sk->sk_state == TCP_CLOSE) 548 goto out; 549 550 if (static_branch_unlikely(&ip4_min_ttl)) { 551 /* min_ttl can be changed concurrently from do_ip_setsockopt() */ 552 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) { 553 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 554 goto out; 555 } 556 } 557 558 tp = tcp_sk(sk); 559 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ 560 fastopen = rcu_dereference(tp->fastopen_rsk); 561 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una; 562 if (sk->sk_state != TCP_LISTEN && 563 !between(seq, snd_una, tp->snd_nxt)) { 564 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); 565 goto out; 566 } 567 568 switch (type) { 569 case ICMP_REDIRECT: 570 if (!sock_owned_by_user(sk)) 571 do_redirect(skb, sk); 572 goto out; 573 case ICMP_SOURCE_QUENCH: 574 /* Just silently ignore these. */ 575 goto out; 576 case ICMP_PARAMETERPROB: 577 err = EPROTO; 578 break; 579 case ICMP_DEST_UNREACH: 580 if (code > NR_ICMP_UNREACH) 581 goto out; 582 583 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ 584 /* We are not interested in TCP_LISTEN and open_requests 585 * (SYN-ACKs send out by Linux are always <576bytes so 586 * they should go through unfragmented). 587 */ 588 if (sk->sk_state == TCP_LISTEN) 589 goto out; 590 591 WRITE_ONCE(tp->mtu_info, info); 592 if (!sock_owned_by_user(sk)) { 593 tcp_v4_mtu_reduced(sk); 594 } else { 595 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags)) 596 sock_hold(sk); 597 } 598 goto out; 599 } 600 601 err = icmp_err_convert[code].errno; 602 /* check if this ICMP message allows revert of backoff. 603 * (see RFC 6069) 604 */ 605 if (!fastopen && 606 (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH)) 607 tcp_ld_RTO_revert(sk, seq); 608 break; 609 case ICMP_TIME_EXCEEDED: 610 err = EHOSTUNREACH; 611 break; 612 default: 613 goto out; 614 } 615 616 switch (sk->sk_state) { 617 case TCP_SYN_SENT: 618 case TCP_SYN_RECV: 619 /* Only in fast or simultaneous open. If a fast open socket is 620 * already accepted it is treated as a connected one below. 621 */ 622 if (fastopen && !fastopen->sk) 623 break; 624 625 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th); 626 627 if (!sock_owned_by_user(sk)) 628 tcp_done_with_error(sk, err); 629 else 630 WRITE_ONCE(sk->sk_err_soft, err); 631 goto out; 632 } 633 634 /* If we've already connected we will keep trying 635 * until we time out, or the user gives up. 636 * 637 * rfc1122 4.2.3.9 allows to consider as hard errors 638 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too, 639 * but it is obsoleted by pmtu discovery). 640 * 641 * Note, that in modern internet, where routing is unreliable 642 * and in each dark corner broken firewalls sit, sending random 643 * errors ordered by their masters even this two messages finally lose 644 * their original sense (even Linux sends invalid PORT_UNREACHs) 645 * 646 * Now we are in compliance with RFCs. 
647 * --ANK (980905) 648 */ 649 650 if (!sock_owned_by_user(sk) && 651 inet_test_bit(RECVERR, sk)) { 652 WRITE_ONCE(sk->sk_err, err); 653 sk_error_report(sk); 654 } else { /* Only an error on timeout */ 655 WRITE_ONCE(sk->sk_err_soft, err); 656 } 657 658 out: 659 bh_unlock_sock(sk); 660 sock_put(sk); 661 return 0; 662 } 663 664 #define REPLY_OPTIONS_LEN (MAX_TCP_OPTION_SPACE / sizeof(__be32)) 665 666 static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb, 667 const struct tcp_ao_hdr *aoh, 668 struct ip_reply_arg *arg, struct tcphdr *reply, 669 __be32 reply_options[REPLY_OPTIONS_LEN]) 670 { 671 #ifdef CONFIG_TCP_AO 672 int sdif = tcp_v4_sdif(skb); 673 int dif = inet_iif(skb); 674 int l3index = sdif ? dif : 0; 675 bool allocated_traffic_key; 676 struct tcp_ao_key *key; 677 char *traffic_key; 678 bool drop = true; 679 u32 ao_sne = 0; 680 u8 keyid; 681 682 rcu_read_lock(); 683 if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq), 684 &key, &traffic_key, &allocated_traffic_key, 685 &keyid, &ao_sne)) 686 goto out; 687 688 reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) | 689 (aoh->rnext_keyid << 8) | keyid); 690 arg->iov[0].iov_len += tcp_ao_len_aligned(key); 691 reply->doff = arg->iov[0].iov_len / 4; 692 693 if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1], 694 key, traffic_key, 695 (union tcp_ao_addr *)&ip_hdr(skb)->saddr, 696 (union tcp_ao_addr *)&ip_hdr(skb)->daddr, 697 reply, ao_sne)) 698 goto out; 699 drop = false; 700 out: 701 rcu_read_unlock(); 702 if (allocated_traffic_key) 703 kfree(traffic_key); 704 return drop; 705 #else 706 return true; 707 #endif 708 } 709 710 /* 711 * This routine will send an RST to the other tcp. 712 * 713 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.) 714 * for reset. 715 * Answer: if a packet caused RST, it is not for a socket 716 * existing in our system, if it is matched to a socket, 717 * it is just duplicate segment or bug in other side's TCP. 718 * So that we build reply only basing on parameters 719 * arrived with segment. 720 * Exception: precedence violation. We do not implement it in any case. 721 */ 722 723 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb, 724 enum sk_rst_reason reason) 725 { 726 const struct tcphdr *th = tcp_hdr(skb); 727 struct { 728 struct tcphdr th; 729 __be32 opt[REPLY_OPTIONS_LEN]; 730 } rep; 731 const __u8 *md5_hash_location = NULL; 732 const struct tcp_ao_hdr *aoh; 733 struct ip_reply_arg arg; 734 #ifdef CONFIG_TCP_MD5SIG 735 struct tcp_md5sig_key *key = NULL; 736 unsigned char newhash[16]; 737 struct sock *sk1 = NULL; 738 #endif 739 u64 transmit_time = 0; 740 struct sock *ctl_sk; 741 struct net *net; 742 u32 txhash = 0; 743 744 /* Never send a reset in response to a reset. */ 745 if (th->rst) 746 return; 747 748 /* If sk not NULL, it means we did a successful lookup and incoming 749 * route had to be correct. prequeue might have dropped our dst. 750 */ 751 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL) 752 return; 753 754 /* Swap the send and the receive. 
*/ 755 memset(&rep, 0, sizeof(rep)); 756 rep.th.dest = th->source; 757 rep.th.source = th->dest; 758 rep.th.doff = sizeof(struct tcphdr) / 4; 759 rep.th.rst = 1; 760 761 if (th->ack) { 762 rep.th.seq = th->ack_seq; 763 } else { 764 rep.th.ack = 1; 765 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin + 766 skb->len - (th->doff << 2)); 767 } 768 769 memset(&arg, 0, sizeof(arg)); 770 arg.iov[0].iov_base = (unsigned char *)&rep; 771 arg.iov[0].iov_len = sizeof(rep.th); 772 773 net = sk ? sock_net(sk) : skb_dst_dev_net_rcu(skb); 774 775 /* Invalid TCP option size or twice included auth */ 776 if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh)) 777 return; 778 779 if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt)) 780 return; 781 782 #ifdef CONFIG_TCP_MD5SIG 783 rcu_read_lock(); 784 if (sk && sk_fullsock(sk)) { 785 const union tcp_md5_addr *addr; 786 int l3index; 787 788 /* sdif set, means packet ingressed via a device 789 * in an L3 domain and inet_iif is set to it. 790 */ 791 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; 792 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 793 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 794 } else if (md5_hash_location) { 795 const union tcp_md5_addr *addr; 796 int sdif = tcp_v4_sdif(skb); 797 int dif = inet_iif(skb); 798 int l3index; 799 800 /* 801 * active side is lost. Try to find listening socket through 802 * source port, and then find md5 key through listening socket. 803 * we are not loose security here: 804 * Incoming packet is checked with md5 hash with finding key, 805 * no RST generated if md5 hash doesn't match. 806 */ 807 sk1 = __inet_lookup_listener(net, NULL, 0, ip_hdr(skb)->saddr, 808 th->source, ip_hdr(skb)->daddr, 809 ntohs(th->source), dif, sdif); 810 /* don't send rst if it can't find key */ 811 if (!sk1) 812 goto out; 813 814 /* sdif set, means packet ingressed via a device 815 * in an L3 domain and dif is set to it. 816 */ 817 l3index = sdif ? dif : 0; 818 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 819 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET); 820 if (!key) 821 goto out; 822 823 tcp_v4_md5_hash_skb(newhash, key, NULL, skb); 824 if (memcmp(md5_hash_location, newhash, 16) != 0) 825 goto out; 826 } 827 828 if (key) { 829 rep.opt[0] = htonl((TCPOPT_NOP << 24) | 830 (TCPOPT_NOP << 16) | 831 (TCPOPT_MD5SIG << 8) | 832 TCPOLEN_MD5SIG); 833 /* Update length and the length the header thinks exists */ 834 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; 835 rep.th.doff = arg.iov[0].iov_len / 4; 836 837 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1], 838 key, ip_hdr(skb)->saddr, 839 ip_hdr(skb)->daddr, &rep.th); 840 } 841 #endif 842 /* Can't co-exist with TCPMD5, hence check rep.opt[0] */ 843 if (rep.opt[0] == 0) { 844 __be32 mrst = mptcp_reset_option(skb); 845 846 if (mrst) { 847 rep.opt[0] = mrst; 848 arg.iov[0].iov_len += sizeof(mrst); 849 rep.th.doff = arg.iov[0].iov_len / 4; 850 } 851 } 852 853 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 854 ip_hdr(skb)->saddr, /* XXX */ 855 arg.iov[0].iov_len, IPPROTO_TCP, 0); 856 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 857 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0; 858 859 /* When socket is gone, all binding information is lost. 860 * routing might fail in this case. No choice here, if we choose to force 861 * input interface, we will misroute in case of asymmetric route. 
862 */ 863 if (sk) 864 arg.bound_dev_if = sk->sk_bound_dev_if; 865 866 trace_tcp_send_reset(sk, skb, reason); 867 868 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) != 869 offsetof(struct inet_timewait_sock, tw_bound_dev_if)); 870 871 /* ECN bits of TW reset are cleared */ 872 arg.tos = ip_hdr(skb)->tos & ~INET_ECN_MASK; 873 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); 874 local_bh_disable(); 875 local_lock_nested_bh(&ipv4_tcp_sk.bh_lock); 876 ctl_sk = this_cpu_read(ipv4_tcp_sk.sock); 877 878 sock_net_set(ctl_sk, net); 879 if (sk) { 880 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 881 inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark); 882 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? 883 inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority); 884 transmit_time = tcp_transmit_time(sk); 885 xfrm_sk_clone_policy(ctl_sk, sk); 886 txhash = (sk->sk_state == TCP_TIME_WAIT) ? 887 inet_twsk(sk)->tw_txhash : sk->sk_txhash; 888 } else { 889 ctl_sk->sk_mark = 0; 890 ctl_sk->sk_priority = 0; 891 } 892 ip_send_unicast_reply(ctl_sk, sk, 893 skb, &TCP_SKB_CB(skb)->header.h4.opt, 894 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 895 &arg, arg.iov[0].iov_len, 896 transmit_time, txhash); 897 898 xfrm_sk_free_policy(ctl_sk); 899 sock_net_set(ctl_sk, &init_net); 900 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); 901 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS); 902 local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock); 903 local_bh_enable(); 904 905 #ifdef CONFIG_TCP_MD5SIG 906 out: 907 rcu_read_unlock(); 908 #endif 909 } 910 911 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states 912 outside socket context is ugly, certainly. What can I do? 913 */ 914 915 static void tcp_v4_send_ack(const struct sock *sk, 916 struct sk_buff *skb, u32 seq, u32 ack, 917 u32 win, u32 tsval, u32 tsecr, int oif, 918 struct tcp_key *key, 919 int reply_flags, u8 tos, u32 txhash) 920 { 921 const struct tcphdr *th = tcp_hdr(skb); 922 struct { 923 struct tcphdr th; 924 __be32 opt[(MAX_TCP_OPTION_SPACE >> 2)]; 925 } rep; 926 struct net *net = sock_net(sk); 927 struct ip_reply_arg arg; 928 struct sock *ctl_sk; 929 u64 transmit_time; 930 931 memset(&rep.th, 0, sizeof(struct tcphdr)); 932 memset(&arg, 0, sizeof(arg)); 933 934 arg.iov[0].iov_base = (unsigned char *)&rep; 935 arg.iov[0].iov_len = sizeof(rep.th); 936 if (tsecr) { 937 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | 938 (TCPOPT_TIMESTAMP << 8) | 939 TCPOLEN_TIMESTAMP); 940 rep.opt[1] = htonl(tsval); 941 rep.opt[2] = htonl(tsecr); 942 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED; 943 } 944 945 /* Swap the send and the receive. */ 946 rep.th.dest = th->source; 947 rep.th.source = th->dest; 948 rep.th.doff = arg.iov[0].iov_len / 4; 949 rep.th.seq = htonl(seq); 950 rep.th.ack_seq = htonl(ack); 951 rep.th.ack = 1; 952 rep.th.window = htons(win); 953 954 #ifdef CONFIG_TCP_MD5SIG 955 if (tcp_key_is_md5(key)) { 956 int offset = (tsecr) ? 3 : 0; 957 958 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) | 959 (TCPOPT_NOP << 16) | 960 (TCPOPT_MD5SIG << 8) | 961 TCPOLEN_MD5SIG); 962 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; 963 rep.th.doff = arg.iov[0].iov_len/4; 964 965 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset], 966 key->md5_key, ip_hdr(skb)->saddr, 967 ip_hdr(skb)->daddr, &rep.th); 968 } 969 #endif 970 #ifdef CONFIG_TCP_AO 971 if (tcp_key_is_ao(key)) { 972 int offset = (tsecr) ? 
3 : 0; 973 974 rep.opt[offset++] = htonl((TCPOPT_AO << 24) | 975 (tcp_ao_len(key->ao_key) << 16) | 976 (key->ao_key->sndid << 8) | 977 key->rcv_next); 978 arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key); 979 rep.th.doff = arg.iov[0].iov_len / 4; 980 981 tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset], 982 key->ao_key, key->traffic_key, 983 (union tcp_ao_addr *)&ip_hdr(skb)->saddr, 984 (union tcp_ao_addr *)&ip_hdr(skb)->daddr, 985 &rep.th, key->sne); 986 } 987 #endif 988 arg.flags = reply_flags; 989 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 990 ip_hdr(skb)->saddr, /* XXX */ 991 arg.iov[0].iov_len, IPPROTO_TCP, 0); 992 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 993 if (oif) 994 arg.bound_dev_if = oif; 995 arg.tos = tos; 996 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL); 997 local_bh_disable(); 998 local_lock_nested_bh(&ipv4_tcp_sk.bh_lock); 999 ctl_sk = this_cpu_read(ipv4_tcp_sk.sock); 1000 sock_net_set(ctl_sk, net); 1001 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 1002 inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark); 1003 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? 1004 inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority); 1005 transmit_time = tcp_transmit_time(sk); 1006 ip_send_unicast_reply(ctl_sk, sk, 1007 skb, &TCP_SKB_CB(skb)->header.h4.opt, 1008 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 1009 &arg, arg.iov[0].iov_len, 1010 transmit_time, txhash); 1011 1012 sock_net_set(ctl_sk, &init_net); 1013 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); 1014 local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock); 1015 local_bh_enable(); 1016 } 1017 1018 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb, 1019 enum tcp_tw_status tw_status) 1020 { 1021 struct inet_timewait_sock *tw = inet_twsk(sk); 1022 struct tcp_timewait_sock *tcptw = tcp_twsk(sk); 1023 struct tcp_key key = {}; 1024 u8 tos = tw->tw_tos; 1025 1026 /* Cleaning only ECN bits of TW ACKs of oow data or is paws_reject, 1027 * while not cleaning ECN bits of other TW ACKs to avoid these ACKs 1028 * being placed in a different service queues (Classic rather than L4S) 1029 */ 1030 if (tw_status == TCP_TW_ACK_OOW) 1031 tos &= ~INET_ECN_MASK; 1032 1033 #ifdef CONFIG_TCP_AO 1034 struct tcp_ao_info *ao_info; 1035 1036 if (static_branch_unlikely(&tcp_ao_needed.key)) { 1037 /* FIXME: the segment to-be-acked is not verified yet */ 1038 ao_info = rcu_dereference(tcptw->ao_info); 1039 if (ao_info) { 1040 const struct tcp_ao_hdr *aoh; 1041 1042 if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) { 1043 inet_twsk_put(tw); 1044 return; 1045 } 1046 1047 if (aoh) 1048 key.ao_key = tcp_ao_established_key(sk, ao_info, 1049 aoh->rnext_keyid, -1); 1050 } 1051 } 1052 if (key.ao_key) { 1053 struct tcp_ao_key *rnext_key; 1054 1055 key.traffic_key = snd_other_key(key.ao_key); 1056 key.sne = READ_ONCE(ao_info->snd_sne); 1057 rnext_key = READ_ONCE(ao_info->rnext_key); 1058 key.rcv_next = rnext_key->rcvid; 1059 key.type = TCP_KEY_AO; 1060 #else 1061 if (0) { 1062 #endif 1063 } else if (static_branch_tcp_md5()) { 1064 key.md5_key = tcp_twsk_md5_key(tcptw); 1065 if (key.md5_key) 1066 key.type = TCP_KEY_MD5; 1067 } 1068 1069 tcp_v4_send_ack(sk, skb, 1070 tcptw->tw_snd_nxt, READ_ONCE(tcptw->tw_rcv_nxt), 1071 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, 1072 tcp_tw_tsval(tcptw), 1073 READ_ONCE(tcptw->tw_ts_recent), 1074 tw->tw_bound_dev_if, &key, 1075 tw->tw_transparent ? 
IP_REPLY_ARG_NOSRCCHECK : 0, 1076 tos, 1077 tw->tw_txhash); 1078 1079 inet_twsk_put(tw); 1080 } 1081 1082 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, 1083 struct request_sock *req) 1084 { 1085 struct tcp_key key = {}; 1086 1087 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV 1088 * sk->sk_state == TCP_SYN_RECV -> for Fast Open. 1089 */ 1090 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 : 1091 tcp_sk(sk)->snd_nxt; 1092 1093 #ifdef CONFIG_TCP_AO 1094 if (static_branch_unlikely(&tcp_ao_needed.key) && 1095 tcp_rsk_used_ao(req)) { 1096 const union tcp_md5_addr *addr; 1097 const struct tcp_ao_hdr *aoh; 1098 int l3index; 1099 1100 /* Invalid TCP option size or twice included auth */ 1101 if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) 1102 return; 1103 if (!aoh) 1104 return; 1105 1106 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 1107 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; 1108 key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, 1109 aoh->rnext_keyid, -1); 1110 if (unlikely(!key.ao_key)) { 1111 /* Send ACK with any matching MKT for the peer */ 1112 key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1); 1113 /* Matching key disappeared (user removed the key?) 1114 * let the handshake timeout. 1115 */ 1116 if (!key.ao_key) { 1117 net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n", 1118 addr, 1119 ntohs(tcp_hdr(skb)->source), 1120 &ip_hdr(skb)->daddr, 1121 ntohs(tcp_hdr(skb)->dest)); 1122 return; 1123 } 1124 } 1125 key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC); 1126 if (!key.traffic_key) 1127 return; 1128 1129 key.type = TCP_KEY_AO; 1130 key.rcv_next = aoh->keyid; 1131 tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req); 1132 #else 1133 if (0) { 1134 #endif 1135 } else if (static_branch_tcp_md5()) { 1136 const union tcp_md5_addr *addr; 1137 int l3index; 1138 1139 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 1140 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; 1141 key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1142 if (key.md5_key) 1143 key.type = TCP_KEY_MD5; 1144 } 1145 1146 /* Cleaning ECN bits of TW ACKs of oow data or is paws_reject */ 1147 tcp_v4_send_ack(sk, skb, seq, 1148 tcp_rsk(req)->rcv_nxt, 1149 tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale, 1150 tcp_rsk_tsval(tcp_rsk(req)), 1151 req->ts_recent, 1152 0, &key, 1153 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0, 1154 ip_hdr(skb)->tos & ~INET_ECN_MASK, 1155 READ_ONCE(tcp_rsk(req)->txhash)); 1156 if (tcp_key_is_ao(&key)) 1157 kfree(key.traffic_key); 1158 } 1159 1160 /* 1161 * Send a SYN-ACK after having received a SYN. 1162 * This still operates on a request_sock only, not on a big 1163 * socket. 1164 */ 1165 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, 1166 struct flowi *fl, 1167 struct request_sock *req, 1168 struct tcp_fastopen_cookie *foc, 1169 enum tcp_synack_type synack_type, 1170 struct sk_buff *syn_skb) 1171 { 1172 struct inet_request_sock *ireq = inet_rsk(req); 1173 struct flowi4 fl4; 1174 int err = -1; 1175 struct sk_buff *skb; 1176 u8 tos; 1177 1178 /* First, grab a route. 
*/ 1179 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) 1180 return -1; 1181 1182 skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb); 1183 1184 if (skb) { 1185 tcp_rsk(req)->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK; 1186 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); 1187 1188 tos = READ_ONCE(inet_sk(sk)->tos); 1189 1190 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)) 1191 tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) | 1192 (tos & INET_ECN_MASK); 1193 1194 if (!INET_ECN_is_capable(tos) && 1195 tcp_bpf_ca_needs_ecn((struct sock *)req)) 1196 tos |= INET_ECN_ECT_0; 1197 1198 rcu_read_lock(); 1199 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, 1200 ireq->ir_rmt_addr, 1201 rcu_dereference(ireq->ireq_opt), 1202 tos); 1203 rcu_read_unlock(); 1204 err = net_xmit_eval(err); 1205 } 1206 1207 return err; 1208 } 1209 1210 /* 1211 * IPv4 request_sock destructor. 1212 */ 1213 static void tcp_v4_reqsk_destructor(struct request_sock *req) 1214 { 1215 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1)); 1216 } 1217 1218 #ifdef CONFIG_TCP_MD5SIG 1219 /* 1220 * RFC2385 MD5 checksumming requires a mapping of 1221 * IP address->MD5 Key. 1222 * We need to maintain these in the sk structure. 1223 */ 1224 1225 DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ); 1226 EXPORT_IPV6_MOD(tcp_md5_needed); 1227 1228 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new) 1229 { 1230 if (!old) 1231 return true; 1232 1233 /* l3index always overrides non-l3index */ 1234 if (old->l3index && new->l3index == 0) 1235 return false; 1236 if (old->l3index == 0 && new->l3index) 1237 return true; 1238 1239 return old->prefixlen < new->prefixlen; 1240 } 1241 1242 /* Find the Key structure for an address. 
*/ 1243 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index, 1244 const union tcp_md5_addr *addr, 1245 int family, bool any_l3index) 1246 { 1247 const struct tcp_sock *tp = tcp_sk(sk); 1248 struct tcp_md5sig_key *key; 1249 const struct tcp_md5sig_info *md5sig; 1250 __be32 mask; 1251 struct tcp_md5sig_key *best_match = NULL; 1252 bool match; 1253 1254 /* caller either holds rcu_read_lock() or socket lock */ 1255 md5sig = rcu_dereference_check(tp->md5sig_info, 1256 lockdep_sock_is_held(sk)); 1257 if (!md5sig) 1258 return NULL; 1259 1260 hlist_for_each_entry_rcu(key, &md5sig->head, node, 1261 lockdep_sock_is_held(sk)) { 1262 if (key->family != family) 1263 continue; 1264 if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX && 1265 key->l3index != l3index) 1266 continue; 1267 if (family == AF_INET) { 1268 mask = inet_make_mask(key->prefixlen); 1269 match = (key->addr.a4.s_addr & mask) == 1270 (addr->a4.s_addr & mask); 1271 #if IS_ENABLED(CONFIG_IPV6) 1272 } else if (family == AF_INET6) { 1273 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6, 1274 key->prefixlen); 1275 #endif 1276 } else { 1277 match = false; 1278 } 1279 1280 if (match && better_md5_match(best_match, key)) 1281 best_match = key; 1282 } 1283 return best_match; 1284 } 1285 EXPORT_IPV6_MOD(__tcp_md5_do_lookup); 1286 1287 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk, 1288 const union tcp_md5_addr *addr, 1289 int family, u8 prefixlen, 1290 int l3index, u8 flags) 1291 { 1292 const struct tcp_sock *tp = tcp_sk(sk); 1293 struct tcp_md5sig_key *key; 1294 unsigned int size = sizeof(struct in_addr); 1295 const struct tcp_md5sig_info *md5sig; 1296 1297 /* caller either holds rcu_read_lock() or socket lock */ 1298 md5sig = rcu_dereference_check(tp->md5sig_info, 1299 lockdep_sock_is_held(sk)); 1300 if (!md5sig) 1301 return NULL; 1302 #if IS_ENABLED(CONFIG_IPV6) 1303 if (family == AF_INET6) 1304 size = sizeof(struct in6_addr); 1305 #endif 1306 hlist_for_each_entry_rcu(key, &md5sig->head, node, 1307 lockdep_sock_is_held(sk)) { 1308 if (key->family != family) 1309 continue; 1310 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX)) 1311 continue; 1312 if (key->l3index != l3index) 1313 continue; 1314 if (!memcmp(&key->addr, addr, size) && 1315 key->prefixlen == prefixlen) 1316 return key; 1317 } 1318 return NULL; 1319 } 1320 1321 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, 1322 const struct sock *addr_sk) 1323 { 1324 const union tcp_md5_addr *addr; 1325 int l3index; 1326 1327 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), 1328 addr_sk->sk_bound_dev_if); 1329 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr; 1330 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1331 } 1332 EXPORT_IPV6_MOD(tcp_v4_md5_lookup); 1333 1334 static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp) 1335 { 1336 struct tcp_sock *tp = tcp_sk(sk); 1337 struct tcp_md5sig_info *md5sig; 1338 1339 md5sig = kmalloc_obj(*md5sig, gfp); 1340 if (!md5sig) 1341 return -ENOMEM; 1342 1343 sk_gso_disable(sk); 1344 INIT_HLIST_HEAD(&md5sig->head); 1345 rcu_assign_pointer(tp->md5sig_info, md5sig); 1346 return 0; 1347 } 1348 1349 /* This can be called on a newly created socket, from other files */ 1350 static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, 1351 int family, u8 prefixlen, int l3index, u8 flags, 1352 const u8 *newkey, u8 newkeylen, gfp_t gfp) 1353 { 1354 /* Add Key to the list */ 1355 struct tcp_md5sig_key *key; 1356 struct 
tcp_sock *tp = tcp_sk(sk); 1357 struct tcp_md5sig_info *md5sig; 1358 1359 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags); 1360 if (key) { 1361 /* Pre-existing entry - just update that one. 1362 * Note that the key might be used concurrently. 1363 * data_race() is telling kcsan that we do not care of 1364 * key mismatches, since changing MD5 key on live flows 1365 * can lead to packet drops. 1366 */ 1367 data_race(memcpy(key->key, newkey, newkeylen)); 1368 1369 /* Pairs with READ_ONCE() in tcp_md5_hash_key(). 1370 * Also note that a reader could catch new key->keylen value 1371 * but old key->key[], this is the reason we use __GFP_ZERO 1372 * at sock_kmalloc() time below these lines. 1373 */ 1374 WRITE_ONCE(key->keylen, newkeylen); 1375 1376 return 0; 1377 } 1378 1379 md5sig = rcu_dereference_protected(tp->md5sig_info, 1380 lockdep_sock_is_held(sk)); 1381 1382 key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO); 1383 if (!key) 1384 return -ENOMEM; 1385 1386 memcpy(key->key, newkey, newkeylen); 1387 key->keylen = newkeylen; 1388 key->family = family; 1389 key->prefixlen = prefixlen; 1390 key->l3index = l3index; 1391 key->flags = flags; 1392 memcpy(&key->addr, addr, 1393 (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) : 1394 sizeof(struct in_addr)); 1395 hlist_add_head_rcu(&key->node, &md5sig->head); 1396 return 0; 1397 } 1398 1399 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, 1400 int family, u8 prefixlen, int l3index, u8 flags, 1401 const u8 *newkey, u8 newkeylen) 1402 { 1403 struct tcp_sock *tp = tcp_sk(sk); 1404 1405 if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) { 1406 if (fips_enabled) { 1407 pr_warn_once("TCP-MD5 support is disabled due to FIPS\n"); 1408 return -EOPNOTSUPP; 1409 } 1410 1411 if (tcp_md5sig_info_add(sk, GFP_KERNEL)) 1412 return -ENOMEM; 1413 1414 if (!static_branch_inc(&tcp_md5_needed.key)) { 1415 struct tcp_md5sig_info *md5sig; 1416 1417 md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk)); 1418 rcu_assign_pointer(tp->md5sig_info, NULL); 1419 kfree_rcu(md5sig, rcu); 1420 return -EUSERS; 1421 } 1422 } 1423 1424 return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags, 1425 newkey, newkeylen, GFP_KERNEL); 1426 } 1427 EXPORT_IPV6_MOD(tcp_md5_do_add); 1428 1429 int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr, 1430 int family, u8 prefixlen, int l3index, 1431 struct tcp_md5sig_key *key) 1432 { 1433 struct tcp_sock *tp = tcp_sk(sk); 1434 1435 if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) { 1436 1437 if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC))) 1438 return -ENOMEM; 1439 1440 if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) { 1441 struct tcp_md5sig_info *md5sig; 1442 1443 md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk)); 1444 net_warn_ratelimited("Too many TCP-MD5 keys in the system\n"); 1445 rcu_assign_pointer(tp->md5sig_info, NULL); 1446 kfree_rcu(md5sig, rcu); 1447 return -EUSERS; 1448 } 1449 } 1450 1451 return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, 1452 key->flags, key->key, key->keylen, 1453 sk_gfp_mask(sk, GFP_ATOMIC)); 1454 } 1455 EXPORT_IPV6_MOD(tcp_md5_key_copy); 1456 1457 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family, 1458 u8 prefixlen, int l3index, u8 flags) 1459 { 1460 struct tcp_md5sig_key *key; 1461 1462 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, 
flags); 1463 if (!key) 1464 return -ENOENT; 1465 hlist_del_rcu(&key->node); 1466 atomic_sub(sizeof(*key), &sk->sk_omem_alloc); 1467 kfree_rcu(key, rcu); 1468 return 0; 1469 } 1470 EXPORT_IPV6_MOD(tcp_md5_do_del); 1471 1472 void tcp_clear_md5_list(struct sock *sk) 1473 { 1474 struct tcp_sock *tp = tcp_sk(sk); 1475 struct tcp_md5sig_key *key; 1476 struct hlist_node *n; 1477 struct tcp_md5sig_info *md5sig; 1478 1479 md5sig = rcu_dereference_protected(tp->md5sig_info, 1); 1480 1481 hlist_for_each_entry_safe(key, n, &md5sig->head, node) { 1482 hlist_del(&key->node); 1483 atomic_sub(sizeof(*key), &sk->sk_omem_alloc); 1484 kfree(key); 1485 } 1486 } 1487 1488 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, 1489 sockptr_t optval, int optlen) 1490 { 1491 struct tcp_md5sig cmd; 1492 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr; 1493 const union tcp_md5_addr *addr; 1494 u8 prefixlen = 32; 1495 int l3index = 0; 1496 bool l3flag; 1497 u8 flags; 1498 1499 if (optlen < sizeof(cmd)) 1500 return -EINVAL; 1501 1502 if (copy_from_sockptr(&cmd, optval, sizeof(cmd))) 1503 return -EFAULT; 1504 1505 if (sin->sin_family != AF_INET) 1506 return -EINVAL; 1507 1508 flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX; 1509 l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX; 1510 1511 if (optname == TCP_MD5SIG_EXT && 1512 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) { 1513 prefixlen = cmd.tcpm_prefixlen; 1514 if (prefixlen > 32) 1515 return -EINVAL; 1516 } 1517 1518 if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex && 1519 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) { 1520 struct net_device *dev; 1521 1522 rcu_read_lock(); 1523 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex); 1524 if (dev && netif_is_l3_master(dev)) 1525 l3index = dev->ifindex; 1526 1527 rcu_read_unlock(); 1528 1529 /* ok to reference set/not set outside of rcu; 1530 * right now device MUST be an L3 master 1531 */ 1532 if (!dev || !l3index) 1533 return -EINVAL; 1534 } 1535 1536 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr; 1537 1538 if (!cmd.tcpm_keylen) 1539 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags); 1540 1541 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) 1542 return -EINVAL; 1543 1544 /* Don't allow keys for peers that have a matching TCP-AO key. 1545 * See the comment in tcp_ao_add_cmd() 1546 */ 1547 if (tcp_ao_required(sk, addr, AF_INET, l3flag ? 
l3index : -1, false)) 1548 return -EKEYREJECTED; 1549 1550 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags, 1551 cmd.tcpm_key, cmd.tcpm_keylen); 1552 } 1553 1554 static void tcp_v4_md5_hash_headers(struct md5_ctx *ctx, 1555 __be32 daddr, __be32 saddr, 1556 const struct tcphdr *th, int nbytes) 1557 { 1558 struct { 1559 struct tcp4_pseudohdr ip; 1560 struct tcphdr tcp; 1561 } h; 1562 1563 h.ip.saddr = saddr; 1564 h.ip.daddr = daddr; 1565 h.ip.pad = 0; 1566 h.ip.protocol = IPPROTO_TCP; 1567 h.ip.len = cpu_to_be16(nbytes); 1568 h.tcp = *th; 1569 h.tcp.check = 0; 1570 md5_update(ctx, (const u8 *)&h, sizeof(h.ip) + sizeof(h.tcp)); 1571 } 1572 1573 static noinline_for_stack void 1574 tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, 1575 __be32 daddr, __be32 saddr, const struct tcphdr *th) 1576 { 1577 struct md5_ctx ctx; 1578 1579 md5_init(&ctx); 1580 tcp_v4_md5_hash_headers(&ctx, daddr, saddr, th, th->doff << 2); 1581 tcp_md5_hash_key(&ctx, key); 1582 md5_final(&ctx, md5_hash); 1583 } 1584 1585 noinline_for_stack void 1586 tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, 1587 const struct sock *sk, const struct sk_buff *skb) 1588 { 1589 const struct tcphdr *th = tcp_hdr(skb); 1590 __be32 saddr, daddr; 1591 struct md5_ctx ctx; 1592 1593 if (sk) { /* valid for establish/request sockets */ 1594 saddr = sk->sk_rcv_saddr; 1595 daddr = sk->sk_daddr; 1596 } else { 1597 const struct iphdr *iph = ip_hdr(skb); 1598 saddr = iph->saddr; 1599 daddr = iph->daddr; 1600 } 1601 1602 md5_init(&ctx); 1603 tcp_v4_md5_hash_headers(&ctx, daddr, saddr, th, skb->len); 1604 tcp_md5_hash_skb_data(&ctx, skb, th->doff << 2); 1605 tcp_md5_hash_key(&ctx, key); 1606 md5_final(&ctx, md5_hash); 1607 } 1608 EXPORT_IPV6_MOD(tcp_v4_md5_hash_skb); 1609 1610 #endif 1611 1612 static void tcp_v4_init_req(struct request_sock *req, 1613 const struct sock *sk_listener, 1614 struct sk_buff *skb) 1615 { 1616 struct inet_request_sock *ireq = inet_rsk(req); 1617 struct net *net = sock_net(sk_listener); 1618 1619 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); 1620 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); 1621 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb)); 1622 } 1623 1624 static struct dst_entry *tcp_v4_route_req(const struct sock *sk, 1625 struct sk_buff *skb, 1626 struct flowi *fl, 1627 struct request_sock *req, 1628 u32 tw_isn) 1629 { 1630 tcp_v4_init_req(req, sk, skb); 1631 1632 if (security_inet_conn_request(sk, skb, req)) 1633 return NULL; 1634 1635 return inet_csk_route_req(sk, &fl->u.ip4, req); 1636 } 1637 1638 struct request_sock_ops tcp_request_sock_ops __read_mostly = { 1639 .family = PF_INET, 1640 .obj_size = sizeof(struct tcp_request_sock), 1641 .send_ack = tcp_v4_reqsk_send_ack, 1642 .destructor = tcp_v4_reqsk_destructor, 1643 .send_reset = tcp_v4_send_reset, 1644 }; 1645 1646 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { 1647 .mss_clamp = TCP_MSS_DEFAULT, 1648 #ifdef CONFIG_TCP_MD5SIG 1649 .req_md5_lookup = tcp_v4_md5_lookup, 1650 .calc_md5_hash = tcp_v4_md5_hash_skb, 1651 #endif 1652 #ifdef CONFIG_TCP_AO 1653 .ao_lookup = tcp_v4_ao_lookup_rsk, 1654 .ao_calc_key = tcp_v4_ao_calc_key_rsk, 1655 .ao_synack_hash = tcp_v4_ao_synack_hash, 1656 #endif 1657 #ifdef CONFIG_SYN_COOKIES 1658 .cookie_init_seq = cookie_v4_init_sequence, 1659 #endif 1660 .route_req = tcp_v4_route_req, 1661 .init_seq = tcp_v4_init_seq, 1662 .init_ts_off = tcp_v4_init_ts_off, 1663 .send_synack = tcp_v4_send_synack, 1664 }; 1665 1666 int 
tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) 1667 { 1668 /* Never answer to SYNs send to broadcast or multicast */ 1669 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) 1670 goto drop; 1671 1672 return tcp_conn_request(&tcp_request_sock_ops, 1673 &tcp_request_sock_ipv4_ops, sk, skb); 1674 1675 drop: 1676 tcp_listendrop(sk); 1677 return 0; 1678 } 1679 EXPORT_IPV6_MOD(tcp_v4_conn_request); 1680 1681 1682 /* 1683 * The three way handshake has completed - we got a valid synack - 1684 * now create the new socket. 1685 */ 1686 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, 1687 struct request_sock *req, 1688 struct dst_entry *dst, 1689 struct request_sock *req_unhash, 1690 bool *own_req, 1691 void (*opt_child_init)(struct sock *newsk, 1692 const struct sock *sk)) 1693 { 1694 struct inet_request_sock *ireq; 1695 bool found_dup_sk = false; 1696 struct inet_sock *newinet; 1697 struct tcp_sock *newtp; 1698 struct sock *newsk; 1699 #ifdef CONFIG_TCP_MD5SIG 1700 const union tcp_md5_addr *addr; 1701 struct tcp_md5sig_key *key; 1702 int l3index; 1703 #endif 1704 struct ip_options_rcu *inet_opt; 1705 1706 if (sk_acceptq_is_full(sk)) 1707 goto exit_overflow; 1708 1709 newsk = tcp_create_openreq_child(sk, req, skb); 1710 if (!newsk) 1711 goto exit_nonewsk; 1712 1713 newsk->sk_gso_type = SKB_GSO_TCPV4; 1714 inet_sk_rx_dst_set(newsk, skb); 1715 1716 newtp = tcp_sk(newsk); 1717 newinet = inet_sk(newsk); 1718 ireq = inet_rsk(req); 1719 inet_opt = rcu_dereference(ireq->ireq_opt); 1720 RCU_INIT_POINTER(newinet->inet_opt, inet_opt); 1721 newinet->mc_index = inet_iif(skb); 1722 newinet->mc_ttl = ip_hdr(skb)->ttl; 1723 newinet->rcv_tos = ip_hdr(skb)->tos; 1724 inet_csk(newsk)->icsk_ext_hdr_len = 0; 1725 if (inet_opt) 1726 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 1727 atomic_set(&newinet->inet_id, get_random_u16()); 1728 1729 /* Set ToS of the new socket based upon the value of incoming SYN. 1730 * ECT bits are set later in tcp_init_transfer(). 
1731 */ 1732 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)) 1733 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK; 1734 1735 if (!dst) { 1736 dst = inet_csk_route_child_sock(sk, newsk, req); 1737 if (!dst) 1738 goto put_and_exit; 1739 } else { 1740 /* syncookie case : see end of cookie_v4_check() */ 1741 } 1742 sk_setup_caps(newsk, dst); 1743 1744 #if IS_ENABLED(CONFIG_IPV6) 1745 if (opt_child_init) 1746 opt_child_init(newsk, sk); 1747 #endif 1748 tcp_ca_openreq_child(newsk, dst); 1749 1750 tcp_sync_mss(newsk, dst4_mtu(dst)); 1751 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst)); 1752 1753 tcp_initialize_rcv_mss(newsk); 1754 1755 #ifdef CONFIG_TCP_MD5SIG 1756 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif); 1757 /* Copy over the MD5 key from the original socket */ 1758 addr = (union tcp_md5_addr *)&newinet->inet_daddr; 1759 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1760 if (key && !tcp_rsk_used_ao(req)) { 1761 if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key)) 1762 goto put_and_exit; 1763 sk_gso_disable(newsk); 1764 } 1765 #endif 1766 #ifdef CONFIG_TCP_AO 1767 if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET)) 1768 goto put_and_exit; /* OOM, release back memory */ 1769 #endif 1770 1771 if (__inet_inherit_port(sk, newsk) < 0) 1772 goto put_and_exit; 1773 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), 1774 &found_dup_sk); 1775 if (likely(*own_req)) { 1776 tcp_move_syn(newtp, req); 1777 ireq->ireq_opt = NULL; 1778 } else { 1779 newinet->inet_opt = NULL; 1780 1781 if (!req_unhash && found_dup_sk) { 1782 /* This code path should only be executed in the 1783 * syncookie case only 1784 */ 1785 bh_unlock_sock(newsk); 1786 sock_put(newsk); 1787 newsk = NULL; 1788 } 1789 } 1790 return newsk; 1791 1792 exit_overflow: 1793 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); 1794 exit_nonewsk: 1795 dst_release(dst); 1796 exit: 1797 tcp_listendrop(sk); 1798 return NULL; 1799 put_and_exit: 1800 newinet->inet_opt = NULL; 1801 inet_csk_prepare_forced_close(newsk); 1802 tcp_done(newsk); 1803 goto exit; 1804 } 1805 EXPORT_IPV6_MOD(tcp_v4_syn_recv_sock); 1806 1807 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb) 1808 { 1809 #ifdef CONFIG_SYN_COOKIES 1810 const struct tcphdr *th = tcp_hdr(skb); 1811 1812 if (!th->syn) 1813 sk = cookie_v4_check(sk, skb); 1814 #endif 1815 return sk; 1816 } 1817 1818 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph, 1819 struct tcphdr *th, u32 *cookie) 1820 { 1821 u16 mss = 0; 1822 #ifdef CONFIG_SYN_COOKIES 1823 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops, 1824 &tcp_request_sock_ipv4_ops, sk, th); 1825 if (mss) { 1826 *cookie = __cookie_v4_init_sequence(iph, th, &mss); 1827 tcp_synq_overflow(sk); 1828 } 1829 #endif 1830 return mss; 1831 } 1832 1833 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, 1834 u32)); 1835 /* The socket must have it's spinlock held when we get 1836 * here, unless it is a TCP_LISTEN socket. 1837 * 1838 * We have a potential double-lock case here, so even when 1839 * doing backlog processing we use the BH locking scheme. 1840 * This is because we cannot sleep with the original spinlock 1841 * held. 
1842 */ 1843 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) 1844 { 1845 enum skb_drop_reason reason; 1846 struct sock *rsk; 1847 1848 reason = psp_sk_rx_policy_check(sk, skb); 1849 if (reason) 1850 goto err_discard; 1851 1852 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ 1853 struct dst_entry *dst; 1854 1855 dst = rcu_dereference_protected(sk->sk_rx_dst, 1856 lockdep_sock_is_held(sk)); 1857 1858 sock_rps_save_rxhash(sk, skb); 1859 sk_mark_napi_id(sk, skb); 1860 if (dst) { 1861 if (sk->sk_rx_dst_ifindex != skb->skb_iif || 1862 !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check, 1863 dst, 0)) { 1864 RCU_INIT_POINTER(sk->sk_rx_dst, NULL); 1865 dst_release(dst); 1866 } 1867 } 1868 tcp_rcv_established(sk, skb); 1869 return 0; 1870 } 1871 1872 if (tcp_checksum_complete(skb)) 1873 goto csum_err; 1874 1875 if (sk->sk_state == TCP_LISTEN) { 1876 struct sock *nsk = tcp_v4_cookie_check(sk, skb); 1877 1878 if (!nsk) 1879 return 0; 1880 if (nsk != sk) { 1881 reason = tcp_child_process(sk, nsk, skb); 1882 if (reason) { 1883 rsk = nsk; 1884 goto reset; 1885 } 1886 return 0; 1887 } 1888 } else 1889 sock_rps_save_rxhash(sk, skb); 1890 1891 reason = tcp_rcv_state_process(sk, skb); 1892 if (reason) { 1893 rsk = sk; 1894 goto reset; 1895 } 1896 return 0; 1897 1898 reset: 1899 tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason)); 1900 discard: 1901 sk_skb_reason_drop(sk, skb, reason); 1902 /* Be careful here. If this function gets more complicated and 1903 * gcc suffers from register pressure on the x86, sk (in %ebx) 1904 * might be destroyed here. This current version compiles correctly, 1905 * but you have been warned. 1906 */ 1907 return 0; 1908 1909 csum_err: 1910 reason = SKB_DROP_REASON_TCP_CSUM; 1911 trace_tcp_bad_csum(skb); 1912 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 1913 err_discard: 1914 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 1915 goto discard; 1916 } 1917 EXPORT_SYMBOL(tcp_v4_do_rcv); 1918 1919 int tcp_v4_early_demux(struct sk_buff *skb) 1920 { 1921 struct net *net = dev_net_rcu(skb->dev); 1922 const struct iphdr *iph; 1923 const struct tcphdr *th; 1924 struct sock *sk; 1925 1926 if (skb->pkt_type != PACKET_HOST) 1927 return 0; 1928 1929 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) 1930 return 0; 1931 1932 iph = ip_hdr(skb); 1933 th = tcp_hdr(skb); 1934 1935 if (th->doff < sizeof(struct tcphdr) / 4) 1936 return 0; 1937 1938 sk = __inet_lookup_established(net, iph->saddr, th->source, 1939 iph->daddr, ntohs(th->dest), 1940 skb->skb_iif, inet_sdif(skb)); 1941 if (sk) { 1942 skb->sk = sk; 1943 skb->destructor = sock_edemux; 1944 if (sk_fullsock(sk)) { 1945 struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst); 1946 1947 if (dst) 1948 dst = dst_check(dst, 0); 1949 if (dst && 1950 sk->sk_rx_dst_ifindex == skb->skb_iif) 1951 skb_dst_set_noref(skb, dst); 1952 } 1953 } 1954 return 0; 1955 } 1956 1957 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, 1958 enum skb_drop_reason *reason) 1959 { 1960 u32 tail_gso_size, tail_gso_segs; 1961 struct skb_shared_info *shinfo; 1962 const struct tcphdr *th; 1963 struct tcphdr *thtail; 1964 struct sk_buff *tail; 1965 unsigned int hdrlen; 1966 bool fragstolen; 1967 u32 gso_segs; 1968 u32 gso_size; 1969 u64 limit; 1970 int delta; 1971 int err; 1972 1973 /* In case all data was pulled from skb frags (in __pskb_pull_tail()), 1974 * we can fix skb->truesize to its real value to avoid future drops. 1975 * This is valid because skb is not yet charged to the socket. 
1976 * It has been noticed pure SACK packets were sometimes dropped 1977 * (if cooked by drivers without copybreak feature). 1978 */ 1979 skb_condense(skb); 1980 1981 tcp_cleanup_skb(skb); 1982 1983 if (unlikely(tcp_checksum_complete(skb))) { 1984 bh_unlock_sock(sk); 1985 trace_tcp_bad_csum(skb); 1986 *reason = SKB_DROP_REASON_TCP_CSUM; 1987 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 1988 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 1989 return true; 1990 } 1991 1992 /* Attempt coalescing to last skb in backlog, even if we are 1993 * above the limits. 1994 * This is okay because skb capacity is limited to MAX_SKB_FRAGS. 1995 */ 1996 th = (const struct tcphdr *)skb->data; 1997 hdrlen = th->doff * 4; 1998 1999 tail = sk->sk_backlog.tail; 2000 if (!tail) 2001 goto no_coalesce; 2002 thtail = (struct tcphdr *)tail->data; 2003 2004 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq || 2005 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield || 2006 ((TCP_SKB_CB(tail)->tcp_flags | 2007 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) || 2008 !((TCP_SKB_CB(tail)->tcp_flags & 2009 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) || 2010 ((TCP_SKB_CB(tail)->tcp_flags ^ 2011 TCP_SKB_CB(skb)->tcp_flags) & 2012 (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE)) || 2013 !tcp_skb_can_collapse_rx(tail, skb) || 2014 thtail->doff != th->doff || 2015 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)) || 2016 /* prior to PSP Rx policy check, retain exact PSP metadata */ 2017 psp_skb_coalesce_diff(tail, skb)) 2018 goto no_coalesce; 2019 2020 __skb_pull(skb, hdrlen); 2021 2022 shinfo = skb_shinfo(skb); 2023 gso_size = shinfo->gso_size ?: skb->len; 2024 gso_segs = shinfo->gso_segs ?: 1; 2025 2026 shinfo = skb_shinfo(tail); 2027 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen); 2028 tail_gso_segs = shinfo->gso_segs ?: 1; 2029 2030 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) { 2031 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq; 2032 2033 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) { 2034 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq; 2035 thtail->window = th->window; 2036 } 2037 2038 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and 2039 * thtail->fin, so that the fast path in tcp_rcv_established() 2040 * is not entered if we append a packet with a FIN. 2041 * SYN, RST, URG are not present. 2042 * ACK is set on both packets. 2043 * PSH : we do not really care in TCP stack, 2044 * at least for 'GRO' packets. 2045 */ 2046 thtail->fin |= th->fin; 2047 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; 2048 2049 if (TCP_SKB_CB(skb)->has_rxtstamp) { 2050 TCP_SKB_CB(tail)->has_rxtstamp = true; 2051 tail->tstamp = skb->tstamp; 2052 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp; 2053 } 2054 2055 /* Not as strict as GRO. We only need to carry mss max value */ 2056 shinfo->gso_size = max(gso_size, tail_gso_size); 2057 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF); 2058 2059 sk->sk_backlog.len += delta; 2060 __NET_INC_STATS(sock_net(sk), 2061 LINUX_MIB_TCPBACKLOGCOALESCE); 2062 kfree_skb_partial(skb, fragstolen); 2063 return false; 2064 } 2065 __skb_push(skb, hdrlen); 2066 2067 no_coalesce: 2068 /* sk->sk_backlog.len is reset only at the end of __release_sock(). 2069 * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach 2070 * sk_rcvbuf in normal conditions. 
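 * As a rough worked example (assuming default tcp_rmem[1] of about 128KB
 * and tcp_wmem[1] of about 16KB, before autotuning grows the buffers),
 * the limit computed below is roughly 2 * 128KB + 8KB + 64KB ~= 328KB.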
 */
	limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1;

	limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1;

	/* Only the socket owner can try to collapse/prune rx queues
	 * to reduce memory overhead, so add a little headroom here.
	 * Only a few socket backlogs are likely to be non-empty at the
	 * same time.
	 */
	limit += 64 * 1024;

	limit = min_t(u64, limit, UINT_MAX);

	err = sk_add_backlog(sk, skb, limit);
	if (unlikely(err)) {
		bh_unlock_sock(sk);
		if (err == -ENOMEM) {
			*reason = SKB_DROP_REASON_PFMEMALLOC;
			__NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP);
		} else {
			*reason = SKB_DROP_REASON_SOCKET_BACKLOG;
			__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
		}
		return true;
	}
	return false;
}
EXPORT_IPV6_MOD(tcp_add_backlog);

static void tcp_v4_restore_cb(struct sk_buff *skb)
{
	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
		sizeof(struct inet_skb_parm));
}

static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
			   const struct tcphdr *th)
{
	/* This is tricky : We move IPCB at its correct location into
	 * TCP_SKB_CB(). barrier() makes sure the compiler won't play
	 * fool^Waliasing games.
	 */
	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
		sizeof(struct inet_skb_parm));
	barrier();

	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->tcp_flags = tcp_flags_ntohs(th);
	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
	TCP_SKB_CB(skb)->sacked = 0;
	TCP_SKB_CB(skb)->has_rxtstamp =
			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
}

/*
 *	From tcp_input.c
 */

int tcp_v4_rcv(struct sk_buff *skb)
{
	struct net *net = dev_net_rcu(skb->dev);
	enum skb_drop_reason drop_reason;
	enum tcp_tw_status tw_status;
	int sdif = inet_sdif(skb);
	int dif = inet_iif(skb);
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk = NULL;
	bool refcounted;
	int ret;
	u32 isn;

	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	__TCP_INC_STATS(net, TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = (const struct tcphdr *)skb->data;

	if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
		drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
		goto bad_packet;
	}
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks.
*/ 2168 2169 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo)) 2170 goto csum_error; 2171 2172 th = (const struct tcphdr *)skb->data; 2173 iph = ip_hdr(skb); 2174 lookup: 2175 sk = __inet_lookup_skb(skb, __tcp_hdrlen(th), th->source, 2176 th->dest, sdif, &refcounted); 2177 if (!sk) 2178 goto no_tcp_socket; 2179 2180 if (sk->sk_state == TCP_TIME_WAIT) 2181 goto do_time_wait; 2182 2183 if (sk->sk_state == TCP_NEW_SYN_RECV) { 2184 struct request_sock *req = inet_reqsk(sk); 2185 bool req_stolen = false; 2186 struct sock *nsk; 2187 2188 sk = req->rsk_listener; 2189 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) 2190 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2191 else 2192 drop_reason = tcp_inbound_hash(sk, req, skb, 2193 &iph->saddr, &iph->daddr, 2194 AF_INET, dif, sdif); 2195 if (unlikely(drop_reason)) { 2196 sk_drops_skbadd(sk, skb); 2197 reqsk_put(req); 2198 goto discard_it; 2199 } 2200 if (tcp_checksum_complete(skb)) { 2201 reqsk_put(req); 2202 goto csum_error; 2203 } 2204 if (unlikely(sk->sk_state != TCP_LISTEN)) { 2205 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb); 2206 if (!nsk) { 2207 inet_csk_reqsk_queue_drop_and_put(sk, req); 2208 goto lookup; 2209 } 2210 sk = nsk; 2211 /* reuseport_migrate_sock() has already held one sk_refcnt 2212 * before returning. 2213 */ 2214 } else { 2215 /* We own a reference on the listener, increase it again 2216 * as we might lose it too soon. 2217 */ 2218 sock_hold(sk); 2219 } 2220 refcounted = true; 2221 nsk = NULL; 2222 if (!tcp_filter(sk, skb, &drop_reason)) { 2223 th = (const struct tcphdr *)skb->data; 2224 iph = ip_hdr(skb); 2225 tcp_v4_fill_cb(skb, iph, th); 2226 nsk = tcp_check_req(sk, skb, req, false, &req_stolen, 2227 &drop_reason); 2228 } 2229 if (!nsk) { 2230 reqsk_put(req); 2231 if (req_stolen) { 2232 /* Another cpu got exclusive access to req 2233 * and created a full blown socket. 2234 * Try to feed this packet to this socket 2235 * instead of discarding it. 
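 * Before retrying the lookup, the IP control block is restored with
 * tcp_v4_restore_cb() and the reference on the listener is dropped
 * with sock_put().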
2236 */ 2237 tcp_v4_restore_cb(skb); 2238 sock_put(sk); 2239 goto lookup; 2240 } 2241 goto discard_and_relse; 2242 } 2243 nf_reset_ct(skb); 2244 if (nsk == sk) { 2245 reqsk_put(req); 2246 tcp_v4_restore_cb(skb); 2247 } else { 2248 drop_reason = tcp_child_process(sk, nsk, skb); 2249 if (drop_reason) { 2250 enum sk_rst_reason rst_reason; 2251 2252 rst_reason = sk_rst_convert_drop_reason(drop_reason); 2253 tcp_v4_send_reset(nsk, skb, rst_reason); 2254 goto discard_and_relse; 2255 } 2256 sock_put(sk); 2257 return 0; 2258 } 2259 } 2260 2261 process: 2262 if (static_branch_unlikely(&ip4_min_ttl)) { 2263 /* min_ttl can be changed concurrently from do_ip_setsockopt() */ 2264 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) { 2265 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 2266 drop_reason = SKB_DROP_REASON_TCP_MINTTL; 2267 goto discard_and_relse; 2268 } 2269 } 2270 2271 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { 2272 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2273 goto discard_and_relse; 2274 } 2275 2276 drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr, 2277 AF_INET, dif, sdif); 2278 if (drop_reason) 2279 goto discard_and_relse; 2280 2281 nf_reset_ct(skb); 2282 2283 if (tcp_filter(sk, skb, &drop_reason)) 2284 goto discard_and_relse; 2285 2286 th = (const struct tcphdr *)skb->data; 2287 iph = ip_hdr(skb); 2288 tcp_v4_fill_cb(skb, iph, th); 2289 2290 skb->dev = NULL; 2291 2292 if (sk->sk_state == TCP_LISTEN) { 2293 ret = tcp_v4_do_rcv(sk, skb); 2294 goto put_and_return; 2295 } 2296 2297 sk_incoming_cpu_update(sk); 2298 2299 bh_lock_sock_nested(sk); 2300 tcp_segs_in(tcp_sk(sk), skb); 2301 ret = 0; 2302 if (!sock_owned_by_user(sk)) { 2303 ret = tcp_v4_do_rcv(sk, skb); 2304 } else { 2305 if (tcp_add_backlog(sk, skb, &drop_reason)) 2306 goto discard_and_relse; 2307 } 2308 bh_unlock_sock(sk); 2309 2310 put_and_return: 2311 if (refcounted) 2312 sock_put(sk); 2313 2314 return ret; 2315 2316 no_tcp_socket: 2317 drop_reason = SKB_DROP_REASON_NO_SOCKET; 2318 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) 2319 goto discard_it; 2320 2321 tcp_v4_fill_cb(skb, iph, th); 2322 2323 if (tcp_checksum_complete(skb)) { 2324 csum_error: 2325 drop_reason = SKB_DROP_REASON_TCP_CSUM; 2326 trace_tcp_bad_csum(skb); 2327 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); 2328 bad_packet: 2329 __TCP_INC_STATS(net, TCP_MIB_INERRS); 2330 } else { 2331 tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason)); 2332 } 2333 2334 discard_it: 2335 SKB_DR_OR(drop_reason, NOT_SPECIFIED); 2336 /* Discard frame. 
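 * The drop reason collected above is handed to sk_skb_reason_drop(),
 * so it is visible to kfree_skb tracepoints and drop monitoring.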
*/ 2337 sk_skb_reason_drop(sk, skb, drop_reason); 2338 return 0; 2339 2340 discard_and_relse: 2341 sk_drops_skbadd(sk, skb); 2342 if (refcounted) 2343 sock_put(sk); 2344 goto discard_it; 2345 2346 do_time_wait: 2347 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { 2348 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2349 inet_twsk_put(inet_twsk(sk)); 2350 goto discard_it; 2351 } 2352 2353 tcp_v4_fill_cb(skb, iph, th); 2354 2355 if (tcp_checksum_complete(skb)) { 2356 inet_twsk_put(inet_twsk(sk)); 2357 goto csum_error; 2358 } 2359 2360 tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn, 2361 &drop_reason); 2362 switch (tw_status) { 2363 case TCP_TW_SYN: { 2364 struct sock *sk2 = inet_lookup_listener(net, skb, __tcp_hdrlen(th), 2365 iph->saddr, th->source, 2366 iph->daddr, th->dest, 2367 inet_iif(skb), 2368 sdif); 2369 if (sk2) { 2370 inet_twsk_deschedule_put(inet_twsk(sk)); 2371 sk = sk2; 2372 tcp_v4_restore_cb(skb); 2373 refcounted = false; 2374 __this_cpu_write(tcp_tw_isn, isn); 2375 goto process; 2376 } 2377 2378 drop_reason = psp_twsk_rx_policy_check(inet_twsk(sk), skb); 2379 if (drop_reason) 2380 break; 2381 } 2382 /* to ACK */ 2383 fallthrough; 2384 case TCP_TW_ACK: 2385 case TCP_TW_ACK_OOW: 2386 tcp_v4_timewait_ack(sk, skb, tw_status); 2387 break; 2388 case TCP_TW_RST: 2389 tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET); 2390 inet_twsk_deschedule_put(inet_twsk(sk)); 2391 goto discard_it; 2392 case TCP_TW_SUCCESS:; 2393 } 2394 goto discard_it; 2395 } 2396 2397 static struct timewait_sock_ops tcp_timewait_sock_ops = { 2398 .twsk_obj_size = sizeof(struct tcp_timewait_sock), 2399 }; 2400 2401 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) 2402 { 2403 struct dst_entry *dst = skb_dst(skb); 2404 2405 if (dst && dst_hold_safe(dst)) { 2406 rcu_assign_pointer(sk->sk_rx_dst, dst); 2407 sk->sk_rx_dst_ifindex = skb->skb_iif; 2408 } 2409 } 2410 EXPORT_IPV6_MOD(inet_sk_rx_dst_set); 2411 2412 const struct inet_connection_sock_af_ops ipv4_specific = { 2413 .queue_xmit = ip_queue_xmit, 2414 .rebuild_header = inet_sk_rebuild_header, 2415 .sk_rx_dst_set = inet_sk_rx_dst_set, 2416 .conn_request = tcp_v4_conn_request, 2417 .syn_recv_sock = tcp_v4_syn_recv_sock, 2418 .net_header_len = sizeof(struct iphdr), 2419 .setsockopt = ip_setsockopt, 2420 .getsockopt = ip_getsockopt, 2421 .mtu_reduced = tcp_v4_mtu_reduced, 2422 }; 2423 EXPORT_IPV6_MOD(ipv4_specific); 2424 2425 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) 2426 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { 2427 #ifdef CONFIG_TCP_MD5SIG 2428 .md5_lookup = tcp_v4_md5_lookup, 2429 .calc_md5_hash = tcp_v4_md5_hash_skb, 2430 .md5_parse = tcp_v4_parse_md5_keys, 2431 #endif 2432 #ifdef CONFIG_TCP_AO 2433 .ao_lookup = tcp_v4_ao_lookup, 2434 .calc_ao_hash = tcp_v4_ao_hash_skb, 2435 .ao_parse = tcp_v4_parse_ao, 2436 .ao_calc_key_sk = tcp_v4_ao_calc_key_sk, 2437 #endif 2438 }; 2439 2440 static void tcp4_destruct_sock(struct sock *sk) 2441 { 2442 tcp_md5_destruct_sock(sk); 2443 tcp_ao_destroy_sock(sk, false); 2444 inet_sock_destruct(sk); 2445 } 2446 #endif 2447 2448 /* NOTE: A lot of things set to zero explicitly by call to 2449 * sk_alloc() so need not be done here. 
2450 */ 2451 static int tcp_v4_init_sock(struct sock *sk) 2452 { 2453 struct inet_connection_sock *icsk = inet_csk(sk); 2454 2455 tcp_init_sock(sk); 2456 2457 icsk->icsk_af_ops = &ipv4_specific; 2458 2459 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) 2460 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific; 2461 sk->sk_destruct = tcp4_destruct_sock; 2462 #endif 2463 2464 return 0; 2465 } 2466 2467 static void tcp_release_user_frags(struct sock *sk) 2468 { 2469 #ifdef CONFIG_PAGE_POOL 2470 unsigned long index; 2471 void *netmem; 2472 2473 xa_for_each(&sk->sk_user_frags, index, netmem) 2474 WARN_ON_ONCE(!napi_pp_put_page((__force netmem_ref)netmem)); 2475 #endif 2476 } 2477 2478 void tcp_v4_destroy_sock(struct sock *sk) 2479 { 2480 struct tcp_sock *tp = tcp_sk(sk); 2481 2482 tcp_release_user_frags(sk); 2483 2484 xa_destroy(&sk->sk_user_frags); 2485 2486 trace_tcp_destroy_sock(sk); 2487 2488 tcp_clear_xmit_timers(sk); 2489 2490 tcp_cleanup_congestion_control(sk); 2491 2492 tcp_cleanup_ulp(sk); 2493 2494 /* Cleanup up the write buffer. */ 2495 tcp_write_queue_purge(sk); 2496 2497 /* Check if we want to disable active TFO */ 2498 tcp_fastopen_active_disable_ofo_check(sk); 2499 2500 /* Cleans up our, hopefully empty, out_of_order_queue. */ 2501 skb_rbtree_purge(&tp->out_of_order_queue); 2502 2503 /* Clean up a referenced TCP bind bucket. */ 2504 if (inet_csk(sk)->icsk_bind_hash) 2505 inet_put_port(sk); 2506 2507 BUG_ON(rcu_access_pointer(tp->fastopen_rsk)); 2508 2509 /* If socket is aborted during connect operation */ 2510 tcp_free_fastopen_req(tp); 2511 tcp_fastopen_destroy_cipher(sk); 2512 tcp_saved_syn_free(tp); 2513 2514 sk_sockets_allocated_dec(sk); 2515 } 2516 EXPORT_IPV6_MOD(tcp_v4_destroy_sock); 2517 2518 #ifdef CONFIG_PROC_FS 2519 /* Proc filesystem TCP sock list dumping. */ 2520 2521 static unsigned short seq_file_family(const struct seq_file *seq); 2522 2523 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk) 2524 { 2525 unsigned short family = seq_file_family(seq); 2526 2527 /* AF_UNSPEC is used as a match all */ 2528 return ((family == AF_UNSPEC || family == sk->sk_family) && 2529 net_eq(sock_net(sk), seq_file_net(seq))); 2530 } 2531 2532 /* Find a non empty bucket (starting from st->bucket) 2533 * and return the first sk from it. 2534 */ 2535 static void *listening_get_first(struct seq_file *seq) 2536 { 2537 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2538 struct tcp_iter_state *st = seq->private; 2539 2540 st->offset = 0; 2541 for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) { 2542 struct inet_listen_hashbucket *ilb2; 2543 struct hlist_nulls_node *node; 2544 struct sock *sk; 2545 2546 ilb2 = &hinfo->lhash2[st->bucket]; 2547 if (hlist_nulls_empty(&ilb2->nulls_head)) 2548 continue; 2549 2550 spin_lock(&ilb2->lock); 2551 sk_nulls_for_each(sk, node, &ilb2->nulls_head) { 2552 if (seq_sk_match(seq, sk)) 2553 return sk; 2554 } 2555 spin_unlock(&ilb2->lock); 2556 } 2557 2558 return NULL; 2559 } 2560 2561 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket). 2562 * If "cur" is the last one in the st->bucket, 2563 * call listening_get_first() to return the first sk of the next 2564 * non empty bucket. 
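 * The current bucket's lock is released before moving on, and
 * listening_get_first() takes the lock of the next non-empty bucket.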
2565 */ 2566 static void *listening_get_next(struct seq_file *seq, void *cur) 2567 { 2568 struct tcp_iter_state *st = seq->private; 2569 struct inet_listen_hashbucket *ilb2; 2570 struct hlist_nulls_node *node; 2571 struct inet_hashinfo *hinfo; 2572 struct sock *sk = cur; 2573 2574 ++st->num; 2575 ++st->offset; 2576 2577 sk = sk_nulls_next(sk); 2578 sk_nulls_for_each_from(sk, node) { 2579 if (seq_sk_match(seq, sk)) 2580 return sk; 2581 } 2582 2583 hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2584 ilb2 = &hinfo->lhash2[st->bucket]; 2585 spin_unlock(&ilb2->lock); 2586 ++st->bucket; 2587 return listening_get_first(seq); 2588 } 2589 2590 static void *listening_get_idx(struct seq_file *seq, loff_t *pos) 2591 { 2592 struct tcp_iter_state *st = seq->private; 2593 void *rc; 2594 2595 st->bucket = 0; 2596 st->offset = 0; 2597 rc = listening_get_first(seq); 2598 2599 while (rc && *pos) { 2600 rc = listening_get_next(seq, rc); 2601 --*pos; 2602 } 2603 return rc; 2604 } 2605 2606 static inline bool empty_bucket(struct inet_hashinfo *hinfo, 2607 const struct tcp_iter_state *st) 2608 { 2609 return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain); 2610 } 2611 2612 /* 2613 * Get first established socket starting from bucket given in st->bucket. 2614 * If st->bucket is zero, the very first socket in the hash is returned. 2615 */ 2616 static void *established_get_first(struct seq_file *seq) 2617 { 2618 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2619 struct tcp_iter_state *st = seq->private; 2620 2621 st->offset = 0; 2622 for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) { 2623 struct sock *sk; 2624 struct hlist_nulls_node *node; 2625 spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket); 2626 2627 cond_resched(); 2628 2629 /* Lockless fast path for the common case of empty buckets */ 2630 if (empty_bucket(hinfo, st)) 2631 continue; 2632 2633 spin_lock_bh(lock); 2634 sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) { 2635 if (seq_sk_match(seq, sk)) 2636 return sk; 2637 } 2638 spin_unlock_bh(lock); 2639 } 2640 2641 return NULL; 2642 } 2643 2644 static void *established_get_next(struct seq_file *seq, void *cur) 2645 { 2646 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2647 struct tcp_iter_state *st = seq->private; 2648 struct hlist_nulls_node *node; 2649 struct sock *sk = cur; 2650 2651 ++st->num; 2652 ++st->offset; 2653 2654 sk = sk_nulls_next(sk); 2655 2656 sk_nulls_for_each_from(sk, node) { 2657 if (seq_sk_match(seq, sk)) 2658 return sk; 2659 } 2660 2661 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 2662 ++st->bucket; 2663 return established_get_first(seq); 2664 } 2665 2666 static void *established_get_idx(struct seq_file *seq, loff_t pos) 2667 { 2668 struct tcp_iter_state *st = seq->private; 2669 void *rc; 2670 2671 st->bucket = 0; 2672 rc = established_get_first(seq); 2673 2674 while (rc && pos) { 2675 rc = established_get_next(seq, rc); 2676 --pos; 2677 } 2678 return rc; 2679 } 2680 2681 static void *tcp_get_idx(struct seq_file *seq, loff_t pos) 2682 { 2683 void *rc; 2684 struct tcp_iter_state *st = seq->private; 2685 2686 st->state = TCP_SEQ_STATE_LISTENING; 2687 rc = listening_get_idx(seq, &pos); 2688 2689 if (!rc) { 2690 st->state = TCP_SEQ_STATE_ESTABLISHED; 2691 rc = established_get_idx(seq, pos); 2692 } 2693 2694 return rc; 2695 } 2696 2697 static void *tcp_seek_last_pos(struct seq_file *seq) 2698 { 2699 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2700 struct 
tcp_iter_state *st = seq->private; 2701 int bucket = st->bucket; 2702 int offset = st->offset; 2703 int orig_num = st->num; 2704 void *rc = NULL; 2705 2706 switch (st->state) { 2707 case TCP_SEQ_STATE_LISTENING: 2708 if (st->bucket > hinfo->lhash2_mask) 2709 break; 2710 rc = listening_get_first(seq); 2711 while (offset-- && rc && bucket == st->bucket) 2712 rc = listening_get_next(seq, rc); 2713 if (rc) 2714 break; 2715 st->bucket = 0; 2716 st->state = TCP_SEQ_STATE_ESTABLISHED; 2717 fallthrough; 2718 case TCP_SEQ_STATE_ESTABLISHED: 2719 if (st->bucket > hinfo->ehash_mask) 2720 break; 2721 rc = established_get_first(seq); 2722 while (offset-- && rc && bucket == st->bucket) 2723 rc = established_get_next(seq, rc); 2724 } 2725 2726 st->num = orig_num; 2727 2728 return rc; 2729 } 2730 2731 void *tcp_seq_start(struct seq_file *seq, loff_t *pos) 2732 { 2733 struct tcp_iter_state *st = seq->private; 2734 void *rc; 2735 2736 if (*pos && *pos == st->last_pos) { 2737 rc = tcp_seek_last_pos(seq); 2738 if (rc) 2739 goto out; 2740 } 2741 2742 st->state = TCP_SEQ_STATE_LISTENING; 2743 st->num = 0; 2744 st->bucket = 0; 2745 st->offset = 0; 2746 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; 2747 2748 out: 2749 st->last_pos = *pos; 2750 return rc; 2751 } 2752 EXPORT_IPV6_MOD(tcp_seq_start); 2753 2754 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2755 { 2756 struct tcp_iter_state *st = seq->private; 2757 void *rc = NULL; 2758 2759 if (v == SEQ_START_TOKEN) { 2760 rc = tcp_get_idx(seq, 0); 2761 goto out; 2762 } 2763 2764 switch (st->state) { 2765 case TCP_SEQ_STATE_LISTENING: 2766 rc = listening_get_next(seq, v); 2767 if (!rc) { 2768 st->state = TCP_SEQ_STATE_ESTABLISHED; 2769 st->bucket = 0; 2770 st->offset = 0; 2771 rc = established_get_first(seq); 2772 } 2773 break; 2774 case TCP_SEQ_STATE_ESTABLISHED: 2775 rc = established_get_next(seq, v); 2776 break; 2777 } 2778 out: 2779 ++*pos; 2780 st->last_pos = *pos; 2781 return rc; 2782 } 2783 EXPORT_IPV6_MOD(tcp_seq_next); 2784 2785 void tcp_seq_stop(struct seq_file *seq, void *v) 2786 { 2787 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2788 struct tcp_iter_state *st = seq->private; 2789 2790 switch (st->state) { 2791 case TCP_SEQ_STATE_LISTENING: 2792 if (v != SEQ_START_TOKEN) 2793 spin_unlock(&hinfo->lhash2[st->bucket].lock); 2794 break; 2795 case TCP_SEQ_STATE_ESTABLISHED: 2796 if (v) 2797 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 2798 break; 2799 } 2800 } 2801 EXPORT_IPV6_MOD(tcp_seq_stop); 2802 2803 static void get_openreq4(const struct request_sock *req, 2804 struct seq_file *f, int i) 2805 { 2806 const struct inet_request_sock *ireq = inet_rsk(req); 2807 long delta = req->rsk_timer.expires - jiffies; 2808 2809 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2810 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK", 2811 i, 2812 ireq->ir_loc_addr, 2813 ireq->ir_num, 2814 ireq->ir_rmt_addr, 2815 ntohs(ireq->ir_rmt_port), 2816 TCP_SYN_RECV, 2817 0, 0, /* could print option size, but that is af dependent. 
*/ 2818 1, /* timers active (only the expire timer) */ 2819 jiffies_delta_to_clock_t(delta), 2820 req->num_timeout, 2821 from_kuid_munged(seq_user_ns(f), 2822 sk_uid(req->rsk_listener)), 2823 0, /* non standard timer */ 2824 0, /* open_requests have no inode */ 2825 0, 2826 req); 2827 } 2828 2829 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) 2830 { 2831 int timer_active; 2832 unsigned long timer_expires; 2833 const struct tcp_sock *tp = tcp_sk(sk); 2834 const struct inet_connection_sock *icsk = inet_csk(sk); 2835 const struct inet_sock *inet = inet_sk(sk); 2836 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; 2837 __be32 dest = inet->inet_daddr; 2838 __be32 src = inet->inet_rcv_saddr; 2839 __u16 destp = ntohs(inet->inet_dport); 2840 __u16 srcp = ntohs(inet->inet_sport); 2841 u8 icsk_pending; 2842 int rx_queue; 2843 int state; 2844 2845 icsk_pending = smp_load_acquire(&icsk->icsk_pending); 2846 if (icsk_pending == ICSK_TIME_RETRANS || 2847 icsk_pending == ICSK_TIME_REO_TIMEOUT || 2848 icsk_pending == ICSK_TIME_LOSS_PROBE) { 2849 timer_active = 1; 2850 timer_expires = tcp_timeout_expires(sk); 2851 } else if (icsk_pending == ICSK_TIME_PROBE0) { 2852 timer_active = 4; 2853 timer_expires = tcp_timeout_expires(sk); 2854 } else if (timer_pending(&icsk->icsk_keepalive_timer)) { 2855 timer_active = 2; 2856 timer_expires = icsk->icsk_keepalive_timer.expires; 2857 } else { 2858 timer_active = 0; 2859 timer_expires = jiffies; 2860 } 2861 2862 state = inet_sk_state_load(sk); 2863 if (state == TCP_LISTEN) 2864 rx_queue = READ_ONCE(sk->sk_ack_backlog); 2865 else 2866 /* Because we don't lock the socket, 2867 * we might find a transient negative value. 2868 */ 2869 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) - 2870 READ_ONCE(tp->copied_seq), 0); 2871 2872 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " 2873 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d", 2874 i, src, srcp, dest, destp, state, 2875 READ_ONCE(tp->write_seq) - tp->snd_una, 2876 rx_queue, 2877 timer_active, 2878 jiffies_delta_to_clock_t(timer_expires - jiffies), 2879 READ_ONCE(icsk->icsk_retransmits), 2880 from_kuid_munged(seq_user_ns(f), sk_uid(sk)), 2881 READ_ONCE(icsk->icsk_probes_out), 2882 sock_i_ino(sk), 2883 refcount_read(&sk->sk_refcnt), sk, 2884 jiffies_to_clock_t(icsk->icsk_rto), 2885 jiffies_to_clock_t(icsk->icsk_ack.ato), 2886 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk), 2887 tcp_snd_cwnd(tp), 2888 state == TCP_LISTEN ? 2889 fastopenq->max_qlen : 2890 (tcp_in_initial_slowstart(tp) ? 
-1 : tp->snd_ssthresh)); 2891 } 2892 2893 static void get_timewait4_sock(const struct inet_timewait_sock *tw, 2894 struct seq_file *f, int i) 2895 { 2896 long delta = tw->tw_timer.expires - jiffies; 2897 __be32 dest, src; 2898 __u16 destp, srcp; 2899 2900 dest = tw->tw_daddr; 2901 src = tw->tw_rcv_saddr; 2902 destp = ntohs(tw->tw_dport); 2903 srcp = ntohs(tw->tw_sport); 2904 2905 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2906 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK", 2907 i, src, srcp, dest, destp, READ_ONCE(tw->tw_substate), 0, 0, 2908 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, 2909 refcount_read(&tw->tw_refcnt), tw); 2910 } 2911 2912 #define TMPSZ 150 2913 2914 static int tcp4_seq_show(struct seq_file *seq, void *v) 2915 { 2916 struct tcp_iter_state *st; 2917 struct sock *sk = v; 2918 2919 seq_setwidth(seq, TMPSZ - 1); 2920 if (v == SEQ_START_TOKEN) { 2921 seq_puts(seq, " sl local_address rem_address st tx_queue " 2922 "rx_queue tr tm->when retrnsmt uid timeout " 2923 "inode"); 2924 goto out; 2925 } 2926 st = seq->private; 2927 2928 if (sk->sk_state == TCP_TIME_WAIT) 2929 get_timewait4_sock(v, seq, st->num); 2930 else if (sk->sk_state == TCP_NEW_SYN_RECV) 2931 get_openreq4(v, seq, st->num); 2932 else 2933 get_tcp4_sock(v, seq, st->num); 2934 out: 2935 seq_pad(seq, '\n'); 2936 return 0; 2937 } 2938 2939 #ifdef CONFIG_BPF_SYSCALL 2940 union bpf_tcp_iter_batch_item { 2941 struct sock *sk; 2942 __u64 cookie; 2943 }; 2944 2945 struct bpf_tcp_iter_state { 2946 struct tcp_iter_state state; 2947 unsigned int cur_sk; 2948 unsigned int end_sk; 2949 unsigned int max_sk; 2950 union bpf_tcp_iter_batch_item *batch; 2951 }; 2952 2953 struct bpf_iter__tcp { 2954 __bpf_md_ptr(struct bpf_iter_meta *, meta); 2955 __bpf_md_ptr(struct sock_common *, sk_common); 2956 uid_t uid __aligned(8); 2957 }; 2958 2959 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, 2960 struct sock_common *sk_common, uid_t uid) 2961 { 2962 struct bpf_iter__tcp ctx; 2963 2964 meta->seq_num--; /* skip SEQ_START_TOKEN */ 2965 ctx.meta = meta; 2966 ctx.sk_common = sk_common; 2967 ctx.uid = uid; 2968 return bpf_iter_run_prog(prog, &ctx); 2969 } 2970 2971 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter) 2972 { 2973 union bpf_tcp_iter_batch_item *item; 2974 unsigned int cur_sk = iter->cur_sk; 2975 __u64 cookie; 2976 2977 /* Remember the cookies of the sockets we haven't seen yet, so we can 2978 * pick up where we left off next time around. 
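 * The cookies come from sock_gen_cookie() and are matched back to
 * sockets via sk->sk_cookie in bpf_iter_tcp_resume_bucket().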
2979 */ 2980 while (cur_sk < iter->end_sk) { 2981 item = &iter->batch[cur_sk++]; 2982 cookie = sock_gen_cookie(item->sk); 2983 sock_gen_put(item->sk); 2984 item->cookie = cookie; 2985 } 2986 } 2987 2988 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter, 2989 unsigned int new_batch_sz, gfp_t flags) 2990 { 2991 union bpf_tcp_iter_batch_item *new_batch; 2992 2993 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, 2994 flags | __GFP_NOWARN); 2995 if (!new_batch) 2996 return -ENOMEM; 2997 2998 memcpy(new_batch, iter->batch, sizeof(*iter->batch) * iter->end_sk); 2999 kvfree(iter->batch); 3000 iter->batch = new_batch; 3001 iter->max_sk = new_batch_sz; 3002 3003 return 0; 3004 } 3005 3006 static struct sock *bpf_iter_tcp_resume_bucket(struct sock *first_sk, 3007 union bpf_tcp_iter_batch_item *cookies, 3008 int n_cookies) 3009 { 3010 struct hlist_nulls_node *node; 3011 struct sock *sk; 3012 int i; 3013 3014 for (i = 0; i < n_cookies; i++) { 3015 sk = first_sk; 3016 sk_nulls_for_each_from(sk, node) 3017 if (cookies[i].cookie == atomic64_read(&sk->sk_cookie)) 3018 return sk; 3019 } 3020 3021 return NULL; 3022 } 3023 3024 static struct sock *bpf_iter_tcp_resume_listening(struct seq_file *seq) 3025 { 3026 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 3027 struct bpf_tcp_iter_state *iter = seq->private; 3028 struct tcp_iter_state *st = &iter->state; 3029 unsigned int find_cookie = iter->cur_sk; 3030 unsigned int end_cookie = iter->end_sk; 3031 int resume_bucket = st->bucket; 3032 struct sock *sk; 3033 3034 if (end_cookie && find_cookie == end_cookie) 3035 ++st->bucket; 3036 3037 sk = listening_get_first(seq); 3038 iter->cur_sk = 0; 3039 iter->end_sk = 0; 3040 3041 if (sk && st->bucket == resume_bucket && end_cookie) { 3042 sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie], 3043 end_cookie - find_cookie); 3044 if (!sk) { 3045 spin_unlock(&hinfo->lhash2[st->bucket].lock); 3046 ++st->bucket; 3047 sk = listening_get_first(seq); 3048 } 3049 } 3050 3051 return sk; 3052 } 3053 3054 static struct sock *bpf_iter_tcp_resume_established(struct seq_file *seq) 3055 { 3056 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 3057 struct bpf_tcp_iter_state *iter = seq->private; 3058 struct tcp_iter_state *st = &iter->state; 3059 unsigned int find_cookie = iter->cur_sk; 3060 unsigned int end_cookie = iter->end_sk; 3061 int resume_bucket = st->bucket; 3062 struct sock *sk; 3063 3064 if (end_cookie && find_cookie == end_cookie) 3065 ++st->bucket; 3066 3067 sk = established_get_first(seq); 3068 iter->cur_sk = 0; 3069 iter->end_sk = 0; 3070 3071 if (sk && st->bucket == resume_bucket && end_cookie) { 3072 sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie], 3073 end_cookie - find_cookie); 3074 if (!sk) { 3075 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 3076 ++st->bucket; 3077 sk = established_get_first(seq); 3078 } 3079 } 3080 3081 return sk; 3082 } 3083 3084 static struct sock *bpf_iter_tcp_resume(struct seq_file *seq) 3085 { 3086 struct bpf_tcp_iter_state *iter = seq->private; 3087 struct tcp_iter_state *st = &iter->state; 3088 struct sock *sk = NULL; 3089 3090 switch (st->state) { 3091 case TCP_SEQ_STATE_LISTENING: 3092 sk = bpf_iter_tcp_resume_listening(seq); 3093 if (sk) 3094 break; 3095 st->bucket = 0; 3096 st->state = TCP_SEQ_STATE_ESTABLISHED; 3097 fallthrough; 3098 case TCP_SEQ_STATE_ESTABLISHED: 3099 sk = bpf_iter_tcp_resume_established(seq); 3100 break; 3101 } 3102 3103 return sk; 3104 } 3105 3106 
static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq, 3107 struct sock **start_sk) 3108 { 3109 struct bpf_tcp_iter_state *iter = seq->private; 3110 struct hlist_nulls_node *node; 3111 unsigned int expected = 1; 3112 struct sock *sk; 3113 3114 sock_hold(*start_sk); 3115 iter->batch[iter->end_sk++].sk = *start_sk; 3116 3117 sk = sk_nulls_next(*start_sk); 3118 *start_sk = NULL; 3119 sk_nulls_for_each_from(sk, node) { 3120 if (seq_sk_match(seq, sk)) { 3121 if (iter->end_sk < iter->max_sk) { 3122 sock_hold(sk); 3123 iter->batch[iter->end_sk++].sk = sk; 3124 } else if (!*start_sk) { 3125 /* Remember where we left off. */ 3126 *start_sk = sk; 3127 } 3128 expected++; 3129 } 3130 } 3131 3132 return expected; 3133 } 3134 3135 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq, 3136 struct sock **start_sk) 3137 { 3138 struct bpf_tcp_iter_state *iter = seq->private; 3139 struct hlist_nulls_node *node; 3140 unsigned int expected = 1; 3141 struct sock *sk; 3142 3143 sock_hold(*start_sk); 3144 iter->batch[iter->end_sk++].sk = *start_sk; 3145 3146 sk = sk_nulls_next(*start_sk); 3147 *start_sk = NULL; 3148 sk_nulls_for_each_from(sk, node) { 3149 if (seq_sk_match(seq, sk)) { 3150 if (iter->end_sk < iter->max_sk) { 3151 sock_hold(sk); 3152 iter->batch[iter->end_sk++].sk = sk; 3153 } else if (!*start_sk) { 3154 /* Remember where we left off. */ 3155 *start_sk = sk; 3156 } 3157 expected++; 3158 } 3159 } 3160 3161 return expected; 3162 } 3163 3164 static unsigned int bpf_iter_fill_batch(struct seq_file *seq, 3165 struct sock **start_sk) 3166 { 3167 struct bpf_tcp_iter_state *iter = seq->private; 3168 struct tcp_iter_state *st = &iter->state; 3169 3170 if (st->state == TCP_SEQ_STATE_LISTENING) 3171 return bpf_iter_tcp_listening_batch(seq, start_sk); 3172 else 3173 return bpf_iter_tcp_established_batch(seq, start_sk); 3174 } 3175 3176 static void bpf_iter_tcp_unlock_bucket(struct seq_file *seq) 3177 { 3178 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 3179 struct bpf_tcp_iter_state *iter = seq->private; 3180 struct tcp_iter_state *st = &iter->state; 3181 3182 if (st->state == TCP_SEQ_STATE_LISTENING) 3183 spin_unlock(&hinfo->lhash2[st->bucket].lock); 3184 else 3185 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 3186 } 3187 3188 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq) 3189 { 3190 struct bpf_tcp_iter_state *iter = seq->private; 3191 unsigned int expected; 3192 struct sock *sk; 3193 int err; 3194 3195 sk = bpf_iter_tcp_resume(seq); 3196 if (!sk) 3197 return NULL; /* Done */ 3198 3199 expected = bpf_iter_fill_batch(seq, &sk); 3200 if (likely(iter->end_sk == expected)) 3201 goto done; 3202 3203 /* Batch size was too small. */ 3204 bpf_iter_tcp_unlock_bucket(seq); 3205 bpf_iter_tcp_put_batch(iter); 3206 err = bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2, 3207 GFP_USER); 3208 if (err) 3209 return ERR_PTR(err); 3210 3211 sk = bpf_iter_tcp_resume(seq); 3212 if (!sk) 3213 return NULL; /* Done */ 3214 3215 expected = bpf_iter_fill_batch(seq, &sk); 3216 if (likely(iter->end_sk == expected)) 3217 goto done; 3218 3219 /* Batch size was still too small. Hold onto the lock while we try 3220 * again with a larger batch to make sure the current bucket's size 3221 * does not change in the meantime. 
 */
	err = bpf_iter_tcp_realloc_batch(iter, expected, GFP_NOWAIT);
	if (err) {
		bpf_iter_tcp_unlock_bucket(seq);
		return ERR_PTR(err);
	}

	expected = bpf_iter_fill_batch(seq, &sk);
	WARN_ON_ONCE(iter->end_sk != expected);
done:
	bpf_iter_tcp_unlock_bucket(seq);
	return iter->batch[0].sk;
}

static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	/* bpf iter does not support lseek, so it always
	 * continues from where it was stop()-ped.
	 */
	if (*pos)
		return bpf_iter_tcp_batch(seq);

	return SEQ_START_TOKEN;
}

static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	struct sock *sk;

	/* Whenever seq_next() is called, the iter->cur_sk is
	 * done with seq_show(), so advance to the next sk in
	 * the batch.
	 */
	if (iter->cur_sk < iter->end_sk) {
		/* Keeping st->num consistent in tcp_iter_state.
		 * bpf_iter_tcp does not use st->num.
		 * meta.seq_num is used instead.
		 */
		st->num++;
		sock_gen_put(iter->batch[iter->cur_sk++].sk);
	}

	if (iter->cur_sk < iter->end_sk)
		sk = iter->batch[iter->cur_sk].sk;
	else
		sk = bpf_iter_tcp_batch(seq);

	++*pos;
	/* Keeping st->last_pos consistent in tcp_iter_state.
	 * bpf iter does not do lseek, so st->last_pos always equals to *pos.
	 */
	st->last_pos = *pos;
	return sk;
}

static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
{
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;
	struct sock *sk = v;
	uid_t uid;
	int ret;

	if (v == SEQ_START_TOKEN)
		return 0;

	if (sk_fullsock(sk))
		lock_sock(sk);

	if (unlikely(sk_unhashed(sk))) {
		ret = SEQ_SKIP;
		goto unlock;
	}

	if (sk->sk_state == TCP_TIME_WAIT) {
		uid = 0;
	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
		const struct request_sock *req = v;

		uid = from_kuid_munged(seq_user_ns(seq),
				       sk_uid(req->rsk_listener));
	} else {
		uid = from_kuid_munged(seq_user_ns(seq), sk_uid(sk));
	}

	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, false);
	ret = tcp_prog_seq_show(prog, &meta, v, uid);

unlock:
	if (sk_fullsock(sk))
		release_sock(sk);
	return ret;
}

static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;

	if (!v) {
		meta.seq = seq;
		prog = bpf_iter_get_info(&meta, true);
		if (prog)
			(void)tcp_prog_seq_show(prog, &meta, v, 0);
	}

	if (iter->cur_sk < iter->end_sk)
		bpf_iter_tcp_put_batch(iter);
}

static const struct seq_operations bpf_iter_tcp_seq_ops = {
	.show		= bpf_iter_tcp_seq_show,
	.start		= bpf_iter_tcp_seq_start,
	.next		= bpf_iter_tcp_seq_next,
	.stop		= bpf_iter_tcp_seq_stop,
};
#endif
static unsigned short seq_file_family(const struct seq_file *seq)
{
	const struct tcp_seq_afinfo *afinfo;

#ifdef CONFIG_BPF_SYSCALL
	/* Iterated from bpf_iter. Let the bpf prog filter instead.
*/ 3350 if (seq->op == &bpf_iter_tcp_seq_ops) 3351 return AF_UNSPEC; 3352 #endif 3353 3354 /* Iterated from proc fs */ 3355 afinfo = pde_data(file_inode(seq->file)); 3356 return afinfo->family; 3357 } 3358 3359 static const struct seq_operations tcp4_seq_ops = { 3360 .show = tcp4_seq_show, 3361 .start = tcp_seq_start, 3362 .next = tcp_seq_next, 3363 .stop = tcp_seq_stop, 3364 }; 3365 3366 static struct tcp_seq_afinfo tcp4_seq_afinfo = { 3367 .family = AF_INET, 3368 }; 3369 3370 static int __net_init tcp4_proc_init_net(struct net *net) 3371 { 3372 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops, 3373 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo)) 3374 return -ENOMEM; 3375 return 0; 3376 } 3377 3378 static void __net_exit tcp4_proc_exit_net(struct net *net) 3379 { 3380 remove_proc_entry("tcp", net->proc_net); 3381 } 3382 3383 static struct pernet_operations tcp4_net_ops = { 3384 .init = tcp4_proc_init_net, 3385 .exit = tcp4_proc_exit_net, 3386 }; 3387 3388 int __init tcp4_proc_init(void) 3389 { 3390 return register_pernet_subsys(&tcp4_net_ops); 3391 } 3392 3393 void tcp4_proc_exit(void) 3394 { 3395 unregister_pernet_subsys(&tcp4_net_ops); 3396 } 3397 #endif /* CONFIG_PROC_FS */ 3398 3399 struct proto tcp_prot = { 3400 .name = "TCP", 3401 .owner = THIS_MODULE, 3402 .close = tcp_close, 3403 .pre_connect = tcp_v4_pre_connect, 3404 .connect = tcp_v4_connect, 3405 .disconnect = tcp_disconnect, 3406 .accept = inet_csk_accept, 3407 .ioctl = tcp_ioctl, 3408 .init = tcp_v4_init_sock, 3409 .destroy = tcp_v4_destroy_sock, 3410 .shutdown = tcp_shutdown, 3411 .setsockopt = tcp_setsockopt, 3412 .getsockopt = tcp_getsockopt, 3413 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt, 3414 .keepalive = tcp_set_keepalive, 3415 .recvmsg = tcp_recvmsg, 3416 .sendmsg = tcp_sendmsg, 3417 .splice_eof = tcp_splice_eof, 3418 .backlog_rcv = tcp_v4_do_rcv, 3419 .release_cb = tcp_release_cb, 3420 .hash = inet_hash, 3421 .unhash = inet_unhash, 3422 .get_port = inet_csk_get_port, 3423 .put_port = inet_put_port, 3424 #ifdef CONFIG_BPF_SYSCALL 3425 .psock_update_sk_prot = tcp_bpf_update_proto, 3426 #endif 3427 .enter_memory_pressure = tcp_enter_memory_pressure, 3428 .leave_memory_pressure = tcp_leave_memory_pressure, 3429 .stream_memory_free = tcp_stream_memory_free, 3430 .sockets_allocated = &tcp_sockets_allocated, 3431 3432 .memory_allocated = &net_aligned_data.tcp_memory_allocated, 3433 .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, 3434 3435 .memory_pressure = &tcp_memory_pressure, 3436 .sysctl_mem = sysctl_tcp_mem, 3437 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), 3438 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), 3439 .max_header = MAX_TCP_HEADER, 3440 .obj_size = sizeof(struct tcp_sock), 3441 .freeptr_offset = offsetof(struct tcp_sock, 3442 inet_conn.icsk_inet.sk.sk_freeptr), 3443 .slab_flags = SLAB_TYPESAFE_BY_RCU, 3444 .twsk_prot = &tcp_timewait_sock_ops, 3445 .rsk_prot = &tcp_request_sock_ops, 3446 .h.hashinfo = NULL, 3447 .no_autobind = true, 3448 .diag_destroy = tcp_abort, 3449 }; 3450 EXPORT_SYMBOL(tcp_prot); 3451 3452 static void __net_exit tcp_sk_exit(struct net *net) 3453 { 3454 if (net->ipv4.tcp_congestion_control) 3455 bpf_module_put(net->ipv4.tcp_congestion_control, 3456 net->ipv4.tcp_congestion_control->owner); 3457 } 3458 3459 static void __net_init tcp_set_hashinfo(struct net *net) 3460 { 3461 struct inet_hashinfo *hinfo; 3462 unsigned int ehash_entries; 3463 struct net *old_net; 3464 3465 if (net_eq(net, &init_net)) 3466 goto fallback; 
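	/* A child netns takes its ehash sizing hint from the netns that is
	 * creating it (current->nsproxy->net_ns below).
	 */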
3467 3468 old_net = current->nsproxy->net_ns; 3469 ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries); 3470 if (!ehash_entries) 3471 goto fallback; 3472 3473 ehash_entries = roundup_pow_of_two(ehash_entries); 3474 hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries); 3475 if (!hinfo) { 3476 pr_warn("Failed to allocate TCP ehash (entries: %u) " 3477 "for a netns, fallback to the global one\n", 3478 ehash_entries); 3479 fallback: 3480 hinfo = &tcp_hashinfo; 3481 ehash_entries = tcp_hashinfo.ehash_mask + 1; 3482 } 3483 3484 net->ipv4.tcp_death_row.hashinfo = hinfo; 3485 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2; 3486 net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128); 3487 } 3488 3489 static int __net_init tcp_sk_init(struct net *net) 3490 { 3491 net->ipv4.sysctl_tcp_ecn = TCP_ECN_IN_ECN_OUT_NOECN; 3492 net->ipv4.sysctl_tcp_ecn_option = TCP_ACCECN_OPTION_FULL; 3493 net->ipv4.sysctl_tcp_ecn_option_beacon = TCP_ACCECN_OPTION_BEACON; 3494 net->ipv4.sysctl_tcp_ecn_fallback = 1; 3495 3496 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; 3497 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS; 3498 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; 3499 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL; 3500 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS; 3501 3502 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME; 3503 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES; 3504 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL; 3505 3506 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES; 3507 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES; 3508 net->ipv4.sysctl_tcp_syncookies = 1; 3509 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH; 3510 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1; 3511 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2; 3512 net->ipv4.sysctl_tcp_orphan_retries = 0; 3513 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; 3514 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; 3515 net->ipv4.sysctl_tcp_tw_reuse = 2; 3516 net->ipv4.sysctl_tcp_tw_reuse_delay = 1 * MSEC_PER_SEC; 3517 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1; 3518 3519 refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1); 3520 tcp_set_hashinfo(net); 3521 3522 net->ipv4.sysctl_tcp_sack = 1; 3523 net->ipv4.sysctl_tcp_window_scaling = 1; 3524 net->ipv4.sysctl_tcp_timestamps = 1; 3525 net->ipv4.sysctl_tcp_early_retrans = 3; 3526 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION; 3527 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */ 3528 net->ipv4.sysctl_tcp_retrans_collapse = 1; 3529 net->ipv4.sysctl_tcp_max_reordering = 300; 3530 net->ipv4.sysctl_tcp_dsack = 1; 3531 net->ipv4.sysctl_tcp_app_win = 31; 3532 net->ipv4.sysctl_tcp_adv_win_scale = 1; 3533 net->ipv4.sysctl_tcp_frto = 2; 3534 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1; 3535 net->ipv4.sysctl_tcp_rcvbuf_low_rtt = USEC_PER_MSEC; 3536 /* This limits the percentage of the congestion window which we 3537 * will allow a single TSO frame to consume. Building TSO frames 3538 * which are too large can cause TCP streams to be bursty. 3539 */ 3540 net->ipv4.sysctl_tcp_tso_win_divisor = 3; 3541 /* Default TSQ limit of 4 MB */ 3542 net->ipv4.sysctl_tcp_limit_output_bytes = 4 << 20; 3543 3544 /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. 
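 * (INT_MAX below effectively disables the limit; it can be lowered via
 * the net.ipv4.tcp_challenge_ack_limit sysctl.)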
*/ 3545 net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX; 3546 3547 net->ipv4.sysctl_tcp_min_tso_segs = 2; 3548 net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */ 3549 net->ipv4.sysctl_tcp_min_rtt_wlen = 300; 3550 net->ipv4.sysctl_tcp_autocorking = 1; 3551 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; 3552 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; 3553 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120; 3554 if (net != &init_net) { 3555 memcpy(net->ipv4.sysctl_tcp_rmem, 3556 init_net.ipv4.sysctl_tcp_rmem, 3557 sizeof(init_net.ipv4.sysctl_tcp_rmem)); 3558 memcpy(net->ipv4.sysctl_tcp_wmem, 3559 init_net.ipv4.sysctl_tcp_wmem, 3560 sizeof(init_net.ipv4.sysctl_tcp_wmem)); 3561 } 3562 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC; 3563 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 10 * NSEC_PER_USEC; 3564 net->ipv4.sysctl_tcp_comp_sack_nr = 44; 3565 net->ipv4.sysctl_tcp_comp_sack_rtt_percent = 33; 3566 net->ipv4.sysctl_tcp_backlog_ack_defer = 1; 3567 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; 3568 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0; 3569 atomic_set(&net->ipv4.tfo_active_disable_times, 0); 3570 3571 /* Set default values for PLB */ 3572 net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */ 3573 net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3; 3574 net->ipv4.sysctl_tcp_plb_rehash_rounds = 12; 3575 net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60; 3576 /* Default congestion threshold for PLB to mark a round is 50% */ 3577 net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2; 3578 3579 /* Reno is always built in */ 3580 if (!net_eq(net, &init_net) && 3581 bpf_try_module_get(init_net.ipv4.tcp_congestion_control, 3582 init_net.ipv4.tcp_congestion_control->owner)) 3583 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control; 3584 else 3585 net->ipv4.tcp_congestion_control = &tcp_reno; 3586 3587 net->ipv4.sysctl_tcp_syn_linear_timeouts = 4; 3588 net->ipv4.sysctl_tcp_shrink_window = 0; 3589 3590 net->ipv4.sysctl_tcp_pingpong_thresh = 1; 3591 net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN); 3592 net->ipv4.sysctl_tcp_rto_max_ms = TCP_RTO_MAX_SEC * MSEC_PER_SEC; 3593 3594 return 0; 3595 } 3596 3597 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) 3598 { 3599 struct net *net; 3600 3601 /* make sure concurrent calls to tcp_sk_exit_batch from net_cleanup_work 3602 * and failed setup_net error unwinding path are serialized. 3603 * 3604 * tcp_twsk_purge() handles twsk in any dead netns, not just those in 3605 * net_exit_list, the thread that dismantles a particular twsk must 3606 * do so without other thread progressing to refcount_dec_and_test() of 3607 * tcp_death_row.tw_refcount. 
3608 */ 3609 mutex_lock(&tcp_exit_batch_mutex); 3610 3611 tcp_twsk_purge(net_exit_list); 3612 3613 list_for_each_entry(net, net_exit_list, exit_list) { 3614 inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo); 3615 WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount)); 3616 tcp_fastopen_ctx_destroy(net); 3617 } 3618 3619 mutex_unlock(&tcp_exit_batch_mutex); 3620 } 3621 3622 static struct pernet_operations __net_initdata tcp_sk_ops = { 3623 .init = tcp_sk_init, 3624 .exit = tcp_sk_exit, 3625 .exit_batch = tcp_sk_exit_batch, 3626 }; 3627 3628 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3629 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta, 3630 struct sock_common *sk_common, uid_t uid) 3631 3632 #define INIT_BATCH_SZ 16 3633 3634 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux) 3635 { 3636 struct bpf_tcp_iter_state *iter = priv_data; 3637 int err; 3638 3639 err = bpf_iter_init_seq_net(priv_data, aux); 3640 if (err) 3641 return err; 3642 3643 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ, GFP_USER); 3644 if (err) { 3645 bpf_iter_fini_seq_net(priv_data); 3646 return err; 3647 } 3648 3649 return 0; 3650 } 3651 3652 static void bpf_iter_fini_tcp(void *priv_data) 3653 { 3654 struct bpf_tcp_iter_state *iter = priv_data; 3655 3656 bpf_iter_fini_seq_net(priv_data); 3657 kvfree(iter->batch); 3658 } 3659 3660 static const struct bpf_iter_seq_info tcp_seq_info = { 3661 .seq_ops = &bpf_iter_tcp_seq_ops, 3662 .init_seq_private = bpf_iter_init_tcp, 3663 .fini_seq_private = bpf_iter_fini_tcp, 3664 .seq_priv_size = sizeof(struct bpf_tcp_iter_state), 3665 }; 3666 3667 static const struct bpf_func_proto * 3668 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id, 3669 const struct bpf_prog *prog) 3670 { 3671 switch (func_id) { 3672 case BPF_FUNC_setsockopt: 3673 return &bpf_sk_setsockopt_proto; 3674 case BPF_FUNC_getsockopt: 3675 return &bpf_sk_getsockopt_proto; 3676 default: 3677 return NULL; 3678 } 3679 } 3680 3681 static struct bpf_iter_reg tcp_reg_info = { 3682 .target = "tcp", 3683 .ctx_arg_info_size = 1, 3684 .ctx_arg_info = { 3685 { offsetof(struct bpf_iter__tcp, sk_common), 3686 PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED }, 3687 }, 3688 .get_func_proto = bpf_iter_tcp_get_func_proto, 3689 .seq_info = &tcp_seq_info, 3690 }; 3691 3692 static void __init bpf_iter_register(void) 3693 { 3694 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON]; 3695 if (bpf_iter_reg_target(&tcp_reg_info)) 3696 pr_warn("Warning: could not register bpf iterator tcp\n"); 3697 } 3698 3699 #endif 3700 3701 void __init tcp_v4_init(void) 3702 { 3703 int cpu, res; 3704 3705 for_each_possible_cpu(cpu) { 3706 struct sock *sk; 3707 3708 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW, 3709 IPPROTO_TCP, &init_net); 3710 if (res) 3711 panic("Failed to create the TCP control socket.\n"); 3712 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); 3713 3714 /* Please enforce IP_DF and IPID==0 for RST and 3715 * ACK sent in SYN-RECV and TIME-WAIT state. 3716 */ 3717 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO; 3718 3719 sk->sk_clockid = CLOCK_MONOTONIC; 3720 3721 per_cpu(ipv4_tcp_sk.sock, cpu) = sk; 3722 } 3723 if (register_pernet_subsys(&tcp_sk_ops)) 3724 panic("Failed to create the TCP control socket.\n"); 3725 3726 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3727 bpf_iter_register(); 3728 #endif 3729 } 3730