1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * INET An implementation of the TCP/IP protocol suite for the LINUX 4 * operating system. INET is implemented using the BSD Socket 5 * interface as the means of communication with the user level. 6 * 7 * Implementation of the Transmission Control Protocol(TCP). 8 * 9 * IPv4 specific functions 10 * 11 * code split from: 12 * linux/ipv4/tcp.c 13 * linux/ipv4/tcp_input.c 14 * linux/ipv4/tcp_output.c 15 * 16 * See tcp.c for author information 17 */ 18 19 /* 20 * Changes: 21 * David S. Miller : New socket lookup architecture. 22 * This code is dedicated to John Dyson. 23 * David S. Miller : Change semantics of established hash, 24 * half is devoted to TIME_WAIT sockets 25 * and the rest go in the other half. 26 * Andi Kleen : Add support for syncookies and fixed 27 * some bugs: ip options weren't passed to 28 * the TCP layer, missed a check for an 29 * ACK bit. 30 * Andi Kleen : Implemented fast path mtu discovery. 31 * Fixed many serious bugs in the 32 * request_sock handling and moved 33 * most of it into the af independent code. 34 * Added tail drop and some other bugfixes. 35 * Added new listen semantics. 36 * Mike McLagan : Routing by source 37 * Juan Jose Ciarlante: ip_dynaddr bits 38 * Andi Kleen: various fixes. 39 * Vitaly E. Lavrov : Transparent proxy revived after year 40 * coma. 41 * Andi Kleen : Fix new listen. 42 * Andi Kleen : Fix accept error reporting. 43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which 44 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind 45 * a single port at the same time. 46 */ 47 48 #define pr_fmt(fmt) "TCP: " fmt 49 50 #include <linux/bottom_half.h> 51 #include <linux/types.h> 52 #include <linux/fcntl.h> 53 #include <linux/module.h> 54 #include <linux/random.h> 55 #include <linux/cache.h> 56 #include <linux/fips.h> 57 #include <linux/jhash.h> 58 #include <linux/init.h> 59 #include <linux/times.h> 60 #include <linux/slab.h> 61 #include <linux/sched.h> 62 #include <linux/sock_diag.h> 63 64 #include <net/aligned_data.h> 65 #include <net/net_namespace.h> 66 #include <net/icmp.h> 67 #include <net/inet_hashtables.h> 68 #include <net/tcp.h> 69 #include <net/tcp_ecn.h> 70 #include <net/transp_v6.h> 71 #include <net/ipv6.h> 72 #include <net/inet_common.h> 73 #include <net/inet_ecn.h> 74 #include <net/timewait_sock.h> 75 #include <net/xfrm.h> 76 #include <net/secure_seq.h> 77 #include <net/busy_poll.h> 78 #include <net/rstreason.h> 79 #include <net/psp.h> 80 81 #include <linux/inet.h> 82 #include <linux/ipv6.h> 83 #include <linux/stddef.h> 84 #include <linux/proc_fs.h> 85 #include <linux/seq_file.h> 86 #include <linux/inetdevice.h> 87 #include <linux/btf_ids.h> 88 #include <linux/skbuff_ref.h> 89 90 #include <crypto/md5.h> 91 92 #include <trace/events/tcp.h> 93 94 #ifdef CONFIG_TCP_MD5SIG 95 static void tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, 96 __be32 daddr, __be32 saddr, const struct tcphdr *th); 97 #endif 98 99 struct inet_hashinfo tcp_hashinfo; 100 101 static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = { 102 .bh_lock = INIT_LOCAL_LOCK(bh_lock), 103 }; 104 105 static DEFINE_MUTEX(tcp_exit_batch_mutex); 106 107 static u32 tcp_v4_init_seq(const struct sk_buff *skb) 108 { 109 return secure_tcp_seq(ip_hdr(skb)->daddr, 110 ip_hdr(skb)->saddr, 111 tcp_hdr(skb)->dest, 112 tcp_hdr(skb)->source); 113 } 114 115 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb) 116 { 117 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, 
ip_hdr(skb)->saddr); 118 } 119 120 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) 121 { 122 int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse); 123 const struct inet_timewait_sock *tw = inet_twsk(sktw); 124 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw); 125 struct tcp_sock *tp = tcp_sk(sk); 126 int ts_recent_stamp; 127 u32 reuse_thresh; 128 129 if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2) 130 reuse = 0; 131 132 if (reuse == 2) { 133 /* Still does not detect *everything* that goes through 134 * lo, since we require a loopback src or dst address 135 * or direct binding to 'lo' interface. 136 */ 137 bool loopback = false; 138 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX) 139 loopback = true; 140 #if IS_ENABLED(CONFIG_IPV6) 141 if (tw->tw_family == AF_INET6) { 142 if (ipv6_addr_loopback(&tw->tw_v6_daddr) || 143 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) || 144 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) || 145 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr)) 146 loopback = true; 147 } else 148 #endif 149 { 150 if (ipv4_is_loopback(tw->tw_daddr) || 151 ipv4_is_loopback(tw->tw_rcv_saddr)) 152 loopback = true; 153 } 154 if (!loopback) 155 reuse = 0; 156 } 157 158 /* With PAWS, it is safe from the viewpoint 159 of data integrity. Even without PAWS it is safe provided sequence 160 spaces do not overlap i.e. at data rates <= 80Mbit/sec. 161 162 Actually, the idea is close to VJ's one, only timestamp cache is 163 held not per host, but per port pair and TW bucket is used as state 164 holder. 165 166 If TW bucket has been already destroyed we fall back to VJ's scheme 167 and use initial timestamp retrieved from peer table. 168 */ 169 ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp); 170 reuse_thresh = READ_ONCE(tw->tw_entry_stamp) + 171 READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse_delay); 172 if (ts_recent_stamp && 173 (!twp || (reuse && time_after32(tcp_clock_ms(), reuse_thresh)))) { 174 /* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk 175 * and releasing the bucket lock. 176 */ 177 if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt))) 178 return 0; 179 180 /* In case of repair and re-using TIME-WAIT sockets we still 181 * want to be sure that it is safe as above but honor the 182 * sequence numbers and time stamps set as part of the repair 183 * process. 184 * 185 * Without this check re-using a TIME-WAIT socket with TCP 186 * repair would accumulate a -1 on the repair assigned 187 * sequence number. The first time it is reused the sequence 188 * is -1, the second time -2, etc. This fixes that issue 189 * without appearing to create any others. 190 */ 191 if (likely(!tp->repair)) { 192 u32 seq = tcptw->tw_snd_nxt + 65535 + 2; 193 194 if (!seq) 195 seq = 1; 196 WRITE_ONCE(tp->write_seq, seq); 197 tp->rx_opt.ts_recent = READ_ONCE(tcptw->tw_ts_recent); 198 tp->rx_opt.ts_recent_stamp = ts_recent_stamp; 199 } 200 201 return 1; 202 } 203 204 return 0; 205 } 206 EXPORT_IPV6_MOD_GPL(tcp_twsk_unique); 207 208 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr, 209 int addr_len) 210 { 211 /* This check is replicated from tcp_v4_connect() and intended to 212 * prevent BPF program called below from accessing bytes that are out 213 * of the bound specified by user in addr_len. 214 */ 215 if (addr_len < sizeof(struct sockaddr_in)) 216 return -EINVAL; 217 218 sock_owned_by_me(sk); 219 220 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len); 221 } 222 223 /* This will initiate an outgoing connection. 
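 *
 * A minimal, purely illustrative user-space sketch of the call that
 * typically lands here (standard socket(2)/connect(2) API, headers and
 * error handling omitted, address is an example only):
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *	};
 *
 *	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *	connect(fd, (struct sockaddr *)&dst, sizeof(dst));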
*/ 224 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) 225 { 226 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; 227 struct inet_timewait_death_row *tcp_death_row; 228 struct inet_sock *inet = inet_sk(sk); 229 struct tcp_sock *tp = tcp_sk(sk); 230 struct ip_options_rcu *inet_opt; 231 struct net *net = sock_net(sk); 232 __be16 orig_sport, orig_dport; 233 __be32 daddr, nexthop; 234 struct flowi4 *fl4; 235 struct rtable *rt; 236 int err; 237 238 if (addr_len < sizeof(struct sockaddr_in)) 239 return -EINVAL; 240 241 if (usin->sin_family != AF_INET) 242 return -EAFNOSUPPORT; 243 244 nexthop = daddr = usin->sin_addr.s_addr; 245 inet_opt = rcu_dereference_protected(inet->inet_opt, 246 lockdep_sock_is_held(sk)); 247 if (inet_opt && inet_opt->opt.srr) { 248 if (!daddr) 249 return -EINVAL; 250 nexthop = inet_opt->opt.faddr; 251 } 252 253 orig_sport = inet->inet_sport; 254 orig_dport = usin->sin_port; 255 fl4 = &inet->cork.fl.u.ip4; 256 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr, 257 sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport, 258 orig_dport, sk); 259 if (IS_ERR(rt)) { 260 err = PTR_ERR(rt); 261 if (err == -ENETUNREACH) 262 IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); 263 return err; 264 } 265 266 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { 267 ip_rt_put(rt); 268 return -ENETUNREACH; 269 } 270 271 if (!inet_opt || !inet_opt->opt.srr) 272 daddr = fl4->daddr; 273 274 tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; 275 276 if (!inet->inet_saddr) { 277 err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET); 278 if (err) { 279 ip_rt_put(rt); 280 return err; 281 } 282 } else { 283 sk_rcv_saddr_set(sk, inet->inet_saddr); 284 } 285 286 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) { 287 /* Reset inherited state */ 288 tp->rx_opt.ts_recent = 0; 289 tp->rx_opt.ts_recent_stamp = 0; 290 if (likely(!tp->repair)) 291 WRITE_ONCE(tp->write_seq, 0); 292 } 293 294 inet->inet_dport = usin->sin_port; 295 sk_daddr_set(sk, daddr); 296 297 inet_csk(sk)->icsk_ext_hdr_len = psp_sk_overhead(sk); 298 if (inet_opt) 299 inet_csk(sk)->icsk_ext_hdr_len += inet_opt->opt.optlen; 300 301 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT; 302 303 /* Socket identity is still unknown (sport may be zero). 304 * However we set state to SYN-SENT and not releasing socket 305 * lock select source port, enter ourselves into the hash tables and 306 * complete initialization after this. 307 */ 308 tcp_set_state(sk, TCP_SYN_SENT); 309 err = inet_hash_connect(tcp_death_row, sk); 310 if (err) 311 goto failure; 312 313 sk_set_txhash(sk); 314 315 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport, 316 inet->inet_sport, inet->inet_dport, sk); 317 if (IS_ERR(rt)) { 318 err = PTR_ERR(rt); 319 rt = NULL; 320 goto failure; 321 } 322 tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst); 323 /* OK, now commit destination to socket. 
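	 * From here on the route is cached on the socket via sk_setup_caps()
	 * and, unless the socket is being repaired, the initial sequence
	 * number (if not already chosen) and the timestamp offset are derived
	 * from the final 4-tuple via secure_tcp_seq()/secure_tcp_ts_off()
	 * just below.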
*/ 324 sk->sk_gso_type = SKB_GSO_TCPV4; 325 sk_setup_caps(sk, &rt->dst); 326 rt = NULL; 327 328 if (likely(!tp->repair)) { 329 if (!tp->write_seq) 330 WRITE_ONCE(tp->write_seq, 331 secure_tcp_seq(inet->inet_saddr, 332 inet->inet_daddr, 333 inet->inet_sport, 334 usin->sin_port)); 335 WRITE_ONCE(tp->tsoffset, 336 secure_tcp_ts_off(net, inet->inet_saddr, 337 inet->inet_daddr)); 338 } 339 340 atomic_set(&inet->inet_id, get_random_u16()); 341 342 if (tcp_fastopen_defer_connect(sk, &err)) 343 return err; 344 if (err) 345 goto failure; 346 347 err = tcp_connect(sk); 348 349 if (err) 350 goto failure; 351 352 return 0; 353 354 failure: 355 /* 356 * This unhashes the socket and releases the local port, 357 * if necessary. 358 */ 359 tcp_set_state(sk, TCP_CLOSE); 360 inet_bhash2_reset_saddr(sk); 361 ip_rt_put(rt); 362 sk->sk_route_caps = 0; 363 inet->inet_dport = 0; 364 return err; 365 } 366 EXPORT_IPV6_MOD(tcp_v4_connect); 367 368 /* 369 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191. 370 * It can be called through tcp_release_cb() if socket was owned by user 371 * at the time tcp_v4_err() was called to handle ICMP message. 372 */ 373 void tcp_v4_mtu_reduced(struct sock *sk) 374 { 375 struct inet_sock *inet = inet_sk(sk); 376 struct dst_entry *dst; 377 u32 mtu; 378 379 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) 380 return; 381 mtu = READ_ONCE(tcp_sk(sk)->mtu_info); 382 dst = inet_csk_update_pmtu(sk, mtu); 383 if (!dst) 384 return; 385 386 /* Something is about to be wrong... Remember soft error 387 * for the case, if this connection will not able to recover. 388 */ 389 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst)) 390 WRITE_ONCE(sk->sk_err_soft, EMSGSIZE); 391 392 mtu = dst_mtu(dst); 393 394 if (inet->pmtudisc != IP_PMTUDISC_DONT && 395 ip_sk_accept_pmtu(sk) && 396 inet_csk(sk)->icsk_pmtu_cookie > mtu) { 397 tcp_sync_mss(sk, mtu); 398 399 /* Resend the TCP packet because it's 400 * clear that the old packet has been 401 * dropped. This is the new "fast" path mtu 402 * discovery. 403 */ 404 tcp_simple_retransmit(sk); 405 } /* else let the usual retransmit timer handle it */ 406 } 407 EXPORT_IPV6_MOD(tcp_v4_mtu_reduced); 408 409 static void do_redirect(struct sk_buff *skb, struct sock *sk) 410 { 411 struct dst_entry *dst = __sk_dst_check(sk, 0); 412 413 if (dst) 414 dst->ops->redirect(dst, sk, skb); 415 } 416 417 418 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */ 419 void tcp_req_err(struct sock *sk, u32 seq, bool abort) 420 { 421 struct request_sock *req = inet_reqsk(sk); 422 struct net *net = sock_net(sk); 423 424 /* ICMPs are not backlogged, hence we cannot get 425 * an established socket here. 426 */ 427 if (seq != tcp_rsk(req)->snt_isn) { 428 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); 429 } else if (abort) { 430 /* 431 * Still in SYN_RECV, just remove it silently. 432 * There is no good way to pass the error to the newly 433 * created socket, and POSIX does not want network 434 * errors returned from accept(). 
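		 * Dropping the request here means the client merely sees
		 * its SYN go unanswered and will retransmit or time out,
		 * while the listener accounts the event as a listen drop
		 * below.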
435 */ 436 inet_csk_reqsk_queue_drop(req->rsk_listener, req); 437 tcp_listendrop(req->rsk_listener); 438 } 439 reqsk_put(req); 440 } 441 EXPORT_IPV6_MOD(tcp_req_err); 442 443 /* TCP-LD (RFC 6069) logic */ 444 void tcp_ld_RTO_revert(struct sock *sk, u32 seq) 445 { 446 struct inet_connection_sock *icsk = inet_csk(sk); 447 struct tcp_sock *tp = tcp_sk(sk); 448 struct sk_buff *skb; 449 s32 remaining; 450 u32 delta_us; 451 452 if (sock_owned_by_user(sk)) 453 return; 454 455 if (seq != tp->snd_una || !icsk->icsk_retransmits || 456 !icsk->icsk_backoff) 457 return; 458 459 skb = tcp_rtx_queue_head(sk); 460 if (WARN_ON_ONCE(!skb)) 461 return; 462 463 icsk->icsk_backoff--; 464 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT; 465 icsk->icsk_rto = inet_csk_rto_backoff(icsk, tcp_rto_max(sk)); 466 467 tcp_mstamp_refresh(tp); 468 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb)); 469 remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us); 470 471 if (remaining > 0) { 472 tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, remaining, false); 473 } else { 474 /* RTO revert clocked out retransmission. 475 * Will retransmit now. 476 */ 477 tcp_retransmit_timer(sk); 478 } 479 } 480 EXPORT_IPV6_MOD(tcp_ld_RTO_revert); 481 482 /* 483 * This routine is called by the ICMP module when it gets some 484 * sort of error condition. If err < 0 then the socket should 485 * be closed and the error returned to the user. If err > 0 486 * it's just the icmp type << 8 | icmp code. After adjustment 487 * header points to the first 8 bytes of the tcp header. We need 488 * to find the appropriate port. 489 * 490 * The locking strategy used here is very "optimistic". When 491 * someone else accesses the socket the ICMP is just dropped 492 * and for some paths there is no check at all. 493 * A more general error queue to queue errors for later handling 494 * is probably better. 495 * 496 */ 497 498 int tcp_v4_err(struct sk_buff *skb, u32 info) 499 { 500 const struct iphdr *iph = (const struct iphdr *)skb->data; 501 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2)); 502 struct net *net = dev_net_rcu(skb->dev); 503 const int type = icmp_hdr(skb)->type; 504 const int code = icmp_hdr(skb)->code; 505 struct request_sock *fastopen; 506 struct tcp_sock *tp; 507 u32 seq, snd_una; 508 struct sock *sk; 509 int err; 510 511 sk = __inet_lookup_established(net, iph->daddr, th->dest, iph->saddr, 512 ntohs(th->source), inet_iif(skb), 0); 513 if (!sk) { 514 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); 515 return -ENOENT; 516 } 517 if (sk->sk_state == TCP_TIME_WAIT) { 518 /* To increase the counter of ignored icmps for TCP-AO */ 519 tcp_ao_ignore_icmp(sk, AF_INET, type, code); 520 inet_twsk_put(inet_twsk(sk)); 521 return 0; 522 } 523 seq = ntohl(th->seq); 524 if (sk->sk_state == TCP_NEW_SYN_RECV) { 525 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB || 526 type == ICMP_TIME_EXCEEDED || 527 (type == ICMP_DEST_UNREACH && 528 (code == ICMP_NET_UNREACH || 529 code == ICMP_HOST_UNREACH))); 530 return 0; 531 } 532 533 if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) { 534 sock_put(sk); 535 return 0; 536 } 537 538 bh_lock_sock(sk); 539 /* If too many ICMPs get dropped on busy 540 * servers this needs to be solved differently. 541 * We do take care of PMTU discovery (RFC1191) special case : 542 * we can receive locally generated ICMP messages while socket is held. 
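	 * In that case the MTU update is not lost: tp->mtu_info is recorded
	 * and tcp_v4_mtu_reduced() is deferred to tcp_release_cb() via the
	 * TCP_MTU_REDUCED_DEFERRED flag further down.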
543 */ 544 if (sock_owned_by_user(sk)) { 545 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)) 546 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS); 547 } 548 if (sk->sk_state == TCP_CLOSE) 549 goto out; 550 551 if (static_branch_unlikely(&ip4_min_ttl)) { 552 /* min_ttl can be changed concurrently from do_ip_setsockopt() */ 553 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) { 554 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 555 goto out; 556 } 557 } 558 559 tp = tcp_sk(sk); 560 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ 561 fastopen = rcu_dereference(tp->fastopen_rsk); 562 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una; 563 if (sk->sk_state != TCP_LISTEN && 564 !between(seq, snd_una, tp->snd_nxt)) { 565 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); 566 goto out; 567 } 568 569 switch (type) { 570 case ICMP_REDIRECT: 571 if (!sock_owned_by_user(sk)) 572 do_redirect(skb, sk); 573 goto out; 574 case ICMP_SOURCE_QUENCH: 575 /* Just silently ignore these. */ 576 goto out; 577 case ICMP_PARAMETERPROB: 578 err = EPROTO; 579 break; 580 case ICMP_DEST_UNREACH: 581 if (code > NR_ICMP_UNREACH) 582 goto out; 583 584 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ 585 /* We are not interested in TCP_LISTEN and open_requests 586 * (SYN-ACKs send out by Linux are always <576bytes so 587 * they should go through unfragmented). 588 */ 589 if (sk->sk_state == TCP_LISTEN) 590 goto out; 591 592 WRITE_ONCE(tp->mtu_info, info); 593 if (!sock_owned_by_user(sk)) { 594 tcp_v4_mtu_reduced(sk); 595 } else { 596 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags)) 597 sock_hold(sk); 598 } 599 goto out; 600 } 601 602 err = icmp_err_convert[code].errno; 603 /* check if this ICMP message allows revert of backoff. 604 * (see RFC 6069) 605 */ 606 if (!fastopen && 607 (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH)) 608 tcp_ld_RTO_revert(sk, seq); 609 break; 610 case ICMP_TIME_EXCEEDED: 611 err = EHOSTUNREACH; 612 break; 613 default: 614 goto out; 615 } 616 617 switch (sk->sk_state) { 618 case TCP_SYN_SENT: 619 case TCP_SYN_RECV: 620 /* Only in fast or simultaneous open. If a fast open socket is 621 * already accepted it is treated as a connected one below. 622 */ 623 if (fastopen && !fastopen->sk) 624 break; 625 626 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th); 627 628 if (!sock_owned_by_user(sk)) 629 tcp_done_with_error(sk, err); 630 else 631 WRITE_ONCE(sk->sk_err_soft, err); 632 goto out; 633 } 634 635 /* If we've already connected we will keep trying 636 * until we time out, or the user gives up. 637 * 638 * rfc1122 4.2.3.9 allows to consider as hard errors 639 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too, 640 * but it is obsoleted by pmtu discovery). 641 * 642 * Note, that in modern internet, where routing is unreliable 643 * and in each dark corner broken firewalls sit, sending random 644 * errors ordered by their masters even this two messages finally lose 645 * their original sense (even Linux sends invalid PORT_UNREACHs) 646 * 647 * Now we are in compliance with RFCs. 
648 * --ANK (980905) 649 */ 650 651 if (!sock_owned_by_user(sk) && 652 inet_test_bit(RECVERR, sk)) { 653 WRITE_ONCE(sk->sk_err, err); 654 sk_error_report(sk); 655 } else { /* Only an error on timeout */ 656 WRITE_ONCE(sk->sk_err_soft, err); 657 } 658 659 out: 660 bh_unlock_sock(sk); 661 sock_put(sk); 662 return 0; 663 } 664 665 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr) 666 { 667 struct tcphdr *th = tcp_hdr(skb); 668 669 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0); 670 skb->csum_start = skb_transport_header(skb) - skb->head; 671 skb->csum_offset = offsetof(struct tcphdr, check); 672 } 673 674 /* This routine computes an IPv4 TCP checksum. */ 675 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb) 676 { 677 const struct inet_sock *inet = inet_sk(sk); 678 679 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr); 680 } 681 EXPORT_IPV6_MOD(tcp_v4_send_check); 682 683 #define REPLY_OPTIONS_LEN (MAX_TCP_OPTION_SPACE / sizeof(__be32)) 684 685 static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb, 686 const struct tcp_ao_hdr *aoh, 687 struct ip_reply_arg *arg, struct tcphdr *reply, 688 __be32 reply_options[REPLY_OPTIONS_LEN]) 689 { 690 #ifdef CONFIG_TCP_AO 691 int sdif = tcp_v4_sdif(skb); 692 int dif = inet_iif(skb); 693 int l3index = sdif ? dif : 0; 694 bool allocated_traffic_key; 695 struct tcp_ao_key *key; 696 char *traffic_key; 697 bool drop = true; 698 u32 ao_sne = 0; 699 u8 keyid; 700 701 rcu_read_lock(); 702 if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq), 703 &key, &traffic_key, &allocated_traffic_key, 704 &keyid, &ao_sne)) 705 goto out; 706 707 reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) | 708 (aoh->rnext_keyid << 8) | keyid); 709 arg->iov[0].iov_len += tcp_ao_len_aligned(key); 710 reply->doff = arg->iov[0].iov_len / 4; 711 712 if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1], 713 key, traffic_key, 714 (union tcp_ao_addr *)&ip_hdr(skb)->saddr, 715 (union tcp_ao_addr *)&ip_hdr(skb)->daddr, 716 reply, ao_sne)) 717 goto out; 718 drop = false; 719 out: 720 rcu_read_unlock(); 721 if (allocated_traffic_key) 722 kfree(traffic_key); 723 return drop; 724 #else 725 return true; 726 #endif 727 } 728 729 /* 730 * This routine will send an RST to the other tcp. 731 * 732 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.) 733 * for reset. 734 * Answer: if a packet caused RST, it is not for a socket 735 * existing in our system, if it is matched to a socket, 736 * it is just duplicate segment or bug in other side's TCP. 737 * So that we build reply only basing on parameters 738 * arrived with segment. 739 * Exception: precedence violation. We do not implement it in any case. 740 */ 741 742 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb, 743 enum sk_rst_reason reason) 744 { 745 const struct tcphdr *th = tcp_hdr(skb); 746 struct { 747 struct tcphdr th; 748 __be32 opt[REPLY_OPTIONS_LEN]; 749 } rep; 750 const __u8 *md5_hash_location = NULL; 751 const struct tcp_ao_hdr *aoh; 752 struct ip_reply_arg arg; 753 #ifdef CONFIG_TCP_MD5SIG 754 struct tcp_md5sig_key *key = NULL; 755 unsigned char newhash[16]; 756 struct sock *sk1 = NULL; 757 #endif 758 u64 transmit_time = 0; 759 struct sock *ctl_sk; 760 struct net *net; 761 u32 txhash = 0; 762 763 /* Never send a reset in response to a reset. */ 764 if (th->rst) 765 return; 766 767 /* If sk not NULL, it means we did a successful lookup and incoming 768 * route had to be correct. 
prequeue might have dropped our dst. 769 */ 770 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL) 771 return; 772 773 /* Swap the send and the receive. */ 774 memset(&rep, 0, sizeof(rep)); 775 rep.th.dest = th->source; 776 rep.th.source = th->dest; 777 rep.th.doff = sizeof(struct tcphdr) / 4; 778 rep.th.rst = 1; 779 780 if (th->ack) { 781 rep.th.seq = th->ack_seq; 782 } else { 783 rep.th.ack = 1; 784 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin + 785 skb->len - (th->doff << 2)); 786 } 787 788 memset(&arg, 0, sizeof(arg)); 789 arg.iov[0].iov_base = (unsigned char *)&rep; 790 arg.iov[0].iov_len = sizeof(rep.th); 791 792 net = sk ? sock_net(sk) : skb_dst_dev_net_rcu(skb); 793 794 /* Invalid TCP option size or twice included auth */ 795 if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh)) 796 return; 797 798 if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt)) 799 return; 800 801 #ifdef CONFIG_TCP_MD5SIG 802 rcu_read_lock(); 803 if (sk && sk_fullsock(sk)) { 804 const union tcp_md5_addr *addr; 805 int l3index; 806 807 /* sdif set, means packet ingressed via a device 808 * in an L3 domain and inet_iif is set to it. 809 */ 810 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; 811 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 812 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 813 } else if (md5_hash_location) { 814 const union tcp_md5_addr *addr; 815 int sdif = tcp_v4_sdif(skb); 816 int dif = inet_iif(skb); 817 int l3index; 818 819 /* 820 * active side is lost. Try to find listening socket through 821 * source port, and then find md5 key through listening socket. 822 * we are not loose security here: 823 * Incoming packet is checked with md5 hash with finding key, 824 * no RST generated if md5 hash doesn't match. 825 */ 826 sk1 = __inet_lookup_listener(net, NULL, 0, ip_hdr(skb)->saddr, 827 th->source, ip_hdr(skb)->daddr, 828 ntohs(th->source), dif, sdif); 829 /* don't send rst if it can't find key */ 830 if (!sk1) 831 goto out; 832 833 /* sdif set, means packet ingressed via a device 834 * in an L3 domain and dif is set to it. 835 */ 836 l3index = sdif ? dif : 0; 837 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 838 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET); 839 if (!key) 840 goto out; 841 842 tcp_v4_md5_hash_skb(newhash, key, NULL, skb); 843 if (memcmp(md5_hash_location, newhash, 16) != 0) 844 goto out; 845 } 846 847 if (key) { 848 rep.opt[0] = htonl((TCPOPT_NOP << 24) | 849 (TCPOPT_NOP << 16) | 850 (TCPOPT_MD5SIG << 8) | 851 TCPOLEN_MD5SIG); 852 /* Update length and the length the header thinks exists */ 853 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; 854 rep.th.doff = arg.iov[0].iov_len / 4; 855 856 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1], 857 key, ip_hdr(skb)->saddr, 858 ip_hdr(skb)->daddr, &rep.th); 859 } 860 #endif 861 /* Can't co-exist with TCPMD5, hence check rep.opt[0] */ 862 if (rep.opt[0] == 0) { 863 __be32 mrst = mptcp_reset_option(skb); 864 865 if (mrst) { 866 rep.opt[0] = mrst; 867 arg.iov[0].iov_len += sizeof(mrst); 868 rep.th.doff = arg.iov[0].iov_len / 4; 869 } 870 } 871 872 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 873 ip_hdr(skb)->saddr, /* XXX */ 874 arg.iov[0].iov_len, IPPROTO_TCP, 0); 875 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 876 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0; 877 878 /* When socket is gone, all binding information is lost. 879 * routing might fail in this case. 
No choice here, if we choose to force 880 * input interface, we will misroute in case of asymmetric route. 881 */ 882 if (sk) 883 arg.bound_dev_if = sk->sk_bound_dev_if; 884 885 trace_tcp_send_reset(sk, skb, reason); 886 887 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) != 888 offsetof(struct inet_timewait_sock, tw_bound_dev_if)); 889 890 /* ECN bits of TW reset are cleared */ 891 arg.tos = ip_hdr(skb)->tos & ~INET_ECN_MASK; 892 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); 893 local_bh_disable(); 894 local_lock_nested_bh(&ipv4_tcp_sk.bh_lock); 895 ctl_sk = this_cpu_read(ipv4_tcp_sk.sock); 896 897 sock_net_set(ctl_sk, net); 898 if (sk) { 899 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 900 inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark); 901 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? 902 inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority); 903 transmit_time = tcp_transmit_time(sk); 904 xfrm_sk_clone_policy(ctl_sk, sk); 905 txhash = (sk->sk_state == TCP_TIME_WAIT) ? 906 inet_twsk(sk)->tw_txhash : sk->sk_txhash; 907 } else { 908 ctl_sk->sk_mark = 0; 909 ctl_sk->sk_priority = 0; 910 } 911 ip_send_unicast_reply(ctl_sk, sk, 912 skb, &TCP_SKB_CB(skb)->header.h4.opt, 913 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 914 &arg, arg.iov[0].iov_len, 915 transmit_time, txhash); 916 917 xfrm_sk_free_policy(ctl_sk); 918 sock_net_set(ctl_sk, &init_net); 919 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); 920 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS); 921 local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock); 922 local_bh_enable(); 923 924 #ifdef CONFIG_TCP_MD5SIG 925 out: 926 rcu_read_unlock(); 927 #endif 928 } 929 930 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states 931 outside socket context is ugly, certainly. What can I do? 932 */ 933 934 static void tcp_v4_send_ack(const struct sock *sk, 935 struct sk_buff *skb, u32 seq, u32 ack, 936 u32 win, u32 tsval, u32 tsecr, int oif, 937 struct tcp_key *key, 938 int reply_flags, u8 tos, u32 txhash) 939 { 940 const struct tcphdr *th = tcp_hdr(skb); 941 struct { 942 struct tcphdr th; 943 __be32 opt[(MAX_TCP_OPTION_SPACE >> 2)]; 944 } rep; 945 struct net *net = sock_net(sk); 946 struct ip_reply_arg arg; 947 struct sock *ctl_sk; 948 u64 transmit_time; 949 950 memset(&rep.th, 0, sizeof(struct tcphdr)); 951 memset(&arg, 0, sizeof(arg)); 952 953 arg.iov[0].iov_base = (unsigned char *)&rep; 954 arg.iov[0].iov_len = sizeof(rep.th); 955 if (tsecr) { 956 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | 957 (TCPOPT_TIMESTAMP << 8) | 958 TCPOLEN_TIMESTAMP); 959 rep.opt[1] = htonl(tsval); 960 rep.opt[2] = htonl(tsecr); 961 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED; 962 } 963 964 /* Swap the send and the receive. */ 965 rep.th.dest = th->source; 966 rep.th.source = th->dest; 967 rep.th.doff = arg.iov[0].iov_len / 4; 968 rep.th.seq = htonl(seq); 969 rep.th.ack_seq = htonl(ack); 970 rep.th.ack = 1; 971 rep.th.window = htons(win); 972 973 #ifdef CONFIG_TCP_MD5SIG 974 if (tcp_key_is_md5(key)) { 975 int offset = (tsecr) ? 3 : 0; 976 977 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) | 978 (TCPOPT_NOP << 16) | 979 (TCPOPT_MD5SIG << 8) | 980 TCPOLEN_MD5SIG); 981 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; 982 rep.th.doff = arg.iov[0].iov_len/4; 983 984 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset], 985 key->md5_key, ip_hdr(skb)->saddr, 986 ip_hdr(skb)->daddr, &rep.th); 987 } 988 #endif 989 #ifdef CONFIG_TCP_AO 990 if (tcp_key_is_ao(key)) { 991 int offset = (tsecr) ? 
3 : 0; 992 993 rep.opt[offset++] = htonl((TCPOPT_AO << 24) | 994 (tcp_ao_len(key->ao_key) << 16) | 995 (key->ao_key->sndid << 8) | 996 key->rcv_next); 997 arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key); 998 rep.th.doff = arg.iov[0].iov_len / 4; 999 1000 tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset], 1001 key->ao_key, key->traffic_key, 1002 (union tcp_ao_addr *)&ip_hdr(skb)->saddr, 1003 (union tcp_ao_addr *)&ip_hdr(skb)->daddr, 1004 &rep.th, key->sne); 1005 } 1006 #endif 1007 arg.flags = reply_flags; 1008 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 1009 ip_hdr(skb)->saddr, /* XXX */ 1010 arg.iov[0].iov_len, IPPROTO_TCP, 0); 1011 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 1012 if (oif) 1013 arg.bound_dev_if = oif; 1014 arg.tos = tos; 1015 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL); 1016 local_bh_disable(); 1017 local_lock_nested_bh(&ipv4_tcp_sk.bh_lock); 1018 ctl_sk = this_cpu_read(ipv4_tcp_sk.sock); 1019 sock_net_set(ctl_sk, net); 1020 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 1021 inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark); 1022 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? 1023 inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority); 1024 transmit_time = tcp_transmit_time(sk); 1025 ip_send_unicast_reply(ctl_sk, sk, 1026 skb, &TCP_SKB_CB(skb)->header.h4.opt, 1027 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 1028 &arg, arg.iov[0].iov_len, 1029 transmit_time, txhash); 1030 1031 sock_net_set(ctl_sk, &init_net); 1032 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); 1033 local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock); 1034 local_bh_enable(); 1035 } 1036 1037 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb, 1038 enum tcp_tw_status tw_status) 1039 { 1040 struct inet_timewait_sock *tw = inet_twsk(sk); 1041 struct tcp_timewait_sock *tcptw = tcp_twsk(sk); 1042 struct tcp_key key = {}; 1043 u8 tos = tw->tw_tos; 1044 1045 /* Cleaning only ECN bits of TW ACKs of oow data or is paws_reject, 1046 * while not cleaning ECN bits of other TW ACKs to avoid these ACKs 1047 * being placed in a different service queues (Classic rather than L4S) 1048 */ 1049 if (tw_status == TCP_TW_ACK_OOW) 1050 tos &= ~INET_ECN_MASK; 1051 1052 #ifdef CONFIG_TCP_AO 1053 struct tcp_ao_info *ao_info; 1054 1055 if (static_branch_unlikely(&tcp_ao_needed.key)) { 1056 /* FIXME: the segment to-be-acked is not verified yet */ 1057 ao_info = rcu_dereference(tcptw->ao_info); 1058 if (ao_info) { 1059 const struct tcp_ao_hdr *aoh; 1060 1061 if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) { 1062 inet_twsk_put(tw); 1063 return; 1064 } 1065 1066 if (aoh) 1067 key.ao_key = tcp_ao_established_key(sk, ao_info, 1068 aoh->rnext_keyid, -1); 1069 } 1070 } 1071 if (key.ao_key) { 1072 struct tcp_ao_key *rnext_key; 1073 1074 key.traffic_key = snd_other_key(key.ao_key); 1075 key.sne = READ_ONCE(ao_info->snd_sne); 1076 rnext_key = READ_ONCE(ao_info->rnext_key); 1077 key.rcv_next = rnext_key->rcvid; 1078 key.type = TCP_KEY_AO; 1079 #else 1080 if (0) { 1081 #endif 1082 } else if (static_branch_tcp_md5()) { 1083 key.md5_key = tcp_twsk_md5_key(tcptw); 1084 if (key.md5_key) 1085 key.type = TCP_KEY_MD5; 1086 } 1087 1088 tcp_v4_send_ack(sk, skb, 1089 tcptw->tw_snd_nxt, READ_ONCE(tcptw->tw_rcv_nxt), 1090 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, 1091 tcp_tw_tsval(tcptw), 1092 READ_ONCE(tcptw->tw_ts_recent), 1093 tw->tw_bound_dev_if, &key, 1094 tw->tw_transparent ? 
IP_REPLY_ARG_NOSRCCHECK : 0, 1095 tos, 1096 tw->tw_txhash); 1097 1098 inet_twsk_put(tw); 1099 } 1100 1101 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, 1102 struct request_sock *req) 1103 { 1104 struct tcp_key key = {}; 1105 1106 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV 1107 * sk->sk_state == TCP_SYN_RECV -> for Fast Open. 1108 */ 1109 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 : 1110 tcp_sk(sk)->snd_nxt; 1111 1112 #ifdef CONFIG_TCP_AO 1113 if (static_branch_unlikely(&tcp_ao_needed.key) && 1114 tcp_rsk_used_ao(req)) { 1115 const union tcp_md5_addr *addr; 1116 const struct tcp_ao_hdr *aoh; 1117 int l3index; 1118 1119 /* Invalid TCP option size or twice included auth */ 1120 if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) 1121 return; 1122 if (!aoh) 1123 return; 1124 1125 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 1126 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; 1127 key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, 1128 aoh->rnext_keyid, -1); 1129 if (unlikely(!key.ao_key)) { 1130 /* Send ACK with any matching MKT for the peer */ 1131 key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1); 1132 /* Matching key disappeared (user removed the key?) 1133 * let the handshake timeout. 1134 */ 1135 if (!key.ao_key) { 1136 net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n", 1137 addr, 1138 ntohs(tcp_hdr(skb)->source), 1139 &ip_hdr(skb)->daddr, 1140 ntohs(tcp_hdr(skb)->dest)); 1141 return; 1142 } 1143 } 1144 key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC); 1145 if (!key.traffic_key) 1146 return; 1147 1148 key.type = TCP_KEY_AO; 1149 key.rcv_next = aoh->keyid; 1150 tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req); 1151 #else 1152 if (0) { 1153 #endif 1154 } else if (static_branch_tcp_md5()) { 1155 const union tcp_md5_addr *addr; 1156 int l3index; 1157 1158 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 1159 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; 1160 key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1161 if (key.md5_key) 1162 key.type = TCP_KEY_MD5; 1163 } 1164 1165 /* Cleaning ECN bits of TW ACKs of oow data or is paws_reject */ 1166 tcp_v4_send_ack(sk, skb, seq, 1167 tcp_rsk(req)->rcv_nxt, 1168 tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale, 1169 tcp_rsk_tsval(tcp_rsk(req)), 1170 req->ts_recent, 1171 0, &key, 1172 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0, 1173 ip_hdr(skb)->tos & ~INET_ECN_MASK, 1174 READ_ONCE(tcp_rsk(req)->txhash)); 1175 if (tcp_key_is_ao(&key)) 1176 kfree(key.traffic_key); 1177 } 1178 1179 /* 1180 * Send a SYN-ACK after having received a SYN. 1181 * This still operates on a request_sock only, not on a big 1182 * socket. 1183 */ 1184 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, 1185 struct flowi *fl, 1186 struct request_sock *req, 1187 struct tcp_fastopen_cookie *foc, 1188 enum tcp_synack_type synack_type, 1189 struct sk_buff *syn_skb) 1190 { 1191 struct inet_request_sock *ireq = inet_rsk(req); 1192 struct flowi4 fl4; 1193 int err = -1; 1194 struct sk_buff *skb; 1195 u8 tos; 1196 1197 /* First, grab a route. 
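	 * The SYN-ACK is sent on behalf of a request_sock, so when the caller
	 * did not already supply a dst the route is resolved from the
	 * listener plus the request's 4-tuple via inet_csk_route_req().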
	 */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);

	if (skb) {
		tcp_rsk(req)->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK;
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

		tos = READ_ONCE(inet_sk(sk)->tos);

		if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
			tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
			      (tos & INET_ECN_MASK);

		if (!INET_ECN_is_capable(tos) &&
		    tcp_bpf_ca_needs_ecn((struct sock *)req))
			tos |= INET_ECN_ECT_0;

		rcu_read_lock();
		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
					    rcu_dereference(ireq->ireq_opt),
					    tos);
		rcu_read_unlock();
		err = net_xmit_eval(err);
	}

	return err;
}

/*
 * IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
EXPORT_IPV6_MOD(tcp_md5_needed);

static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
{
	if (!old)
		return true;

	/* l3index always overrides non-l3index */
	if (old->l3index && new->l3index == 0)
		return false;
	if (old->l3index == 0 && new->l3index)
		return true;

	return old->prefixlen < new->prefixlen;
}

/* Find the Key structure for an address.
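 *
 * Keys normally arrive here from user space via setsockopt().  A purely
 * illustrative sketch (fd is an existing TCP socket, headers and error
 * handling omitted, values are examples only):
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *	struct sockaddr_in *a = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	a->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &a->sin_addr);
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * The lookup below then matches on family, prefix length and (optionally)
 * L3 master ifindex, preferring the most specific key, see
 * better_md5_match() above.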
*/ 1262 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index, 1263 const union tcp_md5_addr *addr, 1264 int family, bool any_l3index) 1265 { 1266 const struct tcp_sock *tp = tcp_sk(sk); 1267 struct tcp_md5sig_key *key; 1268 const struct tcp_md5sig_info *md5sig; 1269 __be32 mask; 1270 struct tcp_md5sig_key *best_match = NULL; 1271 bool match; 1272 1273 /* caller either holds rcu_read_lock() or socket lock */ 1274 md5sig = rcu_dereference_check(tp->md5sig_info, 1275 lockdep_sock_is_held(sk)); 1276 if (!md5sig) 1277 return NULL; 1278 1279 hlist_for_each_entry_rcu(key, &md5sig->head, node, 1280 lockdep_sock_is_held(sk)) { 1281 if (key->family != family) 1282 continue; 1283 if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX && 1284 key->l3index != l3index) 1285 continue; 1286 if (family == AF_INET) { 1287 mask = inet_make_mask(key->prefixlen); 1288 match = (key->addr.a4.s_addr & mask) == 1289 (addr->a4.s_addr & mask); 1290 #if IS_ENABLED(CONFIG_IPV6) 1291 } else if (family == AF_INET6) { 1292 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6, 1293 key->prefixlen); 1294 #endif 1295 } else { 1296 match = false; 1297 } 1298 1299 if (match && better_md5_match(best_match, key)) 1300 best_match = key; 1301 } 1302 return best_match; 1303 } 1304 EXPORT_IPV6_MOD(__tcp_md5_do_lookup); 1305 1306 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk, 1307 const union tcp_md5_addr *addr, 1308 int family, u8 prefixlen, 1309 int l3index, u8 flags) 1310 { 1311 const struct tcp_sock *tp = tcp_sk(sk); 1312 struct tcp_md5sig_key *key; 1313 unsigned int size = sizeof(struct in_addr); 1314 const struct tcp_md5sig_info *md5sig; 1315 1316 /* caller either holds rcu_read_lock() or socket lock */ 1317 md5sig = rcu_dereference_check(tp->md5sig_info, 1318 lockdep_sock_is_held(sk)); 1319 if (!md5sig) 1320 return NULL; 1321 #if IS_ENABLED(CONFIG_IPV6) 1322 if (family == AF_INET6) 1323 size = sizeof(struct in6_addr); 1324 #endif 1325 hlist_for_each_entry_rcu(key, &md5sig->head, node, 1326 lockdep_sock_is_held(sk)) { 1327 if (key->family != family) 1328 continue; 1329 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX)) 1330 continue; 1331 if (key->l3index != l3index) 1332 continue; 1333 if (!memcmp(&key->addr, addr, size) && 1334 key->prefixlen == prefixlen) 1335 return key; 1336 } 1337 return NULL; 1338 } 1339 1340 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, 1341 const struct sock *addr_sk) 1342 { 1343 const union tcp_md5_addr *addr; 1344 int l3index; 1345 1346 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), 1347 addr_sk->sk_bound_dev_if); 1348 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr; 1349 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1350 } 1351 EXPORT_IPV6_MOD(tcp_v4_md5_lookup); 1352 1353 static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp) 1354 { 1355 struct tcp_sock *tp = tcp_sk(sk); 1356 struct tcp_md5sig_info *md5sig; 1357 1358 md5sig = kmalloc(sizeof(*md5sig), gfp); 1359 if (!md5sig) 1360 return -ENOMEM; 1361 1362 sk_gso_disable(sk); 1363 INIT_HLIST_HEAD(&md5sig->head); 1364 rcu_assign_pointer(tp->md5sig_info, md5sig); 1365 return 0; 1366 } 1367 1368 /* This can be called on a newly created socket, from other files */ 1369 static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, 1370 int family, u8 prefixlen, int l3index, u8 flags, 1371 const u8 *newkey, u8 newkeylen, gfp_t gfp) 1372 { 1373 /* Add Key to the list */ 1374 struct tcp_md5sig_key *key; 1375 struct 
tcp_sock *tp = tcp_sk(sk); 1376 struct tcp_md5sig_info *md5sig; 1377 1378 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags); 1379 if (key) { 1380 /* Pre-existing entry - just update that one. 1381 * Note that the key might be used concurrently. 1382 * data_race() is telling kcsan that we do not care of 1383 * key mismatches, since changing MD5 key on live flows 1384 * can lead to packet drops. 1385 */ 1386 data_race(memcpy(key->key, newkey, newkeylen)); 1387 1388 /* Pairs with READ_ONCE() in tcp_md5_hash_key(). 1389 * Also note that a reader could catch new key->keylen value 1390 * but old key->key[], this is the reason we use __GFP_ZERO 1391 * at sock_kmalloc() time below these lines. 1392 */ 1393 WRITE_ONCE(key->keylen, newkeylen); 1394 1395 return 0; 1396 } 1397 1398 md5sig = rcu_dereference_protected(tp->md5sig_info, 1399 lockdep_sock_is_held(sk)); 1400 1401 key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO); 1402 if (!key) 1403 return -ENOMEM; 1404 1405 memcpy(key->key, newkey, newkeylen); 1406 key->keylen = newkeylen; 1407 key->family = family; 1408 key->prefixlen = prefixlen; 1409 key->l3index = l3index; 1410 key->flags = flags; 1411 memcpy(&key->addr, addr, 1412 (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) : 1413 sizeof(struct in_addr)); 1414 hlist_add_head_rcu(&key->node, &md5sig->head); 1415 return 0; 1416 } 1417 1418 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, 1419 int family, u8 prefixlen, int l3index, u8 flags, 1420 const u8 *newkey, u8 newkeylen) 1421 { 1422 struct tcp_sock *tp = tcp_sk(sk); 1423 1424 if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) { 1425 if (fips_enabled) { 1426 pr_warn_once("TCP-MD5 support is disabled due to FIPS\n"); 1427 return -EOPNOTSUPP; 1428 } 1429 1430 if (tcp_md5sig_info_add(sk, GFP_KERNEL)) 1431 return -ENOMEM; 1432 1433 if (!static_branch_inc(&tcp_md5_needed.key)) { 1434 struct tcp_md5sig_info *md5sig; 1435 1436 md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk)); 1437 rcu_assign_pointer(tp->md5sig_info, NULL); 1438 kfree_rcu(md5sig, rcu); 1439 return -EUSERS; 1440 } 1441 } 1442 1443 return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags, 1444 newkey, newkeylen, GFP_KERNEL); 1445 } 1446 EXPORT_IPV6_MOD(tcp_md5_do_add); 1447 1448 int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr, 1449 int family, u8 prefixlen, int l3index, 1450 struct tcp_md5sig_key *key) 1451 { 1452 struct tcp_sock *tp = tcp_sk(sk); 1453 1454 if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) { 1455 1456 if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC))) 1457 return -ENOMEM; 1458 1459 if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) { 1460 struct tcp_md5sig_info *md5sig; 1461 1462 md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk)); 1463 net_warn_ratelimited("Too many TCP-MD5 keys in the system\n"); 1464 rcu_assign_pointer(tp->md5sig_info, NULL); 1465 kfree_rcu(md5sig, rcu); 1466 return -EUSERS; 1467 } 1468 } 1469 1470 return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, 1471 key->flags, key->key, key->keylen, 1472 sk_gfp_mask(sk, GFP_ATOMIC)); 1473 } 1474 EXPORT_IPV6_MOD(tcp_md5_key_copy); 1475 1476 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family, 1477 u8 prefixlen, int l3index, u8 flags) 1478 { 1479 struct tcp_md5sig_key *key; 1480 1481 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, 
flags); 1482 if (!key) 1483 return -ENOENT; 1484 hlist_del_rcu(&key->node); 1485 atomic_sub(sizeof(*key), &sk->sk_omem_alloc); 1486 kfree_rcu(key, rcu); 1487 return 0; 1488 } 1489 EXPORT_IPV6_MOD(tcp_md5_do_del); 1490 1491 void tcp_clear_md5_list(struct sock *sk) 1492 { 1493 struct tcp_sock *tp = tcp_sk(sk); 1494 struct tcp_md5sig_key *key; 1495 struct hlist_node *n; 1496 struct tcp_md5sig_info *md5sig; 1497 1498 md5sig = rcu_dereference_protected(tp->md5sig_info, 1); 1499 1500 hlist_for_each_entry_safe(key, n, &md5sig->head, node) { 1501 hlist_del(&key->node); 1502 atomic_sub(sizeof(*key), &sk->sk_omem_alloc); 1503 kfree(key); 1504 } 1505 } 1506 1507 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, 1508 sockptr_t optval, int optlen) 1509 { 1510 struct tcp_md5sig cmd; 1511 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr; 1512 const union tcp_md5_addr *addr; 1513 u8 prefixlen = 32; 1514 int l3index = 0; 1515 bool l3flag; 1516 u8 flags; 1517 1518 if (optlen < sizeof(cmd)) 1519 return -EINVAL; 1520 1521 if (copy_from_sockptr(&cmd, optval, sizeof(cmd))) 1522 return -EFAULT; 1523 1524 if (sin->sin_family != AF_INET) 1525 return -EINVAL; 1526 1527 flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX; 1528 l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX; 1529 1530 if (optname == TCP_MD5SIG_EXT && 1531 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) { 1532 prefixlen = cmd.tcpm_prefixlen; 1533 if (prefixlen > 32) 1534 return -EINVAL; 1535 } 1536 1537 if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex && 1538 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) { 1539 struct net_device *dev; 1540 1541 rcu_read_lock(); 1542 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex); 1543 if (dev && netif_is_l3_master(dev)) 1544 l3index = dev->ifindex; 1545 1546 rcu_read_unlock(); 1547 1548 /* ok to reference set/not set outside of rcu; 1549 * right now device MUST be an L3 master 1550 */ 1551 if (!dev || !l3index) 1552 return -EINVAL; 1553 } 1554 1555 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr; 1556 1557 if (!cmd.tcpm_keylen) 1558 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags); 1559 1560 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) 1561 return -EINVAL; 1562 1563 /* Don't allow keys for peers that have a matching TCP-AO key. 1564 * See the comment in tcp_ao_add_cmd() 1565 */ 1566 if (tcp_ao_required(sk, addr, AF_INET, l3flag ? 
l3index : -1, false)) 1567 return -EKEYREJECTED; 1568 1569 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags, 1570 cmd.tcpm_key, cmd.tcpm_keylen); 1571 } 1572 1573 static void tcp_v4_md5_hash_headers(struct md5_ctx *ctx, 1574 __be32 daddr, __be32 saddr, 1575 const struct tcphdr *th, int nbytes) 1576 { 1577 struct { 1578 struct tcp4_pseudohdr ip; 1579 struct tcphdr tcp; 1580 } h; 1581 1582 h.ip.saddr = saddr; 1583 h.ip.daddr = daddr; 1584 h.ip.pad = 0; 1585 h.ip.protocol = IPPROTO_TCP; 1586 h.ip.len = cpu_to_be16(nbytes); 1587 h.tcp = *th; 1588 h.tcp.check = 0; 1589 md5_update(ctx, (const u8 *)&h, sizeof(h.ip) + sizeof(h.tcp)); 1590 } 1591 1592 static noinline_for_stack void 1593 tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, 1594 __be32 daddr, __be32 saddr, const struct tcphdr *th) 1595 { 1596 struct md5_ctx ctx; 1597 1598 md5_init(&ctx); 1599 tcp_v4_md5_hash_headers(&ctx, daddr, saddr, th, th->doff << 2); 1600 tcp_md5_hash_key(&ctx, key); 1601 md5_final(&ctx, md5_hash); 1602 } 1603 1604 noinline_for_stack void 1605 tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, 1606 const struct sock *sk, const struct sk_buff *skb) 1607 { 1608 const struct tcphdr *th = tcp_hdr(skb); 1609 __be32 saddr, daddr; 1610 struct md5_ctx ctx; 1611 1612 if (sk) { /* valid for establish/request sockets */ 1613 saddr = sk->sk_rcv_saddr; 1614 daddr = sk->sk_daddr; 1615 } else { 1616 const struct iphdr *iph = ip_hdr(skb); 1617 saddr = iph->saddr; 1618 daddr = iph->daddr; 1619 } 1620 1621 md5_init(&ctx); 1622 tcp_v4_md5_hash_headers(&ctx, daddr, saddr, th, skb->len); 1623 tcp_md5_hash_skb_data(&ctx, skb, th->doff << 2); 1624 tcp_md5_hash_key(&ctx, key); 1625 md5_final(&ctx, md5_hash); 1626 } 1627 EXPORT_IPV6_MOD(tcp_v4_md5_hash_skb); 1628 1629 #endif 1630 1631 static void tcp_v4_init_req(struct request_sock *req, 1632 const struct sock *sk_listener, 1633 struct sk_buff *skb) 1634 { 1635 struct inet_request_sock *ireq = inet_rsk(req); 1636 struct net *net = sock_net(sk_listener); 1637 1638 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); 1639 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); 1640 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb)); 1641 } 1642 1643 static struct dst_entry *tcp_v4_route_req(const struct sock *sk, 1644 struct sk_buff *skb, 1645 struct flowi *fl, 1646 struct request_sock *req, 1647 u32 tw_isn) 1648 { 1649 tcp_v4_init_req(req, sk, skb); 1650 1651 if (security_inet_conn_request(sk, skb, req)) 1652 return NULL; 1653 1654 return inet_csk_route_req(sk, &fl->u.ip4, req); 1655 } 1656 1657 struct request_sock_ops tcp_request_sock_ops __read_mostly = { 1658 .family = PF_INET, 1659 .obj_size = sizeof(struct tcp_request_sock), 1660 .send_ack = tcp_v4_reqsk_send_ack, 1661 .destructor = tcp_v4_reqsk_destructor, 1662 .send_reset = tcp_v4_send_reset, 1663 .syn_ack_timeout = tcp_syn_ack_timeout, 1664 }; 1665 1666 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { 1667 .mss_clamp = TCP_MSS_DEFAULT, 1668 #ifdef CONFIG_TCP_MD5SIG 1669 .req_md5_lookup = tcp_v4_md5_lookup, 1670 .calc_md5_hash = tcp_v4_md5_hash_skb, 1671 #endif 1672 #ifdef CONFIG_TCP_AO 1673 .ao_lookup = tcp_v4_ao_lookup_rsk, 1674 .ao_calc_key = tcp_v4_ao_calc_key_rsk, 1675 .ao_synack_hash = tcp_v4_ao_synack_hash, 1676 #endif 1677 #ifdef CONFIG_SYN_COOKIES 1678 .cookie_init_seq = cookie_v4_init_sequence, 1679 #endif 1680 .route_req = tcp_v4_route_req, 1681 .init_seq = tcp_v4_init_seq, 1682 .init_ts_off = tcp_v4_init_ts_off, 1683 .send_synack = 
tcp_v4_send_synack,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	/* Never answer SYNs sent to broadcast or multicast addresses. */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&tcp_request_sock_ops,
				&tcp_request_sock_ipv4_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0;
}
EXPORT_IPV6_MOD(tcp_v4_conn_request);


/*
 * The three-way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req)
{
	struct inet_request_sock *ireq;
	bool found_dup_sk = false;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	const union tcp_md5_addr *addr;
	struct tcp_md5sig_key *key;
	int l3index;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp = tcp_sk(newsk);
	newinet = inet_sk(newsk);
	ireq = inet_rsk(req);
	inet_opt = rcu_dereference(ireq->ireq_opt);
	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
	newinet->mc_index = inet_iif(skb);
	newinet->mc_ttl = ip_hdr(skb)->ttl;
	newinet->rcv_tos = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	atomic_set(&newinet->inet_id, get_random_u16());

	/* Set ToS of the new socket based upon the value of incoming SYN.
	 * ECT bits are set later in tcp_init_transfer().
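	 * With net.ipv4.tcp_reflect_tos enabled, the DSCP bits of the SYN are
	 * copied to the child below so that replies share the incoming
	 * traffic class; the ECN bits are deliberately masked out here.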
1749 */ 1750 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)) 1751 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK; 1752 1753 if (!dst) { 1754 dst = inet_csk_route_child_sock(sk, newsk, req); 1755 if (!dst) 1756 goto put_and_exit; 1757 } else { 1758 /* syncookie case : see end of cookie_v4_check() */ 1759 } 1760 sk_setup_caps(newsk, dst); 1761 1762 tcp_ca_openreq_child(newsk, dst); 1763 1764 tcp_sync_mss(newsk, dst_mtu(dst)); 1765 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst)); 1766 1767 tcp_initialize_rcv_mss(newsk); 1768 1769 #ifdef CONFIG_TCP_MD5SIG 1770 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif); 1771 /* Copy over the MD5 key from the original socket */ 1772 addr = (union tcp_md5_addr *)&newinet->inet_daddr; 1773 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1774 if (key && !tcp_rsk_used_ao(req)) { 1775 if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key)) 1776 goto put_and_exit; 1777 sk_gso_disable(newsk); 1778 } 1779 #endif 1780 #ifdef CONFIG_TCP_AO 1781 if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET)) 1782 goto put_and_exit; /* OOM, release back memory */ 1783 #endif 1784 1785 if (__inet_inherit_port(sk, newsk) < 0) 1786 goto put_and_exit; 1787 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), 1788 &found_dup_sk); 1789 if (likely(*own_req)) { 1790 tcp_move_syn(newtp, req); 1791 ireq->ireq_opt = NULL; 1792 } else { 1793 newinet->inet_opt = NULL; 1794 1795 if (!req_unhash && found_dup_sk) { 1796 /* This code path should only be executed in the 1797 * syncookie case only 1798 */ 1799 bh_unlock_sock(newsk); 1800 sock_put(newsk); 1801 newsk = NULL; 1802 } 1803 } 1804 return newsk; 1805 1806 exit_overflow: 1807 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); 1808 exit_nonewsk: 1809 dst_release(dst); 1810 exit: 1811 tcp_listendrop(sk); 1812 return NULL; 1813 put_and_exit: 1814 newinet->inet_opt = NULL; 1815 inet_csk_prepare_forced_close(newsk); 1816 tcp_done(newsk); 1817 goto exit; 1818 } 1819 EXPORT_IPV6_MOD(tcp_v4_syn_recv_sock); 1820 1821 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb) 1822 { 1823 #ifdef CONFIG_SYN_COOKIES 1824 const struct tcphdr *th = tcp_hdr(skb); 1825 1826 if (!th->syn) 1827 sk = cookie_v4_check(sk, skb); 1828 #endif 1829 return sk; 1830 } 1831 1832 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph, 1833 struct tcphdr *th, u32 *cookie) 1834 { 1835 u16 mss = 0; 1836 #ifdef CONFIG_SYN_COOKIES 1837 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops, 1838 &tcp_request_sock_ipv4_ops, sk, th); 1839 if (mss) { 1840 *cookie = __cookie_v4_init_sequence(iph, th, &mss); 1841 tcp_synq_overflow(sk); 1842 } 1843 #endif 1844 return mss; 1845 } 1846 1847 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, 1848 u32)); 1849 /* The socket must have it's spinlock held when we get 1850 * here, unless it is a TCP_LISTEN socket. 1851 * 1852 * We have a potential double-lock case here, so even when 1853 * doing backlog processing we use the BH locking scheme. 1854 * This is because we cannot sleep with the original spinlock 1855 * held. 
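 *
 * Callers therefore follow roughly this pattern (a simplified sketch of
 * what tcp_v4_rcv() does, not a verbatim copy):
 *
 *	bh_lock_sock_nested(sk);
 *	if (!sock_owned_by_user(sk))
 *		ret = tcp_v4_do_rcv(sk, skb);
 *	else if (tcp_add_backlog(sk, skb, &drop_reason))
 *		goto discard_and_relse;
 *	bh_unlock_sock(sk);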
1856 */ 1857 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) 1858 { 1859 enum skb_drop_reason reason; 1860 struct sock *rsk; 1861 1862 reason = psp_sk_rx_policy_check(sk, skb); 1863 if (reason) 1864 goto err_discard; 1865 1866 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ 1867 struct dst_entry *dst; 1868 1869 dst = rcu_dereference_protected(sk->sk_rx_dst, 1870 lockdep_sock_is_held(sk)); 1871 1872 sock_rps_save_rxhash(sk, skb); 1873 sk_mark_napi_id(sk, skb); 1874 if (dst) { 1875 if (sk->sk_rx_dst_ifindex != skb->skb_iif || 1876 !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check, 1877 dst, 0)) { 1878 RCU_INIT_POINTER(sk->sk_rx_dst, NULL); 1879 dst_release(dst); 1880 } 1881 } 1882 tcp_rcv_established(sk, skb); 1883 return 0; 1884 } 1885 1886 if (tcp_checksum_complete(skb)) 1887 goto csum_err; 1888 1889 if (sk->sk_state == TCP_LISTEN) { 1890 struct sock *nsk = tcp_v4_cookie_check(sk, skb); 1891 1892 if (!nsk) 1893 return 0; 1894 if (nsk != sk) { 1895 reason = tcp_child_process(sk, nsk, skb); 1896 if (reason) { 1897 rsk = nsk; 1898 goto reset; 1899 } 1900 return 0; 1901 } 1902 } else 1903 sock_rps_save_rxhash(sk, skb); 1904 1905 reason = tcp_rcv_state_process(sk, skb); 1906 if (reason) { 1907 rsk = sk; 1908 goto reset; 1909 } 1910 return 0; 1911 1912 reset: 1913 tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason)); 1914 discard: 1915 sk_skb_reason_drop(sk, skb, reason); 1916 /* Be careful here. If this function gets more complicated and 1917 * gcc suffers from register pressure on the x86, sk (in %ebx) 1918 * might be destroyed here. This current version compiles correctly, 1919 * but you have been warned. 1920 */ 1921 return 0; 1922 1923 csum_err: 1924 reason = SKB_DROP_REASON_TCP_CSUM; 1925 trace_tcp_bad_csum(skb); 1926 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 1927 err_discard: 1928 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 1929 goto discard; 1930 } 1931 EXPORT_SYMBOL(tcp_v4_do_rcv); 1932 1933 int tcp_v4_early_demux(struct sk_buff *skb) 1934 { 1935 struct net *net = dev_net_rcu(skb->dev); 1936 const struct iphdr *iph; 1937 const struct tcphdr *th; 1938 struct sock *sk; 1939 1940 if (skb->pkt_type != PACKET_HOST) 1941 return 0; 1942 1943 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) 1944 return 0; 1945 1946 iph = ip_hdr(skb); 1947 th = tcp_hdr(skb); 1948 1949 if (th->doff < sizeof(struct tcphdr) / 4) 1950 return 0; 1951 1952 sk = __inet_lookup_established(net, iph->saddr, th->source, 1953 iph->daddr, ntohs(th->dest), 1954 skb->skb_iif, inet_sdif(skb)); 1955 if (sk) { 1956 skb->sk = sk; 1957 skb->destructor = sock_edemux; 1958 if (sk_fullsock(sk)) { 1959 struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst); 1960 1961 if (dst) 1962 dst = dst_check(dst, 0); 1963 if (dst && 1964 sk->sk_rx_dst_ifindex == skb->skb_iif) 1965 skb_dst_set_noref(skb, dst); 1966 } 1967 } 1968 return 0; 1969 } 1970 1971 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, 1972 enum skb_drop_reason *reason) 1973 { 1974 u32 tail_gso_size, tail_gso_segs; 1975 struct skb_shared_info *shinfo; 1976 const struct tcphdr *th; 1977 struct tcphdr *thtail; 1978 struct sk_buff *tail; 1979 unsigned int hdrlen; 1980 bool fragstolen; 1981 u32 gso_segs; 1982 u32 gso_size; 1983 u64 limit; 1984 int delta; 1985 int err; 1986 1987 /* In case all data was pulled from skb frags (in __pskb_pull_tail()), 1988 * we can fix skb->truesize to its real value to avoid future drops. 1989 * This is valid because skb is not yet charged to the socket. 
1990 * It has been noticed pure SACK packets were sometimes dropped 1991 * (if cooked by drivers without copybreak feature). 1992 */ 1993 skb_condense(skb); 1994 1995 tcp_cleanup_skb(skb); 1996 1997 if (unlikely(tcp_checksum_complete(skb))) { 1998 bh_unlock_sock(sk); 1999 trace_tcp_bad_csum(skb); 2000 *reason = SKB_DROP_REASON_TCP_CSUM; 2001 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 2002 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 2003 return true; 2004 } 2005 2006 /* Attempt coalescing to last skb in backlog, even if we are 2007 * above the limits. 2008 * This is okay because skb capacity is limited to MAX_SKB_FRAGS. 2009 */ 2010 th = (const struct tcphdr *)skb->data; 2011 hdrlen = th->doff * 4; 2012 2013 tail = sk->sk_backlog.tail; 2014 if (!tail) 2015 goto no_coalesce; 2016 thtail = (struct tcphdr *)tail->data; 2017 2018 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq || 2019 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield || 2020 ((TCP_SKB_CB(tail)->tcp_flags | 2021 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) || 2022 !((TCP_SKB_CB(tail)->tcp_flags & 2023 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) || 2024 ((TCP_SKB_CB(tail)->tcp_flags ^ 2025 TCP_SKB_CB(skb)->tcp_flags) & 2026 (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE)) || 2027 !tcp_skb_can_collapse_rx(tail, skb) || 2028 thtail->doff != th->doff || 2029 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)) || 2030 /* prior to PSP Rx policy check, retain exact PSP metadata */ 2031 psp_skb_coalesce_diff(tail, skb)) 2032 goto no_coalesce; 2033 2034 __skb_pull(skb, hdrlen); 2035 2036 shinfo = skb_shinfo(skb); 2037 gso_size = shinfo->gso_size ?: skb->len; 2038 gso_segs = shinfo->gso_segs ?: 1; 2039 2040 shinfo = skb_shinfo(tail); 2041 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen); 2042 tail_gso_segs = shinfo->gso_segs ?: 1; 2043 2044 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) { 2045 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq; 2046 2047 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) { 2048 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq; 2049 thtail->window = th->window; 2050 } 2051 2052 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and 2053 * thtail->fin, so that the fast path in tcp_rcv_established() 2054 * is not entered if we append a packet with a FIN. 2055 * SYN, RST, URG are not present. 2056 * ACK is set on both packets. 2057 * PSH : we do not really care in TCP stack, 2058 * at least for 'GRO' packets. 2059 */ 2060 thtail->fin |= th->fin; 2061 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; 2062 2063 if (TCP_SKB_CB(skb)->has_rxtstamp) { 2064 TCP_SKB_CB(tail)->has_rxtstamp = true; 2065 tail->tstamp = skb->tstamp; 2066 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp; 2067 } 2068 2069 /* Not as strict as GRO. We only need to carry mss max value */ 2070 shinfo->gso_size = max(gso_size, tail_gso_size); 2071 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF); 2072 2073 sk->sk_backlog.len += delta; 2074 __NET_INC_STATS(sock_net(sk), 2075 LINUX_MIB_TCPBACKLOGCOALESCE); 2076 kfree_skb_partial(skb, fragstolen); 2077 return false; 2078 } 2079 __skb_push(skb, hdrlen); 2080 2081 no_coalesce: 2082 /* sk->sk_backlog.len is reset only at the end of __release_sock(). 2083 * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach 2084 * sk_rcvbuf in normal conditions. 
2085 */ 2086 limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1; 2087 2088 limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1; 2089 2090 /* Only socket owner can try to collapse/prune rx queues 2091 * to reduce memory overhead, so add a little headroom here. 2092 * Only a few socket backlogs are likely to be non-empty at the same time. 2093 */ 2094 limit += 64 * 1024; 2095 2096 limit = min_t(u64, limit, UINT_MAX); 2097 2098 err = sk_add_backlog(sk, skb, limit); 2099 if (unlikely(err)) { 2100 bh_unlock_sock(sk); 2101 if (err == -ENOMEM) { 2102 *reason = SKB_DROP_REASON_PFMEMALLOC; 2103 __NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP); 2104 } else { 2105 *reason = SKB_DROP_REASON_SOCKET_BACKLOG; 2106 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP); 2107 } 2108 return true; 2109 } 2110 return false; 2111 } 2112 EXPORT_IPV6_MOD(tcp_add_backlog); 2113 2114 int tcp_filter(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason *reason) 2115 { 2116 struct tcphdr *th = (struct tcphdr *)skb->data; 2117 2118 return sk_filter_trim_cap(sk, skb, th->doff * 4, reason); 2119 } 2120 EXPORT_IPV6_MOD(tcp_filter); 2121 2122 static void tcp_v4_restore_cb(struct sk_buff *skb) 2123 { 2124 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4, 2125 sizeof(struct inet_skb_parm)); 2126 } 2127 2128 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, 2129 const struct tcphdr *th) 2130 { 2131 /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB(); 2132 * barrier() makes sure the compiler won't play fool^Waliasing games. 2133 */ 2134 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb), 2135 sizeof(struct inet_skb_parm)); 2136 barrier(); 2137 2138 TCP_SKB_CB(skb)->seq = ntohl(th->seq); 2139 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + 2140 skb->len - th->doff * 4); 2141 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); 2142 TCP_SKB_CB(skb)->tcp_flags = tcp_flags_ntohs(th); 2143 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); 2144 TCP_SKB_CB(skb)->sacked = 0; 2145 TCP_SKB_CB(skb)->has_rxtstamp = 2146 skb->tstamp || skb_hwtstamps(skb)->hwtstamp; 2147 } 2148 2149 /* 2150 * From tcp_input.c 2151 */ 2152 2153 int tcp_v4_rcv(struct sk_buff *skb) 2154 { 2155 struct net *net = dev_net_rcu(skb->dev); 2156 enum skb_drop_reason drop_reason; 2157 enum tcp_tw_status tw_status; 2158 int sdif = inet_sdif(skb); 2159 int dif = inet_iif(skb); 2160 const struct iphdr *iph; 2161 const struct tcphdr *th; 2162 struct sock *sk = NULL; 2163 bool refcounted; 2164 int ret; 2165 u32 isn; 2166 2167 drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; 2168 if (skb->pkt_type != PACKET_HOST) 2169 goto discard_it; 2170 2171 /* Count it even if it's bad */ 2172 __TCP_INC_STATS(net, TCP_MIB_INSEGS); 2173 2174 if (!pskb_may_pull(skb, sizeof(struct tcphdr))) 2175 goto discard_it; 2176 2177 th = (const struct tcphdr *)skb->data; 2178 2179 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) { 2180 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL; 2181 goto bad_packet; 2182 } 2183 if (!pskb_may_pull(skb, th->doff * 4)) 2184 goto discard_it; 2185 2186 /* An explanation is required here, I think. 2187 * Packet length and doff are validated by header prediction, 2188 * provided the case of th->doff==0 is eliminated. 2189 * So, we defer the checks.
*/ 2190 2191 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo)) 2192 goto csum_error; 2193 2194 th = (const struct tcphdr *)skb->data; 2195 iph = ip_hdr(skb); 2196 lookup: 2197 sk = __inet_lookup_skb(skb, __tcp_hdrlen(th), th->source, 2198 th->dest, sdif, &refcounted); 2199 if (!sk) 2200 goto no_tcp_socket; 2201 2202 if (sk->sk_state == TCP_TIME_WAIT) 2203 goto do_time_wait; 2204 2205 if (sk->sk_state == TCP_NEW_SYN_RECV) { 2206 struct request_sock *req = inet_reqsk(sk); 2207 bool req_stolen = false; 2208 struct sock *nsk; 2209 2210 sk = req->rsk_listener; 2211 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) 2212 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2213 else 2214 drop_reason = tcp_inbound_hash(sk, req, skb, 2215 &iph->saddr, &iph->daddr, 2216 AF_INET, dif, sdif); 2217 if (unlikely(drop_reason)) { 2218 sk_drops_skbadd(sk, skb); 2219 reqsk_put(req); 2220 goto discard_it; 2221 } 2222 if (tcp_checksum_complete(skb)) { 2223 reqsk_put(req); 2224 goto csum_error; 2225 } 2226 if (unlikely(sk->sk_state != TCP_LISTEN)) { 2227 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb); 2228 if (!nsk) { 2229 inet_csk_reqsk_queue_drop_and_put(sk, req); 2230 goto lookup; 2231 } 2232 sk = nsk; 2233 /* reuseport_migrate_sock() has already held one sk_refcnt 2234 * before returning. 2235 */ 2236 } else { 2237 /* We own a reference on the listener, increase it again 2238 * as we might lose it too soon. 2239 */ 2240 sock_hold(sk); 2241 } 2242 refcounted = true; 2243 nsk = NULL; 2244 if (!tcp_filter(sk, skb, &drop_reason)) { 2245 th = (const struct tcphdr *)skb->data; 2246 iph = ip_hdr(skb); 2247 tcp_v4_fill_cb(skb, iph, th); 2248 nsk = tcp_check_req(sk, skb, req, false, &req_stolen, 2249 &drop_reason); 2250 } 2251 if (!nsk) { 2252 reqsk_put(req); 2253 if (req_stolen) { 2254 /* Another cpu got exclusive access to req 2255 * and created a full blown socket. 2256 * Try to feed this packet to this socket 2257 * instead of discarding it. 
2258 */ 2259 tcp_v4_restore_cb(skb); 2260 sock_put(sk); 2261 goto lookup; 2262 } 2263 goto discard_and_relse; 2264 } 2265 nf_reset_ct(skb); 2266 if (nsk == sk) { 2267 reqsk_put(req); 2268 tcp_v4_restore_cb(skb); 2269 } else { 2270 drop_reason = tcp_child_process(sk, nsk, skb); 2271 if (drop_reason) { 2272 enum sk_rst_reason rst_reason; 2273 2274 rst_reason = sk_rst_convert_drop_reason(drop_reason); 2275 tcp_v4_send_reset(nsk, skb, rst_reason); 2276 goto discard_and_relse; 2277 } 2278 sock_put(sk); 2279 return 0; 2280 } 2281 } 2282 2283 process: 2284 if (static_branch_unlikely(&ip4_min_ttl)) { 2285 /* min_ttl can be changed concurrently from do_ip_setsockopt() */ 2286 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) { 2287 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 2288 drop_reason = SKB_DROP_REASON_TCP_MINTTL; 2289 goto discard_and_relse; 2290 } 2291 } 2292 2293 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { 2294 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2295 goto discard_and_relse; 2296 } 2297 2298 drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr, 2299 AF_INET, dif, sdif); 2300 if (drop_reason) 2301 goto discard_and_relse; 2302 2303 nf_reset_ct(skb); 2304 2305 if (tcp_filter(sk, skb, &drop_reason)) 2306 goto discard_and_relse; 2307 2308 th = (const struct tcphdr *)skb->data; 2309 iph = ip_hdr(skb); 2310 tcp_v4_fill_cb(skb, iph, th); 2311 2312 skb->dev = NULL; 2313 2314 if (sk->sk_state == TCP_LISTEN) { 2315 ret = tcp_v4_do_rcv(sk, skb); 2316 goto put_and_return; 2317 } 2318 2319 sk_incoming_cpu_update(sk); 2320 2321 bh_lock_sock_nested(sk); 2322 tcp_segs_in(tcp_sk(sk), skb); 2323 ret = 0; 2324 if (!sock_owned_by_user(sk)) { 2325 ret = tcp_v4_do_rcv(sk, skb); 2326 } else { 2327 if (tcp_add_backlog(sk, skb, &drop_reason)) 2328 goto discard_and_relse; 2329 } 2330 bh_unlock_sock(sk); 2331 2332 put_and_return: 2333 if (refcounted) 2334 sock_put(sk); 2335 2336 return ret; 2337 2338 no_tcp_socket: 2339 drop_reason = SKB_DROP_REASON_NO_SOCKET; 2340 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) 2341 goto discard_it; 2342 2343 tcp_v4_fill_cb(skb, iph, th); 2344 2345 if (tcp_checksum_complete(skb)) { 2346 csum_error: 2347 drop_reason = SKB_DROP_REASON_TCP_CSUM; 2348 trace_tcp_bad_csum(skb); 2349 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); 2350 bad_packet: 2351 __TCP_INC_STATS(net, TCP_MIB_INERRS); 2352 } else { 2353 tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason)); 2354 } 2355 2356 discard_it: 2357 SKB_DR_OR(drop_reason, NOT_SPECIFIED); 2358 /* Discard frame. 
*/ 2359 sk_skb_reason_drop(sk, skb, drop_reason); 2360 return 0; 2361 2362 discard_and_relse: 2363 sk_drops_skbadd(sk, skb); 2364 if (refcounted) 2365 sock_put(sk); 2366 goto discard_it; 2367 2368 do_time_wait: 2369 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { 2370 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2371 inet_twsk_put(inet_twsk(sk)); 2372 goto discard_it; 2373 } 2374 2375 tcp_v4_fill_cb(skb, iph, th); 2376 2377 if (tcp_checksum_complete(skb)) { 2378 inet_twsk_put(inet_twsk(sk)); 2379 goto csum_error; 2380 } 2381 2382 tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn, 2383 &drop_reason); 2384 switch (tw_status) { 2385 case TCP_TW_SYN: { 2386 struct sock *sk2 = inet_lookup_listener(net, skb, __tcp_hdrlen(th), 2387 iph->saddr, th->source, 2388 iph->daddr, th->dest, 2389 inet_iif(skb), 2390 sdif); 2391 if (sk2) { 2392 inet_twsk_deschedule_put(inet_twsk(sk)); 2393 sk = sk2; 2394 tcp_v4_restore_cb(skb); 2395 refcounted = false; 2396 __this_cpu_write(tcp_tw_isn, isn); 2397 goto process; 2398 } 2399 2400 drop_reason = psp_twsk_rx_policy_check(inet_twsk(sk), skb); 2401 if (drop_reason) 2402 break; 2403 } 2404 /* to ACK */ 2405 fallthrough; 2406 case TCP_TW_ACK: 2407 case TCP_TW_ACK_OOW: 2408 tcp_v4_timewait_ack(sk, skb, tw_status); 2409 break; 2410 case TCP_TW_RST: 2411 tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET); 2412 inet_twsk_deschedule_put(inet_twsk(sk)); 2413 goto discard_it; 2414 case TCP_TW_SUCCESS:; 2415 } 2416 goto discard_it; 2417 } 2418 2419 static struct timewait_sock_ops tcp_timewait_sock_ops = { 2420 .twsk_obj_size = sizeof(struct tcp_timewait_sock), 2421 }; 2422 2423 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) 2424 { 2425 struct dst_entry *dst = skb_dst(skb); 2426 2427 if (dst && dst_hold_safe(dst)) { 2428 rcu_assign_pointer(sk->sk_rx_dst, dst); 2429 sk->sk_rx_dst_ifindex = skb->skb_iif; 2430 } 2431 } 2432 EXPORT_IPV6_MOD(inet_sk_rx_dst_set); 2433 2434 const struct inet_connection_sock_af_ops ipv4_specific = { 2435 .queue_xmit = ip_queue_xmit, 2436 .send_check = tcp_v4_send_check, 2437 .rebuild_header = inet_sk_rebuild_header, 2438 .sk_rx_dst_set = inet_sk_rx_dst_set, 2439 .conn_request = tcp_v4_conn_request, 2440 .syn_recv_sock = tcp_v4_syn_recv_sock, 2441 .net_header_len = sizeof(struct iphdr), 2442 .setsockopt = ip_setsockopt, 2443 .getsockopt = ip_getsockopt, 2444 .mtu_reduced = tcp_v4_mtu_reduced, 2445 }; 2446 EXPORT_IPV6_MOD(ipv4_specific); 2447 2448 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) 2449 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { 2450 #ifdef CONFIG_TCP_MD5SIG 2451 .md5_lookup = tcp_v4_md5_lookup, 2452 .calc_md5_hash = tcp_v4_md5_hash_skb, 2453 .md5_parse = tcp_v4_parse_md5_keys, 2454 #endif 2455 #ifdef CONFIG_TCP_AO 2456 .ao_lookup = tcp_v4_ao_lookup, 2457 .calc_ao_hash = tcp_v4_ao_hash_skb, 2458 .ao_parse = tcp_v4_parse_ao, 2459 .ao_calc_key_sk = tcp_v4_ao_calc_key_sk, 2460 #endif 2461 }; 2462 2463 static void tcp4_destruct_sock(struct sock *sk) 2464 { 2465 tcp_md5_destruct_sock(sk); 2466 tcp_ao_destroy_sock(sk, false); 2467 inet_sock_destruct(sk); 2468 } 2469 #endif 2470 2471 /* NOTE: A lot of things set to zero explicitly by call to 2472 * sk_alloc() so need not be done here. 
2473 */ 2474 static int tcp_v4_init_sock(struct sock *sk) 2475 { 2476 struct inet_connection_sock *icsk = inet_csk(sk); 2477 2478 tcp_init_sock(sk); 2479 2480 icsk->icsk_af_ops = &ipv4_specific; 2481 2482 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) 2483 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific; 2484 sk->sk_destruct = tcp4_destruct_sock; 2485 #endif 2486 2487 return 0; 2488 } 2489 2490 static void tcp_release_user_frags(struct sock *sk) 2491 { 2492 #ifdef CONFIG_PAGE_POOL 2493 unsigned long index; 2494 void *netmem; 2495 2496 xa_for_each(&sk->sk_user_frags, index, netmem) 2497 WARN_ON_ONCE(!napi_pp_put_page((__force netmem_ref)netmem)); 2498 #endif 2499 } 2500 2501 void tcp_v4_destroy_sock(struct sock *sk) 2502 { 2503 struct tcp_sock *tp = tcp_sk(sk); 2504 2505 tcp_release_user_frags(sk); 2506 2507 xa_destroy(&sk->sk_user_frags); 2508 2509 trace_tcp_destroy_sock(sk); 2510 2511 tcp_clear_xmit_timers(sk); 2512 2513 tcp_cleanup_congestion_control(sk); 2514 2515 tcp_cleanup_ulp(sk); 2516 2517 /* Cleanup up the write buffer. */ 2518 tcp_write_queue_purge(sk); 2519 2520 /* Check if we want to disable active TFO */ 2521 tcp_fastopen_active_disable_ofo_check(sk); 2522 2523 /* Cleans up our, hopefully empty, out_of_order_queue. */ 2524 skb_rbtree_purge(&tp->out_of_order_queue); 2525 2526 /* Clean up a referenced TCP bind bucket. */ 2527 if (inet_csk(sk)->icsk_bind_hash) 2528 inet_put_port(sk); 2529 2530 BUG_ON(rcu_access_pointer(tp->fastopen_rsk)); 2531 2532 /* If socket is aborted during connect operation */ 2533 tcp_free_fastopen_req(tp); 2534 tcp_fastopen_destroy_cipher(sk); 2535 tcp_saved_syn_free(tp); 2536 2537 sk_sockets_allocated_dec(sk); 2538 } 2539 EXPORT_IPV6_MOD(tcp_v4_destroy_sock); 2540 2541 #ifdef CONFIG_PROC_FS 2542 /* Proc filesystem TCP sock list dumping. */ 2543 2544 static unsigned short seq_file_family(const struct seq_file *seq); 2545 2546 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk) 2547 { 2548 unsigned short family = seq_file_family(seq); 2549 2550 /* AF_UNSPEC is used as a match all */ 2551 return ((family == AF_UNSPEC || family == sk->sk_family) && 2552 net_eq(sock_net(sk), seq_file_net(seq))); 2553 } 2554 2555 /* Find a non empty bucket (starting from st->bucket) 2556 * and return the first sk from it. 2557 */ 2558 static void *listening_get_first(struct seq_file *seq) 2559 { 2560 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2561 struct tcp_iter_state *st = seq->private; 2562 2563 st->offset = 0; 2564 for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) { 2565 struct inet_listen_hashbucket *ilb2; 2566 struct hlist_nulls_node *node; 2567 struct sock *sk; 2568 2569 ilb2 = &hinfo->lhash2[st->bucket]; 2570 if (hlist_nulls_empty(&ilb2->nulls_head)) 2571 continue; 2572 2573 spin_lock(&ilb2->lock); 2574 sk_nulls_for_each(sk, node, &ilb2->nulls_head) { 2575 if (seq_sk_match(seq, sk)) 2576 return sk; 2577 } 2578 spin_unlock(&ilb2->lock); 2579 } 2580 2581 return NULL; 2582 } 2583 2584 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket). 2585 * If "cur" is the last one in the st->bucket, 2586 * call listening_get_first() to return the first sk of the next 2587 * non empty bucket. 
2588 */ 2589 static void *listening_get_next(struct seq_file *seq, void *cur) 2590 { 2591 struct tcp_iter_state *st = seq->private; 2592 struct inet_listen_hashbucket *ilb2; 2593 struct hlist_nulls_node *node; 2594 struct inet_hashinfo *hinfo; 2595 struct sock *sk = cur; 2596 2597 ++st->num; 2598 ++st->offset; 2599 2600 sk = sk_nulls_next(sk); 2601 sk_nulls_for_each_from(sk, node) { 2602 if (seq_sk_match(seq, sk)) 2603 return sk; 2604 } 2605 2606 hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2607 ilb2 = &hinfo->lhash2[st->bucket]; 2608 spin_unlock(&ilb2->lock); 2609 ++st->bucket; 2610 return listening_get_first(seq); 2611 } 2612 2613 static void *listening_get_idx(struct seq_file *seq, loff_t *pos) 2614 { 2615 struct tcp_iter_state *st = seq->private; 2616 void *rc; 2617 2618 st->bucket = 0; 2619 st->offset = 0; 2620 rc = listening_get_first(seq); 2621 2622 while (rc && *pos) { 2623 rc = listening_get_next(seq, rc); 2624 --*pos; 2625 } 2626 return rc; 2627 } 2628 2629 static inline bool empty_bucket(struct inet_hashinfo *hinfo, 2630 const struct tcp_iter_state *st) 2631 { 2632 return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain); 2633 } 2634 2635 /* 2636 * Get first established socket starting from bucket given in st->bucket. 2637 * If st->bucket is zero, the very first socket in the hash is returned. 2638 */ 2639 static void *established_get_first(struct seq_file *seq) 2640 { 2641 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2642 struct tcp_iter_state *st = seq->private; 2643 2644 st->offset = 0; 2645 for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) { 2646 struct sock *sk; 2647 struct hlist_nulls_node *node; 2648 spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket); 2649 2650 cond_resched(); 2651 2652 /* Lockless fast path for the common case of empty buckets */ 2653 if (empty_bucket(hinfo, st)) 2654 continue; 2655 2656 spin_lock_bh(lock); 2657 sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) { 2658 if (seq_sk_match(seq, sk)) 2659 return sk; 2660 } 2661 spin_unlock_bh(lock); 2662 } 2663 2664 return NULL; 2665 } 2666 2667 static void *established_get_next(struct seq_file *seq, void *cur) 2668 { 2669 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2670 struct tcp_iter_state *st = seq->private; 2671 struct hlist_nulls_node *node; 2672 struct sock *sk = cur; 2673 2674 ++st->num; 2675 ++st->offset; 2676 2677 sk = sk_nulls_next(sk); 2678 2679 sk_nulls_for_each_from(sk, node) { 2680 if (seq_sk_match(seq, sk)) 2681 return sk; 2682 } 2683 2684 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 2685 ++st->bucket; 2686 return established_get_first(seq); 2687 } 2688 2689 static void *established_get_idx(struct seq_file *seq, loff_t pos) 2690 { 2691 struct tcp_iter_state *st = seq->private; 2692 void *rc; 2693 2694 st->bucket = 0; 2695 rc = established_get_first(seq); 2696 2697 while (rc && pos) { 2698 rc = established_get_next(seq, rc); 2699 --pos; 2700 } 2701 return rc; 2702 } 2703 2704 static void *tcp_get_idx(struct seq_file *seq, loff_t pos) 2705 { 2706 void *rc; 2707 struct tcp_iter_state *st = seq->private; 2708 2709 st->state = TCP_SEQ_STATE_LISTENING; 2710 rc = listening_get_idx(seq, &pos); 2711 2712 if (!rc) { 2713 st->state = TCP_SEQ_STATE_ESTABLISHED; 2714 rc = established_get_idx(seq, pos); 2715 } 2716 2717 return rc; 2718 } 2719 2720 static void *tcp_seek_last_pos(struct seq_file *seq) 2721 { 2722 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2723 struct 
tcp_iter_state *st = seq->private; 2724 int bucket = st->bucket; 2725 int offset = st->offset; 2726 int orig_num = st->num; 2727 void *rc = NULL; 2728 2729 switch (st->state) { 2730 case TCP_SEQ_STATE_LISTENING: 2731 if (st->bucket > hinfo->lhash2_mask) 2732 break; 2733 rc = listening_get_first(seq); 2734 while (offset-- && rc && bucket == st->bucket) 2735 rc = listening_get_next(seq, rc); 2736 if (rc) 2737 break; 2738 st->bucket = 0; 2739 st->state = TCP_SEQ_STATE_ESTABLISHED; 2740 fallthrough; 2741 case TCP_SEQ_STATE_ESTABLISHED: 2742 if (st->bucket > hinfo->ehash_mask) 2743 break; 2744 rc = established_get_first(seq); 2745 while (offset-- && rc && bucket == st->bucket) 2746 rc = established_get_next(seq, rc); 2747 } 2748 2749 st->num = orig_num; 2750 2751 return rc; 2752 } 2753 2754 void *tcp_seq_start(struct seq_file *seq, loff_t *pos) 2755 { 2756 struct tcp_iter_state *st = seq->private; 2757 void *rc; 2758 2759 if (*pos && *pos == st->last_pos) { 2760 rc = tcp_seek_last_pos(seq); 2761 if (rc) 2762 goto out; 2763 } 2764 2765 st->state = TCP_SEQ_STATE_LISTENING; 2766 st->num = 0; 2767 st->bucket = 0; 2768 st->offset = 0; 2769 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; 2770 2771 out: 2772 st->last_pos = *pos; 2773 return rc; 2774 } 2775 EXPORT_IPV6_MOD(tcp_seq_start); 2776 2777 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2778 { 2779 struct tcp_iter_state *st = seq->private; 2780 void *rc = NULL; 2781 2782 if (v == SEQ_START_TOKEN) { 2783 rc = tcp_get_idx(seq, 0); 2784 goto out; 2785 } 2786 2787 switch (st->state) { 2788 case TCP_SEQ_STATE_LISTENING: 2789 rc = listening_get_next(seq, v); 2790 if (!rc) { 2791 st->state = TCP_SEQ_STATE_ESTABLISHED; 2792 st->bucket = 0; 2793 st->offset = 0; 2794 rc = established_get_first(seq); 2795 } 2796 break; 2797 case TCP_SEQ_STATE_ESTABLISHED: 2798 rc = established_get_next(seq, v); 2799 break; 2800 } 2801 out: 2802 ++*pos; 2803 st->last_pos = *pos; 2804 return rc; 2805 } 2806 EXPORT_IPV6_MOD(tcp_seq_next); 2807 2808 void tcp_seq_stop(struct seq_file *seq, void *v) 2809 { 2810 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2811 struct tcp_iter_state *st = seq->private; 2812 2813 switch (st->state) { 2814 case TCP_SEQ_STATE_LISTENING: 2815 if (v != SEQ_START_TOKEN) 2816 spin_unlock(&hinfo->lhash2[st->bucket].lock); 2817 break; 2818 case TCP_SEQ_STATE_ESTABLISHED: 2819 if (v) 2820 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 2821 break; 2822 } 2823 } 2824 EXPORT_IPV6_MOD(tcp_seq_stop); 2825 2826 static void get_openreq4(const struct request_sock *req, 2827 struct seq_file *f, int i) 2828 { 2829 const struct inet_request_sock *ireq = inet_rsk(req); 2830 long delta = req->rsk_timer.expires - jiffies; 2831 2832 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2833 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK", 2834 i, 2835 ireq->ir_loc_addr, 2836 ireq->ir_num, 2837 ireq->ir_rmt_addr, 2838 ntohs(ireq->ir_rmt_port), 2839 TCP_SYN_RECV, 2840 0, 0, /* could print option size, but that is af dependent. 
*/ 2841 1, /* timers active (only the expire timer) */ 2842 jiffies_delta_to_clock_t(delta), 2843 req->num_timeout, 2844 from_kuid_munged(seq_user_ns(f), 2845 sk_uid(req->rsk_listener)), 2846 0, /* non standard timer */ 2847 0, /* open_requests have no inode */ 2848 0, 2849 req); 2850 } 2851 2852 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) 2853 { 2854 int timer_active; 2855 unsigned long timer_expires; 2856 const struct tcp_sock *tp = tcp_sk(sk); 2857 const struct inet_connection_sock *icsk = inet_csk(sk); 2858 const struct inet_sock *inet = inet_sk(sk); 2859 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; 2860 __be32 dest = inet->inet_daddr; 2861 __be32 src = inet->inet_rcv_saddr; 2862 __u16 destp = ntohs(inet->inet_dport); 2863 __u16 srcp = ntohs(inet->inet_sport); 2864 u8 icsk_pending; 2865 int rx_queue; 2866 int state; 2867 2868 icsk_pending = smp_load_acquire(&icsk->icsk_pending); 2869 if (icsk_pending == ICSK_TIME_RETRANS || 2870 icsk_pending == ICSK_TIME_REO_TIMEOUT || 2871 icsk_pending == ICSK_TIME_LOSS_PROBE) { 2872 timer_active = 1; 2873 timer_expires = icsk_timeout(icsk); 2874 } else if (icsk_pending == ICSK_TIME_PROBE0) { 2875 timer_active = 4; 2876 timer_expires = icsk_timeout(icsk); 2877 } else if (timer_pending(&sk->sk_timer)) { 2878 timer_active = 2; 2879 timer_expires = sk->sk_timer.expires; 2880 } else { 2881 timer_active = 0; 2882 timer_expires = jiffies; 2883 } 2884 2885 state = inet_sk_state_load(sk); 2886 if (state == TCP_LISTEN) 2887 rx_queue = READ_ONCE(sk->sk_ack_backlog); 2888 else 2889 /* Because we don't lock the socket, 2890 * we might find a transient negative value. 2891 */ 2892 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) - 2893 READ_ONCE(tp->copied_seq), 0); 2894 2895 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " 2896 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d", 2897 i, src, srcp, dest, destp, state, 2898 READ_ONCE(tp->write_seq) - tp->snd_una, 2899 rx_queue, 2900 timer_active, 2901 jiffies_delta_to_clock_t(timer_expires - jiffies), 2902 READ_ONCE(icsk->icsk_retransmits), 2903 from_kuid_munged(seq_user_ns(f), sk_uid(sk)), 2904 READ_ONCE(icsk->icsk_probes_out), 2905 sock_i_ino(sk), 2906 refcount_read(&sk->sk_refcnt), sk, 2907 jiffies_to_clock_t(icsk->icsk_rto), 2908 jiffies_to_clock_t(icsk->icsk_ack.ato), 2909 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk), 2910 tcp_snd_cwnd(tp), 2911 state == TCP_LISTEN ? 2912 fastopenq->max_qlen : 2913 (tcp_in_initial_slowstart(tp) ? 
-1 : tp->snd_ssthresh)); 2914 } 2915 2916 static void get_timewait4_sock(const struct inet_timewait_sock *tw, 2917 struct seq_file *f, int i) 2918 { 2919 long delta = tw->tw_timer.expires - jiffies; 2920 __be32 dest, src; 2921 __u16 destp, srcp; 2922 2923 dest = tw->tw_daddr; 2924 src = tw->tw_rcv_saddr; 2925 destp = ntohs(tw->tw_dport); 2926 srcp = ntohs(tw->tw_sport); 2927 2928 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2929 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK", 2930 i, src, srcp, dest, destp, READ_ONCE(tw->tw_substate), 0, 0, 2931 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, 2932 refcount_read(&tw->tw_refcnt), tw); 2933 } 2934 2935 #define TMPSZ 150 2936 2937 static int tcp4_seq_show(struct seq_file *seq, void *v) 2938 { 2939 struct tcp_iter_state *st; 2940 struct sock *sk = v; 2941 2942 seq_setwidth(seq, TMPSZ - 1); 2943 if (v == SEQ_START_TOKEN) { 2944 seq_puts(seq, " sl local_address rem_address st tx_queue " 2945 "rx_queue tr tm->when retrnsmt uid timeout " 2946 "inode"); 2947 goto out; 2948 } 2949 st = seq->private; 2950 2951 if (sk->sk_state == TCP_TIME_WAIT) 2952 get_timewait4_sock(v, seq, st->num); 2953 else if (sk->sk_state == TCP_NEW_SYN_RECV) 2954 get_openreq4(v, seq, st->num); 2955 else 2956 get_tcp4_sock(v, seq, st->num); 2957 out: 2958 seq_pad(seq, '\n'); 2959 return 0; 2960 } 2961 2962 #ifdef CONFIG_BPF_SYSCALL 2963 union bpf_tcp_iter_batch_item { 2964 struct sock *sk; 2965 __u64 cookie; 2966 }; 2967 2968 struct bpf_tcp_iter_state { 2969 struct tcp_iter_state state; 2970 unsigned int cur_sk; 2971 unsigned int end_sk; 2972 unsigned int max_sk; 2973 union bpf_tcp_iter_batch_item *batch; 2974 }; 2975 2976 struct bpf_iter__tcp { 2977 __bpf_md_ptr(struct bpf_iter_meta *, meta); 2978 __bpf_md_ptr(struct sock_common *, sk_common); 2979 uid_t uid __aligned(8); 2980 }; 2981 2982 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, 2983 struct sock_common *sk_common, uid_t uid) 2984 { 2985 struct bpf_iter__tcp ctx; 2986 2987 meta->seq_num--; /* skip SEQ_START_TOKEN */ 2988 ctx.meta = meta; 2989 ctx.sk_common = sk_common; 2990 ctx.uid = uid; 2991 return bpf_iter_run_prog(prog, &ctx); 2992 } 2993 2994 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter) 2995 { 2996 union bpf_tcp_iter_batch_item *item; 2997 unsigned int cur_sk = iter->cur_sk; 2998 __u64 cookie; 2999 3000 /* Remember the cookies of the sockets we haven't seen yet, so we can 3001 * pick up where we left off next time around. 
3002 */ 3003 while (cur_sk < iter->end_sk) { 3004 item = &iter->batch[cur_sk++]; 3005 cookie = sock_gen_cookie(item->sk); 3006 sock_gen_put(item->sk); 3007 item->cookie = cookie; 3008 } 3009 } 3010 3011 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter, 3012 unsigned int new_batch_sz, gfp_t flags) 3013 { 3014 union bpf_tcp_iter_batch_item *new_batch; 3015 3016 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, 3017 flags | __GFP_NOWARN); 3018 if (!new_batch) 3019 return -ENOMEM; 3020 3021 memcpy(new_batch, iter->batch, sizeof(*iter->batch) * iter->end_sk); 3022 kvfree(iter->batch); 3023 iter->batch = new_batch; 3024 iter->max_sk = new_batch_sz; 3025 3026 return 0; 3027 } 3028 3029 static struct sock *bpf_iter_tcp_resume_bucket(struct sock *first_sk, 3030 union bpf_tcp_iter_batch_item *cookies, 3031 int n_cookies) 3032 { 3033 struct hlist_nulls_node *node; 3034 struct sock *sk; 3035 int i; 3036 3037 for (i = 0; i < n_cookies; i++) { 3038 sk = first_sk; 3039 sk_nulls_for_each_from(sk, node) 3040 if (cookies[i].cookie == atomic64_read(&sk->sk_cookie)) 3041 return sk; 3042 } 3043 3044 return NULL; 3045 } 3046 3047 static struct sock *bpf_iter_tcp_resume_listening(struct seq_file *seq) 3048 { 3049 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 3050 struct bpf_tcp_iter_state *iter = seq->private; 3051 struct tcp_iter_state *st = &iter->state; 3052 unsigned int find_cookie = iter->cur_sk; 3053 unsigned int end_cookie = iter->end_sk; 3054 int resume_bucket = st->bucket; 3055 struct sock *sk; 3056 3057 if (end_cookie && find_cookie == end_cookie) 3058 ++st->bucket; 3059 3060 sk = listening_get_first(seq); 3061 iter->cur_sk = 0; 3062 iter->end_sk = 0; 3063 3064 if (sk && st->bucket == resume_bucket && end_cookie) { 3065 sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie], 3066 end_cookie - find_cookie); 3067 if (!sk) { 3068 spin_unlock(&hinfo->lhash2[st->bucket].lock); 3069 ++st->bucket; 3070 sk = listening_get_first(seq); 3071 } 3072 } 3073 3074 return sk; 3075 } 3076 3077 static struct sock *bpf_iter_tcp_resume_established(struct seq_file *seq) 3078 { 3079 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 3080 struct bpf_tcp_iter_state *iter = seq->private; 3081 struct tcp_iter_state *st = &iter->state; 3082 unsigned int find_cookie = iter->cur_sk; 3083 unsigned int end_cookie = iter->end_sk; 3084 int resume_bucket = st->bucket; 3085 struct sock *sk; 3086 3087 if (end_cookie && find_cookie == end_cookie) 3088 ++st->bucket; 3089 3090 sk = established_get_first(seq); 3091 iter->cur_sk = 0; 3092 iter->end_sk = 0; 3093 3094 if (sk && st->bucket == resume_bucket && end_cookie) { 3095 sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie], 3096 end_cookie - find_cookie); 3097 if (!sk) { 3098 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 3099 ++st->bucket; 3100 sk = established_get_first(seq); 3101 } 3102 } 3103 3104 return sk; 3105 } 3106 3107 static struct sock *bpf_iter_tcp_resume(struct seq_file *seq) 3108 { 3109 struct bpf_tcp_iter_state *iter = seq->private; 3110 struct tcp_iter_state *st = &iter->state; 3111 struct sock *sk = NULL; 3112 3113 switch (st->state) { 3114 case TCP_SEQ_STATE_LISTENING: 3115 sk = bpf_iter_tcp_resume_listening(seq); 3116 if (sk) 3117 break; 3118 st->bucket = 0; 3119 st->state = TCP_SEQ_STATE_ESTABLISHED; 3120 fallthrough; 3121 case TCP_SEQ_STATE_ESTABLISHED: 3122 sk = bpf_iter_tcp_resume_established(seq); 3123 break; 3124 } 3125 3126 return sk; 3127 } 3128 3129 
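/* Batching overview (a summary of the helpers below, for orientation):
 * while holding the bucket lock taken by listening_get_first() /
 * established_get_first(), the two *_batch() helpers grab a reference on
 * every socket in the bucket that matches the iterator's netns/family and
 * store it in iter->batch[], up to iter->max_sk entries, returning how many
 * sockets the bucket actually holds.  bpf_iter_tcp_batch() compares that
 * count with what fit: if the array was too small it drops the lock,
 * releases the partial batch, reallocates a larger array and retries; on a
 * second shortfall it grows the array in place (GFP_NOWAIT) while still
 * holding the lock so the bucket cannot change underneath it.  When
 * iteration stops early, the unvisited entries are converted to socket
 * cookies (bpf_iter_tcp_put_batch()) so bpf_iter_tcp_resume_*() can re-find
 * the old position even if some sockets have since been unhashed.
 */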
static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq, 3130 struct sock **start_sk) 3131 { 3132 struct bpf_tcp_iter_state *iter = seq->private; 3133 struct hlist_nulls_node *node; 3134 unsigned int expected = 1; 3135 struct sock *sk; 3136 3137 sock_hold(*start_sk); 3138 iter->batch[iter->end_sk++].sk = *start_sk; 3139 3140 sk = sk_nulls_next(*start_sk); 3141 *start_sk = NULL; 3142 sk_nulls_for_each_from(sk, node) { 3143 if (seq_sk_match(seq, sk)) { 3144 if (iter->end_sk < iter->max_sk) { 3145 sock_hold(sk); 3146 iter->batch[iter->end_sk++].sk = sk; 3147 } else if (!*start_sk) { 3148 /* Remember where we left off. */ 3149 *start_sk = sk; 3150 } 3151 expected++; 3152 } 3153 } 3154 3155 return expected; 3156 } 3157 3158 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq, 3159 struct sock **start_sk) 3160 { 3161 struct bpf_tcp_iter_state *iter = seq->private; 3162 struct hlist_nulls_node *node; 3163 unsigned int expected = 1; 3164 struct sock *sk; 3165 3166 sock_hold(*start_sk); 3167 iter->batch[iter->end_sk++].sk = *start_sk; 3168 3169 sk = sk_nulls_next(*start_sk); 3170 *start_sk = NULL; 3171 sk_nulls_for_each_from(sk, node) { 3172 if (seq_sk_match(seq, sk)) { 3173 if (iter->end_sk < iter->max_sk) { 3174 sock_hold(sk); 3175 iter->batch[iter->end_sk++].sk = sk; 3176 } else if (!*start_sk) { 3177 /* Remember where we left off. */ 3178 *start_sk = sk; 3179 } 3180 expected++; 3181 } 3182 } 3183 3184 return expected; 3185 } 3186 3187 static unsigned int bpf_iter_fill_batch(struct seq_file *seq, 3188 struct sock **start_sk) 3189 { 3190 struct bpf_tcp_iter_state *iter = seq->private; 3191 struct tcp_iter_state *st = &iter->state; 3192 3193 if (st->state == TCP_SEQ_STATE_LISTENING) 3194 return bpf_iter_tcp_listening_batch(seq, start_sk); 3195 else 3196 return bpf_iter_tcp_established_batch(seq, start_sk); 3197 } 3198 3199 static void bpf_iter_tcp_unlock_bucket(struct seq_file *seq) 3200 { 3201 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 3202 struct bpf_tcp_iter_state *iter = seq->private; 3203 struct tcp_iter_state *st = &iter->state; 3204 3205 if (st->state == TCP_SEQ_STATE_LISTENING) 3206 spin_unlock(&hinfo->lhash2[st->bucket].lock); 3207 else 3208 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 3209 } 3210 3211 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq) 3212 { 3213 struct bpf_tcp_iter_state *iter = seq->private; 3214 unsigned int expected; 3215 struct sock *sk; 3216 int err; 3217 3218 sk = bpf_iter_tcp_resume(seq); 3219 if (!sk) 3220 return NULL; /* Done */ 3221 3222 expected = bpf_iter_fill_batch(seq, &sk); 3223 if (likely(iter->end_sk == expected)) 3224 goto done; 3225 3226 /* Batch size was too small. */ 3227 bpf_iter_tcp_unlock_bucket(seq); 3228 bpf_iter_tcp_put_batch(iter); 3229 err = bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2, 3230 GFP_USER); 3231 if (err) 3232 return ERR_PTR(err); 3233 3234 sk = bpf_iter_tcp_resume(seq); 3235 if (!sk) 3236 return NULL; /* Done */ 3237 3238 expected = bpf_iter_fill_batch(seq, &sk); 3239 if (likely(iter->end_sk == expected)) 3240 goto done; 3241 3242 /* Batch size was still too small. Hold onto the lock while we try 3243 * again with a larger batch to make sure the current bucket's size 3244 * does not change in the meantime. 
3245 */ 3246 err = bpf_iter_tcp_realloc_batch(iter, expected, GFP_NOWAIT); 3247 if (err) { 3248 bpf_iter_tcp_unlock_bucket(seq); 3249 return ERR_PTR(err); 3250 } 3251 3252 expected = bpf_iter_fill_batch(seq, &sk); 3253 WARN_ON_ONCE(iter->end_sk != expected); 3254 done: 3255 bpf_iter_tcp_unlock_bucket(seq); 3256 return iter->batch[0].sk; 3257 } 3258 3259 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos) 3260 { 3261 /* bpf iter does not support lseek, so it always 3262 * continues from where it was stop()-ped. 3263 */ 3264 if (*pos) 3265 return bpf_iter_tcp_batch(seq); 3266 3267 return SEQ_START_TOKEN; 3268 } 3269 3270 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3271 { 3272 struct bpf_tcp_iter_state *iter = seq->private; 3273 struct tcp_iter_state *st = &iter->state; 3274 struct sock *sk; 3275 3276 /* Whenever seq_next() is called, the sk at iter->cur_sk is 3277 * done with seq_show(), so advance to the next sk in 3278 * the batch. 3279 */ 3280 if (iter->cur_sk < iter->end_sk) { 3281 /* Keeping st->num consistent in tcp_iter_state. 3282 * bpf_iter_tcp does not use st->num. 3283 * meta.seq_num is used instead. 3284 */ 3285 st->num++; 3286 sock_gen_put(iter->batch[iter->cur_sk++].sk); 3287 } 3288 3289 if (iter->cur_sk < iter->end_sk) 3290 sk = iter->batch[iter->cur_sk].sk; 3291 else 3292 sk = bpf_iter_tcp_batch(seq); 3293 3294 ++*pos; 3295 /* Keeping st->last_pos consistent in tcp_iter_state. 3296 * bpf iter does not do lseek, so st->last_pos always equals *pos. 3297 */ 3298 st->last_pos = *pos; 3299 return sk; 3300 } 3301 3302 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v) 3303 { 3304 struct bpf_iter_meta meta; 3305 struct bpf_prog *prog; 3306 struct sock *sk = v; 3307 uid_t uid; 3308 int ret; 3309 3310 if (v == SEQ_START_TOKEN) 3311 return 0; 3312 3313 if (sk_fullsock(sk)) 3314 lock_sock(sk); 3315 3316 if (unlikely(sk_unhashed(sk))) { 3317 ret = SEQ_SKIP; 3318 goto unlock; 3319 } 3320 3321 if (sk->sk_state == TCP_TIME_WAIT) { 3322 uid = 0; 3323 } else if (sk->sk_state == TCP_NEW_SYN_RECV) { 3324 const struct request_sock *req = v; 3325 3326 uid = from_kuid_munged(seq_user_ns(seq), 3327 sk_uid(req->rsk_listener)); 3328 } else { 3329 uid = from_kuid_munged(seq_user_ns(seq), sk_uid(sk)); 3330 } 3331 3332 meta.seq = seq; 3333 prog = bpf_iter_get_info(&meta, false); 3334 ret = tcp_prog_seq_show(prog, &meta, v, uid); 3335 3336 unlock: 3337 if (sk_fullsock(sk)) 3338 release_sock(sk); 3339 return ret; 3340 3341 } 3342 3343 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v) 3344 { 3345 struct bpf_tcp_iter_state *iter = seq->private; 3346 struct bpf_iter_meta meta; 3347 struct bpf_prog *prog; 3348 3349 if (!v) { 3350 meta.seq = seq; 3351 prog = bpf_iter_get_info(&meta, true); 3352 if (prog) 3353 (void)tcp_prog_seq_show(prog, &meta, v, 0); 3354 } 3355 3356 if (iter->cur_sk < iter->end_sk) 3357 bpf_iter_tcp_put_batch(iter); 3358 } 3359 3360 static const struct seq_operations bpf_iter_tcp_seq_ops = { 3361 .show = bpf_iter_tcp_seq_show, 3362 .start = bpf_iter_tcp_seq_start, 3363 .next = bpf_iter_tcp_seq_next, 3364 .stop = bpf_iter_tcp_seq_stop, 3365 }; 3366 #endif 3367 static unsigned short seq_file_family(const struct seq_file *seq) 3368 { 3369 const struct tcp_seq_afinfo *afinfo; 3370 3371 #ifdef CONFIG_BPF_SYSCALL 3372 /* Iterated from bpf_iter. Let the bpf prog do the filtering instead.
*/ 3373 if (seq->op == &bpf_iter_tcp_seq_ops) 3374 return AF_UNSPEC; 3375 #endif 3376 3377 /* Iterated from proc fs */ 3378 afinfo = pde_data(file_inode(seq->file)); 3379 return afinfo->family; 3380 } 3381 3382 static const struct seq_operations tcp4_seq_ops = { 3383 .show = tcp4_seq_show, 3384 .start = tcp_seq_start, 3385 .next = tcp_seq_next, 3386 .stop = tcp_seq_stop, 3387 }; 3388 3389 static struct tcp_seq_afinfo tcp4_seq_afinfo = { 3390 .family = AF_INET, 3391 }; 3392 3393 static int __net_init tcp4_proc_init_net(struct net *net) 3394 { 3395 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops, 3396 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo)) 3397 return -ENOMEM; 3398 return 0; 3399 } 3400 3401 static void __net_exit tcp4_proc_exit_net(struct net *net) 3402 { 3403 remove_proc_entry("tcp", net->proc_net); 3404 } 3405 3406 static struct pernet_operations tcp4_net_ops = { 3407 .init = tcp4_proc_init_net, 3408 .exit = tcp4_proc_exit_net, 3409 }; 3410 3411 int __init tcp4_proc_init(void) 3412 { 3413 return register_pernet_subsys(&tcp4_net_ops); 3414 } 3415 3416 void tcp4_proc_exit(void) 3417 { 3418 unregister_pernet_subsys(&tcp4_net_ops); 3419 } 3420 #endif /* CONFIG_PROC_FS */ 3421 3422 /* @wake is one when sk_stream_write_space() calls us. 3423 * This sends EPOLLOUT only if notsent_bytes is half the limit. 3424 * This mimics the strategy used in sock_def_write_space(). 3425 */ 3426 bool tcp_stream_memory_free(const struct sock *sk, int wake) 3427 { 3428 const struct tcp_sock *tp = tcp_sk(sk); 3429 u32 notsent_bytes = READ_ONCE(tp->write_seq) - 3430 READ_ONCE(tp->snd_nxt); 3431 3432 return (notsent_bytes << wake) < tcp_notsent_lowat(tp); 3433 } 3434 EXPORT_SYMBOL(tcp_stream_memory_free); 3435 3436 struct proto tcp_prot = { 3437 .name = "TCP", 3438 .owner = THIS_MODULE, 3439 .close = tcp_close, 3440 .pre_connect = tcp_v4_pre_connect, 3441 .connect = tcp_v4_connect, 3442 .disconnect = tcp_disconnect, 3443 .accept = inet_csk_accept, 3444 .ioctl = tcp_ioctl, 3445 .init = tcp_v4_init_sock, 3446 .destroy = tcp_v4_destroy_sock, 3447 .shutdown = tcp_shutdown, 3448 .setsockopt = tcp_setsockopt, 3449 .getsockopt = tcp_getsockopt, 3450 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt, 3451 .keepalive = tcp_set_keepalive, 3452 .recvmsg = tcp_recvmsg, 3453 .sendmsg = tcp_sendmsg, 3454 .splice_eof = tcp_splice_eof, 3455 .backlog_rcv = tcp_v4_do_rcv, 3456 .release_cb = tcp_release_cb, 3457 .hash = inet_hash, 3458 .unhash = inet_unhash, 3459 .get_port = inet_csk_get_port, 3460 .put_port = inet_put_port, 3461 #ifdef CONFIG_BPF_SYSCALL 3462 .psock_update_sk_prot = tcp_bpf_update_proto, 3463 #endif 3464 .enter_memory_pressure = tcp_enter_memory_pressure, 3465 .leave_memory_pressure = tcp_leave_memory_pressure, 3466 .stream_memory_free = tcp_stream_memory_free, 3467 .sockets_allocated = &tcp_sockets_allocated, 3468 3469 .memory_allocated = &net_aligned_data.tcp_memory_allocated, 3470 .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, 3471 3472 .memory_pressure = &tcp_memory_pressure, 3473 .sysctl_mem = sysctl_tcp_mem, 3474 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), 3475 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), 3476 .max_header = MAX_TCP_HEADER, 3477 .obj_size = sizeof(struct tcp_sock), 3478 .slab_flags = SLAB_TYPESAFE_BY_RCU, 3479 .twsk_prot = &tcp_timewait_sock_ops, 3480 .rsk_prot = &tcp_request_sock_ops, 3481 .h.hashinfo = NULL, 3482 .no_autobind = true, 3483 .diag_destroy = tcp_abort, 3484 }; 3485 EXPORT_SYMBOL(tcp_prot); 3486 3487 
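/* Per-netns setup below: tcp_set_hashinfo() lets a child netns own a private
 * established hash sized by the parent's sysctl_tcp_child_ehash_entries
 * (rounded up to a power of two); if the sysctl is zero or the allocation
 * fails, the netns falls back to the global tcp_hashinfo.  The TIME-WAIT and
 * SYN-backlog limits are then derived from the chosen size.
 *
 * Worked example (illustrative only; the userspace knob name is assumed to
 * match the field): if the parent namespace sets the child-ehash sysctl to
 * 16384 before creating the netns, the child gets a 16384-bucket ehash,
 * sysctl_max_tw_buckets = 16384 / 2 = 8192 and
 * sysctl_max_syn_backlog = max(128, 16384 / 128) = 128.
 */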
static void __net_exit tcp_sk_exit(struct net *net) 3488 { 3489 if (net->ipv4.tcp_congestion_control) 3490 bpf_module_put(net->ipv4.tcp_congestion_control, 3491 net->ipv4.tcp_congestion_control->owner); 3492 } 3493 3494 static void __net_init tcp_set_hashinfo(struct net *net) 3495 { 3496 struct inet_hashinfo *hinfo; 3497 unsigned int ehash_entries; 3498 struct net *old_net; 3499 3500 if (net_eq(net, &init_net)) 3501 goto fallback; 3502 3503 old_net = current->nsproxy->net_ns; 3504 ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries); 3505 if (!ehash_entries) 3506 goto fallback; 3507 3508 ehash_entries = roundup_pow_of_two(ehash_entries); 3509 hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries); 3510 if (!hinfo) { 3511 pr_warn("Failed to allocate TCP ehash (entries: %u) " 3512 "for a netns, fallback to the global one\n", 3513 ehash_entries); 3514 fallback: 3515 hinfo = &tcp_hashinfo; 3516 ehash_entries = tcp_hashinfo.ehash_mask + 1; 3517 } 3518 3519 net->ipv4.tcp_death_row.hashinfo = hinfo; 3520 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2; 3521 net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128); 3522 } 3523 3524 static int __net_init tcp_sk_init(struct net *net) 3525 { 3526 net->ipv4.sysctl_tcp_ecn = TCP_ECN_IN_ECN_OUT_NOECN; 3527 net->ipv4.sysctl_tcp_ecn_option = TCP_ACCECN_OPTION_FULL; 3528 net->ipv4.sysctl_tcp_ecn_option_beacon = TCP_ACCECN_OPTION_BEACON; 3529 net->ipv4.sysctl_tcp_ecn_fallback = 1; 3530 3531 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; 3532 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS; 3533 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; 3534 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL; 3535 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS; 3536 3537 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME; 3538 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES; 3539 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL; 3540 3541 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES; 3542 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES; 3543 net->ipv4.sysctl_tcp_syncookies = 1; 3544 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH; 3545 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1; 3546 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2; 3547 net->ipv4.sysctl_tcp_orphan_retries = 0; 3548 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; 3549 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; 3550 net->ipv4.sysctl_tcp_tw_reuse = 2; 3551 net->ipv4.sysctl_tcp_tw_reuse_delay = 1 * MSEC_PER_SEC; 3552 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1; 3553 3554 refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1); 3555 tcp_set_hashinfo(net); 3556 3557 net->ipv4.sysctl_tcp_sack = 1; 3558 net->ipv4.sysctl_tcp_window_scaling = 1; 3559 net->ipv4.sysctl_tcp_timestamps = 1; 3560 net->ipv4.sysctl_tcp_early_retrans = 3; 3561 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION; 3562 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */ 3563 net->ipv4.sysctl_tcp_retrans_collapse = 1; 3564 net->ipv4.sysctl_tcp_max_reordering = 300; 3565 net->ipv4.sysctl_tcp_dsack = 1; 3566 net->ipv4.sysctl_tcp_app_win = 31; 3567 net->ipv4.sysctl_tcp_adv_win_scale = 1; 3568 net->ipv4.sysctl_tcp_frto = 2; 3569 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1; 3570 /* This limits the percentage of the congestion window which we 3571 * will allow a single TSO frame to consume. Building TSO frames 3572 * which are too large can cause TCP streams to be bursty. 
3573 */ 3574 net->ipv4.sysctl_tcp_tso_win_divisor = 3; 3575 /* Default TSQ limit of 4 MB */ 3576 net->ipv4.sysctl_tcp_limit_output_bytes = 4 << 20; 3577 3578 /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */ 3579 net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX; 3580 3581 net->ipv4.sysctl_tcp_min_tso_segs = 2; 3582 net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */ 3583 net->ipv4.sysctl_tcp_min_rtt_wlen = 300; 3584 net->ipv4.sysctl_tcp_autocorking = 1; 3585 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; 3586 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; 3587 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120; 3588 if (net != &init_net) { 3589 memcpy(net->ipv4.sysctl_tcp_rmem, 3590 init_net.ipv4.sysctl_tcp_rmem, 3591 sizeof(init_net.ipv4.sysctl_tcp_rmem)); 3592 memcpy(net->ipv4.sysctl_tcp_wmem, 3593 init_net.ipv4.sysctl_tcp_wmem, 3594 sizeof(init_net.ipv4.sysctl_tcp_wmem)); 3595 } 3596 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC; 3597 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC; 3598 net->ipv4.sysctl_tcp_comp_sack_nr = 44; 3599 net->ipv4.sysctl_tcp_backlog_ack_defer = 1; 3600 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; 3601 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0; 3602 atomic_set(&net->ipv4.tfo_active_disable_times, 0); 3603 3604 /* Set default values for PLB */ 3605 net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */ 3606 net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3; 3607 net->ipv4.sysctl_tcp_plb_rehash_rounds = 12; 3608 net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60; 3609 /* Default congestion threshold for PLB to mark a round is 50% */ 3610 net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2; 3611 3612 /* Reno is always built in */ 3613 if (!net_eq(net, &init_net) && 3614 bpf_try_module_get(init_net.ipv4.tcp_congestion_control, 3615 init_net.ipv4.tcp_congestion_control->owner)) 3616 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control; 3617 else 3618 net->ipv4.tcp_congestion_control = &tcp_reno; 3619 3620 net->ipv4.sysctl_tcp_syn_linear_timeouts = 4; 3621 net->ipv4.sysctl_tcp_shrink_window = 0; 3622 3623 net->ipv4.sysctl_tcp_pingpong_thresh = 1; 3624 net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN); 3625 net->ipv4.sysctl_tcp_rto_max_ms = TCP_RTO_MAX_SEC * MSEC_PER_SEC; 3626 3627 return 0; 3628 } 3629 3630 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) 3631 { 3632 struct net *net; 3633 3634 /* make sure concurrent calls to tcp_sk_exit_batch from net_cleanup_work 3635 * and failed setup_net error unwinding path are serialized. 3636 * 3637 * tcp_twsk_purge() handles twsk in any dead netns, not just those in 3638 * net_exit_list, the thread that dismantles a particular twsk must 3639 * do so without other thread progressing to refcount_dec_and_test() of 3640 * tcp_death_row.tw_refcount. 
3641 */ 3642 mutex_lock(&tcp_exit_batch_mutex); 3643 3644 tcp_twsk_purge(net_exit_list); 3645 3646 list_for_each_entry(net, net_exit_list, exit_list) { 3647 inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo); 3648 WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount)); 3649 tcp_fastopen_ctx_destroy(net); 3650 } 3651 3652 mutex_unlock(&tcp_exit_batch_mutex); 3653 } 3654 3655 static struct pernet_operations __net_initdata tcp_sk_ops = { 3656 .init = tcp_sk_init, 3657 .exit = tcp_sk_exit, 3658 .exit_batch = tcp_sk_exit_batch, 3659 }; 3660 3661 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3662 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta, 3663 struct sock_common *sk_common, uid_t uid) 3664 3665 #define INIT_BATCH_SZ 16 3666 3667 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux) 3668 { 3669 struct bpf_tcp_iter_state *iter = priv_data; 3670 int err; 3671 3672 err = bpf_iter_init_seq_net(priv_data, aux); 3673 if (err) 3674 return err; 3675 3676 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ, GFP_USER); 3677 if (err) { 3678 bpf_iter_fini_seq_net(priv_data); 3679 return err; 3680 } 3681 3682 return 0; 3683 } 3684 3685 static void bpf_iter_fini_tcp(void *priv_data) 3686 { 3687 struct bpf_tcp_iter_state *iter = priv_data; 3688 3689 bpf_iter_fini_seq_net(priv_data); 3690 kvfree(iter->batch); 3691 } 3692 3693 static const struct bpf_iter_seq_info tcp_seq_info = { 3694 .seq_ops = &bpf_iter_tcp_seq_ops, 3695 .init_seq_private = bpf_iter_init_tcp, 3696 .fini_seq_private = bpf_iter_fini_tcp, 3697 .seq_priv_size = sizeof(struct bpf_tcp_iter_state), 3698 }; 3699 3700 static const struct bpf_func_proto * 3701 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id, 3702 const struct bpf_prog *prog) 3703 { 3704 switch (func_id) { 3705 case BPF_FUNC_setsockopt: 3706 return &bpf_sk_setsockopt_proto; 3707 case BPF_FUNC_getsockopt: 3708 return &bpf_sk_getsockopt_proto; 3709 default: 3710 return NULL; 3711 } 3712 } 3713 3714 static struct bpf_iter_reg tcp_reg_info = { 3715 .target = "tcp", 3716 .ctx_arg_info_size = 1, 3717 .ctx_arg_info = { 3718 { offsetof(struct bpf_iter__tcp, sk_common), 3719 PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED }, 3720 }, 3721 .get_func_proto = bpf_iter_tcp_get_func_proto, 3722 .seq_info = &tcp_seq_info, 3723 }; 3724 3725 static void __init bpf_iter_register(void) 3726 { 3727 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON]; 3728 if (bpf_iter_reg_target(&tcp_reg_info)) 3729 pr_warn("Warning: could not register bpf iterator tcp\n"); 3730 } 3731 3732 #endif 3733 3734 void __init tcp_v4_init(void) 3735 { 3736 int cpu, res; 3737 3738 for_each_possible_cpu(cpu) { 3739 struct sock *sk; 3740 3741 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW, 3742 IPPROTO_TCP, &init_net); 3743 if (res) 3744 panic("Failed to create the TCP control socket.\n"); 3745 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); 3746 3747 /* Please enforce IP_DF and IPID==0 for RST and 3748 * ACK sent in SYN-RECV and TIME-WAIT state. 3749 */ 3750 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO; 3751 3752 sk->sk_clockid = CLOCK_MONOTONIC; 3753 3754 per_cpu(ipv4_tcp_sk.sock, cpu) = sk; 3755 } 3756 if (register_pernet_subsys(&tcp_sk_ops)) 3757 panic("Failed to create the TCP control socket.\n"); 3758 3759 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3760 bpf_iter_register(); 3761 #endif 3762 } 3763
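/* Illustrative sketch (not part of this file): a minimal BPF program for the
 * "tcp" iterator target registered by bpf_iter_register() above.  It assumes
 * the usual libbpf helper macros (SEC(), BPF_SEQ_PRINTF()) and relies only on
 * the bpf_iter__tcp context layout defined earlier in this file; the fields
 * printed are just an example.
 *
 *	SEC("iter/tcp")
 *	int dump_tcp(struct bpf_iter__tcp *ctx)
 *	{
 *		struct sock_common *skc = ctx->sk_common;
 *
 *		if (!skc)
 *			return 0;
 *		BPF_SEQ_PRINTF(ctx->meta->seq, "family=%u state=%u uid=%u\n",
 *			       skc->skc_family, skc->skc_state, ctx->uid);
 *		return 0;
 *	}
 *
 * Once the program is attached and pinned (e.g. with "bpftool iter pin"),
 * reading the pinned file walks the listening and established hash tables
 * through the batching code above, invoking the program once per socket.
 */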