1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * INET An implementation of the TCP/IP protocol suite for the LINUX 4 * operating system. INET is implemented using the BSD Socket 5 * interface as the means of communication with the user level. 6 * 7 * Implementation of the Transmission Control Protocol(TCP). 8 * 9 * IPv4 specific functions 10 * 11 * code split from: 12 * linux/ipv4/tcp.c 13 * linux/ipv4/tcp_input.c 14 * linux/ipv4/tcp_output.c 15 * 16 * See tcp.c for author information 17 */ 18 19 /* 20 * Changes: 21 * David S. Miller : New socket lookup architecture. 22 * This code is dedicated to John Dyson. 23 * David S. Miller : Change semantics of established hash, 24 * half is devoted to TIME_WAIT sockets 25 * and the rest go in the other half. 26 * Andi Kleen : Add support for syncookies and fixed 27 * some bugs: ip options weren't passed to 28 * the TCP layer, missed a check for an 29 * ACK bit. 30 * Andi Kleen : Implemented fast path mtu discovery. 31 * Fixed many serious bugs in the 32 * request_sock handling and moved 33 * most of it into the af independent code. 34 * Added tail drop and some other bugfixes. 35 * Added new listen semantics. 36 * Mike McLagan : Routing by source 37 * Juan Jose Ciarlante: ip_dynaddr bits 38 * Andi Kleen: various fixes. 39 * Vitaly E. Lavrov : Transparent proxy revived after year 40 * coma. 41 * Andi Kleen : Fix new listen. 42 * Andi Kleen : Fix accept error reporting. 43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which 44 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind 45 * a single port at the same time. 
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/fips.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sock_diag.h>

#include <net/aligned_data.h>
#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/tcp_ecn.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/inet_ecn.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>
#include <net/rstreason.h>
#include <net/psp.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/inetdevice.h>
#include <linux/btf_ids.h>
#include <linux/skbuff_ref.h>

#include <crypto/md5.h>

#include <trace/events/tcp.h>

#ifdef CONFIG_TCP_MD5SIG
/* Forward declaration; defined later in this file. */
static void tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
				__be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;

/* Per-CPU control socket state used to send replies (RST/ACK) without a
 * full socket context; bh_lock serializes its use from BH context.
 */
static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = {
	.bh_lock = INIT_LOCAL_LOCK(bh_lock),
};

static DEFINE_MUTEX(tcp_exit_batch_mutex);

/* Derive the initial sequence number for this connection from the
 * 4-tuple carried in @skb (secure hash, see secure_tcp_seq()).
 */
static u32 tcp_v4_init_seq(const struct sk_buff *skb)
{
	return secure_tcp_seq(ip_hdr(skb)->daddr,
			      ip_hdr(skb)->saddr,
			      tcp_hdr(skb)->dest,
			      tcp_hdr(skb)->source);
}

/* Per-connection timestamp offset, derived from the address pair. */
static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
{
	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
}

/* Decide whether the TIME-WAIT socket @sktw may be reused by a new
 * connection attempt from @sk.  Returns 1 to allow reuse (taking a
 * reference on @sktw and seeding @sk's write_seq and timestamp state
 * from the timewait bucket), 0 otherwise.
 */
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
	const struct inet_timewait_sock *tw = inet_twsk(sktw);
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);
	int ts_recent_stamp;
	u32 reuse_thresh;

	/* Never reuse a socket still in FIN_WAIT2 substate. */
	if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2)
		reuse = 0;

	if (reuse == 2) {
		/* Still does not detect *everything* that goes through
		 * lo, since we require a loopback src or dst address
		 * or direct binding to 'lo' interface.
		 */
		bool loopback = false;
		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
			loopback = true;
#if IS_ENABLED(CONFIG_IPV6)
		if (tw->tw_family == AF_INET6) {
			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
				loopback = true;
		} else
#endif
		{
			if (ipv4_is_loopback(tw->tw_daddr) ||
			    ipv4_is_loopback(tw->tw_rcv_saddr))
				loopback = true;
		}
		if (!loopback)
			reuse = 0;
	}

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp);
	reuse_thresh = READ_ONCE(tw->tw_entry_stamp) +
		       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse_delay);
	if (ts_recent_stamp &&
	    (!twp || (reuse && time_after32(tcp_clock_ms(), reuse_thresh)))) {
		/* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk
		 * and releasing the bucket lock.
		 */
		if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
			return 0;

		/* In case of repair and re-using TIME-WAIT sockets we still
		 * want to be sure that it is safe as above but honor the
		 * sequence numbers and time stamps set as part of the repair
		 * process.
		 *
		 * Without this check re-using a TIME-WAIT socket with TCP
		 * repair would accumulate a -1 on the repair assigned
		 * sequence number. The first time it is reused the sequence
		 * is -1, the second time -2, etc. This fixes that issue
		 * without appearing to create any others.
		 */
		if (likely(!tp->repair)) {
			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;

			/* write_seq of 0 means "not set"; avoid it. */
			if (!seq)
				seq = 1;
			WRITE_ONCE(tp->write_seq, seq);
			tp->rx_opt.ts_recent = READ_ONCE(tcptw->tw_ts_recent);
			tp->rx_opt.ts_recent_stamp = ts_recent_stamp;
		}

		return 1;
	}

	return 0;
}
EXPORT_IPV6_MOD_GPL(tcp_twsk_unique);

/* Early address-length check plus BPF connect hook; runs before
 * tcp_v4_connect() with the socket lock held.
 */
static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
			      int addr_len)
{
	/* This check is replicated from tcp_v4_connect() and intended to
	 * prevent BPF program called below from accessing bytes that are out
	 * of the bound specified by user in addr_len.
	 */
	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	sock_owned_by_me(sk);

	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
}

/* This will initiate an outgoing connection.
 */
int tcp_v4_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_timewait_death_row *tcp_death_row;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct ip_options_rcu *inet_opt;
	struct net *net = sock_net(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	if (inet_opt && inet_opt->opt.srr) {
		/* Source routing: route towards the first hop instead. */
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
			      orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	/* TCP cannot connect to multicast/broadcast destinations. */
	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

	if (!inet->inet_saddr) {
		/* Source address not bound yet: adopt the routed one. */
		err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET);
		if (err) {
			ip_rt_put(rt);
			return err;
		}
	} else {
		sk_rcv_saddr_set(sk, inet->inet_saddr);
	}

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			WRITE_ONCE(tp->write_seq, 0);
	}

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = psp_sk_overhead(sk);
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len += inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and not releasing socket
	 * lock select source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	/* Re-validate the route now that the chosen source port is known. */
	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst);
	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	rt = NULL;

	if (likely(!tp->repair)) {
		if (!tp->write_seq)
			WRITE_ONCE(tp->write_seq,
				   secure_tcp_seq(inet->inet_saddr,
						  inet->inet_daddr,
						  inet->inet_sport,
						  usin->sin_port));
		WRITE_ONCE(tp->tsoffset,
			   secure_tcp_ts_off(net, inet->inet_saddr,
					     inet->inet_daddr));
	}

	atomic_set(&inet->inet_id, get_random_u16());

	/* Fast Open may defer the actual SYN until sendmsg(). */
	if (tcp_fastopen_defer_connect(sk, &err))
		return err;
	if (err)
		goto failure;

	err = tcp_connect(sk);

	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	inet_bhash2_reset_saddr(sk);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_IPV6_MOD(tcp_v4_connect);

/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct dst_entry *dst;
	u32 mtu, dmtu;

	/* Nothing to do for listening or closed sockets. */
	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
		return;
	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to be wrong... Remember soft error
	 * for the case, if this connection will not able to recover.
	 */
	dmtu = dst4_mtu(dst);
	if (mtu < dmtu && ip_dont_fragment(sk, dst))
		WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > dmtu) {
		tcp_sync_mss(sk, dmtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_IPV6_MOD(tcp_v4_mtu_reduced);

/* Apply an ICMP redirect to the socket's cached route, if still valid. */
static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}


/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);

	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	 */
	if (seq != tcp_rsk(req)->snt_isn) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
	} else if (abort) {
		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		tcp_listendrop(req->rsk_listener);
	}
	reqsk_put(req);
}
EXPORT_IPV6_MOD(tcp_req_err);

/* TCP-LD (RFC 6069) logic: revert one step of RTO backoff when an ICMP
 * unreachable suggests the retransmissions were lost to a transient
 * routing problem rather than congestion.
 */
void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	s32 remaining;
	u32 delta_us;

	if (sock_owned_by_user(sk))
		return;

	/* Only act on ICMP for the oldest unacked segment while we are
	 * actually retransmitting with non-zero backoff.
	 */
	if (seq != tp->snd_una || !icsk->icsk_retransmits ||
	    !icsk->icsk_backoff)
		return;

	skb = tcp_rtx_queue_head(sk);
	if (WARN_ON_ONCE(!skb))
		return;

	icsk->icsk_backoff--;
	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
	icsk->icsk_rto = inet_csk_rto_backoff(icsk, tcp_rto_max(sk));

	tcp_mstamp_refresh(tp);
	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);

	if (remaining > 0) {
		tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, remaining, false);
	} else {
		/* RTO revert clocked out retransmission.
		 * Will retransmit now.
		 */
		tcp_retransmit_timer(sk);
	}
}
EXPORT_IPV6_MOD(tcp_ld_RTO_revert);

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

int tcp_v4_err(struct sk_buff *skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct net *net = dev_net_rcu(skb->dev);
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct request_sock *fastopen;
	struct tcp_sock *tp;
	u32 seq, snd_una;
	struct sock *sk;
	int err;

	sk = __inet_lookup_established(net, iph->daddr, th->dest, iph->saddr,
				       ntohs(th->source), inet_iif(skb), 0);
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return -ENOENT;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		/* To increase the counter of ignored icmps for TCP-AO */
		tcp_ao_ignore_icmp(sk, AF_INET, type, code);
		inet_twsk_put(inet_twsk(sk));
		return 0;
	}
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		/* Abort the request socket only for errors that indicate
		 * the handshake cannot succeed.
		 */
		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
				     type == ICMP_TIME_EXCEEDED ||
				     (type == ICMP_DEST_UNREACH &&
				      (code == ICMP_NET_UNREACH ||
				       code == ICMP_HOST_UNREACH)));
		return 0;
	}

	if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) {
		sock_put(sk);
		return 0;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (static_branch_unlikely(&ip4_min_ttl)) {
		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
			goto out;
		}
	}

	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = rcu_dereference(tp->fastopen_rsk);
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		if (!sock_owned_by_user(sk))
			do_redirect(skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs send out by Linux are always <576bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			WRITE_ONCE(tp->mtu_info, info);
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				/* Defer to tcp_release_cb(); hold a ref
				 * so the socket survives until then.
				 */
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if this ICMP message allows revert of backoff.
		 * (see RFC 6069)
		 */
		if (!fastopen &&
		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
			tcp_ld_RTO_revert(sk, seq);
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket is
		 * already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);

		if (!sock_owned_by_user(sk))
			tcp_done_with_error(sk, err);
		else
			WRITE_ONCE(sk->sk_err_soft, err);
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even this two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 * --ANK (980905)
	 */

	if (!sock_owned_by_user(sk) &&
	    inet_test_bit(RECVERR, sk)) {
		WRITE_ONCE(sk->sk_err, err);
		sk_error_report(sk);
	} else {	/* Only an error on timeout */
		WRITE_ONCE(sk->sk_err_soft, err);
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
	return 0;
}

/* Seed th->check with the pseudo-header sum and set up csum_start /
 * csum_offset so the device (or software fallback) can finish the
 * TCP checksum.
 */
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
	skb->csum_start = skb_transport_header(skb) - skb->head;
	skb->csum_offset = offsetof(struct tcphdr, check);
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_IPV6_MOD(tcp_v4_send_check);

#define REPLY_OPTIONS_LEN (MAX_TCP_OPTION_SPACE / sizeof(__be32))

/* Append and sign a TCP-AO option on an outgoing RST described by
 * @arg/@reply/@reply_options.  Returns true when the RST must be
 * dropped (no usable key or hashing failed), false on success.
 */
static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb,
				 const struct tcp_ao_hdr *aoh,
				 struct ip_reply_arg *arg, struct tcphdr *reply,
				 __be32 reply_options[REPLY_OPTIONS_LEN])
{
#ifdef CONFIG_TCP_AO
	int sdif = tcp_v4_sdif(skb);
	int dif = inet_iif(skb);
	int l3index = sdif ? dif : 0;
	bool allocated_traffic_key;
	struct tcp_ao_key *key;
	char *traffic_key;
	bool drop = true;
	u32 ao_sne = 0;
	u8 keyid;

	rcu_read_lock();
	if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq),
				 &key, &traffic_key, &allocated_traffic_key,
				 &keyid, &ao_sne))
		goto out;

	reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) |
				 (aoh->rnext_keyid << 8) | keyid);
	arg->iov[0].iov_len += tcp_ao_len_aligned(key);
	reply->doff = arg->iov[0].iov_len / 4;

	if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1],
			    key, traffic_key,
			    (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
			    (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
			    reply, ao_sne))
		goto out;
	drop = false;
out:
	rcu_read_unlock();
	if (allocated_traffic_key)
		kfree(traffic_key);
	return drop;
#else
	return true;
#endif
}

/*
 * This routine will send an RST to the other tcp.
 *
 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 * Answer: if a packet caused RST, it is not for a socket
 *	   existing in our system, if it is matched to a socket,
 *	   it is just duplicate segment or bug in other side's TCP.
 *	   So that we build reply only basing on parameters
 *	   arrived with segment.
 * Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
			      enum sk_rst_reason reason)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[REPLY_OPTIONS_LEN];
	} rep;
	const __u8 *md5_hash_location = NULL;
	const struct tcp_ao_hdr *aoh;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	unsigned char newhash[16];
	struct sock *sk1 = NULL;
#endif
	u64 transmit_time = 0;
	struct sock *ctl_sk;
	struct net *net;
	u32 txhash = 0;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest = th->source;
	rep.th.source = th->dest;
	rep.th.doff = sizeof(struct tcphdr) / 4;
	rep.th.rst = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		/* No ACK in the offending segment: ACK everything it
		 * consumed so the peer accepts this RST.
		 */
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len = sizeof(rep.th);

	net = sk ? sock_net(sk) : skb_dst_dev_net_rcu(skb);

	/* Invalid TCP option size or twice included auth */
	if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh))
		return;

	if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt))
		return;

#ifdef CONFIG_TCP_MD5SIG
	rcu_read_lock();
	if (sk && sk_fullsock(sk)) {
		const union tcp_md5_addr *addr;
		int l3index;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and inet_iif is set to it.
		 */
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	} else if (md5_hash_location) {
		const union tcp_md5_addr *addr;
		int sdif = tcp_v4_sdif(skb);
		int dif = inet_iif(skb);
		int l3index;

		/*
		 * active side is lost. Try to find listening socket through
		 * source port, and then find md5 key through listening socket.
		 * we are not loose security here:
		 * Incoming packet is checked with md5 hash with finding key,
		 * no RST generated if md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(net, NULL, 0, ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), dif, sdif);
		/* don't send rst if it can't find key */
		if (!sk1)
			goto out;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and dif is set to it.
		 */
		l3index = sdif ? dif : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
		if (!key)
			goto out;

		tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (memcmp(md5_hash_location, newhash, 16) != 0)
			goto out;
	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
	if (rep.opt[0] == 0) {
		__be32 mrst = mptcp_reset_option(skb);

		if (mrst) {
			rep.opt[0] = mrst;
			arg.iov[0].iov_len += sizeof(mrst);
			rep.th.doff = arg.iov[0].iov_len / 4;
		}
	}

	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
	if (sk)
		arg.bound_dev_if = sk->sk_bound_dev_if;

	trace_tcp_send_reset(sk, skb, reason);

	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

	/* ECN bits of TW reset are cleared */
	arg.tos = ip_hdr(skb)->tos & ~INET_ECN_MASK;
	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);

	sock_net_set(ctl_sk, net);
	if (sk) {
		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
		transmit_time = tcp_transmit_time(sk);
		xfrm_sk_clone_policy(ctl_sk, sk);
		txhash = (sk->sk_state == TCP_TIME_WAIT) ?
			 inet_twsk(sk)->tw_txhash : sk->sk_txhash;
	} else {
		ctl_sk->sk_mark = 0;
		ctl_sk->sk_priority = 0;
	}
	ip_send_unicast_reply(ctl_sk, sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time, txhash);

	xfrm_sk_free_policy(ctl_sk);
	sock_net_set(ctl_sk, &init_net);
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
	local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
	rcu_read_unlock();
#endif
}

/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
 */

static void tcp_v4_send_ack(const struct sock *sk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_key *key,
			    int reply_flags, u8 tos, u32 txhash)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(MAX_TCP_OPTION_SPACE >> 2)];
	} rep;
	struct net *net = sock_net(sk);
	struct ip_reply_arg arg;
	struct sock *ctl_sk;
	u64 transmit_time;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len = sizeof(rep.th);
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive.
*/ 964 rep.th.dest = th->source; 965 rep.th.source = th->dest; 966 rep.th.doff = arg.iov[0].iov_len / 4; 967 rep.th.seq = htonl(seq); 968 rep.th.ack_seq = htonl(ack); 969 rep.th.ack = 1; 970 rep.th.window = htons(win); 971 972 #ifdef CONFIG_TCP_MD5SIG 973 if (tcp_key_is_md5(key)) { 974 int offset = (tsecr) ? 3 : 0; 975 976 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) | 977 (TCPOPT_NOP << 16) | 978 (TCPOPT_MD5SIG << 8) | 979 TCPOLEN_MD5SIG); 980 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; 981 rep.th.doff = arg.iov[0].iov_len/4; 982 983 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset], 984 key->md5_key, ip_hdr(skb)->saddr, 985 ip_hdr(skb)->daddr, &rep.th); 986 } 987 #endif 988 #ifdef CONFIG_TCP_AO 989 if (tcp_key_is_ao(key)) { 990 int offset = (tsecr) ? 3 : 0; 991 992 rep.opt[offset++] = htonl((TCPOPT_AO << 24) | 993 (tcp_ao_len(key->ao_key) << 16) | 994 (key->ao_key->sndid << 8) | 995 key->rcv_next); 996 arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key); 997 rep.th.doff = arg.iov[0].iov_len / 4; 998 999 tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset], 1000 key->ao_key, key->traffic_key, 1001 (union tcp_ao_addr *)&ip_hdr(skb)->saddr, 1002 (union tcp_ao_addr *)&ip_hdr(skb)->daddr, 1003 &rep.th, key->sne); 1004 } 1005 #endif 1006 arg.flags = reply_flags; 1007 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 1008 ip_hdr(skb)->saddr, /* XXX */ 1009 arg.iov[0].iov_len, IPPROTO_TCP, 0); 1010 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 1011 if (oif) 1012 arg.bound_dev_if = oif; 1013 arg.tos = tos; 1014 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL); 1015 local_bh_disable(); 1016 local_lock_nested_bh(&ipv4_tcp_sk.bh_lock); 1017 ctl_sk = this_cpu_read(ipv4_tcp_sk.sock); 1018 sock_net_set(ctl_sk, net); 1019 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 1020 inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark); 1021 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? 
			inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
	transmit_time = tcp_transmit_time(sk);
	ip_send_unicast_reply(ctl_sk, sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time, txhash);

	sock_net_set(ctl_sk, &init_net);
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
	local_bh_enable();
}

/* Send an ACK for a segment received on a TIME_WAIT socket, signed with
 * the TCP-AO or MD5 key saved in the timewait socket when one exists.
 * Drops the timewait reference (inet_twsk_put()) on every exit path.
 */
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb,
				enum tcp_tw_status tw_status)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
	struct tcp_key key = {};
	u8 tos = tw->tw_tos;

	/* Cleaning only ECN bits of TW ACKs of oow data or is paws_reject,
	 * while not cleaning ECN bits of other TW ACKs to avoid these ACKs
	 * being placed in a different service queues (Classic rather than L4S)
	 */
	if (tw_status == TCP_TW_ACK_OOW)
		tos &= ~INET_ECN_MASK;

#ifdef CONFIG_TCP_AO
	struct tcp_ao_info *ao_info;

	if (static_branch_unlikely(&tcp_ao_needed.key)) {
		/* FIXME: the segment to-be-acked is not verified yet */
		ao_info = rcu_dereference(tcptw->ao_info);
		if (ao_info) {
			const struct tcp_ao_hdr *aoh;

			/* Malformed auth options: drop silently (tw ref released) */
			if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) {
				inet_twsk_put(tw);
				return;
			}

			if (aoh)
				key.ao_key = tcp_ao_established_key(sk, ao_info,
								    aoh->rnext_keyid, -1);
		}
	}
	if (key.ao_key) {
		struct tcp_ao_key *rnext_key;

		key.traffic_key = snd_other_key(key.ao_key);
		key.sne = READ_ONCE(ao_info->snd_sne);
		rnext_key = READ_ONCE(ao_info->rnext_key);
		key.rcv_next = rnext_key->rcvid;
		key.type = TCP_KEY_AO;
#else
	/* !CONFIG_TCP_AO: dead arm that keeps the brace structure of the
	 * if/else chain below balanced.
	 */
	if (0) {
#endif
	} else if (static_branch_tcp_md5()) {
		key.md5_key = tcp_twsk_md5_key(tcptw);
		if (key.md5_key)
			key.type = TCP_KEY_MD5;
	}

	tcp_v4_send_ack(sk, skb,
			tcptw->tw_snd_nxt, READ_ONCE(tcptw->tw_rcv_nxt),
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_tw_tsval(tcptw),
			READ_ONCE(tcptw->tw_ts_recent),
			tw->tw_bound_dev_if, &key,
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tos,
			tw->tw_txhash);

	inet_twsk_put(tw);
}

/* Send an ACK on behalf of a request socket, signed with TCP-AO or MD5
 * when a matching key exists for the peer.
 */
static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	struct tcp_key key = {};

	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
					     tcp_sk(sk)->snd_nxt;

#ifdef CONFIG_TCP_AO
	if (static_branch_unlikely(&tcp_ao_needed.key) &&
	    tcp_rsk_used_ao(req)) {
		const union tcp_md5_addr *addr;
		const struct tcp_ao_hdr *aoh;
		int l3index;

		/* Invalid TCP option size or twice included auth */
		if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
			return;
		if (!aoh)
			return;

		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET,
					      aoh->rnext_keyid, -1);
		if (unlikely(!key.ao_key)) {
			/* Send ACK with any matching MKT for the peer */
			key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1);
			/* Matching key disappeared (user removed the key?)
			 * let the handshake timeout.
			 */
			if (!key.ao_key) {
				net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n",
						     addr,
						     ntohs(tcp_hdr(skb)->source),
						     &ip_hdr(skb)->daddr,
						     ntohs(tcp_hdr(skb)->dest));
				return;
			}
		}
		/* Per-request traffic key; freed after the ACK is sent below */
		key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC);
		if (!key.traffic_key)
			return;

		key.type = TCP_KEY_AO;
		key.rcv_next = aoh->keyid;
		tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req);
#else
	/* !CONFIG_TCP_AO: dead arm keeping the if/else braces balanced */
	if (0) {
#endif
	} else if (static_branch_tcp_md5()) {
		const union tcp_md5_addr *addr;
		int l3index;

		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
		if (key.md5_key)
			key.type = TCP_KEY_MD5;
	}

	/* Cleaning ECN bits of TW ACKs of oow data or is paws_reject */
	tcp_v4_send_ack(sk, skb, seq,
			tcp_rsk(req)->rcv_nxt,
			tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale,
			tcp_rsk_tsval(tcp_rsk(req)),
			req->ts_recent,
			0, &key,
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos & ~INET_ECN_MASK,
			READ_ONCE(tcp_rsk(req)->txhash));
	if (tcp_key_is_ao(&key))
		kfree(key.traffic_key);
}

/*
 * Send a SYN-ACK after having received a SYN.
 * This still operates on a request_sock only, not on a big
 * socket.
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
			      struct flowi *fl,
			      struct request_sock *req,
			      struct tcp_fastopen_cookie *foc,
			      enum tcp_synack_type synack_type,
			      struct sk_buff *syn_skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;
	u8 tos;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);

	if (skb) {
		tcp_rsk(req)->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK;
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

		tos = READ_ONCE(inet_sk(sk)->tos);

		/* Optionally reflect the DSCP of the incoming SYN, keeping
		 * our own ECN bits.
		 */
		if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
			tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
			      (tos & INET_ECN_MASK);

		if (!INET_ECN_is_capable(tos) &&
		    tcp_bpf_ca_needs_ecn((struct sock *)req))
			tos |= INET_ECN_ECT_0;

		rcu_read_lock();
		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
					    rcu_dereference(ireq->ireq_opt),
					    tos);
		rcu_read_unlock();
		err = net_xmit_eval(err);
	}

	return err;
}

/*
 * IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
EXPORT_IPV6_MOD(tcp_md5_needed);

/* Preference between two keys that both match a peer address:
 * a key bound to an L3 device (l3index != 0) beats an unbound one;
 * otherwise the longer prefix wins.
 */
static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
{
	if (!old)
		return true;

	/* l3index always overrides non-l3index */
	if (old->l3index && new->l3index == 0)
		return false;
	if (old->l3index == 0 && new->l3index)
		return true;

	return old->prefixlen < new->prefixlen;
}

/* Find the Key structure for an address.  Returns the best (most specific)
 * match per better_md5_match(), or NULL.
 */
struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
					   const union tcp_md5_addr *addr,
					   int family, bool any_l3index)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	const struct tcp_md5sig_info *md5sig;
	__be32 mask;
	struct tcp_md5sig_key *best_match = NULL;
	bool match;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;

	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX &&
		    key->l3index != l3index)
			continue;
		if (family == AF_INET) {
			mask = inet_make_mask(key->prefixlen);
			match = (key->addr.a4.s_addr & mask) ==
				(addr->a4.s_addr & mask);
#if IS_ENABLED(CONFIG_IPV6)
		} else if (family == AF_INET6) {
			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
						  key->prefixlen);
#endif
		} else {
			match = false;
		}

		if (match && better_md5_match(best_match, key))
			best_match = key;
	}
	return best_match;
}
EXPORT_IPV6_MOD(__tcp_md5_do_lookup);

/* Exact-match lookup (family, flags, l3index, address, prefixlen must all
 * match), used by add/del to find a specific configured entry.
 */
static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
						      const union tcp_md5_addr *addr,
						      int family, u8 prefixlen,
						      int l3index, u8 flags)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	const struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
			continue;
		if (key->l3index != l3index)
			continue;
		if (!memcmp(&key->addr, addr, size) &&
		    key->prefixlen == prefixlen)
			return key;
	}
	return NULL;
}

/* Look up the MD5 key for the peer of @addr_sk, scoped to the L3 master
 * device @addr_sk is bound to (if any).
 */
struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
					 const struct sock *addr_sk)
{
	const union tcp_md5_addr *addr;
	int l3index;

	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
						 addr_sk->sk_bound_dev_if);
	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
}
EXPORT_IPV6_MOD(tcp_v4_md5_lookup);

/* Allocate and publish the per-socket MD5 key list head.
 * Also disables GSO on the socket before the list becomes visible.
 */
static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	md5sig = kmalloc(sizeof(*md5sig), gfp);
	if (!md5sig)
		return -ENOMEM;

	sk_gso_disable(sk);
	INIT_HLIST_HEAD(&md5sig->head);
	rcu_assign_pointer(tp->md5sig_info, md5sig);
	return 0;
}

/* This can be called on a newly created socket, from other files */
static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
			    int family, u8 prefixlen, int l3index, u8 flags,
			    const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
	if (key) {
		/* Pre-existing entry - just update that one.
		 * Note that the key might be used concurrently.
		 * data_race() is telling kcsan that we do not care of
		 * key mismatches, since changing MD5 key on live flows
		 * can lead to packet drops.
		 */
		data_race(memcpy(key->key, newkey, newkeylen));

		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
		 * Also note that a reader could catch new key->keylen value
		 * but old key->key[], this is the reason we use __GFP_ZERO
		 * at sock_kmalloc() time below these lines.
		 */
		WRITE_ONCE(key->keylen, newkeylen);

		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   lockdep_sock_is_held(sk));

	/* __GFP_ZERO: see the keylen/key[] ordering comment above */
	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
	if (!key)
		return -ENOMEM;

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	key->prefixlen = prefixlen;
	key->l3index = l3index;
	key->flags = flags;
	memcpy(&key->addr, addr,
	       (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
							     sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}

/* Add (or update) an MD5 key from process context.  On the first key of the
 * socket this allocates the key list and bumps the tcp_md5_needed static
 * branch; the allocation is rolled back if the static branch cannot be
 * incremented (-EUSERS).  MD5 is refused entirely under FIPS.
 */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, u8 prefixlen, int l3index, u8 flags,
		   const u8 *newkey, u8 newkeylen)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
		if (fips_enabled) {
			pr_warn_once("TCP-MD5 support is disabled due to FIPS\n");
			return -EOPNOTSUPP;
		}

		if (tcp_md5sig_info_add(sk, GFP_KERNEL))
			return -ENOMEM;

		if (!static_branch_inc(&tcp_md5_needed.key)) {
			struct tcp_md5sig_info *md5sig;

			/* Roll back the just-published (still empty) list */
			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
			rcu_assign_pointer(tp->md5sig_info, NULL);
			kfree_rcu(md5sig, rcu);
			return -EUSERS;
		}
	}

	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
				newkey, newkeylen, GFP_KERNEL);
}
EXPORT_IPV6_MOD(tcp_md5_do_add);

/* Copy an existing key onto another socket.  Atomic-context variant of
 * tcp_md5_do_add() (GFP_ATOMIC, non-sleeping static key increment), used
 * e.g. when inheriting keys onto a child socket in tcp_v4_syn_recv_sock().
 */
int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
		     int family, u8 prefixlen, int l3index,
		     struct tcp_md5sig_key *key)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {

		if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC)))
			return -ENOMEM;

		if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
			struct tcp_md5sig_info *md5sig;

			/* Roll back the just-published (still empty) list */
			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
			net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
			rcu_assign_pointer(tp->md5sig_info, NULL);
			kfree_rcu(md5sig, rcu);
			return -EUSERS;
		}
	}

	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
				key->flags, key->key, key->keylen,
				sk_gfp_mask(sk, GFP_ATOMIC));
}
EXPORT_IPV6_MOD(tcp_md5_key_copy);

/* Remove an exactly-matching key; RCU-deferred free so concurrent readers
 * of the hash list stay safe.
 */
int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
		   u8 prefixlen, int l3index, u8 flags)
{
	struct tcp_md5sig_key *key;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	return 0;
}
EXPORT_IPV6_MOD(tcp_md5_do_del);

/* Free every key on the socket.  The plain kfree() (not kfree_rcu()) and
 * the unconditional rcu_dereference_protected(..., 1) indicate no
 * concurrent readers can exist here — presumably socket teardown;
 * NOTE(review): confirm callers run after the socket is unreachable.
 */
void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree(key);
	}
}

/* setsockopt(TCP_MD5SIG / TCP_MD5SIG_EXT) handler: validate the user
 * request and add or (for a zero key length) delete the key.
 */
static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
				 sockptr_t optval, int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
	const union tcp_md5_addr *addr;
	u8 prefixlen = 32;
	int l3index = 0;
	bool l3flag;
	u8 flags;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
	l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;

	if (optname == TCP_MD5SIG_EXT &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
		prefixlen = cmd.tcpm_prefixlen;
		if (prefixlen > 32)
			return -EINVAL;
	}

	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
		if (dev && netif_is_l3_master(dev))
			l3index = dev->ifindex;

		rcu_read_unlock();

		/* ok to reference set/not set outside of rcu;
		 * right now device MUST be an L3 master
		 */
		if (!dev || !l3index)
			return -EINVAL;
	}

	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;

	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	/* Don't allow keys for peers that have a matching TCP-AO key.
	 * See the comment in tcp_ao_add_cmd()
	 */
	if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false))
		return -EKEYREJECTED;

	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
			      cmd.tcpm_key, cmd.tcpm_keylen);
}

/* Feed the IPv4 pseudo-header plus the TCP header (with its checksum
 * field zeroed) into the MD5 context, as RFC 2385 requires.
 */
static void tcp_v4_md5_hash_headers(struct md5_ctx *ctx,
				    __be32 daddr, __be32 saddr,
				    const struct tcphdr *th, int nbytes)
{
	struct {
		struct tcp4_pseudohdr ip;
		struct tcphdr tcp;
	} h;

	h.ip.saddr = saddr;
	h.ip.daddr = daddr;
	h.ip.pad = 0;
	h.ip.protocol = IPPROTO_TCP;
	h.ip.len = cpu_to_be16(nbytes);
	h.tcp = *th;
	h.tcp.check = 0;
	md5_update(ctx, (const u8 *)&h, sizeof(h.ip) + sizeof(h.tcp));
}

/* MD5 signature over pseudo-header + TCP header + key only (no payload);
 * used for stack-built replies that carry a bare header.
 */
static noinline_for_stack void
tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
		    __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct md5_ctx ctx;

	md5_init(&ctx);
	tcp_v4_md5_hash_headers(&ctx, daddr, saddr, th, th->doff << 2);
	tcp_md5_hash_key(&ctx, key);
	md5_final(&ctx, md5_hash);
}

/* MD5 signature over pseudo-header + TCP header + skb payload + key.
 * Addresses come from the socket when one is given, else from the IP
 * header of the skb itself.
 */
noinline_for_stack void
tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
		    const struct sock *sk, const struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;
	struct md5_ctx ctx;

	if (sk) { /* valid for establish/request sockets */
		saddr = sk->sk_rcv_saddr;
		daddr = sk->sk_daddr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	md5_init(&ctx);
	tcp_v4_md5_hash_headers(&ctx, daddr, saddr, th, skb->len);
	tcp_md5_hash_skb_data(&ctx, skb, th->doff << 2);
	tcp_md5_hash_key(&ctx, key);
	md5_final(&ctx, md5_hash);
}
EXPORT_IPV6_MOD(tcp_v4_md5_hash_skb);

#endif

/* Fill the IPv4 fields of a request sock from the incoming SYN: mirror
 * the addresses and stash the received IP options.
 */
static void tcp_v4_init_req(struct request_sock *req,
			    const struct sock *sk_listener,
			    struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	struct net *net = sock_net(sk_listener);

	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
}

/* Initialize the request sock and route it; the LSM hook may veto the
 * connection (NULL return).
 */
static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
					  struct sk_buff *skb,
					  struct flowi *fl,
					  struct request_sock *req,
					  u32 tw_isn)
{
	tcp_v4_init_req(req, sk, skb);

	if (security_inet_conn_request(sk, skb, req))
		return NULL;

	return inet_csk_route_req(sk, &fl->u.ip4, req);
}

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		= PF_INET,
	.obj_size	= sizeof(struct tcp_request_sock),
	.send_ack	= tcp_v4_reqsk_send_ack,
	.destructor	= tcp_v4_reqsk_destructor,
	.send_reset	= tcp_v4_send_reset,
};

/* AF_INET-specific request sock operations used by tcp_conn_request() */
const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	= TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.req_md5_lookup	= tcp_v4_md5_lookup,
	.calc_md5_hash	= tcp_v4_md5_hash_skb,
#endif
#ifdef CONFIG_TCP_AO
	.ao_lookup	= tcp_v4_ao_lookup_rsk,
	.ao_calc_key	= tcp_v4_ao_calc_key_rsk,
	.ao_synack_hash	= tcp_v4_ao_synack_hash,
#endif
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq = cookie_v4_init_sequence,
#endif
	.route_req	= tcp_v4_route_req,
	.init_seq	= tcp_v4_init_seq,
	.init_ts_off	= tcp_v4_init_ts_off,
	.send_synack	= tcp_v4_send_synack,
};

/* Entry point for an incoming SYN on an IPv4 listener */
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	/* Never answer to SYNs send to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&tcp_request_sock_ops,
				&tcp_request_sock_ipv4_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0;
}
EXPORT_IPV6_MOD(tcp_v4_conn_request);


/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req)
{
	struct inet_request_sock *ireq;
	bool found_dup_sk = false;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	const union tcp_md5_addr *addr;
	struct tcp_md5sig_key *key;
	int l3index;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp = tcp_sk(newsk);
	newinet = inet_sk(newsk);
	ireq = inet_rsk(req);
	inet_opt = rcu_dereference(ireq->ireq_opt);
	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
	newinet->mc_index = inet_iif(skb);
	newinet->mc_ttl = ip_hdr(skb)->ttl;
	newinet->rcv_tos = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	atomic_set(&newinet->inet_id, get_random_u16());

	/* Set ToS of the new socket based upon the value of incoming SYN.
	 * ECT bits are set later in tcp_init_transfer().
	 */
	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst4_mtu(dst));
	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
	/* Copy over the MD5 key from the original socket */
	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	if (key && !tcp_rsk_used_ao(req)) {
		if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
			goto put_and_exit;
		sk_gso_disable(newsk);
	}
#endif
#ifdef CONFIG_TCP_AO
	if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET))
		goto put_and_exit; /* OOM, release back memory */
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
				       &found_dup_sk);
	if (likely(*own_req)) {
		tcp_move_syn(newtp, req);
		ireq->ireq_opt = NULL;
	} else {
		newinet->inet_opt = NULL;

		if (!req_unhash && found_dup_sk) {
			/* This code path should only be executed in the
			 * syncookie case only
			 */
			bh_unlock_sock(newsk);
			sock_put(newsk);
			newsk = NULL;
		}
	}
	return newsk;

exit_overflow:
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	tcp_listendrop(sk);
	return NULL;
put_and_exit:
	newinet->inet_opt = NULL;
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_IPV6_MOD(tcp_v4_syn_recv_sock);

/* For non-SYN segments hitting a listener, try syncookie validation.
 * May return a freshly created full socket, the listener unchanged, or
 * NULL.  Pass-through when CONFIG_SYN_COOKIES is off.
 */
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	const struct tcphdr *th = tcp_hdr(skb);

	if (!th->syn)
		sk = cookie_v4_check(sk, skb);
#endif
	return sk;
}

/* Generate a syncookie ISN for the given SYN; returns the clamped MSS
 * (0 when syncookies are unavailable or no MSS could be derived).
 */
u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
			 struct tcphdr *th, u32 *cookie)
{
	u16 mss = 0;
#ifdef CONFIG_SYN_COOKIES
	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
				    &tcp_request_sock_ipv4_ops, sk, th);
	if (mss) {
		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
		tcp_synq_overflow(sk);
	}
#endif
	return mss;
}

INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
							   u32));
/* The socket must have it's spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
1854 */ 1855 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) 1856 { 1857 enum skb_drop_reason reason; 1858 struct sock *rsk; 1859 1860 reason = psp_sk_rx_policy_check(sk, skb); 1861 if (reason) 1862 goto err_discard; 1863 1864 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ 1865 struct dst_entry *dst; 1866 1867 dst = rcu_dereference_protected(sk->sk_rx_dst, 1868 lockdep_sock_is_held(sk)); 1869 1870 sock_rps_save_rxhash(sk, skb); 1871 sk_mark_napi_id(sk, skb); 1872 if (dst) { 1873 if (sk->sk_rx_dst_ifindex != skb->skb_iif || 1874 !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check, 1875 dst, 0)) { 1876 RCU_INIT_POINTER(sk->sk_rx_dst, NULL); 1877 dst_release(dst); 1878 } 1879 } 1880 tcp_rcv_established(sk, skb); 1881 return 0; 1882 } 1883 1884 if (tcp_checksum_complete(skb)) 1885 goto csum_err; 1886 1887 if (sk->sk_state == TCP_LISTEN) { 1888 struct sock *nsk = tcp_v4_cookie_check(sk, skb); 1889 1890 if (!nsk) 1891 return 0; 1892 if (nsk != sk) { 1893 reason = tcp_child_process(sk, nsk, skb); 1894 if (reason) { 1895 rsk = nsk; 1896 goto reset; 1897 } 1898 return 0; 1899 } 1900 } else 1901 sock_rps_save_rxhash(sk, skb); 1902 1903 reason = tcp_rcv_state_process(sk, skb); 1904 if (reason) { 1905 rsk = sk; 1906 goto reset; 1907 } 1908 return 0; 1909 1910 reset: 1911 tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason)); 1912 discard: 1913 sk_skb_reason_drop(sk, skb, reason); 1914 /* Be careful here. If this function gets more complicated and 1915 * gcc suffers from register pressure on the x86, sk (in %ebx) 1916 * might be destroyed here. This current version compiles correctly, 1917 * but you have been warned. 
1918 */ 1919 return 0; 1920 1921 csum_err: 1922 reason = SKB_DROP_REASON_TCP_CSUM; 1923 trace_tcp_bad_csum(skb); 1924 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 1925 err_discard: 1926 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 1927 goto discard; 1928 } 1929 EXPORT_SYMBOL(tcp_v4_do_rcv); 1930 1931 int tcp_v4_early_demux(struct sk_buff *skb) 1932 { 1933 struct net *net = dev_net_rcu(skb->dev); 1934 const struct iphdr *iph; 1935 const struct tcphdr *th; 1936 struct sock *sk; 1937 1938 if (skb->pkt_type != PACKET_HOST) 1939 return 0; 1940 1941 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) 1942 return 0; 1943 1944 iph = ip_hdr(skb); 1945 th = tcp_hdr(skb); 1946 1947 if (th->doff < sizeof(struct tcphdr) / 4) 1948 return 0; 1949 1950 sk = __inet_lookup_established(net, iph->saddr, th->source, 1951 iph->daddr, ntohs(th->dest), 1952 skb->skb_iif, inet_sdif(skb)); 1953 if (sk) { 1954 skb->sk = sk; 1955 skb->destructor = sock_edemux; 1956 if (sk_fullsock(sk)) { 1957 struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst); 1958 1959 if (dst) 1960 dst = dst_check(dst, 0); 1961 if (dst && 1962 sk->sk_rx_dst_ifindex == skb->skb_iif) 1963 skb_dst_set_noref(skb, dst); 1964 } 1965 } 1966 return 0; 1967 } 1968 1969 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, 1970 enum skb_drop_reason *reason) 1971 { 1972 u32 tail_gso_size, tail_gso_segs; 1973 struct skb_shared_info *shinfo; 1974 const struct tcphdr *th; 1975 struct tcphdr *thtail; 1976 struct sk_buff *tail; 1977 unsigned int hdrlen; 1978 bool fragstolen; 1979 u32 gso_segs; 1980 u32 gso_size; 1981 u64 limit; 1982 int delta; 1983 int err; 1984 1985 /* In case all data was pulled from skb frags (in __pskb_pull_tail()), 1986 * we can fix skb->truesize to its real value to avoid future drops. 1987 * This is valid because skb is not yet charged to the socket. 1988 * It has been noticed pure SACK packets were sometimes dropped 1989 * (if cooked by drivers without copybreak feature). 
1990 */ 1991 skb_condense(skb); 1992 1993 tcp_cleanup_skb(skb); 1994 1995 if (unlikely(tcp_checksum_complete(skb))) { 1996 bh_unlock_sock(sk); 1997 trace_tcp_bad_csum(skb); 1998 *reason = SKB_DROP_REASON_TCP_CSUM; 1999 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 2000 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 2001 return true; 2002 } 2003 2004 /* Attempt coalescing to last skb in backlog, even if we are 2005 * above the limits. 2006 * This is okay because skb capacity is limited to MAX_SKB_FRAGS. 2007 */ 2008 th = (const struct tcphdr *)skb->data; 2009 hdrlen = th->doff * 4; 2010 2011 tail = sk->sk_backlog.tail; 2012 if (!tail) 2013 goto no_coalesce; 2014 thtail = (struct tcphdr *)tail->data; 2015 2016 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq || 2017 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield || 2018 ((TCP_SKB_CB(tail)->tcp_flags | 2019 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) || 2020 !((TCP_SKB_CB(tail)->tcp_flags & 2021 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) || 2022 ((TCP_SKB_CB(tail)->tcp_flags ^ 2023 TCP_SKB_CB(skb)->tcp_flags) & 2024 (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE)) || 2025 !tcp_skb_can_collapse_rx(tail, skb) || 2026 thtail->doff != th->doff || 2027 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)) || 2028 /* prior to PSP Rx policy check, retain exact PSP metadata */ 2029 psp_skb_coalesce_diff(tail, skb)) 2030 goto no_coalesce; 2031 2032 __skb_pull(skb, hdrlen); 2033 2034 shinfo = skb_shinfo(skb); 2035 gso_size = shinfo->gso_size ?: skb->len; 2036 gso_segs = shinfo->gso_segs ?: 1; 2037 2038 shinfo = skb_shinfo(tail); 2039 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen); 2040 tail_gso_segs = shinfo->gso_segs ?: 1; 2041 2042 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) { 2043 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq; 2044 2045 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) { 2046 TCP_SKB_CB(tail)->ack_seq = 
TCP_SKB_CB(skb)->ack_seq; 2047 thtail->window = th->window; 2048 } 2049 2050 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and 2051 * thtail->fin, so that the fast path in tcp_rcv_established() 2052 * is not entered if we append a packet with a FIN. 2053 * SYN, RST, URG are not present. 2054 * ACK is set on both packets. 2055 * PSH : we do not really care in TCP stack, 2056 * at least for 'GRO' packets. 2057 */ 2058 thtail->fin |= th->fin; 2059 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; 2060 2061 if (TCP_SKB_CB(skb)->has_rxtstamp) { 2062 TCP_SKB_CB(tail)->has_rxtstamp = true; 2063 tail->tstamp = skb->tstamp; 2064 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp; 2065 } 2066 2067 /* Not as strict as GRO. We only need to carry mss max value */ 2068 shinfo->gso_size = max(gso_size, tail_gso_size); 2069 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF); 2070 2071 sk->sk_backlog.len += delta; 2072 __NET_INC_STATS(sock_net(sk), 2073 LINUX_MIB_TCPBACKLOGCOALESCE); 2074 kfree_skb_partial(skb, fragstolen); 2075 return false; 2076 } 2077 __skb_push(skb, hdrlen); 2078 2079 no_coalesce: 2080 /* sk->sk_backlog.len is reset only at the end of __release_sock(). 2081 * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach 2082 * sk_rcvbuf in normal conditions. 2083 */ 2084 limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1; 2085 2086 limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1; 2087 2088 /* Only socket owner can try to collapse/prune rx queues 2089 * to reduce memory overhead, so add a little headroom here. 2090 * Few sockets backlog are possibly concurrently non empty. 
2091 */ 2092 limit += 64 * 1024; 2093 2094 limit = min_t(u64, limit, UINT_MAX); 2095 2096 err = sk_add_backlog(sk, skb, limit); 2097 if (unlikely(err)) { 2098 bh_unlock_sock(sk); 2099 if (err == -ENOMEM) { 2100 *reason = SKB_DROP_REASON_PFMEMALLOC; 2101 __NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP); 2102 } else { 2103 *reason = SKB_DROP_REASON_SOCKET_BACKLOG; 2104 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP); 2105 } 2106 return true; 2107 } 2108 return false; 2109 } 2110 EXPORT_IPV6_MOD(tcp_add_backlog); 2111 2112 int tcp_filter(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason *reason) 2113 { 2114 struct tcphdr *th = (struct tcphdr *)skb->data; 2115 2116 return sk_filter_trim_cap(sk, skb, th->doff * 4, reason); 2117 } 2118 EXPORT_IPV6_MOD(tcp_filter); 2119 2120 static void tcp_v4_restore_cb(struct sk_buff *skb) 2121 { 2122 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4, 2123 sizeof(struct inet_skb_parm)); 2124 } 2125 2126 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, 2127 const struct tcphdr *th) 2128 { 2129 /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB() 2130 * barrier() makes sure compiler wont play fool^Waliasing games. 
2131 */ 2132 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb), 2133 sizeof(struct inet_skb_parm)); 2134 barrier(); 2135 2136 TCP_SKB_CB(skb)->seq = ntohl(th->seq); 2137 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + 2138 skb->len - th->doff * 4); 2139 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); 2140 TCP_SKB_CB(skb)->tcp_flags = tcp_flags_ntohs(th); 2141 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); 2142 TCP_SKB_CB(skb)->sacked = 0; 2143 TCP_SKB_CB(skb)->has_rxtstamp = 2144 skb->tstamp || skb_hwtstamps(skb)->hwtstamp; 2145 } 2146 2147 /* 2148 * From tcp_input.c 2149 */ 2150 2151 int tcp_v4_rcv(struct sk_buff *skb) 2152 { 2153 struct net *net = dev_net_rcu(skb->dev); 2154 enum skb_drop_reason drop_reason; 2155 enum tcp_tw_status tw_status; 2156 int sdif = inet_sdif(skb); 2157 int dif = inet_iif(skb); 2158 const struct iphdr *iph; 2159 const struct tcphdr *th; 2160 struct sock *sk = NULL; 2161 bool refcounted; 2162 int ret; 2163 u32 isn; 2164 2165 drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; 2166 if (skb->pkt_type != PACKET_HOST) 2167 goto discard_it; 2168 2169 /* Count it even if it's bad */ 2170 __TCP_INC_STATS(net, TCP_MIB_INSEGS); 2171 2172 if (!pskb_may_pull(skb, sizeof(struct tcphdr))) 2173 goto discard_it; 2174 2175 th = (const struct tcphdr *)skb->data; 2176 2177 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) { 2178 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL; 2179 goto bad_packet; 2180 } 2181 if (!pskb_may_pull(skb, th->doff * 4)) 2182 goto discard_it; 2183 2184 /* An explanation is required here, I think. 2185 * Packet length and doff are validated by header prediction, 2186 * provided case of th->doff==0 is eliminated. 2187 * So, we defer the checks. 
*/ 2188 2189 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo)) 2190 goto csum_error; 2191 2192 th = (const struct tcphdr *)skb->data; 2193 iph = ip_hdr(skb); 2194 lookup: 2195 sk = __inet_lookup_skb(skb, __tcp_hdrlen(th), th->source, 2196 th->dest, sdif, &refcounted); 2197 if (!sk) 2198 goto no_tcp_socket; 2199 2200 if (sk->sk_state == TCP_TIME_WAIT) 2201 goto do_time_wait; 2202 2203 if (sk->sk_state == TCP_NEW_SYN_RECV) { 2204 struct request_sock *req = inet_reqsk(sk); 2205 bool req_stolen = false; 2206 struct sock *nsk; 2207 2208 sk = req->rsk_listener; 2209 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) 2210 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2211 else 2212 drop_reason = tcp_inbound_hash(sk, req, skb, 2213 &iph->saddr, &iph->daddr, 2214 AF_INET, dif, sdif); 2215 if (unlikely(drop_reason)) { 2216 sk_drops_skbadd(sk, skb); 2217 reqsk_put(req); 2218 goto discard_it; 2219 } 2220 if (tcp_checksum_complete(skb)) { 2221 reqsk_put(req); 2222 goto csum_error; 2223 } 2224 if (unlikely(sk->sk_state != TCP_LISTEN)) { 2225 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb); 2226 if (!nsk) { 2227 inet_csk_reqsk_queue_drop_and_put(sk, req); 2228 goto lookup; 2229 } 2230 sk = nsk; 2231 /* reuseport_migrate_sock() has already held one sk_refcnt 2232 * before returning. 2233 */ 2234 } else { 2235 /* We own a reference on the listener, increase it again 2236 * as we might lose it too soon. 2237 */ 2238 sock_hold(sk); 2239 } 2240 refcounted = true; 2241 nsk = NULL; 2242 if (!tcp_filter(sk, skb, &drop_reason)) { 2243 th = (const struct tcphdr *)skb->data; 2244 iph = ip_hdr(skb); 2245 tcp_v4_fill_cb(skb, iph, th); 2246 nsk = tcp_check_req(sk, skb, req, false, &req_stolen, 2247 &drop_reason); 2248 } 2249 if (!nsk) { 2250 reqsk_put(req); 2251 if (req_stolen) { 2252 /* Another cpu got exclusive access to req 2253 * and created a full blown socket. 2254 * Try to feed this packet to this socket 2255 * instead of discarding it. 
2256 */ 2257 tcp_v4_restore_cb(skb); 2258 sock_put(sk); 2259 goto lookup; 2260 } 2261 goto discard_and_relse; 2262 } 2263 nf_reset_ct(skb); 2264 if (nsk == sk) { 2265 reqsk_put(req); 2266 tcp_v4_restore_cb(skb); 2267 } else { 2268 drop_reason = tcp_child_process(sk, nsk, skb); 2269 if (drop_reason) { 2270 enum sk_rst_reason rst_reason; 2271 2272 rst_reason = sk_rst_convert_drop_reason(drop_reason); 2273 tcp_v4_send_reset(nsk, skb, rst_reason); 2274 goto discard_and_relse; 2275 } 2276 sock_put(sk); 2277 return 0; 2278 } 2279 } 2280 2281 process: 2282 if (static_branch_unlikely(&ip4_min_ttl)) { 2283 /* min_ttl can be changed concurrently from do_ip_setsockopt() */ 2284 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) { 2285 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 2286 drop_reason = SKB_DROP_REASON_TCP_MINTTL; 2287 goto discard_and_relse; 2288 } 2289 } 2290 2291 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { 2292 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2293 goto discard_and_relse; 2294 } 2295 2296 drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr, 2297 AF_INET, dif, sdif); 2298 if (drop_reason) 2299 goto discard_and_relse; 2300 2301 nf_reset_ct(skb); 2302 2303 if (tcp_filter(sk, skb, &drop_reason)) 2304 goto discard_and_relse; 2305 2306 th = (const struct tcphdr *)skb->data; 2307 iph = ip_hdr(skb); 2308 tcp_v4_fill_cb(skb, iph, th); 2309 2310 skb->dev = NULL; 2311 2312 if (sk->sk_state == TCP_LISTEN) { 2313 ret = tcp_v4_do_rcv(sk, skb); 2314 goto put_and_return; 2315 } 2316 2317 sk_incoming_cpu_update(sk); 2318 2319 bh_lock_sock_nested(sk); 2320 tcp_segs_in(tcp_sk(sk), skb); 2321 ret = 0; 2322 if (!sock_owned_by_user(sk)) { 2323 ret = tcp_v4_do_rcv(sk, skb); 2324 } else { 2325 if (tcp_add_backlog(sk, skb, &drop_reason)) 2326 goto discard_and_relse; 2327 } 2328 bh_unlock_sock(sk); 2329 2330 put_and_return: 2331 if (refcounted) 2332 sock_put(sk); 2333 2334 return ret; 2335 2336 no_tcp_socket: 2337 drop_reason = 
SKB_DROP_REASON_NO_SOCKET; 2338 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) 2339 goto discard_it; 2340 2341 tcp_v4_fill_cb(skb, iph, th); 2342 2343 if (tcp_checksum_complete(skb)) { 2344 csum_error: 2345 drop_reason = SKB_DROP_REASON_TCP_CSUM; 2346 trace_tcp_bad_csum(skb); 2347 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); 2348 bad_packet: 2349 __TCP_INC_STATS(net, TCP_MIB_INERRS); 2350 } else { 2351 tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason)); 2352 } 2353 2354 discard_it: 2355 SKB_DR_OR(drop_reason, NOT_SPECIFIED); 2356 /* Discard frame. */ 2357 sk_skb_reason_drop(sk, skb, drop_reason); 2358 return 0; 2359 2360 discard_and_relse: 2361 sk_drops_skbadd(sk, skb); 2362 if (refcounted) 2363 sock_put(sk); 2364 goto discard_it; 2365 2366 do_time_wait: 2367 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { 2368 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2369 inet_twsk_put(inet_twsk(sk)); 2370 goto discard_it; 2371 } 2372 2373 tcp_v4_fill_cb(skb, iph, th); 2374 2375 if (tcp_checksum_complete(skb)) { 2376 inet_twsk_put(inet_twsk(sk)); 2377 goto csum_error; 2378 } 2379 2380 tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn, 2381 &drop_reason); 2382 switch (tw_status) { 2383 case TCP_TW_SYN: { 2384 struct sock *sk2 = inet_lookup_listener(net, skb, __tcp_hdrlen(th), 2385 iph->saddr, th->source, 2386 iph->daddr, th->dest, 2387 inet_iif(skb), 2388 sdif); 2389 if (sk2) { 2390 inet_twsk_deschedule_put(inet_twsk(sk)); 2391 sk = sk2; 2392 tcp_v4_restore_cb(skb); 2393 refcounted = false; 2394 __this_cpu_write(tcp_tw_isn, isn); 2395 goto process; 2396 } 2397 2398 drop_reason = psp_twsk_rx_policy_check(inet_twsk(sk), skb); 2399 if (drop_reason) 2400 break; 2401 } 2402 /* to ACK */ 2403 fallthrough; 2404 case TCP_TW_ACK: 2405 case TCP_TW_ACK_OOW: 2406 tcp_v4_timewait_ack(sk, skb, tw_status); 2407 break; 2408 case TCP_TW_RST: 2409 tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET); 2410 
inet_twsk_deschedule_put(inet_twsk(sk)); 2411 goto discard_it; 2412 case TCP_TW_SUCCESS:; 2413 } 2414 goto discard_it; 2415 } 2416 2417 static struct timewait_sock_ops tcp_timewait_sock_ops = { 2418 .twsk_obj_size = sizeof(struct tcp_timewait_sock), 2419 }; 2420 2421 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) 2422 { 2423 struct dst_entry *dst = skb_dst(skb); 2424 2425 if (dst && dst_hold_safe(dst)) { 2426 rcu_assign_pointer(sk->sk_rx_dst, dst); 2427 sk->sk_rx_dst_ifindex = skb->skb_iif; 2428 } 2429 } 2430 EXPORT_IPV6_MOD(inet_sk_rx_dst_set); 2431 2432 const struct inet_connection_sock_af_ops ipv4_specific = { 2433 .queue_xmit = ip_queue_xmit, 2434 .send_check = tcp_v4_send_check, 2435 .rebuild_header = inet_sk_rebuild_header, 2436 .sk_rx_dst_set = inet_sk_rx_dst_set, 2437 .conn_request = tcp_v4_conn_request, 2438 .syn_recv_sock = tcp_v4_syn_recv_sock, 2439 .net_header_len = sizeof(struct iphdr), 2440 .setsockopt = ip_setsockopt, 2441 .getsockopt = ip_getsockopt, 2442 .mtu_reduced = tcp_v4_mtu_reduced, 2443 }; 2444 EXPORT_IPV6_MOD(ipv4_specific); 2445 2446 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) 2447 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { 2448 #ifdef CONFIG_TCP_MD5SIG 2449 .md5_lookup = tcp_v4_md5_lookup, 2450 .calc_md5_hash = tcp_v4_md5_hash_skb, 2451 .md5_parse = tcp_v4_parse_md5_keys, 2452 #endif 2453 #ifdef CONFIG_TCP_AO 2454 .ao_lookup = tcp_v4_ao_lookup, 2455 .calc_ao_hash = tcp_v4_ao_hash_skb, 2456 .ao_parse = tcp_v4_parse_ao, 2457 .ao_calc_key_sk = tcp_v4_ao_calc_key_sk, 2458 #endif 2459 }; 2460 2461 static void tcp4_destruct_sock(struct sock *sk) 2462 { 2463 tcp_md5_destruct_sock(sk); 2464 tcp_ao_destroy_sock(sk, false); 2465 inet_sock_destruct(sk); 2466 } 2467 #endif 2468 2469 /* NOTE: A lot of things set to zero explicitly by call to 2470 * sk_alloc() so need not be done here. 
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	tcp_init_sock(sk);

	icsk->icsk_af_ops = &ipv4_specific;

#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
	sk->sk_destruct = tcp4_destruct_sock;
#endif

	return 0;
}

/* Drop the references held on page-pool fragments stored in
 * sk->sk_user_frags; compiled out without CONFIG_PAGE_POOL.
 */
static void tcp_release_user_frags(struct sock *sk)
{
#ifdef CONFIG_PAGE_POOL
	unsigned long index;
	void *netmem;

	xa_for_each(&sk->sk_user_frags, index, netmem)
		WARN_ON_ONCE(!napi_pp_put_page((__force netmem_ref)netmem));
#endif
}

/* Tear down all TCP-specific state of @sk: timers, congestion control,
 * ULP, queues, bind bucket and fastopen state.
 */
void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tcp_release_user_frags(sk);

	xa_destroy(&sk->sk_user_frags);

	trace_tcp_destroy_sock(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	tcp_cleanup_ulp(sk);

	/* Cleanup up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Check if we want to disable active TFO */
	tcp_fastopen_active_disable_ofo_check(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	skb_rbtree_purge(&tp->out_of_order_queue);

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));

	/* If socket is aborted during connect operation */
	tcp_free_fastopen_req(tp);
	tcp_fastopen_destroy_cipher(sk);
	tcp_saved_syn_free(tp);

	sk_sockets_allocated_dec(sk);
}
EXPORT_IPV6_MOD(tcp_v4_destroy_sock);

#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping.
 */

static unsigned short seq_file_family(const struct seq_file *seq);

/* Return true if @sk belongs to the family and network namespace this
 * seq_file is iterating over.
 */
static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
{
	unsigned short family = seq_file_family(seq);

	/* AF_UNSPEC is used as a match all */
	return ((family == AF_UNSPEC || family == sk->sk_family) &&
		net_eq(sock_net(sk), seq_file_net(seq)));
}

/* Find a non empty bucket (starting from st->bucket)
 * and return the first sk from it.
 * On success the matching bucket's lhash2 lock is left held; it is
 * released by listening_get_next() or tcp_seq_stop().
 */
static void *listening_get_first(struct seq_file *seq)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct tcp_iter_state *st = seq->private;

	st->offset = 0;
	for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
		struct inet_listen_hashbucket *ilb2;
		struct hlist_nulls_node *node;
		struct sock *sk;

		ilb2 = &hinfo->lhash2[st->bucket];
		if (hlist_nulls_empty(&ilb2->nulls_head))
			continue;

		spin_lock(&ilb2->lock);
		sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
			if (seq_sk_match(seq, sk))
				return sk;
		}
		spin_unlock(&ilb2->lock);
	}

	return NULL;
}

/* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
 * If "cur" is the last one in the st->bucket,
 * call listening_get_first() to return the first sk of the next
 * non empty bucket.
 */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_iter_state *st = seq->private;
	struct inet_listen_hashbucket *ilb2;
	struct hlist_nulls_node *node;
	struct inet_hashinfo *hinfo;
	struct sock *sk = cur;

	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);
	sk_nulls_for_each_from(sk, node) {
		if (seq_sk_match(seq, sk))
			return sk;
	}

	/* Bucket exhausted: release the lock taken by listening_get_first()
	 * and move on to the next bucket.
	 */
	hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	ilb2 = &hinfo->lhash2[st->bucket];
	spin_unlock(&ilb2->lock);
	++st->bucket;
	return listening_get_first(seq);
}

/* Return the *pos'th listening socket, starting from bucket 0.
 * *pos is decremented once per socket skipped.
 */
static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	st->offset = 0;
	rc = listening_get_first(seq);

	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
		--*pos;
	}
	return rc;
}

/* True if the current established-hash bucket has no entries. */
static inline bool empty_bucket(struct inet_hashinfo *hinfo,
				const struct tcp_iter_state *st)
{
	return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
}

/*
 * Get first established socket starting from bucket given in st->bucket.
 * If st->bucket is zero, the very first socket in the hash is returned.
 */
static void *established_get_first(struct seq_file *seq)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct tcp_iter_state *st = seq->private;

	st->offset = 0;
	for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
		struct sock *sk;
		struct hlist_nulls_node *node;
		spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);

		cond_resched();

		/* Lockless fast path for the common case of empty buckets */
		if (empty_bucket(hinfo, st))
			continue;

		/* On success the ehash bucket lock is left held; it is
		 * released by established_get_next() or tcp_seq_stop().
		 */
		spin_lock_bh(lock);
		sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
			if (seq_sk_match(seq, sk))
				return sk;
		}
		spin_unlock_bh(lock);
	}

	return NULL;
}

/* Return the established socket after "cur" in st->bucket, or (after
 * dropping the current bucket lock) the first socket of the next
 * non-empty bucket.
 */
static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct tcp_iter_state *st = seq->private;
	struct hlist_nulls_node *node;
	struct sock *sk = cur;

	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);

	sk_nulls_for_each_from(sk, node) {
		if (seq_sk_match(seq, sk))
			return sk;
	}

	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
	++st->bucket;
	return established_get_first(seq);
}

/* Return the pos'th established socket, starting from bucket 0. */
static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}

/* Return the pos'th socket overall: listening sockets first, then
 * established ones.
 */
static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state *st = seq->private;

	st->state = TCP_SEQ_STATE_LISTENING;
	rc = listening_get_idx(seq, &pos);

	if (!rc) {
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc = established_get_idx(seq, pos);
	}

	return rc;
}

/* Re-find the socket the previous read stopped at, using the bucket and
 * in-bucket offset remembered in st. st->num is preserved across the
 * re-scan.
 */
static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct tcp_iter_state *st = seq->private;
	int bucket = st->bucket;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket > hinfo->lhash2_mask)
			break;
		rc = listening_get_first(seq);
		while (offset-- && rc && bucket == st->bucket)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		fallthrough;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (st->bucket > hinfo->ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc && bucket == st->bucket)
			rc = established_get_next(seq, rc);
	}

	st->num = orig_num;

	return rc;
}

void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	/* Fast path: resuming exactly where the last read stopped. */
	if (*pos && *pos == st->last_pos) {
		rc = tcp_seek_last_pos(seq);
		if (rc)
			goto out;
	}

	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	st->bucket = 0;
	st->offset = 0;
	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;

out:
	st->last_pos = *pos;
	return rc;
}
EXPORT_IPV6_MOD(tcp_seq_start);

void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			/* Listening hash exhausted: fall over to the
			 * established hash.
			 */
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			rc = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	st->last_pos = *pos;
	return rc;
}
EXPORT_IPV6_MOD(tcp_seq_next);

/* Release whichever bucket lock the *_get_first()/*_get_next() walk
 * left held when iteration stops mid-bucket.
 */
void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			spin_unlock(&hinfo->lhash2[st->bucket].lock);
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
		break;
	}
}
EXPORT_IPV6_MOD(tcp_seq_stop);

/* Emit one /proc/net/tcp line for a request (SYN_RECV) socket. */
static void get_openreq4(const struct request_sock *req,
			 struct seq_file *f, int i)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	long delta = req->rsk_timer.expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
		i,
		ireq->ir_loc_addr,
		ireq->ir_num,
		ireq->ir_rmt_addr,
		ntohs(ireq->ir_rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af
			 dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_delta_to_clock_t(delta),
		req->num_timeout,
		from_kuid_munged(seq_user_ns(f),
				 sk_uid(req->rsk_listener)),
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		0,
		req);
}

/* Emit one /proc/net/tcp line for a full (established or listening)
 * socket. All fields are read locklessly; transient inconsistencies
 * are tolerated.
 */
static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
{
	int timer_active;
	unsigned long timer_expires;
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct inet_sock *inet = inet_sk(sk);
	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	u8 icsk_pending;
	int rx_queue;
	int state;

	/* Map the pending icsk timer to the procfs "timer_active" code:
	 * 1 = retransmit/loss-probe, 4 = zero-window probe, 2 = keepalive.
	 */
	icsk_pending = smp_load_acquire(&icsk->icsk_pending);
	if (icsk_pending == ICSK_TIME_RETRANS ||
	    icsk_pending == ICSK_TIME_REO_TIMEOUT ||
	    icsk_pending == ICSK_TIME_LOSS_PROBE) {
		timer_active = 1;
		timer_expires = tcp_timeout_expires(sk);
	} else if (icsk_pending == ICSK_TIME_PROBE0) {
		timer_active = 4;
		timer_expires = tcp_timeout_expires(sk);
	} else if (timer_pending(&icsk->icsk_keepalive_timer)) {
		timer_active = 2;
		timer_expires = icsk->icsk_keepalive_timer.expires;
	} else {
		timer_active = 0;
		timer_expires = jiffies;
	}

	state = inet_sk_state_load(sk);
	if (state == TCP_LISTEN)
		rx_queue = READ_ONCE(sk->sk_ack_backlog);
	else
		/* Because we don't lock the socket,
		 * we might find a transient negative value.
		 */
		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
				      READ_ONCE(tp->copied_seq), 0);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
		i, src, srcp, dest, destp, state,
		READ_ONCE(tp->write_seq) - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_delta_to_clock_t(timer_expires - jiffies),
		READ_ONCE(icsk->icsk_retransmits),
		from_kuid_munged(seq_user_ns(f), sk_uid(sk)),
		READ_ONCE(icsk->icsk_probes_out),
		sock_i_ino(sk),
		refcount_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
		tcp_snd_cwnd(tp),
		state == TCP_LISTEN ?
		    fastopenq->max_qlen :
		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}

/* Emit one /proc/net/tcp line for a TIME_WAIT socket. */
static void get_timewait4_sock(const struct inet_timewait_sock *tw,
			       struct seq_file *f, int i)
{
	long delta = tw->tw_timer.expires - jiffies;
	__be32 dest, src;
	__u16 destp, srcp;

	dest = tw->tw_daddr;
	src = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
		i, src, srcp, dest, destp, READ_ONCE(tw->tw_substate), 0, 0,
		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
		refcount_read(&tw->tw_refcnt), tw);
}

#define TMPSZ 150

/* seq_file ->show(): dispatch on socket kind (twsk / reqsk / full sk)
 * and pad each record to a fixed width.
 */
static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	struct sock *sk = v;

	seq_setwidth(seq, TMPSZ - 1);
	if (v == SEQ_START_TOKEN) {
		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	if (sk->sk_state == TCP_TIME_WAIT)
		get_timewait4_sock(v, seq, st->num);
	else if (sk->sk_state == TCP_NEW_SYN_RECV)
		get_openreq4(v, seq, st->num);
	else
		get_tcp4_sock(v, seq, st->num);
out:
	seq_pad(seq, '\n');
	return 0;
}

#ifdef CONFIG_BPF_SYSCALL
/* One slot in the iterator batch: holds a socket reference while the
 * batch is live, or the socket's cookie once references are dropped.
 */
union bpf_tcp_iter_batch_item {
	struct sock *sk;
	__u64 cookie;
};

struct bpf_tcp_iter_state {
	struct tcp_iter_state state;
	unsigned int cur_sk;
	unsigned int end_sk;
	unsigned int max_sk;
	union bpf_tcp_iter_batch_item *batch;
};

struct bpf_iter__tcp {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct sock_common *, sk_common);
	uid_t uid __aligned(8);
};

/* Build the bpf_iter__tcp context and run the attached BPF program. */
static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
			     struct sock_common *sk_common, uid_t uid)
{
	struct bpf_iter__tcp ctx;

	meta->seq_num--;  /* skip SEQ_START_TOKEN */
	ctx.meta = meta;
	ctx.sk_common = sk_common;
	ctx.uid = uid;
	return bpf_iter_run_prog(prog, &ctx);
}

/* Release the not-yet-shown sockets in the batch, replacing each held
 * reference by the socket's cookie.
 */
static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
{
	union bpf_tcp_iter_batch_item *item;
	unsigned int cur_sk = iter->cur_sk;
	__u64 cookie;

	/* Remember the cookies of the sockets we haven't seen yet, so we can
	 * pick up where we left off next time around.
	 */
	while (cur_sk < iter->end_sk) {
		item = &iter->batch[cur_sk++];
		cookie = sock_gen_cookie(item->sk);
		sock_gen_put(item->sk);
		item->cookie = cookie;
	}
}

/* Grow the batch array to @new_batch_sz entries, preserving the first
 * iter->end_sk entries. Returns 0 or -ENOMEM.
 */
static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
				      unsigned int new_batch_sz, gfp_t flags)
{
	union bpf_tcp_iter_batch_item *new_batch;

	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
			     flags | __GFP_NOWARN);
	if (!new_batch)
		return -ENOMEM;

	memcpy(new_batch, iter->batch, sizeof(*iter->batch) * iter->end_sk);
	kvfree(iter->batch);
	iter->batch = new_batch;
	iter->max_sk = new_batch_sz;

	return 0;
}

/* Starting from @first_sk, find the first socket in the bucket whose
 * cookie matches one of @cookies; NULL if none survived.
 */
static struct sock *bpf_iter_tcp_resume_bucket(struct sock *first_sk,
					       union bpf_tcp_iter_batch_item *cookies,
					       int n_cookies)
{
	struct hlist_nulls_node *node;
	struct sock *sk;
	int i;

	for (i = 0; i < n_cookies; i++) {
		sk = first_sk;
		sk_nulls_for_each_from(sk, node)
			if (cookies[i].cookie == atomic64_read(&sk->sk_cookie))
				return sk;
	}

	return NULL;
}

/* Resume iteration over the listening hash after a stop(): re-find the
 * saved position via cookies, or advance to the next bucket when the
 * previous batch was fully consumed or no saved socket survived.
 */
static struct sock *bpf_iter_tcp_resume_listening(struct seq_file *seq)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	unsigned int find_cookie = iter->cur_sk;
	unsigned int end_cookie = iter->end_sk;
	int resume_bucket = st->bucket;
	struct sock *sk;

	if (end_cookie && find_cookie == end_cookie)
		++st->bucket;

	sk = listening_get_first(seq);
	iter->cur_sk = 0;
	iter->end_sk = 0;

	if (sk && st->bucket == resume_bucket && end_cookie) {
		sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie],
						end_cookie - find_cookie);
		if (!sk) {
			spin_unlock(&hinfo->lhash2[st->bucket].lock);
			++st->bucket;
			sk = listening_get_first(seq);
		}
	}

	return sk;
}

/* Same as bpf_iter_tcp_resume_listening(), but for the established
 * hash (ehash locks are BH-disabling).
 */
static struct sock *bpf_iter_tcp_resume_established(struct seq_file *seq)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	unsigned int find_cookie = iter->cur_sk;
	unsigned int end_cookie = iter->end_sk;
	int resume_bucket = st->bucket;
	struct sock *sk;

	if (end_cookie && find_cookie == end_cookie)
		++st->bucket;

	sk = established_get_first(seq);
	iter->cur_sk = 0;
	iter->end_sk = 0;

	if (sk && st->bucket == resume_bucket && end_cookie) {
		sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie],
						end_cookie - find_cookie);
		if (!sk) {
			spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
			++st->bucket;
			sk = established_get_first(seq);
		}
	}

	return sk;
}

/* Resume in the current phase, falling over from the listening hash to
 * the established hash when the former is exhausted.
 */
static struct sock *bpf_iter_tcp_resume(struct seq_file *seq)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	struct sock *sk = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		sk = bpf_iter_tcp_resume_listening(seq);
		if (sk)
			break;
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		fallthrough;
	case TCP_SEQ_STATE_ESTABLISHED:
		sk = bpf_iter_tcp_resume_established(seq);
		break;
	}

	return sk;
}

/* Fill the batch with held references to all matching sockets in the
 * current listening bucket, starting at *start_sk. Returns the number
 * of matching sockets in the bucket, which may exceed the batch
 * capacity; *start_sk is set to the first socket that did not fit
 * (or NULL if all fit). Called with the bucket lock held.
 */
static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
						 struct sock **start_sk)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct hlist_nulls_node *node;
	unsigned int expected = 1;
	struct sock *sk;

	sock_hold(*start_sk);
	iter->batch[iter->end_sk++].sk = *start_sk;

	sk = sk_nulls_next(*start_sk);
	*start_sk = NULL;
	sk_nulls_for_each_from(sk, node) {
		if (seq_sk_match(seq, sk)) {
			if (iter->end_sk < iter->max_sk) {
				sock_hold(sk);
				iter->batch[iter->end_sk++].sk = sk;
			} else if (!*start_sk) {
				/* Remember where we left off. */
				*start_sk = sk;
			}
			expected++;
		}
	}

	return expected;
}

/* Established-hash counterpart of bpf_iter_tcp_listening_batch();
 * identical batching contract.
 */
static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
						   struct sock **start_sk)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct hlist_nulls_node *node;
	unsigned int expected = 1;
	struct sock *sk;

	sock_hold(*start_sk);
	iter->batch[iter->end_sk++].sk = *start_sk;

	sk = sk_nulls_next(*start_sk);
	*start_sk = NULL;
	sk_nulls_for_each_from(sk, node) {
		if (seq_sk_match(seq, sk)) {
			if (iter->end_sk < iter->max_sk) {
				sock_hold(sk);
				iter->batch[iter->end_sk++].sk = sk;
			} else if (!*start_sk) {
				/* Remember where we left off. */
				*start_sk = sk;
			}
			expected++;
		}
	}

	return expected;
}

/* Dispatch batching to the hash matching the current iteration phase. */
static unsigned int bpf_iter_fill_batch(struct seq_file *seq,
					struct sock **start_sk)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;

	if (st->state == TCP_SEQ_STATE_LISTENING)
		return bpf_iter_tcp_listening_batch(seq, start_sk);
	else
		return bpf_iter_tcp_established_batch(seq, start_sk);
}

/* Release the bucket lock left held by the *_get_first() walk. */
static void bpf_iter_tcp_unlock_bucket(struct seq_file *seq)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;

	if (st->state == TCP_SEQ_STATE_LISTENING)
		spin_unlock(&hinfo->lhash2[st->bucket].lock);
	else
		spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
}

/* Batch the next bucket's sockets, growing the batch array (at most
 * twice: once dropping the lock with GFP_USER, once holding it with
 * GFP_NOWAIT) until a whole bucket fits. Returns the first socket of
 * the batch, NULL when iteration is done, or an ERR_PTR.
 */
static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	unsigned int expected;
	struct sock *sk;
	int err;

	sk = bpf_iter_tcp_resume(seq);
	if (!sk)
		return NULL; /* Done */

	expected = bpf_iter_fill_batch(seq, &sk);
	if (likely(iter->end_sk == expected))
		goto done;

	/* Batch size was too small. */
	bpf_iter_tcp_unlock_bucket(seq);
	bpf_iter_tcp_put_batch(iter);
	err = bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2,
					 GFP_USER);
	if (err)
		return ERR_PTR(err);

	sk = bpf_iter_tcp_resume(seq);
	if (!sk)
		return NULL; /* Done */

	expected = bpf_iter_fill_batch(seq, &sk);
	if (likely(iter->end_sk == expected))
		goto done;

	/* Batch size was still too small. Hold onto the lock while we try
	 * again with a larger batch to make sure the current bucket's size
	 * does not change in the meantime.
	 */
	err = bpf_iter_tcp_realloc_batch(iter, expected, GFP_NOWAIT);
	if (err) {
		bpf_iter_tcp_unlock_bucket(seq);
		return ERR_PTR(err);
	}

	expected = bpf_iter_fill_batch(seq, &sk);
	WARN_ON_ONCE(iter->end_sk != expected);
done:
	bpf_iter_tcp_unlock_bucket(seq);
	return iter->batch[0].sk;
}

static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	/* bpf iter does not support lseek, so it always
	 * continue from where it was stop()-ped.
	 */
	if (*pos)
		return bpf_iter_tcp_batch(seq);

	return SEQ_START_TOKEN;
}

static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	struct sock *sk;

	/* Whenever seq_next() is called, the iter->cur_sk is
	 * done with seq_show(), so advance to the next sk in
	 * the batch.
	 */
	if (iter->cur_sk < iter->end_sk) {
		/* Keeping st->num consistent in tcp_iter_state.
		 * bpf_iter_tcp does not use st->num.
		 * meta.seq_num is used instead.
		 */
		st->num++;
		sock_gen_put(iter->batch[iter->cur_sk++].sk);
	}

	if (iter->cur_sk < iter->end_sk)
		sk = iter->batch[iter->cur_sk].sk;
	else
		sk = bpf_iter_tcp_batch(seq);

	++*pos;
	/* Keeping st->last_pos consistent in tcp_iter_state.
	 * bpf iter does not do lseek, so st->last_pos always equals to *pos.
	 */
	st->last_pos = *pos;
	return sk;
}

/* seq_file ->show() for the BPF iterator: lock full sockets, skip
 * sockets that got unhashed under us, derive the uid per socket kind
 * and invoke the BPF program.
 */
static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
{
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;
	struct sock *sk = v;
	uid_t uid;
	int ret;

	if (v == SEQ_START_TOKEN)
		return 0;

	if (sk_fullsock(sk))
		lock_sock(sk);

	if (unlikely(sk_unhashed(sk))) {
		ret = SEQ_SKIP;
		goto unlock;
	}

	if (sk->sk_state == TCP_TIME_WAIT) {
		uid = 0;
	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
		const struct request_sock *req = v;

		uid = from_kuid_munged(seq_user_ns(seq),
				       sk_uid(req->rsk_listener));
	} else {
		uid = from_kuid_munged(seq_user_ns(seq), sk_uid(sk));
	}

	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, false);
	ret = tcp_prog_seq_show(prog, &meta, v, uid);

unlock:
	if (sk_fullsock(sk))
		release_sock(sk);
	return ret;

}

/* seq_file ->stop(): on clean end (!v) give the program a final
 * end-of-iteration callback, then convert leftover batch references
 * into cookies for the next resume.
 */
static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;

	if (!v) {
		meta.seq = seq;
		prog = bpf_iter_get_info(&meta, true);
		if (prog)
			(void)tcp_prog_seq_show(prog, &meta, v, 0);
	}

	if (iter->cur_sk < iter->end_sk)
		bpf_iter_tcp_put_batch(iter);
}

static const struct seq_operations bpf_iter_tcp_seq_ops = {
	.show		= bpf_iter_tcp_seq_show,
	.start		= bpf_iter_tcp_seq_start,
	.next		= bpf_iter_tcp_seq_next,
	.stop		= bpf_iter_tcp_seq_stop,
};
#endif
/* Address family this seq_file iterates: AF_UNSPEC for the BPF
 * iterator (the program filters itself), otherwise the family stored
 * in the procfs entry's afinfo.
 */
static unsigned short seq_file_family(const struct seq_file *seq)
{
	const struct tcp_seq_afinfo *afinfo;

#ifdef CONFIG_BPF_SYSCALL
	/* Iterated from bpf_iter. Let the bpf prog to filter instead. */
	if (seq->op == &bpf_iter_tcp_seq_ops)
		return AF_UNSPEC;
#endif

	/* Iterated from proc fs */
	afinfo = pde_data(file_inode(seq->file));
	return afinfo->family;
}

static const struct seq_operations tcp4_seq_ops = {
	.show		= tcp4_seq_show,
	.start		= tcp_seq_start,
	.next		= tcp_seq_next,
	.stop		= tcp_seq_stop,
};

static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.family		= AF_INET,
};

/* Create /proc/net/tcp for this network namespace. */
static int __net_init tcp4_proc_init_net(struct net *net)
{
	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
		return -ENOMEM;
	return 0;
}

static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	remove_proc_entry("tcp", net->proc_net);
}

static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};

int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}

void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
#endif /* CONFIG_PROC_FS */

struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.pre_connect		= tcp_v4_pre_connect,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
3434 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt, 3435 .keepalive = tcp_set_keepalive, 3436 .recvmsg = tcp_recvmsg, 3437 .sendmsg = tcp_sendmsg, 3438 .splice_eof = tcp_splice_eof, 3439 .backlog_rcv = tcp_v4_do_rcv, 3440 .release_cb = tcp_release_cb, 3441 .hash = inet_hash, 3442 .unhash = inet_unhash, 3443 .get_port = inet_csk_get_port, 3444 .put_port = inet_put_port, 3445 #ifdef CONFIG_BPF_SYSCALL 3446 .psock_update_sk_prot = tcp_bpf_update_proto, 3447 #endif 3448 .enter_memory_pressure = tcp_enter_memory_pressure, 3449 .leave_memory_pressure = tcp_leave_memory_pressure, 3450 .stream_memory_free = tcp_stream_memory_free, 3451 .sockets_allocated = &tcp_sockets_allocated, 3452 3453 .memory_allocated = &net_aligned_data.tcp_memory_allocated, 3454 .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, 3455 3456 .memory_pressure = &tcp_memory_pressure, 3457 .sysctl_mem = sysctl_tcp_mem, 3458 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), 3459 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), 3460 .max_header = MAX_TCP_HEADER, 3461 .obj_size = sizeof(struct tcp_sock), 3462 .freeptr_offset = offsetof(struct tcp_sock, 3463 inet_conn.icsk_inet.sk.sk_freeptr), 3464 .slab_flags = SLAB_TYPESAFE_BY_RCU, 3465 .twsk_prot = &tcp_timewait_sock_ops, 3466 .rsk_prot = &tcp_request_sock_ops, 3467 .h.hashinfo = NULL, 3468 .no_autobind = true, 3469 .diag_destroy = tcp_abort, 3470 }; 3471 EXPORT_SYMBOL(tcp_prot); 3472 3473 static void __net_exit tcp_sk_exit(struct net *net) 3474 { 3475 if (net->ipv4.tcp_congestion_control) 3476 bpf_module_put(net->ipv4.tcp_congestion_control, 3477 net->ipv4.tcp_congestion_control->owner); 3478 } 3479 3480 static void __net_init tcp_set_hashinfo(struct net *net) 3481 { 3482 struct inet_hashinfo *hinfo; 3483 unsigned int ehash_entries; 3484 struct net *old_net; 3485 3486 if (net_eq(net, &init_net)) 3487 goto fallback; 3488 3489 old_net = current->nsproxy->net_ns; 3490 ehash_entries = 
READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries); 3491 if (!ehash_entries) 3492 goto fallback; 3493 3494 ehash_entries = roundup_pow_of_two(ehash_entries); 3495 hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries); 3496 if (!hinfo) { 3497 pr_warn("Failed to allocate TCP ehash (entries: %u) " 3498 "for a netns, fallback to the global one\n", 3499 ehash_entries); 3500 fallback: 3501 hinfo = &tcp_hashinfo; 3502 ehash_entries = tcp_hashinfo.ehash_mask + 1; 3503 } 3504 3505 net->ipv4.tcp_death_row.hashinfo = hinfo; 3506 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2; 3507 net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128); 3508 } 3509 3510 static int __net_init tcp_sk_init(struct net *net) 3511 { 3512 net->ipv4.sysctl_tcp_ecn = TCP_ECN_IN_ECN_OUT_NOECN; 3513 net->ipv4.sysctl_tcp_ecn_option = TCP_ACCECN_OPTION_FULL; 3514 net->ipv4.sysctl_tcp_ecn_option_beacon = TCP_ACCECN_OPTION_BEACON; 3515 net->ipv4.sysctl_tcp_ecn_fallback = 1; 3516 3517 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; 3518 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS; 3519 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; 3520 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL; 3521 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS; 3522 3523 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME; 3524 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES; 3525 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL; 3526 3527 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES; 3528 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES; 3529 net->ipv4.sysctl_tcp_syncookies = 1; 3530 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH; 3531 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1; 3532 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2; 3533 net->ipv4.sysctl_tcp_orphan_retries = 0; 3534 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; 3535 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; 3536 
net->ipv4.sysctl_tcp_tw_reuse = 2; 3537 net->ipv4.sysctl_tcp_tw_reuse_delay = 1 * MSEC_PER_SEC; 3538 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1; 3539 3540 refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1); 3541 tcp_set_hashinfo(net); 3542 3543 net->ipv4.sysctl_tcp_sack = 1; 3544 net->ipv4.sysctl_tcp_window_scaling = 1; 3545 net->ipv4.sysctl_tcp_timestamps = 1; 3546 net->ipv4.sysctl_tcp_early_retrans = 3; 3547 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION; 3548 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */ 3549 net->ipv4.sysctl_tcp_retrans_collapse = 1; 3550 net->ipv4.sysctl_tcp_max_reordering = 300; 3551 net->ipv4.sysctl_tcp_dsack = 1; 3552 net->ipv4.sysctl_tcp_app_win = 31; 3553 net->ipv4.sysctl_tcp_adv_win_scale = 1; 3554 net->ipv4.sysctl_tcp_frto = 2; 3555 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1; 3556 net->ipv4.sysctl_tcp_rcvbuf_low_rtt = USEC_PER_MSEC; 3557 /* This limits the percentage of the congestion window which we 3558 * will allow a single TSO frame to consume. Building TSO frames 3559 * which are too large can cause TCP streams to be bursty. 3560 */ 3561 net->ipv4.sysctl_tcp_tso_win_divisor = 3; 3562 /* Default TSQ limit of 4 MB */ 3563 net->ipv4.sysctl_tcp_limit_output_bytes = 4 << 20; 3564 3565 /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. 
*/ 3566 net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX; 3567 3568 net->ipv4.sysctl_tcp_min_tso_segs = 2; 3569 net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */ 3570 net->ipv4.sysctl_tcp_min_rtt_wlen = 300; 3571 net->ipv4.sysctl_tcp_autocorking = 1; 3572 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; 3573 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; 3574 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120; 3575 if (net != &init_net) { 3576 memcpy(net->ipv4.sysctl_tcp_rmem, 3577 init_net.ipv4.sysctl_tcp_rmem, 3578 sizeof(init_net.ipv4.sysctl_tcp_rmem)); 3579 memcpy(net->ipv4.sysctl_tcp_wmem, 3580 init_net.ipv4.sysctl_tcp_wmem, 3581 sizeof(init_net.ipv4.sysctl_tcp_wmem)); 3582 } 3583 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC; 3584 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 10 * NSEC_PER_USEC; 3585 net->ipv4.sysctl_tcp_comp_sack_nr = 44; 3586 net->ipv4.sysctl_tcp_comp_sack_rtt_percent = 33; 3587 net->ipv4.sysctl_tcp_backlog_ack_defer = 1; 3588 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; 3589 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0; 3590 atomic_set(&net->ipv4.tfo_active_disable_times, 0); 3591 3592 /* Set default values for PLB */ 3593 net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */ 3594 net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3; 3595 net->ipv4.sysctl_tcp_plb_rehash_rounds = 12; 3596 net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60; 3597 /* Default congestion threshold for PLB to mark a round is 50% */ 3598 net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2; 3599 3600 /* Reno is always built in */ 3601 if (!net_eq(net, &init_net) && 3602 bpf_try_module_get(init_net.ipv4.tcp_congestion_control, 3603 init_net.ipv4.tcp_congestion_control->owner)) 3604 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control; 3605 else 3606 net->ipv4.tcp_congestion_control = &tcp_reno; 3607 3608 net->ipv4.sysctl_tcp_syn_linear_timeouts = 4; 3609 net->ipv4.sysctl_tcp_shrink_window = 0; 3610 3611 
net->ipv4.sysctl_tcp_pingpong_thresh = 1; 3612 net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN); 3613 net->ipv4.sysctl_tcp_rto_max_ms = TCP_RTO_MAX_SEC * MSEC_PER_SEC; 3614 3615 return 0; 3616 } 3617 3618 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) 3619 { 3620 struct net *net; 3621 3622 /* make sure concurrent calls to tcp_sk_exit_batch from net_cleanup_work 3623 * and failed setup_net error unwinding path are serialized. 3624 * 3625 * tcp_twsk_purge() handles twsk in any dead netns, not just those in 3626 * net_exit_list, the thread that dismantles a particular twsk must 3627 * do so without other thread progressing to refcount_dec_and_test() of 3628 * tcp_death_row.tw_refcount. 3629 */ 3630 mutex_lock(&tcp_exit_batch_mutex); 3631 3632 tcp_twsk_purge(net_exit_list); 3633 3634 list_for_each_entry(net, net_exit_list, exit_list) { 3635 inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo); 3636 WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount)); 3637 tcp_fastopen_ctx_destroy(net); 3638 } 3639 3640 mutex_unlock(&tcp_exit_batch_mutex); 3641 } 3642 3643 static struct pernet_operations __net_initdata tcp_sk_ops = { 3644 .init = tcp_sk_init, 3645 .exit = tcp_sk_exit, 3646 .exit_batch = tcp_sk_exit_batch, 3647 }; 3648 3649 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3650 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta, 3651 struct sock_common *sk_common, uid_t uid) 3652 3653 #define INIT_BATCH_SZ 16 3654 3655 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux) 3656 { 3657 struct bpf_tcp_iter_state *iter = priv_data; 3658 int err; 3659 3660 err = bpf_iter_init_seq_net(priv_data, aux); 3661 if (err) 3662 return err; 3663 3664 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ, GFP_USER); 3665 if (err) { 3666 bpf_iter_fini_seq_net(priv_data); 3667 return err; 3668 } 3669 3670 return 0; 3671 } 3672 3673 static void bpf_iter_fini_tcp(void *priv_data) 3674 { 
3675 struct bpf_tcp_iter_state *iter = priv_data; 3676 3677 bpf_iter_fini_seq_net(priv_data); 3678 kvfree(iter->batch); 3679 } 3680 3681 static const struct bpf_iter_seq_info tcp_seq_info = { 3682 .seq_ops = &bpf_iter_tcp_seq_ops, 3683 .init_seq_private = bpf_iter_init_tcp, 3684 .fini_seq_private = bpf_iter_fini_tcp, 3685 .seq_priv_size = sizeof(struct bpf_tcp_iter_state), 3686 }; 3687 3688 static const struct bpf_func_proto * 3689 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id, 3690 const struct bpf_prog *prog) 3691 { 3692 switch (func_id) { 3693 case BPF_FUNC_setsockopt: 3694 return &bpf_sk_setsockopt_proto; 3695 case BPF_FUNC_getsockopt: 3696 return &bpf_sk_getsockopt_proto; 3697 default: 3698 return NULL; 3699 } 3700 } 3701 3702 static struct bpf_iter_reg tcp_reg_info = { 3703 .target = "tcp", 3704 .ctx_arg_info_size = 1, 3705 .ctx_arg_info = { 3706 { offsetof(struct bpf_iter__tcp, sk_common), 3707 PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED }, 3708 }, 3709 .get_func_proto = bpf_iter_tcp_get_func_proto, 3710 .seq_info = &tcp_seq_info, 3711 }; 3712 3713 static void __init bpf_iter_register(void) 3714 { 3715 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON]; 3716 if (bpf_iter_reg_target(&tcp_reg_info)) 3717 pr_warn("Warning: could not register bpf iterator tcp\n"); 3718 } 3719 3720 #endif 3721 3722 void __init tcp_v4_init(void) 3723 { 3724 int cpu, res; 3725 3726 for_each_possible_cpu(cpu) { 3727 struct sock *sk; 3728 3729 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW, 3730 IPPROTO_TCP, &init_net); 3731 if (res) 3732 panic("Failed to create the TCP control socket.\n"); 3733 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); 3734 3735 /* Please enforce IP_DF and IPID==0 for RST and 3736 * ACK sent in SYN-RECV and TIME-WAIT state. 
3737 */ 3738 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO; 3739 3740 sk->sk_clockid = CLOCK_MONOTONIC; 3741 3742 per_cpu(ipv4_tcp_sk.sock, cpu) = sk; 3743 } 3744 if (register_pernet_subsys(&tcp_sk_ops)) 3745 panic("Failed to create the TCP control socket.\n"); 3746 3747 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3748 bpf_iter_register(); 3749 #endif 3750 } 3751