1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * INET An implementation of the TCP/IP protocol suite for the LINUX 4 * operating system. INET is implemented using the BSD Socket 5 * interface as the means of communication with the user level. 6 * 7 * Implementation of the Transmission Control Protocol(TCP). 8 * 9 * IPv4 specific functions 10 * 11 * code split from: 12 * linux/ipv4/tcp.c 13 * linux/ipv4/tcp_input.c 14 * linux/ipv4/tcp_output.c 15 * 16 * See tcp.c for author information 17 */ 18 19 /* 20 * Changes: 21 * David S. Miller : New socket lookup architecture. 22 * This code is dedicated to John Dyson. 23 * David S. Miller : Change semantics of established hash, 24 * half is devoted to TIME_WAIT sockets 25 * and the rest go in the other half. 26 * Andi Kleen : Add support for syncookies and fixed 27 * some bugs: ip options weren't passed to 28 * the TCP layer, missed a check for an 29 * ACK bit. 30 * Andi Kleen : Implemented fast path mtu discovery. 31 * Fixed many serious bugs in the 32 * request_sock handling and moved 33 * most of it into the af independent code. 34 * Added tail drop and some other bugfixes. 35 * Added new listen semantics. 36 * Mike McLagan : Routing by source 37 * Juan Jose Ciarlante: ip_dynaddr bits 38 * Andi Kleen: various fixes. 39 * Vitaly E. Lavrov : Transparent proxy revived after year 40 * coma. 41 * Andi Kleen : Fix new listen. 42 * Andi Kleen : Fix accept error reporting. 43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which 44 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind 45 * a single port at the same time. 
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/fips.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sock_diag.h>

#include <net/aligned_data.h>
#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/tcp_ecn.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/inet_ecn.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>
#include <net/rstreason.h>
#include <net/psp.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/inetdevice.h>
#include <linux/btf_ids.h>
#include <linux/skbuff_ref.h>

#include <crypto/md5.h>

#include <trace/events/tcp.h>

#ifdef CONFIG_TCP_MD5SIG
static void tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
				__be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;

/* Per-CPU control socket, used below by tcp_v4_send_reset() and
 * tcp_v4_send_ack() to emit replies outside full socket context;
 * bh_lock serializes the nested-BH users of the per-CPU entry.
 */
static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = {
	.bh_lock = INIT_LOCAL_LOCK(bh_lock),
};

static DEFINE_MUTEX(tcp_exit_batch_mutex);

/* Initial sequence number for a connection, keyed on the
 * {daddr, saddr, dest, source} 4-tuple of the incoming segment.
 */
static u32 tcp_v4_init_seq(const struct sk_buff *skb)
{
	return secure_tcp_seq(ip_hdr(skb)->daddr,
			      ip_hdr(skb)->saddr,
			      tcp_hdr(skb)->dest,
			      tcp_hdr(skb)->source);
}

/* Per-connection timestamp offset, keyed on the address pair. */
static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
{
	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
}

/* Decide whether the TIME-WAIT socket @sktw may be reused by @sk for a
 * new connection on the same 4-tuple. Returns 1 when reuse is safe
 * (a reference on @sktw has then been taken and @sk's write_seq /
 * ts_recent state seeded from the timewait state), 0 otherwise.
 */
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
	const struct inet_timewait_sock *tw = inet_twsk(sktw);
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);
	int ts_recent_stamp;
	u32 reuse_thresh;

	/* Sockets still in FIN_WAIT2 substate are never reused. */
	if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2)
		reuse = 0;

	if (reuse == 2) {
		/* Still does not detect *everything* that goes through
		 * lo, since we require a loopback src or dst address
		 * or direct binding to 'lo' interface.
		 */
		bool loopback = false;
		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
			loopback = true;
#if IS_ENABLED(CONFIG_IPV6)
		if (tw->tw_family == AF_INET6) {
			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
				loopback = true;
		} else
#endif
		{
			if (ipv4_is_loopback(tw->tw_daddr) ||
			    ipv4_is_loopback(tw->tw_rcv_saddr))
				loopback = true;
		}
		if (!loopback)
			reuse = 0;
	}

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp);
	reuse_thresh = READ_ONCE(tw->tw_entry_stamp) +
		       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse_delay);
	if (ts_recent_stamp &&
	    (!twp || (reuse && time_after32(tcp_clock_ms(), reuse_thresh)))) {
		/* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk
		 * and releasing the bucket lock.
		 */
		if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
			return 0;

		/* In case of repair and re-using TIME-WAIT sockets we still
		 * want to be sure that it is safe as above but honor the
		 * sequence numbers and time stamps set as part of the repair
		 * process.
		 *
		 * Without this check re-using a TIME-WAIT socket with TCP
		 * repair would accumulate a -1 on the repair assigned
		 * sequence number. The first time it is reused the sequence
		 * is -1, the second time -2, etc. This fixes that issue
		 * without appearing to create any others.
		 */
		if (likely(!tp->repair)) {
			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;

			/* write_seq of 0 means "pick an ISN" elsewhere;
			 * avoid it when the computed value wraps to 0.
			 */
			if (!seq)
				seq = 1;
			WRITE_ONCE(tp->write_seq, seq);
			tp->rx_opt.ts_recent = READ_ONCE(tcptw->tw_ts_recent);
			tp->rx_opt.ts_recent_stamp = ts_recent_stamp;
		}

		return 1;
	}

	return 0;
}
EXPORT_IPV6_MOD_GPL(tcp_twsk_unique);

/* Run the BPF INET4_CONNECT hook before tcp_v4_connect() proper. */
static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
			      int addr_len)
{
	/* This check is replicated from tcp_v4_connect() and intended to
	 * prevent BPF program called below from accessing bytes that are out
	 * of the bound specified by user in addr_len.
	 */
	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	sock_owned_by_me(sk);

	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
}

/* This will initiate an outgoing connection.
 */
/* Returns 0 on success or a negative errno. Caller holds the socket
 * lock (rcu_dereference_protected() below relies on lockdep_sock_is_held()).
 */
int tcp_v4_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_timewait_death_row *tcp_death_row;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct ip_options_rcu *inet_opt;
	struct net *net = sock_net(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	if (inet_opt && inet_opt->opt.srr) {
		/* Source routing: route towards the first hop, not daddr. */
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
			      orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

	if (!inet->inet_saddr) {
		/* Source address picked by routing: record it in bhash2. */
		err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET);
		if (err) {
			ip_rt_put(rt);
			return err;
		}
	} else {
		sk_rcv_saddr_set(sk, inet->inet_saddr);
	}

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			WRITE_ONCE(tp->write_seq, 0);
	}

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = psp_sk_overhead(sk);
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len += inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and not releasing socket
	 * lock select source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	/* Re-route now that the (possibly autoselected) sport is known. */
	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst);
	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	rt = NULL;

	if (likely(!tp->repair)) {
		if (!tp->write_seq)
			WRITE_ONCE(tp->write_seq,
				   secure_tcp_seq(inet->inet_saddr,
						  inet->inet_daddr,
						  inet->inet_sport,
						  usin->sin_port));
		WRITE_ONCE(tp->tsoffset,
			   secure_tcp_ts_off(net, inet->inet_saddr,
					     inet->inet_daddr));
	}

	atomic_set(&inet->inet_id, get_random_u16());

	/* Fast Open may defer the actual SYN until sendmsg() time. */
	if (tcp_fastopen_defer_connect(sk, &err))
		return err;
	if (err)
		goto failure;

	err = tcp_connect(sk);

	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	inet_bhash2_reset_saddr(sk);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_IPV6_MOD(tcp_v4_connect);

/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct dst_entry *dst;
	u32 mtu, dmtu;

	/* Nothing to do for listening or closed sockets. */
	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
		return;
	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to be wrong... Remember soft error
	 * for the case, if this connection will not able to recover.
	 */
	dmtu = dst4_mtu(dst);
	if (mtu < dmtu && ip_dont_fragment(sk, dst))
		WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > dmtu) {
		tcp_sync_mss(sk, dmtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_IPV6_MOD(tcp_v4_mtu_reduced);

/* Let the dst's ->redirect handler update the socket's cached route. */
static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}


/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);

	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	 */
	if (seq != tcp_rsk(req)->snt_isn) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
	} else if (abort) {
		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		tcp_listendrop(req->rsk_listener);
	}
	reqsk_put(req);
}
EXPORT_IPV6_MOD(tcp_req_err);

/* TCP-LD (RFC 6069) logic: revert one RTO backoff step when an ICMP
 * unreachable suggests the retransmitted segment was lost to a
 * transient routing problem rather than congestion.
 */
void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	s32 remaining;
	u32 delta_us;

	if (sock_owned_by_user(sk))
		return;

	/* Only act on ICMP for the head of the unacked queue, and only
	 * while we are actually retransmitting with backoff applied.
	 */
	if (seq != tp->snd_una || !icsk->icsk_retransmits ||
	    !icsk->icsk_backoff)
		return;

	skb = tcp_rtx_queue_head(sk);
	if (WARN_ON_ONCE(!skb))
		return;

	icsk->icsk_backoff--;
	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
	icsk->icsk_rto = inet_csk_rto_backoff(icsk, tcp_rto_max(sk));

	tcp_mstamp_refresh(tp);
	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);

	if (remaining > 0) {
		tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, remaining, false);
	} else {
		/* RTO revert clocked out retransmission.
		 * Will retransmit now.
		 */
		tcp_retransmit_timer(sk);
	}
}
EXPORT_IPV6_MOD(tcp_ld_RTO_revert);

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

int tcp_v4_err(struct sk_buff *skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct net *net = dev_net_rcu(skb->dev);
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct request_sock *fastopen;
	struct tcp_sock *tp;
	u32 seq, snd_una;
	struct sock *sk;
	int err;

	sk = __inet_lookup_established(net, iph->daddr, th->dest, iph->saddr,
				       ntohs(th->source), inet_iif(skb), 0);
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return -ENOENT;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		/* To increase the counter of ignored icmps for TCP-AO */
		tcp_ao_ignore_icmp(sk, AF_INET, type, code);
		inet_twsk_put(inet_twsk(sk));
		return 0;
	}
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		/* abort only for errors that are fatal to the handshake */
		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
				     type == ICMP_TIME_EXCEEDED ||
				     (type == ICMP_DEST_UNREACH &&
				      (code == ICMP_NET_UNREACH ||
				       code == ICMP_HOST_UNREACH)));
		return 0;
	}

	if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) {
		sock_put(sk);
		return 0;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (static_branch_unlikely(&ip4_min_ttl)) {
		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
			goto out;
		}
	}

	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = rcu_dereference(tp->fastopen_rsk);
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		if (!sock_owned_by_user(sk))
			do_redirect(skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs send out by Linux are always <576bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			WRITE_ONCE(tp->mtu_info, info);
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				/* Defer to tcp_release_cb(); hold the socket
				 * so it survives until then.
				 */
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if this ICMP message allows revert of backoff.
		 * (see RFC 6069)
		 */
		if (!fastopen &&
		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
			tcp_ld_RTO_revert(sk, seq);
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket is
		 * already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);

		if (!sock_owned_by_user(sk))
			tcp_done_with_error(sk, err);
		else
			WRITE_ONCE(sk->sk_err_soft, err);
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even this two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 * --ANK (980905)
	 */

	if (!sock_owned_by_user(sk) &&
	    inet_test_bit(RECVERR, sk)) {
		WRITE_ONCE(sk->sk_err, err);
		sk_error_report(sk);
	} else {	/* Only an error on timeout */
		WRITE_ONCE(sk->sk_err_soft, err);
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
	return 0;
}

/* Seed the pseudo-header checksum and set up csum_start/csum_offset so
 * the remainder of the checksum is filled in later (offload layout).
 */
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
	skb->csum_start = skb_transport_header(skb) - skb->head;
	skb->csum_offset = offsetof(struct tcphdr, check);
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_IPV6_MOD(tcp_v4_send_check);

#define REPLY_OPTIONS_LEN	(MAX_TCP_OPTION_SPACE / sizeof(__be32))

/* Sign an outgoing RST with TCP-AO. Returns true when the RST must be
 * dropped (AO preparation or hashing failed, or AO is not compiled in);
 * false when @reply_options now carries a valid AO option and @arg/@reply
 * have been updated to account for it.
 */
static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb,
				 const struct tcp_ao_hdr *aoh,
				 struct ip_reply_arg *arg, struct tcphdr *reply,
				 __be32 reply_options[REPLY_OPTIONS_LEN])
{
#ifdef CONFIG_TCP_AO
	int sdif = tcp_v4_sdif(skb);
	int dif = inet_iif(skb);
	int l3index = sdif ? dif : 0;
	bool allocated_traffic_key;
	struct tcp_ao_key *key;
	char *traffic_key;
	bool drop = true;
	u32 ao_sne = 0;
	u8 keyid;

	rcu_read_lock();
	if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq),
				 &key, &traffic_key, &allocated_traffic_key,
				 &keyid, &ao_sne))
		goto out;

	reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) |
				 (aoh->rnext_keyid << 8) | keyid);
	arg->iov[0].iov_len += tcp_ao_len_aligned(key);
	reply->doff = arg->iov[0].iov_len / 4;

	if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1],
			    key, traffic_key,
			    (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
			    (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
			    reply, ao_sne))
		goto out;
	drop = false;
out:
	rcu_read_unlock();
	if (allocated_traffic_key)
		kfree(traffic_key);
	return drop;
#else
	return true;
#endif
}

/*
 * This routine will send an RST to the other tcp.
 *
 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 * Answer: if a packet caused RST, it is not for a socket
 *	   existing in our system, if it is matched to a socket,
 *	   it is just duplicate segment or bug in other side's TCP.
 *	   So that we build reply only basing on parameters
 *	   arrived with segment.
 * Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
			      enum sk_rst_reason reason)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[REPLY_OPTIONS_LEN];
	} rep;
	const __u8 *md5_hash_location = NULL;
	const struct tcp_ao_hdr *aoh;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	unsigned char newhash[16];
	struct sock *sk1 = NULL;
#endif
	u64 transmit_time = 0;
	struct sock *ctl_sk;
	struct net *net;
	u32 txhash = 0;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest = th->source;
	rep.th.source = th->dest;
	rep.th.doff = sizeof(struct tcphdr) / 4;
	rep.th.rst = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		/* No ACK in the offending segment: acknowledge its whole
		 * sequence space so the RST is acceptable (RFC 793).
		 */
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len = sizeof(rep.th);

	net = sk ? sock_net(sk) : skb_dst_dev_net_rcu(skb);

	/* Invalid TCP option size or twice included auth */
	if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh))
		return;

	if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt))
		return;

#ifdef CONFIG_TCP_MD5SIG
	rcu_read_lock();
	if (sk && sk_fullsock(sk)) {
		const union tcp_md5_addr *addr;
		int l3index;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and inet_iif is set to it.
		 */
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	} else if (md5_hash_location) {
		const union tcp_md5_addr *addr;
		int sdif = tcp_v4_sdif(skb);
		int dif = inet_iif(skb);
		int l3index;

		/*
		 * active side is lost. Try to find listening socket through
		 * source port, and then find md5 key through listening socket.
		 * we are not loose security here:
		 * Incoming packet is checked with md5 hash with finding key,
		 * no RST generated if md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(net, NULL, 0, ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), dif, sdif);
		/* don't send rst if it can't find key */
		if (!sk1)
			goto out;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and dif is set to it.
		 */
		l3index = sdif ? dif : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
		if (!key)
			goto out;

		tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (memcmp(md5_hash_location, newhash, 16) != 0)
			goto out;
	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
	if (rep.opt[0] == 0) {
		__be32 mrst = mptcp_reset_option(skb);

		if (mrst) {
			rep.opt[0] = mrst;
			arg.iov[0].iov_len += sizeof(mrst);
			rep.th.doff = arg.iov[0].iov_len / 4;
		}
	}

	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
	if (sk)
		arg.bound_dev_if = sk->sk_bound_dev_if;

	trace_tcp_send_reset(sk, skb, reason);

	/* sk may be a (timewait) minisock; both layouts must agree here
	 * for the sk_bound_dev_if access above to be valid either way.
	 */
	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

	/* ECN bits of TW reset are cleared */
	arg.tos = ip_hdr(skb)->tos & ~INET_ECN_MASK;
	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);

	sock_net_set(ctl_sk, net);
	if (sk) {
		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
		transmit_time = tcp_transmit_time(sk);
		xfrm_sk_clone_policy(ctl_sk, sk);
		txhash = (sk->sk_state == TCP_TIME_WAIT) ?
			 inet_twsk(sk)->tw_txhash : sk->sk_txhash;
	} else {
		ctl_sk->sk_mark = 0;
		ctl_sk->sk_priority = 0;
	}
	ip_send_unicast_reply(ctl_sk, sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time, txhash);

	xfrm_sk_free_policy(ctl_sk);
	sock_net_set(ctl_sk, &init_net);
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
	local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
	rcu_read_unlock();
#endif
}

/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
 */

static void tcp_v4_send_ack(const struct sock *sk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_key *key,
			    int reply_flags, u8 tos, u32 txhash)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(MAX_TCP_OPTION_SPACE >> 2)];
	} rep;
	struct net *net = sock_net(sk);
	struct ip_reply_arg arg;
	struct sock *ctl_sk;
	u64 transmit_time;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len = sizeof(rep.th);
	if (tsecr) {
		/* Timestamp option first; any MD5/AO option is appended
		 * after it (offset 3 below when tsecr is set).
		 */
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive.
*/ 964 rep.th.dest = th->source; 965 rep.th.source = th->dest; 966 rep.th.doff = arg.iov[0].iov_len / 4; 967 rep.th.seq = htonl(seq); 968 rep.th.ack_seq = htonl(ack); 969 rep.th.ack = 1; 970 rep.th.window = htons(win); 971 972 #ifdef CONFIG_TCP_MD5SIG 973 if (tcp_key_is_md5(key)) { 974 int offset = (tsecr) ? 3 : 0; 975 976 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) | 977 (TCPOPT_NOP << 16) | 978 (TCPOPT_MD5SIG << 8) | 979 TCPOLEN_MD5SIG); 980 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; 981 rep.th.doff = arg.iov[0].iov_len/4; 982 983 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset], 984 key->md5_key, ip_hdr(skb)->saddr, 985 ip_hdr(skb)->daddr, &rep.th); 986 } 987 #endif 988 #ifdef CONFIG_TCP_AO 989 if (tcp_key_is_ao(key)) { 990 int offset = (tsecr) ? 3 : 0; 991 992 rep.opt[offset++] = htonl((TCPOPT_AO << 24) | 993 (tcp_ao_len(key->ao_key) << 16) | 994 (key->ao_key->sndid << 8) | 995 key->rcv_next); 996 arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key); 997 rep.th.doff = arg.iov[0].iov_len / 4; 998 999 tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset], 1000 key->ao_key, key->traffic_key, 1001 (union tcp_ao_addr *)&ip_hdr(skb)->saddr, 1002 (union tcp_ao_addr *)&ip_hdr(skb)->daddr, 1003 &rep.th, key->sne); 1004 } 1005 #endif 1006 arg.flags = reply_flags; 1007 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 1008 ip_hdr(skb)->saddr, /* XXX */ 1009 arg.iov[0].iov_len, IPPROTO_TCP, 0); 1010 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 1011 if (oif) 1012 arg.bound_dev_if = oif; 1013 arg.tos = tos; 1014 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL); 1015 local_bh_disable(); 1016 local_lock_nested_bh(&ipv4_tcp_sk.bh_lock); 1017 ctl_sk = this_cpu_read(ipv4_tcp_sk.sock); 1018 sock_net_set(ctl_sk, net); 1019 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 1020 inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark); 1021 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? 
inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
	transmit_time = tcp_transmit_time(sk);
	ip_send_unicast_reply(ctl_sk, sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time, txhash);

	/* Reset the per-cpu control socket back to init_net before
	 * releasing it for the next user.
	 */
	sock_net_set(ctl_sk, &init_net);
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
	local_bh_enable();
}

/* Send an ACK on behalf of a TIME_WAIT socket, picking the signing key
 * (TCP-AO or MD5, when configured) from the timewait socket state.
 * Consumes the timewait reference via inet_twsk_put().
 */
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb,
				enum tcp_tw_status tw_status)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
	struct tcp_key key = {};
	u8 tos = tw->tw_tos;

	/* Cleaning only ECN bits of TW ACKs of oow data or is paws_reject,
	 * while not cleaning ECN bits of other TW ACKs to avoid these ACKs
	 * being placed in different service queues (Classic rather than L4S)
	 */
	if (tw_status == TCP_TW_ACK_OOW)
		tos &= ~INET_ECN_MASK;

#ifdef CONFIG_TCP_AO
	struct tcp_ao_info *ao_info;

	if (static_branch_unlikely(&tcp_ao_needed.key)) {
		/* FIXME: the segment to-be-acked is not verified yet */
		ao_info = rcu_dereference(tcptw->ao_info);
		if (ao_info) {
			const struct tcp_ao_hdr *aoh;

			/* Malformed auth options: drop without ACKing */
			if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) {
				inet_twsk_put(tw);
				return;
			}

			if (aoh)
				key.ao_key = tcp_ao_established_key(sk, ao_info,
								    aoh->rnext_keyid, -1);
		}
	}
	if (key.ao_key) {
		struct tcp_ao_key *rnext_key;

		key.traffic_key = snd_other_key(key.ao_key);
		key.sne = READ_ONCE(ao_info->snd_sne);
		rnext_key = READ_ONCE(ao_info->rnext_key);
		key.rcv_next = rnext_key->rcvid;
		key.type = TCP_KEY_AO;
#else
	if (0) {
#endif
	} else if (static_branch_tcp_md5()) {
		key.md5_key = tcp_twsk_md5_key(tcptw);
		if (key.md5_key)
			key.type = TCP_KEY_MD5;
	}

	tcp_v4_send_ack(sk, skb,
			tcptw->tw_snd_nxt, READ_ONCE(tcptw->tw_rcv_nxt),
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_tw_tsval(tcptw),
			READ_ONCE(tcptw->tw_ts_recent),
			tw->tw_bound_dev_if, &key,
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tos,
			tw->tw_txhash);

	inet_twsk_put(tw);
}

/* Send an ACK on behalf of a request socket (regular SYN_RECV or Fast
 * Open).  When TCP-AO is in use for the request, a traffic key is
 * derived on the fly (GFP_ATOMIC kmalloc) and freed after the ACK.
 */
static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	struct tcp_key key = {};

	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
					     tcp_sk(sk)->snd_nxt;

#ifdef CONFIG_TCP_AO
	if (static_branch_unlikely(&tcp_ao_needed.key) &&
	    tcp_rsk_used_ao(req)) {
		const union tcp_md5_addr *addr;
		const struct tcp_ao_hdr *aoh;
		int l3index;

		/* Invalid TCP option size or twice included auth */
		if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
			return;
		if (!aoh)
			return;

		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET,
					      aoh->rnext_keyid, -1);
		if (unlikely(!key.ao_key)) {
			/* Send ACK with any matching MKT for the peer */
			key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1);
			/* Matching key disappeared (user removed the key?)
			 * let the handshake timeout.
			 */
			if (!key.ao_key) {
				net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n",
						     addr,
						     ntohs(tcp_hdr(skb)->source),
						     &ip_hdr(skb)->daddr,
						     ntohs(tcp_hdr(skb)->dest));
				return;
			}
		}
		key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC);
		if (!key.traffic_key)
			return;

		key.type = TCP_KEY_AO;
		key.rcv_next = aoh->keyid;
		tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req);
#else
	if (0) {
#endif
	} else if (static_branch_tcp_md5()) {
		const union tcp_md5_addr *addr;
		int l3index;

		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
		if (key.md5_key)
			key.type = TCP_KEY_MD5;
	}

	/* Cleaning ECN bits of TW ACKs of oow data or is paws_reject */
	tcp_v4_send_ack(sk, skb, seq,
			tcp_rsk(req)->rcv_nxt,
			tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale,
			tcp_rsk_tsval(tcp_rsk(req)),
			req->ts_recent,
			0, &key,
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos & ~INET_ECN_MASK,
			READ_ONCE(tcp_rsk(req)->txhash));
	if (tcp_key_is_ao(&key))
		kfree(key.traffic_key);
}

/*
 * Send a SYN-ACK after having received a SYN.
 * This still operates on a request_sock only, not on a big
 * socket.
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
			      struct flowi *fl,
			      struct request_sock *req,
			      struct tcp_fastopen_cookie *foc,
			      enum tcp_synack_type synack_type,
			      struct sk_buff *syn_skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;
	u8 tos;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);

	if (skb) {
		tcp_rsk(req)->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK;
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

		tos = READ_ONCE(inet_sk(sk)->tos);

		/* Optionally reflect the SYN's DSCP bits into the SYN-ACK,
		 * preserving our own ECN bits.
		 */
		if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
			tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
			      (tos & INET_ECN_MASK);

		if (!INET_ECN_is_capable(tos) &&
		    tcp_bpf_ca_needs_ecn((struct sock *)req))
			tos |= INET_ECN_ECT_0;

		rcu_read_lock();
		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
					    rcu_dereference(ireq->ireq_opt),
					    tos);
		rcu_read_unlock();
		err = net_xmit_eval(err);
	}

	return err;
}

/*
 * IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	/* Last reference: safe to free ireq_opt without RCU protection,
	 * hence rcu_dereference_protected(..., 1).
	 */
	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
EXPORT_IPV6_MOD(tcp_md5_needed);

/* Order two candidate keys matching the same address: a key bound to an
 * L3 device (l3index != 0) always beats an unbound one; otherwise the
 * longer prefix wins.
 */
static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
{
	if (!old)
		return true;

	/* l3index always overrides non-l3index */
	if (old->l3index && new->l3index == 0)
		return false;
	if (old->l3index == 0 && new->l3index)
		return true;

	return old->prefixlen < new->prefixlen;
}

/* Find the Key structure for an address.
*/
struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
					   const union tcp_md5_addr *addr,
					   int family, bool any_l3index)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	const struct tcp_md5sig_info *md5sig;
	__be32 mask;
	struct tcp_md5sig_key *best_match = NULL;
	bool match;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;

	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		/* Skip keys bound to another L3 device, unless the caller
		 * asked to ignore L3 binding (any_l3index).
		 */
		if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX &&
		    key->l3index != l3index)
			continue;
		if (family == AF_INET) {
			mask = inet_make_mask(key->prefixlen);
			match = (key->addr.a4.s_addr & mask) ==
				(addr->a4.s_addr & mask);
#if IS_ENABLED(CONFIG_IPV6)
		} else if (family == AF_INET6) {
			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
						  key->prefixlen);
#endif
		} else {
			match = false;
		}

		/* Keep the most specific match, see better_md5_match() */
		if (match && better_md5_match(best_match, key))
			best_match = key;
	}
	return best_match;
}
EXPORT_IPV6_MOD(__tcp_md5_do_lookup);

/* Exact-match lookup used by add/delete: family, the IFINDEX flag bit,
 * l3index, address and prefixlen must all be identical.
 */
static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
						      const union tcp_md5_addr *addr,
						      int family, u8 prefixlen,
						      int l3index, u8 flags)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	const struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif

	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
			continue;
		if (key->l3index != l3index)
			continue;
		if (!memcmp(&key->addr, addr, size) &&
		    key->prefixlen == prefixlen)
			return key;
	}
	return NULL;
}

/* Look up the MD5 key to use towards addr_sk's peer address. */
struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
					 const struct sock *addr_sk)
{
	const union tcp_md5_addr *addr;
	int l3index;

	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
						 addr_sk->sk_bound_dev_if);
	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
}
EXPORT_IPV6_MOD(tcp_v4_md5_lookup);

/* Allocate and attach the per-socket MD5 key list head.
 * Returns 0 on success, -ENOMEM on allocation failure.
 */
static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	md5sig = kmalloc(sizeof(*md5sig), gfp);
	if (!md5sig)
		return -ENOMEM;

	/* MD5-signed segments cannot use GSO on this socket */
	sk_gso_disable(sk);
	INIT_HLIST_HEAD(&md5sig->head);
	rcu_assign_pointer(tp->md5sig_info, md5sig);
	return 0;
}

/* This can be called on a newly created socket, from other files */
static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
			    int family, u8 prefixlen, int l3index, u8 flags,
			    const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
	if (key) {
		/* Pre-existing entry - just update that one.
		 * Note that the key might be used concurrently.
		 * data_race() is telling kcsan that we do not care of
		 * key mismatches, since changing MD5 key on live flows
		 * can lead to packet drops.
		 */
		data_race(memcpy(key->key, newkey, newkeylen));

		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
		 * Also note that a reader could catch new key->keylen value
		 * but old key->key[], this is the reason we use __GFP_ZERO
		 * at sock_kmalloc() time below these lines.
		 */
		WRITE_ONCE(key->keylen, newkeylen);

		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   lockdep_sock_is_held(sk));

	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
	if (!key)
		return -ENOMEM;

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	key->prefixlen = prefixlen;
	key->l3index = l3index;
	key->flags = flags;
	memcpy(&key->addr, addr,
	       (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
								 sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}

/* Add (or update) an MD5 key from process context (setsockopt path).
 * Returns 0, -EOPNOTSUPP under FIPS, -ENOMEM, or -EUSERS when the
 * static key cannot be enabled.
 */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, u8 prefixlen, int l3index, u8 flags,
		   const u8 *newkey, u8 newkeylen)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
		if (fips_enabled) {
			pr_warn_once("TCP-MD5 support is disabled due to FIPS\n");
			return -EOPNOTSUPP;
		}

		if (tcp_md5sig_info_add(sk, GFP_KERNEL))
			return -ENOMEM;

		/* Enable the MD5 static branch; on failure undo the
		 * md5sig_info allocation made just above.
		 */
		if (!static_branch_inc(&tcp_md5_needed.key)) {
			struct tcp_md5sig_info *md5sig;

			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
			rcu_assign_pointer(tp->md5sig_info, NULL);
			kfree_rcu(md5sig, rcu);
			return -EUSERS;
		}
	}

	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
				newkey, newkeylen, GFP_KERNEL);
}
EXPORT_IPV6_MOD(tcp_md5_do_add);

/* Copy an existing MD5 key onto a socket (atomic context allowed). */
int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr
*addr,
		     int family, u8 prefixlen, int l3index,
		     struct tcp_md5sig_key *key)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {

		if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC)))
			return -ENOMEM;

		/* Atomic context: bump the static key without sleeping;
		 * on failure undo the md5sig_info allocation above.
		 */
		if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
			struct tcp_md5sig_info *md5sig;

			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
			net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
			rcu_assign_pointer(tp->md5sig_info, NULL);
			kfree_rcu(md5sig, rcu);
			return -EUSERS;
		}
	}

	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
				key->flags, key->key, key->keylen,
				sk_gfp_mask(sk, GFP_ATOMIC));
}
EXPORT_IPV6_MOD(tcp_md5_key_copy);

/* Delete the key exactly matching (addr, family, prefixlen, l3index,
 * flags).  Returns -ENOENT if no such key exists.
 */
int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
		   u8 prefixlen, int l3index, u8 flags)
{
	struct tcp_md5sig_key *key;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	/* Key memory was charged to the socket via sock_kmalloc() */
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	return 0;
}
EXPORT_IPV6_MOD(tcp_md5_do_del);

/* Release every MD5 key attached to the socket.  Uses plain kfree()
 * rather than kfree_rcu() - presumably callers run when no RCU readers
 * can still see the list; confirm at call sites.
 */
void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree(key);
	}
}

/* setsockopt(TCP_MD5SIG / TCP_MD5SIG_EXT) handler: validate the user
 * request and add or delete the corresponding IPv4 MD5 key.
 */
static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
				 sockptr_t optval, int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
	const union tcp_md5_addr *addr;
	u8 prefixlen = 32;
	int l3index = 0;
	bool l3flag;
	u8 flags;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
	l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;

	if (optname == TCP_MD5SIG_EXT &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
		prefixlen = cmd.tcpm_prefixlen;
		if (prefixlen > 32)
			return -EINVAL;
	}

	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
		if (dev && netif_is_l3_master(dev))
			l3index = dev->ifindex;

		rcu_read_unlock();

		/* ok to reference set/not set outside of rcu;
		 * right now device MUST be an L3 master
		 */
		if (!dev || !l3index)
			return -EINVAL;
	}

	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;

	/* Zero key length means "delete the key" */
	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	/* Don't allow keys for peers that have a matching TCP-AO key.
	 * See the comment in tcp_ao_add_cmd()
	 */
	if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false))
		return -EKEYREJECTED;

	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
			      cmd.tcpm_key, cmd.tcpm_keylen);
}

/* Feed the IPv4 pseudo-header plus the TCP header (with its checksum
 * field zeroed) into the MD5 context.
 */
static void tcp_v4_md5_hash_headers(struct md5_ctx *ctx,
				    __be32 daddr, __be32 saddr,
				    const struct tcphdr *th, int nbytes)
{
	struct {
		struct tcp4_pseudohdr ip;
		struct tcphdr tcp;
	} h;

	h.ip.saddr = saddr;
	h.ip.daddr = daddr;
	h.ip.pad = 0;
	h.ip.protocol = IPPROTO_TCP;
	h.ip.len = cpu_to_be16(nbytes);
	h.tcp = *th;
	h.tcp.check = 0;
	md5_update(ctx, (const u8 *)&h, sizeof(h.ip) + sizeof(h.tcp));
}

/* MD5 signature over pseudo-header + TCP header + key only (no
 * payload): used when building replies from a raw header.
 */
static noinline_for_stack void
tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
		    __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct md5_ctx ctx;

	md5_init(&ctx);
	tcp_v4_md5_hash_headers(&ctx, daddr, saddr, th, th->doff << 2);
	tcp_md5_hash_key(&ctx, key);
	md5_final(&ctx, md5_hash);
}

/* MD5 signature for a full segment (headers + payload).  Addresses come
 * from the socket when one is supplied, else from the IP header.
 */
noinline_for_stack void
tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
		    const struct sock *sk, const struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;
	struct md5_ctx ctx;

	if (sk) { /* valid for establish/request sockets */
		saddr = sk->sk_rcv_saddr;
		daddr = sk->sk_daddr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);

		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	md5_init(&ctx);
	tcp_v4_md5_hash_headers(&ctx, daddr, saddr, th, skb->len);
	tcp_md5_hash_skb_data(&ctx, skb, th->doff << 2);
	tcp_md5_hash_key(&ctx, key);
	md5_final(&ctx, md5_hash);
}
EXPORT_IPV6_MOD(tcp_v4_md5_hash_skb);

#endif

/* Initialize the IPv4-specific fields of a freshly allocated request
 * sock from the incoming SYN.
 */
static void tcp_v4_init_req(struct request_sock *req,
			    const struct sock *sk_listener,
			    struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);
struct net *net = sock_net(sk_listener);

	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
	/* Keep any IP options carried by the SYN for the reply path */
	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
}

/* Initialize the request sock and find a route for the SYN-ACK.
 * Returns NULL when the security hook rejects the connection or no
 * route is found.
 */
static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
					  struct sk_buff *skb,
					  struct flowi *fl,
					  struct request_sock *req,
					  u32 tw_isn)
{
	tcp_v4_init_req(req, sk, skb);

	if (security_inet_conn_request(sk, skb, req))
		return NULL;

	return inet_csk_route_req(sk, &fl->u.ip4, req);
}

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		= PF_INET,
	.obj_size	= sizeof(struct tcp_request_sock),
	.send_ack	= tcp_v4_reqsk_send_ack,
	.destructor	= tcp_v4_reqsk_destructor,
	.send_reset	= tcp_v4_send_reset,
};

const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	= TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.req_md5_lookup	= tcp_v4_md5_lookup,
	.calc_md5_hash	= tcp_v4_md5_hash_skb,
#endif
#ifdef CONFIG_TCP_AO
	.ao_lookup	= tcp_v4_ao_lookup_rsk,
	.ao_calc_key	= tcp_v4_ao_calc_key_rsk,
	.ao_synack_hash	= tcp_v4_ao_synack_hash,
#endif
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq = cookie_v4_init_sequence,
#endif
	.route_req	= tcp_v4_route_req,
	.init_seq	= tcp_v4_init_seq,
	.init_ts_off	= tcp_v4_init_ts_off,
	.send_synack	= tcp_v4_send_synack,
};

/* Entry point for an incoming IPv4 SYN on a listener. */
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	/* Never answer to SYNs send to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&tcp_request_sock_ops,
				&tcp_request_sock_ipv4_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0;
}
EXPORT_IPV6_MOD(tcp_v4_conn_request);


/*
 * The three
way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req,
				  void (*opt_child_init)(struct sock *newsk,
							 const struct sock *sk))
{
	struct inet_request_sock *ireq;
	bool found_dup_sk = false;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	const union tcp_md5_addr *addr;
	struct tcp_md5sig_key *key;
	int l3index;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp = tcp_sk(newsk);
	newinet = inet_sk(newsk);
	ireq = inet_rsk(req);
	inet_opt = rcu_dereference(ireq->ireq_opt);
	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
	newinet->mc_index = inet_iif(skb);
	newinet->mc_ttl = ip_hdr(skb)->ttl;
	newinet->rcv_tos = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	atomic_set(&newinet->inet_id, get_random_u16());

	/* Set ToS of the new socket based upon the value of incoming SYN.
	 * ECT bits are set later in tcp_init_transfer().
	 */
	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

#if IS_ENABLED(CONFIG_IPV6)
	if (opt_child_init)
		opt_child_init(newsk, sk);
#endif
	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst4_mtu(dst));
	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
	/* Copy over the MD5 key from the original socket */
	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	if (key && !tcp_rsk_used_ao(req)) {
		if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
			goto put_and_exit;
		sk_gso_disable(newsk);
	}
#endif
#ifdef CONFIG_TCP_AO
	if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET))
		goto put_and_exit; /* OOM, release back memory */
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
				       &found_dup_sk);
	if (likely(*own_req)) {
		tcp_move_syn(newtp, req);
		ireq->ireq_opt = NULL;
	} else {
		newinet->inet_opt = NULL;

		if (!req_unhash && found_dup_sk) {
			/* This code path should only be executed in the
			 * syncookie case only
			 */
			bh_unlock_sock(newsk);
			sock_put(newsk);
			newsk = NULL;
		}
	}
	return newsk;

exit_overflow:
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	tcp_listendrop(sk);
	return NULL;
put_and_exit:
	newinet->inet_opt = NULL;
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_IPV6_MOD(tcp_v4_syn_recv_sock);

/* If syncookies are compiled in and the segment is not a SYN, try to
 * validate it as a syncookie ACK; may return a new (child) socket.
 */
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	const struct tcphdr *th = tcp_hdr(skb);

	if (!th->syn)
		sk = cookie_v4_check(sk, skb);
#endif
	return sk;
}

/* Compute a syncookie sequence number and MSS for the given headers.
 * Returns 0 when syncookies are disabled or no MSS could be derived.
 */
u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
			 struct tcphdr *th, u32 *cookie)
{
	u16 mss = 0;
#ifdef CONFIG_SYN_COOKIES
	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
				    &tcp_request_sock_ipv4_ops, sk, th);
	if (mss) {
		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
		tcp_synq_overflow(sk);
	}
#endif
	return mss;
}

INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
							   u32));
/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
*/
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	enum skb_drop_reason reason;
	struct sock *rsk;

	/* PSP Rx policy check comes before any TCP state processing */
	reason = psp_sk_rx_policy_check(sk, skb);
	if (reason)
		goto err_discard;

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst;

		dst = rcu_dereference_protected(sk->sk_rx_dst,
						lockdep_sock_is_held(sk));

		sock_rps_save_rxhash(sk, skb);
		sk_mark_napi_id(sk, skb);
		if (dst) {
			/* Drop the cached rx route when the packet came in
			 * on a different interface or the route is stale.
			 */
			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
					     dst, 0)) {
				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
				dst_release(dst);
			}
		}
		tcp_rcv_established(sk, skb);
		return 0;
	}

	if (tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_cookie_check(sk, skb);

		if (!nsk)
			return 0;
		if (nsk != sk) {
			/* Syncookie created a child socket: process the
			 * segment on it; reset on failure.
			 */
			reason = tcp_child_process(sk, nsk, skb);
			if (reason) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	reason = tcp_rcv_state_process(sk, skb);
	if (reason) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason));
discard:
	sk_skb_reason_drop(sk, skb, reason);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	reason = SKB_DROP_REASON_TCP_CSUM;
	trace_tcp_bad_csum(skb);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
err_discard:
	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);

/* Early demux: look up an established socket for the incoming skb and
 * attach it (and its cached rx dst, when valid) before full IP
 * processing.  Always returns 0.
 */
int tcp_v4_early_demux(struct sk_buff *skb)
{
	struct net *net = dev_net_rcu(skb->dev);
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;

	if (skb->pkt_type != PACKET_HOST)
		return 0;

	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
		return 0;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	if (th->doff < sizeof(struct tcphdr) / 4)
		return 0;

	sk = __inet_lookup_established(net, iph->saddr, th->source,
				       iph->daddr, ntohs(th->dest),
				       skb->skb_iif, inet_sdif(skb));
	if (sk) {
		skb->sk = sk;
		skb->destructor = sock_edemux;
		if (sk_fullsock(sk)) {
			struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);

			if (dst)
				dst = dst_check(dst, 0);
			if (dst &&
			    sk->sk_rx_dst_ifindex == skb->skb_iif)
				skb_dst_set_noref(skb, dst);
		}
	}
	return 0;
}

/* Queue an skb to the socket backlog while the socket is owned by a
 * user, coalescing with the backlog tail when possible.  Returns true
 * if the skb was dropped (with *reason set); on those paths the socket
 * spinlock has already been released via bh_unlock_sock().
 */
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
		     enum skb_drop_reason *reason)
{
	u32 tail_gso_size, tail_gso_segs;
	struct skb_shared_info *shinfo;
	const struct tcphdr *th;
	struct tcphdr *thtail;
	struct sk_buff *tail;
	unsigned int hdrlen;
	bool fragstolen;
	u32 gso_segs;
	u32 gso_size;
	u64 limit;
	int delta;
	int err;

	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
	 * we can fix skb->truesize to its real value to avoid future drops.
	 * This is valid because skb is not yet charged to the socket.
	 * It has been noticed pure SACK packets were sometimes dropped
	 * (if cooked by drivers without copybreak feature).
*/
	skb_condense(skb);

	tcp_cleanup_skb(skb);

	if (unlikely(tcp_checksum_complete(skb))) {
		bh_unlock_sock(sk);
		trace_tcp_bad_csum(skb);
		*reason = SKB_DROP_REASON_TCP_CSUM;
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
		return true;
	}

	/* Attempt coalescing to last skb in backlog, even if we are
	 * above the limits.
	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
	 */
	th = (const struct tcphdr *)skb->data;
	hdrlen = th->doff * 4;

	tail = sk->sk_backlog.tail;
	if (!tail)
		goto no_coalesce;
	thtail = (struct tcphdr *)tail->data;

	/* Coalesce only contiguous, plain-ACK segments with identical
	 * DSCP/ECN state, matching TCP options and compatible PSP state.
	 */
	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
	    ((TCP_SKB_CB(tail)->tcp_flags |
	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
	    !((TCP_SKB_CB(tail)->tcp_flags &
	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
	    ((TCP_SKB_CB(tail)->tcp_flags ^
	      TCP_SKB_CB(skb)->tcp_flags) &
	     (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE)) ||
	    !tcp_skb_can_collapse_rx(tail, skb) ||
	    thtail->doff != th->doff ||
	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)) ||
	    /* prior to PSP Rx policy check, retain exact PSP metadata */
	    psp_skb_coalesce_diff(tail, skb))
		goto no_coalesce;

	__skb_pull(skb, hdrlen);

	shinfo = skb_shinfo(skb);
	gso_size = shinfo->gso_size ?: skb->len;
	gso_segs = shinfo->gso_segs ?: 1;

	shinfo = skb_shinfo(tail);
	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
	tail_gso_segs = shinfo->gso_segs ?: 1;

	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;

		/* Only advance ack_seq/window, never move them backwards */
		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
			thtail->window = th->window;
		}

		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
		 * thtail->fin, so that the fast path in tcp_rcv_established()
		 * is not entered if we append a packet with a FIN.
		 * SYN, RST, URG are not present.
		 * ACK is set on both packets.
		 * PSH : we do not really care in TCP stack,
		 * at least for 'GRO' packets.
		 */
		thtail->fin |= th->fin;
		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;

		if (TCP_SKB_CB(skb)->has_rxtstamp) {
			TCP_SKB_CB(tail)->has_rxtstamp = true;
			tail->tstamp = skb->tstamp;
			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
		}

		/* Not as strict as GRO. We only need to carry mss max value */
		shinfo->gso_size = max(gso_size, tail_gso_size);
		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);

		sk->sk_backlog.len += delta;
		__NET_INC_STATS(sock_net(sk),
				LINUX_MIB_TCPBACKLOGCOALESCE);
		kfree_skb_partial(skb, fragstolen);
		return false;
	}
	__skb_push(skb, hdrlen);

no_coalesce:
	/* sk->sk_backlog.len is reset only at the end of __release_sock().
	 * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach
	 * sk_rcvbuf in normal conditions.
	 */
	limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1;

	limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1;

	/* Only socket owner can try to collapse/prune rx queues
	 * to reduce memory overhead, so add a little headroom here.
	 * Few sockets backlog are possibly concurrently non empty.
	 */
	limit += 64 * 1024;

	limit = min_t(u64, limit, UINT_MAX);

	err = sk_add_backlog(sk, skb, limit);
	if (unlikely(err)) {
		bh_unlock_sock(sk);
		if (err == -ENOMEM) {
			*reason = SKB_DROP_REASON_PFMEMALLOC;
			__NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP);
		} else {
			*reason = SKB_DROP_REASON_SOCKET_BACKLOG;
			__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
		}
		return true;
	}
	return false;
}
EXPORT_IPV6_MOD(tcp_add_backlog);

/* Undo tcp_v4_fill_cb(): move the IP control block back into place so
 * IP-layer code can use the skb again.
 */
static void tcp_v4_restore_cb(struct sk_buff *skb)
{
	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
		sizeof(struct inet_skb_parm));
}

/* Move IPCB out of the way and populate TCP_SKB_CB from the headers. */
static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
			   const struct tcphdr *th)
{
	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
	 * barrier() makes sure compiler wont play fool^Waliasing games.
	 */
	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
		sizeof(struct inet_skb_parm));
	barrier();

	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	/* SYN and FIN each occupy one unit of sequence space */
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->tcp_flags = tcp_flags_ntohs(th);
	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
	TCP_SKB_CB(skb)->sacked = 0;
	TCP_SKB_CB(skb)->has_rxtstamp =
			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
}

/*
 * From tcp_input.c
 */

int tcp_v4_rcv(struct sk_buff *skb)
{
	struct net *net = dev_net_rcu(skb->dev);
	enum skb_drop_reason drop_reason;
	enum tcp_tw_status tw_status;
	int sdif = inet_sdif(skb);
	int dif = inet_iif(skb);
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk = NULL;
	bool refcounted;
	int ret;
	u32 isn;

	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
2164 if (skb->pkt_type != PACKET_HOST) 2165 goto discard_it; 2166 2167 /* Count it even if it's bad */ 2168 __TCP_INC_STATS(net, TCP_MIB_INSEGS); 2169 2170 if (!pskb_may_pull(skb, sizeof(struct tcphdr))) 2171 goto discard_it; 2172 2173 th = (const struct tcphdr *)skb->data; 2174 2175 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) { 2176 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL; 2177 goto bad_packet; 2178 } 2179 if (!pskb_may_pull(skb, th->doff * 4)) 2180 goto discard_it; 2181 2182 /* An explanation is required here, I think. 2183 * Packet length and doff are validated by header prediction, 2184 * provided case of th->doff==0 is eliminated. 2185 * So, we defer the checks. */ 2186 2187 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo)) 2188 goto csum_error; 2189 2190 th = (const struct tcphdr *)skb->data; 2191 iph = ip_hdr(skb); 2192 lookup: 2193 sk = __inet_lookup_skb(skb, __tcp_hdrlen(th), th->source, 2194 th->dest, sdif, &refcounted); 2195 if (!sk) 2196 goto no_tcp_socket; 2197 2198 if (sk->sk_state == TCP_TIME_WAIT) 2199 goto do_time_wait; 2200 2201 if (sk->sk_state == TCP_NEW_SYN_RECV) { 2202 struct request_sock *req = inet_reqsk(sk); 2203 bool req_stolen = false; 2204 struct sock *nsk; 2205 2206 sk = req->rsk_listener; 2207 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) 2208 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2209 else 2210 drop_reason = tcp_inbound_hash(sk, req, skb, 2211 &iph->saddr, &iph->daddr, 2212 AF_INET, dif, sdif); 2213 if (unlikely(drop_reason)) { 2214 sk_drops_skbadd(sk, skb); 2215 reqsk_put(req); 2216 goto discard_it; 2217 } 2218 if (tcp_checksum_complete(skb)) { 2219 reqsk_put(req); 2220 goto csum_error; 2221 } 2222 if (unlikely(sk->sk_state != TCP_LISTEN)) { 2223 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb); 2224 if (!nsk) { 2225 inet_csk_reqsk_queue_drop_and_put(sk, req); 2226 goto lookup; 2227 } 2228 sk = nsk; 2229 /* reuseport_migrate_sock() has already held one sk_refcnt 2230 * before returning. 
2231 */ 2232 } else { 2233 /* We own a reference on the listener, increase it again 2234 * as we might lose it too soon. 2235 */ 2236 sock_hold(sk); 2237 } 2238 refcounted = true; 2239 nsk = NULL; 2240 if (!tcp_filter(sk, skb, &drop_reason)) { 2241 th = (const struct tcphdr *)skb->data; 2242 iph = ip_hdr(skb); 2243 tcp_v4_fill_cb(skb, iph, th); 2244 nsk = tcp_check_req(sk, skb, req, false, &req_stolen, 2245 &drop_reason); 2246 } 2247 if (!nsk) { 2248 reqsk_put(req); 2249 if (req_stolen) { 2250 /* Another cpu got exclusive access to req 2251 * and created a full blown socket. 2252 * Try to feed this packet to this socket 2253 * instead of discarding it. 2254 */ 2255 tcp_v4_restore_cb(skb); 2256 sock_put(sk); 2257 goto lookup; 2258 } 2259 goto discard_and_relse; 2260 } 2261 nf_reset_ct(skb); 2262 if (nsk == sk) { 2263 reqsk_put(req); 2264 tcp_v4_restore_cb(skb); 2265 } else { 2266 drop_reason = tcp_child_process(sk, nsk, skb); 2267 if (drop_reason) { 2268 enum sk_rst_reason rst_reason; 2269 2270 rst_reason = sk_rst_convert_drop_reason(drop_reason); 2271 tcp_v4_send_reset(nsk, skb, rst_reason); 2272 goto discard_and_relse; 2273 } 2274 sock_put(sk); 2275 return 0; 2276 } 2277 } 2278 2279 process: 2280 if (static_branch_unlikely(&ip4_min_ttl)) { 2281 /* min_ttl can be changed concurrently from do_ip_setsockopt() */ 2282 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) { 2283 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 2284 drop_reason = SKB_DROP_REASON_TCP_MINTTL; 2285 goto discard_and_relse; 2286 } 2287 } 2288 2289 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { 2290 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2291 goto discard_and_relse; 2292 } 2293 2294 drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr, 2295 AF_INET, dif, sdif); 2296 if (drop_reason) 2297 goto discard_and_relse; 2298 2299 nf_reset_ct(skb); 2300 2301 if (tcp_filter(sk, skb, &drop_reason)) 2302 goto discard_and_relse; 2303 2304 th = (const struct tcphdr *)skb->data; 
2305 iph = ip_hdr(skb); 2306 tcp_v4_fill_cb(skb, iph, th); 2307 2308 skb->dev = NULL; 2309 2310 if (sk->sk_state == TCP_LISTEN) { 2311 ret = tcp_v4_do_rcv(sk, skb); 2312 goto put_and_return; 2313 } 2314 2315 sk_incoming_cpu_update(sk); 2316 2317 bh_lock_sock_nested(sk); 2318 tcp_segs_in(tcp_sk(sk), skb); 2319 ret = 0; 2320 if (!sock_owned_by_user(sk)) { 2321 ret = tcp_v4_do_rcv(sk, skb); 2322 } else { 2323 if (tcp_add_backlog(sk, skb, &drop_reason)) 2324 goto discard_and_relse; 2325 } 2326 bh_unlock_sock(sk); 2327 2328 put_and_return: 2329 if (refcounted) 2330 sock_put(sk); 2331 2332 return ret; 2333 2334 no_tcp_socket: 2335 drop_reason = SKB_DROP_REASON_NO_SOCKET; 2336 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) 2337 goto discard_it; 2338 2339 tcp_v4_fill_cb(skb, iph, th); 2340 2341 if (tcp_checksum_complete(skb)) { 2342 csum_error: 2343 drop_reason = SKB_DROP_REASON_TCP_CSUM; 2344 trace_tcp_bad_csum(skb); 2345 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); 2346 bad_packet: 2347 __TCP_INC_STATS(net, TCP_MIB_INERRS); 2348 } else { 2349 tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason)); 2350 } 2351 2352 discard_it: 2353 SKB_DR_OR(drop_reason, NOT_SPECIFIED); 2354 /* Discard frame. 
*/ 2355 sk_skb_reason_drop(sk, skb, drop_reason); 2356 return 0; 2357 2358 discard_and_relse: 2359 sk_drops_skbadd(sk, skb); 2360 if (refcounted) 2361 sock_put(sk); 2362 goto discard_it; 2363 2364 do_time_wait: 2365 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { 2366 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2367 inet_twsk_put(inet_twsk(sk)); 2368 goto discard_it; 2369 } 2370 2371 tcp_v4_fill_cb(skb, iph, th); 2372 2373 if (tcp_checksum_complete(skb)) { 2374 inet_twsk_put(inet_twsk(sk)); 2375 goto csum_error; 2376 } 2377 2378 tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn, 2379 &drop_reason); 2380 switch (tw_status) { 2381 case TCP_TW_SYN: { 2382 struct sock *sk2 = inet_lookup_listener(net, skb, __tcp_hdrlen(th), 2383 iph->saddr, th->source, 2384 iph->daddr, th->dest, 2385 inet_iif(skb), 2386 sdif); 2387 if (sk2) { 2388 inet_twsk_deschedule_put(inet_twsk(sk)); 2389 sk = sk2; 2390 tcp_v4_restore_cb(skb); 2391 refcounted = false; 2392 __this_cpu_write(tcp_tw_isn, isn); 2393 goto process; 2394 } 2395 2396 drop_reason = psp_twsk_rx_policy_check(inet_twsk(sk), skb); 2397 if (drop_reason) 2398 break; 2399 } 2400 /* to ACK */ 2401 fallthrough; 2402 case TCP_TW_ACK: 2403 case TCP_TW_ACK_OOW: 2404 tcp_v4_timewait_ack(sk, skb, tw_status); 2405 break; 2406 case TCP_TW_RST: 2407 tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET); 2408 inet_twsk_deschedule_put(inet_twsk(sk)); 2409 goto discard_it; 2410 case TCP_TW_SUCCESS:; 2411 } 2412 goto discard_it; 2413 } 2414 2415 static struct timewait_sock_ops tcp_timewait_sock_ops = { 2416 .twsk_obj_size = sizeof(struct tcp_timewait_sock), 2417 }; 2418 2419 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) 2420 { 2421 struct dst_entry *dst = skb_dst(skb); 2422 2423 if (dst && dst_hold_safe(dst)) { 2424 rcu_assign_pointer(sk->sk_rx_dst, dst); 2425 sk->sk_rx_dst_ifindex = skb->skb_iif; 2426 } 2427 } 2428 EXPORT_IPV6_MOD(inet_sk_rx_dst_set); 2429 2430 const struct 
inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.mtu_reduced	   = tcp_v4_mtu_reduced,
};
EXPORT_IPV6_MOD(ipv4_specific);

#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
/* MD5/AO signing ops for IPv4 TCP sockets; entries are compiled in only
 * for the options actually enabled in the kernel config.
 */
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
#ifdef CONFIG_TCP_MD5SIG
	.md5_lookup	= tcp_v4_md5_lookup,
	.calc_md5_hash	= tcp_v4_md5_hash_skb,
	.md5_parse	= tcp_v4_parse_md5_keys,
#endif
#ifdef CONFIG_TCP_AO
	.ao_lookup	= tcp_v4_ao_lookup,
	.calc_ao_hash	= tcp_v4_ao_hash_skb,
	.ao_parse	= tcp_v4_parse_ao,
	.ao_calc_key_sk	= tcp_v4_ao_calc_key_sk,
#endif
};

/* Socket destructor used when MD5/AO are compiled in: release key material
 * before the generic inet destructor runs.
 */
static void tcp4_destruct_sock(struct sock *sk)
{
	tcp_md5_destruct_sock(sk);
	tcp_ao_destroy_sock(sk, false);
	inet_sock_destruct(sk);
}
#endif

/* NOTE: A lot of things set to zero explicitly by call to
 *	 sk_alloc() so need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	tcp_init_sock(sk);

	icsk->icsk_af_ops = &ipv4_specific;

#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
	sk->sk_destruct = tcp4_destruct_sock;
#endif

	return 0;
}

/* Drop the page-pool references held in sk->sk_user_frags (devmem RX).
 * No-op unless CONFIG_PAGE_POOL is enabled.
 */
static void tcp_release_user_frags(struct sock *sk)
{
#ifdef CONFIG_PAGE_POOL
	unsigned long index;
	void *netmem;

	xa_for_each(&sk->sk_user_frags, index, netmem)
		WARN_ON_ONCE(!napi_pp_put_page((__force netmem_ref)netmem));
#endif
}

/* Release all TCP-specific state attached to @sk: timers, congestion
 * control, ULP, queues, bind bucket and fastopen resources.
 */
void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tcp_release_user_frags(sk);

	xa_destroy(&sk->sk_user_frags);

	trace_tcp_destroy_sock(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	tcp_cleanup_ulp(sk);

	/* Cleanup up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Check if we want to disable active TFO */
	tcp_fastopen_active_disable_ofo_check(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	skb_rbtree_purge(&tp->out_of_order_queue);

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));

	/* If socket is aborted during connect operation */
	tcp_free_fastopen_req(tp);
	tcp_fastopen_destroy_cipher(sk);
	tcp_saved_syn_free(tp);

	sk_sockets_allocated_dec(sk);
}
EXPORT_IPV6_MOD(tcp_v4_destroy_sock);

#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping.
 */

static unsigned short seq_file_family(const struct seq_file *seq);

/* Return true if @sk belongs to the family and netns this seq_file dumps. */
static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
{
	unsigned short family = seq_file_family(seq);

	/* AF_UNSPEC is used as a match all */
	return ((family == AF_UNSPEC || family == sk->sk_family) &&
		net_eq(sock_net(sk), seq_file_net(seq)));
}

/* Find a non empty bucket (starting from st->bucket)
 * and return the first sk from it.
 *
 * On a non-NULL return the bucket's lock is held; the caller (or the
 * matching *_get_next()/stop()) is responsible for unlocking it.
 */
static void *listening_get_first(struct seq_file *seq)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct tcp_iter_state *st = seq->private;

	st->offset = 0;
	for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
		struct inet_listen_hashbucket *ilb2;
		struct hlist_nulls_node *node;
		struct sock *sk;

		ilb2 = &hinfo->lhash2[st->bucket];
		/* Lockless peek: skip empty buckets without taking the lock */
		if (hlist_nulls_empty(&ilb2->nulls_head))
			continue;

		spin_lock(&ilb2->lock);
		sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
			if (seq_sk_match(seq, sk))
				return sk;	/* lock intentionally kept held */
		}
		spin_unlock(&ilb2->lock);
	}

	return NULL;
}

/* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
 * If "cur" is the last one in the st->bucket,
 * call listening_get_first() to return the first sk of the next
 * non empty bucket.
 */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_iter_state *st = seq->private;
	struct inet_listen_hashbucket *ilb2;
	struct hlist_nulls_node *node;
	struct inet_hashinfo *hinfo;
	struct sock *sk = cur;

	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);
	sk_nulls_for_each_from(sk, node) {
		if (seq_sk_match(seq, sk))
			return sk;
	}

	/* Bucket exhausted: release its lock before moving on */
	hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	ilb2 = &hinfo->lhash2[st->bucket];
	spin_unlock(&ilb2->lock);
	++st->bucket;
	return listening_get_first(seq);
}

/* Return the listening socket at position *pos, decrementing *pos by the
 * number of sockets skipped.
 */
static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	st->offset = 0;
	rc = listening_get_first(seq);

	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
		--*pos;
	}
	return rc;
}

/* Lockless check whether the current ehash bucket has any entries. */
static inline bool empty_bucket(struct inet_hashinfo *hinfo,
				const struct tcp_iter_state *st)
{
	return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
}

/*
 * Get first established socket starting from bucket given in st->bucket.
 * If st->bucket is zero, the very first socket in the hash is returned.
 *
 * On a non-NULL return the bucket's spinlock is held (BH disabled).
 */
static void *established_get_first(struct seq_file *seq)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct tcp_iter_state *st = seq->private;

	st->offset = 0;
	for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
		struct sock *sk;
		struct hlist_nulls_node *node;
		spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);

		cond_resched();

		/* Lockless fast path for the common case of empty buckets */
		if (empty_bucket(hinfo, st))
			continue;

		spin_lock_bh(lock);
		sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
			if (seq_sk_match(seq, sk))
				return sk;	/* lock intentionally kept held */
		}
		spin_unlock_bh(lock);
	}

	return NULL;
}

/* Advance to the next matching established socket, releasing the current
 * bucket lock and moving to the next non-empty bucket when needed.
 */
static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct tcp_iter_state *st = seq->private;
	struct hlist_nulls_node *node;
	struct sock *sk = cur;

	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);

	sk_nulls_for_each_from(sk, node) {
		if (seq_sk_match(seq, sk))
			return sk;
	}

	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
	++st->bucket;
	return established_get_first(seq);
}

/* Return the established socket at position @pos (counting from the start
 * of the ehash walk).
 */
static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}

/* Return the socket at overall position @pos: listening sockets first,
 * then established ones.
 */
static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state *st = seq->private;

	st->state = TCP_SEQ_STATE_LISTENING;
	rc = listening_get_idx(seq, &pos);

	if (!rc) {
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc =
established_get_idx(seq, pos);
	}

	return rc;
}

/* Re-find the position recorded by the previous stop() without rescanning
 * the whole table: jump straight to st->bucket and replay st->offset.
 * st->num is restored afterwards because *_get_next() incremented it.
 */
static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct tcp_iter_state *st = seq->private;
	int bucket = st->bucket;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket > hinfo->lhash2_mask)
			break;
		rc = listening_get_first(seq);
		while (offset-- && rc && bucket == st->bucket)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		fallthrough;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (st->bucket > hinfo->ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc && bucket == st->bucket)
			rc = established_get_next(seq, rc);
	}

	st->num = orig_num;

	return rc;
}

/* seq_file ->start(): resume from the last position when possible,
 * otherwise walk from the beginning to *pos.
 */
void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	if (*pos && *pos == st->last_pos) {
		rc = tcp_seek_last_pos(seq);
		if (rc)
			goto out;
	}

	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	st->bucket = 0;
	st->offset = 0;
	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;

out:
	st->last_pos = *pos;
	return rc;
}
EXPORT_IPV6_MOD(tcp_seq_start);

/* seq_file ->next(): advance within the current phase, switching from the
 * listening walk to the established walk when the former is exhausted.
 */
void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			rc	  = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	st->last_pos = *pos;
	return rc;
}
EXPORT_IPV6_MOD(tcp_seq_next);

/* seq_file ->stop(): drop whichever bucket lock the *_get_first/next
 * helpers left held for the current phase.
 */
void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			spin_unlock(&hinfo->lhash2[st->bucket].lock);
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
		break;
	}
}
EXPORT_IPV6_MOD(tcp_seq_stop);

/* Print one /proc/net/tcp line for a SYN_RECV request socket. */
static void get_openreq4(const struct request_sock *req,
			 struct seq_file *f, int i)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	long delta = req->rsk_timer.expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
		i,
		ireq->ir_loc_addr,
		ireq->ir_num,
		ireq->ir_rmt_addr,
		ntohs(ireq->ir_rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent.
	       */
		1,    /* timers active (only the expire timer) */
		jiffies_delta_to_clock_t(delta),
		req->num_timeout,
		from_kuid_munged(seq_user_ns(f),
				 sk_uid(req->rsk_listener)),
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		0,
		req);
}

/* Print one /proc/net/tcp line for a full socket.  Reads are done with
 * READ_ONCE()/smp_load_acquire() because the socket is not locked here.
 */
static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
{
	int timer_active;
	unsigned long timer_expires;
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct inet_sock *inet = inet_sk(sk);
	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	u8 icsk_pending;
	int rx_queue;
	int state;

	/* Map the pending icsk timer onto the legacy "tr" column codes */
	icsk_pending = smp_load_acquire(&icsk->icsk_pending);
	if (icsk_pending == ICSK_TIME_RETRANS ||
	    icsk_pending == ICSK_TIME_REO_TIMEOUT ||
	    icsk_pending == ICSK_TIME_LOSS_PROBE) {
		timer_active	= 1;
		timer_expires	= tcp_timeout_expires(sk);
	} else if (icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= tcp_timeout_expires(sk);
	} else if (timer_pending(&icsk->icsk_keepalive_timer)) {
		timer_active	= 2;
		timer_expires	= icsk->icsk_keepalive_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires = jiffies;
	}

	state = inet_sk_state_load(sk);
	if (state == TCP_LISTEN)
		rx_queue = READ_ONCE(sk->sk_ack_backlog);
	else
		/* Because we don't lock the socket,
		 * we might find a transient negative value.
		 */
		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
				      READ_ONCE(tp->copied_seq), 0);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
		i, src, srcp, dest, destp, state,
		READ_ONCE(tp->write_seq) - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_delta_to_clock_t(timer_expires - jiffies),
		READ_ONCE(icsk->icsk_retransmits),
		from_kuid_munged(seq_user_ns(f), sk_uid(sk)),
		READ_ONCE(icsk->icsk_probes_out),
		sock_i_ino(sk),
		refcount_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
		tcp_snd_cwnd(tp),
		state == TCP_LISTEN ?
		    fastopenq->max_qlen :
		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}

/* Print one /proc/net/tcp line for a TIME_WAIT socket. */
static void get_timewait4_sock(const struct inet_timewait_sock *tw,
			       struct seq_file *f, int i)
{
	long delta = tw->tw_timer.expires - jiffies;
	__be32 dest, src;
	__u16 destp, srcp;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
		i, src, srcp, dest, destp, READ_ONCE(tw->tw_substate), 0, 0,
		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
		refcount_read(&tw->tw_refcnt), tw);
}

#define TMPSZ 150

/* seq_file ->show(): dispatch on socket state to the proper formatter. */
static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	struct sock *sk = v;

	seq_setwidth(seq, TMPSZ - 1);
	if (v == SEQ_START_TOKEN) {
		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	if (sk->sk_state == TCP_TIME_WAIT)
		get_timewait4_sock(v, seq, st->num);
	else if (sk->sk_state == TCP_NEW_SYN_RECV)
		get_openreq4(v, seq, st->num);
	else
		get_tcp4_sock(v, seq, st->num);
out:
	seq_pad(seq, '\n');
	return 0;
}

#ifdef CONFIG_BPF_SYSCALL
/* A batch slot holds either a held socket pointer (while iterating) or its
 * cookie (after stop(), so iteration can resume after the bucket changed).
 */
union bpf_tcp_iter_batch_item {
	struct sock *sk;
	__u64 cookie;
};

/* Per-iterator state: the generic tcp_iter_state plus the current batch of
 * sockets grabbed from one hash bucket.
 */
struct bpf_tcp_iter_state {
	struct tcp_iter_state state;
	unsigned int cur_sk;
	unsigned int end_sk;
	unsigned int max_sk;
	union bpf_tcp_iter_batch_item *batch;
};

struct bpf_iter__tcp {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct sock_common *, sk_common);
	uid_t uid __aligned(8);
};

/* Invoke the BPF program once for @sk_common. */
static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
			     struct sock_common *sk_common, uid_t uid)
{
	struct bpf_iter__tcp ctx;

	meta->seq_num--;  /* skip SEQ_START_TOKEN */
	ctx.meta = meta;
	ctx.sk_common = sk_common;
	ctx.uid = uid;
	return bpf_iter_run_prog(prog, &ctx);
}

static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
{
	union bpf_tcp_iter_batch_item *item;
	unsigned int cur_sk = iter->cur_sk;
	__u64 cookie;

	/* Remember the cookies of the sockets we haven't seen yet, so we can
	 * pick up where we left off next time around.
	 */
	while (cur_sk < iter->end_sk) {
		item = &iter->batch[cur_sk++];
		cookie = sock_gen_cookie(item->sk);
		sock_gen_put(item->sk);
		item->cookie = cookie;
	}
}

/* Grow the batch array to @new_batch_sz slots, preserving existing entries. */
static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
				      unsigned int new_batch_sz, gfp_t flags)
{
	union bpf_tcp_iter_batch_item *new_batch;

	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
			     flags | __GFP_NOWARN);
	if (!new_batch)
		return -ENOMEM;

	memcpy(new_batch, iter->batch, sizeof(*iter->batch) * iter->end_sk);
	kvfree(iter->batch);
	iter->batch = new_batch;
	iter->max_sk = new_batch_sz;

	return 0;
}

/* Starting from @first_sk, find the first socket in the bucket whose cookie
 * matches one of the remembered @cookies (i.e. the resume point).
 */
static struct sock *bpf_iter_tcp_resume_bucket(struct sock *first_sk,
					       union bpf_tcp_iter_batch_item *cookies,
					       int n_cookies)
{
	struct hlist_nulls_node *node;
	struct sock *sk;
	int i;

	for (i = 0; i < n_cookies; i++) {
		sk = first_sk;
		sk_nulls_for_each_from(sk, node)
			if (cookies[i].cookie == atomic64_read(&sk->sk_cookie))
				return sk;
	}

	return NULL;
}

/* Resume the listening-hash walk after a stop(): re-find the remembered
 * bucket/cookie, or fall forward to the next bucket if those sockets are
 * gone.  On a non-NULL return the bucket lock is held.
 */
static struct sock *bpf_iter_tcp_resume_listening(struct seq_file *seq)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	unsigned int find_cookie = iter->cur_sk;
	unsigned int end_cookie = iter->end_sk;
	int resume_bucket = st->bucket;
	struct sock *sk;

	/* Previous batch fully consumed: start at the next bucket */
	if (end_cookie && find_cookie == end_cookie)
		++st->bucket;

	sk = listening_get_first(seq);
	iter->cur_sk = 0;
	iter->end_sk = 0;

	if (sk && st->bucket == resume_bucket && end_cookie) {
		sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie],
						end_cookie - find_cookie);
		if (!sk) {
			spin_unlock(&hinfo->lhash2[st->bucket].lock);
			++st->bucket;
			sk = listening_get_first(seq);
		}
	}

	return sk;
}

/* Same as bpf_iter_tcp_resume_listening() but for the established hash;
 * its bucket locks are taken with BH disabled.
 */
static struct sock *bpf_iter_tcp_resume_established(struct seq_file *seq)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	unsigned int find_cookie = iter->cur_sk;
	unsigned int end_cookie = iter->end_sk;
	int resume_bucket = st->bucket;
	struct sock *sk;

	if (end_cookie && find_cookie == end_cookie)
		++st->bucket;

	sk = established_get_first(seq);
	iter->cur_sk = 0;
	iter->end_sk = 0;

	if (sk && st->bucket == resume_bucket && end_cookie) {
		sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie],
						end_cookie - find_cookie);
		if (!sk) {
			spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
			++st->bucket;
			sk = established_get_first(seq);
		}
	}

	return sk;
}

/* Resume whichever phase (listening, then established) the walk was in. */
static struct sock *bpf_iter_tcp_resume(struct seq_file *seq)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	struct sock *sk = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		sk = bpf_iter_tcp_resume_listening(seq);
		if (sk)
			break;
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		fallthrough;
	case TCP_SEQ_STATE_ESTABLISHED:
		sk = bpf_iter_tcp_resume_established(seq);
		break;
	}

	return sk;
}

/* Fill the batch with held refs on all matching sockets in the current
 * listening bucket.  Returns the number of matches found; if that exceeds
 * iter->max_sk, *start_sk is set to the first socket that did not fit.
 */
static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
						 struct sock **start_sk)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct hlist_nulls_node *node;
	unsigned int expected = 1;
	struct sock *sk;

	sock_hold(*start_sk);
	iter->batch[iter->end_sk++].sk = *start_sk;

	sk = sk_nulls_next(*start_sk);
	*start_sk = NULL;
	sk_nulls_for_each_from(sk, node) {
		if (seq_sk_match(seq, sk)) {
			if (iter->end_sk < iter->max_sk) {
				sock_hold(sk);
				iter->batch[iter->end_sk++].sk = sk;
			} else if (!*start_sk) {
				/* Remember where we left off. */
				*start_sk = sk;
			}
			expected++;
		}
	}

	return expected;
}

/* Established-hash counterpart of bpf_iter_tcp_listening_batch(). */
static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
						   struct sock **start_sk)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct hlist_nulls_node *node;
	unsigned int expected = 1;
	struct sock *sk;

	sock_hold(*start_sk);
	iter->batch[iter->end_sk++].sk = *start_sk;

	sk = sk_nulls_next(*start_sk);
	*start_sk = NULL;
	sk_nulls_for_each_from(sk, node) {
		if (seq_sk_match(seq, sk)) {
			if (iter->end_sk < iter->max_sk) {
				sock_hold(sk);
				iter->batch[iter->end_sk++].sk = sk;
			} else if (!*start_sk) {
				/* Remember where we left off. */
				*start_sk = sk;
			}
			expected++;
		}
	}

	return expected;
}

/* Dispatch to the batch filler for the current phase. */
static unsigned int bpf_iter_fill_batch(struct seq_file *seq,
					struct sock **start_sk)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;

	if (st->state == TCP_SEQ_STATE_LISTENING)
		return bpf_iter_tcp_listening_batch(seq, start_sk);
	else
		return bpf_iter_tcp_established_batch(seq, start_sk);
}

/* Release the bucket lock left held by the resume/get_first helpers. */
static void bpf_iter_tcp_unlock_bucket(struct seq_file *seq)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;

	if (st->state == TCP_SEQ_STATE_LISTENING)
		spin_unlock(&hinfo->lhash2[st->bucket].lock);
	else
		spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
}

/* Grab the next bucket's worth of sockets into iter->batch, growing the
 * batch (twice at most, the second time with the bucket lock held so the
 * bucket cannot change size under us) until the whole bucket fits.
 * Returns the first socket of the batch, NULL when done, or ERR_PTR().
 */
static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	unsigned int expected;
	struct sock *sk;
	int err;

	sk = bpf_iter_tcp_resume(seq);
	if (!sk)
		return NULL; /* Done */

	expected = bpf_iter_fill_batch(seq, &sk);
	if (likely(iter->end_sk == expected))
		goto done;

	/* Batch size was too small. */
	bpf_iter_tcp_unlock_bucket(seq);
	bpf_iter_tcp_put_batch(iter);
	err = bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2,
					 GFP_USER);
	if (err)
		return ERR_PTR(err);

	sk = bpf_iter_tcp_resume(seq);
	if (!sk)
		return NULL; /* Done */

	expected = bpf_iter_fill_batch(seq, &sk);
	if (likely(iter->end_sk == expected))
		goto done;

	/* Batch size was still too small. Hold onto the lock while we try
	 * again with a larger batch to make sure the current bucket's size
	 * does not change in the meantime.
	 */
	err = bpf_iter_tcp_realloc_batch(iter, expected, GFP_NOWAIT);
	if (err) {
		bpf_iter_tcp_unlock_bucket(seq);
		return ERR_PTR(err);
	}

	expected = bpf_iter_fill_batch(seq, &sk);
	WARN_ON_ONCE(iter->end_sk != expected);
done:
	bpf_iter_tcp_unlock_bucket(seq);
	return iter->batch[0].sk;
}

static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	/* bpf iter does not support lseek, so it always
	 * continue from where it was stop()-ped.
	 */
	if (*pos)
		return bpf_iter_tcp_batch(seq);

	return SEQ_START_TOKEN;
}

static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	struct sock *sk;

	/* Whenever seq_next() is called, the iter->cur_sk is
	 * done with seq_show(), so advance to the next sk in
	 * the batch.
	 */
	if (iter->cur_sk < iter->end_sk) {
		/* Keeping st->num consistent in tcp_iter_state.
		 * bpf_iter_tcp does not use st->num.
		 * meta.seq_num is used instead.
		 */
		st->num++;
		sock_gen_put(iter->batch[iter->cur_sk++].sk);
	}

	if (iter->cur_sk < iter->end_sk)
		sk = iter->batch[iter->cur_sk].sk;
	else
		sk = bpf_iter_tcp_batch(seq);

	++*pos;
	/* Keeping st->last_pos consistent in tcp_iter_state.
	 * bpf iter does not do lseek, so st->last_pos always equals to *pos.
	 */
	st->last_pos = *pos;
	return sk;
}

/* seq_file ->show() for the BPF iterator: lock full sockets, resolve the
 * uid according to the socket flavor and run the attached BPF program.
 */
static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
{
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;
	struct sock *sk = v;
	uid_t uid;
	int ret;

	if (v == SEQ_START_TOKEN)
		return 0;

	if (sk_fullsock(sk))
		lock_sock(sk);

	if (unlikely(sk_unhashed(sk))) {
		ret = SEQ_SKIP;
		goto unlock;
	}

	if (sk->sk_state == TCP_TIME_WAIT) {
		uid = 0;
	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
		const struct request_sock *req = v;

		uid = from_kuid_munged(seq_user_ns(seq),
				       sk_uid(req->rsk_listener));
	} else {
		uid = from_kuid_munged(seq_user_ns(seq), sk_uid(sk));
	}

	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, false);
	ret = tcp_prog_seq_show(prog, &meta, v, uid);

unlock:
	if (sk_fullsock(sk))
		release_sock(sk);
	return ret;

}

/* seq_file ->stop() for the BPF iterator: run the program once more for the
 * end-of-iteration signal and convert unshown sockets to resume cookies.
 */
static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;

	if (!v) {
		meta.seq = seq;
		prog = bpf_iter_get_info(&meta, true);
		if (prog)
			(void)tcp_prog_seq_show(prog, &meta, v, 0);
	}

	if (iter->cur_sk < iter->end_sk)
		bpf_iter_tcp_put_batch(iter);
}

static const struct seq_operations bpf_iter_tcp_seq_ops = {
	.show		= bpf_iter_tcp_seq_show,
	.start		= bpf_iter_tcp_seq_start,
	.next		= bpf_iter_tcp_seq_next,
	.stop		= bpf_iter_tcp_seq_stop,
};
#endif
/* Return the address family this seq_file should dump: the afinfo family
 * for procfs readers, AF_UNSPEC for the BPF iterator (the program filters).
 */
static unsigned short seq_file_family(const struct seq_file *seq)
{
	const struct tcp_seq_afinfo *afinfo;

#ifdef CONFIG_BPF_SYSCALL
	/* Iterated from bpf_iter.  Let the bpf prog to filter instead. */
	if (seq->op == &bpf_iter_tcp_seq_ops)
		return AF_UNSPEC;
#endif

	/* Iterated from proc fs */
	afinfo = pde_data(file_inode(seq->file));
	return afinfo->family;
}

static const struct seq_operations tcp4_seq_ops = {
	.show		= tcp4_seq_show,
	.start		= tcp_seq_start,
	.next		= tcp_seq_next,
	.stop		= tcp_seq_stop,
};

static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.family		= AF_INET,
};

/* Create /proc/net/tcp for @net. */
static int __net_init tcp4_proc_init_net(struct net *net)
{
	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
		return -ENOMEM;
	return 0;
}

static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	remove_proc_entry("tcp", net->proc_net);
}

static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};

int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}

void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
#endif /* CONFIG_PROC_FS */

struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.pre_connect		= tcp_v4_pre_connect,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
3432 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt, 3433 .keepalive = tcp_set_keepalive, 3434 .recvmsg = tcp_recvmsg, 3435 .sendmsg = tcp_sendmsg, 3436 .splice_eof = tcp_splice_eof, 3437 .backlog_rcv = tcp_v4_do_rcv, 3438 .release_cb = tcp_release_cb, 3439 .hash = inet_hash, 3440 .unhash = inet_unhash, 3441 .get_port = inet_csk_get_port, 3442 .put_port = inet_put_port, 3443 #ifdef CONFIG_BPF_SYSCALL 3444 .psock_update_sk_prot = tcp_bpf_update_proto, 3445 #endif 3446 .enter_memory_pressure = tcp_enter_memory_pressure, 3447 .leave_memory_pressure = tcp_leave_memory_pressure, 3448 .stream_memory_free = tcp_stream_memory_free, 3449 .sockets_allocated = &tcp_sockets_allocated, 3450 3451 .memory_allocated = &net_aligned_data.tcp_memory_allocated, 3452 .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, 3453 3454 .memory_pressure = &tcp_memory_pressure, 3455 .sysctl_mem = sysctl_tcp_mem, 3456 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), 3457 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), 3458 .max_header = MAX_TCP_HEADER, 3459 .obj_size = sizeof(struct tcp_sock), 3460 .freeptr_offset = offsetof(struct tcp_sock, 3461 inet_conn.icsk_inet.sk.sk_freeptr), 3462 .slab_flags = SLAB_TYPESAFE_BY_RCU, 3463 .twsk_prot = &tcp_timewait_sock_ops, 3464 .rsk_prot = &tcp_request_sock_ops, 3465 .h.hashinfo = NULL, 3466 .no_autobind = true, 3467 .diag_destroy = tcp_abort, 3468 }; 3469 EXPORT_SYMBOL(tcp_prot); 3470 3471 static void __net_exit tcp_sk_exit(struct net *net) 3472 { 3473 if (net->ipv4.tcp_congestion_control) 3474 bpf_module_put(net->ipv4.tcp_congestion_control, 3475 net->ipv4.tcp_congestion_control->owner); 3476 } 3477 3478 static void __net_init tcp_set_hashinfo(struct net *net) 3479 { 3480 struct inet_hashinfo *hinfo; 3481 unsigned int ehash_entries; 3482 struct net *old_net; 3483 3484 if (net_eq(net, &init_net)) 3485 goto fallback; 3486 3487 old_net = current->nsproxy->net_ns; 3488 ehash_entries = 
READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries); 3489 if (!ehash_entries) 3490 goto fallback; 3491 3492 ehash_entries = roundup_pow_of_two(ehash_entries); 3493 hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries); 3494 if (!hinfo) { 3495 pr_warn("Failed to allocate TCP ehash (entries: %u) " 3496 "for a netns, fallback to the global one\n", 3497 ehash_entries); 3498 fallback: 3499 hinfo = &tcp_hashinfo; 3500 ehash_entries = tcp_hashinfo.ehash_mask + 1; 3501 } 3502 3503 net->ipv4.tcp_death_row.hashinfo = hinfo; 3504 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2; 3505 net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128); 3506 } 3507 3508 static int __net_init tcp_sk_init(struct net *net) 3509 { 3510 net->ipv4.sysctl_tcp_ecn = TCP_ECN_IN_ECN_OUT_NOECN; 3511 net->ipv4.sysctl_tcp_ecn_option = TCP_ACCECN_OPTION_FULL; 3512 net->ipv4.sysctl_tcp_ecn_option_beacon = TCP_ACCECN_OPTION_BEACON; 3513 net->ipv4.sysctl_tcp_ecn_fallback = 1; 3514 3515 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; 3516 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS; 3517 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; 3518 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL; 3519 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS; 3520 3521 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME; 3522 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES; 3523 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL; 3524 3525 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES; 3526 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES; 3527 net->ipv4.sysctl_tcp_syncookies = 1; 3528 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH; 3529 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1; 3530 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2; 3531 net->ipv4.sysctl_tcp_orphan_retries = 0; 3532 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; 3533 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; 3534 
net->ipv4.sysctl_tcp_tw_reuse = 2; 3535 net->ipv4.sysctl_tcp_tw_reuse_delay = 1 * MSEC_PER_SEC; 3536 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1; 3537 3538 refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1); 3539 tcp_set_hashinfo(net); 3540 3541 net->ipv4.sysctl_tcp_sack = 1; 3542 net->ipv4.sysctl_tcp_window_scaling = 1; 3543 net->ipv4.sysctl_tcp_timestamps = 1; 3544 net->ipv4.sysctl_tcp_early_retrans = 3; 3545 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION; 3546 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */ 3547 net->ipv4.sysctl_tcp_retrans_collapse = 1; 3548 net->ipv4.sysctl_tcp_max_reordering = 300; 3549 net->ipv4.sysctl_tcp_dsack = 1; 3550 net->ipv4.sysctl_tcp_app_win = 31; 3551 net->ipv4.sysctl_tcp_adv_win_scale = 1; 3552 net->ipv4.sysctl_tcp_frto = 2; 3553 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1; 3554 net->ipv4.sysctl_tcp_rcvbuf_low_rtt = USEC_PER_MSEC; 3555 /* This limits the percentage of the congestion window which we 3556 * will allow a single TSO frame to consume. Building TSO frames 3557 * which are too large can cause TCP streams to be bursty. 3558 */ 3559 net->ipv4.sysctl_tcp_tso_win_divisor = 3; 3560 /* Default TSQ limit of 4 MB */ 3561 net->ipv4.sysctl_tcp_limit_output_bytes = 4 << 20; 3562 3563 /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. 
*/ 3564 net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX; 3565 3566 net->ipv4.sysctl_tcp_min_tso_segs = 2; 3567 net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */ 3568 net->ipv4.sysctl_tcp_min_rtt_wlen = 300; 3569 net->ipv4.sysctl_tcp_autocorking = 1; 3570 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; 3571 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; 3572 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120; 3573 if (net != &init_net) { 3574 memcpy(net->ipv4.sysctl_tcp_rmem, 3575 init_net.ipv4.sysctl_tcp_rmem, 3576 sizeof(init_net.ipv4.sysctl_tcp_rmem)); 3577 memcpy(net->ipv4.sysctl_tcp_wmem, 3578 init_net.ipv4.sysctl_tcp_wmem, 3579 sizeof(init_net.ipv4.sysctl_tcp_wmem)); 3580 } 3581 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC; 3582 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 10 * NSEC_PER_USEC; 3583 net->ipv4.sysctl_tcp_comp_sack_nr = 44; 3584 net->ipv4.sysctl_tcp_comp_sack_rtt_percent = 33; 3585 net->ipv4.sysctl_tcp_backlog_ack_defer = 1; 3586 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; 3587 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0; 3588 atomic_set(&net->ipv4.tfo_active_disable_times, 0); 3589 3590 /* Set default values for PLB */ 3591 net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */ 3592 net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3; 3593 net->ipv4.sysctl_tcp_plb_rehash_rounds = 12; 3594 net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60; 3595 /* Default congestion threshold for PLB to mark a round is 50% */ 3596 net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2; 3597 3598 /* Reno is always built in */ 3599 if (!net_eq(net, &init_net) && 3600 bpf_try_module_get(init_net.ipv4.tcp_congestion_control, 3601 init_net.ipv4.tcp_congestion_control->owner)) 3602 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control; 3603 else 3604 net->ipv4.tcp_congestion_control = &tcp_reno; 3605 3606 net->ipv4.sysctl_tcp_syn_linear_timeouts = 4; 3607 net->ipv4.sysctl_tcp_shrink_window = 0; 3608 3609 
net->ipv4.sysctl_tcp_pingpong_thresh = 1; 3610 net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN); 3611 net->ipv4.sysctl_tcp_rto_max_ms = TCP_RTO_MAX_SEC * MSEC_PER_SEC; 3612 3613 return 0; 3614 } 3615 3616 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) 3617 { 3618 struct net *net; 3619 3620 /* make sure concurrent calls to tcp_sk_exit_batch from net_cleanup_work 3621 * and failed setup_net error unwinding path are serialized. 3622 * 3623 * tcp_twsk_purge() handles twsk in any dead netns, not just those in 3624 * net_exit_list, the thread that dismantles a particular twsk must 3625 * do so without other thread progressing to refcount_dec_and_test() of 3626 * tcp_death_row.tw_refcount. 3627 */ 3628 mutex_lock(&tcp_exit_batch_mutex); 3629 3630 tcp_twsk_purge(net_exit_list); 3631 3632 list_for_each_entry(net, net_exit_list, exit_list) { 3633 inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo); 3634 WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount)); 3635 tcp_fastopen_ctx_destroy(net); 3636 } 3637 3638 mutex_unlock(&tcp_exit_batch_mutex); 3639 } 3640 3641 static struct pernet_operations __net_initdata tcp_sk_ops = { 3642 .init = tcp_sk_init, 3643 .exit = tcp_sk_exit, 3644 .exit_batch = tcp_sk_exit_batch, 3645 }; 3646 3647 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3648 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta, 3649 struct sock_common *sk_common, uid_t uid) 3650 3651 #define INIT_BATCH_SZ 16 3652 3653 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux) 3654 { 3655 struct bpf_tcp_iter_state *iter = priv_data; 3656 int err; 3657 3658 err = bpf_iter_init_seq_net(priv_data, aux); 3659 if (err) 3660 return err; 3661 3662 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ, GFP_USER); 3663 if (err) { 3664 bpf_iter_fini_seq_net(priv_data); 3665 return err; 3666 } 3667 3668 return 0; 3669 } 3670 3671 static void bpf_iter_fini_tcp(void *priv_data) 3672 { 
3673 struct bpf_tcp_iter_state *iter = priv_data; 3674 3675 bpf_iter_fini_seq_net(priv_data); 3676 kvfree(iter->batch); 3677 } 3678 3679 static const struct bpf_iter_seq_info tcp_seq_info = { 3680 .seq_ops = &bpf_iter_tcp_seq_ops, 3681 .init_seq_private = bpf_iter_init_tcp, 3682 .fini_seq_private = bpf_iter_fini_tcp, 3683 .seq_priv_size = sizeof(struct bpf_tcp_iter_state), 3684 }; 3685 3686 static const struct bpf_func_proto * 3687 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id, 3688 const struct bpf_prog *prog) 3689 { 3690 switch (func_id) { 3691 case BPF_FUNC_setsockopt: 3692 return &bpf_sk_setsockopt_proto; 3693 case BPF_FUNC_getsockopt: 3694 return &bpf_sk_getsockopt_proto; 3695 default: 3696 return NULL; 3697 } 3698 } 3699 3700 static struct bpf_iter_reg tcp_reg_info = { 3701 .target = "tcp", 3702 .ctx_arg_info_size = 1, 3703 .ctx_arg_info = { 3704 { offsetof(struct bpf_iter__tcp, sk_common), 3705 PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED }, 3706 }, 3707 .get_func_proto = bpf_iter_tcp_get_func_proto, 3708 .seq_info = &tcp_seq_info, 3709 }; 3710 3711 static void __init bpf_iter_register(void) 3712 { 3713 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON]; 3714 if (bpf_iter_reg_target(&tcp_reg_info)) 3715 pr_warn("Warning: could not register bpf iterator tcp\n"); 3716 } 3717 3718 #endif 3719 3720 void __init tcp_v4_init(void) 3721 { 3722 int cpu, res; 3723 3724 for_each_possible_cpu(cpu) { 3725 struct sock *sk; 3726 3727 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW, 3728 IPPROTO_TCP, &init_net); 3729 if (res) 3730 panic("Failed to create the TCP control socket.\n"); 3731 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); 3732 3733 /* Please enforce IP_DF and IPID==0 for RST and 3734 * ACK sent in SYN-RECV and TIME-WAIT state. 
3735 */ 3736 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO; 3737 3738 sk->sk_clockid = CLOCK_MONOTONIC; 3739 3740 per_cpu(ipv4_tcp_sk.sock, cpu) = sk; 3741 } 3742 if (register_pernet_subsys(&tcp_sk_ops)) 3743 panic("Failed to create the TCP control socket.\n"); 3744 3745 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3746 bpf_iter_register(); 3747 #endif 3748 } 3749