1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * INET An implementation of the TCP/IP protocol suite for the LINUX 4 * operating system. INET is implemented using the BSD Socket 5 * interface as the means of communication with the user level. 6 * 7 * Implementation of the Transmission Control Protocol(TCP). 8 * 9 * IPv4 specific functions 10 * 11 * code split from: 12 * linux/ipv4/tcp.c 13 * linux/ipv4/tcp_input.c 14 * linux/ipv4/tcp_output.c 15 * 16 * See tcp.c for author information 17 */ 18 19 /* 20 * Changes: 21 * David S. Miller : New socket lookup architecture. 22 * This code is dedicated to John Dyson. 23 * David S. Miller : Change semantics of established hash, 24 * half is devoted to TIME_WAIT sockets 25 * and the rest go in the other half. 26 * Andi Kleen : Add support for syncookies and fixed 27 * some bugs: ip options weren't passed to 28 * the TCP layer, missed a check for an 29 * ACK bit. 30 * Andi Kleen : Implemented fast path mtu discovery. 31 * Fixed many serious bugs in the 32 * request_sock handling and moved 33 * most of it into the af independent code. 34 * Added tail drop and some other bugfixes. 35 * Added new listen semantics. 36 * Mike McLagan : Routing by source 37 * Juan Jose Ciarlante: ip_dynaddr bits 38 * Andi Kleen: various fixes. 39 * Vitaly E. Lavrov : Transparent proxy revived after year 40 * coma. 41 * Andi Kleen : Fix new listen. 42 * Andi Kleen : Fix accept error reporting. 43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which 44 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind 45 * a single port at the same time. 
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/fips.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sock_diag.h>

#include <net/aligned_data.h>
#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/tcp_ecn.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/inet_ecn.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>
#include <net/rstreason.h>
#include <net/psp.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/inetdevice.h>
#include <linux/btf_ids.h>
#include <linux/skbuff_ref.h>

#include <crypto/md5.h>

#include <trace/events/tcp.h>

#ifdef CONFIG_TCP_MD5SIG
static void tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
				__be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;

/* Per-CPU control socket used to send RST/ACK replies without a full socket;
 * protected by the embedded local lock (see tcp_v4_send_reset()/_send_ack()).
 */
static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = {
	.bh_lock = INIT_LOCAL_LOCK(bh_lock),
};

static DEFINE_MUTEX(tcp_exit_batch_mutex);

/* Derive the initial sequence number for a passive connection from the
 * addresses/ports of the incoming SYN (secure hash, not guessable).
 */
static u32 tcp_v4_init_seq(const struct sk_buff *skb)
{
	return secure_tcp_seq(ip_hdr(skb)->daddr,
			      ip_hdr(skb)->saddr,
			      tcp_hdr(skb)->dest,
			      tcp_hdr(skb)->source);
}

/* Per-destination timestamp offset, keyed on the address pair. */
static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
{
	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
}

/* Decide whether a TIME-WAIT socket @sktw may be reused by a new connection
 * from @sk with the same 4-tuple.  Returns 1 if reuse is allowed (and the
 * twsk refcount was taken, sequence/timestamp state inherited), 0 otherwise.
 */
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
	const struct inet_timewait_sock *tw = inet_twsk(sktw);
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);
	int ts_recent_stamp;
	u32 reuse_thresh;

	/* FIN-WAIT2 "timewait" sockets are never reusable. */
	if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2)
		reuse = 0;

	if (reuse == 2) {
		/* Still does not detect *everything* that goes through
		 * lo, since we require a loopback src or dst address
		 * or direct binding to 'lo' interface.
		 */
		bool loopback = false;

		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
			loopback = true;
#if IS_ENABLED(CONFIG_IPV6)
		if (tw->tw_family == AF_INET6) {
			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
				loopback = true;
		} else
#endif
		{
			if (ipv4_is_loopback(tw->tw_daddr) ||
			    ipv4_is_loopback(tw->tw_rcv_saddr))
				loopback = true;
		}
		if (!loopback)
			reuse = 0;
	}

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp);
	reuse_thresh = READ_ONCE(tw->tw_entry_stamp) +
		       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse_delay);
	if (ts_recent_stamp &&
	    (!twp || (reuse && time_after32(tcp_clock_ms(), reuse_thresh)))) {
		/* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk
		 * and releasing the bucket lock.
		 */
		if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
			return 0;

		/* In case of repair and re-using TIME-WAIT sockets we still
		 * want to be sure that it is safe as above but honor the
		 * sequence numbers and time stamps set as part of the repair
		 * process.
		 *
		 * Without this check re-using a TIME-WAIT socket with TCP
		 * repair would accumulate a -1 on the repair assigned
		 * sequence number. The first time it is reused the sequence
		 * is -1, the second time -2, etc. This fixes that issue
		 * without appearing to create any others.
		 */
		if (likely(!tp->repair)) {
			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;

			/* 0 is treated as "no repair seq" elsewhere, avoid it */
			if (!seq)
				seq = 1;
			WRITE_ONCE(tp->write_seq, seq);
			tp->rx_opt.ts_recent = READ_ONCE(tcptw->tw_ts_recent);
			tp->rx_opt.ts_recent_stamp = ts_recent_stamp;
		}

		return 1;
	}

	return 0;
}
EXPORT_IPV6_MOD_GPL(tcp_twsk_unique);

static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
			      int addr_len)
{
	/* This check is replicated from tcp_v4_connect() and intended to
	 * prevent BPF program called below from accessing bytes that are out
	 * of the bound specified by user in addr_len.
	 */
	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	sock_owned_by_me(sk);

	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
}

/* This will initiate an outgoing connection.
 */
int tcp_v4_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_timewait_death_row *tcp_death_row;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct ip_options_rcu *inet_opt;
	struct net *net = sock_net(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	if (inet_opt && inet_opt->opt.srr) {
		/* Source routing: route towards the first hop, not daddr. */
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
			      orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

	if (!inet->inet_saddr) {
		/* Source address was chosen by routing; record it in bhash2. */
		err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET);
		if (err) {
			ip_rt_put(rt);
			return err;
		}
	} else {
		sk_rcv_saddr_set(sk, inet->inet_saddr);
	}

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			WRITE_ONCE(tp->write_seq, 0);
	}

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = psp_sk_overhead(sk);
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len += inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and not releasing socket
	 * lock select source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	/* Re-route now that the (possibly autobound) source port is known. */
	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst);
	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	rt = NULL;

	if (likely(!tp->repair)) {
		if (!tp->write_seq)
			WRITE_ONCE(tp->write_seq,
				   secure_tcp_seq(inet->inet_saddr,
						  inet->inet_daddr,
						  inet->inet_sport,
						  usin->sin_port));
		WRITE_ONCE(tp->tsoffset,
			   secure_tcp_ts_off(net, inet->inet_saddr,
					     inet->inet_daddr));
	}

	atomic_set(&inet->inet_id, get_random_u16());

	/* TFO may defer the actual SYN until the first sendmsg(). */
	if (tcp_fastopen_defer_connect(sk, &err))
		return err;
	if (err)
		goto failure;

	err = tcp_connect(sk);

	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	inet_bhash2_reset_saddr(sk);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_IPV6_MOD(tcp_v4_connect);

/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct dst_entry *dst;
	u32 mtu, dmtu;

	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
		return;
	/* mtu_info was stashed by tcp_v4_err(); may race with newer ICMPs. */
	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to be wrong... Remember soft error
	 * for the case, if this connection will not able to recover.
	 */
	dmtu = dst4_mtu(dst);
	if (mtu < dmtu && ip_dont_fragment(sk, dst))
		WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > dmtu) {
		tcp_sync_mss(sk, dmtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_IPV6_MOD(tcp_v4_mtu_reduced);

/* Apply an ICMP redirect to the socket's cached route, if still valid. */
static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}


/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);

	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	 */
	if (seq != tcp_rsk(req)->snt_isn) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
	} else if (abort) {
		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		tcp_listendrop(req->rsk_listener);
	}
	reqsk_put(req);
}
EXPORT_IPV6_MOD(tcp_req_err);

/* TCP-LD (RFC 6069) logic: undo one RTO backoff step when an ICMP
 * unreachable suggests the loss was due to a transient disruption.
 */
void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	s32 remaining;
	u32 delta_us;

	if (sock_owned_by_user(sk))
		return;

	if (seq != tp->snd_una || !icsk->icsk_retransmits ||
	    !icsk->icsk_backoff)
		return;

	skb = tcp_rtx_queue_head(sk);
	if (WARN_ON_ONCE(!skb))
		return;

	icsk->icsk_backoff--;
	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
	icsk->icsk_rto = inet_csk_rto_backoff(icsk, tcp_rto_max(sk));

	tcp_mstamp_refresh(tp);
	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);

	if (remaining > 0) {
		tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, remaining, false);
	} else {
		/* RTO revert clocked out retransmission.
		 * Will retransmit now.
		 */
		tcp_retransmit_timer(sk);
	}
}
EXPORT_IPV6_MOD(tcp_ld_RTO_revert);

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

int tcp_v4_err(struct sk_buff *skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct net *net = dev_net_rcu(skb->dev);
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct request_sock *fastopen;
	struct tcp_sock *tp;
	u32 seq, snd_una;
	struct sock *sk;
	int err;

	sk = __inet_lookup_established(net, iph->daddr, th->dest, iph->saddr,
				       ntohs(th->source), inet_iif(skb), 0);
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return -ENOENT;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		/* To increase the counter of ignored icmps for TCP-AO */
		tcp_ao_ignore_icmp(sk, AF_INET, type, code);
		inet_twsk_put(inet_twsk(sk));
		return 0;
	}
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		/* Only hard unreachable/param-prob/ttl errors abort the
		 * pending request; tcp_req_err() drops the ref for us.
		 */
		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
				     type == ICMP_TIME_EXCEEDED ||
				     (type == ICMP_DEST_UNREACH &&
				      (code == ICMP_NET_UNREACH ||
				       code == ICMP_HOST_UNREACH)));
		return 0;
	}

	if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) {
		sock_put(sk);
		return 0;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (static_branch_unlikely(&ip4_min_ttl)) {
		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
			goto out;
		}
	}

	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = rcu_dereference(tp->fastopen_rsk);
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		if (!sock_owned_by_user(sk))
			do_redirect(skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs send out by Linux are always <576bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			WRITE_ONCE(tp->mtu_info, info);
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				/* Defer to tcp_release_cb(); hold a ref so the
				 * socket survives until the deferred work runs.
				 */
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if this ICMP message allows revert of backoff.
		 * (see RFC 6069)
		 */
		if (!fastopen &&
		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
			tcp_ld_RTO_revert(sk, seq);
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket is
		 * already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);

		if (!sock_owned_by_user(sk))
			tcp_done_with_error(sk, err);
		else
			WRITE_ONCE(sk->sk_err_soft, err);
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even this two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 * --ANK (980905)
	 */

	if (!sock_owned_by_user(sk) &&
	    inet_test_bit(RECVERR, sk)) {
		WRITE_ONCE(sk->sk_err, err);
		sk_error_report(sk);
	} else {	/* Only an error on timeout */
		WRITE_ONCE(sk->sk_err_soft, err);
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
	return 0;
}

/* Prepare a TCP header for checksum offload: store the pseudo-header sum
 * and tell the stack where the final checksum must be written.
 */
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
	skb->csum_start = skb_transport_header(skb) - skb->head;
	skb->csum_offset = offsetof(struct tcphdr, check);
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_IPV6_MOD(tcp_v4_send_check);

#define REPLY_OPTIONS_LEN (MAX_TCP_OPTION_SPACE / sizeof(__be32))

/* Sign an outgoing RST with TCP-AO.  Returns true if the RST must be
 * dropped (no matching key / hashing failed), false if it was signed.
 */
static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb,
				 const struct tcp_ao_hdr *aoh,
				 struct ip_reply_arg *arg, struct tcphdr *reply,
				 __be32 reply_options[REPLY_OPTIONS_LEN])
{
#ifdef CONFIG_TCP_AO
	int sdif = tcp_v4_sdif(skb);
	int dif = inet_iif(skb);
	int l3index = sdif ? dif : 0;
	bool allocated_traffic_key;
	struct tcp_ao_key *key;
	char *traffic_key;
	bool drop = true;
	u32 ao_sne = 0;
	u8 keyid;

	rcu_read_lock();
	if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq),
				 &key, &traffic_key, &allocated_traffic_key,
				 &keyid, &ao_sne))
		goto out;

	reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) |
				 (aoh->rnext_keyid << 8) | keyid);
	arg->iov[0].iov_len += tcp_ao_len_aligned(key);
	reply->doff = arg->iov[0].iov_len / 4;

	if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1],
			    key, traffic_key,
			    (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
			    (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
			    reply, ao_sne))
		goto out;
	drop = false;
out:
	rcu_read_unlock();
	if (allocated_traffic_key)
		kfree(traffic_key);
	return drop;
#else
	return true;
#endif
}

/*
 * This routine will send an RST to the other tcp.
 *
 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 * Answer: if a packet caused RST, it is not for a socket
 *	   existing in our system, if it is matched to a socket,
 *	   it is just duplicate segment or bug in other side's TCP.
 *	   So that we build reply only basing on parameters
 *	   arrived with segment.
 * Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
			      enum sk_rst_reason reason)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[REPLY_OPTIONS_LEN];
	} rep;
	const __u8 *md5_hash_location = NULL;
	const struct tcp_ao_hdr *aoh;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	unsigned char newhash[16];
	struct sock *sk1 = NULL;
#endif
	u64 transmit_time = 0;
	struct sock *ctl_sk;
	struct net *net;
	u32 txhash = 0;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

	net = sk ? sock_net(sk) : skb_dst_dev_net_rcu(skb);

	/* Invalid TCP option size or twice included auth */
	if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh))
		return;

	if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt))
		return;

#ifdef CONFIG_TCP_MD5SIG
	rcu_read_lock();
	if (sk && sk_fullsock(sk)) {
		const union tcp_md5_addr *addr;
		int l3index;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and inet_iif is set to it.
		 */
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	} else if (md5_hash_location) {
		const union tcp_md5_addr *addr;
		int sdif = tcp_v4_sdif(skb);
		int dif = inet_iif(skb);
		int l3index;

		/*
		 * active side is lost. Try to find listening socket through
		 * source port, and then find md5 key through listening socket.
		 * we are not loose security here:
		 * Incoming packet is checked with md5 hash with finding key,
		 * no RST generated if md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(net, NULL, 0, ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), dif, sdif);
		/* don't send rst if it can't find key */
		if (!sk1)
			goto out;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and dif is set to it.
		 */
		l3index = sdif ? dif : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
		if (!key)
			goto out;


		tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (memcmp(md5_hash_location, newhash, 16) != 0)
			goto out;

	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				     key, ip_hdr(skb)->saddr,
				     ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
	if (rep.opt[0] == 0) {
		__be32 mrst = mptcp_reset_option(skb);

		if (mrst) {
			rep.opt[0] = mrst;
			arg.iov[0].iov_len += sizeof(mrst);
			rep.th.doff = arg.iov[0].iov_len / 4;
		}
	}

	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
	if (sk)
		arg.bound_dev_if = sk->sk_bound_dev_if;

	trace_tcp_send_reset(sk, skb, reason);

	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

	/* ECN bits of TW reset are cleared */
	arg.tos = ip_hdr(skb)->tos & ~INET_ECN_MASK;
	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);

	sock_net_set(ctl_sk, net);
	if (sk) {
		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
		transmit_time = tcp_transmit_time(sk);
		xfrm_sk_clone_policy(ctl_sk, sk);
		txhash = (sk->sk_state == TCP_TIME_WAIT) ?
			 inet_twsk(sk)->tw_txhash : sk->sk_txhash;
	} else {
		ctl_sk->sk_mark = 0;
		ctl_sk->sk_priority = 0;
	}
	ip_send_unicast_reply(ctl_sk, sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time, txhash);

	xfrm_sk_free_policy(ctl_sk);
	sock_net_set(ctl_sk, &init_net);
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
	local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
	rcu_read_unlock();
#endif
}

/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
 */

static void tcp_v4_send_ack(const struct sock *sk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_key *key,
			    int reply_flags, u8 tos, u32 txhash)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(MAX_TCP_OPTION_SPACE  >> 2)];
	} rep;
	struct net *net = sock_net(sk);
	struct ip_reply_arg arg;
	struct sock *ctl_sk;
	u64 transmit_time;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (tcp_key_is_md5(key)) {
		/* MD5 option follows the timestamp option, if present */
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key->md5_key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
#ifdef CONFIG_TCP_AO
	if (tcp_key_is_ao(key)) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_AO << 24) |
					  (tcp_ao_len(key->ao_key) << 16) |
					  (key->ao_key->sndid << 8) |
					  key->rcv_next);
		arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key);
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset],
				key->ao_key, key->traffic_key,
				(union tcp_ao_addr *)&ip_hdr(skb)->saddr,
				(union tcp_ao_addr *)&ip_hdr(skb)->daddr,
				&rep.th, key->sne);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
	sock_net_set(ctl_sk, net);
	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
	transmit_time = tcp_transmit_time(sk);
	ip_send_unicast_reply(ctl_sk, sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time, txhash);

	sock_net_set(ctl_sk, &init_net);
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
	local_bh_enable();
}

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb,
				enum tcp_tw_status tw_status)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
	struct tcp_key key = {};
	u8 tos = tw->tw_tos;

	/* Cleaning only ECN bits of TW ACKs of oow data or is paws_reject,
	 * while not cleaning ECN bits of other TW ACKs to avoid these ACKs
	 * being placed in a different service queues (Classic rather than L4S)
	 */
	if (tw_status == TCP_TW_ACK_OOW)
		tos &= ~INET_ECN_MASK;

#ifdef CONFIG_TCP_AO
	struct tcp_ao_info *ao_info;

	if (static_branch_unlikely(&tcp_ao_needed.key)) {
		/* FIXME: the segment to-be-acked is not verified yet */
		ao_info = rcu_dereference(tcptw->ao_info);
		if (ao_info) {
			const struct tcp_ao_hdr *aoh;

			if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) {
				inet_twsk_put(tw);
				return;
			}

			if (aoh)
				key.ao_key = tcp_ao_established_key(sk, ao_info,
								    aoh->rnext_keyid, -1);
		}
	}
	if (key.ao_key) {
		struct tcp_ao_key *rnext_key;

		key.traffic_key = snd_other_key(key.ao_key);
		key.sne = READ_ONCE(ao_info->snd_sne);
		rnext_key = READ_ONCE(ao_info->rnext_key);
		key.rcv_next = rnext_key->rcvid;
		key.type = TCP_KEY_AO;
#else
	if (0) {
#endif
	} else if (static_branch_tcp_md5()) {
		key.md5_key = tcp_twsk_md5_key(tcptw);
		if (key.md5_key)
			key.type = TCP_KEY_MD5;
	}

	tcp_v4_send_ack(sk, skb,
			tcptw->tw_snd_nxt, READ_ONCE(tcptw->tw_rcv_nxt),
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_tw_tsval(tcptw),
			READ_ONCE(tcptw->tw_ts_recent),
			tw->tw_bound_dev_if, &key,
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tos,
			tw->tw_txhash);

	inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	struct tcp_key key = {};

	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
					     tcp_sk(sk)->snd_nxt;

#ifdef CONFIG_TCP_AO
	if (static_branch_unlikely(&tcp_ao_needed.key) &&
	    tcp_rsk_used_ao(req)) {
		const union tcp_md5_addr *addr;
		const struct tcp_ao_hdr *aoh;
		int l3index;

		/* Invalid TCP option size or twice included auth */
		if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
			return;
		if (!aoh)
			return;

		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET,
					      aoh->rnext_keyid, -1);
		if (unlikely(!key.ao_key)) {
			/* Send ACK with any matching MKT for the peer */
			key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1);
			/* Matching key disappeared (user removed the key?)
			 * let the handshake timeout.
			 */
			if (!key.ao_key) {
				net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n",
						     addr,
						     ntohs(tcp_hdr(skb)->source),
						     &ip_hdr(skb)->daddr,
						     ntohs(tcp_hdr(skb)->dest));
				return;
			}
		}
		/* Scratch buffer for the derived traffic key; freed below
		 * once the ACK has been sent.
		 */
		key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC);
		if (!key.traffic_key)
			return;

		key.type = TCP_KEY_AO;
		key.rcv_next = aoh->keyid;
		tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req);
#else
	if (0) {
#endif
	} else if (static_branch_tcp_md5()) {
		const union tcp_md5_addr *addr;
		int l3index;

		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
		if (key.md5_key)
			key.type = TCP_KEY_MD5;
	}

	/* Cleaning ECN bits of TW ACKs of oow data or is paws_reject */
	tcp_v4_send_ack(sk, skb, seq,
			tcp_rsk(req)->rcv_nxt,
			tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale,
			tcp_rsk_tsval(tcp_rsk(req)),
			req->ts_recent,
			0, &key,
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos & ~INET_ECN_MASK,
			READ_ONCE(tcp_rsk(req)->txhash));
	if (tcp_key_is_ao(&key))
		kfree(key.traffic_key);
}

/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 *	Returns 0/positive net_xmit code on success, -1 when no route or
 *	no skb could be built.
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
			      struct flowi *fl,
			      struct request_sock *req,
			      struct tcp_fastopen_cookie *foc,
			      enum tcp_synack_type synack_type,
			      struct sk_buff *syn_skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;
	u8 tos;

	/* First, grab a route.
 */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);

	if (skb) {
		tcp_rsk(req)->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK;
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

		tos = READ_ONCE(inet_sk(sk)->tos);

		/* Optionally reflect the SYN's DSCP, keeping our own ECN bits */
		if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
			tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
			      (tos & INET_ECN_MASK);

		if (!INET_ECN_is_capable(tos) &&
		    tcp_bpf_ca_needs_ecn((struct sock *)req))
			tos |= INET_ECN_ECT_0;

		rcu_read_lock();
		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
					    rcu_dereference(ireq->ireq_opt),
					    tos);
		rcu_read_unlock();
		err = net_xmit_eval(err);
	}

	return err;
}

/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
EXPORT_IPV6_MOD(tcp_md5_needed);

/* Decide whether @new beats the current @old candidate: a key bound to
 * an L3 device always wins over an unbound one, otherwise the longer
 * prefix wins.
 */
static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
{
	if (!old)
		return true;

	/* l3index always overrides non-l3index */
	if (old->l3index && new->l3index == 0)
		return false;
	if (old->l3index == 0 && new->l3index)
		return true;

	return old->prefixlen < new->prefixlen;
}

/* Find the Key structure for an address.
 * Returns the best match (L3-bound keys preferred, then longest prefix)
 * or NULL. Caller must hold rcu_read_lock() or the socket lock.
 */
struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
					   const union tcp_md5_addr *addr,
					   int family, bool any_l3index)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	const struct tcp_md5sig_info *md5sig;
	__be32 mask;
	struct tcp_md5sig_key *best_match = NULL;
	bool match;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;

	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		/* Unless the caller accepts any scope, an IFINDEX-bound key
		 * only matches its own l3index.
		 */
		if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX &&
		    key->l3index != l3index)
			continue;
		if (family == AF_INET) {
			mask = inet_make_mask(key->prefixlen);
			match = (key->addr.a4.s_addr & mask) ==
				(addr->a4.s_addr & mask);
#if IS_ENABLED(CONFIG_IPV6)
		} else if (family == AF_INET6) {
			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
						  key->prefixlen);
#endif
		} else {
			match = false;
		}

		if (match && better_md5_match(best_match, key))
			best_match = key;
	}
	return best_match;
}
EXPORT_IPV6_MOD(__tcp_md5_do_lookup);

/* Exact-match lookup: family, the IFINDEX flag bit, l3index, address and
 * prefixlen must all match. Used when adding/removing a specific key.
 */
static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
						      const union tcp_md5_addr *addr,
						      int family, u8 prefixlen,
						      int l3index, u8 flags)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	const struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif

	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
			continue;
		if (key->l3index != l3index)
			continue;
		if (!memcmp(&key->addr, addr, size) &&
		    key->prefixlen == prefixlen)
			return key;
	}
	return NULL;
}

/* Look up the MD5 key to use for @addr_sk, scoped by its bound device. */
struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
					 const struct sock *addr_sk)
{
	const union tcp_md5_addr *addr;
	int l3index;

	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
						 addr_sk->sk_bound_dev_if);
	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
}
EXPORT_IPV6_MOD(tcp_v4_md5_lookup);

/* Allocate and publish the per-socket MD5 key list head.
 * Also disables GSO on the socket, as segments must be signed individually.
 */
static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	md5sig = kmalloc_obj(*md5sig, gfp);
	if (!md5sig)
		return -ENOMEM;

	sk_gso_disable(sk);
	INIT_HLIST_HEAD(&md5sig->head);
	rcu_assign_pointer(tp->md5sig_info, md5sig);
	return 0;
}

/* This can be called on a newly created socket, from other files */
static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
			    int family, u8 prefixlen, int l3index, u8 flags,
			    const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
	if (key) {
		/* Pre-existing entry - just update that one.
		 * Note that the key might be used concurrently.
		 * data_race() is telling kcsan that we do not care of
		 * key mismatches, since changing MD5 key on live flows
		 * can lead to packet drops.
		 */
		data_race(memcpy(key->key, newkey, newkeylen));

		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
		 * Also note that a reader could catch new key->keylen value
		 * but old key->key[], this is the reason we use __GFP_ZERO
		 * at sock_kmalloc() time below these lines.
		 */
		WRITE_ONCE(key->keylen, newkeylen);

		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   lockdep_sock_is_held(sk));

	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
	if (!key)
		return -ENOMEM;

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	key->prefixlen = prefixlen;
	key->l3index = l3index;
	key->flags = flags;
	memcpy(&key->addr, addr,
	       (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
							 sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}

/* Add or update an MD5 key from process (setsockopt) context.
 * On first use, allocates md5sig_info and enables the tcp_md5_needed
 * static branch; both are rolled back if the branch cannot be taken.
 * Returns 0, -EOPNOTSUPP (FIPS), -ENOMEM or -EUSERS.
 */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, u8 prefixlen, int l3index, u8 flags,
		   const u8 *newkey, u8 newkeylen)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
		if (fips_enabled) {
			pr_warn_once("TCP-MD5 support is disabled due to FIPS\n");
			return -EOPNOTSUPP;
		}

		if (tcp_md5sig_info_add(sk, GFP_KERNEL))
			return -ENOMEM;

		if (!static_branch_inc(&tcp_md5_needed.key)) {
			struct tcp_md5sig_info *md5sig;

			/* Undo the allocation above: too many keys in use */
			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
			rcu_assign_pointer(tp->md5sig_info, NULL);
			kfree_rcu(md5sig, rcu);
			return -EUSERS;
		}
	}

	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
				newkey, newkeylen, GFP_KERNEL);
}
EXPORT_IPV6_MOD(tcp_md5_do_add);

/* Atomic-context variant of tcp_md5_do_add(): copies an existing @key
 * onto @sk (e.g. a freshly created child socket), using GFP_ATOMIC
 * allocations and the non-sleeping static-key increment.
 */
int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
		     int family, u8 prefixlen, int l3index,
		     struct tcp_md5sig_key *key)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {

		if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC)))
			return -ENOMEM;

		if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
			struct tcp_md5sig_info *md5sig;

			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
			net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
			rcu_assign_pointer(tp->md5sig_info, NULL);
			kfree_rcu(md5sig, rcu);
			return -EUSERS;
		}
	}

	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
				key->flags, key->key, key->keylen,
				sk_gfp_mask(sk, GFP_ATOMIC));
}
EXPORT_IPV6_MOD(tcp_md5_key_copy);

/* Remove the exactly matching key; RCU-deferred free since readers may
 * still be walking the list. Returns -ENOENT when no such key exists.
 */
int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
		   u8 prefixlen, int l3index, u8 flags)
{
	struct tcp_md5sig_key *key;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	return 0;
}
EXPORT_IPV6_MOD(tcp_md5_do_del);

/* Free every key on the socket's MD5 list. Uses plain kfree(), so this
 * must only run when no concurrent readers remain (socket teardown).
 */
void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree(key);
	}
}

/* TCP_MD5SIG / TCP_MD5SIG_EXT setsockopt() handler: validates the user
 * request and adds or (for zero keylen) deletes the key.
 */
static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
				 sockptr_t optval, int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
	const union tcp_md5_addr *addr;
	u8 prefixlen = 32;
	int l3index = 0;
	bool l3flag;
	u8 flags;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
	l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;

	if (optname == TCP_MD5SIG_EXT &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
		prefixlen = cmd.tcpm_prefixlen;
		if (prefixlen > 32)
			return -EINVAL;
	}

	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
		if (dev && netif_is_l3_master(dev))
			l3index = dev->ifindex;

		rcu_read_unlock();

		/* ok to reference set/not set outside of rcu;
		 * right now device MUST be an L3 master
		 */
		if (!dev || !l3index)
			return -EINVAL;
	}

	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;

	/* Zero key length means "delete this key" */
	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	/* Don't allow keys for peers that have a matching TCP-AO key.
	 * See the comment in tcp_ao_add_cmd()
	 */
	if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false))
		return -EKEYREJECTED;

	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
			      cmd.tcpm_key, cmd.tcpm_keylen);
}

/* Feed the IPv4 pseudo-header plus the TCP header (with its checksum
 * field zeroed) into the running MD5 context.
 */
static void tcp_v4_md5_hash_headers(struct md5_ctx *ctx,
				    __be32 daddr, __be32 saddr,
				    const struct tcphdr *th, int nbytes)
{
	struct {
		struct tcp4_pseudohdr ip;
		struct tcphdr tcp;
	} h;

	h.ip.saddr = saddr;
	h.ip.daddr = daddr;
	h.ip.pad = 0;
	h.ip.protocol = IPPROTO_TCP;
	h.ip.len = cpu_to_be16(nbytes);
	h.tcp = *th;
	h.tcp.check = 0;
	md5_update(ctx, (const u8 *)&h, sizeof(h.ip) + sizeof(h.tcp));
}

/* Sign headers + key only (no payload); used for reply segments built
 * from an incoming skb (see the tcp_v4_send_ack() path).
 */
static noinline_for_stack void
tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
		    __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct md5_ctx ctx;

	md5_init(&ctx);
	tcp_v4_md5_hash_headers(&ctx, daddr, saddr, th, th->doff << 2);
	tcp_md5_hash_key(&ctx, key);
	md5_final(&ctx, md5_hash);
}

/* Full segment signature: pseudo-header, TCP header, payload, then key.
 * Addresses come from @sk when available, else from the skb's IP header.
 */
noinline_for_stack void
tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
		    const struct sock *sk, const struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;
	struct md5_ctx ctx;

	if (sk) { /* valid for establish/request sockets */
		saddr = sk->sk_rcv_saddr;
		daddr = sk->sk_daddr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	md5_init(&ctx);
	tcp_v4_md5_hash_headers(&ctx, daddr, saddr, th, skb->len);
	tcp_md5_hash_skb_data(&ctx, skb, th->doff << 2);
	tcp_md5_hash_key(&ctx, key);
	md5_final(&ctx, md5_hash);
}
EXPORT_IPV6_MOD(tcp_v4_md5_hash_skb);

#endif

/* Fill the IPv4-specific fields of a new request sock from the SYN. */
static void tcp_v4_init_req(struct request_sock *req,
			    const struct sock *sk_listener,
			    struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	struct net *net = sock_net(sk_listener);

	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
}

/* Initialize the request sock and resolve a route for the SYN-ACK.
 * Returns NULL when the LSM rejects the connection request.
 */
static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
					  struct sk_buff *skb,
					  struct flowi *fl,
					  struct request_sock *req,
					  u32 tw_isn)
{
	tcp_v4_init_req(req, sk, skb);

	if (security_inet_conn_request(sk, skb, req))
		return NULL;

	return inet_csk_route_req(sk, &fl->u.ip4, req);
}

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
};

const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.req_md5_lookup	=	tcp_v4_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#endif
#ifdef CONFIG_TCP_AO
	.ao_lookup	=	tcp_v4_ao_lookup_rsk,
	.ao_calc_key	=	tcp_v4_ao_calc_key_rsk,
	.ao_synack_hash	=	tcp_v4_ao_synack_hash,
#endif
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
#endif
	.route_req	=	tcp_v4_route_req,
	.init_seq	=	tcp_v4_init_seq,
	.init_ts_off	=	tcp_v4_init_ts_off,
	.send_synack	=	tcp_v4_send_synack,
};

/* Entry point for an incoming IPv4 SYN on a listening socket. */
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	/* Never answer to SYNs send to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&tcp_request_sock_ops,
				&tcp_request_sock_ipv4_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0;
}
EXPORT_IPV6_MOD(tcp_v4_conn_request);


/*
 *	The three way handshake has completed - we got a valid synack -
 *	now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req)
{
	struct inet_request_sock *ireq;
	bool found_dup_sk = false;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	const union tcp_md5_addr *addr;
	struct tcp_md5sig_key *key;
	int l3index;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	inet_opt	      = rcu_dereference(ireq->ireq_opt);
	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	newinet->rcv_tos      = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	atomic_set(&newinet->inet_id, get_random_u16());

	/* Set ToS of the new socket based upon the value of incoming SYN.
	 * ECT bits are set later in tcp_init_transfer().
	 */
	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst4_mtu(dst));
	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
	/* Copy over the MD5 key from the original socket */
	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	if (key && !tcp_rsk_used_ao(req)) {
		if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
			goto put_and_exit;
		sk_gso_disable(newsk);
	}
#endif
#ifdef CONFIG_TCP_AO
	if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET))
		goto put_and_exit; /* OOM, release back memory */
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
				       &found_dup_sk);
	if (likely(*own_req)) {
		tcp_move_syn(newtp, req);
		ireq->ireq_opt = NULL;
	} else {
		newinet->inet_opt = NULL;

		if (!req_unhash && found_dup_sk) {
			/* This code path should only be executed in the
			 * syncookie case only
			 */
			bh_unlock_sock(newsk);
			sock_put(newsk);
			newsk = NULL;
		}
	}
	return newsk;

exit_overflow:
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	tcp_listendrop(sk);
	return NULL;
put_and_exit:
	newinet->inet_opt = NULL;
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_IPV6_MOD(tcp_v4_syn_recv_sock);

/* On a non-SYN segment hitting a listener, let the syncookie code try to
 * turn a valid cookie ACK into a child socket; with syncookies compiled
 * out, @sk is returned unchanged.
 */
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	const struct tcphdr *th = tcp_hdr(skb);

	if (!th->syn)
		sk = cookie_v4_check(sk, skb);
#endif
	return sk;
}

/* Compute a syncookie ISN for stateless SYN handling; returns the mss
 * used (0 when syncookies are unavailable or the SYN is unusable).
 */
u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
			 struct tcphdr *th, u32 *cookie)
{
	u16 mss = 0;
#ifdef CONFIG_SYN_COOKIES
	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
				    &tcp_request_sock_ipv4_ops, sk, th);
	if (mss) {
		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
		tcp_synq_overflow(sk);
	}
#endif
	return mss;
}

INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
							   u32));
/* The socket must have it's spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	enum skb_drop_reason reason;
	struct sock *rsk;

	reason = psp_sk_rx_policy_check(sk, skb);
	if (reason)
		goto err_discard;

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst;

		dst = rcu_dereference_protected(sk->sk_rx_dst,
						lockdep_sock_is_held(sk));

		sock_rps_save_rxhash(sk, skb);
		sk_mark_napi_id(sk, skb);
		if (dst) {
			/* Invalidate the cached rx dst if the input device
			 * changed or the route no longer checks out.
			 */
			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
					     dst, 0)) {
				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
				dst_release(dst);
			}
		}
		tcp_rcv_established(sk, skb);
		return 0;
	}

	if (tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_cookie_check(sk, skb);

		if (!nsk)
			return 0;
		if (nsk != sk) {
			/* Cookie check produced a child socket: process the
			 * segment on it, resetting the child on failure.
			 */
			reason = tcp_child_process(sk, nsk, skb);
			if (reason) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	reason = tcp_rcv_state_process(sk, skb);
	if (reason) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason));
discard:
	sk_skb_reason_drop(sk, skb, reason);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	reason = SKB_DROP_REASON_TCP_CSUM;
	trace_tcp_bad_csum(skb);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
err_discard:
	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);

/* Early demultiplex: find an established socket for this skb before full
 * IP processing, and borrow its cached rx dst when still valid.
 */
int tcp_v4_early_demux(struct sk_buff *skb)
{
	struct net *net = dev_net_rcu(skb->dev);
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;

	if (skb->pkt_type != PACKET_HOST)
		return 0;

	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
		return 0;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	if (th->doff < sizeof(struct tcphdr) / 4)
		return 0;

	sk = __inet_lookup_established(net, iph->saddr, th->source,
				       iph->daddr, ntohs(th->dest),
				       skb->skb_iif, inet_sdif(skb));
	if (sk) {
		skb->sk = sk;
		skb->destructor = sock_edemux;
		if (sk_fullsock(sk)) {
			struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);

			if (dst)
				dst = dst_check(dst, 0);
			if (dst &&
			    sk->sk_rx_dst_ifindex == skb->skb_iif)
				skb_dst_set_noref(skb, dst);
		}
	}
	return 0;
}

/* Queue @skb on the owned socket's backlog, coalescing with the backlog
 * tail when possible. Returns true (with the socket unlocked and
 * *@reason set) when the skb must be dropped by the caller.
 */
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
		     enum skb_drop_reason *reason)
{
	u32 tail_gso_size, tail_gso_segs;
	struct skb_shared_info *shinfo;
	const struct tcphdr *th;
	struct tcphdr *thtail;
	struct sk_buff *tail;
	unsigned int hdrlen;
	bool fragstolen;
	u32 gso_segs;
	u32 gso_size;
	u64 limit;
	int delta;
	int err;

	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
	 * we can fix skb->truesize to its real value to avoid future drops.
	 * This is valid because skb is not yet charged to the socket.
	 * It has been noticed pure SACK packets were sometimes dropped
	 * (if cooked by drivers without copybreak feature).
	 */
	skb_condense(skb);

	tcp_cleanup_skb(skb);

	if (unlikely(tcp_checksum_complete(skb))) {
		bh_unlock_sock(sk);
		trace_tcp_bad_csum(skb);
		*reason = SKB_DROP_REASON_TCP_CSUM;
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
		return true;
	}

	/* Attempt coalescing to last skb in backlog, even if we are
	 * above the limits.
	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
	 */
	th = (const struct tcphdr *)skb->data;
	hdrlen = th->doff * 4;

	tail = sk->sk_backlog.tail;
	if (!tail)
		goto no_coalesce;
	thtail = (struct tcphdr *)tail->data;

	/* Only coalesce contiguous, same-DSCP, plain-ACK segments with
	 * identical options and compatible flags.
	 */
	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
	    ((TCP_SKB_CB(tail)->tcp_flags |
	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
	    !((TCP_SKB_CB(tail)->tcp_flags &
	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
	    ((TCP_SKB_CB(tail)->tcp_flags ^
	      TCP_SKB_CB(skb)->tcp_flags) &
	     (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE)) ||
	    !tcp_skb_can_collapse_rx(tail, skb) ||
	    thtail->doff != th->doff ||
	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)) ||
	    /* prior to PSP Rx policy check, retain exact PSP metadata */
	    psp_skb_coalesce_diff(tail, skb))
		goto no_coalesce;

	__skb_pull(skb, hdrlen);

	shinfo = skb_shinfo(skb);
	gso_size = shinfo->gso_size ?: skb->len;
	gso_segs = shinfo->gso_segs ?: 1;

	shinfo = skb_shinfo(tail);
	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
	tail_gso_segs = shinfo->gso_segs ?: 1;

	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;

		/* Keep the most recent cumulative ACK and window */
		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
			thtail->window = th->window;
		}

		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
		 * thtail->fin, so that the fast path in tcp_rcv_established()
		 * is not entered if we append a packet with a FIN.
		 * SYN, RST, URG are not present.
		 * ACK is set on both packets.
		 * PSH : we do not really care in TCP stack,
		 *	 at least for 'GRO' packets.
		 */
		thtail->fin |= th->fin;
		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;

		if (TCP_SKB_CB(skb)->has_rxtstamp) {
			TCP_SKB_CB(tail)->has_rxtstamp = true;
			tail->tstamp = skb->tstamp;
			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
		}

		/* Not as strict as GRO. We only need to carry mss max value */
		shinfo->gso_size = max(gso_size, tail_gso_size);
		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);

		sk->sk_backlog.len += delta;
		__NET_INC_STATS(sock_net(sk),
				LINUX_MIB_TCPBACKLOGCOALESCE);
		kfree_skb_partial(skb, fragstolen);
		return false;
	}
	__skb_push(skb, hdrlen);

no_coalesce:
	/* sk->sk_backlog.len is reset only at the end of __release_sock().
	 * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach
	 * sk_rcvbuf in normal conditions.
	 */
	limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1;

	limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1;

	/* Only socket owner can try to collapse/prune rx queues
	 * to reduce memory overhead, so add a little headroom here.
	 * Few sockets backlog are possibly concurrently non empty.
	 */
	limit += 64 * 1024;

	limit = min_t(u64, limit, UINT_MAX);

	err = sk_add_backlog(sk, skb, limit);
	if (unlikely(err)) {
		bh_unlock_sock(sk);
		if (err == -ENOMEM) {
			*reason = SKB_DROP_REASON_PFMEMALLOC;
			__NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP);
		} else {
			*reason = SKB_DROP_REASON_SOCKET_BACKLOG;
			__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
		}
		return true;
	}
	return false;
}
EXPORT_IPV6_MOD(tcp_add_backlog);

/* Undo tcp_v4_fill_cb(): move the IP control block back into place. */
static void tcp_v4_restore_cb(struct sk_buff *skb)
{
	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
		sizeof(struct inet_skb_parm));
}

/* Populate TCP_SKB_CB() from the IP/TCP headers of an incoming skb. */
static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
			   const struct tcphdr *th)
{
	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
	 * barrier() makes sure compiler wont play fool^Waliasing games.
	 */
	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
		sizeof(struct inet_skb_parm));
	barrier();

	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	/* SYN and FIN each consume one unit of sequence space */
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->tcp_flags = tcp_flags_ntohs(th);
	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
	TCP_SKB_CB(skb)->sacked	 = 0;
	TCP_SKB_CB(skb)->has_rxtstamp =
			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
}

/*
 *	From tcp_input.c
 */

int tcp_v4_rcv(struct sk_buff *skb)
{
	struct net *net = dev_net_rcu(skb->dev);
	enum skb_drop_reason drop_reason;
	enum tcp_tw_status tw_status;
	int sdif = inet_sdif(skb);
	int dif = inet_iif(skb);
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk = NULL;
	bool refcounted;
	int ret;
	u32 isn;

	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
2158 if (skb->pkt_type != PACKET_HOST) 2159 goto discard_it; 2160 2161 /* Count it even if it's bad */ 2162 __TCP_INC_STATS(net, TCP_MIB_INSEGS); 2163 2164 if (!pskb_may_pull(skb, sizeof(struct tcphdr))) 2165 goto discard_it; 2166 2167 th = (const struct tcphdr *)skb->data; 2168 2169 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) { 2170 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL; 2171 goto bad_packet; 2172 } 2173 if (!pskb_may_pull(skb, th->doff * 4)) 2174 goto discard_it; 2175 2176 /* An explanation is required here, I think. 2177 * Packet length and doff are validated by header prediction, 2178 * provided case of th->doff==0 is eliminated. 2179 * So, we defer the checks. */ 2180 2181 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo)) 2182 goto csum_error; 2183 2184 th = (const struct tcphdr *)skb->data; 2185 iph = ip_hdr(skb); 2186 lookup: 2187 sk = __inet_lookup_skb(skb, __tcp_hdrlen(th), th->source, 2188 th->dest, sdif, &refcounted); 2189 if (!sk) 2190 goto no_tcp_socket; 2191 2192 if (sk->sk_state == TCP_TIME_WAIT) 2193 goto do_time_wait; 2194 2195 if (sk->sk_state == TCP_NEW_SYN_RECV) { 2196 struct request_sock *req = inet_reqsk(sk); 2197 bool req_stolen = false; 2198 struct sock *nsk; 2199 2200 sk = req->rsk_listener; 2201 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) 2202 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2203 else 2204 drop_reason = tcp_inbound_hash(sk, req, skb, 2205 &iph->saddr, &iph->daddr, 2206 AF_INET, dif, sdif); 2207 if (unlikely(drop_reason)) { 2208 sk_drops_skbadd(sk, skb); 2209 reqsk_put(req); 2210 goto discard_it; 2211 } 2212 if (tcp_checksum_complete(skb)) { 2213 reqsk_put(req); 2214 goto csum_error; 2215 } 2216 if (unlikely(sk->sk_state != TCP_LISTEN)) { 2217 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb); 2218 if (!nsk) { 2219 inet_csk_reqsk_queue_drop_and_put(sk, req); 2220 goto lookup; 2221 } 2222 sk = nsk; 2223 /* reuseport_migrate_sock() has already held one sk_refcnt 2224 * before returning. 
2225 */ 2226 } else { 2227 /* We own a reference on the listener, increase it again 2228 * as we might lose it too soon. 2229 */ 2230 sock_hold(sk); 2231 } 2232 refcounted = true; 2233 nsk = NULL; 2234 if (!tcp_filter(sk, skb, &drop_reason)) { 2235 th = (const struct tcphdr *)skb->data; 2236 iph = ip_hdr(skb); 2237 tcp_v4_fill_cb(skb, iph, th); 2238 nsk = tcp_check_req(sk, skb, req, false, &req_stolen, 2239 &drop_reason); 2240 } 2241 if (!nsk) { 2242 reqsk_put(req); 2243 if (req_stolen) { 2244 /* Another cpu got exclusive access to req 2245 * and created a full blown socket. 2246 * Try to feed this packet to this socket 2247 * instead of discarding it. 2248 */ 2249 tcp_v4_restore_cb(skb); 2250 sock_put(sk); 2251 goto lookup; 2252 } 2253 goto discard_and_relse; 2254 } 2255 nf_reset_ct(skb); 2256 if (nsk == sk) { 2257 reqsk_put(req); 2258 tcp_v4_restore_cb(skb); 2259 } else { 2260 drop_reason = tcp_child_process(sk, nsk, skb); 2261 if (drop_reason) { 2262 enum sk_rst_reason rst_reason; 2263 2264 rst_reason = sk_rst_convert_drop_reason(drop_reason); 2265 tcp_v4_send_reset(nsk, skb, rst_reason); 2266 goto discard_and_relse; 2267 } 2268 sock_put(sk); 2269 return 0; 2270 } 2271 } 2272 2273 process: 2274 if (static_branch_unlikely(&ip4_min_ttl)) { 2275 /* min_ttl can be changed concurrently from do_ip_setsockopt() */ 2276 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) { 2277 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 2278 drop_reason = SKB_DROP_REASON_TCP_MINTTL; 2279 goto discard_and_relse; 2280 } 2281 } 2282 2283 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { 2284 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2285 goto discard_and_relse; 2286 } 2287 2288 drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr, 2289 AF_INET, dif, sdif); 2290 if (drop_reason) 2291 goto discard_and_relse; 2292 2293 nf_reset_ct(skb); 2294 2295 if (tcp_filter(sk, skb, &drop_reason)) 2296 goto discard_and_relse; 2297 2298 th = (const struct tcphdr *)skb->data; 
2299 iph = ip_hdr(skb); 2300 tcp_v4_fill_cb(skb, iph, th); 2301 2302 skb->dev = NULL; 2303 2304 if (sk->sk_state == TCP_LISTEN) { 2305 ret = tcp_v4_do_rcv(sk, skb); 2306 goto put_and_return; 2307 } 2308 2309 sk_incoming_cpu_update(sk); 2310 2311 bh_lock_sock_nested(sk); 2312 tcp_segs_in(tcp_sk(sk), skb); 2313 ret = 0; 2314 if (!sock_owned_by_user(sk)) { 2315 ret = tcp_v4_do_rcv(sk, skb); 2316 } else { 2317 if (tcp_add_backlog(sk, skb, &drop_reason)) 2318 goto discard_and_relse; 2319 } 2320 bh_unlock_sock(sk); 2321 2322 put_and_return: 2323 if (refcounted) 2324 sock_put(sk); 2325 2326 return ret; 2327 2328 no_tcp_socket: 2329 drop_reason = SKB_DROP_REASON_NO_SOCKET; 2330 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) 2331 goto discard_it; 2332 2333 tcp_v4_fill_cb(skb, iph, th); 2334 2335 if (tcp_checksum_complete(skb)) { 2336 csum_error: 2337 drop_reason = SKB_DROP_REASON_TCP_CSUM; 2338 trace_tcp_bad_csum(skb); 2339 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); 2340 bad_packet: 2341 __TCP_INC_STATS(net, TCP_MIB_INERRS); 2342 } else { 2343 tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason)); 2344 } 2345 2346 discard_it: 2347 SKB_DR_OR(drop_reason, NOT_SPECIFIED); 2348 /* Discard frame. 
*/ 2349 sk_skb_reason_drop(sk, skb, drop_reason); 2350 return 0; 2351 2352 discard_and_relse: 2353 sk_drops_skbadd(sk, skb); 2354 if (refcounted) 2355 sock_put(sk); 2356 goto discard_it; 2357 2358 do_time_wait: 2359 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { 2360 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2361 inet_twsk_put(inet_twsk(sk)); 2362 goto discard_it; 2363 } 2364 2365 tcp_v4_fill_cb(skb, iph, th); 2366 2367 if (tcp_checksum_complete(skb)) { 2368 inet_twsk_put(inet_twsk(sk)); 2369 goto csum_error; 2370 } 2371 2372 tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn, 2373 &drop_reason); 2374 switch (tw_status) { 2375 case TCP_TW_SYN: { 2376 struct sock *sk2 = inet_lookup_listener(net, skb, __tcp_hdrlen(th), 2377 iph->saddr, th->source, 2378 iph->daddr, th->dest, 2379 inet_iif(skb), 2380 sdif); 2381 if (sk2) { 2382 inet_twsk_deschedule_put(inet_twsk(sk)); 2383 sk = sk2; 2384 tcp_v4_restore_cb(skb); 2385 refcounted = false; 2386 __this_cpu_write(tcp_tw_isn, isn); 2387 goto process; 2388 } 2389 2390 drop_reason = psp_twsk_rx_policy_check(inet_twsk(sk), skb); 2391 if (drop_reason) 2392 break; 2393 } 2394 /* to ACK */ 2395 fallthrough; 2396 case TCP_TW_ACK: 2397 case TCP_TW_ACK_OOW: 2398 tcp_v4_timewait_ack(sk, skb, tw_status); 2399 break; 2400 case TCP_TW_RST: 2401 tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET); 2402 inet_twsk_deschedule_put(inet_twsk(sk)); 2403 goto discard_it; 2404 case TCP_TW_SUCCESS:; 2405 } 2406 goto discard_it; 2407 } 2408 2409 static struct timewait_sock_ops tcp_timewait_sock_ops = { 2410 .twsk_obj_size = sizeof(struct tcp_timewait_sock), 2411 }; 2412 2413 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) 2414 { 2415 struct dst_entry *dst = skb_dst(skb); 2416 2417 if (dst && dst_hold_safe(dst)) { 2418 rcu_assign_pointer(sk->sk_rx_dst, dst); 2419 sk->sk_rx_dst_ifindex = skb->skb_iif; 2420 } 2421 } 2422 EXPORT_IPV6_MOD(inet_sk_rx_dst_set); 2423 2424 const struct 
/* Address-family specific operations plugged into the generic
 * inet_connection_sock layer for IPv4 TCP sockets.
 */
const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.mtu_reduced	   = tcp_v4_mtu_reduced,
};
EXPORT_IPV6_MOD(ipv4_specific);

#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
/* IPv4 hooks for TCP-MD5 (RFC 2385) and TCP-AO (RFC 5925) signing. */
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
#ifdef CONFIG_TCP_MD5SIG
	.md5_lookup	= tcp_v4_md5_lookup,
	.calc_md5_hash	= tcp_v4_md5_hash_skb,
	.md5_parse	= tcp_v4_parse_md5_keys,
#endif
#ifdef CONFIG_TCP_AO
	.ao_lookup	= tcp_v4_ao_lookup,
	.calc_ao_hash	= tcp_v4_ao_hash_skb,
	.ao_parse	= tcp_v4_parse_ao,
	.ao_calc_key_sk	= tcp_v4_ao_calc_key_sk,
#endif
};

/* sk->sk_destruct when MD5/AO is compiled in: release key material
 * first, then fall through to the generic inet destructor.
 */
static void tcp4_destruct_sock(struct sock *sk)
{
	tcp_md5_destruct_sock(sk);
	tcp_ao_destroy_sock(sk, false);
	inet_sock_destruct(sk);
}
#endif
/* NOTE: A lot of things set to zero explicitly by call to
 * sk_alloc() so need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	tcp_init_sock(sk);

	icsk->icsk_af_ops = &ipv4_specific;

#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
	sk->sk_destruct = tcp4_destruct_sock;
#endif

	return 0;
}

/* Drop the page-pool references held in sk->sk_user_frags (devmem TCP);
 * no-op when CONFIG_PAGE_POOL is not set.
 */
static void tcp_release_user_frags(struct sock *sk)
{
#ifdef CONFIG_PAGE_POOL
	unsigned long index;
	void *netmem;

	xa_for_each(&sk->sk_user_frags, index, netmem)
		WARN_ON_ONCE(!napi_pp_put_page((__force netmem_ref)netmem));
#endif
}

/* Tear down all TCP-private state of @sk (proto ->destroy hook).
 * Shared with IPv6 via tcp_v6 code, hence the EXPORT below.
 */
void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tcp_release_user_frags(sk);

	xa_destroy(&sk->sk_user_frags);

	trace_tcp_destroy_sock(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	tcp_cleanup_ulp(sk);

	/* Cleanup up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Check if we want to disable active TFO */
	tcp_fastopen_active_disable_ofo_check(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	skb_rbtree_purge(&tp->out_of_order_queue);

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));

	/* If socket is aborted during connect operation */
	tcp_free_fastopen_req(tp);
	tcp_fastopen_destroy_cipher(sk);
	tcp_saved_syn_free(tp);

	sk_sockets_allocated_dec(sk);
}
EXPORT_IPV6_MOD(tcp_v4_destroy_sock);
/* Proc filesystem TCP sock list dumping. */

static unsigned short seq_file_family(const struct seq_file *seq);

/* Return true if @sk should be shown by this seq_file: its family must
 * match (AF_UNSPEC acts as a wildcard, used by the bpf iterator) and it
 * must belong to the network namespace being dumped.
 */
static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
{
	unsigned short family = seq_file_family(seq);

	/* AF_UNSPEC is used as a match all */
	return ((family == AF_UNSPEC || family == sk->sk_family) &&
		net_eq(sock_net(sk), seq_file_net(seq)));
}

/* Find a non empty bucket (starting from st->bucket)
 * and return the first sk from it.
 *
 * On success, the matching bucket's lock (ilb2->lock) is left HELD;
 * the caller releases it when it advances past the bucket.
 */
static void *listening_get_first(struct seq_file *seq)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct tcp_iter_state *st = seq->private;

	st->offset = 0;
	for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
		struct inet_listen_hashbucket *ilb2;
		struct hlist_nulls_node *node;
		struct sock *sk;

		ilb2 = &hinfo->lhash2[st->bucket];
		if (hlist_nulls_empty(&ilb2->nulls_head))
			continue;

		spin_lock(&ilb2->lock);
		sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
			if (seq_sk_match(seq, sk))
				return sk;
		}
		spin_unlock(&ilb2->lock);
	}

	return NULL;
}
/* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
 * If "cur" is the last one in the st->bucket,
 * call listening_get_first() to return the first sk of the next
 * non empty bucket.
 *
 * Assumes the current bucket's lock is held on entry (see
 * listening_get_first()); drops it before moving to the next bucket.
 */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_iter_state *st = seq->private;
	struct inet_listen_hashbucket *ilb2;
	struct hlist_nulls_node *node;
	struct inet_hashinfo *hinfo;
	struct sock *sk = cur;

	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);
	sk_nulls_for_each_from(sk, node) {
		if (seq_sk_match(seq, sk))
			return sk;
	}

	hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	ilb2 = &hinfo->lhash2[st->bucket];
	spin_unlock(&ilb2->lock);
	++st->bucket;
	return listening_get_first(seq);
}

/* Return the *pos'th matching listening socket (0-based), walking from
 * the first bucket; decrements *pos as it goes so a caller can continue
 * into the established table with the remainder.
 */
static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	st->offset = 0;
	rc = listening_get_first(seq);

	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
		--*pos;
	}
	return rc;
}

/* True if the established-hash bucket st->bucket has no entries. */
static inline bool empty_bucket(struct inet_hashinfo *hinfo,
				const struct tcp_iter_state *st)
{
	return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
}
/*
 * Get first established socket starting from bucket given in st->bucket.
 * If st->bucket is zero, the very first socket in the hash is returned.
 *
 * On success the bucket's ehash lock is left held (BH disabled); the
 * caller releases it when advancing past the bucket.
 */
static void *established_get_first(struct seq_file *seq)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct tcp_iter_state *st = seq->private;

	st->offset = 0;
	for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
		struct sock *sk;
		struct hlist_nulls_node *node;
		spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);

		cond_resched();

		/* Lockless fast path for the common case of empty buckets */
		if (empty_bucket(hinfo, st))
			continue;

		spin_lock_bh(lock);
		sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
			if (seq_sk_match(seq, sk))
				return sk;
		}
		spin_unlock_bh(lock);
	}

	return NULL;
}

/* Return the next matching established socket after @cur, unlocking the
 * current bucket and moving to the next non-empty one when @cur was the
 * bucket's last entry.
 */
static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct tcp_iter_state *st = seq->private;
	struct hlist_nulls_node *node;
	struct sock *sk = cur;

	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);

	sk_nulls_for_each_from(sk, node) {
		if (seq_sk_match(seq, sk))
			return sk;
	}

	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
	++st->bucket;
	return established_get_first(seq);
}

/* Return the pos'th matching established socket (0-based) from the
 * start of the established hash.
 */
static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}

/* Return the pos'th socket overall: listening sockets first, then
 * established ones, updating st->state accordingly.
 */
static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state *st = seq->private;

	st->state = TCP_SEQ_STATE_LISTENING;
	rc = listening_get_idx(seq, &pos);

	if (!rc) {
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc = established_get_idx(seq, pos);
	}

	return rc;
}

/* Fast resume for a sequential read: jump straight back to the bucket
 * and intra-bucket offset recorded in st, instead of rewalking the
 * whole table.  st->num is preserved across the seek.
 */
static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct tcp_iter_state *st = seq->private;
	int bucket = st->bucket;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket > hinfo->lhash2_mask)
			break;
		rc = listening_get_first(seq);
		while (offset-- && rc && bucket == st->bucket)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		fallthrough;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (st->bucket > hinfo->ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc && bucket == st->bucket)
			rc = established_get_next(seq, rc);
	}

	st->num = orig_num;

	return rc;
}

/* seq_file ->start: resume from last position when possible, otherwise
 * restart the walk; *pos == 0 yields SEQ_START_TOKEN (the header line).
 */
void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	if (*pos && *pos == st->last_pos) {
		rc = tcp_seek_last_pos(seq);
		if (rc)
			goto out;
	}

	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	st->bucket = 0;
	st->offset = 0;
	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;

out:
	st->last_pos = *pos;
	return rc;
}
EXPORT_IPV6_MOD(tcp_seq_start);

/* seq_file ->next: advance within the current table, falling through
 * from the listening table to the established one when exhausted.
 */
void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			rc = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	st->last_pos = *pos;
	return rc;
}
EXPORT_IPV6_MOD(tcp_seq_next);

/* seq_file ->stop: drop whichever bucket lock the get_first/get_next
 * helpers left held (see their comments); nothing to do when iteration
 * never returned a socket.
 */
void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			spin_unlock(&hinfo->lhash2[st->bucket].lock);
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
		break;
	}
}
EXPORT_IPV6_MOD(tcp_seq_stop);
/* Format one SYN_RECV request socket as a /proc/net/tcp line.
 * The field layout is user ABI and must not change.
 */
static void get_openreq4(const struct request_sock *req,
			 struct seq_file *f, int i)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	long delta = req->rsk_timer.expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
		i,
		ireq->ir_loc_addr,
		ireq->ir_num,
		ireq->ir_rmt_addr,
		ntohs(ireq->ir_rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_delta_to_clock_t(delta),
		req->num_timeout,
		from_kuid_munged(seq_user_ns(f),
				 sk_uid(req->rsk_listener)),
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		0,
		req);
}

/* Format one full TCP socket as a /proc/net/tcp line.  All reads are
 * lockless (READ_ONCE/smp_load_acquire), so values may be transiently
 * inconsistent; the format is user ABI and must not change.
 */
static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
{
	int timer_active;
	unsigned long timer_expires;
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct inet_sock *inet = inet_sk(sk);
	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	u8 icsk_pending;
	int rx_queue;
	int state;

	/* Map the pending icsk timer onto the historical /proc "tr" codes:
	 * 1 = retransmit/loss-probe, 4 = zero-window probe, 2 = keepalive.
	 */
	icsk_pending = smp_load_acquire(&icsk->icsk_pending);
	if (icsk_pending == ICSK_TIME_RETRANS ||
	    icsk_pending == ICSK_TIME_REO_TIMEOUT ||
	    icsk_pending == ICSK_TIME_LOSS_PROBE) {
		timer_active	= 1;
		timer_expires	= tcp_timeout_expires(sk);
	} else if (icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= tcp_timeout_expires(sk);
	} else if (timer_pending(&icsk->icsk_keepalive_timer)) {
		timer_active	= 2;
		timer_expires	= icsk->icsk_keepalive_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires = jiffies;
	}

	state = inet_sk_state_load(sk);
	if (state == TCP_LISTEN)
		rx_queue = READ_ONCE(sk->sk_ack_backlog);
	else
		/* Because we don't lock the socket,
		 * we might find a transient negative value.
		 */
		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
				      READ_ONCE(tp->copied_seq), 0);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
		i, src, srcp, dest, destp, state,
		READ_ONCE(tp->write_seq) - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_delta_to_clock_t(timer_expires - jiffies),
		READ_ONCE(icsk->icsk_retransmits),
		from_kuid_munged(seq_user_ns(f), sk_uid(sk)),
		READ_ONCE(icsk->icsk_probes_out),
		sock_i_ino(sk),
		refcount_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
		tcp_snd_cwnd(tp),
		state == TCP_LISTEN ?
		    fastopenq->max_qlen :
		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}

/* Format one TIME_WAIT socket as a /proc/net/tcp line (timer code 3).
 * The format is user ABI and must not change.
 */
static void get_timewait4_sock(const struct inet_timewait_sock *tw,
			       struct seq_file *f, int i)
{
	long delta = tw->tw_timer.expires - jiffies;
	__be32 dest, src;
	__u16 destp, srcp;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
		i, src, srcp, dest, destp, READ_ONCE(tw->tw_substate), 0, 0,
		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
		refcount_read(&tw->tw_refcnt), tw);
}

/* Fixed output width of one /proc/net/tcp record (incl. padding). */
#define TMPSZ 150
/* seq_file ->show for /proc/net/tcp: emit the header line for
 * SEQ_START_TOKEN, otherwise dispatch on socket state to the matching
 * formatter (time-wait / openreq / full socket).
 */
static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	struct sock *sk = v;

	seq_setwidth(seq, TMPSZ - 1);
	if (v == SEQ_START_TOKEN) {
		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	if (sk->sk_state == TCP_TIME_WAIT)
		get_timewait4_sock(v, seq, st->num);
	else if (sk->sk_state == TCP_NEW_SYN_RECV)
		get_openreq4(v, seq, st->num);
	else
		get_tcp4_sock(v, seq, st->num);
out:
	seq_pad(seq, '\n');
	return 0;
}

#ifdef CONFIG_BPF_SYSCALL
/* A batch slot holds either a held socket pointer (while the batch is
 * live) or the socket's cookie (after the refs are dropped, used to
 * resume iteration in the same bucket).
 */
union bpf_tcp_iter_batch_item {
	struct sock *sk;
	__u64 cookie;
};

/* Iterator state for the bpf tcp iterator: embeds the regular
 * tcp_iter_state plus the current batch of held sockets.
 */
struct bpf_tcp_iter_state {
	struct tcp_iter_state state;
	unsigned int cur_sk;	/* next batch index to show */
	unsigned int end_sk;	/* number of sockets in the batch */
	unsigned int max_sk;	/* allocated batch capacity */
	union bpf_tcp_iter_batch_item *batch;
};

/* Context passed to the bpf program for each socket. */
struct bpf_iter__tcp {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct sock_common *, sk_common);
	uid_t uid __aligned(8);
};

/* Run the bpf iterator program for one socket. */
static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
			     struct sock_common *sk_common, uid_t uid)
{
	struct bpf_iter__tcp ctx;

	meta->seq_num--;  /* skip SEQ_START_TOKEN */
	ctx.meta = meta;
	ctx.sk_common = sk_common;
	ctx.uid = uid;
	return bpf_iter_run_prog(prog, &ctx);
}

/* Drop the references on all not-yet-shown sockets in the batch,
 * replacing each pointer with its cookie in place.
 */
static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
{
	union bpf_tcp_iter_batch_item *item;
	unsigned int cur_sk = iter->cur_sk;
	__u64 cookie;

	/* Remember the cookies of the sockets we haven't seen yet, so we can
	 * pick up where we left off next time around.
	 */
	while (cur_sk < iter->end_sk) {
		item = &iter->batch[cur_sk++];
		cookie = sock_gen_cookie(item->sk);
		sock_gen_put(item->sk);
		item->cookie = cookie;
	}
}

/* Grow the batch array to @new_batch_sz slots, preserving the first
 * end_sk entries.  Returns 0 or -ENOMEM.
 */
static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
				      unsigned int new_batch_sz, gfp_t flags)
{
	union bpf_tcp_iter_batch_item *new_batch;

	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
			     flags | __GFP_NOWARN);
	if (!new_batch)
		return -ENOMEM;

	memcpy(new_batch, iter->batch, sizeof(*iter->batch) * iter->end_sk);
	kvfree(iter->batch);
	iter->batch = new_batch;
	iter->max_sk = new_batch_sz;

	return 0;
}

/* Starting from @first_sk, find the first socket in the bucket whose
 * cookie matches any of the @n_cookies saved cookies; NULL if all the
 * remembered sockets are gone from this bucket.
 */
static struct sock *bpf_iter_tcp_resume_bucket(struct sock *first_sk,
					       union bpf_tcp_iter_batch_item *cookies,
					       int n_cookies)
{
	struct hlist_nulls_node *node;
	struct sock *sk;
	int i;

	for (i = 0; i < n_cookies; i++) {
		sk = first_sk;
		sk_nulls_for_each_from(sk, node)
			if (cookies[i].cookie == atomic64_read(&sk->sk_cookie))
				return sk;
	}

	return NULL;
}
/* Resume iteration in the listening hash: re-find the bucket we were
 * in and, if we stopped mid-bucket, the socket after the ones already
 * shown (matched by cookie).  Leaves the found bucket locked, as per
 * listening_get_first().
 */
static struct sock *bpf_iter_tcp_resume_listening(struct seq_file *seq)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	unsigned int find_cookie = iter->cur_sk;
	unsigned int end_cookie = iter->end_sk;
	int resume_bucket = st->bucket;
	struct sock *sk;

	/* Previous batch fully consumed: start from the next bucket. */
	if (end_cookie && find_cookie == end_cookie)
		++st->bucket;

	sk = listening_get_first(seq);
	iter->cur_sk = 0;
	iter->end_sk = 0;

	if (sk && st->bucket == resume_bucket && end_cookie) {
		sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie],
						end_cookie - find_cookie);
		if (!sk) {
			/* All remembered sockets vanished; move on. */
			spin_unlock(&hinfo->lhash2[st->bucket].lock);
			++st->bucket;
			sk = listening_get_first(seq);
		}
	}

	return sk;
}

/* Same as bpf_iter_tcp_resume_listening(), for the established hash
 * (ehash locks, BH-disabled variants).
 */
static struct sock *bpf_iter_tcp_resume_established(struct seq_file *seq)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	unsigned int find_cookie = iter->cur_sk;
	unsigned int end_cookie = iter->end_sk;
	int resume_bucket = st->bucket;
	struct sock *sk;

	if (end_cookie && find_cookie == end_cookie)
		++st->bucket;

	sk = established_get_first(seq);
	iter->cur_sk = 0;
	iter->end_sk = 0;

	if (sk && st->bucket == resume_bucket && end_cookie) {
		sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie],
						end_cookie - find_cookie);
		if (!sk) {
			spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
			++st->bucket;
			sk = established_get_first(seq);
		}
	}

	return sk;
}

/* Resume in the current table, falling through from listening to
 * established when the listening table is exhausted.
 */
static struct sock *bpf_iter_tcp_resume(struct seq_file *seq)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	struct sock *sk = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		sk = bpf_iter_tcp_resume_listening(seq);
		if (sk)
			break;
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		fallthrough;
	case TCP_SEQ_STATE_ESTABLISHED:
		sk = bpf_iter_tcp_resume_established(seq);
		break;
	}

	return sk;
}

/* Fill the batch with held refs on the matching sockets of the current
 * listening bucket, starting at *start_sk.  Returns the total number of
 * matching sockets in the bucket (which may exceed the batch capacity);
 * on overflow, *start_sk is set to the first socket that did not fit.
 */
static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
						 struct sock **start_sk)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct hlist_nulls_node *node;
	unsigned int expected = 1;
	struct sock *sk;

	sock_hold(*start_sk);
	iter->batch[iter->end_sk++].sk = *start_sk;

	sk = sk_nulls_next(*start_sk);
	*start_sk = NULL;
	sk_nulls_for_each_from(sk, node) {
		if (seq_sk_match(seq, sk)) {
			if (iter->end_sk < iter->max_sk) {
				sock_hold(sk);
				iter->batch[iter->end_sk++].sk = sk;
			} else if (!*start_sk) {
				/* Remember where we left off. */
				*start_sk = sk;
			}
			expected++;
		}
	}

	return expected;
}

/* Established-hash counterpart of bpf_iter_tcp_listening_batch();
 * identical batching logic on the ehash chain.
 */
static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
						   struct sock **start_sk)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct hlist_nulls_node *node;
	unsigned int expected = 1;
	struct sock *sk;

	sock_hold(*start_sk);
	iter->batch[iter->end_sk++].sk = *start_sk;

	sk = sk_nulls_next(*start_sk);
	*start_sk = NULL;
	sk_nulls_for_each_from(sk, node) {
		if (seq_sk_match(seq, sk)) {
			if (iter->end_sk < iter->max_sk) {
				sock_hold(sk);
				iter->batch[iter->end_sk++].sk = sk;
			} else if (!*start_sk) {
				/* Remember where we left off. */
				*start_sk = sk;
			}
			expected++;
		}
	}

	return expected;
}

/* Dispatch to the batching helper matching the current table. */
static unsigned int bpf_iter_fill_batch(struct seq_file *seq,
					struct sock **start_sk)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;

	if (st->state == TCP_SEQ_STATE_LISTENING)
		return bpf_iter_tcp_listening_batch(seq, start_sk);
	else
		return bpf_iter_tcp_established_batch(seq, start_sk);
}

/* Release the bucket lock left held by listening_get_first() /
 * established_get_first() for the current table.
 */
static void bpf_iter_tcp_unlock_bucket(struct seq_file *seq)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;

	if (st->state == TCP_SEQ_STATE_LISTENING)
		spin_unlock(&hinfo->lhash2[st->bucket].lock);
	else
		spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
}
seq->private; 3204 unsigned int expected; 3205 struct sock *sk; 3206 int err; 3207 3208 sk = bpf_iter_tcp_resume(seq); 3209 if (!sk) 3210 return NULL; /* Done */ 3211 3212 expected = bpf_iter_fill_batch(seq, &sk); 3213 if (likely(iter->end_sk == expected)) 3214 goto done; 3215 3216 /* Batch size was too small. */ 3217 bpf_iter_tcp_unlock_bucket(seq); 3218 bpf_iter_tcp_put_batch(iter); 3219 err = bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2, 3220 GFP_USER); 3221 if (err) 3222 return ERR_PTR(err); 3223 3224 sk = bpf_iter_tcp_resume(seq); 3225 if (!sk) 3226 return NULL; /* Done */ 3227 3228 expected = bpf_iter_fill_batch(seq, &sk); 3229 if (likely(iter->end_sk == expected)) 3230 goto done; 3231 3232 /* Batch size was still too small. Hold onto the lock while we try 3233 * again with a larger batch to make sure the current bucket's size 3234 * does not change in the meantime. 3235 */ 3236 err = bpf_iter_tcp_realloc_batch(iter, expected, GFP_NOWAIT); 3237 if (err) { 3238 bpf_iter_tcp_unlock_bucket(seq); 3239 return ERR_PTR(err); 3240 } 3241 3242 expected = bpf_iter_fill_batch(seq, &sk); 3243 WARN_ON_ONCE(iter->end_sk != expected); 3244 done: 3245 bpf_iter_tcp_unlock_bucket(seq); 3246 return iter->batch[0].sk; 3247 } 3248 3249 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos) 3250 { 3251 /* bpf iter does not support lseek, so it always 3252 * continue from where it was stop()-ped. 3253 */ 3254 if (*pos) 3255 return bpf_iter_tcp_batch(seq); 3256 3257 return SEQ_START_TOKEN; 3258 } 3259 3260 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3261 { 3262 struct bpf_tcp_iter_state *iter = seq->private; 3263 struct tcp_iter_state *st = &iter->state; 3264 struct sock *sk; 3265 3266 /* Whenever seq_next() is called, the iter->cur_sk is 3267 * done with seq_show(), so advance to the next sk in 3268 * the batch. 3269 */ 3270 if (iter->cur_sk < iter->end_sk) { 3271 /* Keeping st->num consistent in tcp_iter_state. 
3272 * bpf_iter_tcp does not use st->num. 3273 * meta.seq_num is used instead. 3274 */ 3275 st->num++; 3276 sock_gen_put(iter->batch[iter->cur_sk++].sk); 3277 } 3278 3279 if (iter->cur_sk < iter->end_sk) 3280 sk = iter->batch[iter->cur_sk].sk; 3281 else 3282 sk = bpf_iter_tcp_batch(seq); 3283 3284 ++*pos; 3285 /* Keeping st->last_pos consistent in tcp_iter_state. 3286 * bpf iter does not do lseek, so st->last_pos always equals to *pos. 3287 */ 3288 st->last_pos = *pos; 3289 return sk; 3290 } 3291 3292 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v) 3293 { 3294 struct bpf_iter_meta meta; 3295 struct bpf_prog *prog; 3296 struct sock *sk = v; 3297 uid_t uid; 3298 int ret; 3299 3300 if (v == SEQ_START_TOKEN) 3301 return 0; 3302 3303 if (sk_fullsock(sk)) 3304 lock_sock(sk); 3305 3306 if (unlikely(sk_unhashed(sk))) { 3307 ret = SEQ_SKIP; 3308 goto unlock; 3309 } 3310 3311 if (sk->sk_state == TCP_TIME_WAIT) { 3312 uid = 0; 3313 } else if (sk->sk_state == TCP_NEW_SYN_RECV) { 3314 const struct request_sock *req = v; 3315 3316 uid = from_kuid_munged(seq_user_ns(seq), 3317 sk_uid(req->rsk_listener)); 3318 } else { 3319 uid = from_kuid_munged(seq_user_ns(seq), sk_uid(sk)); 3320 } 3321 3322 meta.seq = seq; 3323 prog = bpf_iter_get_info(&meta, false); 3324 ret = tcp_prog_seq_show(prog, &meta, v, uid); 3325 3326 unlock: 3327 if (sk_fullsock(sk)) 3328 release_sock(sk); 3329 return ret; 3330 3331 } 3332 3333 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v) 3334 { 3335 struct bpf_tcp_iter_state *iter = seq->private; 3336 struct bpf_iter_meta meta; 3337 struct bpf_prog *prog; 3338 3339 if (!v) { 3340 meta.seq = seq; 3341 prog = bpf_iter_get_info(&meta, true); 3342 if (prog) 3343 (void)tcp_prog_seq_show(prog, &meta, v, 0); 3344 } 3345 3346 if (iter->cur_sk < iter->end_sk) 3347 bpf_iter_tcp_put_batch(iter); 3348 } 3349 3350 static const struct seq_operations bpf_iter_tcp_seq_ops = { 3351 .show = bpf_iter_tcp_seq_show, 3352 .start = 
 bpf_iter_tcp_seq_start,
	.next		= bpf_iter_tcp_seq_next,
	.stop		= bpf_iter_tcp_seq_stop,
};
#endif
/* Return the address family a seq_file walker should filter on:
 * AF_UNSPEC for the bpf iterator (the prog filters itself), otherwise
 * the family recorded in the /proc entry's afinfo.
 */
static unsigned short seq_file_family(const struct seq_file *seq)
{
	const struct tcp_seq_afinfo *afinfo;

#ifdef CONFIG_BPF_SYSCALL
	/* Iterated from bpf_iter. Let the bpf prog to filter instead. */
	if (seq->op == &bpf_iter_tcp_seq_ops)
		return AF_UNSPEC;
#endif

	/* Iterated from proc fs */
	afinfo = pde_data(file_inode(seq->file));
	return afinfo->family;
}

/* seq_file operations backing /proc/net/tcp. */
static const struct seq_operations tcp4_seq_ops = {
	.show		= tcp4_seq_show,
	.start		= tcp_seq_start,
	.next		= tcp_seq_next,
	.stop		= tcp_seq_stop,
};

static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.family		= AF_INET,
};

/* Create /proc/net/tcp for a new network namespace. */
static int __net_init tcp4_proc_init_net(struct net *net)
{
	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
		return -ENOMEM;
	return 0;
}

/* Remove /proc/net/tcp when a network namespace is dismantled. */
static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	remove_proc_entry("tcp", net->proc_net);
}

static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};

int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}

void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
#endif /* CONFIG_PROC_FS */

/* The IPv4 TCP protocol hooks registered with the socket layer. */
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.pre_connect		= tcp_v4_pre_connect,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
	.keepalive		= tcp_set_keepalive,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.splice_eof		= tcp_splice_eof,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.put_port		= inet_put_port,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot	= tcp_bpf_update_proto,
#endif
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.leave_memory_pressure	= tcp_leave_memory_pressure,
	.stream_memory_free	= tcp_stream_memory_free,
	.sockets_allocated	= &tcp_sockets_allocated,

	.memory_allocated	= &net_aligned_data.tcp_memory_allocated,
	.per_cpu_fw_alloc	= &tcp_memory_per_cpu_fw_alloc,

	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.freeptr_offset		= offsetof(struct tcp_sock,
					   inet_conn.icsk_inet.sk.sk_freeptr),
	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	/* Per-netns ehash is assigned in tcp_set_hashinfo(). */
	.h.hashinfo		= NULL,
	.no_autobind		= true,
	.diag_destroy		= tcp_abort,
};
EXPORT_SYMBOL(tcp_prot);

/* Drop the reference this netns holds on its congestion control module. */
static void __net_exit tcp_sk_exit(struct net *net)
{
	if (net->ipv4.tcp_congestion_control)
		bpf_module_put(net->ipv4.tcp_congestion_control,
			       net->ipv4.tcp_congestion_control->owner);
}

/* Pick the established hash table for a new netns: a private one sized by
 * the parent's sysctl_tcp_child_ehash_entries, or the global tcp_hashinfo
 * when unset, for init_net, or on allocation failure.
 */
static void __net_init tcp_set_hashinfo(struct net *net)
{
	struct inet_hashinfo *hinfo;
	unsigned int ehash_entries;
	struct net *old_net;

	if (net_eq(net, &init_net))
		goto fallback;

	/* The creating process's netns is the parent of the new one. */
	old_net = current->nsproxy->net_ns;
	ehash_entries =
READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
	if (!ehash_entries)
		goto fallback;

	ehash_entries = roundup_pow_of_two(ehash_entries);
	hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
	if (!hinfo) {
		pr_warn("Failed to allocate TCP ehash (entries: %u) "
			"for a netns, fallback to the global one\n",
			ehash_entries);
fallback:
		hinfo = &tcp_hashinfo;
		ehash_entries = tcp_hashinfo.ehash_mask + 1;
	}

	net->ipv4.tcp_death_row.hashinfo = hinfo;
	/* Derive TIME_WAIT and SYN backlog limits from the ehash size. */
	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
	net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
}

/* Initialize every per-netns TCP sysctl and resource to its default. */
static int __net_init tcp_sk_init(struct net *net)
{
	net->ipv4.sysctl_tcp_ecn = TCP_ECN_IN_ECN_OUT_NOECN;
	net->ipv4.sysctl_tcp_ecn_option = TCP_ACCECN_OPTION_FULL;
	net->ipv4.sysctl_tcp_ecn_option_beacon = TCP_ACCECN_OPTION_BEACON;
	net->ipv4.sysctl_tcp_ecn_fallback = 1;

	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;

	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;

	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
	net->ipv4.sysctl_tcp_syncookies = 1;
	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
	net->ipv4.sysctl_tcp_orphan_retries = 0;
	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;

	net->ipv4.sysctl_tcp_tw_reuse = 2;
	net->ipv4.sysctl_tcp_tw_reuse_delay = 1 * MSEC_PER_SEC;
	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;

	/* The netns itself holds the initial tw_refcount. */
	refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
	tcp_set_hashinfo(net);

	net->ipv4.sysctl_tcp_sack = 1;
	net->ipv4.sysctl_tcp_window_scaling = 1;
	net->ipv4.sysctl_tcp_timestamps = 1;
	net->ipv4.sysctl_tcp_early_retrans = 3;
	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
	net->ipv4.sysctl_tcp_retrans_collapse = 1;
	net->ipv4.sysctl_tcp_max_reordering = 300;
	net->ipv4.sysctl_tcp_dsack = 1;
	net->ipv4.sysctl_tcp_app_win = 31;
	net->ipv4.sysctl_tcp_adv_win_scale = 1;
	net->ipv4.sysctl_tcp_frto = 2;
	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
	net->ipv4.sysctl_tcp_rcvbuf_low_rtt = USEC_PER_MSEC;
	/* This limits the percentage of the congestion window which we
	 * will allow a single TSO frame to consume.  Building TSO frames
	 * which are too large can cause TCP streams to be bursty.
	 */
	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
	/* Default TSQ limit of 4 MB */
	net->ipv4.sysctl_tcp_limit_output_bytes = 4 << 20;

	/* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
	net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;

	net->ipv4.sysctl_tcp_min_tso_segs = 2;
	net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
	net->ipv4.sysctl_tcp_autocorking = 1;
	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
	/* Child netns inherit the buffer sizing of init_net. */
	if (net != &init_net) {
		memcpy(net->ipv4.sysctl_tcp_rmem,
		       init_net.ipv4.sysctl_tcp_rmem,
		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
		memcpy(net->ipv4.sysctl_tcp_wmem,
		       init_net.ipv4.sysctl_tcp_wmem,
		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
	}
	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 10 * NSEC_PER_USEC;
	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
	net->ipv4.sysctl_tcp_comp_sack_rtt_percent = 33;
	net->ipv4.sysctl_tcp_backlog_ack_defer = 1;
	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
	atomic_set(&net->ipv4.tfo_active_disable_times, 0);

	/* Set default values for PLB */
	net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
	net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
	net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
	net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
	/* Default congestion threshold for PLB to mark a round is 50% */
	net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;

	/* Reno is always built in */
	if (!net_eq(net, &init_net) &&
	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
			       init_net.ipv4.tcp_congestion_control->owner))
		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
	else
		net->ipv4.tcp_congestion_control = &tcp_reno;

	net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
	net->ipv4.sysctl_tcp_shrink_window = 0;
	net->ipv4.sysctl_tcp_pingpong_thresh = 1;
	net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN);
	net->ipv4.sysctl_tcp_rto_max_ms = TCP_RTO_MAX_SEC * MSEC_PER_SEC;

	return 0;
}

/* Batched per-netns TCP teardown: purge timewait sockets and release
 * each netns's private ehash and tw_refcount.
 */
static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
	struct net *net;

	/* make sure concurrent calls to tcp_sk_exit_batch from net_cleanup_work
	 * and failed setup_net error unwinding path are serialized.
	 *
	 * tcp_twsk_purge() handles twsk in any dead netns, not just those in
	 * net_exit_list, the thread that dismantles a particular twsk must
	 * do so without other thread progressing to refcount_dec_and_test() of
	 * tcp_death_row.tw_refcount.
	 */
	mutex_lock(&tcp_exit_batch_mutex);

	tcp_twsk_purge(net_exit_list);

	list_for_each_entry(net, net_exit_list, exit_list) {
		inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
		/* All timewait sockets are gone; only the netns ref remains. */
		WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
		tcp_fastopen_ctx_destroy(net);
	}

	mutex_unlock(&tcp_exit_batch_mutex);
}

static struct pernet_operations __net_initdata tcp_sk_ops = {
	.init	   = tcp_sk_init,
	.exit	   = tcp_sk_exit,
	.exit_batch = tcp_sk_exit_batch,
};

#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
		     struct sock_common *sk_common, uid_t uid)

/* Initial socket batch capacity; grown on demand in bpf_iter_tcp_batch(). */
#define INIT_BATCH_SZ 16

/* Set up per-iterator state: netns-aware seq state plus the first batch
 * allocation.  Returns 0 or a -errno.
 */
static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
{
	struct bpf_tcp_iter_state *iter = priv_data;
	int err;

	err = bpf_iter_init_seq_net(priv_data, aux);
	if (err)
		return err;

	err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ, GFP_USER);
	if (err) {
		/* Undo bpf_iter_init_seq_net() on failure. */
		bpf_iter_fini_seq_net(priv_data);
		return err;
	}

	return 0;
}

/* Tear down per-iterator state created by bpf_iter_init_tcp(). */
static void bpf_iter_fini_tcp(void *priv_data)
{
	struct bpf_tcp_iter_state *iter = priv_data;

	bpf_iter_fini_seq_net(priv_data);
	kvfree(iter->batch);
}

static const struct bpf_iter_seq_info tcp_seq_info = {
	.seq_ops		= &bpf_iter_tcp_seq_ops,
	.init_seq_private	= bpf_iter_init_tcp,
	.fini_seq_private	= bpf_iter_fini_tcp,
	.seq_priv_size		= sizeof(struct bpf_tcp_iter_state),
};

/* Extra helpers available to bpf tcp iterator programs: sockopt access
 * on the sockets being iterated.
 */
static const struct bpf_func_proto *
bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
			    const struct bpf_prog *prog)
{
	switch (func_id) {
	case BPF_FUNC_setsockopt:
		return &bpf_sk_setsockopt_proto;
	case BPF_FUNC_getsockopt:
		return &bpf_sk_getsockopt_proto;
	default:
		return NULL;
	}
}

static struct bpf_iter_reg tcp_reg_info = {
	.target			= "tcp",
	.ctx_arg_info_size	= 1,
	.ctx_arg_info		= {
		{ offsetof(struct bpf_iter__tcp, sk_common),
		  PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
	},
	.get_func_proto		= bpf_iter_tcp_get_func_proto,
	.seq_info		= &tcp_seq_info,
};

/* Register the "tcp" bpf iterator target; registration failure is only
 * warned about, the rest of TCP init proceeds.
 */
static void __init bpf_iter_register(void)
{
	/* The BTF id is only known at runtime, so patch it in here. */
	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
	if (bpf_iter_reg_target(&tcp_reg_info))
		pr_warn("Warning: could not register bpf iterator tcp\n");
}

#endif

/* Boot-time TCP/IPv4 setup: one control socket per possible CPU (used
 * for RST/ACK transmission), then the per-netns init via pernet ops.
 */
void __init tcp_v4_init(void)
{
	int cpu, res;

	for_each_possible_cpu(cpu) {
		struct sock *sk;

		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
					   IPPROTO_TCP, &init_net);
		if (res)
			panic("Failed to create the TCP control socket.\n");
		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

		/* Please enforce IP_DF and IPID==0 for RST and
		 * ACK sent in SYN-RECV and TIME-WAIT state.
		 */
		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;

		sk->sk_clockid = CLOCK_MONOTONIC;

		per_cpu(ipv4_tcp_sk.sock, cpu) = sk;
	}
	if (register_pernet_subsys(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");

#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
	bpf_iter_register();
#endif
}