// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 */

/*
 * Changes:
 *	David S. Miller	:	New socket lookup architecture.
 *				This code is dedicated to John Dyson.
 *	David S. Miller :	Change semantics of established hash,
 *				half is devoted to TIME_WAIT sockets
 *				and the rest go in the other half.
 *	Andi Kleen :		Add support for syncookies and fixed
 *				some bugs: ip options weren't passed to
 *				the TCP layer, missed a check for an
 *				ACK bit.
 *	Andi Kleen :		Implemented fast path mtu discovery.
 *				Fixed many serious bugs in the
 *				request_sock handling and moved
 *				most of it into the af independent code.
 *				Added tail drop and some other bugfixes.
 *				Added new listen semantics.
 *	Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:	ip_dynaddr bits
 *	Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sock_diag.h>

#include <net/aligned_data.h>
#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/tcp_ecn.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/inet_ecn.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>
#include <net/rstreason.h>
#include <net/psp.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/inetdevice.h>
#include <linux/btf_ids.h>
#include <linux/skbuff_ref.h>

#include <crypto/hash.h>
#include <linux/scatterlist.h>

#include <trace/events/tcp.h>

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;

static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = {
	.bh_lock = INIT_LOCAL_LOCK(bh_lock),
};

static DEFINE_MUTEX(tcp_exit_batch_mutex);

static u32 tcp_v4_init_seq(const struct sk_buff *skb)
{
	return secure_tcp_seq(ip_hdr(skb)->daddr,
			      ip_hdr(skb)->saddr,
			      tcp_hdr(skb)->dest,
			      tcp_hdr(skb)->source);
}

static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
{
	return secure_tcp_ts_off(net,
				 ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
}

int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
	const struct inet_timewait_sock *tw = inet_twsk(sktw);
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);
	int ts_recent_stamp;
	u32 reuse_thresh;

	if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2)
		reuse = 0;

	if (reuse == 2) {
		/* Still does not detect *everything* that goes through
		 * lo, since we require a loopback src or dst address
		 * or direct binding to 'lo' interface.
		 */
		bool loopback = false;
		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
			loopback = true;
#if IS_ENABLED(CONFIG_IPV6)
		if (tw->tw_family == AF_INET6) {
			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
				loopback = true;
		} else
#endif
		{
			if (ipv4_is_loopback(tw->tw_daddr) ||
			    ipv4_is_loopback(tw->tw_rcv_saddr))
				loopback = true;
		}
		if (!loopback)
			reuse = 0;
	}

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp);
	reuse_thresh = READ_ONCE(tw->tw_entry_stamp) +
		       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse_delay);
	if (ts_recent_stamp &&
	    (!twp || (reuse && time_after32(tcp_clock_ms(), reuse_thresh)))) {
		/* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk
		 * and releasing the bucket lock.
		 */
		if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
			return 0;

		/* In case of repair and re-using TIME-WAIT sockets we still
		 * want to be sure that it is safe as above but honor the
		 * sequence numbers and time stamps set as part of the repair
		 * process.
		 *
		 * Without this check re-using a TIME-WAIT socket with TCP
		 * repair would accumulate a -1 on the repair assigned
		 * sequence number. The first time it is reused the sequence
		 * is -1, the second time -2, etc. This fixes that issue
		 * without appearing to create any others.
		 */
		if (likely(!tp->repair)) {
			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;

			if (!seq)
				seq = 1;
			WRITE_ONCE(tp->write_seq, seq);
			tp->rx_opt.ts_recent	   = READ_ONCE(tcptw->tw_ts_recent);
			tp->rx_opt.ts_recent_stamp = ts_recent_stamp;
		}

		return 1;
	}

	return 0;
}
EXPORT_IPV6_MOD_GPL(tcp_twsk_unique);

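/* Illustrative note, not upstream code: the reuse decision above is driven
 * by two sysctls. A hedged sketch of the administrator-visible knobs:
 *
 *	# 0 - never reuse TIME-WAIT port pairs for outgoing connections,
 *	# 1 - reuse when timestamps/PAWS make it safe,
 *	# 2 - like 1, but only for loopback traffic (the 'reuse == 2' branch).
 *	sysctl -w net.ipv4.tcp_tw_reuse=2
 *
 *	# extra age (in milliseconds, compared against tcp_clock_ms() above)
 *	# a TIME-WAIT bucket must reach before it may be reused:
 *	sysctl -w net.ipv4.tcp_tw_reuse_delay=1000
 */
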
static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
			      int addr_len)
{
	/* This check is replicated from tcp_v4_connect() and intended to
	 * prevent BPF program called below from accessing bytes that are out
	 * of the bound specified by user in addr_len.
	 */
	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	sock_owned_by_me(sk);

	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
}

/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_timewait_death_row *tcp_death_row;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct ip_options_rcu *inet_opt;
	struct net *net = sock_net(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
			      orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

	if (!inet->inet_saddr) {
		err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET);
		if (err) {
			ip_rt_put(rt);
			return err;
		}
	} else {
		sk_rcv_saddr_set(sk, inet->inet_saddr);
	}

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			WRITE_ONCE(tp->write_seq, 0);
	}

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = psp_sk_overhead(sk);
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len += inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and not releasing socket
	 * lock select source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst);
	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	rt = NULL;

	if (likely(!tp->repair)) {
		if (!tp->write_seq)
			WRITE_ONCE(tp->write_seq,
				   secure_tcp_seq(inet->inet_saddr,
						  inet->inet_daddr,
						  inet->inet_sport,
						  usin->sin_port));
		WRITE_ONCE(tp->tsoffset,
			   secure_tcp_ts_off(net, inet->inet_saddr,
					     inet->inet_daddr));
	}

	atomic_set(&inet->inet_id, get_random_u16());

	if (tcp_fastopen_defer_connect(sk, &err))
		return err;
	if (err)
		goto failure;

	err = tcp_connect(sk);

	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	inet_bhash2_reset_saddr(sk);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_IPV6_MOD(tcp_v4_connect);

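/* Minimal userspace sketch (illustrative only, not part of this file): a
 * connect(2) on an AF_INET/SOCK_STREAM socket is what ends up in
 * tcp_v4_connect() above; the destination below is from the documentation
 * address range and purely an example.
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *	};
 *	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *	if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0)
 *		perror("connect");
 */
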
/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct dst_entry *dst;
	u32 mtu;

	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
		return;
	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to go wrong... Remember the soft error
	 * in case this connection is not able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_IPV6_MOD(tcp_v4_mtu_reduced);

static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}

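/* Illustrative userspace sketch (not part of this file): the pmtudisc check
 * in tcp_v4_mtu_reduced() honours the per-socket IP_MTU_DISCOVER setting,
 * so an application can opt a socket out of fast-path MTU discovery:
 *
 *	int val = IP_PMTUDISC_DONT;
 *	setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER, &val, sizeof(val));
 */
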
/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);

	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	 */
	if (seq != tcp_rsk(req)->snt_isn) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
	} else if (abort) {
		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		tcp_listendrop(req->rsk_listener);
	}
	reqsk_put(req);
}
EXPORT_IPV6_MOD(tcp_req_err);

/* TCP-LD (RFC 6069) logic */
void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	s32 remaining;
	u32 delta_us;

	if (sock_owned_by_user(sk))
		return;

	if (seq != tp->snd_una || !icsk->icsk_retransmits ||
	    !icsk->icsk_backoff)
		return;

	skb = tcp_rtx_queue_head(sk);
	if (WARN_ON_ONCE(!skb))
		return;

	icsk->icsk_backoff--;
	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
	icsk->icsk_rto = inet_csk_rto_backoff(icsk, tcp_rto_max(sk));

	tcp_mstamp_refresh(tp);
	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);

	if (remaining > 0) {
		tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, remaining, false);
	} else {
		/* RTO revert clocked out retransmission.
		 * Will retransmit now.
		 */
		tcp_retransmit_timer(sk);
	}
}
EXPORT_IPV6_MOD(tcp_ld_RTO_revert);

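/* Worked example for the revert above (editorial sketch, the numbers are
 * assumptions, not measurements): with an srtt-derived RTO of 200ms and
 * icsk_backoff previously 3, the armed timer was 200ms << 3 = 1600ms.
 * After the decrement the backoff is 2, so icsk_rto becomes
 * min(200ms << 2, tcp_rto_max(sk)) = 800ms; if 300ms have already elapsed
 * since the head of the rtx queue was sent, the timer is re-armed for the
 * remaining 500ms, otherwise we retransmit immediately.
 */
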
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

int tcp_v4_err(struct sk_buff *skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct net *net = dev_net_rcu(skb->dev);
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct request_sock *fastopen;
	struct tcp_sock *tp;
	u32 seq, snd_una;
	struct sock *sk;
	int err;

	sk = __inet_lookup_established(net, iph->daddr, th->dest, iph->saddr,
				       ntohs(th->source), inet_iif(skb), 0);
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return -ENOENT;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		/* To increase the counter of ignored icmps for TCP-AO */
		tcp_ao_ignore_icmp(sk, AF_INET, type, code);
		inet_twsk_put(inet_twsk(sk));
		return 0;
	}
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
				     type == ICMP_TIME_EXCEEDED ||
				     (type == ICMP_DEST_UNREACH &&
				      (code == ICMP_NET_UNREACH ||
				       code == ICMP_HOST_UNREACH)));
		return 0;
	}

	if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) {
		sock_put(sk);
		return 0;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of the PMTU discovery (RFC1191) special case:
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (static_branch_unlikely(&ip4_min_ttl)) {
		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
			goto out;
		}
	}

	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = rcu_dereference(tp->fastopen_rsk);
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		if (!sock_owned_by_user(sk))
			do_redirect(skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs sent out by Linux are always < 576 bytes,
			 * so they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			WRITE_ONCE(tp->mtu_info, info);
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if this ICMP message allows revert of backoff.
		 * (see RFC 6069)
		 */
		if (!fastopen &&
		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
			tcp_ld_RTO_revert(sk, seq);
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket is
		 * already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);

		if (!sock_owned_by_user(sk))
			tcp_done_with_error(sk, err);
		else
			WRITE_ONCE(sk->sk_err_soft, err);
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows considering only PROTO_UNREACH and
	 * PORT_UNREACH as hard errors (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in the modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters, even these two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
648 * --ANK (980905) 649 */ 650 651 if (!sock_owned_by_user(sk) && 652 inet_test_bit(RECVERR, sk)) { 653 WRITE_ONCE(sk->sk_err, err); 654 sk_error_report(sk); 655 } else { /* Only an error on timeout */ 656 WRITE_ONCE(sk->sk_err_soft, err); 657 } 658 659 out: 660 bh_unlock_sock(sk); 661 sock_put(sk); 662 return 0; 663 } 664 665 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr) 666 { 667 struct tcphdr *th = tcp_hdr(skb); 668 669 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0); 670 skb->csum_start = skb_transport_header(skb) - skb->head; 671 skb->csum_offset = offsetof(struct tcphdr, check); 672 } 673 674 /* This routine computes an IPv4 TCP checksum. */ 675 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb) 676 { 677 const struct inet_sock *inet = inet_sk(sk); 678 679 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr); 680 } 681 EXPORT_IPV6_MOD(tcp_v4_send_check); 682 683 #define REPLY_OPTIONS_LEN (MAX_TCP_OPTION_SPACE / sizeof(__be32)) 684 685 static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb, 686 const struct tcp_ao_hdr *aoh, 687 struct ip_reply_arg *arg, struct tcphdr *reply, 688 __be32 reply_options[REPLY_OPTIONS_LEN]) 689 { 690 #ifdef CONFIG_TCP_AO 691 int sdif = tcp_v4_sdif(skb); 692 int dif = inet_iif(skb); 693 int l3index = sdif ? dif : 0; 694 bool allocated_traffic_key; 695 struct tcp_ao_key *key; 696 char *traffic_key; 697 bool drop = true; 698 u32 ao_sne = 0; 699 u8 keyid; 700 701 rcu_read_lock(); 702 if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq), 703 &key, &traffic_key, &allocated_traffic_key, 704 &keyid, &ao_sne)) 705 goto out; 706 707 reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) | 708 (aoh->rnext_keyid << 8) | keyid); 709 arg->iov[0].iov_len += tcp_ao_len_aligned(key); 710 reply->doff = arg->iov[0].iov_len / 4; 711 712 if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1], 713 key, traffic_key, 714 (union tcp_ao_addr *)&ip_hdr(skb)->saddr, 715 (union tcp_ao_addr *)&ip_hdr(skb)->daddr, 716 reply, ao_sne)) 717 goto out; 718 drop = false; 719 out: 720 rcu_read_unlock(); 721 if (allocated_traffic_key) 722 kfree(traffic_key); 723 return drop; 724 #else 725 return true; 726 #endif 727 } 728 729 /* 730 * This routine will send an RST to the other tcp. 731 * 732 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.) 733 * for reset. 734 * Answer: if a packet caused RST, it is not for a socket 735 * existing in our system, if it is matched to a socket, 736 * it is just duplicate segment or bug in other side's TCP. 737 * So that we build reply only basing on parameters 738 * arrived with segment. 739 * Exception: precedence violation. We do not implement it in any case. 740 */ 741 742 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb, 743 enum sk_rst_reason reason) 744 { 745 const struct tcphdr *th = tcp_hdr(skb); 746 struct { 747 struct tcphdr th; 748 __be32 opt[REPLY_OPTIONS_LEN]; 749 } rep; 750 const __u8 *md5_hash_location = NULL; 751 const struct tcp_ao_hdr *aoh; 752 struct ip_reply_arg arg; 753 #ifdef CONFIG_TCP_MD5SIG 754 struct tcp_md5sig_key *key = NULL; 755 unsigned char newhash[16]; 756 struct sock *sk1 = NULL; 757 int genhash; 758 #endif 759 u64 transmit_time = 0; 760 struct sock *ctl_sk; 761 struct net *net; 762 u32 txhash = 0; 763 764 /* Never send a reset in response to a reset. */ 765 if (th->rst) 766 return; 767 768 /* If sk not NULL, it means we did a successful lookup and incoming 769 * route had to be correct. 
prequeue might have dropped our dst. 770 */ 771 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL) 772 return; 773 774 /* Swap the send and the receive. */ 775 memset(&rep, 0, sizeof(rep)); 776 rep.th.dest = th->source; 777 rep.th.source = th->dest; 778 rep.th.doff = sizeof(struct tcphdr) / 4; 779 rep.th.rst = 1; 780 781 if (th->ack) { 782 rep.th.seq = th->ack_seq; 783 } else { 784 rep.th.ack = 1; 785 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin + 786 skb->len - (th->doff << 2)); 787 } 788 789 memset(&arg, 0, sizeof(arg)); 790 arg.iov[0].iov_base = (unsigned char *)&rep; 791 arg.iov[0].iov_len = sizeof(rep.th); 792 793 net = sk ? sock_net(sk) : skb_dst_dev_net_rcu(skb); 794 795 /* Invalid TCP option size or twice included auth */ 796 if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh)) 797 return; 798 799 if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt)) 800 return; 801 802 #ifdef CONFIG_TCP_MD5SIG 803 rcu_read_lock(); 804 if (sk && sk_fullsock(sk)) { 805 const union tcp_md5_addr *addr; 806 int l3index; 807 808 /* sdif set, means packet ingressed via a device 809 * in an L3 domain and inet_iif is set to it. 810 */ 811 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; 812 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 813 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 814 } else if (md5_hash_location) { 815 const union tcp_md5_addr *addr; 816 int sdif = tcp_v4_sdif(skb); 817 int dif = inet_iif(skb); 818 int l3index; 819 820 /* 821 * active side is lost. Try to find listening socket through 822 * source port, and then find md5 key through listening socket. 823 * we are not loose security here: 824 * Incoming packet is checked with md5 hash with finding key, 825 * no RST generated if md5 hash doesn't match. 826 */ 827 sk1 = __inet_lookup_listener(net, NULL, 0, ip_hdr(skb)->saddr, 828 th->source, ip_hdr(skb)->daddr, 829 ntohs(th->source), dif, sdif); 830 /* don't send rst if it can't find key */ 831 if (!sk1) 832 goto out; 833 834 /* sdif set, means packet ingressed via a device 835 * in an L3 domain and dif is set to it. 836 */ 837 l3index = sdif ? dif : 0; 838 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 839 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET); 840 if (!key) 841 goto out; 842 843 844 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb); 845 if (genhash || memcmp(md5_hash_location, newhash, 16) != 0) 846 goto out; 847 848 } 849 850 if (key) { 851 rep.opt[0] = htonl((TCPOPT_NOP << 24) | 852 (TCPOPT_NOP << 16) | 853 (TCPOPT_MD5SIG << 8) | 854 TCPOLEN_MD5SIG); 855 /* Update length and the length the header thinks exists */ 856 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; 857 rep.th.doff = arg.iov[0].iov_len / 4; 858 859 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1], 860 key, ip_hdr(skb)->saddr, 861 ip_hdr(skb)->daddr, &rep.th); 862 } 863 #endif 864 /* Can't co-exist with TCPMD5, hence check rep.opt[0] */ 865 if (rep.opt[0] == 0) { 866 __be32 mrst = mptcp_reset_option(skb); 867 868 if (mrst) { 869 rep.opt[0] = mrst; 870 arg.iov[0].iov_len += sizeof(mrst); 871 rep.th.doff = arg.iov[0].iov_len / 4; 872 } 873 } 874 875 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 876 ip_hdr(skb)->saddr, /* XXX */ 877 arg.iov[0].iov_len, IPPROTO_TCP, 0); 878 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 879 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0; 880 881 /* When socket is gone, all binding information is lost. 882 * routing might fail in this case. 
No choice here, if we choose to force 883 * input interface, we will misroute in case of asymmetric route. 884 */ 885 if (sk) 886 arg.bound_dev_if = sk->sk_bound_dev_if; 887 888 trace_tcp_send_reset(sk, skb, reason); 889 890 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) != 891 offsetof(struct inet_timewait_sock, tw_bound_dev_if)); 892 893 /* ECN bits of TW reset are cleared */ 894 arg.tos = ip_hdr(skb)->tos & ~INET_ECN_MASK; 895 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); 896 local_bh_disable(); 897 local_lock_nested_bh(&ipv4_tcp_sk.bh_lock); 898 ctl_sk = this_cpu_read(ipv4_tcp_sk.sock); 899 900 sock_net_set(ctl_sk, net); 901 if (sk) { 902 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 903 inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark); 904 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? 905 inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority); 906 transmit_time = tcp_transmit_time(sk); 907 xfrm_sk_clone_policy(ctl_sk, sk); 908 txhash = (sk->sk_state == TCP_TIME_WAIT) ? 909 inet_twsk(sk)->tw_txhash : sk->sk_txhash; 910 } else { 911 ctl_sk->sk_mark = 0; 912 ctl_sk->sk_priority = 0; 913 } 914 ip_send_unicast_reply(ctl_sk, sk, 915 skb, &TCP_SKB_CB(skb)->header.h4.opt, 916 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 917 &arg, arg.iov[0].iov_len, 918 transmit_time, txhash); 919 920 xfrm_sk_free_policy(ctl_sk); 921 sock_net_set(ctl_sk, &init_net); 922 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); 923 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS); 924 local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock); 925 local_bh_enable(); 926 927 #ifdef CONFIG_TCP_MD5SIG 928 out: 929 rcu_read_unlock(); 930 #endif 931 } 932 933 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states 934 outside socket context is ugly, certainly. What can I do? 935 */ 936 937 static void tcp_v4_send_ack(const struct sock *sk, 938 struct sk_buff *skb, u32 seq, u32 ack, 939 u32 win, u32 tsval, u32 tsecr, int oif, 940 struct tcp_key *key, 941 int reply_flags, u8 tos, u32 txhash) 942 { 943 const struct tcphdr *th = tcp_hdr(skb); 944 struct { 945 struct tcphdr th; 946 __be32 opt[(MAX_TCP_OPTION_SPACE >> 2)]; 947 } rep; 948 struct net *net = sock_net(sk); 949 struct ip_reply_arg arg; 950 struct sock *ctl_sk; 951 u64 transmit_time; 952 953 memset(&rep.th, 0, sizeof(struct tcphdr)); 954 memset(&arg, 0, sizeof(arg)); 955 956 arg.iov[0].iov_base = (unsigned char *)&rep; 957 arg.iov[0].iov_len = sizeof(rep.th); 958 if (tsecr) { 959 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | 960 (TCPOPT_TIMESTAMP << 8) | 961 TCPOLEN_TIMESTAMP); 962 rep.opt[1] = htonl(tsval); 963 rep.opt[2] = htonl(tsecr); 964 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED; 965 } 966 967 /* Swap the send and the receive. */ 968 rep.th.dest = th->source; 969 rep.th.source = th->dest; 970 rep.th.doff = arg.iov[0].iov_len / 4; 971 rep.th.seq = htonl(seq); 972 rep.th.ack_seq = htonl(ack); 973 rep.th.ack = 1; 974 rep.th.window = htons(win); 975 976 #ifdef CONFIG_TCP_MD5SIG 977 if (tcp_key_is_md5(key)) { 978 int offset = (tsecr) ? 3 : 0; 979 980 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) | 981 (TCPOPT_NOP << 16) | 982 (TCPOPT_MD5SIG << 8) | 983 TCPOLEN_MD5SIG); 984 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; 985 rep.th.doff = arg.iov[0].iov_len/4; 986 987 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset], 988 key->md5_key, ip_hdr(skb)->saddr, 989 ip_hdr(skb)->daddr, &rep.th); 990 } 991 #endif 992 #ifdef CONFIG_TCP_AO 993 if (tcp_key_is_ao(key)) { 994 int offset = (tsecr) ? 
3 : 0; 995 996 rep.opt[offset++] = htonl((TCPOPT_AO << 24) | 997 (tcp_ao_len(key->ao_key) << 16) | 998 (key->ao_key->sndid << 8) | 999 key->rcv_next); 1000 arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key); 1001 rep.th.doff = arg.iov[0].iov_len / 4; 1002 1003 tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset], 1004 key->ao_key, key->traffic_key, 1005 (union tcp_ao_addr *)&ip_hdr(skb)->saddr, 1006 (union tcp_ao_addr *)&ip_hdr(skb)->daddr, 1007 &rep.th, key->sne); 1008 } 1009 #endif 1010 arg.flags = reply_flags; 1011 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 1012 ip_hdr(skb)->saddr, /* XXX */ 1013 arg.iov[0].iov_len, IPPROTO_TCP, 0); 1014 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 1015 if (oif) 1016 arg.bound_dev_if = oif; 1017 arg.tos = tos; 1018 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL); 1019 local_bh_disable(); 1020 local_lock_nested_bh(&ipv4_tcp_sk.bh_lock); 1021 ctl_sk = this_cpu_read(ipv4_tcp_sk.sock); 1022 sock_net_set(ctl_sk, net); 1023 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 1024 inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark); 1025 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? 1026 inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority); 1027 transmit_time = tcp_transmit_time(sk); 1028 ip_send_unicast_reply(ctl_sk, sk, 1029 skb, &TCP_SKB_CB(skb)->header.h4.opt, 1030 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 1031 &arg, arg.iov[0].iov_len, 1032 transmit_time, txhash); 1033 1034 sock_net_set(ctl_sk, &init_net); 1035 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); 1036 local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock); 1037 local_bh_enable(); 1038 } 1039 1040 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb, 1041 enum tcp_tw_status tw_status) 1042 { 1043 struct inet_timewait_sock *tw = inet_twsk(sk); 1044 struct tcp_timewait_sock *tcptw = tcp_twsk(sk); 1045 struct tcp_key key = {}; 1046 u8 tos = tw->tw_tos; 1047 1048 /* Cleaning only ECN bits of TW ACKs of oow data or is paws_reject, 1049 * while not cleaning ECN bits of other TW ACKs to avoid these ACKs 1050 * being placed in a different service queues (Classic rather than L4S) 1051 */ 1052 if (tw_status == TCP_TW_ACK_OOW) 1053 tos &= ~INET_ECN_MASK; 1054 1055 #ifdef CONFIG_TCP_AO 1056 struct tcp_ao_info *ao_info; 1057 1058 if (static_branch_unlikely(&tcp_ao_needed.key)) { 1059 /* FIXME: the segment to-be-acked is not verified yet */ 1060 ao_info = rcu_dereference(tcptw->ao_info); 1061 if (ao_info) { 1062 const struct tcp_ao_hdr *aoh; 1063 1064 if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) { 1065 inet_twsk_put(tw); 1066 return; 1067 } 1068 1069 if (aoh) 1070 key.ao_key = tcp_ao_established_key(sk, ao_info, 1071 aoh->rnext_keyid, -1); 1072 } 1073 } 1074 if (key.ao_key) { 1075 struct tcp_ao_key *rnext_key; 1076 1077 key.traffic_key = snd_other_key(key.ao_key); 1078 key.sne = READ_ONCE(ao_info->snd_sne); 1079 rnext_key = READ_ONCE(ao_info->rnext_key); 1080 key.rcv_next = rnext_key->rcvid; 1081 key.type = TCP_KEY_AO; 1082 #else 1083 if (0) { 1084 #endif 1085 } else if (static_branch_tcp_md5()) { 1086 key.md5_key = tcp_twsk_md5_key(tcptw); 1087 if (key.md5_key) 1088 key.type = TCP_KEY_MD5; 1089 } 1090 1091 tcp_v4_send_ack(sk, skb, 1092 tcptw->tw_snd_nxt, READ_ONCE(tcptw->tw_rcv_nxt), 1093 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, 1094 tcp_tw_tsval(tcptw), 1095 READ_ONCE(tcptw->tw_ts_recent), 1096 tw->tw_bound_dev_if, &key, 1097 tw->tw_transparent ? 
IP_REPLY_ARG_NOSRCCHECK : 0, 1098 tos, 1099 tw->tw_txhash); 1100 1101 inet_twsk_put(tw); 1102 } 1103 1104 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, 1105 struct request_sock *req) 1106 { 1107 struct tcp_key key = {}; 1108 1109 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV 1110 * sk->sk_state == TCP_SYN_RECV -> for Fast Open. 1111 */ 1112 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 : 1113 tcp_sk(sk)->snd_nxt; 1114 1115 #ifdef CONFIG_TCP_AO 1116 if (static_branch_unlikely(&tcp_ao_needed.key) && 1117 tcp_rsk_used_ao(req)) { 1118 const union tcp_md5_addr *addr; 1119 const struct tcp_ao_hdr *aoh; 1120 int l3index; 1121 1122 /* Invalid TCP option size or twice included auth */ 1123 if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) 1124 return; 1125 if (!aoh) 1126 return; 1127 1128 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 1129 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; 1130 key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, 1131 aoh->rnext_keyid, -1); 1132 if (unlikely(!key.ao_key)) { 1133 /* Send ACK with any matching MKT for the peer */ 1134 key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1); 1135 /* Matching key disappeared (user removed the key?) 1136 * let the handshake timeout. 1137 */ 1138 if (!key.ao_key) { 1139 net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n", 1140 addr, 1141 ntohs(tcp_hdr(skb)->source), 1142 &ip_hdr(skb)->daddr, 1143 ntohs(tcp_hdr(skb)->dest)); 1144 return; 1145 } 1146 } 1147 key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC); 1148 if (!key.traffic_key) 1149 return; 1150 1151 key.type = TCP_KEY_AO; 1152 key.rcv_next = aoh->keyid; 1153 tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req); 1154 #else 1155 if (0) { 1156 #endif 1157 } else if (static_branch_tcp_md5()) { 1158 const union tcp_md5_addr *addr; 1159 int l3index; 1160 1161 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 1162 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; 1163 key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1164 if (key.md5_key) 1165 key.type = TCP_KEY_MD5; 1166 } 1167 1168 /* Cleaning ECN bits of TW ACKs of oow data or is paws_reject */ 1169 tcp_v4_send_ack(sk, skb, seq, 1170 tcp_rsk(req)->rcv_nxt, 1171 tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale, 1172 tcp_rsk_tsval(tcp_rsk(req)), 1173 req->ts_recent, 1174 0, &key, 1175 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0, 1176 ip_hdr(skb)->tos & ~INET_ECN_MASK, 1177 READ_ONCE(tcp_rsk(req)->txhash)); 1178 if (tcp_key_is_ao(&key)) 1179 kfree(key.traffic_key); 1180 } 1181 1182 /* 1183 * Send a SYN-ACK after having received a SYN. 1184 * This still operates on a request_sock only, not on a big 1185 * socket. 1186 */ 1187 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, 1188 struct flowi *fl, 1189 struct request_sock *req, 1190 struct tcp_fastopen_cookie *foc, 1191 enum tcp_synack_type synack_type, 1192 struct sk_buff *syn_skb) 1193 { 1194 struct inet_request_sock *ireq = inet_rsk(req); 1195 struct flowi4 fl4; 1196 int err = -1; 1197 struct sk_buff *skb; 1198 u8 tos; 1199 1200 /* First, grab a route. 
*/ 1201 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) 1202 return -1; 1203 1204 skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb); 1205 1206 if (skb) { 1207 tcp_rsk(req)->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK; 1208 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); 1209 1210 tos = READ_ONCE(inet_sk(sk)->tos); 1211 1212 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)) 1213 tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) | 1214 (tos & INET_ECN_MASK); 1215 1216 if (!INET_ECN_is_capable(tos) && 1217 tcp_bpf_ca_needs_ecn((struct sock *)req)) 1218 tos |= INET_ECN_ECT_0; 1219 1220 rcu_read_lock(); 1221 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, 1222 ireq->ir_rmt_addr, 1223 rcu_dereference(ireq->ireq_opt), 1224 tos); 1225 rcu_read_unlock(); 1226 err = net_xmit_eval(err); 1227 } 1228 1229 return err; 1230 } 1231 1232 /* 1233 * IPv4 request_sock destructor. 1234 */ 1235 static void tcp_v4_reqsk_destructor(struct request_sock *req) 1236 { 1237 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1)); 1238 } 1239 1240 #ifdef CONFIG_TCP_MD5SIG 1241 /* 1242 * RFC2385 MD5 checksumming requires a mapping of 1243 * IP address->MD5 Key. 1244 * We need to maintain these in the sk structure. 1245 */ 1246 1247 DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ); 1248 EXPORT_IPV6_MOD(tcp_md5_needed); 1249 1250 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new) 1251 { 1252 if (!old) 1253 return true; 1254 1255 /* l3index always overrides non-l3index */ 1256 if (old->l3index && new->l3index == 0) 1257 return false; 1258 if (old->l3index == 0 && new->l3index) 1259 return true; 1260 1261 return old->prefixlen < new->prefixlen; 1262 } 1263 1264 /* Find the Key structure for an address. 
*/ 1265 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index, 1266 const union tcp_md5_addr *addr, 1267 int family, bool any_l3index) 1268 { 1269 const struct tcp_sock *tp = tcp_sk(sk); 1270 struct tcp_md5sig_key *key; 1271 const struct tcp_md5sig_info *md5sig; 1272 __be32 mask; 1273 struct tcp_md5sig_key *best_match = NULL; 1274 bool match; 1275 1276 /* caller either holds rcu_read_lock() or socket lock */ 1277 md5sig = rcu_dereference_check(tp->md5sig_info, 1278 lockdep_sock_is_held(sk)); 1279 if (!md5sig) 1280 return NULL; 1281 1282 hlist_for_each_entry_rcu(key, &md5sig->head, node, 1283 lockdep_sock_is_held(sk)) { 1284 if (key->family != family) 1285 continue; 1286 if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX && 1287 key->l3index != l3index) 1288 continue; 1289 if (family == AF_INET) { 1290 mask = inet_make_mask(key->prefixlen); 1291 match = (key->addr.a4.s_addr & mask) == 1292 (addr->a4.s_addr & mask); 1293 #if IS_ENABLED(CONFIG_IPV6) 1294 } else if (family == AF_INET6) { 1295 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6, 1296 key->prefixlen); 1297 #endif 1298 } else { 1299 match = false; 1300 } 1301 1302 if (match && better_md5_match(best_match, key)) 1303 best_match = key; 1304 } 1305 return best_match; 1306 } 1307 EXPORT_IPV6_MOD(__tcp_md5_do_lookup); 1308 1309 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk, 1310 const union tcp_md5_addr *addr, 1311 int family, u8 prefixlen, 1312 int l3index, u8 flags) 1313 { 1314 const struct tcp_sock *tp = tcp_sk(sk); 1315 struct tcp_md5sig_key *key; 1316 unsigned int size = sizeof(struct in_addr); 1317 const struct tcp_md5sig_info *md5sig; 1318 1319 /* caller either holds rcu_read_lock() or socket lock */ 1320 md5sig = rcu_dereference_check(tp->md5sig_info, 1321 lockdep_sock_is_held(sk)); 1322 if (!md5sig) 1323 return NULL; 1324 #if IS_ENABLED(CONFIG_IPV6) 1325 if (family == AF_INET6) 1326 size = sizeof(struct in6_addr); 1327 #endif 1328 hlist_for_each_entry_rcu(key, &md5sig->head, node, 1329 lockdep_sock_is_held(sk)) { 1330 if (key->family != family) 1331 continue; 1332 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX)) 1333 continue; 1334 if (key->l3index != l3index) 1335 continue; 1336 if (!memcmp(&key->addr, addr, size) && 1337 key->prefixlen == prefixlen) 1338 return key; 1339 } 1340 return NULL; 1341 } 1342 1343 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, 1344 const struct sock *addr_sk) 1345 { 1346 const union tcp_md5_addr *addr; 1347 int l3index; 1348 1349 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), 1350 addr_sk->sk_bound_dev_if); 1351 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr; 1352 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1353 } 1354 EXPORT_IPV6_MOD(tcp_v4_md5_lookup); 1355 1356 static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp) 1357 { 1358 struct tcp_sock *tp = tcp_sk(sk); 1359 struct tcp_md5sig_info *md5sig; 1360 1361 md5sig = kmalloc(sizeof(*md5sig), gfp); 1362 if (!md5sig) 1363 return -ENOMEM; 1364 1365 sk_gso_disable(sk); 1366 INIT_HLIST_HEAD(&md5sig->head); 1367 rcu_assign_pointer(tp->md5sig_info, md5sig); 1368 return 0; 1369 } 1370 1371 /* This can be called on a newly created socket, from other files */ 1372 static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, 1373 int family, u8 prefixlen, int l3index, u8 flags, 1374 const u8 *newkey, u8 newkeylen, gfp_t gfp) 1375 { 1376 /* Add Key to the list */ 1377 struct tcp_md5sig_key *key; 1378 struct 
tcp_sock *tp = tcp_sk(sk); 1379 struct tcp_md5sig_info *md5sig; 1380 1381 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags); 1382 if (key) { 1383 /* Pre-existing entry - just update that one. 1384 * Note that the key might be used concurrently. 1385 * data_race() is telling kcsan that we do not care of 1386 * key mismatches, since changing MD5 key on live flows 1387 * can lead to packet drops. 1388 */ 1389 data_race(memcpy(key->key, newkey, newkeylen)); 1390 1391 /* Pairs with READ_ONCE() in tcp_md5_hash_key(). 1392 * Also note that a reader could catch new key->keylen value 1393 * but old key->key[], this is the reason we use __GFP_ZERO 1394 * at sock_kmalloc() time below these lines. 1395 */ 1396 WRITE_ONCE(key->keylen, newkeylen); 1397 1398 return 0; 1399 } 1400 1401 md5sig = rcu_dereference_protected(tp->md5sig_info, 1402 lockdep_sock_is_held(sk)); 1403 1404 key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO); 1405 if (!key) 1406 return -ENOMEM; 1407 1408 memcpy(key->key, newkey, newkeylen); 1409 key->keylen = newkeylen; 1410 key->family = family; 1411 key->prefixlen = prefixlen; 1412 key->l3index = l3index; 1413 key->flags = flags; 1414 memcpy(&key->addr, addr, 1415 (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) : 1416 sizeof(struct in_addr)); 1417 hlist_add_head_rcu(&key->node, &md5sig->head); 1418 return 0; 1419 } 1420 1421 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, 1422 int family, u8 prefixlen, int l3index, u8 flags, 1423 const u8 *newkey, u8 newkeylen) 1424 { 1425 struct tcp_sock *tp = tcp_sk(sk); 1426 1427 if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) { 1428 if (tcp_md5_alloc_sigpool()) 1429 return -ENOMEM; 1430 1431 if (tcp_md5sig_info_add(sk, GFP_KERNEL)) { 1432 tcp_md5_release_sigpool(); 1433 return -ENOMEM; 1434 } 1435 1436 if (!static_branch_inc(&tcp_md5_needed.key)) { 1437 struct tcp_md5sig_info *md5sig; 1438 1439 md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk)); 1440 rcu_assign_pointer(tp->md5sig_info, NULL); 1441 kfree_rcu(md5sig, rcu); 1442 tcp_md5_release_sigpool(); 1443 return -EUSERS; 1444 } 1445 } 1446 1447 return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags, 1448 newkey, newkeylen, GFP_KERNEL); 1449 } 1450 EXPORT_IPV6_MOD(tcp_md5_do_add); 1451 1452 int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr, 1453 int family, u8 prefixlen, int l3index, 1454 struct tcp_md5sig_key *key) 1455 { 1456 struct tcp_sock *tp = tcp_sk(sk); 1457 1458 if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) { 1459 tcp_md5_add_sigpool(); 1460 1461 if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC))) { 1462 tcp_md5_release_sigpool(); 1463 return -ENOMEM; 1464 } 1465 1466 if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) { 1467 struct tcp_md5sig_info *md5sig; 1468 1469 md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk)); 1470 net_warn_ratelimited("Too many TCP-MD5 keys in the system\n"); 1471 rcu_assign_pointer(tp->md5sig_info, NULL); 1472 kfree_rcu(md5sig, rcu); 1473 tcp_md5_release_sigpool(); 1474 return -EUSERS; 1475 } 1476 } 1477 1478 return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, 1479 key->flags, key->key, key->keylen, 1480 sk_gfp_mask(sk, GFP_ATOMIC)); 1481 } 1482 EXPORT_IPV6_MOD(tcp_md5_key_copy); 1483 1484 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family, 1485 u8 prefixlen, int l3index, u8 flags) 1486 { 
1487 struct tcp_md5sig_key *key; 1488 1489 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags); 1490 if (!key) 1491 return -ENOENT; 1492 hlist_del_rcu(&key->node); 1493 atomic_sub(sizeof(*key), &sk->sk_omem_alloc); 1494 kfree_rcu(key, rcu); 1495 return 0; 1496 } 1497 EXPORT_IPV6_MOD(tcp_md5_do_del); 1498 1499 void tcp_clear_md5_list(struct sock *sk) 1500 { 1501 struct tcp_sock *tp = tcp_sk(sk); 1502 struct tcp_md5sig_key *key; 1503 struct hlist_node *n; 1504 struct tcp_md5sig_info *md5sig; 1505 1506 md5sig = rcu_dereference_protected(tp->md5sig_info, 1); 1507 1508 hlist_for_each_entry_safe(key, n, &md5sig->head, node) { 1509 hlist_del(&key->node); 1510 atomic_sub(sizeof(*key), &sk->sk_omem_alloc); 1511 kfree(key); 1512 } 1513 } 1514 1515 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, 1516 sockptr_t optval, int optlen) 1517 { 1518 struct tcp_md5sig cmd; 1519 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr; 1520 const union tcp_md5_addr *addr; 1521 u8 prefixlen = 32; 1522 int l3index = 0; 1523 bool l3flag; 1524 u8 flags; 1525 1526 if (optlen < sizeof(cmd)) 1527 return -EINVAL; 1528 1529 if (copy_from_sockptr(&cmd, optval, sizeof(cmd))) 1530 return -EFAULT; 1531 1532 if (sin->sin_family != AF_INET) 1533 return -EINVAL; 1534 1535 flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX; 1536 l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX; 1537 1538 if (optname == TCP_MD5SIG_EXT && 1539 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) { 1540 prefixlen = cmd.tcpm_prefixlen; 1541 if (prefixlen > 32) 1542 return -EINVAL; 1543 } 1544 1545 if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex && 1546 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) { 1547 struct net_device *dev; 1548 1549 rcu_read_lock(); 1550 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex); 1551 if (dev && netif_is_l3_master(dev)) 1552 l3index = dev->ifindex; 1553 1554 rcu_read_unlock(); 1555 1556 /* ok to reference set/not set outside of rcu; 1557 * right now device MUST be an L3 master 1558 */ 1559 if (!dev || !l3index) 1560 return -EINVAL; 1561 } 1562 1563 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr; 1564 1565 if (!cmd.tcpm_keylen) 1566 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags); 1567 1568 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) 1569 return -EINVAL; 1570 1571 /* Don't allow keys for peers that have a matching TCP-AO key. 1572 * See the comment in tcp_ao_add_cmd() 1573 */ 1574 if (tcp_ao_required(sk, addr, AF_INET, l3flag ? 
l3index : -1, false)) 1575 return -EKEYREJECTED; 1576 1577 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags, 1578 cmd.tcpm_key, cmd.tcpm_keylen); 1579 } 1580 1581 static int tcp_v4_md5_hash_headers(struct tcp_sigpool *hp, 1582 __be32 daddr, __be32 saddr, 1583 const struct tcphdr *th, int nbytes) 1584 { 1585 struct tcp4_pseudohdr *bp; 1586 struct scatterlist sg; 1587 struct tcphdr *_th; 1588 1589 bp = hp->scratch; 1590 bp->saddr = saddr; 1591 bp->daddr = daddr; 1592 bp->pad = 0; 1593 bp->protocol = IPPROTO_TCP; 1594 bp->len = cpu_to_be16(nbytes); 1595 1596 _th = (struct tcphdr *)(bp + 1); 1597 memcpy(_th, th, sizeof(*th)); 1598 _th->check = 0; 1599 1600 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th)); 1601 ahash_request_set_crypt(hp->req, &sg, NULL, 1602 sizeof(*bp) + sizeof(*th)); 1603 return crypto_ahash_update(hp->req); 1604 } 1605 1606 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, 1607 __be32 daddr, __be32 saddr, const struct tcphdr *th) 1608 { 1609 struct tcp_sigpool hp; 1610 1611 if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp)) 1612 goto clear_hash_nostart; 1613 1614 if (crypto_ahash_init(hp.req)) 1615 goto clear_hash; 1616 if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, th->doff << 2)) 1617 goto clear_hash; 1618 if (tcp_md5_hash_key(&hp, key)) 1619 goto clear_hash; 1620 ahash_request_set_crypt(hp.req, NULL, md5_hash, 0); 1621 if (crypto_ahash_final(hp.req)) 1622 goto clear_hash; 1623 1624 tcp_sigpool_end(&hp); 1625 return 0; 1626 1627 clear_hash: 1628 tcp_sigpool_end(&hp); 1629 clear_hash_nostart: 1630 memset(md5_hash, 0, 16); 1631 return 1; 1632 } 1633 1634 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, 1635 const struct sock *sk, 1636 const struct sk_buff *skb) 1637 { 1638 const struct tcphdr *th = tcp_hdr(skb); 1639 struct tcp_sigpool hp; 1640 __be32 saddr, daddr; 1641 1642 if (sk) { /* valid for establish/request sockets */ 1643 saddr = sk->sk_rcv_saddr; 1644 daddr = sk->sk_daddr; 1645 } else { 1646 const struct iphdr *iph = ip_hdr(skb); 1647 saddr = iph->saddr; 1648 daddr = iph->daddr; 1649 } 1650 1651 if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp)) 1652 goto clear_hash_nostart; 1653 1654 if (crypto_ahash_init(hp.req)) 1655 goto clear_hash; 1656 1657 if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, skb->len)) 1658 goto clear_hash; 1659 if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2)) 1660 goto clear_hash; 1661 if (tcp_md5_hash_key(&hp, key)) 1662 goto clear_hash; 1663 ahash_request_set_crypt(hp.req, NULL, md5_hash, 0); 1664 if (crypto_ahash_final(hp.req)) 1665 goto clear_hash; 1666 1667 tcp_sigpool_end(&hp); 1668 return 0; 1669 1670 clear_hash: 1671 tcp_sigpool_end(&hp); 1672 clear_hash_nostart: 1673 memset(md5_hash, 0, 16); 1674 return 1; 1675 } 1676 EXPORT_IPV6_MOD(tcp_v4_md5_hash_skb); 1677 1678 #endif 1679 1680 static void tcp_v4_init_req(struct request_sock *req, 1681 const struct sock *sk_listener, 1682 struct sk_buff *skb) 1683 { 1684 struct inet_request_sock *ireq = inet_rsk(req); 1685 struct net *net = sock_net(sk_listener); 1686 1687 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); 1688 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); 1689 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb)); 1690 } 1691 1692 static struct dst_entry *tcp_v4_route_req(const struct sock *sk, 1693 struct sk_buff *skb, 1694 struct flowi *fl, 1695 struct request_sock *req, 1696 u32 tw_isn) 1697 { 1698 tcp_v4_init_req(req, sk, skb); 1699 1700 if (security_inet_conn_request(sk, skb, 
req)) 1701 return NULL; 1702 1703 return inet_csk_route_req(sk, &fl->u.ip4, req); 1704 } 1705 1706 struct request_sock_ops tcp_request_sock_ops __read_mostly = { 1707 .family = PF_INET, 1708 .obj_size = sizeof(struct tcp_request_sock), 1709 .send_ack = tcp_v4_reqsk_send_ack, 1710 .destructor = tcp_v4_reqsk_destructor, 1711 .send_reset = tcp_v4_send_reset, 1712 .syn_ack_timeout = tcp_syn_ack_timeout, 1713 }; 1714 1715 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { 1716 .mss_clamp = TCP_MSS_DEFAULT, 1717 #ifdef CONFIG_TCP_MD5SIG 1718 .req_md5_lookup = tcp_v4_md5_lookup, 1719 .calc_md5_hash = tcp_v4_md5_hash_skb, 1720 #endif 1721 #ifdef CONFIG_TCP_AO 1722 .ao_lookup = tcp_v4_ao_lookup_rsk, 1723 .ao_calc_key = tcp_v4_ao_calc_key_rsk, 1724 .ao_synack_hash = tcp_v4_ao_synack_hash, 1725 #endif 1726 #ifdef CONFIG_SYN_COOKIES 1727 .cookie_init_seq = cookie_v4_init_sequence, 1728 #endif 1729 .route_req = tcp_v4_route_req, 1730 .init_seq = tcp_v4_init_seq, 1731 .init_ts_off = tcp_v4_init_ts_off, 1732 .send_synack = tcp_v4_send_synack, 1733 }; 1734 1735 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) 1736 { 1737 /* Never answer to SYNs send to broadcast or multicast */ 1738 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) 1739 goto drop; 1740 1741 return tcp_conn_request(&tcp_request_sock_ops, 1742 &tcp_request_sock_ipv4_ops, sk, skb); 1743 1744 drop: 1745 tcp_listendrop(sk); 1746 return 0; 1747 } 1748 EXPORT_IPV6_MOD(tcp_v4_conn_request); 1749 1750 1751 /* 1752 * The three way handshake has completed - we got a valid synack - 1753 * now create the new socket. 1754 */ 1755 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, 1756 struct request_sock *req, 1757 struct dst_entry *dst, 1758 struct request_sock *req_unhash, 1759 bool *own_req) 1760 { 1761 struct inet_request_sock *ireq; 1762 bool found_dup_sk = false; 1763 struct inet_sock *newinet; 1764 struct tcp_sock *newtp; 1765 struct sock *newsk; 1766 #ifdef CONFIG_TCP_MD5SIG 1767 const union tcp_md5_addr *addr; 1768 struct tcp_md5sig_key *key; 1769 int l3index; 1770 #endif 1771 struct ip_options_rcu *inet_opt; 1772 1773 if (sk_acceptq_is_full(sk)) 1774 goto exit_overflow; 1775 1776 newsk = tcp_create_openreq_child(sk, req, skb); 1777 if (!newsk) 1778 goto exit_nonewsk; 1779 1780 newsk->sk_gso_type = SKB_GSO_TCPV4; 1781 inet_sk_rx_dst_set(newsk, skb); 1782 1783 newtp = tcp_sk(newsk); 1784 newinet = inet_sk(newsk); 1785 ireq = inet_rsk(req); 1786 inet_opt = rcu_dereference(ireq->ireq_opt); 1787 RCU_INIT_POINTER(newinet->inet_opt, inet_opt); 1788 newinet->mc_index = inet_iif(skb); 1789 newinet->mc_ttl = ip_hdr(skb)->ttl; 1790 newinet->rcv_tos = ip_hdr(skb)->tos; 1791 inet_csk(newsk)->icsk_ext_hdr_len = 0; 1792 if (inet_opt) 1793 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 1794 atomic_set(&newinet->inet_id, get_random_u16()); 1795 1796 /* Set ToS of the new socket based upon the value of incoming SYN. 1797 * ECT bits are set later in tcp_init_transfer(). 
1798 */ 1799 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)) 1800 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK; 1801 1802 if (!dst) { 1803 dst = inet_csk_route_child_sock(sk, newsk, req); 1804 if (!dst) 1805 goto put_and_exit; 1806 } else { 1807 /* syncookie case : see end of cookie_v4_check() */ 1808 } 1809 sk_setup_caps(newsk, dst); 1810 1811 tcp_ca_openreq_child(newsk, dst); 1812 1813 tcp_sync_mss(newsk, dst_mtu(dst)); 1814 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst)); 1815 1816 tcp_initialize_rcv_mss(newsk); 1817 1818 #ifdef CONFIG_TCP_MD5SIG 1819 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif); 1820 /* Copy over the MD5 key from the original socket */ 1821 addr = (union tcp_md5_addr *)&newinet->inet_daddr; 1822 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1823 if (key && !tcp_rsk_used_ao(req)) { 1824 if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key)) 1825 goto put_and_exit; 1826 sk_gso_disable(newsk); 1827 } 1828 #endif 1829 #ifdef CONFIG_TCP_AO 1830 if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET)) 1831 goto put_and_exit; /* OOM, release back memory */ 1832 #endif 1833 1834 if (__inet_inherit_port(sk, newsk) < 0) 1835 goto put_and_exit; 1836 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), 1837 &found_dup_sk); 1838 if (likely(*own_req)) { 1839 tcp_move_syn(newtp, req); 1840 ireq->ireq_opt = NULL; 1841 } else { 1842 newinet->inet_opt = NULL; 1843 1844 if (!req_unhash && found_dup_sk) { 1845 /* This code path should only be executed in the 1846 * syncookie case only 1847 */ 1848 bh_unlock_sock(newsk); 1849 sock_put(newsk); 1850 newsk = NULL; 1851 } 1852 } 1853 return newsk; 1854 1855 exit_overflow: 1856 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); 1857 exit_nonewsk: 1858 dst_release(dst); 1859 exit: 1860 tcp_listendrop(sk); 1861 return NULL; 1862 put_and_exit: 1863 newinet->inet_opt = NULL; 1864 inet_csk_prepare_forced_close(newsk); 1865 tcp_done(newsk); 1866 goto exit; 1867 } 1868 EXPORT_IPV6_MOD(tcp_v4_syn_recv_sock); 1869 1870 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb) 1871 { 1872 #ifdef CONFIG_SYN_COOKIES 1873 const struct tcphdr *th = tcp_hdr(skb); 1874 1875 if (!th->syn) 1876 sk = cookie_v4_check(sk, skb); 1877 #endif 1878 return sk; 1879 } 1880 1881 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph, 1882 struct tcphdr *th, u32 *cookie) 1883 { 1884 u16 mss = 0; 1885 #ifdef CONFIG_SYN_COOKIES 1886 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops, 1887 &tcp_request_sock_ipv4_ops, sk, th); 1888 if (mss) { 1889 *cookie = __cookie_v4_init_sequence(iph, th, &mss); 1890 tcp_synq_overflow(sk); 1891 } 1892 #endif 1893 return mss; 1894 } 1895 1896 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, 1897 u32)); 1898 /* The socket must have it's spinlock held when we get 1899 * here, unless it is a TCP_LISTEN socket. 1900 * 1901 * We have a potential double-lock case here, so even when 1902 * doing backlog processing we use the BH locking scheme. 1903 * This is because we cannot sleep with the original spinlock 1904 * held. 
1905 */ 1906 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) 1907 { 1908 enum skb_drop_reason reason; 1909 struct sock *rsk; 1910 1911 reason = psp_sk_rx_policy_check(sk, skb); 1912 if (reason) 1913 goto err_discard; 1914 1915 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ 1916 struct dst_entry *dst; 1917 1918 dst = rcu_dereference_protected(sk->sk_rx_dst, 1919 lockdep_sock_is_held(sk)); 1920 1921 sock_rps_save_rxhash(sk, skb); 1922 sk_mark_napi_id(sk, skb); 1923 if (dst) { 1924 if (sk->sk_rx_dst_ifindex != skb->skb_iif || 1925 !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check, 1926 dst, 0)) { 1927 RCU_INIT_POINTER(sk->sk_rx_dst, NULL); 1928 dst_release(dst); 1929 } 1930 } 1931 tcp_rcv_established(sk, skb); 1932 return 0; 1933 } 1934 1935 if (tcp_checksum_complete(skb)) 1936 goto csum_err; 1937 1938 if (sk->sk_state == TCP_LISTEN) { 1939 struct sock *nsk = tcp_v4_cookie_check(sk, skb); 1940 1941 if (!nsk) 1942 return 0; 1943 if (nsk != sk) { 1944 reason = tcp_child_process(sk, nsk, skb); 1945 if (reason) { 1946 rsk = nsk; 1947 goto reset; 1948 } 1949 return 0; 1950 } 1951 } else 1952 sock_rps_save_rxhash(sk, skb); 1953 1954 reason = tcp_rcv_state_process(sk, skb); 1955 if (reason) { 1956 rsk = sk; 1957 goto reset; 1958 } 1959 return 0; 1960 1961 reset: 1962 tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason)); 1963 discard: 1964 sk_skb_reason_drop(sk, skb, reason); 1965 /* Be careful here. If this function gets more complicated and 1966 * gcc suffers from register pressure on the x86, sk (in %ebx) 1967 * might be destroyed here. This current version compiles correctly, 1968 * but you have been warned. 1969 */ 1970 return 0; 1971 1972 csum_err: 1973 reason = SKB_DROP_REASON_TCP_CSUM; 1974 trace_tcp_bad_csum(skb); 1975 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 1976 err_discard: 1977 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 1978 goto discard; 1979 } 1980 EXPORT_SYMBOL(tcp_v4_do_rcv); 1981 1982 int tcp_v4_early_demux(struct sk_buff *skb) 1983 { 1984 struct net *net = dev_net_rcu(skb->dev); 1985 const struct iphdr *iph; 1986 const struct tcphdr *th; 1987 struct sock *sk; 1988 1989 if (skb->pkt_type != PACKET_HOST) 1990 return 0; 1991 1992 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) 1993 return 0; 1994 1995 iph = ip_hdr(skb); 1996 th = tcp_hdr(skb); 1997 1998 if (th->doff < sizeof(struct tcphdr) / 4) 1999 return 0; 2000 2001 sk = __inet_lookup_established(net, iph->saddr, th->source, 2002 iph->daddr, ntohs(th->dest), 2003 skb->skb_iif, inet_sdif(skb)); 2004 if (sk) { 2005 skb->sk = sk; 2006 skb->destructor = sock_edemux; 2007 if (sk_fullsock(sk)) { 2008 struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst); 2009 2010 if (dst) 2011 dst = dst_check(dst, 0); 2012 if (dst && 2013 sk->sk_rx_dst_ifindex == skb->skb_iif) 2014 skb_dst_set_noref(skb, dst); 2015 } 2016 } 2017 return 0; 2018 } 2019 2020 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, 2021 enum skb_drop_reason *reason) 2022 { 2023 u32 tail_gso_size, tail_gso_segs; 2024 struct skb_shared_info *shinfo; 2025 const struct tcphdr *th; 2026 struct tcphdr *thtail; 2027 struct sk_buff *tail; 2028 unsigned int hdrlen; 2029 bool fragstolen; 2030 u32 gso_segs; 2031 u32 gso_size; 2032 u64 limit; 2033 int delta; 2034 int err; 2035 2036 /* In case all data was pulled from skb frags (in __pskb_pull_tail()), 2037 * we can fix skb->truesize to its real value to avoid future drops. 2038 * This is valid because skb is not yet charged to the socket. 
2039 * It has been noticed pure SACK packets were sometimes dropped 2040 * (if cooked by drivers without copybreak feature). 2041 */ 2042 skb_condense(skb); 2043 2044 tcp_cleanup_skb(skb); 2045 2046 if (unlikely(tcp_checksum_complete(skb))) { 2047 bh_unlock_sock(sk); 2048 trace_tcp_bad_csum(skb); 2049 *reason = SKB_DROP_REASON_TCP_CSUM; 2050 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 2051 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 2052 return true; 2053 } 2054 2055 /* Attempt coalescing to last skb in backlog, even if we are 2056 * above the limits. 2057 * This is okay because skb capacity is limited to MAX_SKB_FRAGS. 2058 */ 2059 th = (const struct tcphdr *)skb->data; 2060 hdrlen = th->doff * 4; 2061 2062 tail = sk->sk_backlog.tail; 2063 if (!tail) 2064 goto no_coalesce; 2065 thtail = (struct tcphdr *)tail->data; 2066 2067 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq || 2068 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield || 2069 ((TCP_SKB_CB(tail)->tcp_flags | 2070 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) || 2071 !((TCP_SKB_CB(tail)->tcp_flags & 2072 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) || 2073 ((TCP_SKB_CB(tail)->tcp_flags ^ 2074 TCP_SKB_CB(skb)->tcp_flags) & 2075 (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE)) || 2076 !tcp_skb_can_collapse_rx(tail, skb) || 2077 thtail->doff != th->doff || 2078 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)) || 2079 /* prior to PSP Rx policy check, retain exact PSP metadata */ 2080 psp_skb_coalesce_diff(tail, skb)) 2081 goto no_coalesce; 2082 2083 __skb_pull(skb, hdrlen); 2084 2085 shinfo = skb_shinfo(skb); 2086 gso_size = shinfo->gso_size ?: skb->len; 2087 gso_segs = shinfo->gso_segs ?: 1; 2088 2089 shinfo = skb_shinfo(tail); 2090 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen); 2091 tail_gso_segs = shinfo->gso_segs ?: 1; 2092 2093 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) { 2094 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq; 2095 2096 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) { 2097 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq; 2098 thtail->window = th->window; 2099 } 2100 2101 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and 2102 * thtail->fin, so that the fast path in tcp_rcv_established() 2103 * is not entered if we append a packet with a FIN. 2104 * SYN, RST, URG are not present. 2105 * ACK is set on both packets. 2106 * PSH : we do not really care in TCP stack, 2107 * at least for 'GRO' packets. 2108 */ 2109 thtail->fin |= th->fin; 2110 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; 2111 2112 if (TCP_SKB_CB(skb)->has_rxtstamp) { 2113 TCP_SKB_CB(tail)->has_rxtstamp = true; 2114 tail->tstamp = skb->tstamp; 2115 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp; 2116 } 2117 2118 /* Not as strict as GRO. We only need to carry mss max value */ 2119 shinfo->gso_size = max(gso_size, tail_gso_size); 2120 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF); 2121 2122 sk->sk_backlog.len += delta; 2123 __NET_INC_STATS(sock_net(sk), 2124 LINUX_MIB_TCPBACKLOGCOALESCE); 2125 kfree_skb_partial(skb, fragstolen); 2126 return false; 2127 } 2128 __skb_push(skb, hdrlen); 2129 2130 no_coalesce: 2131 /* sk->sk_backlog.len is reset only at the end of __release_sock(). 2132 * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach 2133 * sk_rcvbuf in normal conditions. 
2134 */ 2135 limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1; 2136 2137 limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1; 2138 2139 /* Only socket owner can try to collapse/prune rx queues 2140 * to reduce memory overhead, so add a little headroom here. 2141 * Few sockets backlog are possibly concurrently non empty. 2142 */ 2143 limit += 64 * 1024; 2144 2145 limit = min_t(u64, limit, UINT_MAX); 2146 2147 err = sk_add_backlog(sk, skb, limit); 2148 if (unlikely(err)) { 2149 bh_unlock_sock(sk); 2150 if (err == -ENOMEM) { 2151 *reason = SKB_DROP_REASON_PFMEMALLOC; 2152 __NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP); 2153 } else { 2154 *reason = SKB_DROP_REASON_SOCKET_BACKLOG; 2155 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP); 2156 } 2157 return true; 2158 } 2159 return false; 2160 } 2161 EXPORT_IPV6_MOD(tcp_add_backlog); 2162 2163 int tcp_filter(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason *reason) 2164 { 2165 struct tcphdr *th = (struct tcphdr *)skb->data; 2166 2167 return sk_filter_trim_cap(sk, skb, th->doff * 4, reason); 2168 } 2169 EXPORT_IPV6_MOD(tcp_filter); 2170 2171 static void tcp_v4_restore_cb(struct sk_buff *skb) 2172 { 2173 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4, 2174 sizeof(struct inet_skb_parm)); 2175 } 2176 2177 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, 2178 const struct tcphdr *th) 2179 { 2180 /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB() 2181 * barrier() makes sure compiler wont play fool^Waliasing games. 2182 */ 2183 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb), 2184 sizeof(struct inet_skb_parm)); 2185 barrier(); 2186 2187 TCP_SKB_CB(skb)->seq = ntohl(th->seq); 2188 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + 2189 skb->len - th->doff * 4); 2190 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); 2191 TCP_SKB_CB(skb)->tcp_flags = tcp_flags_ntohs(th); 2192 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); 2193 TCP_SKB_CB(skb)->sacked = 0; 2194 TCP_SKB_CB(skb)->has_rxtstamp = 2195 skb->tstamp || skb_hwtstamps(skb)->hwtstamp; 2196 } 2197 2198 /* 2199 * From tcp_input.c 2200 */ 2201 2202 int tcp_v4_rcv(struct sk_buff *skb) 2203 { 2204 struct net *net = dev_net_rcu(skb->dev); 2205 enum skb_drop_reason drop_reason; 2206 enum tcp_tw_status tw_status; 2207 int sdif = inet_sdif(skb); 2208 int dif = inet_iif(skb); 2209 const struct iphdr *iph; 2210 const struct tcphdr *th; 2211 struct sock *sk = NULL; 2212 bool refcounted; 2213 int ret; 2214 u32 isn; 2215 2216 drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; 2217 if (skb->pkt_type != PACKET_HOST) 2218 goto discard_it; 2219 2220 /* Count it even if it's bad */ 2221 __TCP_INC_STATS(net, TCP_MIB_INSEGS); 2222 2223 if (!pskb_may_pull(skb, sizeof(struct tcphdr))) 2224 goto discard_it; 2225 2226 th = (const struct tcphdr *)skb->data; 2227 2228 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) { 2229 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL; 2230 goto bad_packet; 2231 } 2232 if (!pskb_may_pull(skb, th->doff * 4)) 2233 goto discard_it; 2234 2235 /* An explanation is required here, I think. 2236 * Packet length and doff are validated by header prediction, 2237 * provided case of th->doff==0 is eliminated. 2238 * So, we defer the checks. 
*/ 2239 2240 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo)) 2241 goto csum_error; 2242 2243 th = (const struct tcphdr *)skb->data; 2244 iph = ip_hdr(skb); 2245 lookup: 2246 sk = __inet_lookup_skb(skb, __tcp_hdrlen(th), th->source, 2247 th->dest, sdif, &refcounted); 2248 if (!sk) 2249 goto no_tcp_socket; 2250 2251 if (sk->sk_state == TCP_TIME_WAIT) 2252 goto do_time_wait; 2253 2254 if (sk->sk_state == TCP_NEW_SYN_RECV) { 2255 struct request_sock *req = inet_reqsk(sk); 2256 bool req_stolen = false; 2257 struct sock *nsk; 2258 2259 sk = req->rsk_listener; 2260 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) 2261 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2262 else 2263 drop_reason = tcp_inbound_hash(sk, req, skb, 2264 &iph->saddr, &iph->daddr, 2265 AF_INET, dif, sdif); 2266 if (unlikely(drop_reason)) { 2267 sk_drops_skbadd(sk, skb); 2268 reqsk_put(req); 2269 goto discard_it; 2270 } 2271 if (tcp_checksum_complete(skb)) { 2272 reqsk_put(req); 2273 goto csum_error; 2274 } 2275 if (unlikely(sk->sk_state != TCP_LISTEN)) { 2276 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb); 2277 if (!nsk) { 2278 inet_csk_reqsk_queue_drop_and_put(sk, req); 2279 goto lookup; 2280 } 2281 sk = nsk; 2282 /* reuseport_migrate_sock() has already held one sk_refcnt 2283 * before returning. 2284 */ 2285 } else { 2286 /* We own a reference on the listener, increase it again 2287 * as we might lose it too soon. 2288 */ 2289 sock_hold(sk); 2290 } 2291 refcounted = true; 2292 nsk = NULL; 2293 if (!tcp_filter(sk, skb, &drop_reason)) { 2294 th = (const struct tcphdr *)skb->data; 2295 iph = ip_hdr(skb); 2296 tcp_v4_fill_cb(skb, iph, th); 2297 nsk = tcp_check_req(sk, skb, req, false, &req_stolen, 2298 &drop_reason); 2299 } 2300 if (!nsk) { 2301 reqsk_put(req); 2302 if (req_stolen) { 2303 /* Another cpu got exclusive access to req 2304 * and created a full blown socket. 2305 * Try to feed this packet to this socket 2306 * instead of discarding it. 
2307 */ 2308 tcp_v4_restore_cb(skb); 2309 sock_put(sk); 2310 goto lookup; 2311 } 2312 goto discard_and_relse; 2313 } 2314 nf_reset_ct(skb); 2315 if (nsk == sk) { 2316 reqsk_put(req); 2317 tcp_v4_restore_cb(skb); 2318 } else { 2319 drop_reason = tcp_child_process(sk, nsk, skb); 2320 if (drop_reason) { 2321 enum sk_rst_reason rst_reason; 2322 2323 rst_reason = sk_rst_convert_drop_reason(drop_reason); 2324 tcp_v4_send_reset(nsk, skb, rst_reason); 2325 goto discard_and_relse; 2326 } 2327 sock_put(sk); 2328 return 0; 2329 } 2330 } 2331 2332 process: 2333 if (static_branch_unlikely(&ip4_min_ttl)) { 2334 /* min_ttl can be changed concurrently from do_ip_setsockopt() */ 2335 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) { 2336 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 2337 drop_reason = SKB_DROP_REASON_TCP_MINTTL; 2338 goto discard_and_relse; 2339 } 2340 } 2341 2342 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { 2343 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2344 goto discard_and_relse; 2345 } 2346 2347 drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr, 2348 AF_INET, dif, sdif); 2349 if (drop_reason) 2350 goto discard_and_relse; 2351 2352 nf_reset_ct(skb); 2353 2354 if (tcp_filter(sk, skb, &drop_reason)) 2355 goto discard_and_relse; 2356 2357 th = (const struct tcphdr *)skb->data; 2358 iph = ip_hdr(skb); 2359 tcp_v4_fill_cb(skb, iph, th); 2360 2361 skb->dev = NULL; 2362 2363 if (sk->sk_state == TCP_LISTEN) { 2364 ret = tcp_v4_do_rcv(sk, skb); 2365 goto put_and_return; 2366 } 2367 2368 sk_incoming_cpu_update(sk); 2369 2370 bh_lock_sock_nested(sk); 2371 tcp_segs_in(tcp_sk(sk), skb); 2372 ret = 0; 2373 if (!sock_owned_by_user(sk)) { 2374 ret = tcp_v4_do_rcv(sk, skb); 2375 } else { 2376 if (tcp_add_backlog(sk, skb, &drop_reason)) 2377 goto discard_and_relse; 2378 } 2379 bh_unlock_sock(sk); 2380 2381 put_and_return: 2382 if (refcounted) 2383 sock_put(sk); 2384 2385 return ret; 2386 2387 no_tcp_socket: 2388 drop_reason = SKB_DROP_REASON_NO_SOCKET; 2389 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) 2390 goto discard_it; 2391 2392 tcp_v4_fill_cb(skb, iph, th); 2393 2394 if (tcp_checksum_complete(skb)) { 2395 csum_error: 2396 drop_reason = SKB_DROP_REASON_TCP_CSUM; 2397 trace_tcp_bad_csum(skb); 2398 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); 2399 bad_packet: 2400 __TCP_INC_STATS(net, TCP_MIB_INERRS); 2401 } else { 2402 tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason)); 2403 } 2404 2405 discard_it: 2406 SKB_DR_OR(drop_reason, NOT_SPECIFIED); 2407 /* Discard frame. 
*/ 2408 sk_skb_reason_drop(sk, skb, drop_reason); 2409 return 0; 2410 2411 discard_and_relse: 2412 sk_drops_skbadd(sk, skb); 2413 if (refcounted) 2414 sock_put(sk); 2415 goto discard_it; 2416 2417 do_time_wait: 2418 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { 2419 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2420 inet_twsk_put(inet_twsk(sk)); 2421 goto discard_it; 2422 } 2423 2424 tcp_v4_fill_cb(skb, iph, th); 2425 2426 if (tcp_checksum_complete(skb)) { 2427 inet_twsk_put(inet_twsk(sk)); 2428 goto csum_error; 2429 } 2430 2431 tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn, 2432 &drop_reason); 2433 switch (tw_status) { 2434 case TCP_TW_SYN: { 2435 struct sock *sk2 = inet_lookup_listener(net, skb, __tcp_hdrlen(th), 2436 iph->saddr, th->source, 2437 iph->daddr, th->dest, 2438 inet_iif(skb), 2439 sdif); 2440 if (sk2) { 2441 inet_twsk_deschedule_put(inet_twsk(sk)); 2442 sk = sk2; 2443 tcp_v4_restore_cb(skb); 2444 refcounted = false; 2445 __this_cpu_write(tcp_tw_isn, isn); 2446 goto process; 2447 } 2448 2449 drop_reason = psp_twsk_rx_policy_check(inet_twsk(sk), skb); 2450 if (drop_reason) 2451 break; 2452 } 2453 /* to ACK */ 2454 fallthrough; 2455 case TCP_TW_ACK: 2456 case TCP_TW_ACK_OOW: 2457 tcp_v4_timewait_ack(sk, skb, tw_status); 2458 break; 2459 case TCP_TW_RST: 2460 tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET); 2461 inet_twsk_deschedule_put(inet_twsk(sk)); 2462 goto discard_it; 2463 case TCP_TW_SUCCESS:; 2464 } 2465 goto discard_it; 2466 } 2467 2468 static struct timewait_sock_ops tcp_timewait_sock_ops = { 2469 .twsk_obj_size = sizeof(struct tcp_timewait_sock), 2470 }; 2471 2472 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) 2473 { 2474 struct dst_entry *dst = skb_dst(skb); 2475 2476 if (dst && dst_hold_safe(dst)) { 2477 rcu_assign_pointer(sk->sk_rx_dst, dst); 2478 sk->sk_rx_dst_ifindex = skb->skb_iif; 2479 } 2480 } 2481 EXPORT_IPV6_MOD(inet_sk_rx_dst_set); 2482 2483 const struct inet_connection_sock_af_ops ipv4_specific = { 2484 .queue_xmit = ip_queue_xmit, 2485 .send_check = tcp_v4_send_check, 2486 .rebuild_header = inet_sk_rebuild_header, 2487 .sk_rx_dst_set = inet_sk_rx_dst_set, 2488 .conn_request = tcp_v4_conn_request, 2489 .syn_recv_sock = tcp_v4_syn_recv_sock, 2490 .net_header_len = sizeof(struct iphdr), 2491 .setsockopt = ip_setsockopt, 2492 .getsockopt = ip_getsockopt, 2493 .mtu_reduced = tcp_v4_mtu_reduced, 2494 }; 2495 EXPORT_IPV6_MOD(ipv4_specific); 2496 2497 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) 2498 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { 2499 #ifdef CONFIG_TCP_MD5SIG 2500 .md5_lookup = tcp_v4_md5_lookup, 2501 .calc_md5_hash = tcp_v4_md5_hash_skb, 2502 .md5_parse = tcp_v4_parse_md5_keys, 2503 #endif 2504 #ifdef CONFIG_TCP_AO 2505 .ao_lookup = tcp_v4_ao_lookup, 2506 .calc_ao_hash = tcp_v4_ao_hash_skb, 2507 .ao_parse = tcp_v4_parse_ao, 2508 .ao_calc_key_sk = tcp_v4_ao_calc_key_sk, 2509 #endif 2510 }; 2511 2512 static void tcp4_destruct_sock(struct sock *sk) 2513 { 2514 tcp_md5_destruct_sock(sk); 2515 tcp_ao_destroy_sock(sk, false); 2516 inet_sock_destruct(sk); 2517 } 2518 #endif 2519 2520 /* NOTE: A lot of things set to zero explicitly by call to 2521 * sk_alloc() so need not be done here. 
2522 */ 2523 static int tcp_v4_init_sock(struct sock *sk) 2524 { 2525 struct inet_connection_sock *icsk = inet_csk(sk); 2526 2527 tcp_init_sock(sk); 2528 2529 icsk->icsk_af_ops = &ipv4_specific; 2530 2531 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) 2532 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific; 2533 sk->sk_destruct = tcp4_destruct_sock; 2534 #endif 2535 2536 return 0; 2537 } 2538 2539 static void tcp_release_user_frags(struct sock *sk) 2540 { 2541 #ifdef CONFIG_PAGE_POOL 2542 unsigned long index; 2543 void *netmem; 2544 2545 xa_for_each(&sk->sk_user_frags, index, netmem) 2546 WARN_ON_ONCE(!napi_pp_put_page((__force netmem_ref)netmem)); 2547 #endif 2548 } 2549 2550 void tcp_v4_destroy_sock(struct sock *sk) 2551 { 2552 struct tcp_sock *tp = tcp_sk(sk); 2553 2554 tcp_release_user_frags(sk); 2555 2556 xa_destroy(&sk->sk_user_frags); 2557 2558 trace_tcp_destroy_sock(sk); 2559 2560 tcp_clear_xmit_timers(sk); 2561 2562 tcp_cleanup_congestion_control(sk); 2563 2564 tcp_cleanup_ulp(sk); 2565 2566 /* Cleanup up the write buffer. */ 2567 tcp_write_queue_purge(sk); 2568 2569 /* Check if we want to disable active TFO */ 2570 tcp_fastopen_active_disable_ofo_check(sk); 2571 2572 /* Cleans up our, hopefully empty, out_of_order_queue. */ 2573 skb_rbtree_purge(&tp->out_of_order_queue); 2574 2575 /* Clean up a referenced TCP bind bucket. */ 2576 if (inet_csk(sk)->icsk_bind_hash) 2577 inet_put_port(sk); 2578 2579 BUG_ON(rcu_access_pointer(tp->fastopen_rsk)); 2580 2581 /* If socket is aborted during connect operation */ 2582 tcp_free_fastopen_req(tp); 2583 tcp_fastopen_destroy_cipher(sk); 2584 tcp_saved_syn_free(tp); 2585 2586 sk_sockets_allocated_dec(sk); 2587 } 2588 EXPORT_IPV6_MOD(tcp_v4_destroy_sock); 2589 2590 #ifdef CONFIG_PROC_FS 2591 /* Proc filesystem TCP sock list dumping. */ 2592 2593 static unsigned short seq_file_family(const struct seq_file *seq); 2594 2595 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk) 2596 { 2597 unsigned short family = seq_file_family(seq); 2598 2599 /* AF_UNSPEC is used as a match all */ 2600 return ((family == AF_UNSPEC || family == sk->sk_family) && 2601 net_eq(sock_net(sk), seq_file_net(seq))); 2602 } 2603 2604 /* Find a non empty bucket (starting from st->bucket) 2605 * and return the first sk from it. 2606 */ 2607 static void *listening_get_first(struct seq_file *seq) 2608 { 2609 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2610 struct tcp_iter_state *st = seq->private; 2611 2612 st->offset = 0; 2613 for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) { 2614 struct inet_listen_hashbucket *ilb2; 2615 struct hlist_nulls_node *node; 2616 struct sock *sk; 2617 2618 ilb2 = &hinfo->lhash2[st->bucket]; 2619 if (hlist_nulls_empty(&ilb2->nulls_head)) 2620 continue; 2621 2622 spin_lock(&ilb2->lock); 2623 sk_nulls_for_each(sk, node, &ilb2->nulls_head) { 2624 if (seq_sk_match(seq, sk)) 2625 return sk; 2626 } 2627 spin_unlock(&ilb2->lock); 2628 } 2629 2630 return NULL; 2631 } 2632 2633 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket). 2634 * If "cur" is the last one in the st->bucket, 2635 * call listening_get_first() to return the first sk of the next 2636 * non empty bucket. 
2637 */ 2638 static void *listening_get_next(struct seq_file *seq, void *cur) 2639 { 2640 struct tcp_iter_state *st = seq->private; 2641 struct inet_listen_hashbucket *ilb2; 2642 struct hlist_nulls_node *node; 2643 struct inet_hashinfo *hinfo; 2644 struct sock *sk = cur; 2645 2646 ++st->num; 2647 ++st->offset; 2648 2649 sk = sk_nulls_next(sk); 2650 sk_nulls_for_each_from(sk, node) { 2651 if (seq_sk_match(seq, sk)) 2652 return sk; 2653 } 2654 2655 hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2656 ilb2 = &hinfo->lhash2[st->bucket]; 2657 spin_unlock(&ilb2->lock); 2658 ++st->bucket; 2659 return listening_get_first(seq); 2660 } 2661 2662 static void *listening_get_idx(struct seq_file *seq, loff_t *pos) 2663 { 2664 struct tcp_iter_state *st = seq->private; 2665 void *rc; 2666 2667 st->bucket = 0; 2668 st->offset = 0; 2669 rc = listening_get_first(seq); 2670 2671 while (rc && *pos) { 2672 rc = listening_get_next(seq, rc); 2673 --*pos; 2674 } 2675 return rc; 2676 } 2677 2678 static inline bool empty_bucket(struct inet_hashinfo *hinfo, 2679 const struct tcp_iter_state *st) 2680 { 2681 return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain); 2682 } 2683 2684 /* 2685 * Get first established socket starting from bucket given in st->bucket. 2686 * If st->bucket is zero, the very first socket in the hash is returned. 2687 */ 2688 static void *established_get_first(struct seq_file *seq) 2689 { 2690 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2691 struct tcp_iter_state *st = seq->private; 2692 2693 st->offset = 0; 2694 for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) { 2695 struct sock *sk; 2696 struct hlist_nulls_node *node; 2697 spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket); 2698 2699 cond_resched(); 2700 2701 /* Lockless fast path for the common case of empty buckets */ 2702 if (empty_bucket(hinfo, st)) 2703 continue; 2704 2705 spin_lock_bh(lock); 2706 sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) { 2707 if (seq_sk_match(seq, sk)) 2708 return sk; 2709 } 2710 spin_unlock_bh(lock); 2711 } 2712 2713 return NULL; 2714 } 2715 2716 static void *established_get_next(struct seq_file *seq, void *cur) 2717 { 2718 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2719 struct tcp_iter_state *st = seq->private; 2720 struct hlist_nulls_node *node; 2721 struct sock *sk = cur; 2722 2723 ++st->num; 2724 ++st->offset; 2725 2726 sk = sk_nulls_next(sk); 2727 2728 sk_nulls_for_each_from(sk, node) { 2729 if (seq_sk_match(seq, sk)) 2730 return sk; 2731 } 2732 2733 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 2734 ++st->bucket; 2735 return established_get_first(seq); 2736 } 2737 2738 static void *established_get_idx(struct seq_file *seq, loff_t pos) 2739 { 2740 struct tcp_iter_state *st = seq->private; 2741 void *rc; 2742 2743 st->bucket = 0; 2744 rc = established_get_first(seq); 2745 2746 while (rc && pos) { 2747 rc = established_get_next(seq, rc); 2748 --pos; 2749 } 2750 return rc; 2751 } 2752 2753 static void *tcp_get_idx(struct seq_file *seq, loff_t pos) 2754 { 2755 void *rc; 2756 struct tcp_iter_state *st = seq->private; 2757 2758 st->state = TCP_SEQ_STATE_LISTENING; 2759 rc = listening_get_idx(seq, &pos); 2760 2761 if (!rc) { 2762 st->state = TCP_SEQ_STATE_ESTABLISHED; 2763 rc = established_get_idx(seq, pos); 2764 } 2765 2766 return rc; 2767 } 2768 2769 static void *tcp_seek_last_pos(struct seq_file *seq) 2770 { 2771 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2772 struct 
tcp_iter_state *st = seq->private; 2773 int bucket = st->bucket; 2774 int offset = st->offset; 2775 int orig_num = st->num; 2776 void *rc = NULL; 2777 2778 switch (st->state) { 2779 case TCP_SEQ_STATE_LISTENING: 2780 if (st->bucket > hinfo->lhash2_mask) 2781 break; 2782 rc = listening_get_first(seq); 2783 while (offset-- && rc && bucket == st->bucket) 2784 rc = listening_get_next(seq, rc); 2785 if (rc) 2786 break; 2787 st->bucket = 0; 2788 st->state = TCP_SEQ_STATE_ESTABLISHED; 2789 fallthrough; 2790 case TCP_SEQ_STATE_ESTABLISHED: 2791 if (st->bucket > hinfo->ehash_mask) 2792 break; 2793 rc = established_get_first(seq); 2794 while (offset-- && rc && bucket == st->bucket) 2795 rc = established_get_next(seq, rc); 2796 } 2797 2798 st->num = orig_num; 2799 2800 return rc; 2801 } 2802 2803 void *tcp_seq_start(struct seq_file *seq, loff_t *pos) 2804 { 2805 struct tcp_iter_state *st = seq->private; 2806 void *rc; 2807 2808 if (*pos && *pos == st->last_pos) { 2809 rc = tcp_seek_last_pos(seq); 2810 if (rc) 2811 goto out; 2812 } 2813 2814 st->state = TCP_SEQ_STATE_LISTENING; 2815 st->num = 0; 2816 st->bucket = 0; 2817 st->offset = 0; 2818 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; 2819 2820 out: 2821 st->last_pos = *pos; 2822 return rc; 2823 } 2824 EXPORT_IPV6_MOD(tcp_seq_start); 2825 2826 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2827 { 2828 struct tcp_iter_state *st = seq->private; 2829 void *rc = NULL; 2830 2831 if (v == SEQ_START_TOKEN) { 2832 rc = tcp_get_idx(seq, 0); 2833 goto out; 2834 } 2835 2836 switch (st->state) { 2837 case TCP_SEQ_STATE_LISTENING: 2838 rc = listening_get_next(seq, v); 2839 if (!rc) { 2840 st->state = TCP_SEQ_STATE_ESTABLISHED; 2841 st->bucket = 0; 2842 st->offset = 0; 2843 rc = established_get_first(seq); 2844 } 2845 break; 2846 case TCP_SEQ_STATE_ESTABLISHED: 2847 rc = established_get_next(seq, v); 2848 break; 2849 } 2850 out: 2851 ++*pos; 2852 st->last_pos = *pos; 2853 return rc; 2854 } 2855 EXPORT_IPV6_MOD(tcp_seq_next); 2856 2857 void tcp_seq_stop(struct seq_file *seq, void *v) 2858 { 2859 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2860 struct tcp_iter_state *st = seq->private; 2861 2862 switch (st->state) { 2863 case TCP_SEQ_STATE_LISTENING: 2864 if (v != SEQ_START_TOKEN) 2865 spin_unlock(&hinfo->lhash2[st->bucket].lock); 2866 break; 2867 case TCP_SEQ_STATE_ESTABLISHED: 2868 if (v) 2869 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 2870 break; 2871 } 2872 } 2873 EXPORT_IPV6_MOD(tcp_seq_stop); 2874 2875 static void get_openreq4(const struct request_sock *req, 2876 struct seq_file *f, int i) 2877 { 2878 const struct inet_request_sock *ireq = inet_rsk(req); 2879 long delta = req->rsk_timer.expires - jiffies; 2880 2881 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2882 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK", 2883 i, 2884 ireq->ir_loc_addr, 2885 ireq->ir_num, 2886 ireq->ir_rmt_addr, 2887 ntohs(ireq->ir_rmt_port), 2888 TCP_SYN_RECV, 2889 0, 0, /* could print option size, but that is af dependent. 
*/ 2890 1, /* timers active (only the expire timer) */ 2891 jiffies_delta_to_clock_t(delta), 2892 req->num_timeout, 2893 from_kuid_munged(seq_user_ns(f), 2894 sk_uid(req->rsk_listener)), 2895 0, /* non standard timer */ 2896 0, /* open_requests have no inode */ 2897 0, 2898 req); 2899 } 2900 2901 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) 2902 { 2903 int timer_active; 2904 unsigned long timer_expires; 2905 const struct tcp_sock *tp = tcp_sk(sk); 2906 const struct inet_connection_sock *icsk = inet_csk(sk); 2907 const struct inet_sock *inet = inet_sk(sk); 2908 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; 2909 __be32 dest = inet->inet_daddr; 2910 __be32 src = inet->inet_rcv_saddr; 2911 __u16 destp = ntohs(inet->inet_dport); 2912 __u16 srcp = ntohs(inet->inet_sport); 2913 u8 icsk_pending; 2914 int rx_queue; 2915 int state; 2916 2917 icsk_pending = smp_load_acquire(&icsk->icsk_pending); 2918 if (icsk_pending == ICSK_TIME_RETRANS || 2919 icsk_pending == ICSK_TIME_REO_TIMEOUT || 2920 icsk_pending == ICSK_TIME_LOSS_PROBE) { 2921 timer_active = 1; 2922 timer_expires = icsk_timeout(icsk); 2923 } else if (icsk_pending == ICSK_TIME_PROBE0) { 2924 timer_active = 4; 2925 timer_expires = icsk_timeout(icsk); 2926 } else if (timer_pending(&sk->sk_timer)) { 2927 timer_active = 2; 2928 timer_expires = sk->sk_timer.expires; 2929 } else { 2930 timer_active = 0; 2931 timer_expires = jiffies; 2932 } 2933 2934 state = inet_sk_state_load(sk); 2935 if (state == TCP_LISTEN) 2936 rx_queue = READ_ONCE(sk->sk_ack_backlog); 2937 else 2938 /* Because we don't lock the socket, 2939 * we might find a transient negative value. 2940 */ 2941 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) - 2942 READ_ONCE(tp->copied_seq), 0); 2943 2944 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " 2945 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d", 2946 i, src, srcp, dest, destp, state, 2947 READ_ONCE(tp->write_seq) - tp->snd_una, 2948 rx_queue, 2949 timer_active, 2950 jiffies_delta_to_clock_t(timer_expires - jiffies), 2951 READ_ONCE(icsk->icsk_retransmits), 2952 from_kuid_munged(seq_user_ns(f), sk_uid(sk)), 2953 READ_ONCE(icsk->icsk_probes_out), 2954 sock_i_ino(sk), 2955 refcount_read(&sk->sk_refcnt), sk, 2956 jiffies_to_clock_t(icsk->icsk_rto), 2957 jiffies_to_clock_t(icsk->icsk_ack.ato), 2958 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk), 2959 tcp_snd_cwnd(tp), 2960 state == TCP_LISTEN ? 2961 fastopenq->max_qlen : 2962 (tcp_in_initial_slowstart(tp) ? 
-1 : tp->snd_ssthresh)); 2963 } 2964 2965 static void get_timewait4_sock(const struct inet_timewait_sock *tw, 2966 struct seq_file *f, int i) 2967 { 2968 long delta = tw->tw_timer.expires - jiffies; 2969 __be32 dest, src; 2970 __u16 destp, srcp; 2971 2972 dest = tw->tw_daddr; 2973 src = tw->tw_rcv_saddr; 2974 destp = ntohs(tw->tw_dport); 2975 srcp = ntohs(tw->tw_sport); 2976 2977 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2978 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK", 2979 i, src, srcp, dest, destp, READ_ONCE(tw->tw_substate), 0, 0, 2980 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, 2981 refcount_read(&tw->tw_refcnt), tw); 2982 } 2983 2984 #define TMPSZ 150 2985 2986 static int tcp4_seq_show(struct seq_file *seq, void *v) 2987 { 2988 struct tcp_iter_state *st; 2989 struct sock *sk = v; 2990 2991 seq_setwidth(seq, TMPSZ - 1); 2992 if (v == SEQ_START_TOKEN) { 2993 seq_puts(seq, " sl local_address rem_address st tx_queue " 2994 "rx_queue tr tm->when retrnsmt uid timeout " 2995 "inode"); 2996 goto out; 2997 } 2998 st = seq->private; 2999 3000 if (sk->sk_state == TCP_TIME_WAIT) 3001 get_timewait4_sock(v, seq, st->num); 3002 else if (sk->sk_state == TCP_NEW_SYN_RECV) 3003 get_openreq4(v, seq, st->num); 3004 else 3005 get_tcp4_sock(v, seq, st->num); 3006 out: 3007 seq_pad(seq, '\n'); 3008 return 0; 3009 } 3010 3011 #ifdef CONFIG_BPF_SYSCALL 3012 union bpf_tcp_iter_batch_item { 3013 struct sock *sk; 3014 __u64 cookie; 3015 }; 3016 3017 struct bpf_tcp_iter_state { 3018 struct tcp_iter_state state; 3019 unsigned int cur_sk; 3020 unsigned int end_sk; 3021 unsigned int max_sk; 3022 union bpf_tcp_iter_batch_item *batch; 3023 }; 3024 3025 struct bpf_iter__tcp { 3026 __bpf_md_ptr(struct bpf_iter_meta *, meta); 3027 __bpf_md_ptr(struct sock_common *, sk_common); 3028 uid_t uid __aligned(8); 3029 }; 3030 3031 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, 3032 struct sock_common *sk_common, uid_t uid) 3033 { 3034 struct bpf_iter__tcp ctx; 3035 3036 meta->seq_num--; /* skip SEQ_START_TOKEN */ 3037 ctx.meta = meta; 3038 ctx.sk_common = sk_common; 3039 ctx.uid = uid; 3040 return bpf_iter_run_prog(prog, &ctx); 3041 } 3042 3043 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter) 3044 { 3045 union bpf_tcp_iter_batch_item *item; 3046 unsigned int cur_sk = iter->cur_sk; 3047 __u64 cookie; 3048 3049 /* Remember the cookies of the sockets we haven't seen yet, so we can 3050 * pick up where we left off next time around. 
3051 */ 3052 while (cur_sk < iter->end_sk) { 3053 item = &iter->batch[cur_sk++]; 3054 cookie = sock_gen_cookie(item->sk); 3055 sock_gen_put(item->sk); 3056 item->cookie = cookie; 3057 } 3058 } 3059 3060 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter, 3061 unsigned int new_batch_sz, gfp_t flags) 3062 { 3063 union bpf_tcp_iter_batch_item *new_batch; 3064 3065 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, 3066 flags | __GFP_NOWARN); 3067 if (!new_batch) 3068 return -ENOMEM; 3069 3070 memcpy(new_batch, iter->batch, sizeof(*iter->batch) * iter->end_sk); 3071 kvfree(iter->batch); 3072 iter->batch = new_batch; 3073 iter->max_sk = new_batch_sz; 3074 3075 return 0; 3076 } 3077 3078 static struct sock *bpf_iter_tcp_resume_bucket(struct sock *first_sk, 3079 union bpf_tcp_iter_batch_item *cookies, 3080 int n_cookies) 3081 { 3082 struct hlist_nulls_node *node; 3083 struct sock *sk; 3084 int i; 3085 3086 for (i = 0; i < n_cookies; i++) { 3087 sk = first_sk; 3088 sk_nulls_for_each_from(sk, node) 3089 if (cookies[i].cookie == atomic64_read(&sk->sk_cookie)) 3090 return sk; 3091 } 3092 3093 return NULL; 3094 } 3095 3096 static struct sock *bpf_iter_tcp_resume_listening(struct seq_file *seq) 3097 { 3098 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 3099 struct bpf_tcp_iter_state *iter = seq->private; 3100 struct tcp_iter_state *st = &iter->state; 3101 unsigned int find_cookie = iter->cur_sk; 3102 unsigned int end_cookie = iter->end_sk; 3103 int resume_bucket = st->bucket; 3104 struct sock *sk; 3105 3106 if (end_cookie && find_cookie == end_cookie) 3107 ++st->bucket; 3108 3109 sk = listening_get_first(seq); 3110 iter->cur_sk = 0; 3111 iter->end_sk = 0; 3112 3113 if (sk && st->bucket == resume_bucket && end_cookie) { 3114 sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie], 3115 end_cookie - find_cookie); 3116 if (!sk) { 3117 spin_unlock(&hinfo->lhash2[st->bucket].lock); 3118 ++st->bucket; 3119 sk = listening_get_first(seq); 3120 } 3121 } 3122 3123 return sk; 3124 } 3125 3126 static struct sock *bpf_iter_tcp_resume_established(struct seq_file *seq) 3127 { 3128 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 3129 struct bpf_tcp_iter_state *iter = seq->private; 3130 struct tcp_iter_state *st = &iter->state; 3131 unsigned int find_cookie = iter->cur_sk; 3132 unsigned int end_cookie = iter->end_sk; 3133 int resume_bucket = st->bucket; 3134 struct sock *sk; 3135 3136 if (end_cookie && find_cookie == end_cookie) 3137 ++st->bucket; 3138 3139 sk = established_get_first(seq); 3140 iter->cur_sk = 0; 3141 iter->end_sk = 0; 3142 3143 if (sk && st->bucket == resume_bucket && end_cookie) { 3144 sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie], 3145 end_cookie - find_cookie); 3146 if (!sk) { 3147 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 3148 ++st->bucket; 3149 sk = established_get_first(seq); 3150 } 3151 } 3152 3153 return sk; 3154 } 3155 3156 static struct sock *bpf_iter_tcp_resume(struct seq_file *seq) 3157 { 3158 struct bpf_tcp_iter_state *iter = seq->private; 3159 struct tcp_iter_state *st = &iter->state; 3160 struct sock *sk = NULL; 3161 3162 switch (st->state) { 3163 case TCP_SEQ_STATE_LISTENING: 3164 sk = bpf_iter_tcp_resume_listening(seq); 3165 if (sk) 3166 break; 3167 st->bucket = 0; 3168 st->state = TCP_SEQ_STATE_ESTABLISHED; 3169 fallthrough; 3170 case TCP_SEQ_STATE_ESTABLISHED: 3171 sk = bpf_iter_tcp_resume_established(seq); 3172 break; 3173 } 3174 3175 return sk; 3176 } 3177 3178 
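/* The bpf_iter__tcp context above is everything a BPF program attached to
 * the "tcp" iterator target sees: the iterator metadata, a sock_common
 * pointer (a listening, established or timewait socket, or NULL on the
 * final stop() call) and the owning uid.  A minimal BPF-side sketch,
 * assuming a CO-RE build against vmlinux.h and libbpf's SEC() and
 * BPF_SEQ_PRINTF() helpers; the program name and printed fields are
 * illustrative only.
 */
#if 0	/* illustrative BPF program, not built as part of this file */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char _license[] SEC("license") = "GPL";

SEC("iter/tcp")
int dump_tcp(struct bpf_iter__tcp *ctx)
{
	struct sock_common *skc = ctx->sk_common;
	struct seq_file *seq = ctx->meta->seq;

	if (!skc)
		return 0;

	/* uid is 0 for TIME_WAIT entries, see bpf_iter_tcp_seq_show() */
	BPF_SEQ_PRINTF(seq, "family=%d state=%d uid=%u\n",
		       skc->skc_family, skc->skc_state, ctx->uid);
	return 0;
}
#endif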
static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq, 3179 struct sock **start_sk) 3180 { 3181 struct bpf_tcp_iter_state *iter = seq->private; 3182 struct hlist_nulls_node *node; 3183 unsigned int expected = 1; 3184 struct sock *sk; 3185 3186 sock_hold(*start_sk); 3187 iter->batch[iter->end_sk++].sk = *start_sk; 3188 3189 sk = sk_nulls_next(*start_sk); 3190 *start_sk = NULL; 3191 sk_nulls_for_each_from(sk, node) { 3192 if (seq_sk_match(seq, sk)) { 3193 if (iter->end_sk < iter->max_sk) { 3194 sock_hold(sk); 3195 iter->batch[iter->end_sk++].sk = sk; 3196 } else if (!*start_sk) { 3197 /* Remember where we left off. */ 3198 *start_sk = sk; 3199 } 3200 expected++; 3201 } 3202 } 3203 3204 return expected; 3205 } 3206 3207 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq, 3208 struct sock **start_sk) 3209 { 3210 struct bpf_tcp_iter_state *iter = seq->private; 3211 struct hlist_nulls_node *node; 3212 unsigned int expected = 1; 3213 struct sock *sk; 3214 3215 sock_hold(*start_sk); 3216 iter->batch[iter->end_sk++].sk = *start_sk; 3217 3218 sk = sk_nulls_next(*start_sk); 3219 *start_sk = NULL; 3220 sk_nulls_for_each_from(sk, node) { 3221 if (seq_sk_match(seq, sk)) { 3222 if (iter->end_sk < iter->max_sk) { 3223 sock_hold(sk); 3224 iter->batch[iter->end_sk++].sk = sk; 3225 } else if (!*start_sk) { 3226 /* Remember where we left off. */ 3227 *start_sk = sk; 3228 } 3229 expected++; 3230 } 3231 } 3232 3233 return expected; 3234 } 3235 3236 static unsigned int bpf_iter_fill_batch(struct seq_file *seq, 3237 struct sock **start_sk) 3238 { 3239 struct bpf_tcp_iter_state *iter = seq->private; 3240 struct tcp_iter_state *st = &iter->state; 3241 3242 if (st->state == TCP_SEQ_STATE_LISTENING) 3243 return bpf_iter_tcp_listening_batch(seq, start_sk); 3244 else 3245 return bpf_iter_tcp_established_batch(seq, start_sk); 3246 } 3247 3248 static void bpf_iter_tcp_unlock_bucket(struct seq_file *seq) 3249 { 3250 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 3251 struct bpf_tcp_iter_state *iter = seq->private; 3252 struct tcp_iter_state *st = &iter->state; 3253 3254 if (st->state == TCP_SEQ_STATE_LISTENING) 3255 spin_unlock(&hinfo->lhash2[st->bucket].lock); 3256 else 3257 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 3258 } 3259 3260 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq) 3261 { 3262 struct bpf_tcp_iter_state *iter = seq->private; 3263 unsigned int expected; 3264 struct sock *sk; 3265 int err; 3266 3267 sk = bpf_iter_tcp_resume(seq); 3268 if (!sk) 3269 return NULL; /* Done */ 3270 3271 expected = bpf_iter_fill_batch(seq, &sk); 3272 if (likely(iter->end_sk == expected)) 3273 goto done; 3274 3275 /* Batch size was too small. */ 3276 bpf_iter_tcp_unlock_bucket(seq); 3277 bpf_iter_tcp_put_batch(iter); 3278 err = bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2, 3279 GFP_USER); 3280 if (err) 3281 return ERR_PTR(err); 3282 3283 sk = bpf_iter_tcp_resume(seq); 3284 if (!sk) 3285 return NULL; /* Done */ 3286 3287 expected = bpf_iter_fill_batch(seq, &sk); 3288 if (likely(iter->end_sk == expected)) 3289 goto done; 3290 3291 /* Batch size was still too small. Hold onto the lock while we try 3292 * again with a larger batch to make sure the current bucket's size 3293 * does not change in the meantime. 
3294 */ 3295 err = bpf_iter_tcp_realloc_batch(iter, expected, GFP_NOWAIT); 3296 if (err) { 3297 bpf_iter_tcp_unlock_bucket(seq); 3298 return ERR_PTR(err); 3299 } 3300 3301 expected = bpf_iter_fill_batch(seq, &sk); 3302 WARN_ON_ONCE(iter->end_sk != expected); 3303 done: 3304 bpf_iter_tcp_unlock_bucket(seq); 3305 return iter->batch[0].sk; 3306 } 3307 3308 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos) 3309 { 3310 /* bpf iter does not support lseek, so it always 3311 * continue from where it was stop()-ped. 3312 */ 3313 if (*pos) 3314 return bpf_iter_tcp_batch(seq); 3315 3316 return SEQ_START_TOKEN; 3317 } 3318 3319 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3320 { 3321 struct bpf_tcp_iter_state *iter = seq->private; 3322 struct tcp_iter_state *st = &iter->state; 3323 struct sock *sk; 3324 3325 /* Whenever seq_next() is called, the iter->cur_sk is 3326 * done with seq_show(), so advance to the next sk in 3327 * the batch. 3328 */ 3329 if (iter->cur_sk < iter->end_sk) { 3330 /* Keeping st->num consistent in tcp_iter_state. 3331 * bpf_iter_tcp does not use st->num. 3332 * meta.seq_num is used instead. 3333 */ 3334 st->num++; 3335 sock_gen_put(iter->batch[iter->cur_sk++].sk); 3336 } 3337 3338 if (iter->cur_sk < iter->end_sk) 3339 sk = iter->batch[iter->cur_sk].sk; 3340 else 3341 sk = bpf_iter_tcp_batch(seq); 3342 3343 ++*pos; 3344 /* Keeping st->last_pos consistent in tcp_iter_state. 3345 * bpf iter does not do lseek, so st->last_pos always equals to *pos. 3346 */ 3347 st->last_pos = *pos; 3348 return sk; 3349 } 3350 3351 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v) 3352 { 3353 struct bpf_iter_meta meta; 3354 struct bpf_prog *prog; 3355 struct sock *sk = v; 3356 uid_t uid; 3357 int ret; 3358 3359 if (v == SEQ_START_TOKEN) 3360 return 0; 3361 3362 if (sk_fullsock(sk)) 3363 lock_sock(sk); 3364 3365 if (unlikely(sk_unhashed(sk))) { 3366 ret = SEQ_SKIP; 3367 goto unlock; 3368 } 3369 3370 if (sk->sk_state == TCP_TIME_WAIT) { 3371 uid = 0; 3372 } else if (sk->sk_state == TCP_NEW_SYN_RECV) { 3373 const struct request_sock *req = v; 3374 3375 uid = from_kuid_munged(seq_user_ns(seq), 3376 sk_uid(req->rsk_listener)); 3377 } else { 3378 uid = from_kuid_munged(seq_user_ns(seq), sk_uid(sk)); 3379 } 3380 3381 meta.seq = seq; 3382 prog = bpf_iter_get_info(&meta, false); 3383 ret = tcp_prog_seq_show(prog, &meta, v, uid); 3384 3385 unlock: 3386 if (sk_fullsock(sk)) 3387 release_sock(sk); 3388 return ret; 3389 3390 } 3391 3392 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v) 3393 { 3394 struct bpf_tcp_iter_state *iter = seq->private; 3395 struct bpf_iter_meta meta; 3396 struct bpf_prog *prog; 3397 3398 if (!v) { 3399 meta.seq = seq; 3400 prog = bpf_iter_get_info(&meta, true); 3401 if (prog) 3402 (void)tcp_prog_seq_show(prog, &meta, v, 0); 3403 } 3404 3405 if (iter->cur_sk < iter->end_sk) 3406 bpf_iter_tcp_put_batch(iter); 3407 } 3408 3409 static const struct seq_operations bpf_iter_tcp_seq_ops = { 3410 .show = bpf_iter_tcp_seq_show, 3411 .start = bpf_iter_tcp_seq_start, 3412 .next = bpf_iter_tcp_seq_next, 3413 .stop = bpf_iter_tcp_seq_stop, 3414 }; 3415 #endif 3416 static unsigned short seq_file_family(const struct seq_file *seq) 3417 { 3418 const struct tcp_seq_afinfo *afinfo; 3419 3420 #ifdef CONFIG_BPF_SYSCALL 3421 /* Iterated from bpf_iter. Let the bpf prog to filter instead. 
*/ 3422 if (seq->op == &bpf_iter_tcp_seq_ops) 3423 return AF_UNSPEC; 3424 #endif 3425 3426 /* Iterated from proc fs */ 3427 afinfo = pde_data(file_inode(seq->file)); 3428 return afinfo->family; 3429 } 3430 3431 static const struct seq_operations tcp4_seq_ops = { 3432 .show = tcp4_seq_show, 3433 .start = tcp_seq_start, 3434 .next = tcp_seq_next, 3435 .stop = tcp_seq_stop, 3436 }; 3437 3438 static struct tcp_seq_afinfo tcp4_seq_afinfo = { 3439 .family = AF_INET, 3440 }; 3441 3442 static int __net_init tcp4_proc_init_net(struct net *net) 3443 { 3444 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops, 3445 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo)) 3446 return -ENOMEM; 3447 return 0; 3448 } 3449 3450 static void __net_exit tcp4_proc_exit_net(struct net *net) 3451 { 3452 remove_proc_entry("tcp", net->proc_net); 3453 } 3454 3455 static struct pernet_operations tcp4_net_ops = { 3456 .init = tcp4_proc_init_net, 3457 .exit = tcp4_proc_exit_net, 3458 }; 3459 3460 int __init tcp4_proc_init(void) 3461 { 3462 return register_pernet_subsys(&tcp4_net_ops); 3463 } 3464 3465 void tcp4_proc_exit(void) 3466 { 3467 unregister_pernet_subsys(&tcp4_net_ops); 3468 } 3469 #endif /* CONFIG_PROC_FS */ 3470 3471 /* @wake is one when sk_stream_write_space() calls us. 3472 * This sends EPOLLOUT only if notsent_bytes is half the limit. 3473 * This mimics the strategy used in sock_def_write_space(). 3474 */ 3475 bool tcp_stream_memory_free(const struct sock *sk, int wake) 3476 { 3477 const struct tcp_sock *tp = tcp_sk(sk); 3478 u32 notsent_bytes = READ_ONCE(tp->write_seq) - 3479 READ_ONCE(tp->snd_nxt); 3480 3481 return (notsent_bytes << wake) < tcp_notsent_lowat(tp); 3482 } 3483 EXPORT_SYMBOL(tcp_stream_memory_free); 3484 3485 struct proto tcp_prot = { 3486 .name = "TCP", 3487 .owner = THIS_MODULE, 3488 .close = tcp_close, 3489 .pre_connect = tcp_v4_pre_connect, 3490 .connect = tcp_v4_connect, 3491 .disconnect = tcp_disconnect, 3492 .accept = inet_csk_accept, 3493 .ioctl = tcp_ioctl, 3494 .init = tcp_v4_init_sock, 3495 .destroy = tcp_v4_destroy_sock, 3496 .shutdown = tcp_shutdown, 3497 .setsockopt = tcp_setsockopt, 3498 .getsockopt = tcp_getsockopt, 3499 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt, 3500 .keepalive = tcp_set_keepalive, 3501 .recvmsg = tcp_recvmsg, 3502 .sendmsg = tcp_sendmsg, 3503 .splice_eof = tcp_splice_eof, 3504 .backlog_rcv = tcp_v4_do_rcv, 3505 .release_cb = tcp_release_cb, 3506 .hash = inet_hash, 3507 .unhash = inet_unhash, 3508 .get_port = inet_csk_get_port, 3509 .put_port = inet_put_port, 3510 #ifdef CONFIG_BPF_SYSCALL 3511 .psock_update_sk_prot = tcp_bpf_update_proto, 3512 #endif 3513 .enter_memory_pressure = tcp_enter_memory_pressure, 3514 .leave_memory_pressure = tcp_leave_memory_pressure, 3515 .stream_memory_free = tcp_stream_memory_free, 3516 .sockets_allocated = &tcp_sockets_allocated, 3517 3518 .memory_allocated = &net_aligned_data.tcp_memory_allocated, 3519 .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, 3520 3521 .memory_pressure = &tcp_memory_pressure, 3522 .sysctl_mem = sysctl_tcp_mem, 3523 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), 3524 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), 3525 .max_header = MAX_TCP_HEADER, 3526 .obj_size = sizeof(struct tcp_sock), 3527 .slab_flags = SLAB_TYPESAFE_BY_RCU, 3528 .twsk_prot = &tcp_timewait_sock_ops, 3529 .rsk_prot = &tcp_request_sock_ops, 3530 .h.hashinfo = NULL, 3531 .no_autobind = true, 3532 .diag_destroy = tcp_abort, 3533 }; 3534 EXPORT_SYMBOL(tcp_prot); 3535 3536 
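/* tcp_stream_memory_free() above is what makes TCP_NOTSENT_LOWAT useful:
 * with @wake == 1, EPOLLOUT is only signalled again once the unsent
 * backlog drops below half of the configured limit.  A hedged userspace
 * sketch of the consumer side; the 128 kB value and the helper name are
 * illustrative only.
 */
#if 0	/* userspace sketch, not part of this kernel file */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/epoll.h>
#include <sys/socket.h>

/* Cap per-socket unsent data and wait for writability via epoll. */
static int limit_unsent(int fd)
{
	struct epoll_event ev = { .events = EPOLLOUT };
	int lowat = 128 * 1024;
	int epfd;

	if (setsockopt(fd, IPPROTO_TCP, TCP_NOTSENT_LOWAT,
		       &lowat, sizeof(lowat)) < 0)
		return -1;

	ev.data.fd = fd;
	epfd = epoll_create1(0);
	if (epfd < 0 || epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev) < 0)
		return -1;
	/* epoll_wait(epfd, ...) now reports EPOLLOUT only while
	 * (notsent_bytes << 1) < lowat, per tcp_stream_memory_free().
	 */
	return epfd;
}
#endif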
static void __net_exit tcp_sk_exit(struct net *net) 3537 { 3538 if (net->ipv4.tcp_congestion_control) 3539 bpf_module_put(net->ipv4.tcp_congestion_control, 3540 net->ipv4.tcp_congestion_control->owner); 3541 } 3542 3543 static void __net_init tcp_set_hashinfo(struct net *net) 3544 { 3545 struct inet_hashinfo *hinfo; 3546 unsigned int ehash_entries; 3547 struct net *old_net; 3548 3549 if (net_eq(net, &init_net)) 3550 goto fallback; 3551 3552 old_net = current->nsproxy->net_ns; 3553 ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries); 3554 if (!ehash_entries) 3555 goto fallback; 3556 3557 ehash_entries = roundup_pow_of_two(ehash_entries); 3558 hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries); 3559 if (!hinfo) { 3560 pr_warn("Failed to allocate TCP ehash (entries: %u) " 3561 "for a netns, fallback to the global one\n", 3562 ehash_entries); 3563 fallback: 3564 hinfo = &tcp_hashinfo; 3565 ehash_entries = tcp_hashinfo.ehash_mask + 1; 3566 } 3567 3568 net->ipv4.tcp_death_row.hashinfo = hinfo; 3569 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2; 3570 net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128); 3571 } 3572 3573 static int __net_init tcp_sk_init(struct net *net) 3574 { 3575 net->ipv4.sysctl_tcp_ecn = TCP_ECN_IN_ECN_OUT_NOECN; 3576 net->ipv4.sysctl_tcp_ecn_option = TCP_ACCECN_OPTION_FULL; 3577 net->ipv4.sysctl_tcp_ecn_option_beacon = TCP_ACCECN_OPTION_BEACON; 3578 net->ipv4.sysctl_tcp_ecn_fallback = 1; 3579 3580 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; 3581 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS; 3582 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; 3583 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL; 3584 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS; 3585 3586 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME; 3587 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES; 3588 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL; 3589 3590 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES; 3591 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES; 3592 net->ipv4.sysctl_tcp_syncookies = 1; 3593 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH; 3594 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1; 3595 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2; 3596 net->ipv4.sysctl_tcp_orphan_retries = 0; 3597 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; 3598 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; 3599 net->ipv4.sysctl_tcp_tw_reuse = 2; 3600 net->ipv4.sysctl_tcp_tw_reuse_delay = 1 * MSEC_PER_SEC; 3601 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1; 3602 3603 refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1); 3604 tcp_set_hashinfo(net); 3605 3606 net->ipv4.sysctl_tcp_sack = 1; 3607 net->ipv4.sysctl_tcp_window_scaling = 1; 3608 net->ipv4.sysctl_tcp_timestamps = 1; 3609 net->ipv4.sysctl_tcp_early_retrans = 3; 3610 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION; 3611 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */ 3612 net->ipv4.sysctl_tcp_retrans_collapse = 1; 3613 net->ipv4.sysctl_tcp_max_reordering = 300; 3614 net->ipv4.sysctl_tcp_dsack = 1; 3615 net->ipv4.sysctl_tcp_app_win = 31; 3616 net->ipv4.sysctl_tcp_adv_win_scale = 1; 3617 net->ipv4.sysctl_tcp_frto = 2; 3618 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1; 3619 /* This limits the percentage of the congestion window which we 3620 * will allow a single TSO frame to consume. Building TSO frames 3621 * which are too large can cause TCP streams to be bursty. 
3622 */ 3623 net->ipv4.sysctl_tcp_tso_win_divisor = 3; 3624 /* Default TSQ limit of 4 MB */ 3625 net->ipv4.sysctl_tcp_limit_output_bytes = 4 << 20; 3626 3627 /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */ 3628 net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX; 3629 3630 net->ipv4.sysctl_tcp_min_tso_segs = 2; 3631 net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */ 3632 net->ipv4.sysctl_tcp_min_rtt_wlen = 300; 3633 net->ipv4.sysctl_tcp_autocorking = 1; 3634 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; 3635 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; 3636 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120; 3637 if (net != &init_net) { 3638 memcpy(net->ipv4.sysctl_tcp_rmem, 3639 init_net.ipv4.sysctl_tcp_rmem, 3640 sizeof(init_net.ipv4.sysctl_tcp_rmem)); 3641 memcpy(net->ipv4.sysctl_tcp_wmem, 3642 init_net.ipv4.sysctl_tcp_wmem, 3643 sizeof(init_net.ipv4.sysctl_tcp_wmem)); 3644 } 3645 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC; 3646 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC; 3647 net->ipv4.sysctl_tcp_comp_sack_nr = 44; 3648 net->ipv4.sysctl_tcp_backlog_ack_defer = 1; 3649 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; 3650 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0; 3651 atomic_set(&net->ipv4.tfo_active_disable_times, 0); 3652 3653 /* Set default values for PLB */ 3654 net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */ 3655 net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3; 3656 net->ipv4.sysctl_tcp_plb_rehash_rounds = 12; 3657 net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60; 3658 /* Default congestion threshold for PLB to mark a round is 50% */ 3659 net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2; 3660 3661 /* Reno is always built in */ 3662 if (!net_eq(net, &init_net) && 3663 bpf_try_module_get(init_net.ipv4.tcp_congestion_control, 3664 init_net.ipv4.tcp_congestion_control->owner)) 3665 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control; 3666 else 3667 net->ipv4.tcp_congestion_control = &tcp_reno; 3668 3669 net->ipv4.sysctl_tcp_syn_linear_timeouts = 4; 3670 net->ipv4.sysctl_tcp_shrink_window = 0; 3671 3672 net->ipv4.sysctl_tcp_pingpong_thresh = 1; 3673 net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN); 3674 net->ipv4.sysctl_tcp_rto_max_ms = TCP_RTO_MAX_SEC * MSEC_PER_SEC; 3675 3676 return 0; 3677 } 3678 3679 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) 3680 { 3681 struct net *net; 3682 3683 /* make sure concurrent calls to tcp_sk_exit_batch from net_cleanup_work 3684 * and failed setup_net error unwinding path are serialized. 3685 * 3686 * tcp_twsk_purge() handles twsk in any dead netns, not just those in 3687 * net_exit_list, the thread that dismantles a particular twsk must 3688 * do so without other thread progressing to refcount_dec_and_test() of 3689 * tcp_death_row.tw_refcount. 
3690 */ 3691 mutex_lock(&tcp_exit_batch_mutex); 3692 3693 tcp_twsk_purge(net_exit_list); 3694 3695 list_for_each_entry(net, net_exit_list, exit_list) { 3696 inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo); 3697 WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount)); 3698 tcp_fastopen_ctx_destroy(net); 3699 } 3700 3701 mutex_unlock(&tcp_exit_batch_mutex); 3702 } 3703 3704 static struct pernet_operations __net_initdata tcp_sk_ops = { 3705 .init = tcp_sk_init, 3706 .exit = tcp_sk_exit, 3707 .exit_batch = tcp_sk_exit_batch, 3708 }; 3709 3710 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3711 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta, 3712 struct sock_common *sk_common, uid_t uid) 3713 3714 #define INIT_BATCH_SZ 16 3715 3716 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux) 3717 { 3718 struct bpf_tcp_iter_state *iter = priv_data; 3719 int err; 3720 3721 err = bpf_iter_init_seq_net(priv_data, aux); 3722 if (err) 3723 return err; 3724 3725 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ, GFP_USER); 3726 if (err) { 3727 bpf_iter_fini_seq_net(priv_data); 3728 return err; 3729 } 3730 3731 return 0; 3732 } 3733 3734 static void bpf_iter_fini_tcp(void *priv_data) 3735 { 3736 struct bpf_tcp_iter_state *iter = priv_data; 3737 3738 bpf_iter_fini_seq_net(priv_data); 3739 kvfree(iter->batch); 3740 } 3741 3742 static const struct bpf_iter_seq_info tcp_seq_info = { 3743 .seq_ops = &bpf_iter_tcp_seq_ops, 3744 .init_seq_private = bpf_iter_init_tcp, 3745 .fini_seq_private = bpf_iter_fini_tcp, 3746 .seq_priv_size = sizeof(struct bpf_tcp_iter_state), 3747 }; 3748 3749 static const struct bpf_func_proto * 3750 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id, 3751 const struct bpf_prog *prog) 3752 { 3753 switch (func_id) { 3754 case BPF_FUNC_setsockopt: 3755 return &bpf_sk_setsockopt_proto; 3756 case BPF_FUNC_getsockopt: 3757 return &bpf_sk_getsockopt_proto; 3758 default: 3759 return NULL; 3760 } 3761 } 3762 3763 static struct bpf_iter_reg tcp_reg_info = { 3764 .target = "tcp", 3765 .ctx_arg_info_size = 1, 3766 .ctx_arg_info = { 3767 { offsetof(struct bpf_iter__tcp, sk_common), 3768 PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED }, 3769 }, 3770 .get_func_proto = bpf_iter_tcp_get_func_proto, 3771 .seq_info = &tcp_seq_info, 3772 }; 3773 3774 static void __init bpf_iter_register(void) 3775 { 3776 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON]; 3777 if (bpf_iter_reg_target(&tcp_reg_info)) 3778 pr_warn("Warning: could not register bpf iterator tcp\n"); 3779 } 3780 3781 #endif 3782 3783 void __init tcp_v4_init(void) 3784 { 3785 int cpu, res; 3786 3787 for_each_possible_cpu(cpu) { 3788 struct sock *sk; 3789 3790 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW, 3791 IPPROTO_TCP, &init_net); 3792 if (res) 3793 panic("Failed to create the TCP control socket.\n"); 3794 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); 3795 3796 /* Please enforce IP_DF and IPID==0 for RST and 3797 * ACK sent in SYN-RECV and TIME-WAIT state. 3798 */ 3799 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO; 3800 3801 sk->sk_clockid = CLOCK_MONOTONIC; 3802 3803 per_cpu(ipv4_tcp_sk.sock, cpu) = sk; 3804 } 3805 if (register_pernet_subsys(&tcp_sk_ops)) 3806 panic("Failed to create the TCP control socket.\n"); 3807 3808 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3809 bpf_iter_register(); 3810 #endif 3811 } 3812
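/* tcp_set_hashinfo() above sizes a child namespace's established hash from
 * the creating namespace's net.ipv4.tcp_child_ehash_entries sysctl
 * (rounded up to a power of two), falling back to the global table when it
 * is zero or the allocation fails; max_tw_buckets and max_syn_backlog are
 * then derived from the result.  A hedged userspace sketch of exercising
 * that path; the /proc path is inferred from the sysctl name, the 16384
 * value is illustrative, and unshare(CLONE_NEWNET) needs CAP_SYS_ADMIN.
 */
#if 0	/* userspace sketch, not part of this kernel file */
#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	const char *knob = "/proc/sys/net/ipv4/tcp_child_ehash_entries";
	int fd = open(knob, O_WRONLY);

	/* Request a dedicated 16k-entry ehash for the next child netns. */
	if (fd < 0 || write(fd, "16384", 5) != 5) {
		perror("tcp_child_ehash_entries");
		return 1;
	}
	close(fd);

	/* tcp_set_hashinfo() reads the parent's sysctl when the new
	 * namespace is set up, so the value must be written first.
	 */
	if (unshare(CLONE_NEWNET)) {
		perror("unshare");
		return 1;
	}
	return 0;
}
#endif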