// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 */

/*
 * Changes:
 *	David S. Miller		:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *	David S. Miller		:	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *	Andi Kleen		:	Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *	Andi Kleen		:	Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *	Mike McLagan		:	Routing by source
 *	Juan Jose Ciarlante	:	ip_dynaddr bits
 *	Andi Kleen		:	various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sock_diag.h>

#include <net/aligned_data.h>
#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/inet_ecn.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>
#include <net/rstreason.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/inetdevice.h>
#include <linux/btf_ids.h>
#include <linux/skbuff_ref.h>

#include <crypto/hash.h>
#include <linux/scatterlist.h>

#include <trace/events/tcp.h>

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;

static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = {
	.bh_lock = INIT_LOCAL_LOCK(bh_lock),
};

static DEFINE_MUTEX(tcp_exit_batch_mutex);

static u32 tcp_v4_init_seq(const struct sk_buff *skb)
{
	return secure_tcp_seq(ip_hdr(skb)->daddr,
			      ip_hdr(skb)->saddr,
			      tcp_hdr(skb)->dest,
			      tcp_hdr(skb)->source);
}

static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
{
	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
}

int
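/*
 * Decide whether an existing TIME-WAIT socket occupying the wanted
 * four-tuple may be reused for a new outgoing connection: returning 1
 * lets the caller take over the tuple, returning 0 makes the connect
 * attempt fail.  The checks below are driven by sysctl_tcp_tw_reuse,
 * the configured reuse delay and the timestamps cached in the TIME-WAIT
 * bucket; the new write_seq is placed 65535 + 2 bytes past tw_snd_nxt,
 * keeping it clear of the previous incarnation's sequence space.
 */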
tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) 119 { 120 int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse); 121 const struct inet_timewait_sock *tw = inet_twsk(sktw); 122 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw); 123 struct tcp_sock *tp = tcp_sk(sk); 124 int ts_recent_stamp; 125 u32 reuse_thresh; 126 127 if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2) 128 reuse = 0; 129 130 if (reuse == 2) { 131 /* Still does not detect *everything* that goes through 132 * lo, since we require a loopback src or dst address 133 * or direct binding to 'lo' interface. 134 */ 135 bool loopback = false; 136 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX) 137 loopback = true; 138 #if IS_ENABLED(CONFIG_IPV6) 139 if (tw->tw_family == AF_INET6) { 140 if (ipv6_addr_loopback(&tw->tw_v6_daddr) || 141 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) || 142 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) || 143 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr)) 144 loopback = true; 145 } else 146 #endif 147 { 148 if (ipv4_is_loopback(tw->tw_daddr) || 149 ipv4_is_loopback(tw->tw_rcv_saddr)) 150 loopback = true; 151 } 152 if (!loopback) 153 reuse = 0; 154 } 155 156 /* With PAWS, it is safe from the viewpoint 157 of data integrity. Even without PAWS it is safe provided sequence 158 spaces do not overlap i.e. at data rates <= 80Mbit/sec. 159 160 Actually, the idea is close to VJ's one, only timestamp cache is 161 held not per host, but per port pair and TW bucket is used as state 162 holder. 163 164 If TW bucket has been already destroyed we fall back to VJ's scheme 165 and use initial timestamp retrieved from peer table. 166 */ 167 ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp); 168 reuse_thresh = READ_ONCE(tw->tw_entry_stamp) + 169 READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse_delay); 170 if (ts_recent_stamp && 171 (!twp || (reuse && time_after32(tcp_clock_ms(), reuse_thresh)))) { 172 /* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk 173 * and releasing the bucket lock. 174 */ 175 if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt))) 176 return 0; 177 178 /* In case of repair and re-using TIME-WAIT sockets we still 179 * want to be sure that it is safe as above but honor the 180 * sequence numbers and time stamps set as part of the repair 181 * process. 182 * 183 * Without this check re-using a TIME-WAIT socket with TCP 184 * repair would accumulate a -1 on the repair assigned 185 * sequence number. The first time it is reused the sequence 186 * is -1, the second time -2, etc. This fixes that issue 187 * without appearing to create any others. 188 */ 189 if (likely(!tp->repair)) { 190 u32 seq = tcptw->tw_snd_nxt + 65535 + 2; 191 192 if (!seq) 193 seq = 1; 194 WRITE_ONCE(tp->write_seq, seq); 195 tp->rx_opt.ts_recent = READ_ONCE(tcptw->tw_ts_recent); 196 tp->rx_opt.ts_recent_stamp = ts_recent_stamp; 197 } 198 199 return 1; 200 } 201 202 return 0; 203 } 204 EXPORT_IPV6_MOD_GPL(tcp_twsk_unique); 205 206 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr, 207 int addr_len) 208 { 209 /* This check is replicated from tcp_v4_connect() and intended to 210 * prevent BPF program called below from accessing bytes that are out 211 * of the bound specified by user in addr_len. 212 */ 213 if (addr_len < sizeof(struct sockaddr_in)) 214 return -EINVAL; 215 216 sock_owned_by_me(sk); 217 218 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len); 219 } 220 221 /* This will initiate an outgoing connection. 
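 *
 * Roughly: validate the address, resolve a route (honouring a source
 * routing option if present), pick a source address, move the socket to
 * SYN-SENT while inet_hash_connect() selects a source port, derive the
 * initial sequence number and timestamp offset, and finally hand off to
 * tcp_connect() to send the SYN.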
*/ 222 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) 223 { 224 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; 225 struct inet_timewait_death_row *tcp_death_row; 226 struct inet_sock *inet = inet_sk(sk); 227 struct tcp_sock *tp = tcp_sk(sk); 228 struct ip_options_rcu *inet_opt; 229 struct net *net = sock_net(sk); 230 __be16 orig_sport, orig_dport; 231 __be32 daddr, nexthop; 232 struct flowi4 *fl4; 233 struct rtable *rt; 234 int err; 235 236 if (addr_len < sizeof(struct sockaddr_in)) 237 return -EINVAL; 238 239 if (usin->sin_family != AF_INET) 240 return -EAFNOSUPPORT; 241 242 nexthop = daddr = usin->sin_addr.s_addr; 243 inet_opt = rcu_dereference_protected(inet->inet_opt, 244 lockdep_sock_is_held(sk)); 245 if (inet_opt && inet_opt->opt.srr) { 246 if (!daddr) 247 return -EINVAL; 248 nexthop = inet_opt->opt.faddr; 249 } 250 251 orig_sport = inet->inet_sport; 252 orig_dport = usin->sin_port; 253 fl4 = &inet->cork.fl.u.ip4; 254 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr, 255 sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport, 256 orig_dport, sk); 257 if (IS_ERR(rt)) { 258 err = PTR_ERR(rt); 259 if (err == -ENETUNREACH) 260 IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); 261 return err; 262 } 263 264 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { 265 ip_rt_put(rt); 266 return -ENETUNREACH; 267 } 268 269 if (!inet_opt || !inet_opt->opt.srr) 270 daddr = fl4->daddr; 271 272 tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; 273 274 if (!inet->inet_saddr) { 275 err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET); 276 if (err) { 277 ip_rt_put(rt); 278 return err; 279 } 280 } else { 281 sk_rcv_saddr_set(sk, inet->inet_saddr); 282 } 283 284 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) { 285 /* Reset inherited state */ 286 tp->rx_opt.ts_recent = 0; 287 tp->rx_opt.ts_recent_stamp = 0; 288 if (likely(!tp->repair)) 289 WRITE_ONCE(tp->write_seq, 0); 290 } 291 292 inet->inet_dport = usin->sin_port; 293 sk_daddr_set(sk, daddr); 294 295 inet_csk(sk)->icsk_ext_hdr_len = 0; 296 if (inet_opt) 297 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 298 299 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT; 300 301 /* Socket identity is still unknown (sport may be zero). 302 * However we set state to SYN-SENT and not releasing socket 303 * lock select source port, enter ourselves into the hash tables and 304 * complete initialization after this. 305 */ 306 tcp_set_state(sk, TCP_SYN_SENT); 307 err = inet_hash_connect(tcp_death_row, sk); 308 if (err) 309 goto failure; 310 311 sk_set_txhash(sk); 312 313 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport, 314 inet->inet_sport, inet->inet_dport, sk); 315 if (IS_ERR(rt)) { 316 err = PTR_ERR(rt); 317 rt = NULL; 318 goto failure; 319 } 320 tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst); 321 /* OK, now commit destination to socket. 
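	 * The route is attached to the socket via sk_setup_caps() and,
	 * unless the socket is being repaired, write_seq and the timestamp
	 * offset are derived from the secure sequence helpers below.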
	 */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	rt = NULL;

	if (likely(!tp->repair)) {
		if (!tp->write_seq)
			WRITE_ONCE(tp->write_seq,
				   secure_tcp_seq(inet->inet_saddr,
						  inet->inet_daddr,
						  inet->inet_sport,
						  usin->sin_port));
		WRITE_ONCE(tp->tsoffset,
			   secure_tcp_ts_off(net, inet->inet_saddr,
					     inet->inet_daddr));
	}

	atomic_set(&inet->inet_id, get_random_u16());

	if (tcp_fastopen_defer_connect(sk, &err))
		return err;
	if (err)
		goto failure;

	err = tcp_connect(sk);

	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	inet_bhash2_reset_saddr(sk);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_IPV6_MOD(tcp_v4_connect);

/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC 1191.
 * It can be called through tcp_release_cb() if the socket was owned by the user
 * at the time tcp_v4_err() was called to handle the ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct dst_entry *dst;
	u32 mtu;

	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
		return;
	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to go wrong... Remember the soft error
	 * in case this connection will not be able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_IPV6_MOD(tcp_v4_mtu_reduced);

static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}


/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);

	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	 */
	if (seq != tcp_rsk(req)->snt_isn) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
	} else if (abort) {
		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
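		 * The drop below is also accounted on the listener via
		 * tcp_listendrop().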
433 */ 434 inet_csk_reqsk_queue_drop(req->rsk_listener, req); 435 tcp_listendrop(req->rsk_listener); 436 } 437 reqsk_put(req); 438 } 439 EXPORT_IPV6_MOD(tcp_req_err); 440 441 /* TCP-LD (RFC 6069) logic */ 442 void tcp_ld_RTO_revert(struct sock *sk, u32 seq) 443 { 444 struct inet_connection_sock *icsk = inet_csk(sk); 445 struct tcp_sock *tp = tcp_sk(sk); 446 struct sk_buff *skb; 447 s32 remaining; 448 u32 delta_us; 449 450 if (sock_owned_by_user(sk)) 451 return; 452 453 if (seq != tp->snd_una || !icsk->icsk_retransmits || 454 !icsk->icsk_backoff) 455 return; 456 457 skb = tcp_rtx_queue_head(sk); 458 if (WARN_ON_ONCE(!skb)) 459 return; 460 461 icsk->icsk_backoff--; 462 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT; 463 icsk->icsk_rto = inet_csk_rto_backoff(icsk, tcp_rto_max(sk)); 464 465 tcp_mstamp_refresh(tp); 466 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb)); 467 remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us); 468 469 if (remaining > 0) { 470 tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, remaining, false); 471 } else { 472 /* RTO revert clocked out retransmission. 473 * Will retransmit now. 474 */ 475 tcp_retransmit_timer(sk); 476 } 477 } 478 EXPORT_IPV6_MOD(tcp_ld_RTO_revert); 479 480 /* 481 * This routine is called by the ICMP module when it gets some 482 * sort of error condition. If err < 0 then the socket should 483 * be closed and the error returned to the user. If err > 0 484 * it's just the icmp type << 8 | icmp code. After adjustment 485 * header points to the first 8 bytes of the tcp header. We need 486 * to find the appropriate port. 487 * 488 * The locking strategy used here is very "optimistic". When 489 * someone else accesses the socket the ICMP is just dropped 490 * and for some paths there is no check at all. 491 * A more general error queue to queue errors for later handling 492 * is probably better. 493 * 494 */ 495 496 int tcp_v4_err(struct sk_buff *skb, u32 info) 497 { 498 const struct iphdr *iph = (const struct iphdr *)skb->data; 499 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2)); 500 struct net *net = dev_net_rcu(skb->dev); 501 const int type = icmp_hdr(skb)->type; 502 const int code = icmp_hdr(skb)->code; 503 struct request_sock *fastopen; 504 struct tcp_sock *tp; 505 u32 seq, snd_una; 506 struct sock *sk; 507 int err; 508 509 sk = __inet_lookup_established(net, iph->daddr, th->dest, iph->saddr, 510 ntohs(th->source), inet_iif(skb), 0); 511 if (!sk) { 512 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); 513 return -ENOENT; 514 } 515 if (sk->sk_state == TCP_TIME_WAIT) { 516 /* To increase the counter of ignored icmps for TCP-AO */ 517 tcp_ao_ignore_icmp(sk, AF_INET, type, code); 518 inet_twsk_put(inet_twsk(sk)); 519 return 0; 520 } 521 seq = ntohl(th->seq); 522 if (sk->sk_state == TCP_NEW_SYN_RECV) { 523 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB || 524 type == ICMP_TIME_EXCEEDED || 525 (type == ICMP_DEST_UNREACH && 526 (code == ICMP_NET_UNREACH || 527 code == ICMP_HOST_UNREACH))); 528 return 0; 529 } 530 531 if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) { 532 sock_put(sk); 533 return 0; 534 } 535 536 bh_lock_sock(sk); 537 /* If too many ICMPs get dropped on busy 538 * servers this needs to be solved differently. 539 * We do take care of PMTU discovery (RFC1191) special case : 540 * we can receive locally generated ICMP messages while socket is held. 
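	 * When the socket is owned by user context, other ICMP types are
	 * counted in LINUX_MIB_LOCKDROPPEDICMPS below, while the
	 * ICMP_FRAG_NEEDED handling is deferred to tcp_v4_mtu_reduced()
	 * through the TCP_MTU_REDUCED_DEFERRED flag.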
541 */ 542 if (sock_owned_by_user(sk)) { 543 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)) 544 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS); 545 } 546 if (sk->sk_state == TCP_CLOSE) 547 goto out; 548 549 if (static_branch_unlikely(&ip4_min_ttl)) { 550 /* min_ttl can be changed concurrently from do_ip_setsockopt() */ 551 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) { 552 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 553 goto out; 554 } 555 } 556 557 tp = tcp_sk(sk); 558 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ 559 fastopen = rcu_dereference(tp->fastopen_rsk); 560 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una; 561 if (sk->sk_state != TCP_LISTEN && 562 !between(seq, snd_una, tp->snd_nxt)) { 563 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); 564 goto out; 565 } 566 567 switch (type) { 568 case ICMP_REDIRECT: 569 if (!sock_owned_by_user(sk)) 570 do_redirect(skb, sk); 571 goto out; 572 case ICMP_SOURCE_QUENCH: 573 /* Just silently ignore these. */ 574 goto out; 575 case ICMP_PARAMETERPROB: 576 err = EPROTO; 577 break; 578 case ICMP_DEST_UNREACH: 579 if (code > NR_ICMP_UNREACH) 580 goto out; 581 582 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ 583 /* We are not interested in TCP_LISTEN and open_requests 584 * (SYN-ACKs send out by Linux are always <576bytes so 585 * they should go through unfragmented). 586 */ 587 if (sk->sk_state == TCP_LISTEN) 588 goto out; 589 590 WRITE_ONCE(tp->mtu_info, info); 591 if (!sock_owned_by_user(sk)) { 592 tcp_v4_mtu_reduced(sk); 593 } else { 594 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags)) 595 sock_hold(sk); 596 } 597 goto out; 598 } 599 600 err = icmp_err_convert[code].errno; 601 /* check if this ICMP message allows revert of backoff. 602 * (see RFC 6069) 603 */ 604 if (!fastopen && 605 (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH)) 606 tcp_ld_RTO_revert(sk, seq); 607 break; 608 case ICMP_TIME_EXCEEDED: 609 err = EHOSTUNREACH; 610 break; 611 default: 612 goto out; 613 } 614 615 switch (sk->sk_state) { 616 case TCP_SYN_SENT: 617 case TCP_SYN_RECV: 618 /* Only in fast or simultaneous open. If a fast open socket is 619 * already accepted it is treated as a connected one below. 620 */ 621 if (fastopen && !fastopen->sk) 622 break; 623 624 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th); 625 626 if (!sock_owned_by_user(sk)) 627 tcp_done_with_error(sk, err); 628 else 629 WRITE_ONCE(sk->sk_err_soft, err); 630 goto out; 631 } 632 633 /* If we've already connected we will keep trying 634 * until we time out, or the user gives up. 635 * 636 * rfc1122 4.2.3.9 allows to consider as hard errors 637 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too, 638 * but it is obsoleted by pmtu discovery). 639 * 640 * Note, that in modern internet, where routing is unreliable 641 * and in each dark corner broken firewalls sit, sending random 642 * errors ordered by their masters even this two messages finally lose 643 * their original sense (even Linux sends invalid PORT_UNREACHs) 644 * 645 * Now we are in compliance with RFCs. 
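	 * Below, the error is delivered through sk_err and sk_error_report()
	 * only when the socket is not owned by user context and IP_RECVERR
	 * is set; otherwise it is merely recorded in sk_err_soft.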
646 * --ANK (980905) 647 */ 648 649 if (!sock_owned_by_user(sk) && 650 inet_test_bit(RECVERR, sk)) { 651 WRITE_ONCE(sk->sk_err, err); 652 sk_error_report(sk); 653 } else { /* Only an error on timeout */ 654 WRITE_ONCE(sk->sk_err_soft, err); 655 } 656 657 out: 658 bh_unlock_sock(sk); 659 sock_put(sk); 660 return 0; 661 } 662 663 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr) 664 { 665 struct tcphdr *th = tcp_hdr(skb); 666 667 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0); 668 skb->csum_start = skb_transport_header(skb) - skb->head; 669 skb->csum_offset = offsetof(struct tcphdr, check); 670 } 671 672 /* This routine computes an IPv4 TCP checksum. */ 673 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb) 674 { 675 const struct inet_sock *inet = inet_sk(sk); 676 677 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr); 678 } 679 EXPORT_IPV6_MOD(tcp_v4_send_check); 680 681 #define REPLY_OPTIONS_LEN (MAX_TCP_OPTION_SPACE / sizeof(__be32)) 682 683 static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb, 684 const struct tcp_ao_hdr *aoh, 685 struct ip_reply_arg *arg, struct tcphdr *reply, 686 __be32 reply_options[REPLY_OPTIONS_LEN]) 687 { 688 #ifdef CONFIG_TCP_AO 689 int sdif = tcp_v4_sdif(skb); 690 int dif = inet_iif(skb); 691 int l3index = sdif ? dif : 0; 692 bool allocated_traffic_key; 693 struct tcp_ao_key *key; 694 char *traffic_key; 695 bool drop = true; 696 u32 ao_sne = 0; 697 u8 keyid; 698 699 rcu_read_lock(); 700 if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq), 701 &key, &traffic_key, &allocated_traffic_key, 702 &keyid, &ao_sne)) 703 goto out; 704 705 reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) | 706 (aoh->rnext_keyid << 8) | keyid); 707 arg->iov[0].iov_len += tcp_ao_len_aligned(key); 708 reply->doff = arg->iov[0].iov_len / 4; 709 710 if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1], 711 key, traffic_key, 712 (union tcp_ao_addr *)&ip_hdr(skb)->saddr, 713 (union tcp_ao_addr *)&ip_hdr(skb)->daddr, 714 reply, ao_sne)) 715 goto out; 716 drop = false; 717 out: 718 rcu_read_unlock(); 719 if (allocated_traffic_key) 720 kfree(traffic_key); 721 return drop; 722 #else 723 return true; 724 #endif 725 } 726 727 /* 728 * This routine will send an RST to the other tcp. 729 * 730 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.) 731 * for reset. 732 * Answer: if a packet caused RST, it is not for a socket 733 * existing in our system, if it is matched to a socket, 734 * it is just duplicate segment or bug in other side's TCP. 735 * So that we build reply only basing on parameters 736 * arrived with segment. 737 * Exception: precedence violation. We do not implement it in any case. 738 */ 739 740 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb, 741 enum sk_rst_reason reason) 742 { 743 const struct tcphdr *th = tcp_hdr(skb); 744 struct { 745 struct tcphdr th; 746 __be32 opt[REPLY_OPTIONS_LEN]; 747 } rep; 748 const __u8 *md5_hash_location = NULL; 749 const struct tcp_ao_hdr *aoh; 750 struct ip_reply_arg arg; 751 #ifdef CONFIG_TCP_MD5SIG 752 struct tcp_md5sig_key *key = NULL; 753 unsigned char newhash[16]; 754 struct sock *sk1 = NULL; 755 int genhash; 756 #endif 757 u64 transmit_time = 0; 758 struct sock *ctl_sk; 759 struct net *net; 760 u32 txhash = 0; 761 762 /* Never send a reset in response to a reset. */ 763 if (th->rst) 764 return; 765 766 /* If sk not NULL, it means we did a successful lookup and incoming 767 * route had to be correct. 
prequeue might have dropped our dst. 768 */ 769 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL) 770 return; 771 772 /* Swap the send and the receive. */ 773 memset(&rep, 0, sizeof(rep)); 774 rep.th.dest = th->source; 775 rep.th.source = th->dest; 776 rep.th.doff = sizeof(struct tcphdr) / 4; 777 rep.th.rst = 1; 778 779 if (th->ack) { 780 rep.th.seq = th->ack_seq; 781 } else { 782 rep.th.ack = 1; 783 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin + 784 skb->len - (th->doff << 2)); 785 } 786 787 memset(&arg, 0, sizeof(arg)); 788 arg.iov[0].iov_base = (unsigned char *)&rep; 789 arg.iov[0].iov_len = sizeof(rep.th); 790 791 net = sk ? sock_net(sk) : skb_dst_dev_net_rcu(skb); 792 793 /* Invalid TCP option size or twice included auth */ 794 if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh)) 795 return; 796 797 if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt)) 798 return; 799 800 #ifdef CONFIG_TCP_MD5SIG 801 rcu_read_lock(); 802 if (sk && sk_fullsock(sk)) { 803 const union tcp_md5_addr *addr; 804 int l3index; 805 806 /* sdif set, means packet ingressed via a device 807 * in an L3 domain and inet_iif is set to it. 808 */ 809 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; 810 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 811 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 812 } else if (md5_hash_location) { 813 const union tcp_md5_addr *addr; 814 int sdif = tcp_v4_sdif(skb); 815 int dif = inet_iif(skb); 816 int l3index; 817 818 /* 819 * active side is lost. Try to find listening socket through 820 * source port, and then find md5 key through listening socket. 821 * we are not loose security here: 822 * Incoming packet is checked with md5 hash with finding key, 823 * no RST generated if md5 hash doesn't match. 824 */ 825 sk1 = __inet_lookup_listener(net, NULL, 0, ip_hdr(skb)->saddr, 826 th->source, ip_hdr(skb)->daddr, 827 ntohs(th->source), dif, sdif); 828 /* don't send rst if it can't find key */ 829 if (!sk1) 830 goto out; 831 832 /* sdif set, means packet ingressed via a device 833 * in an L3 domain and dif is set to it. 834 */ 835 l3index = sdif ? dif : 0; 836 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 837 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET); 838 if (!key) 839 goto out; 840 841 842 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb); 843 if (genhash || memcmp(md5_hash_location, newhash, 16) != 0) 844 goto out; 845 846 } 847 848 if (key) { 849 rep.opt[0] = htonl((TCPOPT_NOP << 24) | 850 (TCPOPT_NOP << 16) | 851 (TCPOPT_MD5SIG << 8) | 852 TCPOLEN_MD5SIG); 853 /* Update length and the length the header thinks exists */ 854 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; 855 rep.th.doff = arg.iov[0].iov_len / 4; 856 857 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1], 858 key, ip_hdr(skb)->saddr, 859 ip_hdr(skb)->daddr, &rep.th); 860 } 861 #endif 862 /* Can't co-exist with TCPMD5, hence check rep.opt[0] */ 863 if (rep.opt[0] == 0) { 864 __be32 mrst = mptcp_reset_option(skb); 865 866 if (mrst) { 867 rep.opt[0] = mrst; 868 arg.iov[0].iov_len += sizeof(mrst); 869 rep.th.doff = arg.iov[0].iov_len / 4; 870 } 871 } 872 873 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 874 ip_hdr(skb)->saddr, /* XXX */ 875 arg.iov[0].iov_len, IPPROTO_TCP, 0); 876 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 877 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0; 878 879 /* When socket is gone, all binding information is lost. 880 * routing might fail in this case. 
No choice here, if we choose to force 881 * input interface, we will misroute in case of asymmetric route. 882 */ 883 if (sk) 884 arg.bound_dev_if = sk->sk_bound_dev_if; 885 886 trace_tcp_send_reset(sk, skb, reason); 887 888 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) != 889 offsetof(struct inet_timewait_sock, tw_bound_dev_if)); 890 891 /* ECN bits of TW reset are cleared */ 892 arg.tos = ip_hdr(skb)->tos & ~INET_ECN_MASK; 893 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); 894 local_bh_disable(); 895 local_lock_nested_bh(&ipv4_tcp_sk.bh_lock); 896 ctl_sk = this_cpu_read(ipv4_tcp_sk.sock); 897 898 sock_net_set(ctl_sk, net); 899 if (sk) { 900 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 901 inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark); 902 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? 903 inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority); 904 transmit_time = tcp_transmit_time(sk); 905 xfrm_sk_clone_policy(ctl_sk, sk); 906 txhash = (sk->sk_state == TCP_TIME_WAIT) ? 907 inet_twsk(sk)->tw_txhash : sk->sk_txhash; 908 } else { 909 ctl_sk->sk_mark = 0; 910 ctl_sk->sk_priority = 0; 911 } 912 ip_send_unicast_reply(ctl_sk, sk, 913 skb, &TCP_SKB_CB(skb)->header.h4.opt, 914 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 915 &arg, arg.iov[0].iov_len, 916 transmit_time, txhash); 917 918 xfrm_sk_free_policy(ctl_sk); 919 sock_net_set(ctl_sk, &init_net); 920 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); 921 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS); 922 local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock); 923 local_bh_enable(); 924 925 #ifdef CONFIG_TCP_MD5SIG 926 out: 927 rcu_read_unlock(); 928 #endif 929 } 930 931 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states 932 outside socket context is ugly, certainly. What can I do? 933 */ 934 935 static void tcp_v4_send_ack(const struct sock *sk, 936 struct sk_buff *skb, u32 seq, u32 ack, 937 u32 win, u32 tsval, u32 tsecr, int oif, 938 struct tcp_key *key, 939 int reply_flags, u8 tos, u32 txhash) 940 { 941 const struct tcphdr *th = tcp_hdr(skb); 942 struct { 943 struct tcphdr th; 944 __be32 opt[(MAX_TCP_OPTION_SPACE >> 2)]; 945 } rep; 946 struct net *net = sock_net(sk); 947 struct ip_reply_arg arg; 948 struct sock *ctl_sk; 949 u64 transmit_time; 950 951 memset(&rep.th, 0, sizeof(struct tcphdr)); 952 memset(&arg, 0, sizeof(arg)); 953 954 arg.iov[0].iov_base = (unsigned char *)&rep; 955 arg.iov[0].iov_len = sizeof(rep.th); 956 if (tsecr) { 957 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | 958 (TCPOPT_TIMESTAMP << 8) | 959 TCPOLEN_TIMESTAMP); 960 rep.opt[1] = htonl(tsval); 961 rep.opt[2] = htonl(tsecr); 962 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED; 963 } 964 965 /* Swap the send and the receive. */ 966 rep.th.dest = th->source; 967 rep.th.source = th->dest; 968 rep.th.doff = arg.iov[0].iov_len / 4; 969 rep.th.seq = htonl(seq); 970 rep.th.ack_seq = htonl(ack); 971 rep.th.ack = 1; 972 rep.th.window = htons(win); 973 974 #ifdef CONFIG_TCP_MD5SIG 975 if (tcp_key_is_md5(key)) { 976 int offset = (tsecr) ? 3 : 0; 977 978 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) | 979 (TCPOPT_NOP << 16) | 980 (TCPOPT_MD5SIG << 8) | 981 TCPOLEN_MD5SIG); 982 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; 983 rep.th.doff = arg.iov[0].iov_len/4; 984 985 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset], 986 key->md5_key, ip_hdr(skb)->saddr, 987 ip_hdr(skb)->daddr, &rep.th); 988 } 989 #endif 990 #ifdef CONFIG_TCP_AO 991 if (tcp_key_is_ao(key)) { 992 int offset = (tsecr) ? 
3 : 0; 993 994 rep.opt[offset++] = htonl((TCPOPT_AO << 24) | 995 (tcp_ao_len(key->ao_key) << 16) | 996 (key->ao_key->sndid << 8) | 997 key->rcv_next); 998 arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key); 999 rep.th.doff = arg.iov[0].iov_len / 4; 1000 1001 tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset], 1002 key->ao_key, key->traffic_key, 1003 (union tcp_ao_addr *)&ip_hdr(skb)->saddr, 1004 (union tcp_ao_addr *)&ip_hdr(skb)->daddr, 1005 &rep.th, key->sne); 1006 } 1007 #endif 1008 arg.flags = reply_flags; 1009 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 1010 ip_hdr(skb)->saddr, /* XXX */ 1011 arg.iov[0].iov_len, IPPROTO_TCP, 0); 1012 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 1013 if (oif) 1014 arg.bound_dev_if = oif; 1015 arg.tos = tos; 1016 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL); 1017 local_bh_disable(); 1018 local_lock_nested_bh(&ipv4_tcp_sk.bh_lock); 1019 ctl_sk = this_cpu_read(ipv4_tcp_sk.sock); 1020 sock_net_set(ctl_sk, net); 1021 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 1022 inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark); 1023 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? 1024 inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority); 1025 transmit_time = tcp_transmit_time(sk); 1026 ip_send_unicast_reply(ctl_sk, sk, 1027 skb, &TCP_SKB_CB(skb)->header.h4.opt, 1028 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 1029 &arg, arg.iov[0].iov_len, 1030 transmit_time, txhash); 1031 1032 sock_net_set(ctl_sk, &init_net); 1033 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); 1034 local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock); 1035 local_bh_enable(); 1036 } 1037 1038 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb, 1039 enum tcp_tw_status tw_status) 1040 { 1041 struct inet_timewait_sock *tw = inet_twsk(sk); 1042 struct tcp_timewait_sock *tcptw = tcp_twsk(sk); 1043 struct tcp_key key = {}; 1044 u8 tos = tw->tw_tos; 1045 1046 /* Cleaning only ECN bits of TW ACKs of oow data or is paws_reject, 1047 * while not cleaning ECN bits of other TW ACKs to avoid these ACKs 1048 * being placed in a different service queues (Classic rather than L4S) 1049 */ 1050 if (tw_status == TCP_TW_ACK_OOW) 1051 tos &= ~INET_ECN_MASK; 1052 1053 #ifdef CONFIG_TCP_AO 1054 struct tcp_ao_info *ao_info; 1055 1056 if (static_branch_unlikely(&tcp_ao_needed.key)) { 1057 /* FIXME: the segment to-be-acked is not verified yet */ 1058 ao_info = rcu_dereference(tcptw->ao_info); 1059 if (ao_info) { 1060 const struct tcp_ao_hdr *aoh; 1061 1062 if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) { 1063 inet_twsk_put(tw); 1064 return; 1065 } 1066 1067 if (aoh) 1068 key.ao_key = tcp_ao_established_key(sk, ao_info, 1069 aoh->rnext_keyid, -1); 1070 } 1071 } 1072 if (key.ao_key) { 1073 struct tcp_ao_key *rnext_key; 1074 1075 key.traffic_key = snd_other_key(key.ao_key); 1076 key.sne = READ_ONCE(ao_info->snd_sne); 1077 rnext_key = READ_ONCE(ao_info->rnext_key); 1078 key.rcv_next = rnext_key->rcvid; 1079 key.type = TCP_KEY_AO; 1080 #else 1081 if (0) { 1082 #endif 1083 } else if (static_branch_tcp_md5()) { 1084 key.md5_key = tcp_twsk_md5_key(tcptw); 1085 if (key.md5_key) 1086 key.type = TCP_KEY_MD5; 1087 } 1088 1089 tcp_v4_send_ack(sk, skb, 1090 tcptw->tw_snd_nxt, READ_ONCE(tcptw->tw_rcv_nxt), 1091 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, 1092 tcp_tw_tsval(tcptw), 1093 READ_ONCE(tcptw->tw_ts_recent), 1094 tw->tw_bound_dev_if, &key, 1095 tw->tw_transparent ? 
IP_REPLY_ARG_NOSRCCHECK : 0, 1096 tos, 1097 tw->tw_txhash); 1098 1099 inet_twsk_put(tw); 1100 } 1101 1102 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, 1103 struct request_sock *req) 1104 { 1105 struct tcp_key key = {}; 1106 1107 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV 1108 * sk->sk_state == TCP_SYN_RECV -> for Fast Open. 1109 */ 1110 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 : 1111 tcp_sk(sk)->snd_nxt; 1112 1113 #ifdef CONFIG_TCP_AO 1114 if (static_branch_unlikely(&tcp_ao_needed.key) && 1115 tcp_rsk_used_ao(req)) { 1116 const union tcp_md5_addr *addr; 1117 const struct tcp_ao_hdr *aoh; 1118 int l3index; 1119 1120 /* Invalid TCP option size or twice included auth */ 1121 if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) 1122 return; 1123 if (!aoh) 1124 return; 1125 1126 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 1127 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; 1128 key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, 1129 aoh->rnext_keyid, -1); 1130 if (unlikely(!key.ao_key)) { 1131 /* Send ACK with any matching MKT for the peer */ 1132 key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1); 1133 /* Matching key disappeared (user removed the key?) 1134 * let the handshake timeout. 1135 */ 1136 if (!key.ao_key) { 1137 net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n", 1138 addr, 1139 ntohs(tcp_hdr(skb)->source), 1140 &ip_hdr(skb)->daddr, 1141 ntohs(tcp_hdr(skb)->dest)); 1142 return; 1143 } 1144 } 1145 key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC); 1146 if (!key.traffic_key) 1147 return; 1148 1149 key.type = TCP_KEY_AO; 1150 key.rcv_next = aoh->keyid; 1151 tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req); 1152 #else 1153 if (0) { 1154 #endif 1155 } else if (static_branch_tcp_md5()) { 1156 const union tcp_md5_addr *addr; 1157 int l3index; 1158 1159 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 1160 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; 1161 key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1162 if (key.md5_key) 1163 key.type = TCP_KEY_MD5; 1164 } 1165 1166 /* Cleaning ECN bits of TW ACKs of oow data or is paws_reject */ 1167 tcp_v4_send_ack(sk, skb, seq, 1168 tcp_rsk(req)->rcv_nxt, 1169 tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale, 1170 tcp_rsk_tsval(tcp_rsk(req)), 1171 req->ts_recent, 1172 0, &key, 1173 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0, 1174 ip_hdr(skb)->tos & ~INET_ECN_MASK, 1175 READ_ONCE(tcp_rsk(req)->txhash)); 1176 if (tcp_key_is_ao(&key)) 1177 kfree(key.traffic_key); 1178 } 1179 1180 /* 1181 * Send a SYN-ACK after having received a SYN. 1182 * This still operates on a request_sock only, not on a big 1183 * socket. 1184 */ 1185 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, 1186 struct flowi *fl, 1187 struct request_sock *req, 1188 struct tcp_fastopen_cookie *foc, 1189 enum tcp_synack_type synack_type, 1190 struct sk_buff *syn_skb) 1191 { 1192 const struct inet_request_sock *ireq = inet_rsk(req); 1193 struct flowi4 fl4; 1194 int err = -1; 1195 struct sk_buff *skb; 1196 u8 tos; 1197 1198 /* First, grab a route. 
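	 * If the caller did not pass a cached dst, route the request here;
	 * the SYN-ACK built by tcp_make_synack() is then transmitted with
	 * ip_build_and_send_pkt(), reflecting the TOS of the incoming SYN
	 * when sysctl_tcp_reflect_tos is enabled.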
*/ 1199 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) 1200 return -1; 1201 1202 skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb); 1203 1204 if (skb) { 1205 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); 1206 1207 tos = READ_ONCE(inet_sk(sk)->tos); 1208 1209 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)) 1210 tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) | 1211 (tos & INET_ECN_MASK); 1212 1213 if (!INET_ECN_is_capable(tos) && 1214 tcp_bpf_ca_needs_ecn((struct sock *)req)) 1215 tos |= INET_ECN_ECT_0; 1216 1217 rcu_read_lock(); 1218 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, 1219 ireq->ir_rmt_addr, 1220 rcu_dereference(ireq->ireq_opt), 1221 tos); 1222 rcu_read_unlock(); 1223 err = net_xmit_eval(err); 1224 } 1225 1226 return err; 1227 } 1228 1229 /* 1230 * IPv4 request_sock destructor. 1231 */ 1232 static void tcp_v4_reqsk_destructor(struct request_sock *req) 1233 { 1234 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1)); 1235 } 1236 1237 #ifdef CONFIG_TCP_MD5SIG 1238 /* 1239 * RFC2385 MD5 checksumming requires a mapping of 1240 * IP address->MD5 Key. 1241 * We need to maintain these in the sk structure. 1242 */ 1243 1244 DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ); 1245 EXPORT_IPV6_MOD(tcp_md5_needed); 1246 1247 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new) 1248 { 1249 if (!old) 1250 return true; 1251 1252 /* l3index always overrides non-l3index */ 1253 if (old->l3index && new->l3index == 0) 1254 return false; 1255 if (old->l3index == 0 && new->l3index) 1256 return true; 1257 1258 return old->prefixlen < new->prefixlen; 1259 } 1260 1261 /* Find the Key structure for an address. */ 1262 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index, 1263 const union tcp_md5_addr *addr, 1264 int family, bool any_l3index) 1265 { 1266 const struct tcp_sock *tp = tcp_sk(sk); 1267 struct tcp_md5sig_key *key; 1268 const struct tcp_md5sig_info *md5sig; 1269 __be32 mask; 1270 struct tcp_md5sig_key *best_match = NULL; 1271 bool match; 1272 1273 /* caller either holds rcu_read_lock() or socket lock */ 1274 md5sig = rcu_dereference_check(tp->md5sig_info, 1275 lockdep_sock_is_held(sk)); 1276 if (!md5sig) 1277 return NULL; 1278 1279 hlist_for_each_entry_rcu(key, &md5sig->head, node, 1280 lockdep_sock_is_held(sk)) { 1281 if (key->family != family) 1282 continue; 1283 if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX && 1284 key->l3index != l3index) 1285 continue; 1286 if (family == AF_INET) { 1287 mask = inet_make_mask(key->prefixlen); 1288 match = (key->addr.a4.s_addr & mask) == 1289 (addr->a4.s_addr & mask); 1290 #if IS_ENABLED(CONFIG_IPV6) 1291 } else if (family == AF_INET6) { 1292 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6, 1293 key->prefixlen); 1294 #endif 1295 } else { 1296 match = false; 1297 } 1298 1299 if (match && better_md5_match(best_match, key)) 1300 best_match = key; 1301 } 1302 return best_match; 1303 } 1304 EXPORT_IPV6_MOD(__tcp_md5_do_lookup); 1305 1306 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk, 1307 const union tcp_md5_addr *addr, 1308 int family, u8 prefixlen, 1309 int l3index, u8 flags) 1310 { 1311 const struct tcp_sock *tp = tcp_sk(sk); 1312 struct tcp_md5sig_key *key; 1313 unsigned int size = sizeof(struct in_addr); 1314 const struct tcp_md5sig_info *md5sig; 1315 1316 /* caller either holds rcu_read_lock() or socket lock */ 1317 md5sig = rcu_dereference_check(tp->md5sig_info, 1318 
lockdep_sock_is_held(sk)); 1319 if (!md5sig) 1320 return NULL; 1321 #if IS_ENABLED(CONFIG_IPV6) 1322 if (family == AF_INET6) 1323 size = sizeof(struct in6_addr); 1324 #endif 1325 hlist_for_each_entry_rcu(key, &md5sig->head, node, 1326 lockdep_sock_is_held(sk)) { 1327 if (key->family != family) 1328 continue; 1329 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX)) 1330 continue; 1331 if (key->l3index != l3index) 1332 continue; 1333 if (!memcmp(&key->addr, addr, size) && 1334 key->prefixlen == prefixlen) 1335 return key; 1336 } 1337 return NULL; 1338 } 1339 1340 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, 1341 const struct sock *addr_sk) 1342 { 1343 const union tcp_md5_addr *addr; 1344 int l3index; 1345 1346 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), 1347 addr_sk->sk_bound_dev_if); 1348 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr; 1349 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1350 } 1351 EXPORT_IPV6_MOD(tcp_v4_md5_lookup); 1352 1353 static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp) 1354 { 1355 struct tcp_sock *tp = tcp_sk(sk); 1356 struct tcp_md5sig_info *md5sig; 1357 1358 md5sig = kmalloc(sizeof(*md5sig), gfp); 1359 if (!md5sig) 1360 return -ENOMEM; 1361 1362 sk_gso_disable(sk); 1363 INIT_HLIST_HEAD(&md5sig->head); 1364 rcu_assign_pointer(tp->md5sig_info, md5sig); 1365 return 0; 1366 } 1367 1368 /* This can be called on a newly created socket, from other files */ 1369 static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, 1370 int family, u8 prefixlen, int l3index, u8 flags, 1371 const u8 *newkey, u8 newkeylen, gfp_t gfp) 1372 { 1373 /* Add Key to the list */ 1374 struct tcp_md5sig_key *key; 1375 struct tcp_sock *tp = tcp_sk(sk); 1376 struct tcp_md5sig_info *md5sig; 1377 1378 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags); 1379 if (key) { 1380 /* Pre-existing entry - just update that one. 1381 * Note that the key might be used concurrently. 1382 * data_race() is telling kcsan that we do not care of 1383 * key mismatches, since changing MD5 key on live flows 1384 * can lead to packet drops. 1385 */ 1386 data_race(memcpy(key->key, newkey, newkeylen)); 1387 1388 /* Pairs with READ_ONCE() in tcp_md5_hash_key(). 1389 * Also note that a reader could catch new key->keylen value 1390 * but old key->key[], this is the reason we use __GFP_ZERO 1391 * at sock_kmalloc() time below these lines. 1392 */ 1393 WRITE_ONCE(key->keylen, newkeylen); 1394 1395 return 0; 1396 } 1397 1398 md5sig = rcu_dereference_protected(tp->md5sig_info, 1399 lockdep_sock_is_held(sk)); 1400 1401 key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO); 1402 if (!key) 1403 return -ENOMEM; 1404 1405 memcpy(key->key, newkey, newkeylen); 1406 key->keylen = newkeylen; 1407 key->family = family; 1408 key->prefixlen = prefixlen; 1409 key->l3index = l3index; 1410 key->flags = flags; 1411 memcpy(&key->addr, addr, 1412 (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? 
sizeof(struct in6_addr) : 1413 sizeof(struct in_addr)); 1414 hlist_add_head_rcu(&key->node, &md5sig->head); 1415 return 0; 1416 } 1417 1418 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, 1419 int family, u8 prefixlen, int l3index, u8 flags, 1420 const u8 *newkey, u8 newkeylen) 1421 { 1422 struct tcp_sock *tp = tcp_sk(sk); 1423 1424 if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) { 1425 if (tcp_md5_alloc_sigpool()) 1426 return -ENOMEM; 1427 1428 if (tcp_md5sig_info_add(sk, GFP_KERNEL)) { 1429 tcp_md5_release_sigpool(); 1430 return -ENOMEM; 1431 } 1432 1433 if (!static_branch_inc(&tcp_md5_needed.key)) { 1434 struct tcp_md5sig_info *md5sig; 1435 1436 md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk)); 1437 rcu_assign_pointer(tp->md5sig_info, NULL); 1438 kfree_rcu(md5sig, rcu); 1439 tcp_md5_release_sigpool(); 1440 return -EUSERS; 1441 } 1442 } 1443 1444 return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags, 1445 newkey, newkeylen, GFP_KERNEL); 1446 } 1447 EXPORT_IPV6_MOD(tcp_md5_do_add); 1448 1449 int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr, 1450 int family, u8 prefixlen, int l3index, 1451 struct tcp_md5sig_key *key) 1452 { 1453 struct tcp_sock *tp = tcp_sk(sk); 1454 1455 if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) { 1456 tcp_md5_add_sigpool(); 1457 1458 if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC))) { 1459 tcp_md5_release_sigpool(); 1460 return -ENOMEM; 1461 } 1462 1463 if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) { 1464 struct tcp_md5sig_info *md5sig; 1465 1466 md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk)); 1467 net_warn_ratelimited("Too many TCP-MD5 keys in the system\n"); 1468 rcu_assign_pointer(tp->md5sig_info, NULL); 1469 kfree_rcu(md5sig, rcu); 1470 tcp_md5_release_sigpool(); 1471 return -EUSERS; 1472 } 1473 } 1474 1475 return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, 1476 key->flags, key->key, key->keylen, 1477 sk_gfp_mask(sk, GFP_ATOMIC)); 1478 } 1479 EXPORT_IPV6_MOD(tcp_md5_key_copy); 1480 1481 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family, 1482 u8 prefixlen, int l3index, u8 flags) 1483 { 1484 struct tcp_md5sig_key *key; 1485 1486 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags); 1487 if (!key) 1488 return -ENOENT; 1489 hlist_del_rcu(&key->node); 1490 atomic_sub(sizeof(*key), &sk->sk_omem_alloc); 1491 kfree_rcu(key, rcu); 1492 return 0; 1493 } 1494 EXPORT_IPV6_MOD(tcp_md5_do_del); 1495 1496 void tcp_clear_md5_list(struct sock *sk) 1497 { 1498 struct tcp_sock *tp = tcp_sk(sk); 1499 struct tcp_md5sig_key *key; 1500 struct hlist_node *n; 1501 struct tcp_md5sig_info *md5sig; 1502 1503 md5sig = rcu_dereference_protected(tp->md5sig_info, 1); 1504 1505 hlist_for_each_entry_safe(key, n, &md5sig->head, node) { 1506 hlist_del(&key->node); 1507 atomic_sub(sizeof(*key), &sk->sk_omem_alloc); 1508 kfree(key); 1509 } 1510 } 1511 1512 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, 1513 sockptr_t optval, int optlen) 1514 { 1515 struct tcp_md5sig cmd; 1516 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr; 1517 const union tcp_md5_addr *addr; 1518 u8 prefixlen = 32; 1519 int l3index = 0; 1520 bool l3flag; 1521 u8 flags; 1522 1523 if (optlen < sizeof(cmd)) 1524 return -EINVAL; 1525 1526 if (copy_from_sockptr(&cmd, optval, sizeof(cmd))) 1527 return -EFAULT; 1528 1529 if (sin->sin_family != AF_INET) 
1530 return -EINVAL; 1531 1532 flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX; 1533 l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX; 1534 1535 if (optname == TCP_MD5SIG_EXT && 1536 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) { 1537 prefixlen = cmd.tcpm_prefixlen; 1538 if (prefixlen > 32) 1539 return -EINVAL; 1540 } 1541 1542 if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex && 1543 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) { 1544 struct net_device *dev; 1545 1546 rcu_read_lock(); 1547 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex); 1548 if (dev && netif_is_l3_master(dev)) 1549 l3index = dev->ifindex; 1550 1551 rcu_read_unlock(); 1552 1553 /* ok to reference set/not set outside of rcu; 1554 * right now device MUST be an L3 master 1555 */ 1556 if (!dev || !l3index) 1557 return -EINVAL; 1558 } 1559 1560 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr; 1561 1562 if (!cmd.tcpm_keylen) 1563 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags); 1564 1565 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) 1566 return -EINVAL; 1567 1568 /* Don't allow keys for peers that have a matching TCP-AO key. 1569 * See the comment in tcp_ao_add_cmd() 1570 */ 1571 if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false)) 1572 return -EKEYREJECTED; 1573 1574 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags, 1575 cmd.tcpm_key, cmd.tcpm_keylen); 1576 } 1577 1578 static int tcp_v4_md5_hash_headers(struct tcp_sigpool *hp, 1579 __be32 daddr, __be32 saddr, 1580 const struct tcphdr *th, int nbytes) 1581 { 1582 struct tcp4_pseudohdr *bp; 1583 struct scatterlist sg; 1584 struct tcphdr *_th; 1585 1586 bp = hp->scratch; 1587 bp->saddr = saddr; 1588 bp->daddr = daddr; 1589 bp->pad = 0; 1590 bp->protocol = IPPROTO_TCP; 1591 bp->len = cpu_to_be16(nbytes); 1592 1593 _th = (struct tcphdr *)(bp + 1); 1594 memcpy(_th, th, sizeof(*th)); 1595 _th->check = 0; 1596 1597 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th)); 1598 ahash_request_set_crypt(hp->req, &sg, NULL, 1599 sizeof(*bp) + sizeof(*th)); 1600 return crypto_ahash_update(hp->req); 1601 } 1602 1603 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, 1604 __be32 daddr, __be32 saddr, const struct tcphdr *th) 1605 { 1606 struct tcp_sigpool hp; 1607 1608 if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp)) 1609 goto clear_hash_nostart; 1610 1611 if (crypto_ahash_init(hp.req)) 1612 goto clear_hash; 1613 if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, th->doff << 2)) 1614 goto clear_hash; 1615 if (tcp_md5_hash_key(&hp, key)) 1616 goto clear_hash; 1617 ahash_request_set_crypt(hp.req, NULL, md5_hash, 0); 1618 if (crypto_ahash_final(hp.req)) 1619 goto clear_hash; 1620 1621 tcp_sigpool_end(&hp); 1622 return 0; 1623 1624 clear_hash: 1625 tcp_sigpool_end(&hp); 1626 clear_hash_nostart: 1627 memset(md5_hash, 0, 16); 1628 return 1; 1629 } 1630 1631 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, 1632 const struct sock *sk, 1633 const struct sk_buff *skb) 1634 { 1635 const struct tcphdr *th = tcp_hdr(skb); 1636 struct tcp_sigpool hp; 1637 __be32 saddr, daddr; 1638 1639 if (sk) { /* valid for establish/request sockets */ 1640 saddr = sk->sk_rcv_saddr; 1641 daddr = sk->sk_daddr; 1642 } else { 1643 const struct iphdr *iph = ip_hdr(skb); 1644 saddr = iph->saddr; 1645 daddr = iph->daddr; 1646 } 1647 1648 if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp)) 1649 goto clear_hash_nostart; 1650 1651 if (crypto_ahash_init(hp.req)) 1652 goto clear_hash; 1653 1654 if (tcp_v4_md5_hash_headers(&hp, 
daddr, saddr, th, skb->len)) 1655 goto clear_hash; 1656 if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2)) 1657 goto clear_hash; 1658 if (tcp_md5_hash_key(&hp, key)) 1659 goto clear_hash; 1660 ahash_request_set_crypt(hp.req, NULL, md5_hash, 0); 1661 if (crypto_ahash_final(hp.req)) 1662 goto clear_hash; 1663 1664 tcp_sigpool_end(&hp); 1665 return 0; 1666 1667 clear_hash: 1668 tcp_sigpool_end(&hp); 1669 clear_hash_nostart: 1670 memset(md5_hash, 0, 16); 1671 return 1; 1672 } 1673 EXPORT_IPV6_MOD(tcp_v4_md5_hash_skb); 1674 1675 #endif 1676 1677 static void tcp_v4_init_req(struct request_sock *req, 1678 const struct sock *sk_listener, 1679 struct sk_buff *skb) 1680 { 1681 struct inet_request_sock *ireq = inet_rsk(req); 1682 struct net *net = sock_net(sk_listener); 1683 1684 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); 1685 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); 1686 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb)); 1687 } 1688 1689 static struct dst_entry *tcp_v4_route_req(const struct sock *sk, 1690 struct sk_buff *skb, 1691 struct flowi *fl, 1692 struct request_sock *req, 1693 u32 tw_isn) 1694 { 1695 tcp_v4_init_req(req, sk, skb); 1696 1697 if (security_inet_conn_request(sk, skb, req)) 1698 return NULL; 1699 1700 return inet_csk_route_req(sk, &fl->u.ip4, req); 1701 } 1702 1703 struct request_sock_ops tcp_request_sock_ops __read_mostly = { 1704 .family = PF_INET, 1705 .obj_size = sizeof(struct tcp_request_sock), 1706 .send_ack = tcp_v4_reqsk_send_ack, 1707 .destructor = tcp_v4_reqsk_destructor, 1708 .send_reset = tcp_v4_send_reset, 1709 .syn_ack_timeout = tcp_syn_ack_timeout, 1710 }; 1711 1712 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { 1713 .mss_clamp = TCP_MSS_DEFAULT, 1714 #ifdef CONFIG_TCP_MD5SIG 1715 .req_md5_lookup = tcp_v4_md5_lookup, 1716 .calc_md5_hash = tcp_v4_md5_hash_skb, 1717 #endif 1718 #ifdef CONFIG_TCP_AO 1719 .ao_lookup = tcp_v4_ao_lookup_rsk, 1720 .ao_calc_key = tcp_v4_ao_calc_key_rsk, 1721 .ao_synack_hash = tcp_v4_ao_synack_hash, 1722 #endif 1723 #ifdef CONFIG_SYN_COOKIES 1724 .cookie_init_seq = cookie_v4_init_sequence, 1725 #endif 1726 .route_req = tcp_v4_route_req, 1727 .init_seq = tcp_v4_init_seq, 1728 .init_ts_off = tcp_v4_init_ts_off, 1729 .send_synack = tcp_v4_send_synack, 1730 }; 1731 1732 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) 1733 { 1734 /* Never answer to SYNs send to broadcast or multicast */ 1735 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) 1736 goto drop; 1737 1738 return tcp_conn_request(&tcp_request_sock_ops, 1739 &tcp_request_sock_ipv4_ops, sk, skb); 1740 1741 drop: 1742 tcp_listendrop(sk); 1743 return 0; 1744 } 1745 EXPORT_IPV6_MOD(tcp_v4_conn_request); 1746 1747 1748 /* 1749 * The three way handshake has completed - we got a valid synack - 1750 * now create the new socket. 
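 * The child socket inherits the IP options, optionally the TOS of the
 * SYN (sysctl_tcp_reflect_tos) and any matching TCP-MD5/TCP-AO keys
 * from the listener, gets its own route and MSS clamp, and is inserted
 * into the established hash table; on overflow or allocation failure
 * the drop is accounted via tcp_listendrop().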
1751 */ 1752 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, 1753 struct request_sock *req, 1754 struct dst_entry *dst, 1755 struct request_sock *req_unhash, 1756 bool *own_req) 1757 { 1758 struct inet_request_sock *ireq; 1759 bool found_dup_sk = false; 1760 struct inet_sock *newinet; 1761 struct tcp_sock *newtp; 1762 struct sock *newsk; 1763 #ifdef CONFIG_TCP_MD5SIG 1764 const union tcp_md5_addr *addr; 1765 struct tcp_md5sig_key *key; 1766 int l3index; 1767 #endif 1768 struct ip_options_rcu *inet_opt; 1769 1770 if (sk_acceptq_is_full(sk)) 1771 goto exit_overflow; 1772 1773 newsk = tcp_create_openreq_child(sk, req, skb); 1774 if (!newsk) 1775 goto exit_nonewsk; 1776 1777 newsk->sk_gso_type = SKB_GSO_TCPV4; 1778 inet_sk_rx_dst_set(newsk, skb); 1779 1780 newtp = tcp_sk(newsk); 1781 newinet = inet_sk(newsk); 1782 ireq = inet_rsk(req); 1783 inet_opt = rcu_dereference(ireq->ireq_opt); 1784 RCU_INIT_POINTER(newinet->inet_opt, inet_opt); 1785 newinet->mc_index = inet_iif(skb); 1786 newinet->mc_ttl = ip_hdr(skb)->ttl; 1787 newinet->rcv_tos = ip_hdr(skb)->tos; 1788 inet_csk(newsk)->icsk_ext_hdr_len = 0; 1789 if (inet_opt) 1790 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 1791 atomic_set(&newinet->inet_id, get_random_u16()); 1792 1793 /* Set ToS of the new socket based upon the value of incoming SYN. 1794 * ECT bits are set later in tcp_init_transfer(). 1795 */ 1796 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)) 1797 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK; 1798 1799 if (!dst) { 1800 dst = inet_csk_route_child_sock(sk, newsk, req); 1801 if (!dst) 1802 goto put_and_exit; 1803 } else { 1804 /* syncookie case : see end of cookie_v4_check() */ 1805 } 1806 sk_setup_caps(newsk, dst); 1807 1808 tcp_ca_openreq_child(newsk, dst); 1809 1810 tcp_sync_mss(newsk, dst_mtu(dst)); 1811 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst)); 1812 1813 tcp_initialize_rcv_mss(newsk); 1814 1815 #ifdef CONFIG_TCP_MD5SIG 1816 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif); 1817 /* Copy over the MD5 key from the original socket */ 1818 addr = (union tcp_md5_addr *)&newinet->inet_daddr; 1819 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1820 if (key && !tcp_rsk_used_ao(req)) { 1821 if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key)) 1822 goto put_and_exit; 1823 sk_gso_disable(newsk); 1824 } 1825 #endif 1826 #ifdef CONFIG_TCP_AO 1827 if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET)) 1828 goto put_and_exit; /* OOM, release back memory */ 1829 #endif 1830 1831 if (__inet_inherit_port(sk, newsk) < 0) 1832 goto put_and_exit; 1833 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), 1834 &found_dup_sk); 1835 if (likely(*own_req)) { 1836 tcp_move_syn(newtp, req); 1837 ireq->ireq_opt = NULL; 1838 } else { 1839 newinet->inet_opt = NULL; 1840 1841 if (!req_unhash && found_dup_sk) { 1842 /* This code path should only be executed in the 1843 * syncookie case only 1844 */ 1845 bh_unlock_sock(newsk); 1846 sock_put(newsk); 1847 newsk = NULL; 1848 } 1849 } 1850 return newsk; 1851 1852 exit_overflow: 1853 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); 1854 exit_nonewsk: 1855 dst_release(dst); 1856 exit: 1857 tcp_listendrop(sk); 1858 return NULL; 1859 put_and_exit: 1860 newinet->inet_opt = NULL; 1861 inet_csk_prepare_forced_close(newsk); 1862 tcp_done(newsk); 1863 goto exit; 1864 } 1865 EXPORT_IPV6_MOD(tcp_v4_syn_recv_sock); 1866 1867 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct 
						 sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	const struct tcphdr *th = tcp_hdr(skb);

	if (!th->syn)
		sk = cookie_v4_check(sk, skb);
#endif
	return sk;
}

u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
			 struct tcphdr *th, u32 *cookie)
{
	u16 mss = 0;
#ifdef CONFIG_SYN_COOKIES
	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
				    &tcp_request_sock_ipv4_ops, sk, th);
	if (mss) {
		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
		tcp_synq_overflow(sk);
	}
#endif
	return mss;
}

INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
							   u32));
/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	enum skb_drop_reason reason;
	struct sock *rsk;

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst;

		dst = rcu_dereference_protected(sk->sk_rx_dst,
						lockdep_sock_is_held(sk));

		sock_rps_save_rxhash(sk, skb);
		sk_mark_napi_id(sk, skb);
		if (dst) {
			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
					     dst, 0)) {
				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
				dst_release(dst);
			}
		}
		tcp_rcv_established(sk, skb);
		return 0;
	}

	if (tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_cookie_check(sk, skb);

		if (!nsk)
			return 0;
		if (nsk != sk) {
			reason = tcp_child_process(sk, nsk, skb);
			if (reason) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	reason = tcp_rcv_state_process(sk, skb);
	if (reason) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason));
discard:
	sk_skb_reason_drop(sk, skb, reason);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
1962 */ 1963 return 0; 1964 1965 csum_err: 1966 reason = SKB_DROP_REASON_TCP_CSUM; 1967 trace_tcp_bad_csum(skb); 1968 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 1969 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 1970 goto discard; 1971 } 1972 EXPORT_SYMBOL(tcp_v4_do_rcv); 1973 1974 int tcp_v4_early_demux(struct sk_buff *skb) 1975 { 1976 struct net *net = dev_net_rcu(skb->dev); 1977 const struct iphdr *iph; 1978 const struct tcphdr *th; 1979 struct sock *sk; 1980 1981 if (skb->pkt_type != PACKET_HOST) 1982 return 0; 1983 1984 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) 1985 return 0; 1986 1987 iph = ip_hdr(skb); 1988 th = tcp_hdr(skb); 1989 1990 if (th->doff < sizeof(struct tcphdr) / 4) 1991 return 0; 1992 1993 sk = __inet_lookup_established(net, iph->saddr, th->source, 1994 iph->daddr, ntohs(th->dest), 1995 skb->skb_iif, inet_sdif(skb)); 1996 if (sk) { 1997 skb->sk = sk; 1998 skb->destructor = sock_edemux; 1999 if (sk_fullsock(sk)) { 2000 struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst); 2001 2002 if (dst) 2003 dst = dst_check(dst, 0); 2004 if (dst && 2005 sk->sk_rx_dst_ifindex == skb->skb_iif) 2006 skb_dst_set_noref(skb, dst); 2007 } 2008 } 2009 return 0; 2010 } 2011 2012 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, 2013 enum skb_drop_reason *reason) 2014 { 2015 u32 tail_gso_size, tail_gso_segs; 2016 struct skb_shared_info *shinfo; 2017 const struct tcphdr *th; 2018 struct tcphdr *thtail; 2019 struct sk_buff *tail; 2020 unsigned int hdrlen; 2021 bool fragstolen; 2022 u32 gso_segs; 2023 u32 gso_size; 2024 u64 limit; 2025 int delta; 2026 int err; 2027 2028 /* In case all data was pulled from skb frags (in __pskb_pull_tail()), 2029 * we can fix skb->truesize to its real value to avoid future drops. 2030 * This is valid because skb is not yet charged to the socket. 2031 * It has been noticed pure SACK packets were sometimes dropped 2032 * (if cooked by drivers without copybreak feature). 2033 */ 2034 skb_condense(skb); 2035 2036 tcp_cleanup_skb(skb); 2037 2038 if (unlikely(tcp_checksum_complete(skb))) { 2039 bh_unlock_sock(sk); 2040 trace_tcp_bad_csum(skb); 2041 *reason = SKB_DROP_REASON_TCP_CSUM; 2042 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 2043 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 2044 return true; 2045 } 2046 2047 /* Attempt coalescing to last skb in backlog, even if we are 2048 * above the limits. 2049 * This is okay because skb capacity is limited to MAX_SKB_FRAGS. 
2050 */ 2051 th = (const struct tcphdr *)skb->data; 2052 hdrlen = th->doff * 4; 2053 2054 tail = sk->sk_backlog.tail; 2055 if (!tail) 2056 goto no_coalesce; 2057 thtail = (struct tcphdr *)tail->data; 2058 2059 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq || 2060 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield || 2061 ((TCP_SKB_CB(tail)->tcp_flags | 2062 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) || 2063 !((TCP_SKB_CB(tail)->tcp_flags & 2064 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) || 2065 ((TCP_SKB_CB(tail)->tcp_flags ^ 2066 TCP_SKB_CB(skb)->tcp_flags) & 2067 (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE)) || 2068 !tcp_skb_can_collapse_rx(tail, skb) || 2069 thtail->doff != th->doff || 2070 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th))) 2071 goto no_coalesce; 2072 2073 __skb_pull(skb, hdrlen); 2074 2075 shinfo = skb_shinfo(skb); 2076 gso_size = shinfo->gso_size ?: skb->len; 2077 gso_segs = shinfo->gso_segs ?: 1; 2078 2079 shinfo = skb_shinfo(tail); 2080 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen); 2081 tail_gso_segs = shinfo->gso_segs ?: 1; 2082 2083 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) { 2084 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq; 2085 2086 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) { 2087 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq; 2088 thtail->window = th->window; 2089 } 2090 2091 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and 2092 * thtail->fin, so that the fast path in tcp_rcv_established() 2093 * is not entered if we append a packet with a FIN. 2094 * SYN, RST, URG are not present. 2095 * ACK is set on both packets. 2096 * PSH : we do not really care in TCP stack, 2097 * at least for 'GRO' packets. 2098 */ 2099 thtail->fin |= th->fin; 2100 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; 2101 2102 if (TCP_SKB_CB(skb)->has_rxtstamp) { 2103 TCP_SKB_CB(tail)->has_rxtstamp = true; 2104 tail->tstamp = skb->tstamp; 2105 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp; 2106 } 2107 2108 /* Not as strict as GRO. We only need to carry mss max value */ 2109 shinfo->gso_size = max(gso_size, tail_gso_size); 2110 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF); 2111 2112 sk->sk_backlog.len += delta; 2113 __NET_INC_STATS(sock_net(sk), 2114 LINUX_MIB_TCPBACKLOGCOALESCE); 2115 kfree_skb_partial(skb, fragstolen); 2116 return false; 2117 } 2118 __skb_push(skb, hdrlen); 2119 2120 no_coalesce: 2121 /* sk->sk_backlog.len is reset only at the end of __release_sock(). 2122 * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach 2123 * sk_rcvbuf in normal conditions. 2124 */ 2125 limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1; 2126 2127 limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1; 2128 2129 /* Only socket owner can try to collapse/prune rx queues 2130 * to reduce memory overhead, so add a little headroom here. 2131 * Few sockets backlog are possibly concurrently non empty. 
2132 */ 2133 limit += 64 * 1024; 2134 2135 limit = min_t(u64, limit, UINT_MAX); 2136 2137 err = sk_add_backlog(sk, skb, limit); 2138 if (unlikely(err)) { 2139 bh_unlock_sock(sk); 2140 if (err == -ENOMEM) { 2141 *reason = SKB_DROP_REASON_PFMEMALLOC; 2142 __NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP); 2143 } else { 2144 *reason = SKB_DROP_REASON_SOCKET_BACKLOG; 2145 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP); 2146 } 2147 return true; 2148 } 2149 return false; 2150 } 2151 EXPORT_IPV6_MOD(tcp_add_backlog); 2152 2153 int tcp_filter(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason *reason) 2154 { 2155 struct tcphdr *th = (struct tcphdr *)skb->data; 2156 2157 return sk_filter_trim_cap(sk, skb, th->doff * 4, reason); 2158 } 2159 EXPORT_IPV6_MOD(tcp_filter); 2160 2161 static void tcp_v4_restore_cb(struct sk_buff *skb) 2162 { 2163 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4, 2164 sizeof(struct inet_skb_parm)); 2165 } 2166 2167 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, 2168 const struct tcphdr *th) 2169 { 2170 /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB() 2171 * barrier() makes sure compiler wont play fool^Waliasing games. 2172 */ 2173 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb), 2174 sizeof(struct inet_skb_parm)); 2175 barrier(); 2176 2177 TCP_SKB_CB(skb)->seq = ntohl(th->seq); 2178 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + 2179 skb->len - th->doff * 4); 2180 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); 2181 TCP_SKB_CB(skb)->tcp_flags = tcp_flags_ntohs(th); 2182 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); 2183 TCP_SKB_CB(skb)->sacked = 0; 2184 TCP_SKB_CB(skb)->has_rxtstamp = 2185 skb->tstamp || skb_hwtstamps(skb)->hwtstamp; 2186 } 2187 2188 /* 2189 * From tcp_input.c 2190 */ 2191 2192 int tcp_v4_rcv(struct sk_buff *skb) 2193 { 2194 struct net *net = dev_net_rcu(skb->dev); 2195 enum skb_drop_reason drop_reason; 2196 enum tcp_tw_status tw_status; 2197 int sdif = inet_sdif(skb); 2198 int dif = inet_iif(skb); 2199 const struct iphdr *iph; 2200 const struct tcphdr *th; 2201 struct sock *sk = NULL; 2202 bool refcounted; 2203 int ret; 2204 u32 isn; 2205 2206 drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; 2207 if (skb->pkt_type != PACKET_HOST) 2208 goto discard_it; 2209 2210 /* Count it even if it's bad */ 2211 __TCP_INC_STATS(net, TCP_MIB_INSEGS); 2212 2213 if (!pskb_may_pull(skb, sizeof(struct tcphdr))) 2214 goto discard_it; 2215 2216 th = (const struct tcphdr *)skb->data; 2217 2218 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) { 2219 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL; 2220 goto bad_packet; 2221 } 2222 if (!pskb_may_pull(skb, th->doff * 4)) 2223 goto discard_it; 2224 2225 /* An explanation is required here, I think. 2226 * Packet length and doff are validated by header prediction, 2227 * provided case of th->doff==0 is eliminated. 2228 * So, we defer the checks. 
*/ 2229 2230 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo)) 2231 goto csum_error; 2232 2233 th = (const struct tcphdr *)skb->data; 2234 iph = ip_hdr(skb); 2235 lookup: 2236 sk = __inet_lookup_skb(skb, __tcp_hdrlen(th), th->source, 2237 th->dest, sdif, &refcounted); 2238 if (!sk) 2239 goto no_tcp_socket; 2240 2241 if (sk->sk_state == TCP_TIME_WAIT) 2242 goto do_time_wait; 2243 2244 if (sk->sk_state == TCP_NEW_SYN_RECV) { 2245 struct request_sock *req = inet_reqsk(sk); 2246 bool req_stolen = false; 2247 struct sock *nsk; 2248 2249 sk = req->rsk_listener; 2250 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) 2251 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2252 else 2253 drop_reason = tcp_inbound_hash(sk, req, skb, 2254 &iph->saddr, &iph->daddr, 2255 AF_INET, dif, sdif); 2256 if (unlikely(drop_reason)) { 2257 sk_drops_skbadd(sk, skb); 2258 reqsk_put(req); 2259 goto discard_it; 2260 } 2261 if (tcp_checksum_complete(skb)) { 2262 reqsk_put(req); 2263 goto csum_error; 2264 } 2265 if (unlikely(sk->sk_state != TCP_LISTEN)) { 2266 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb); 2267 if (!nsk) { 2268 inet_csk_reqsk_queue_drop_and_put(sk, req); 2269 goto lookup; 2270 } 2271 sk = nsk; 2272 /* reuseport_migrate_sock() has already held one sk_refcnt 2273 * before returning. 2274 */ 2275 } else { 2276 /* We own a reference on the listener, increase it again 2277 * as we might lose it too soon. 2278 */ 2279 sock_hold(sk); 2280 } 2281 refcounted = true; 2282 nsk = NULL; 2283 if (!tcp_filter(sk, skb, &drop_reason)) { 2284 th = (const struct tcphdr *)skb->data; 2285 iph = ip_hdr(skb); 2286 tcp_v4_fill_cb(skb, iph, th); 2287 nsk = tcp_check_req(sk, skb, req, false, &req_stolen, 2288 &drop_reason); 2289 } 2290 if (!nsk) { 2291 reqsk_put(req); 2292 if (req_stolen) { 2293 /* Another cpu got exclusive access to req 2294 * and created a full blown socket. 2295 * Try to feed this packet to this socket 2296 * instead of discarding it. 
2297 */ 2298 tcp_v4_restore_cb(skb); 2299 sock_put(sk); 2300 goto lookup; 2301 } 2302 goto discard_and_relse; 2303 } 2304 nf_reset_ct(skb); 2305 if (nsk == sk) { 2306 reqsk_put(req); 2307 tcp_v4_restore_cb(skb); 2308 } else { 2309 drop_reason = tcp_child_process(sk, nsk, skb); 2310 if (drop_reason) { 2311 enum sk_rst_reason rst_reason; 2312 2313 rst_reason = sk_rst_convert_drop_reason(drop_reason); 2314 tcp_v4_send_reset(nsk, skb, rst_reason); 2315 goto discard_and_relse; 2316 } 2317 sock_put(sk); 2318 return 0; 2319 } 2320 } 2321 2322 process: 2323 if (static_branch_unlikely(&ip4_min_ttl)) { 2324 /* min_ttl can be changed concurrently from do_ip_setsockopt() */ 2325 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) { 2326 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 2327 drop_reason = SKB_DROP_REASON_TCP_MINTTL; 2328 goto discard_and_relse; 2329 } 2330 } 2331 2332 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { 2333 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2334 goto discard_and_relse; 2335 } 2336 2337 drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr, 2338 AF_INET, dif, sdif); 2339 if (drop_reason) 2340 goto discard_and_relse; 2341 2342 nf_reset_ct(skb); 2343 2344 if (tcp_filter(sk, skb, &drop_reason)) 2345 goto discard_and_relse; 2346 2347 th = (const struct tcphdr *)skb->data; 2348 iph = ip_hdr(skb); 2349 tcp_v4_fill_cb(skb, iph, th); 2350 2351 skb->dev = NULL; 2352 2353 if (sk->sk_state == TCP_LISTEN) { 2354 ret = tcp_v4_do_rcv(sk, skb); 2355 goto put_and_return; 2356 } 2357 2358 sk_incoming_cpu_update(sk); 2359 2360 bh_lock_sock_nested(sk); 2361 tcp_segs_in(tcp_sk(sk), skb); 2362 ret = 0; 2363 if (!sock_owned_by_user(sk)) { 2364 ret = tcp_v4_do_rcv(sk, skb); 2365 } else { 2366 if (tcp_add_backlog(sk, skb, &drop_reason)) 2367 goto discard_and_relse; 2368 } 2369 bh_unlock_sock(sk); 2370 2371 put_and_return: 2372 if (refcounted) 2373 sock_put(sk); 2374 2375 return ret; 2376 2377 no_tcp_socket: 2378 drop_reason = SKB_DROP_REASON_NO_SOCKET; 2379 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) 2380 goto discard_it; 2381 2382 tcp_v4_fill_cb(skb, iph, th); 2383 2384 if (tcp_checksum_complete(skb)) { 2385 csum_error: 2386 drop_reason = SKB_DROP_REASON_TCP_CSUM; 2387 trace_tcp_bad_csum(skb); 2388 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); 2389 bad_packet: 2390 __TCP_INC_STATS(net, TCP_MIB_INERRS); 2391 } else { 2392 tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason)); 2393 } 2394 2395 discard_it: 2396 SKB_DR_OR(drop_reason, NOT_SPECIFIED); 2397 /* Discard frame. 
*/ 2398 sk_skb_reason_drop(sk, skb, drop_reason); 2399 return 0; 2400 2401 discard_and_relse: 2402 sk_drops_skbadd(sk, skb); 2403 if (refcounted) 2404 sock_put(sk); 2405 goto discard_it; 2406 2407 do_time_wait: 2408 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { 2409 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2410 inet_twsk_put(inet_twsk(sk)); 2411 goto discard_it; 2412 } 2413 2414 tcp_v4_fill_cb(skb, iph, th); 2415 2416 if (tcp_checksum_complete(skb)) { 2417 inet_twsk_put(inet_twsk(sk)); 2418 goto csum_error; 2419 } 2420 2421 tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn, 2422 &drop_reason); 2423 switch (tw_status) { 2424 case TCP_TW_SYN: { 2425 struct sock *sk2 = inet_lookup_listener(net, skb, __tcp_hdrlen(th), 2426 iph->saddr, th->source, 2427 iph->daddr, th->dest, 2428 inet_iif(skb), 2429 sdif); 2430 if (sk2) { 2431 inet_twsk_deschedule_put(inet_twsk(sk)); 2432 sk = sk2; 2433 tcp_v4_restore_cb(skb); 2434 refcounted = false; 2435 __this_cpu_write(tcp_tw_isn, isn); 2436 goto process; 2437 } 2438 } 2439 /* to ACK */ 2440 fallthrough; 2441 case TCP_TW_ACK: 2442 case TCP_TW_ACK_OOW: 2443 tcp_v4_timewait_ack(sk, skb, tw_status); 2444 break; 2445 case TCP_TW_RST: 2446 tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET); 2447 inet_twsk_deschedule_put(inet_twsk(sk)); 2448 goto discard_it; 2449 case TCP_TW_SUCCESS:; 2450 } 2451 goto discard_it; 2452 } 2453 2454 static struct timewait_sock_ops tcp_timewait_sock_ops = { 2455 .twsk_obj_size = sizeof(struct tcp_timewait_sock), 2456 }; 2457 2458 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) 2459 { 2460 struct dst_entry *dst = skb_dst(skb); 2461 2462 if (dst && dst_hold_safe(dst)) { 2463 rcu_assign_pointer(sk->sk_rx_dst, dst); 2464 sk->sk_rx_dst_ifindex = skb->skb_iif; 2465 } 2466 } 2467 EXPORT_IPV6_MOD(inet_sk_rx_dst_set); 2468 2469 const struct inet_connection_sock_af_ops ipv4_specific = { 2470 .queue_xmit = ip_queue_xmit, 2471 .send_check = tcp_v4_send_check, 2472 .rebuild_header = inet_sk_rebuild_header, 2473 .sk_rx_dst_set = inet_sk_rx_dst_set, 2474 .conn_request = tcp_v4_conn_request, 2475 .syn_recv_sock = tcp_v4_syn_recv_sock, 2476 .net_header_len = sizeof(struct iphdr), 2477 .setsockopt = ip_setsockopt, 2478 .getsockopt = ip_getsockopt, 2479 .mtu_reduced = tcp_v4_mtu_reduced, 2480 }; 2481 EXPORT_IPV6_MOD(ipv4_specific); 2482 2483 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) 2484 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { 2485 #ifdef CONFIG_TCP_MD5SIG 2486 .md5_lookup = tcp_v4_md5_lookup, 2487 .calc_md5_hash = tcp_v4_md5_hash_skb, 2488 .md5_parse = tcp_v4_parse_md5_keys, 2489 #endif 2490 #ifdef CONFIG_TCP_AO 2491 .ao_lookup = tcp_v4_ao_lookup, 2492 .calc_ao_hash = tcp_v4_ao_hash_skb, 2493 .ao_parse = tcp_v4_parse_ao, 2494 .ao_calc_key_sk = tcp_v4_ao_calc_key_sk, 2495 #endif 2496 }; 2497 2498 static void tcp4_destruct_sock(struct sock *sk) 2499 { 2500 tcp_md5_destruct_sock(sk); 2501 tcp_ao_destroy_sock(sk, false); 2502 inet_sock_destruct(sk); 2503 } 2504 #endif 2505 2506 /* NOTE: A lot of things set to zero explicitly by call to 2507 * sk_alloc() so need not be done here. 
2508 */ 2509 static int tcp_v4_init_sock(struct sock *sk) 2510 { 2511 struct inet_connection_sock *icsk = inet_csk(sk); 2512 2513 tcp_init_sock(sk); 2514 2515 icsk->icsk_af_ops = &ipv4_specific; 2516 2517 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) 2518 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific; 2519 sk->sk_destruct = tcp4_destruct_sock; 2520 #endif 2521 2522 return 0; 2523 } 2524 2525 static void tcp_release_user_frags(struct sock *sk) 2526 { 2527 #ifdef CONFIG_PAGE_POOL 2528 unsigned long index; 2529 void *netmem; 2530 2531 xa_for_each(&sk->sk_user_frags, index, netmem) 2532 WARN_ON_ONCE(!napi_pp_put_page((__force netmem_ref)netmem)); 2533 #endif 2534 } 2535 2536 void tcp_v4_destroy_sock(struct sock *sk) 2537 { 2538 struct tcp_sock *tp = tcp_sk(sk); 2539 2540 tcp_release_user_frags(sk); 2541 2542 xa_destroy(&sk->sk_user_frags); 2543 2544 trace_tcp_destroy_sock(sk); 2545 2546 tcp_clear_xmit_timers(sk); 2547 2548 tcp_cleanup_congestion_control(sk); 2549 2550 tcp_cleanup_ulp(sk); 2551 2552 /* Cleanup up the write buffer. */ 2553 tcp_write_queue_purge(sk); 2554 2555 /* Check if we want to disable active TFO */ 2556 tcp_fastopen_active_disable_ofo_check(sk); 2557 2558 /* Cleans up our, hopefully empty, out_of_order_queue. */ 2559 skb_rbtree_purge(&tp->out_of_order_queue); 2560 2561 /* Clean up a referenced TCP bind bucket. */ 2562 if (inet_csk(sk)->icsk_bind_hash) 2563 inet_put_port(sk); 2564 2565 BUG_ON(rcu_access_pointer(tp->fastopen_rsk)); 2566 2567 /* If socket is aborted during connect operation */ 2568 tcp_free_fastopen_req(tp); 2569 tcp_fastopen_destroy_cipher(sk); 2570 tcp_saved_syn_free(tp); 2571 2572 sk_sockets_allocated_dec(sk); 2573 } 2574 EXPORT_IPV6_MOD(tcp_v4_destroy_sock); 2575 2576 #ifdef CONFIG_PROC_FS 2577 /* Proc filesystem TCP sock list dumping. */ 2578 2579 static unsigned short seq_file_family(const struct seq_file *seq); 2580 2581 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk) 2582 { 2583 unsigned short family = seq_file_family(seq); 2584 2585 /* AF_UNSPEC is used as a match all */ 2586 return ((family == AF_UNSPEC || family == sk->sk_family) && 2587 net_eq(sock_net(sk), seq_file_net(seq))); 2588 } 2589 2590 /* Find a non empty bucket (starting from st->bucket) 2591 * and return the first sk from it. 2592 */ 2593 static void *listening_get_first(struct seq_file *seq) 2594 { 2595 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2596 struct tcp_iter_state *st = seq->private; 2597 2598 st->offset = 0; 2599 for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) { 2600 struct inet_listen_hashbucket *ilb2; 2601 struct hlist_nulls_node *node; 2602 struct sock *sk; 2603 2604 ilb2 = &hinfo->lhash2[st->bucket]; 2605 if (hlist_nulls_empty(&ilb2->nulls_head)) 2606 continue; 2607 2608 spin_lock(&ilb2->lock); 2609 sk_nulls_for_each(sk, node, &ilb2->nulls_head) { 2610 if (seq_sk_match(seq, sk)) 2611 return sk; 2612 } 2613 spin_unlock(&ilb2->lock); 2614 } 2615 2616 return NULL; 2617 } 2618 2619 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket). 2620 * If "cur" is the last one in the st->bucket, 2621 * call listening_get_first() to return the first sk of the next 2622 * non empty bucket. 
2623 */ 2624 static void *listening_get_next(struct seq_file *seq, void *cur) 2625 { 2626 struct tcp_iter_state *st = seq->private; 2627 struct inet_listen_hashbucket *ilb2; 2628 struct hlist_nulls_node *node; 2629 struct inet_hashinfo *hinfo; 2630 struct sock *sk = cur; 2631 2632 ++st->num; 2633 ++st->offset; 2634 2635 sk = sk_nulls_next(sk); 2636 sk_nulls_for_each_from(sk, node) { 2637 if (seq_sk_match(seq, sk)) 2638 return sk; 2639 } 2640 2641 hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2642 ilb2 = &hinfo->lhash2[st->bucket]; 2643 spin_unlock(&ilb2->lock); 2644 ++st->bucket; 2645 return listening_get_first(seq); 2646 } 2647 2648 static void *listening_get_idx(struct seq_file *seq, loff_t *pos) 2649 { 2650 struct tcp_iter_state *st = seq->private; 2651 void *rc; 2652 2653 st->bucket = 0; 2654 st->offset = 0; 2655 rc = listening_get_first(seq); 2656 2657 while (rc && *pos) { 2658 rc = listening_get_next(seq, rc); 2659 --*pos; 2660 } 2661 return rc; 2662 } 2663 2664 static inline bool empty_bucket(struct inet_hashinfo *hinfo, 2665 const struct tcp_iter_state *st) 2666 { 2667 return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain); 2668 } 2669 2670 /* 2671 * Get first established socket starting from bucket given in st->bucket. 2672 * If st->bucket is zero, the very first socket in the hash is returned. 2673 */ 2674 static void *established_get_first(struct seq_file *seq) 2675 { 2676 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2677 struct tcp_iter_state *st = seq->private; 2678 2679 st->offset = 0; 2680 for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) { 2681 struct sock *sk; 2682 struct hlist_nulls_node *node; 2683 spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket); 2684 2685 cond_resched(); 2686 2687 /* Lockless fast path for the common case of empty buckets */ 2688 if (empty_bucket(hinfo, st)) 2689 continue; 2690 2691 spin_lock_bh(lock); 2692 sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) { 2693 if (seq_sk_match(seq, sk)) 2694 return sk; 2695 } 2696 spin_unlock_bh(lock); 2697 } 2698 2699 return NULL; 2700 } 2701 2702 static void *established_get_next(struct seq_file *seq, void *cur) 2703 { 2704 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2705 struct tcp_iter_state *st = seq->private; 2706 struct hlist_nulls_node *node; 2707 struct sock *sk = cur; 2708 2709 ++st->num; 2710 ++st->offset; 2711 2712 sk = sk_nulls_next(sk); 2713 2714 sk_nulls_for_each_from(sk, node) { 2715 if (seq_sk_match(seq, sk)) 2716 return sk; 2717 } 2718 2719 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 2720 ++st->bucket; 2721 return established_get_first(seq); 2722 } 2723 2724 static void *established_get_idx(struct seq_file *seq, loff_t pos) 2725 { 2726 struct tcp_iter_state *st = seq->private; 2727 void *rc; 2728 2729 st->bucket = 0; 2730 rc = established_get_first(seq); 2731 2732 while (rc && pos) { 2733 rc = established_get_next(seq, rc); 2734 --pos; 2735 } 2736 return rc; 2737 } 2738 2739 static void *tcp_get_idx(struct seq_file *seq, loff_t pos) 2740 { 2741 void *rc; 2742 struct tcp_iter_state *st = seq->private; 2743 2744 st->state = TCP_SEQ_STATE_LISTENING; 2745 rc = listening_get_idx(seq, &pos); 2746 2747 if (!rc) { 2748 st->state = TCP_SEQ_STATE_ESTABLISHED; 2749 rc = established_get_idx(seq, pos); 2750 } 2751 2752 return rc; 2753 } 2754 2755 static void *tcp_seek_last_pos(struct seq_file *seq) 2756 { 2757 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2758 struct 
tcp_iter_state *st = seq->private; 2759 int bucket = st->bucket; 2760 int offset = st->offset; 2761 int orig_num = st->num; 2762 void *rc = NULL; 2763 2764 switch (st->state) { 2765 case TCP_SEQ_STATE_LISTENING: 2766 if (st->bucket > hinfo->lhash2_mask) 2767 break; 2768 rc = listening_get_first(seq); 2769 while (offset-- && rc && bucket == st->bucket) 2770 rc = listening_get_next(seq, rc); 2771 if (rc) 2772 break; 2773 st->bucket = 0; 2774 st->state = TCP_SEQ_STATE_ESTABLISHED; 2775 fallthrough; 2776 case TCP_SEQ_STATE_ESTABLISHED: 2777 if (st->bucket > hinfo->ehash_mask) 2778 break; 2779 rc = established_get_first(seq); 2780 while (offset-- && rc && bucket == st->bucket) 2781 rc = established_get_next(seq, rc); 2782 } 2783 2784 st->num = orig_num; 2785 2786 return rc; 2787 } 2788 2789 void *tcp_seq_start(struct seq_file *seq, loff_t *pos) 2790 { 2791 struct tcp_iter_state *st = seq->private; 2792 void *rc; 2793 2794 if (*pos && *pos == st->last_pos) { 2795 rc = tcp_seek_last_pos(seq); 2796 if (rc) 2797 goto out; 2798 } 2799 2800 st->state = TCP_SEQ_STATE_LISTENING; 2801 st->num = 0; 2802 st->bucket = 0; 2803 st->offset = 0; 2804 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; 2805 2806 out: 2807 st->last_pos = *pos; 2808 return rc; 2809 } 2810 EXPORT_IPV6_MOD(tcp_seq_start); 2811 2812 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2813 { 2814 struct tcp_iter_state *st = seq->private; 2815 void *rc = NULL; 2816 2817 if (v == SEQ_START_TOKEN) { 2818 rc = tcp_get_idx(seq, 0); 2819 goto out; 2820 } 2821 2822 switch (st->state) { 2823 case TCP_SEQ_STATE_LISTENING: 2824 rc = listening_get_next(seq, v); 2825 if (!rc) { 2826 st->state = TCP_SEQ_STATE_ESTABLISHED; 2827 st->bucket = 0; 2828 st->offset = 0; 2829 rc = established_get_first(seq); 2830 } 2831 break; 2832 case TCP_SEQ_STATE_ESTABLISHED: 2833 rc = established_get_next(seq, v); 2834 break; 2835 } 2836 out: 2837 ++*pos; 2838 st->last_pos = *pos; 2839 return rc; 2840 } 2841 EXPORT_IPV6_MOD(tcp_seq_next); 2842 2843 void tcp_seq_stop(struct seq_file *seq, void *v) 2844 { 2845 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2846 struct tcp_iter_state *st = seq->private; 2847 2848 switch (st->state) { 2849 case TCP_SEQ_STATE_LISTENING: 2850 if (v != SEQ_START_TOKEN) 2851 spin_unlock(&hinfo->lhash2[st->bucket].lock); 2852 break; 2853 case TCP_SEQ_STATE_ESTABLISHED: 2854 if (v) 2855 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 2856 break; 2857 } 2858 } 2859 EXPORT_IPV6_MOD(tcp_seq_stop); 2860 2861 static void get_openreq4(const struct request_sock *req, 2862 struct seq_file *f, int i) 2863 { 2864 const struct inet_request_sock *ireq = inet_rsk(req); 2865 long delta = req->rsk_timer.expires - jiffies; 2866 2867 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2868 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK", 2869 i, 2870 ireq->ir_loc_addr, 2871 ireq->ir_num, 2872 ireq->ir_rmt_addr, 2873 ntohs(ireq->ir_rmt_port), 2874 TCP_SYN_RECV, 2875 0, 0, /* could print option size, but that is af dependent. 
*/ 2876 1, /* timers active (only the expire timer) */ 2877 jiffies_delta_to_clock_t(delta), 2878 req->num_timeout, 2879 from_kuid_munged(seq_user_ns(f), 2880 sk_uid(req->rsk_listener)), 2881 0, /* non standard timer */ 2882 0, /* open_requests have no inode */ 2883 0, 2884 req); 2885 } 2886 2887 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) 2888 { 2889 int timer_active; 2890 unsigned long timer_expires; 2891 const struct tcp_sock *tp = tcp_sk(sk); 2892 const struct inet_connection_sock *icsk = inet_csk(sk); 2893 const struct inet_sock *inet = inet_sk(sk); 2894 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; 2895 __be32 dest = inet->inet_daddr; 2896 __be32 src = inet->inet_rcv_saddr; 2897 __u16 destp = ntohs(inet->inet_dport); 2898 __u16 srcp = ntohs(inet->inet_sport); 2899 u8 icsk_pending; 2900 int rx_queue; 2901 int state; 2902 2903 icsk_pending = smp_load_acquire(&icsk->icsk_pending); 2904 if (icsk_pending == ICSK_TIME_RETRANS || 2905 icsk_pending == ICSK_TIME_REO_TIMEOUT || 2906 icsk_pending == ICSK_TIME_LOSS_PROBE) { 2907 timer_active = 1; 2908 timer_expires = icsk_timeout(icsk); 2909 } else if (icsk_pending == ICSK_TIME_PROBE0) { 2910 timer_active = 4; 2911 timer_expires = icsk_timeout(icsk); 2912 } else if (timer_pending(&sk->sk_timer)) { 2913 timer_active = 2; 2914 timer_expires = sk->sk_timer.expires; 2915 } else { 2916 timer_active = 0; 2917 timer_expires = jiffies; 2918 } 2919 2920 state = inet_sk_state_load(sk); 2921 if (state == TCP_LISTEN) 2922 rx_queue = READ_ONCE(sk->sk_ack_backlog); 2923 else 2924 /* Because we don't lock the socket, 2925 * we might find a transient negative value. 2926 */ 2927 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) - 2928 READ_ONCE(tp->copied_seq), 0); 2929 2930 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " 2931 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d", 2932 i, src, srcp, dest, destp, state, 2933 READ_ONCE(tp->write_seq) - tp->snd_una, 2934 rx_queue, 2935 timer_active, 2936 jiffies_delta_to_clock_t(timer_expires - jiffies), 2937 READ_ONCE(icsk->icsk_retransmits), 2938 from_kuid_munged(seq_user_ns(f), sk_uid(sk)), 2939 READ_ONCE(icsk->icsk_probes_out), 2940 sock_i_ino(sk), 2941 refcount_read(&sk->sk_refcnt), sk, 2942 jiffies_to_clock_t(icsk->icsk_rto), 2943 jiffies_to_clock_t(icsk->icsk_ack.ato), 2944 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk), 2945 tcp_snd_cwnd(tp), 2946 state == TCP_LISTEN ? 2947 fastopenq->max_qlen : 2948 (tcp_in_initial_slowstart(tp) ? 
-1 : tp->snd_ssthresh)); 2949 } 2950 2951 static void get_timewait4_sock(const struct inet_timewait_sock *tw, 2952 struct seq_file *f, int i) 2953 { 2954 long delta = tw->tw_timer.expires - jiffies; 2955 __be32 dest, src; 2956 __u16 destp, srcp; 2957 2958 dest = tw->tw_daddr; 2959 src = tw->tw_rcv_saddr; 2960 destp = ntohs(tw->tw_dport); 2961 srcp = ntohs(tw->tw_sport); 2962 2963 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2964 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK", 2965 i, src, srcp, dest, destp, READ_ONCE(tw->tw_substate), 0, 0, 2966 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, 2967 refcount_read(&tw->tw_refcnt), tw); 2968 } 2969 2970 #define TMPSZ 150 2971 2972 static int tcp4_seq_show(struct seq_file *seq, void *v) 2973 { 2974 struct tcp_iter_state *st; 2975 struct sock *sk = v; 2976 2977 seq_setwidth(seq, TMPSZ - 1); 2978 if (v == SEQ_START_TOKEN) { 2979 seq_puts(seq, " sl local_address rem_address st tx_queue " 2980 "rx_queue tr tm->when retrnsmt uid timeout " 2981 "inode"); 2982 goto out; 2983 } 2984 st = seq->private; 2985 2986 if (sk->sk_state == TCP_TIME_WAIT) 2987 get_timewait4_sock(v, seq, st->num); 2988 else if (sk->sk_state == TCP_NEW_SYN_RECV) 2989 get_openreq4(v, seq, st->num); 2990 else 2991 get_tcp4_sock(v, seq, st->num); 2992 out: 2993 seq_pad(seq, '\n'); 2994 return 0; 2995 } 2996 2997 #ifdef CONFIG_BPF_SYSCALL 2998 union bpf_tcp_iter_batch_item { 2999 struct sock *sk; 3000 __u64 cookie; 3001 }; 3002 3003 struct bpf_tcp_iter_state { 3004 struct tcp_iter_state state; 3005 unsigned int cur_sk; 3006 unsigned int end_sk; 3007 unsigned int max_sk; 3008 union bpf_tcp_iter_batch_item *batch; 3009 }; 3010 3011 struct bpf_iter__tcp { 3012 __bpf_md_ptr(struct bpf_iter_meta *, meta); 3013 __bpf_md_ptr(struct sock_common *, sk_common); 3014 uid_t uid __aligned(8); 3015 }; 3016 3017 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, 3018 struct sock_common *sk_common, uid_t uid) 3019 { 3020 struct bpf_iter__tcp ctx; 3021 3022 meta->seq_num--; /* skip SEQ_START_TOKEN */ 3023 ctx.meta = meta; 3024 ctx.sk_common = sk_common; 3025 ctx.uid = uid; 3026 return bpf_iter_run_prog(prog, &ctx); 3027 } 3028 3029 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter) 3030 { 3031 union bpf_tcp_iter_batch_item *item; 3032 unsigned int cur_sk = iter->cur_sk; 3033 __u64 cookie; 3034 3035 /* Remember the cookies of the sockets we haven't seen yet, so we can 3036 * pick up where we left off next time around. 
3037 */ 3038 while (cur_sk < iter->end_sk) { 3039 item = &iter->batch[cur_sk++]; 3040 cookie = sock_gen_cookie(item->sk); 3041 sock_gen_put(item->sk); 3042 item->cookie = cookie; 3043 } 3044 } 3045 3046 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter, 3047 unsigned int new_batch_sz, gfp_t flags) 3048 { 3049 union bpf_tcp_iter_batch_item *new_batch; 3050 3051 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, 3052 flags | __GFP_NOWARN); 3053 if (!new_batch) 3054 return -ENOMEM; 3055 3056 memcpy(new_batch, iter->batch, sizeof(*iter->batch) * iter->end_sk); 3057 kvfree(iter->batch); 3058 iter->batch = new_batch; 3059 iter->max_sk = new_batch_sz; 3060 3061 return 0; 3062 } 3063 3064 static struct sock *bpf_iter_tcp_resume_bucket(struct sock *first_sk, 3065 union bpf_tcp_iter_batch_item *cookies, 3066 int n_cookies) 3067 { 3068 struct hlist_nulls_node *node; 3069 struct sock *sk; 3070 int i; 3071 3072 for (i = 0; i < n_cookies; i++) { 3073 sk = first_sk; 3074 sk_nulls_for_each_from(sk, node) 3075 if (cookies[i].cookie == atomic64_read(&sk->sk_cookie)) 3076 return sk; 3077 } 3078 3079 return NULL; 3080 } 3081 3082 static struct sock *bpf_iter_tcp_resume_listening(struct seq_file *seq) 3083 { 3084 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 3085 struct bpf_tcp_iter_state *iter = seq->private; 3086 struct tcp_iter_state *st = &iter->state; 3087 unsigned int find_cookie = iter->cur_sk; 3088 unsigned int end_cookie = iter->end_sk; 3089 int resume_bucket = st->bucket; 3090 struct sock *sk; 3091 3092 if (end_cookie && find_cookie == end_cookie) 3093 ++st->bucket; 3094 3095 sk = listening_get_first(seq); 3096 iter->cur_sk = 0; 3097 iter->end_sk = 0; 3098 3099 if (sk && st->bucket == resume_bucket && end_cookie) { 3100 sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie], 3101 end_cookie - find_cookie); 3102 if (!sk) { 3103 spin_unlock(&hinfo->lhash2[st->bucket].lock); 3104 ++st->bucket; 3105 sk = listening_get_first(seq); 3106 } 3107 } 3108 3109 return sk; 3110 } 3111 3112 static struct sock *bpf_iter_tcp_resume_established(struct seq_file *seq) 3113 { 3114 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 3115 struct bpf_tcp_iter_state *iter = seq->private; 3116 struct tcp_iter_state *st = &iter->state; 3117 unsigned int find_cookie = iter->cur_sk; 3118 unsigned int end_cookie = iter->end_sk; 3119 int resume_bucket = st->bucket; 3120 struct sock *sk; 3121 3122 if (end_cookie && find_cookie == end_cookie) 3123 ++st->bucket; 3124 3125 sk = established_get_first(seq); 3126 iter->cur_sk = 0; 3127 iter->end_sk = 0; 3128 3129 if (sk && st->bucket == resume_bucket && end_cookie) { 3130 sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie], 3131 end_cookie - find_cookie); 3132 if (!sk) { 3133 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 3134 ++st->bucket; 3135 sk = established_get_first(seq); 3136 } 3137 } 3138 3139 return sk; 3140 } 3141 3142 static struct sock *bpf_iter_tcp_resume(struct seq_file *seq) 3143 { 3144 struct bpf_tcp_iter_state *iter = seq->private; 3145 struct tcp_iter_state *st = &iter->state; 3146 struct sock *sk = NULL; 3147 3148 switch (st->state) { 3149 case TCP_SEQ_STATE_LISTENING: 3150 sk = bpf_iter_tcp_resume_listening(seq); 3151 if (sk) 3152 break; 3153 st->bucket = 0; 3154 st->state = TCP_SEQ_STATE_ESTABLISHED; 3155 fallthrough; 3156 case TCP_SEQ_STATE_ESTABLISHED: 3157 sk = bpf_iter_tcp_resume_established(seq); 3158 break; 3159 } 3160 3161 return sk; 3162 } 3163 3164 
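/*
 * Editor's note (illustration only, not part of the kernel file): the
 * resume helpers above identify sockets by sock_gen_cookie() values
 * rather than by pointer, because the iterator drops its references
 * between batches and a remembered socket may be gone by the time the
 * next read() happens.  bpf_iter_tcp_resume_bucket() tries each
 * remembered cookie in order and returns the first one still present;
 * if none survives, iteration moves on to the next bucket.  The
 * stand-alone userspace sketch below mimics that lookup on a plain
 * singly linked list; every name in it (demo_node, demo_resume, ...)
 * is hypothetical and exists only to show the pattern.
 */
#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct demo_node {
	uint64_t cookie;		/* stable identity, like sk_cookie */
	struct demo_node *next;
};

/* Return the first list member matching any remembered cookie, trying
 * the remembered cookies in order; NULL means none of them survived.
 */
static struct demo_node *demo_resume(struct demo_node *bucket_head,
				     const uint64_t *cookies, int n_cookies)
{
	for (int i = 0; i < n_cookies; i++)
		for (struct demo_node *n = bucket_head; n; n = n->next)
			if (n->cookie == cookies[i])
				return n;
	return NULL;
}

int main(void)
{
	struct demo_node c = { .cookie = 30 };
	struct demo_node b = { .cookie = 20, .next = &c };
	struct demo_node a = { .cookie = 10, .next = &b };
	/* Cookie 15 was destroyed since the last batch, 20 survived. */
	const uint64_t remembered[] = { 15, 20 };
	struct demo_node *n = demo_resume(&a, remembered, 2);

	printf("resume at cookie %" PRIu64 "\n", n ? n->cookie : 0);
	return 0;
}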
static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq, 3165 struct sock **start_sk) 3166 { 3167 struct bpf_tcp_iter_state *iter = seq->private; 3168 struct hlist_nulls_node *node; 3169 unsigned int expected = 1; 3170 struct sock *sk; 3171 3172 sock_hold(*start_sk); 3173 iter->batch[iter->end_sk++].sk = *start_sk; 3174 3175 sk = sk_nulls_next(*start_sk); 3176 *start_sk = NULL; 3177 sk_nulls_for_each_from(sk, node) { 3178 if (seq_sk_match(seq, sk)) { 3179 if (iter->end_sk < iter->max_sk) { 3180 sock_hold(sk); 3181 iter->batch[iter->end_sk++].sk = sk; 3182 } else if (!*start_sk) { 3183 /* Remember where we left off. */ 3184 *start_sk = sk; 3185 } 3186 expected++; 3187 } 3188 } 3189 3190 return expected; 3191 } 3192 3193 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq, 3194 struct sock **start_sk) 3195 { 3196 struct bpf_tcp_iter_state *iter = seq->private; 3197 struct hlist_nulls_node *node; 3198 unsigned int expected = 1; 3199 struct sock *sk; 3200 3201 sock_hold(*start_sk); 3202 iter->batch[iter->end_sk++].sk = *start_sk; 3203 3204 sk = sk_nulls_next(*start_sk); 3205 *start_sk = NULL; 3206 sk_nulls_for_each_from(sk, node) { 3207 if (seq_sk_match(seq, sk)) { 3208 if (iter->end_sk < iter->max_sk) { 3209 sock_hold(sk); 3210 iter->batch[iter->end_sk++].sk = sk; 3211 } else if (!*start_sk) { 3212 /* Remember where we left off. */ 3213 *start_sk = sk; 3214 } 3215 expected++; 3216 } 3217 } 3218 3219 return expected; 3220 } 3221 3222 static unsigned int bpf_iter_fill_batch(struct seq_file *seq, 3223 struct sock **start_sk) 3224 { 3225 struct bpf_tcp_iter_state *iter = seq->private; 3226 struct tcp_iter_state *st = &iter->state; 3227 3228 if (st->state == TCP_SEQ_STATE_LISTENING) 3229 return bpf_iter_tcp_listening_batch(seq, start_sk); 3230 else 3231 return bpf_iter_tcp_established_batch(seq, start_sk); 3232 } 3233 3234 static void bpf_iter_tcp_unlock_bucket(struct seq_file *seq) 3235 { 3236 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 3237 struct bpf_tcp_iter_state *iter = seq->private; 3238 struct tcp_iter_state *st = &iter->state; 3239 3240 if (st->state == TCP_SEQ_STATE_LISTENING) 3241 spin_unlock(&hinfo->lhash2[st->bucket].lock); 3242 else 3243 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 3244 } 3245 3246 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq) 3247 { 3248 struct bpf_tcp_iter_state *iter = seq->private; 3249 unsigned int expected; 3250 struct sock *sk; 3251 int err; 3252 3253 sk = bpf_iter_tcp_resume(seq); 3254 if (!sk) 3255 return NULL; /* Done */ 3256 3257 expected = bpf_iter_fill_batch(seq, &sk); 3258 if (likely(iter->end_sk == expected)) 3259 goto done; 3260 3261 /* Batch size was too small. */ 3262 bpf_iter_tcp_unlock_bucket(seq); 3263 bpf_iter_tcp_put_batch(iter); 3264 err = bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2, 3265 GFP_USER); 3266 if (err) 3267 return ERR_PTR(err); 3268 3269 sk = bpf_iter_tcp_resume(seq); 3270 if (!sk) 3271 return NULL; /* Done */ 3272 3273 expected = bpf_iter_fill_batch(seq, &sk); 3274 if (likely(iter->end_sk == expected)) 3275 goto done; 3276 3277 /* Batch size was still too small. Hold onto the lock while we try 3278 * again with a larger batch to make sure the current bucket's size 3279 * does not change in the meantime. 
	 */
	err = bpf_iter_tcp_realloc_batch(iter, expected, GFP_NOWAIT);
	if (err) {
		bpf_iter_tcp_unlock_bucket(seq);
		return ERR_PTR(err);
	}

	expected = bpf_iter_fill_batch(seq, &sk);
	WARN_ON_ONCE(iter->end_sk != expected);
done:
	bpf_iter_tcp_unlock_bucket(seq);
	return iter->batch[0].sk;
}

static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	/* bpf iter does not support lseek, so it always
	 * continues from where it was stop()-ped.
	 */
	if (*pos)
		return bpf_iter_tcp_batch(seq);

	return SEQ_START_TOKEN;
}

static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	struct sock *sk;

	/* Whenever seq_next() is called, the iter->cur_sk is
	 * done with seq_show(), so advance to the next sk in
	 * the batch.
	 */
	if (iter->cur_sk < iter->end_sk) {
		/* Keeping st->num consistent in tcp_iter_state.
		 * bpf_iter_tcp does not use st->num.
		 * meta.seq_num is used instead.
		 */
		st->num++;
		sock_gen_put(iter->batch[iter->cur_sk++].sk);
	}

	if (iter->cur_sk < iter->end_sk)
		sk = iter->batch[iter->cur_sk].sk;
	else
		sk = bpf_iter_tcp_batch(seq);

	++*pos;
	/* Keeping st->last_pos consistent in tcp_iter_state.
	 * bpf iter does not do lseek, so st->last_pos always equals to *pos.
	 */
	st->last_pos = *pos;
	return sk;
}

static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
{
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;
	struct sock *sk = v;
	uid_t uid;
	int ret;

	if (v == SEQ_START_TOKEN)
		return 0;

	if (sk_fullsock(sk))
		lock_sock(sk);

	if (unlikely(sk_unhashed(sk))) {
		ret = SEQ_SKIP;
		goto unlock;
	}

	if (sk->sk_state == TCP_TIME_WAIT) {
		uid = 0;
	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
		const struct request_sock *req = v;

		uid = from_kuid_munged(seq_user_ns(seq),
				       sk_uid(req->rsk_listener));
	} else {
		uid = from_kuid_munged(seq_user_ns(seq), sk_uid(sk));
	}

	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, false);
	ret = tcp_prog_seq_show(prog, &meta, v, uid);

unlock:
	if (sk_fullsock(sk))
		release_sock(sk);
	return ret;
}

static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;

	if (!v) {
		meta.seq = seq;
		prog = bpf_iter_get_info(&meta, true);
		if (prog)
			(void)tcp_prog_seq_show(prog, &meta, v, 0);
	}

	if (iter->cur_sk < iter->end_sk)
		bpf_iter_tcp_put_batch(iter);
}

static const struct seq_operations bpf_iter_tcp_seq_ops = {
	.show		= bpf_iter_tcp_seq_show,
	.start		= bpf_iter_tcp_seq_start,
	.next		= bpf_iter_tcp_seq_next,
	.stop		= bpf_iter_tcp_seq_stop,
};
#endif
static unsigned short seq_file_family(const struct seq_file *seq)
{
	const struct tcp_seq_afinfo *afinfo;

#ifdef CONFIG_BPF_SYSCALL
	/* Iterated from bpf_iter. Let the bpf prog filter instead.
*/ 3408 if (seq->op == &bpf_iter_tcp_seq_ops) 3409 return AF_UNSPEC; 3410 #endif 3411 3412 /* Iterated from proc fs */ 3413 afinfo = pde_data(file_inode(seq->file)); 3414 return afinfo->family; 3415 } 3416 3417 static const struct seq_operations tcp4_seq_ops = { 3418 .show = tcp4_seq_show, 3419 .start = tcp_seq_start, 3420 .next = tcp_seq_next, 3421 .stop = tcp_seq_stop, 3422 }; 3423 3424 static struct tcp_seq_afinfo tcp4_seq_afinfo = { 3425 .family = AF_INET, 3426 }; 3427 3428 static int __net_init tcp4_proc_init_net(struct net *net) 3429 { 3430 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops, 3431 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo)) 3432 return -ENOMEM; 3433 return 0; 3434 } 3435 3436 static void __net_exit tcp4_proc_exit_net(struct net *net) 3437 { 3438 remove_proc_entry("tcp", net->proc_net); 3439 } 3440 3441 static struct pernet_operations tcp4_net_ops = { 3442 .init = tcp4_proc_init_net, 3443 .exit = tcp4_proc_exit_net, 3444 }; 3445 3446 int __init tcp4_proc_init(void) 3447 { 3448 return register_pernet_subsys(&tcp4_net_ops); 3449 } 3450 3451 void tcp4_proc_exit(void) 3452 { 3453 unregister_pernet_subsys(&tcp4_net_ops); 3454 } 3455 #endif /* CONFIG_PROC_FS */ 3456 3457 /* @wake is one when sk_stream_write_space() calls us. 3458 * This sends EPOLLOUT only if notsent_bytes is half the limit. 3459 * This mimics the strategy used in sock_def_write_space(). 3460 */ 3461 bool tcp_stream_memory_free(const struct sock *sk, int wake) 3462 { 3463 const struct tcp_sock *tp = tcp_sk(sk); 3464 u32 notsent_bytes = READ_ONCE(tp->write_seq) - 3465 READ_ONCE(tp->snd_nxt); 3466 3467 return (notsent_bytes << wake) < tcp_notsent_lowat(tp); 3468 } 3469 EXPORT_SYMBOL(tcp_stream_memory_free); 3470 3471 struct proto tcp_prot = { 3472 .name = "TCP", 3473 .owner = THIS_MODULE, 3474 .close = tcp_close, 3475 .pre_connect = tcp_v4_pre_connect, 3476 .connect = tcp_v4_connect, 3477 .disconnect = tcp_disconnect, 3478 .accept = inet_csk_accept, 3479 .ioctl = tcp_ioctl, 3480 .init = tcp_v4_init_sock, 3481 .destroy = tcp_v4_destroy_sock, 3482 .shutdown = tcp_shutdown, 3483 .setsockopt = tcp_setsockopt, 3484 .getsockopt = tcp_getsockopt, 3485 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt, 3486 .keepalive = tcp_set_keepalive, 3487 .recvmsg = tcp_recvmsg, 3488 .sendmsg = tcp_sendmsg, 3489 .splice_eof = tcp_splice_eof, 3490 .backlog_rcv = tcp_v4_do_rcv, 3491 .release_cb = tcp_release_cb, 3492 .hash = inet_hash, 3493 .unhash = inet_unhash, 3494 .get_port = inet_csk_get_port, 3495 .put_port = inet_put_port, 3496 #ifdef CONFIG_BPF_SYSCALL 3497 .psock_update_sk_prot = tcp_bpf_update_proto, 3498 #endif 3499 .enter_memory_pressure = tcp_enter_memory_pressure, 3500 .leave_memory_pressure = tcp_leave_memory_pressure, 3501 .stream_memory_free = tcp_stream_memory_free, 3502 .sockets_allocated = &tcp_sockets_allocated, 3503 3504 .memory_allocated = &net_aligned_data.tcp_memory_allocated, 3505 .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, 3506 3507 .memory_pressure = &tcp_memory_pressure, 3508 .sysctl_mem = sysctl_tcp_mem, 3509 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), 3510 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), 3511 .max_header = MAX_TCP_HEADER, 3512 .obj_size = sizeof(struct tcp_sock), 3513 .slab_flags = SLAB_TYPESAFE_BY_RCU, 3514 .twsk_prot = &tcp_timewait_sock_ops, 3515 .rsk_prot = &tcp_request_sock_ops, 3516 .h.hashinfo = NULL, 3517 .no_autobind = true, 3518 .diag_destroy = tcp_abort, 3519 }; 3520 EXPORT_SYMBOL(tcp_prot); 3521 3522 
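/*
 * Editor's note (illustration only, not part of the kernel file):
 * tcp_stream_memory_free() above is the kernel side of
 * TCP_NOTSENT_LOWAT.  With @wake == 1 the check
 * (notsent_bytes << 1) < tcp_notsent_lowat(tp) reports the stream as
 * writable only once the not-yet-sent backlog has fallen below half
 * of the configured limit.  The stand-alone userspace sketch below
 * shows the matching configuration; the 128 kB value is an arbitrary
 * example, not a recommendation.
 */
#include <stdio.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

int main(void)
{
	int lowat = 128 * 1024;	/* cap the unsent data queued in the socket */
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0 || setsockopt(fd, IPPROTO_TCP, TCP_NOTSENT_LOWAT,
				 &lowat, sizeof(lowat)) < 0) {
		perror("TCP_NOTSENT_LOWAT");
		return 1;
	}
	/* poll()/epoll() now stop reporting the socket writable once more
	 * than roughly TCP_NOTSENT_LOWAT bytes of unsent data are queued.
	 */
	printf("notsent lowat set to %d bytes on fd %d\n", lowat, fd);
	return 0;
}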
static void __net_exit tcp_sk_exit(struct net *net) 3523 { 3524 if (net->ipv4.tcp_congestion_control) 3525 bpf_module_put(net->ipv4.tcp_congestion_control, 3526 net->ipv4.tcp_congestion_control->owner); 3527 } 3528 3529 static void __net_init tcp_set_hashinfo(struct net *net) 3530 { 3531 struct inet_hashinfo *hinfo; 3532 unsigned int ehash_entries; 3533 struct net *old_net; 3534 3535 if (net_eq(net, &init_net)) 3536 goto fallback; 3537 3538 old_net = current->nsproxy->net_ns; 3539 ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries); 3540 if (!ehash_entries) 3541 goto fallback; 3542 3543 ehash_entries = roundup_pow_of_two(ehash_entries); 3544 hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries); 3545 if (!hinfo) { 3546 pr_warn("Failed to allocate TCP ehash (entries: %u) " 3547 "for a netns, fallback to the global one\n", 3548 ehash_entries); 3549 fallback: 3550 hinfo = &tcp_hashinfo; 3551 ehash_entries = tcp_hashinfo.ehash_mask + 1; 3552 } 3553 3554 net->ipv4.tcp_death_row.hashinfo = hinfo; 3555 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2; 3556 net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128); 3557 } 3558 3559 static int __net_init tcp_sk_init(struct net *net) 3560 { 3561 net->ipv4.sysctl_tcp_ecn = 2; 3562 net->ipv4.sysctl_tcp_ecn_fallback = 1; 3563 3564 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; 3565 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS; 3566 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; 3567 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL; 3568 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS; 3569 3570 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME; 3571 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES; 3572 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL; 3573 3574 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES; 3575 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES; 3576 net->ipv4.sysctl_tcp_syncookies = 1; 3577 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH; 3578 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1; 3579 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2; 3580 net->ipv4.sysctl_tcp_orphan_retries = 0; 3581 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; 3582 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; 3583 net->ipv4.sysctl_tcp_tw_reuse = 2; 3584 net->ipv4.sysctl_tcp_tw_reuse_delay = 1 * MSEC_PER_SEC; 3585 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1; 3586 3587 refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1); 3588 tcp_set_hashinfo(net); 3589 3590 net->ipv4.sysctl_tcp_sack = 1; 3591 net->ipv4.sysctl_tcp_window_scaling = 1; 3592 net->ipv4.sysctl_tcp_timestamps = 1; 3593 net->ipv4.sysctl_tcp_early_retrans = 3; 3594 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION; 3595 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */ 3596 net->ipv4.sysctl_tcp_retrans_collapse = 1; 3597 net->ipv4.sysctl_tcp_max_reordering = 300; 3598 net->ipv4.sysctl_tcp_dsack = 1; 3599 net->ipv4.sysctl_tcp_app_win = 31; 3600 net->ipv4.sysctl_tcp_adv_win_scale = 1; 3601 net->ipv4.sysctl_tcp_frto = 2; 3602 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1; 3603 /* This limits the percentage of the congestion window which we 3604 * will allow a single TSO frame to consume. Building TSO frames 3605 * which are too large can cause TCP streams to be bursty. 
3606 */ 3607 net->ipv4.sysctl_tcp_tso_win_divisor = 3; 3608 /* Default TSQ limit of 4 MB */ 3609 net->ipv4.sysctl_tcp_limit_output_bytes = 4 << 20; 3610 3611 /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */ 3612 net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX; 3613 3614 net->ipv4.sysctl_tcp_min_tso_segs = 2; 3615 net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */ 3616 net->ipv4.sysctl_tcp_min_rtt_wlen = 300; 3617 net->ipv4.sysctl_tcp_autocorking = 1; 3618 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; 3619 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; 3620 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120; 3621 if (net != &init_net) { 3622 memcpy(net->ipv4.sysctl_tcp_rmem, 3623 init_net.ipv4.sysctl_tcp_rmem, 3624 sizeof(init_net.ipv4.sysctl_tcp_rmem)); 3625 memcpy(net->ipv4.sysctl_tcp_wmem, 3626 init_net.ipv4.sysctl_tcp_wmem, 3627 sizeof(init_net.ipv4.sysctl_tcp_wmem)); 3628 } 3629 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC; 3630 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC; 3631 net->ipv4.sysctl_tcp_comp_sack_nr = 44; 3632 net->ipv4.sysctl_tcp_backlog_ack_defer = 1; 3633 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; 3634 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0; 3635 atomic_set(&net->ipv4.tfo_active_disable_times, 0); 3636 3637 /* Set default values for PLB */ 3638 net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */ 3639 net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3; 3640 net->ipv4.sysctl_tcp_plb_rehash_rounds = 12; 3641 net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60; 3642 /* Default congestion threshold for PLB to mark a round is 50% */ 3643 net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2; 3644 3645 /* Reno is always built in */ 3646 if (!net_eq(net, &init_net) && 3647 bpf_try_module_get(init_net.ipv4.tcp_congestion_control, 3648 init_net.ipv4.tcp_congestion_control->owner)) 3649 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control; 3650 else 3651 net->ipv4.tcp_congestion_control = &tcp_reno; 3652 3653 net->ipv4.sysctl_tcp_syn_linear_timeouts = 4; 3654 net->ipv4.sysctl_tcp_shrink_window = 0; 3655 3656 net->ipv4.sysctl_tcp_pingpong_thresh = 1; 3657 net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN); 3658 net->ipv4.sysctl_tcp_rto_max_ms = TCP_RTO_MAX_SEC * MSEC_PER_SEC; 3659 3660 return 0; 3661 } 3662 3663 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) 3664 { 3665 struct net *net; 3666 3667 /* make sure concurrent calls to tcp_sk_exit_batch from net_cleanup_work 3668 * and failed setup_net error unwinding path are serialized. 3669 * 3670 * tcp_twsk_purge() handles twsk in any dead netns, not just those in 3671 * net_exit_list, the thread that dismantles a particular twsk must 3672 * do so without other thread progressing to refcount_dec_and_test() of 3673 * tcp_death_row.tw_refcount. 
3674 */ 3675 mutex_lock(&tcp_exit_batch_mutex); 3676 3677 tcp_twsk_purge(net_exit_list); 3678 3679 list_for_each_entry(net, net_exit_list, exit_list) { 3680 inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo); 3681 WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount)); 3682 tcp_fastopen_ctx_destroy(net); 3683 } 3684 3685 mutex_unlock(&tcp_exit_batch_mutex); 3686 } 3687 3688 static struct pernet_operations __net_initdata tcp_sk_ops = { 3689 .init = tcp_sk_init, 3690 .exit = tcp_sk_exit, 3691 .exit_batch = tcp_sk_exit_batch, 3692 }; 3693 3694 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3695 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta, 3696 struct sock_common *sk_common, uid_t uid) 3697 3698 #define INIT_BATCH_SZ 16 3699 3700 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux) 3701 { 3702 struct bpf_tcp_iter_state *iter = priv_data; 3703 int err; 3704 3705 err = bpf_iter_init_seq_net(priv_data, aux); 3706 if (err) 3707 return err; 3708 3709 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ, GFP_USER); 3710 if (err) { 3711 bpf_iter_fini_seq_net(priv_data); 3712 return err; 3713 } 3714 3715 return 0; 3716 } 3717 3718 static void bpf_iter_fini_tcp(void *priv_data) 3719 { 3720 struct bpf_tcp_iter_state *iter = priv_data; 3721 3722 bpf_iter_fini_seq_net(priv_data); 3723 kvfree(iter->batch); 3724 } 3725 3726 static const struct bpf_iter_seq_info tcp_seq_info = { 3727 .seq_ops = &bpf_iter_tcp_seq_ops, 3728 .init_seq_private = bpf_iter_init_tcp, 3729 .fini_seq_private = bpf_iter_fini_tcp, 3730 .seq_priv_size = sizeof(struct bpf_tcp_iter_state), 3731 }; 3732 3733 static const struct bpf_func_proto * 3734 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id, 3735 const struct bpf_prog *prog) 3736 { 3737 switch (func_id) { 3738 case BPF_FUNC_setsockopt: 3739 return &bpf_sk_setsockopt_proto; 3740 case BPF_FUNC_getsockopt: 3741 return &bpf_sk_getsockopt_proto; 3742 default: 3743 return NULL; 3744 } 3745 } 3746 3747 static struct bpf_iter_reg tcp_reg_info = { 3748 .target = "tcp", 3749 .ctx_arg_info_size = 1, 3750 .ctx_arg_info = { 3751 { offsetof(struct bpf_iter__tcp, sk_common), 3752 PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED }, 3753 }, 3754 .get_func_proto = bpf_iter_tcp_get_func_proto, 3755 .seq_info = &tcp_seq_info, 3756 }; 3757 3758 static void __init bpf_iter_register(void) 3759 { 3760 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON]; 3761 if (bpf_iter_reg_target(&tcp_reg_info)) 3762 pr_warn("Warning: could not register bpf iterator tcp\n"); 3763 } 3764 3765 #endif 3766 3767 void __init tcp_v4_init(void) 3768 { 3769 int cpu, res; 3770 3771 for_each_possible_cpu(cpu) { 3772 struct sock *sk; 3773 3774 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW, 3775 IPPROTO_TCP, &init_net); 3776 if (res) 3777 panic("Failed to create the TCP control socket.\n"); 3778 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); 3779 3780 /* Please enforce IP_DF and IPID==0 for RST and 3781 * ACK sent in SYN-RECV and TIME-WAIT state. 3782 */ 3783 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO; 3784 3785 sk->sk_clockid = CLOCK_MONOTONIC; 3786 3787 per_cpu(ipv4_tcp_sk.sock, cpu) = sk; 3788 } 3789 if (register_pernet_subsys(&tcp_sk_ops)) 3790 panic("Failed to create the TCP control socket.\n"); 3791 3792 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3793 bpf_iter_register(); 3794 #endif 3795 } 3796
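/*
 * Editor's note (illustration only, not part of the kernel file): the
 * bpf_iter_reg_target(&tcp_reg_info) call above is what makes a
 * SEC("iter/tcp") BPF program attachable, and
 * bpf_iter_tcp_get_func_proto() is why such programs may additionally
 * call bpf_sk_setsockopt()/bpf_sk_getsockopt().  The fragment below is
 * a minimal sketch of such a program, written against libbpf's
 * vmlinux.h/bpf_tracing.h conventions (an assumption about the build
 * environment, not something defined in this file).  It only prints
 * one line per socket; a user would typically pin it with
 * "bpftool iter pin" and read the output with cat.
 */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char LICENSE[] SEC("license") = "GPL";

SEC("iter/tcp")
int dump_tcp(struct bpf_iter__tcp *ctx)
{
	struct sock_common *skc = ctx->sk_common;
	struct seq_file *seq = ctx->meta->seq;

	/* sk_common is NULL for the final call at the end of iteration. */
	if (!skc)
		return 0;

	/* One line per socket: address family and TCP state. */
	BPF_SEQ_PRINTF(seq, "family=%d state=%d\n",
		       skc->skc_family, skc->skc_state);
	return 0;
}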