1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * INET An implementation of the TCP/IP protocol suite for the LINUX 4 * operating system. INET is implemented using the BSD Socket 5 * interface as the means of communication with the user level. 6 * 7 * Implementation of the Transmission Control Protocol(TCP). 8 * 9 * IPv4 specific functions 10 * 11 * code split from: 12 * linux/ipv4/tcp.c 13 * linux/ipv4/tcp_input.c 14 * linux/ipv4/tcp_output.c 15 * 16 * See tcp.c for author information 17 */ 18 19 /* 20 * Changes: 21 * David S. Miller : New socket lookup architecture. 22 * This code is dedicated to John Dyson. 23 * David S. Miller : Change semantics of established hash, 24 * half is devoted to TIME_WAIT sockets 25 * and the rest go in the other half. 26 * Andi Kleen : Add support for syncookies and fixed 27 * some bugs: ip options weren't passed to 28 * the TCP layer, missed a check for an 29 * ACK bit. 30 * Andi Kleen : Implemented fast path mtu discovery. 31 * Fixed many serious bugs in the 32 * request_sock handling and moved 33 * most of it into the af independent code. 34 * Added tail drop and some other bugfixes. 35 * Added new listen semantics. 36 * Mike McLagan : Routing by source 37 * Juan Jose Ciarlante: ip_dynaddr bits 38 * Andi Kleen: various fixes. 39 * Vitaly E. Lavrov : Transparent proxy revived after year 40 * coma. 41 * Andi Kleen : Fix new listen. 42 * Andi Kleen : Fix accept error reporting. 43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which 44 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind 45 * a single port at the same time. 46 */ 47 48 #define pr_fmt(fmt) "TCP: " fmt 49 50 #include <linux/bottom_half.h> 51 #include <linux/types.h> 52 #include <linux/fcntl.h> 53 #include <linux/module.h> 54 #include <linux/random.h> 55 #include <linux/cache.h> 56 #include <linux/jhash.h> 57 #include <linux/init.h> 58 #include <linux/times.h> 59 #include <linux/slab.h> 60 #include <linux/sched.h> 61 #include <linux/sock_diag.h> 62 63 #include <net/aligned_data.h> 64 #include <net/net_namespace.h> 65 #include <net/icmp.h> 66 #include <net/inet_hashtables.h> 67 #include <net/tcp.h> 68 #include <net/tcp_ecn.h> 69 #include <net/transp_v6.h> 70 #include <net/ipv6.h> 71 #include <net/inet_common.h> 72 #include <net/inet_ecn.h> 73 #include <net/timewait_sock.h> 74 #include <net/xfrm.h> 75 #include <net/secure_seq.h> 76 #include <net/busy_poll.h> 77 #include <net/rstreason.h> 78 79 #include <linux/inet.h> 80 #include <linux/ipv6.h> 81 #include <linux/stddef.h> 82 #include <linux/proc_fs.h> 83 #include <linux/seq_file.h> 84 #include <linux/inetdevice.h> 85 #include <linux/btf_ids.h> 86 #include <linux/skbuff_ref.h> 87 88 #include <crypto/hash.h> 89 #include <linux/scatterlist.h> 90 91 #include <trace/events/tcp.h> 92 93 #ifdef CONFIG_TCP_MD5SIG 94 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, 95 __be32 daddr, __be32 saddr, const struct tcphdr *th); 96 #endif 97 98 struct inet_hashinfo tcp_hashinfo; 99 100 static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = { 101 .bh_lock = INIT_LOCAL_LOCK(bh_lock), 102 }; 103 104 static DEFINE_MUTEX(tcp_exit_batch_mutex); 105 106 static u32 tcp_v4_init_seq(const struct sk_buff *skb) 107 { 108 return secure_tcp_seq(ip_hdr(skb)->daddr, 109 ip_hdr(skb)->saddr, 110 tcp_hdr(skb)->dest, 111 tcp_hdr(skb)->source); 112 } 113 114 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb) 115 { 116 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, 
ip_hdr(skb)->saddr); 117 } 118 119 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) 120 { 121 int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse); 122 const struct inet_timewait_sock *tw = inet_twsk(sktw); 123 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw); 124 struct tcp_sock *tp = tcp_sk(sk); 125 int ts_recent_stamp; 126 u32 reuse_thresh; 127 128 if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2) 129 reuse = 0; 130 131 if (reuse == 2) { 132 /* Still does not detect *everything* that goes through 133 * lo, since we require a loopback src or dst address 134 * or direct binding to 'lo' interface. 135 */ 136 bool loopback = false; 137 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX) 138 loopback = true; 139 #if IS_ENABLED(CONFIG_IPV6) 140 if (tw->tw_family == AF_INET6) { 141 if (ipv6_addr_loopback(&tw->tw_v6_daddr) || 142 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) || 143 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) || 144 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr)) 145 loopback = true; 146 } else 147 #endif 148 { 149 if (ipv4_is_loopback(tw->tw_daddr) || 150 ipv4_is_loopback(tw->tw_rcv_saddr)) 151 loopback = true; 152 } 153 if (!loopback) 154 reuse = 0; 155 } 156 157 /* With PAWS, it is safe from the viewpoint 158 of data integrity. Even without PAWS it is safe provided sequence 159 spaces do not overlap i.e. at data rates <= 80Mbit/sec. 160 161 Actually, the idea is close to VJ's one, only timestamp cache is 162 held not per host, but per port pair and TW bucket is used as state 163 holder. 164 165 If TW bucket has been already destroyed we fall back to VJ's scheme 166 and use initial timestamp retrieved from peer table. 167 */ 168 ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp); 169 reuse_thresh = READ_ONCE(tw->tw_entry_stamp) + 170 READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse_delay); 171 if (ts_recent_stamp && 172 (!twp || (reuse && time_after32(tcp_clock_ms(), reuse_thresh)))) { 173 /* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk 174 * and releasing the bucket lock. 175 */ 176 if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt))) 177 return 0; 178 179 /* In case of repair and re-using TIME-WAIT sockets we still 180 * want to be sure that it is safe as above but honor the 181 * sequence numbers and time stamps set as part of the repair 182 * process. 183 * 184 * Without this check re-using a TIME-WAIT socket with TCP 185 * repair would accumulate a -1 on the repair assigned 186 * sequence number. The first time it is reused the sequence 187 * is -1, the second time -2, etc. This fixes that issue 188 * without appearing to create any others. 189 */ 190 if (likely(!tp->repair)) { 191 u32 seq = tcptw->tw_snd_nxt + 65535 + 2; 192 193 if (!seq) 194 seq = 1; 195 WRITE_ONCE(tp->write_seq, seq); 196 tp->rx_opt.ts_recent = READ_ONCE(tcptw->tw_ts_recent); 197 tp->rx_opt.ts_recent_stamp = ts_recent_stamp; 198 } 199 200 return 1; 201 } 202 203 return 0; 204 } 205 EXPORT_IPV6_MOD_GPL(tcp_twsk_unique); 206 207 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr, 208 int addr_len) 209 { 210 /* This check is replicated from tcp_v4_connect() and intended to 211 * prevent BPF program called below from accessing bytes that are out 212 * of the bound specified by user in addr_len. 213 */ 214 if (addr_len < sizeof(struct sockaddr_in)) 215 return -EINVAL; 216 217 sock_owned_by_me(sk); 218 219 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len); 220 } 221 222 /* This will initiate an outgoing connection. 
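 * Route lookup runs first, inet_hash_connect() then picks a source port and
 * hashes the socket, and tcp_connect() finally builds and sends the SYN
 * (unless TCP Fast Open defers it).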
*/ 223 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) 224 { 225 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; 226 struct inet_timewait_death_row *tcp_death_row; 227 struct inet_sock *inet = inet_sk(sk); 228 struct tcp_sock *tp = tcp_sk(sk); 229 struct ip_options_rcu *inet_opt; 230 struct net *net = sock_net(sk); 231 __be16 orig_sport, orig_dport; 232 __be32 daddr, nexthop; 233 struct flowi4 *fl4; 234 struct rtable *rt; 235 int err; 236 237 if (addr_len < sizeof(struct sockaddr_in)) 238 return -EINVAL; 239 240 if (usin->sin_family != AF_INET) 241 return -EAFNOSUPPORT; 242 243 nexthop = daddr = usin->sin_addr.s_addr; 244 inet_opt = rcu_dereference_protected(inet->inet_opt, 245 lockdep_sock_is_held(sk)); 246 if (inet_opt && inet_opt->opt.srr) { 247 if (!daddr) 248 return -EINVAL; 249 nexthop = inet_opt->opt.faddr; 250 } 251 252 orig_sport = inet->inet_sport; 253 orig_dport = usin->sin_port; 254 fl4 = &inet->cork.fl.u.ip4; 255 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr, 256 sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport, 257 orig_dport, sk); 258 if (IS_ERR(rt)) { 259 err = PTR_ERR(rt); 260 if (err == -ENETUNREACH) 261 IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); 262 return err; 263 } 264 265 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { 266 ip_rt_put(rt); 267 return -ENETUNREACH; 268 } 269 270 if (!inet_opt || !inet_opt->opt.srr) 271 daddr = fl4->daddr; 272 273 tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; 274 275 if (!inet->inet_saddr) { 276 err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET); 277 if (err) { 278 ip_rt_put(rt); 279 return err; 280 } 281 } else { 282 sk_rcv_saddr_set(sk, inet->inet_saddr); 283 } 284 285 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) { 286 /* Reset inherited state */ 287 tp->rx_opt.ts_recent = 0; 288 tp->rx_opt.ts_recent_stamp = 0; 289 if (likely(!tp->repair)) 290 WRITE_ONCE(tp->write_seq, 0); 291 } 292 293 inet->inet_dport = usin->sin_port; 294 sk_daddr_set(sk, daddr); 295 296 inet_csk(sk)->icsk_ext_hdr_len = 0; 297 if (inet_opt) 298 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 299 300 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT; 301 302 /* Socket identity is still unknown (sport may be zero). 303 * However we set state to SYN-SENT and not releasing socket 304 * lock select source port, enter ourselves into the hash tables and 305 * complete initialization after this. 306 */ 307 tcp_set_state(sk, TCP_SYN_SENT); 308 err = inet_hash_connect(tcp_death_row, sk); 309 if (err) 310 goto failure; 311 312 sk_set_txhash(sk); 313 314 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport, 315 inet->inet_sport, inet->inet_dport, sk); 316 if (IS_ERR(rt)) { 317 err = PTR_ERR(rt); 318 rt = NULL; 319 goto failure; 320 } 321 tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst); 322 /* OK, now commit destination to socket. 
*/ 323 sk->sk_gso_type = SKB_GSO_TCPV4; 324 sk_setup_caps(sk, &rt->dst); 325 rt = NULL; 326 327 if (likely(!tp->repair)) { 328 if (!tp->write_seq) 329 WRITE_ONCE(tp->write_seq, 330 secure_tcp_seq(inet->inet_saddr, 331 inet->inet_daddr, 332 inet->inet_sport, 333 usin->sin_port)); 334 WRITE_ONCE(tp->tsoffset, 335 secure_tcp_ts_off(net, inet->inet_saddr, 336 inet->inet_daddr)); 337 } 338 339 atomic_set(&inet->inet_id, get_random_u16()); 340 341 if (tcp_fastopen_defer_connect(sk, &err)) 342 return err; 343 if (err) 344 goto failure; 345 346 err = tcp_connect(sk); 347 348 if (err) 349 goto failure; 350 351 return 0; 352 353 failure: 354 /* 355 * This unhashes the socket and releases the local port, 356 * if necessary. 357 */ 358 tcp_set_state(sk, TCP_CLOSE); 359 inet_bhash2_reset_saddr(sk); 360 ip_rt_put(rt); 361 sk->sk_route_caps = 0; 362 inet->inet_dport = 0; 363 return err; 364 } 365 EXPORT_IPV6_MOD(tcp_v4_connect); 366 367 /* 368 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191. 369 * It can be called through tcp_release_cb() if socket was owned by user 370 * at the time tcp_v4_err() was called to handle ICMP message. 371 */ 372 void tcp_v4_mtu_reduced(struct sock *sk) 373 { 374 struct inet_sock *inet = inet_sk(sk); 375 struct dst_entry *dst; 376 u32 mtu; 377 378 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) 379 return; 380 mtu = READ_ONCE(tcp_sk(sk)->mtu_info); 381 dst = inet_csk_update_pmtu(sk, mtu); 382 if (!dst) 383 return; 384 385 /* Something is about to be wrong... Remember soft error 386 * for the case, if this connection will not able to recover. 387 */ 388 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst)) 389 WRITE_ONCE(sk->sk_err_soft, EMSGSIZE); 390 391 mtu = dst_mtu(dst); 392 393 if (inet->pmtudisc != IP_PMTUDISC_DONT && 394 ip_sk_accept_pmtu(sk) && 395 inet_csk(sk)->icsk_pmtu_cookie > mtu) { 396 tcp_sync_mss(sk, mtu); 397 398 /* Resend the TCP packet because it's 399 * clear that the old packet has been 400 * dropped. This is the new "fast" path mtu 401 * discovery. 402 */ 403 tcp_simple_retransmit(sk); 404 } /* else let the usual retransmit timer handle it */ 405 } 406 EXPORT_IPV6_MOD(tcp_v4_mtu_reduced); 407 408 static void do_redirect(struct sk_buff *skb, struct sock *sk) 409 { 410 struct dst_entry *dst = __sk_dst_check(sk, 0); 411 412 if (dst) 413 dst->ops->redirect(dst, sk, skb); 414 } 415 416 417 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */ 418 void tcp_req_err(struct sock *sk, u32 seq, bool abort) 419 { 420 struct request_sock *req = inet_reqsk(sk); 421 struct net *net = sock_net(sk); 422 423 /* ICMPs are not backlogged, hence we cannot get 424 * an established socket here. 425 */ 426 if (seq != tcp_rsk(req)->snt_isn) { 427 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); 428 } else if (abort) { 429 /* 430 * Still in SYN_RECV, just remove it silently. 431 * There is no good way to pass the error to the newly 432 * created socket, and POSIX does not want network 433 * errors returned from accept(). 
434 */ 435 inet_csk_reqsk_queue_drop(req->rsk_listener, req); 436 tcp_listendrop(req->rsk_listener); 437 } 438 reqsk_put(req); 439 } 440 EXPORT_IPV6_MOD(tcp_req_err); 441 442 /* TCP-LD (RFC 6069) logic */ 443 void tcp_ld_RTO_revert(struct sock *sk, u32 seq) 444 { 445 struct inet_connection_sock *icsk = inet_csk(sk); 446 struct tcp_sock *tp = tcp_sk(sk); 447 struct sk_buff *skb; 448 s32 remaining; 449 u32 delta_us; 450 451 if (sock_owned_by_user(sk)) 452 return; 453 454 if (seq != tp->snd_una || !icsk->icsk_retransmits || 455 !icsk->icsk_backoff) 456 return; 457 458 skb = tcp_rtx_queue_head(sk); 459 if (WARN_ON_ONCE(!skb)) 460 return; 461 462 icsk->icsk_backoff--; 463 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT; 464 icsk->icsk_rto = inet_csk_rto_backoff(icsk, tcp_rto_max(sk)); 465 466 tcp_mstamp_refresh(tp); 467 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb)); 468 remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us); 469 470 if (remaining > 0) { 471 tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, remaining, false); 472 } else { 473 /* RTO revert clocked out retransmission. 474 * Will retransmit now. 475 */ 476 tcp_retransmit_timer(sk); 477 } 478 } 479 EXPORT_IPV6_MOD(tcp_ld_RTO_revert); 480 481 /* 482 * This routine is called by the ICMP module when it gets some 483 * sort of error condition. If err < 0 then the socket should 484 * be closed and the error returned to the user. If err > 0 485 * it's just the icmp type << 8 | icmp code. After adjustment 486 * header points to the first 8 bytes of the tcp header. We need 487 * to find the appropriate port. 488 * 489 * The locking strategy used here is very "optimistic". When 490 * someone else accesses the socket the ICMP is just dropped 491 * and for some paths there is no check at all. 492 * A more general error queue to queue errors for later handling 493 * is probably better. 494 * 495 */ 496 497 int tcp_v4_err(struct sk_buff *skb, u32 info) 498 { 499 const struct iphdr *iph = (const struct iphdr *)skb->data; 500 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2)); 501 struct net *net = dev_net_rcu(skb->dev); 502 const int type = icmp_hdr(skb)->type; 503 const int code = icmp_hdr(skb)->code; 504 struct request_sock *fastopen; 505 struct tcp_sock *tp; 506 u32 seq, snd_una; 507 struct sock *sk; 508 int err; 509 510 sk = __inet_lookup_established(net, iph->daddr, th->dest, iph->saddr, 511 ntohs(th->source), inet_iif(skb), 0); 512 if (!sk) { 513 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); 514 return -ENOENT; 515 } 516 if (sk->sk_state == TCP_TIME_WAIT) { 517 /* To increase the counter of ignored icmps for TCP-AO */ 518 tcp_ao_ignore_icmp(sk, AF_INET, type, code); 519 inet_twsk_put(inet_twsk(sk)); 520 return 0; 521 } 522 seq = ntohl(th->seq); 523 if (sk->sk_state == TCP_NEW_SYN_RECV) { 524 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB || 525 type == ICMP_TIME_EXCEEDED || 526 (type == ICMP_DEST_UNREACH && 527 (code == ICMP_NET_UNREACH || 528 code == ICMP_HOST_UNREACH))); 529 return 0; 530 } 531 532 if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) { 533 sock_put(sk); 534 return 0; 535 } 536 537 bh_lock_sock(sk); 538 /* If too many ICMPs get dropped on busy 539 * servers this needs to be solved differently. 540 * We do take care of PMTU discovery (RFC1191) special case : 541 * we can receive locally generated ICMP messages while socket is held. 
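 * When the socket is owned, the PMTU update is deferred to tcp_release_cb()
 * via the TCP_MTU_REDUCED_DEFERRED flag rather than handled here directly.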
542 */ 543 if (sock_owned_by_user(sk)) { 544 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)) 545 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS); 546 } 547 if (sk->sk_state == TCP_CLOSE) 548 goto out; 549 550 if (static_branch_unlikely(&ip4_min_ttl)) { 551 /* min_ttl can be changed concurrently from do_ip_setsockopt() */ 552 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) { 553 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 554 goto out; 555 } 556 } 557 558 tp = tcp_sk(sk); 559 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ 560 fastopen = rcu_dereference(tp->fastopen_rsk); 561 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una; 562 if (sk->sk_state != TCP_LISTEN && 563 !between(seq, snd_una, tp->snd_nxt)) { 564 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); 565 goto out; 566 } 567 568 switch (type) { 569 case ICMP_REDIRECT: 570 if (!sock_owned_by_user(sk)) 571 do_redirect(skb, sk); 572 goto out; 573 case ICMP_SOURCE_QUENCH: 574 /* Just silently ignore these. */ 575 goto out; 576 case ICMP_PARAMETERPROB: 577 err = EPROTO; 578 break; 579 case ICMP_DEST_UNREACH: 580 if (code > NR_ICMP_UNREACH) 581 goto out; 582 583 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ 584 /* We are not interested in TCP_LISTEN and open_requests 585 * (SYN-ACKs send out by Linux are always <576bytes so 586 * they should go through unfragmented). 587 */ 588 if (sk->sk_state == TCP_LISTEN) 589 goto out; 590 591 WRITE_ONCE(tp->mtu_info, info); 592 if (!sock_owned_by_user(sk)) { 593 tcp_v4_mtu_reduced(sk); 594 } else { 595 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags)) 596 sock_hold(sk); 597 } 598 goto out; 599 } 600 601 err = icmp_err_convert[code].errno; 602 /* check if this ICMP message allows revert of backoff. 603 * (see RFC 6069) 604 */ 605 if (!fastopen && 606 (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH)) 607 tcp_ld_RTO_revert(sk, seq); 608 break; 609 case ICMP_TIME_EXCEEDED: 610 err = EHOSTUNREACH; 611 break; 612 default: 613 goto out; 614 } 615 616 switch (sk->sk_state) { 617 case TCP_SYN_SENT: 618 case TCP_SYN_RECV: 619 /* Only in fast or simultaneous open. If a fast open socket is 620 * already accepted it is treated as a connected one below. 621 */ 622 if (fastopen && !fastopen->sk) 623 break; 624 625 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th); 626 627 if (!sock_owned_by_user(sk)) 628 tcp_done_with_error(sk, err); 629 else 630 WRITE_ONCE(sk->sk_err_soft, err); 631 goto out; 632 } 633 634 /* If we've already connected we will keep trying 635 * until we time out, or the user gives up. 636 * 637 * rfc1122 4.2.3.9 allows to consider as hard errors 638 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too, 639 * but it is obsoleted by pmtu discovery). 640 * 641 * Note, that in modern internet, where routing is unreliable 642 * and in each dark corner broken firewalls sit, sending random 643 * errors ordered by their masters even this two messages finally lose 644 * their original sense (even Linux sends invalid PORT_UNREACHs) 645 * 646 * Now we are in compliance with RFCs. 
647 * --ANK (980905) 648 */ 649 650 if (!sock_owned_by_user(sk) && 651 inet_test_bit(RECVERR, sk)) { 652 WRITE_ONCE(sk->sk_err, err); 653 sk_error_report(sk); 654 } else { /* Only an error on timeout */ 655 WRITE_ONCE(sk->sk_err_soft, err); 656 } 657 658 out: 659 bh_unlock_sock(sk); 660 sock_put(sk); 661 return 0; 662 } 663 664 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr) 665 { 666 struct tcphdr *th = tcp_hdr(skb); 667 668 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0); 669 skb->csum_start = skb_transport_header(skb) - skb->head; 670 skb->csum_offset = offsetof(struct tcphdr, check); 671 } 672 673 /* This routine computes an IPv4 TCP checksum. */ 674 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb) 675 { 676 const struct inet_sock *inet = inet_sk(sk); 677 678 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr); 679 } 680 EXPORT_IPV6_MOD(tcp_v4_send_check); 681 682 #define REPLY_OPTIONS_LEN (MAX_TCP_OPTION_SPACE / sizeof(__be32)) 683 684 static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb, 685 const struct tcp_ao_hdr *aoh, 686 struct ip_reply_arg *arg, struct tcphdr *reply, 687 __be32 reply_options[REPLY_OPTIONS_LEN]) 688 { 689 #ifdef CONFIG_TCP_AO 690 int sdif = tcp_v4_sdif(skb); 691 int dif = inet_iif(skb); 692 int l3index = sdif ? dif : 0; 693 bool allocated_traffic_key; 694 struct tcp_ao_key *key; 695 char *traffic_key; 696 bool drop = true; 697 u32 ao_sne = 0; 698 u8 keyid; 699 700 rcu_read_lock(); 701 if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq), 702 &key, &traffic_key, &allocated_traffic_key, 703 &keyid, &ao_sne)) 704 goto out; 705 706 reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) | 707 (aoh->rnext_keyid << 8) | keyid); 708 arg->iov[0].iov_len += tcp_ao_len_aligned(key); 709 reply->doff = arg->iov[0].iov_len / 4; 710 711 if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1], 712 key, traffic_key, 713 (union tcp_ao_addr *)&ip_hdr(skb)->saddr, 714 (union tcp_ao_addr *)&ip_hdr(skb)->daddr, 715 reply, ao_sne)) 716 goto out; 717 drop = false; 718 out: 719 rcu_read_unlock(); 720 if (allocated_traffic_key) 721 kfree(traffic_key); 722 return drop; 723 #else 724 return true; 725 #endif 726 } 727 728 /* 729 * This routine will send an RST to the other tcp. 730 * 731 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.) 732 * for reset. 733 * Answer: if a packet caused RST, it is not for a socket 734 * existing in our system, if it is matched to a socket, 735 * it is just duplicate segment or bug in other side's TCP. 736 * So that we build reply only basing on parameters 737 * arrived with segment. 738 * Exception: precedence violation. We do not implement it in any case. 739 */ 740 741 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb, 742 enum sk_rst_reason reason) 743 { 744 const struct tcphdr *th = tcp_hdr(skb); 745 struct { 746 struct tcphdr th; 747 __be32 opt[REPLY_OPTIONS_LEN]; 748 } rep; 749 const __u8 *md5_hash_location = NULL; 750 const struct tcp_ao_hdr *aoh; 751 struct ip_reply_arg arg; 752 #ifdef CONFIG_TCP_MD5SIG 753 struct tcp_md5sig_key *key = NULL; 754 unsigned char newhash[16]; 755 struct sock *sk1 = NULL; 756 int genhash; 757 #endif 758 u64 transmit_time = 0; 759 struct sock *ctl_sk; 760 struct net *net; 761 u32 txhash = 0; 762 763 /* Never send a reset in response to a reset. */ 764 if (th->rst) 765 return; 766 767 /* If sk not NULL, it means we did a successful lookup and incoming 768 * route had to be correct. 
prequeue might have dropped our dst. 769 */ 770 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL) 771 return; 772 773 /* Swap the send and the receive. */ 774 memset(&rep, 0, sizeof(rep)); 775 rep.th.dest = th->source; 776 rep.th.source = th->dest; 777 rep.th.doff = sizeof(struct tcphdr) / 4; 778 rep.th.rst = 1; 779 780 if (th->ack) { 781 rep.th.seq = th->ack_seq; 782 } else { 783 rep.th.ack = 1; 784 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin + 785 skb->len - (th->doff << 2)); 786 } 787 788 memset(&arg, 0, sizeof(arg)); 789 arg.iov[0].iov_base = (unsigned char *)&rep; 790 arg.iov[0].iov_len = sizeof(rep.th); 791 792 net = sk ? sock_net(sk) : skb_dst_dev_net_rcu(skb); 793 794 /* Invalid TCP option size or twice included auth */ 795 if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh)) 796 return; 797 798 if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt)) 799 return; 800 801 #ifdef CONFIG_TCP_MD5SIG 802 rcu_read_lock(); 803 if (sk && sk_fullsock(sk)) { 804 const union tcp_md5_addr *addr; 805 int l3index; 806 807 /* sdif set, means packet ingressed via a device 808 * in an L3 domain and inet_iif is set to it. 809 */ 810 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; 811 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 812 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 813 } else if (md5_hash_location) { 814 const union tcp_md5_addr *addr; 815 int sdif = tcp_v4_sdif(skb); 816 int dif = inet_iif(skb); 817 int l3index; 818 819 /* 820 * active side is lost. Try to find listening socket through 821 * source port, and then find md5 key through listening socket. 822 * we are not loose security here: 823 * Incoming packet is checked with md5 hash with finding key, 824 * no RST generated if md5 hash doesn't match. 825 */ 826 sk1 = __inet_lookup_listener(net, NULL, 0, ip_hdr(skb)->saddr, 827 th->source, ip_hdr(skb)->daddr, 828 ntohs(th->source), dif, sdif); 829 /* don't send rst if it can't find key */ 830 if (!sk1) 831 goto out; 832 833 /* sdif set, means packet ingressed via a device 834 * in an L3 domain and dif is set to it. 835 */ 836 l3index = sdif ? dif : 0; 837 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 838 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET); 839 if (!key) 840 goto out; 841 842 843 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb); 844 if (genhash || memcmp(md5_hash_location, newhash, 16) != 0) 845 goto out; 846 847 } 848 849 if (key) { 850 rep.opt[0] = htonl((TCPOPT_NOP << 24) | 851 (TCPOPT_NOP << 16) | 852 (TCPOPT_MD5SIG << 8) | 853 TCPOLEN_MD5SIG); 854 /* Update length and the length the header thinks exists */ 855 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; 856 rep.th.doff = arg.iov[0].iov_len / 4; 857 858 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1], 859 key, ip_hdr(skb)->saddr, 860 ip_hdr(skb)->daddr, &rep.th); 861 } 862 #endif 863 /* Can't co-exist with TCPMD5, hence check rep.opt[0] */ 864 if (rep.opt[0] == 0) { 865 __be32 mrst = mptcp_reset_option(skb); 866 867 if (mrst) { 868 rep.opt[0] = mrst; 869 arg.iov[0].iov_len += sizeof(mrst); 870 rep.th.doff = arg.iov[0].iov_len / 4; 871 } 872 } 873 874 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 875 ip_hdr(skb)->saddr, /* XXX */ 876 arg.iov[0].iov_len, IPPROTO_TCP, 0); 877 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 878 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0; 879 880 /* When socket is gone, all binding information is lost. 881 * routing might fail in this case. 
No choice here, if we choose to force 882 * input interface, we will misroute in case of asymmetric route. 883 */ 884 if (sk) 885 arg.bound_dev_if = sk->sk_bound_dev_if; 886 887 trace_tcp_send_reset(sk, skb, reason); 888 889 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) != 890 offsetof(struct inet_timewait_sock, tw_bound_dev_if)); 891 892 /* ECN bits of TW reset are cleared */ 893 arg.tos = ip_hdr(skb)->tos & ~INET_ECN_MASK; 894 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); 895 local_bh_disable(); 896 local_lock_nested_bh(&ipv4_tcp_sk.bh_lock); 897 ctl_sk = this_cpu_read(ipv4_tcp_sk.sock); 898 899 sock_net_set(ctl_sk, net); 900 if (sk) { 901 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 902 inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark); 903 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? 904 inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority); 905 transmit_time = tcp_transmit_time(sk); 906 xfrm_sk_clone_policy(ctl_sk, sk); 907 txhash = (sk->sk_state == TCP_TIME_WAIT) ? 908 inet_twsk(sk)->tw_txhash : sk->sk_txhash; 909 } else { 910 ctl_sk->sk_mark = 0; 911 ctl_sk->sk_priority = 0; 912 } 913 ip_send_unicast_reply(ctl_sk, sk, 914 skb, &TCP_SKB_CB(skb)->header.h4.opt, 915 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 916 &arg, arg.iov[0].iov_len, 917 transmit_time, txhash); 918 919 xfrm_sk_free_policy(ctl_sk); 920 sock_net_set(ctl_sk, &init_net); 921 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); 922 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS); 923 local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock); 924 local_bh_enable(); 925 926 #ifdef CONFIG_TCP_MD5SIG 927 out: 928 rcu_read_unlock(); 929 #endif 930 } 931 932 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states 933 outside socket context is ugly, certainly. What can I do? 934 */ 935 936 static void tcp_v4_send_ack(const struct sock *sk, 937 struct sk_buff *skb, u32 seq, u32 ack, 938 u32 win, u32 tsval, u32 tsecr, int oif, 939 struct tcp_key *key, 940 int reply_flags, u8 tos, u32 txhash) 941 { 942 const struct tcphdr *th = tcp_hdr(skb); 943 struct { 944 struct tcphdr th; 945 __be32 opt[(MAX_TCP_OPTION_SPACE >> 2)]; 946 } rep; 947 struct net *net = sock_net(sk); 948 struct ip_reply_arg arg; 949 struct sock *ctl_sk; 950 u64 transmit_time; 951 952 memset(&rep.th, 0, sizeof(struct tcphdr)); 953 memset(&arg, 0, sizeof(arg)); 954 955 arg.iov[0].iov_base = (unsigned char *)&rep; 956 arg.iov[0].iov_len = sizeof(rep.th); 957 if (tsecr) { 958 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | 959 (TCPOPT_TIMESTAMP << 8) | 960 TCPOLEN_TIMESTAMP); 961 rep.opt[1] = htonl(tsval); 962 rep.opt[2] = htonl(tsecr); 963 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED; 964 } 965 966 /* Swap the send and the receive. */ 967 rep.th.dest = th->source; 968 rep.th.source = th->dest; 969 rep.th.doff = arg.iov[0].iov_len / 4; 970 rep.th.seq = htonl(seq); 971 rep.th.ack_seq = htonl(ack); 972 rep.th.ack = 1; 973 rep.th.window = htons(win); 974 975 #ifdef CONFIG_TCP_MD5SIG 976 if (tcp_key_is_md5(key)) { 977 int offset = (tsecr) ? 3 : 0; 978 979 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) | 980 (TCPOPT_NOP << 16) | 981 (TCPOPT_MD5SIG << 8) | 982 TCPOLEN_MD5SIG); 983 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; 984 rep.th.doff = arg.iov[0].iov_len/4; 985 986 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset], 987 key->md5_key, ip_hdr(skb)->saddr, 988 ip_hdr(skb)->daddr, &rep.th); 989 } 990 #endif 991 #ifdef CONFIG_TCP_AO 992 if (tcp_key_is_ao(key)) { 993 int offset = (tsecr) ? 
3 : 0; 994 995 rep.opt[offset++] = htonl((TCPOPT_AO << 24) | 996 (tcp_ao_len(key->ao_key) << 16) | 997 (key->ao_key->sndid << 8) | 998 key->rcv_next); 999 arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key); 1000 rep.th.doff = arg.iov[0].iov_len / 4; 1001 1002 tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset], 1003 key->ao_key, key->traffic_key, 1004 (union tcp_ao_addr *)&ip_hdr(skb)->saddr, 1005 (union tcp_ao_addr *)&ip_hdr(skb)->daddr, 1006 &rep.th, key->sne); 1007 } 1008 #endif 1009 arg.flags = reply_flags; 1010 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 1011 ip_hdr(skb)->saddr, /* XXX */ 1012 arg.iov[0].iov_len, IPPROTO_TCP, 0); 1013 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 1014 if (oif) 1015 arg.bound_dev_if = oif; 1016 arg.tos = tos; 1017 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL); 1018 local_bh_disable(); 1019 local_lock_nested_bh(&ipv4_tcp_sk.bh_lock); 1020 ctl_sk = this_cpu_read(ipv4_tcp_sk.sock); 1021 sock_net_set(ctl_sk, net); 1022 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 1023 inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark); 1024 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? 1025 inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority); 1026 transmit_time = tcp_transmit_time(sk); 1027 ip_send_unicast_reply(ctl_sk, sk, 1028 skb, &TCP_SKB_CB(skb)->header.h4.opt, 1029 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 1030 &arg, arg.iov[0].iov_len, 1031 transmit_time, txhash); 1032 1033 sock_net_set(ctl_sk, &init_net); 1034 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); 1035 local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock); 1036 local_bh_enable(); 1037 } 1038 1039 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb, 1040 enum tcp_tw_status tw_status) 1041 { 1042 struct inet_timewait_sock *tw = inet_twsk(sk); 1043 struct tcp_timewait_sock *tcptw = tcp_twsk(sk); 1044 struct tcp_key key = {}; 1045 u8 tos = tw->tw_tos; 1046 1047 /* Cleaning only ECN bits of TW ACKs of oow data or is paws_reject, 1048 * while not cleaning ECN bits of other TW ACKs to avoid these ACKs 1049 * being placed in a different service queues (Classic rather than L4S) 1050 */ 1051 if (tw_status == TCP_TW_ACK_OOW) 1052 tos &= ~INET_ECN_MASK; 1053 1054 #ifdef CONFIG_TCP_AO 1055 struct tcp_ao_info *ao_info; 1056 1057 if (static_branch_unlikely(&tcp_ao_needed.key)) { 1058 /* FIXME: the segment to-be-acked is not verified yet */ 1059 ao_info = rcu_dereference(tcptw->ao_info); 1060 if (ao_info) { 1061 const struct tcp_ao_hdr *aoh; 1062 1063 if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) { 1064 inet_twsk_put(tw); 1065 return; 1066 } 1067 1068 if (aoh) 1069 key.ao_key = tcp_ao_established_key(sk, ao_info, 1070 aoh->rnext_keyid, -1); 1071 } 1072 } 1073 if (key.ao_key) { 1074 struct tcp_ao_key *rnext_key; 1075 1076 key.traffic_key = snd_other_key(key.ao_key); 1077 key.sne = READ_ONCE(ao_info->snd_sne); 1078 rnext_key = READ_ONCE(ao_info->rnext_key); 1079 key.rcv_next = rnext_key->rcvid; 1080 key.type = TCP_KEY_AO; 1081 #else 1082 if (0) { 1083 #endif 1084 } else if (static_branch_tcp_md5()) { 1085 key.md5_key = tcp_twsk_md5_key(tcptw); 1086 if (key.md5_key) 1087 key.type = TCP_KEY_MD5; 1088 } 1089 1090 tcp_v4_send_ack(sk, skb, 1091 tcptw->tw_snd_nxt, READ_ONCE(tcptw->tw_rcv_nxt), 1092 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, 1093 tcp_tw_tsval(tcptw), 1094 READ_ONCE(tcptw->tw_ts_recent), 1095 tw->tw_bound_dev_if, &key, 1096 tw->tw_transparent ? 
IP_REPLY_ARG_NOSRCCHECK : 0, 1097 tos, 1098 tw->tw_txhash); 1099 1100 inet_twsk_put(tw); 1101 } 1102 1103 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, 1104 struct request_sock *req) 1105 { 1106 struct tcp_key key = {}; 1107 1108 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV 1109 * sk->sk_state == TCP_SYN_RECV -> for Fast Open. 1110 */ 1111 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 : 1112 tcp_sk(sk)->snd_nxt; 1113 1114 #ifdef CONFIG_TCP_AO 1115 if (static_branch_unlikely(&tcp_ao_needed.key) && 1116 tcp_rsk_used_ao(req)) { 1117 const union tcp_md5_addr *addr; 1118 const struct tcp_ao_hdr *aoh; 1119 int l3index; 1120 1121 /* Invalid TCP option size or twice included auth */ 1122 if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) 1123 return; 1124 if (!aoh) 1125 return; 1126 1127 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 1128 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; 1129 key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, 1130 aoh->rnext_keyid, -1); 1131 if (unlikely(!key.ao_key)) { 1132 /* Send ACK with any matching MKT for the peer */ 1133 key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1); 1134 /* Matching key disappeared (user removed the key?) 1135 * let the handshake timeout. 1136 */ 1137 if (!key.ao_key) { 1138 net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n", 1139 addr, 1140 ntohs(tcp_hdr(skb)->source), 1141 &ip_hdr(skb)->daddr, 1142 ntohs(tcp_hdr(skb)->dest)); 1143 return; 1144 } 1145 } 1146 key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC); 1147 if (!key.traffic_key) 1148 return; 1149 1150 key.type = TCP_KEY_AO; 1151 key.rcv_next = aoh->keyid; 1152 tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req); 1153 #else 1154 if (0) { 1155 #endif 1156 } else if (static_branch_tcp_md5()) { 1157 const union tcp_md5_addr *addr; 1158 int l3index; 1159 1160 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 1161 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; 1162 key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1163 if (key.md5_key) 1164 key.type = TCP_KEY_MD5; 1165 } 1166 1167 /* Cleaning ECN bits of TW ACKs of oow data or is paws_reject */ 1168 tcp_v4_send_ack(sk, skb, seq, 1169 tcp_rsk(req)->rcv_nxt, 1170 tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale, 1171 tcp_rsk_tsval(tcp_rsk(req)), 1172 req->ts_recent, 1173 0, &key, 1174 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0, 1175 ip_hdr(skb)->tos & ~INET_ECN_MASK, 1176 READ_ONCE(tcp_rsk(req)->txhash)); 1177 if (tcp_key_is_ao(&key)) 1178 kfree(key.traffic_key); 1179 } 1180 1181 /* 1182 * Send a SYN-ACK after having received a SYN. 1183 * This still operates on a request_sock only, not on a big 1184 * socket. 1185 */ 1186 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, 1187 struct flowi *fl, 1188 struct request_sock *req, 1189 struct tcp_fastopen_cookie *foc, 1190 enum tcp_synack_type synack_type, 1191 struct sk_buff *syn_skb) 1192 { 1193 struct inet_request_sock *ireq = inet_rsk(req); 1194 struct flowi4 fl4; 1195 int err = -1; 1196 struct sk_buff *skb; 1197 u8 tos; 1198 1199 /* First, grab a route. 
*/ 1200 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) 1201 return -1; 1202 1203 skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb); 1204 1205 if (skb) { 1206 tcp_rsk(req)->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK; 1207 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); 1208 1209 tos = READ_ONCE(inet_sk(sk)->tos); 1210 1211 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)) 1212 tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) | 1213 (tos & INET_ECN_MASK); 1214 1215 if (!INET_ECN_is_capable(tos) && 1216 tcp_bpf_ca_needs_ecn((struct sock *)req)) 1217 tos |= INET_ECN_ECT_0; 1218 1219 rcu_read_lock(); 1220 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, 1221 ireq->ir_rmt_addr, 1222 rcu_dereference(ireq->ireq_opt), 1223 tos); 1224 rcu_read_unlock(); 1225 err = net_xmit_eval(err); 1226 } 1227 1228 return err; 1229 } 1230 1231 /* 1232 * IPv4 request_sock destructor. 1233 */ 1234 static void tcp_v4_reqsk_destructor(struct request_sock *req) 1235 { 1236 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1)); 1237 } 1238 1239 #ifdef CONFIG_TCP_MD5SIG 1240 /* 1241 * RFC2385 MD5 checksumming requires a mapping of 1242 * IP address->MD5 Key. 1243 * We need to maintain these in the sk structure. 1244 */ 1245 1246 DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ); 1247 EXPORT_IPV6_MOD(tcp_md5_needed); 1248 1249 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new) 1250 { 1251 if (!old) 1252 return true; 1253 1254 /* l3index always overrides non-l3index */ 1255 if (old->l3index && new->l3index == 0) 1256 return false; 1257 if (old->l3index == 0 && new->l3index) 1258 return true; 1259 1260 return old->prefixlen < new->prefixlen; 1261 } 1262 1263 /* Find the Key structure for an address. 
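 * A key matches when its masked address covers the looked-up address; among
 * matches, a key bound to an L3 master device wins over an unbound one, then
 * the longest prefix wins (see better_md5_match() above).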
*/ 1264 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index, 1265 const union tcp_md5_addr *addr, 1266 int family, bool any_l3index) 1267 { 1268 const struct tcp_sock *tp = tcp_sk(sk); 1269 struct tcp_md5sig_key *key; 1270 const struct tcp_md5sig_info *md5sig; 1271 __be32 mask; 1272 struct tcp_md5sig_key *best_match = NULL; 1273 bool match; 1274 1275 /* caller either holds rcu_read_lock() or socket lock */ 1276 md5sig = rcu_dereference_check(tp->md5sig_info, 1277 lockdep_sock_is_held(sk)); 1278 if (!md5sig) 1279 return NULL; 1280 1281 hlist_for_each_entry_rcu(key, &md5sig->head, node, 1282 lockdep_sock_is_held(sk)) { 1283 if (key->family != family) 1284 continue; 1285 if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX && 1286 key->l3index != l3index) 1287 continue; 1288 if (family == AF_INET) { 1289 mask = inet_make_mask(key->prefixlen); 1290 match = (key->addr.a4.s_addr & mask) == 1291 (addr->a4.s_addr & mask); 1292 #if IS_ENABLED(CONFIG_IPV6) 1293 } else if (family == AF_INET6) { 1294 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6, 1295 key->prefixlen); 1296 #endif 1297 } else { 1298 match = false; 1299 } 1300 1301 if (match && better_md5_match(best_match, key)) 1302 best_match = key; 1303 } 1304 return best_match; 1305 } 1306 EXPORT_IPV6_MOD(__tcp_md5_do_lookup); 1307 1308 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk, 1309 const union tcp_md5_addr *addr, 1310 int family, u8 prefixlen, 1311 int l3index, u8 flags) 1312 { 1313 const struct tcp_sock *tp = tcp_sk(sk); 1314 struct tcp_md5sig_key *key; 1315 unsigned int size = sizeof(struct in_addr); 1316 const struct tcp_md5sig_info *md5sig; 1317 1318 /* caller either holds rcu_read_lock() or socket lock */ 1319 md5sig = rcu_dereference_check(tp->md5sig_info, 1320 lockdep_sock_is_held(sk)); 1321 if (!md5sig) 1322 return NULL; 1323 #if IS_ENABLED(CONFIG_IPV6) 1324 if (family == AF_INET6) 1325 size = sizeof(struct in6_addr); 1326 #endif 1327 hlist_for_each_entry_rcu(key, &md5sig->head, node, 1328 lockdep_sock_is_held(sk)) { 1329 if (key->family != family) 1330 continue; 1331 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX)) 1332 continue; 1333 if (key->l3index != l3index) 1334 continue; 1335 if (!memcmp(&key->addr, addr, size) && 1336 key->prefixlen == prefixlen) 1337 return key; 1338 } 1339 return NULL; 1340 } 1341 1342 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, 1343 const struct sock *addr_sk) 1344 { 1345 const union tcp_md5_addr *addr; 1346 int l3index; 1347 1348 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), 1349 addr_sk->sk_bound_dev_if); 1350 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr; 1351 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1352 } 1353 EXPORT_IPV6_MOD(tcp_v4_md5_lookup); 1354 1355 static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp) 1356 { 1357 struct tcp_sock *tp = tcp_sk(sk); 1358 struct tcp_md5sig_info *md5sig; 1359 1360 md5sig = kmalloc(sizeof(*md5sig), gfp); 1361 if (!md5sig) 1362 return -ENOMEM; 1363 1364 sk_gso_disable(sk); 1365 INIT_HLIST_HEAD(&md5sig->head); 1366 rcu_assign_pointer(tp->md5sig_info, md5sig); 1367 return 0; 1368 } 1369 1370 /* This can be called on a newly created socket, from other files */ 1371 static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, 1372 int family, u8 prefixlen, int l3index, u8 flags, 1373 const u8 *newkey, u8 newkeylen, gfp_t gfp) 1374 { 1375 /* Add Key to the list */ 1376 struct tcp_md5sig_key *key; 1377 struct 
tcp_sock *tp = tcp_sk(sk); 1378 struct tcp_md5sig_info *md5sig; 1379 1380 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags); 1381 if (key) { 1382 /* Pre-existing entry - just update that one. 1383 * Note that the key might be used concurrently. 1384 * data_race() is telling kcsan that we do not care of 1385 * key mismatches, since changing MD5 key on live flows 1386 * can lead to packet drops. 1387 */ 1388 data_race(memcpy(key->key, newkey, newkeylen)); 1389 1390 /* Pairs with READ_ONCE() in tcp_md5_hash_key(). 1391 * Also note that a reader could catch new key->keylen value 1392 * but old key->key[], this is the reason we use __GFP_ZERO 1393 * at sock_kmalloc() time below these lines. 1394 */ 1395 WRITE_ONCE(key->keylen, newkeylen); 1396 1397 return 0; 1398 } 1399 1400 md5sig = rcu_dereference_protected(tp->md5sig_info, 1401 lockdep_sock_is_held(sk)); 1402 1403 key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO); 1404 if (!key) 1405 return -ENOMEM; 1406 1407 memcpy(key->key, newkey, newkeylen); 1408 key->keylen = newkeylen; 1409 key->family = family; 1410 key->prefixlen = prefixlen; 1411 key->l3index = l3index; 1412 key->flags = flags; 1413 memcpy(&key->addr, addr, 1414 (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) : 1415 sizeof(struct in_addr)); 1416 hlist_add_head_rcu(&key->node, &md5sig->head); 1417 return 0; 1418 } 1419 1420 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, 1421 int family, u8 prefixlen, int l3index, u8 flags, 1422 const u8 *newkey, u8 newkeylen) 1423 { 1424 struct tcp_sock *tp = tcp_sk(sk); 1425 1426 if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) { 1427 if (tcp_md5_alloc_sigpool()) 1428 return -ENOMEM; 1429 1430 if (tcp_md5sig_info_add(sk, GFP_KERNEL)) { 1431 tcp_md5_release_sigpool(); 1432 return -ENOMEM; 1433 } 1434 1435 if (!static_branch_inc(&tcp_md5_needed.key)) { 1436 struct tcp_md5sig_info *md5sig; 1437 1438 md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk)); 1439 rcu_assign_pointer(tp->md5sig_info, NULL); 1440 kfree_rcu(md5sig, rcu); 1441 tcp_md5_release_sigpool(); 1442 return -EUSERS; 1443 } 1444 } 1445 1446 return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags, 1447 newkey, newkeylen, GFP_KERNEL); 1448 } 1449 EXPORT_IPV6_MOD(tcp_md5_do_add); 1450 1451 int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr, 1452 int family, u8 prefixlen, int l3index, 1453 struct tcp_md5sig_key *key) 1454 { 1455 struct tcp_sock *tp = tcp_sk(sk); 1456 1457 if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) { 1458 tcp_md5_add_sigpool(); 1459 1460 if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC))) { 1461 tcp_md5_release_sigpool(); 1462 return -ENOMEM; 1463 } 1464 1465 if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) { 1466 struct tcp_md5sig_info *md5sig; 1467 1468 md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk)); 1469 net_warn_ratelimited("Too many TCP-MD5 keys in the system\n"); 1470 rcu_assign_pointer(tp->md5sig_info, NULL); 1471 kfree_rcu(md5sig, rcu); 1472 tcp_md5_release_sigpool(); 1473 return -EUSERS; 1474 } 1475 } 1476 1477 return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, 1478 key->flags, key->key, key->keylen, 1479 sk_gfp_mask(sk, GFP_ATOMIC)); 1480 } 1481 EXPORT_IPV6_MOD(tcp_md5_key_copy); 1482 1483 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family, 1484 u8 prefixlen, int l3index, u8 flags) 1485 { 
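/* Unlink under the socket lock; the key memory is freed only after an RCU
 * grace period (kfree_rcu) so lockless readers in the receive path are safe.
 */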
1486 struct tcp_md5sig_key *key; 1487 1488 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags); 1489 if (!key) 1490 return -ENOENT; 1491 hlist_del_rcu(&key->node); 1492 atomic_sub(sizeof(*key), &sk->sk_omem_alloc); 1493 kfree_rcu(key, rcu); 1494 return 0; 1495 } 1496 EXPORT_IPV6_MOD(tcp_md5_do_del); 1497 1498 void tcp_clear_md5_list(struct sock *sk) 1499 { 1500 struct tcp_sock *tp = tcp_sk(sk); 1501 struct tcp_md5sig_key *key; 1502 struct hlist_node *n; 1503 struct tcp_md5sig_info *md5sig; 1504 1505 md5sig = rcu_dereference_protected(tp->md5sig_info, 1); 1506 1507 hlist_for_each_entry_safe(key, n, &md5sig->head, node) { 1508 hlist_del(&key->node); 1509 atomic_sub(sizeof(*key), &sk->sk_omem_alloc); 1510 kfree(key); 1511 } 1512 } 1513 1514 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, 1515 sockptr_t optval, int optlen) 1516 { 1517 struct tcp_md5sig cmd; 1518 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr; 1519 const union tcp_md5_addr *addr; 1520 u8 prefixlen = 32; 1521 int l3index = 0; 1522 bool l3flag; 1523 u8 flags; 1524 1525 if (optlen < sizeof(cmd)) 1526 return -EINVAL; 1527 1528 if (copy_from_sockptr(&cmd, optval, sizeof(cmd))) 1529 return -EFAULT; 1530 1531 if (sin->sin_family != AF_INET) 1532 return -EINVAL; 1533 1534 flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX; 1535 l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX; 1536 1537 if (optname == TCP_MD5SIG_EXT && 1538 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) { 1539 prefixlen = cmd.tcpm_prefixlen; 1540 if (prefixlen > 32) 1541 return -EINVAL; 1542 } 1543 1544 if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex && 1545 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) { 1546 struct net_device *dev; 1547 1548 rcu_read_lock(); 1549 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex); 1550 if (dev && netif_is_l3_master(dev)) 1551 l3index = dev->ifindex; 1552 1553 rcu_read_unlock(); 1554 1555 /* ok to reference set/not set outside of rcu; 1556 * right now device MUST be an L3 master 1557 */ 1558 if (!dev || !l3index) 1559 return -EINVAL; 1560 } 1561 1562 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr; 1563 1564 if (!cmd.tcpm_keylen) 1565 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags); 1566 1567 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) 1568 return -EINVAL; 1569 1570 /* Don't allow keys for peers that have a matching TCP-AO key. 1571 * See the comment in tcp_ao_add_cmd() 1572 */ 1573 if (tcp_ao_required(sk, addr, AF_INET, l3flag ? 
l3index : -1, false)) 1574 return -EKEYREJECTED; 1575 1576 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags, 1577 cmd.tcpm_key, cmd.tcpm_keylen); 1578 } 1579 1580 static int tcp_v4_md5_hash_headers(struct tcp_sigpool *hp, 1581 __be32 daddr, __be32 saddr, 1582 const struct tcphdr *th, int nbytes) 1583 { 1584 struct tcp4_pseudohdr *bp; 1585 struct scatterlist sg; 1586 struct tcphdr *_th; 1587 1588 bp = hp->scratch; 1589 bp->saddr = saddr; 1590 bp->daddr = daddr; 1591 bp->pad = 0; 1592 bp->protocol = IPPROTO_TCP; 1593 bp->len = cpu_to_be16(nbytes); 1594 1595 _th = (struct tcphdr *)(bp + 1); 1596 memcpy(_th, th, sizeof(*th)); 1597 _th->check = 0; 1598 1599 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th)); 1600 ahash_request_set_crypt(hp->req, &sg, NULL, 1601 sizeof(*bp) + sizeof(*th)); 1602 return crypto_ahash_update(hp->req); 1603 } 1604 1605 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, 1606 __be32 daddr, __be32 saddr, const struct tcphdr *th) 1607 { 1608 struct tcp_sigpool hp; 1609 1610 if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp)) 1611 goto clear_hash_nostart; 1612 1613 if (crypto_ahash_init(hp.req)) 1614 goto clear_hash; 1615 if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, th->doff << 2)) 1616 goto clear_hash; 1617 if (tcp_md5_hash_key(&hp, key)) 1618 goto clear_hash; 1619 ahash_request_set_crypt(hp.req, NULL, md5_hash, 0); 1620 if (crypto_ahash_final(hp.req)) 1621 goto clear_hash; 1622 1623 tcp_sigpool_end(&hp); 1624 return 0; 1625 1626 clear_hash: 1627 tcp_sigpool_end(&hp); 1628 clear_hash_nostart: 1629 memset(md5_hash, 0, 16); 1630 return 1; 1631 } 1632 1633 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, 1634 const struct sock *sk, 1635 const struct sk_buff *skb) 1636 { 1637 const struct tcphdr *th = tcp_hdr(skb); 1638 struct tcp_sigpool hp; 1639 __be32 saddr, daddr; 1640 1641 if (sk) { /* valid for establish/request sockets */ 1642 saddr = sk->sk_rcv_saddr; 1643 daddr = sk->sk_daddr; 1644 } else { 1645 const struct iphdr *iph = ip_hdr(skb); 1646 saddr = iph->saddr; 1647 daddr = iph->daddr; 1648 } 1649 1650 if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp)) 1651 goto clear_hash_nostart; 1652 1653 if (crypto_ahash_init(hp.req)) 1654 goto clear_hash; 1655 1656 if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, skb->len)) 1657 goto clear_hash; 1658 if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2)) 1659 goto clear_hash; 1660 if (tcp_md5_hash_key(&hp, key)) 1661 goto clear_hash; 1662 ahash_request_set_crypt(hp.req, NULL, md5_hash, 0); 1663 if (crypto_ahash_final(hp.req)) 1664 goto clear_hash; 1665 1666 tcp_sigpool_end(&hp); 1667 return 0; 1668 1669 clear_hash: 1670 tcp_sigpool_end(&hp); 1671 clear_hash_nostart: 1672 memset(md5_hash, 0, 16); 1673 return 1; 1674 } 1675 EXPORT_IPV6_MOD(tcp_v4_md5_hash_skb); 1676 1677 #endif 1678 1679 static void tcp_v4_init_req(struct request_sock *req, 1680 const struct sock *sk_listener, 1681 struct sk_buff *skb) 1682 { 1683 struct inet_request_sock *ireq = inet_rsk(req); 1684 struct net *net = sock_net(sk_listener); 1685 1686 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); 1687 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); 1688 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb)); 1689 } 1690 1691 static struct dst_entry *tcp_v4_route_req(const struct sock *sk, 1692 struct sk_buff *skb, 1693 struct flowi *fl, 1694 struct request_sock *req, 1695 u32 tw_isn) 1696 { 1697 tcp_v4_init_req(req, sk, skb); 1698 1699 if (security_inet_conn_request(sk, skb, 
req)) 1700 return NULL; 1701 1702 return inet_csk_route_req(sk, &fl->u.ip4, req); 1703 } 1704 1705 struct request_sock_ops tcp_request_sock_ops __read_mostly = { 1706 .family = PF_INET, 1707 .obj_size = sizeof(struct tcp_request_sock), 1708 .send_ack = tcp_v4_reqsk_send_ack, 1709 .destructor = tcp_v4_reqsk_destructor, 1710 .send_reset = tcp_v4_send_reset, 1711 .syn_ack_timeout = tcp_syn_ack_timeout, 1712 }; 1713 1714 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { 1715 .mss_clamp = TCP_MSS_DEFAULT, 1716 #ifdef CONFIG_TCP_MD5SIG 1717 .req_md5_lookup = tcp_v4_md5_lookup, 1718 .calc_md5_hash = tcp_v4_md5_hash_skb, 1719 #endif 1720 #ifdef CONFIG_TCP_AO 1721 .ao_lookup = tcp_v4_ao_lookup_rsk, 1722 .ao_calc_key = tcp_v4_ao_calc_key_rsk, 1723 .ao_synack_hash = tcp_v4_ao_synack_hash, 1724 #endif 1725 #ifdef CONFIG_SYN_COOKIES 1726 .cookie_init_seq = cookie_v4_init_sequence, 1727 #endif 1728 .route_req = tcp_v4_route_req, 1729 .init_seq = tcp_v4_init_seq, 1730 .init_ts_off = tcp_v4_init_ts_off, 1731 .send_synack = tcp_v4_send_synack, 1732 }; 1733 1734 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) 1735 { 1736 /* Never answer to SYNs send to broadcast or multicast */ 1737 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) 1738 goto drop; 1739 1740 return tcp_conn_request(&tcp_request_sock_ops, 1741 &tcp_request_sock_ipv4_ops, sk, skb); 1742 1743 drop: 1744 tcp_listendrop(sk); 1745 return 0; 1746 } 1747 EXPORT_IPV6_MOD(tcp_v4_conn_request); 1748 1749 1750 /* 1751 * The three way handshake has completed - we got a valid synack - 1752 * now create the new socket. 1753 */ 1754 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, 1755 struct request_sock *req, 1756 struct dst_entry *dst, 1757 struct request_sock *req_unhash, 1758 bool *own_req) 1759 { 1760 struct inet_request_sock *ireq; 1761 bool found_dup_sk = false; 1762 struct inet_sock *newinet; 1763 struct tcp_sock *newtp; 1764 struct sock *newsk; 1765 #ifdef CONFIG_TCP_MD5SIG 1766 const union tcp_md5_addr *addr; 1767 struct tcp_md5sig_key *key; 1768 int l3index; 1769 #endif 1770 struct ip_options_rcu *inet_opt; 1771 1772 if (sk_acceptq_is_full(sk)) 1773 goto exit_overflow; 1774 1775 newsk = tcp_create_openreq_child(sk, req, skb); 1776 if (!newsk) 1777 goto exit_nonewsk; 1778 1779 newsk->sk_gso_type = SKB_GSO_TCPV4; 1780 inet_sk_rx_dst_set(newsk, skb); 1781 1782 newtp = tcp_sk(newsk); 1783 newinet = inet_sk(newsk); 1784 ireq = inet_rsk(req); 1785 inet_opt = rcu_dereference(ireq->ireq_opt); 1786 RCU_INIT_POINTER(newinet->inet_opt, inet_opt); 1787 newinet->mc_index = inet_iif(skb); 1788 newinet->mc_ttl = ip_hdr(skb)->ttl; 1789 newinet->rcv_tos = ip_hdr(skb)->tos; 1790 inet_csk(newsk)->icsk_ext_hdr_len = 0; 1791 if (inet_opt) 1792 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 1793 atomic_set(&newinet->inet_id, get_random_u16()); 1794 1795 /* Set ToS of the new socket based upon the value of incoming SYN. 1796 * ECT bits are set later in tcp_init_transfer(). 
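 * With sysctl_tcp_reflect_tos enabled, the DSCP bits of the incoming SYN are
 * copied to the child socket just below.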
1797 */ 1798 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)) 1799 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK; 1800 1801 if (!dst) { 1802 dst = inet_csk_route_child_sock(sk, newsk, req); 1803 if (!dst) 1804 goto put_and_exit; 1805 } else { 1806 /* syncookie case : see end of cookie_v4_check() */ 1807 } 1808 sk_setup_caps(newsk, dst); 1809 1810 tcp_ca_openreq_child(newsk, dst); 1811 1812 tcp_sync_mss(newsk, dst_mtu(dst)); 1813 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst)); 1814 1815 tcp_initialize_rcv_mss(newsk); 1816 1817 #ifdef CONFIG_TCP_MD5SIG 1818 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif); 1819 /* Copy over the MD5 key from the original socket */ 1820 addr = (union tcp_md5_addr *)&newinet->inet_daddr; 1821 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1822 if (key && !tcp_rsk_used_ao(req)) { 1823 if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key)) 1824 goto put_and_exit; 1825 sk_gso_disable(newsk); 1826 } 1827 #endif 1828 #ifdef CONFIG_TCP_AO 1829 if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET)) 1830 goto put_and_exit; /* OOM, release back memory */ 1831 #endif 1832 1833 if (__inet_inherit_port(sk, newsk) < 0) 1834 goto put_and_exit; 1835 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), 1836 &found_dup_sk); 1837 if (likely(*own_req)) { 1838 tcp_move_syn(newtp, req); 1839 ireq->ireq_opt = NULL; 1840 } else { 1841 newinet->inet_opt = NULL; 1842 1843 if (!req_unhash && found_dup_sk) { 1844 /* This code path should only be executed in the 1845 * syncookie case only 1846 */ 1847 bh_unlock_sock(newsk); 1848 sock_put(newsk); 1849 newsk = NULL; 1850 } 1851 } 1852 return newsk; 1853 1854 exit_overflow: 1855 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); 1856 exit_nonewsk: 1857 dst_release(dst); 1858 exit: 1859 tcp_listendrop(sk); 1860 return NULL; 1861 put_and_exit: 1862 newinet->inet_opt = NULL; 1863 inet_csk_prepare_forced_close(newsk); 1864 tcp_done(newsk); 1865 goto exit; 1866 } 1867 EXPORT_IPV6_MOD(tcp_v4_syn_recv_sock); 1868 1869 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb) 1870 { 1871 #ifdef CONFIG_SYN_COOKIES 1872 const struct tcphdr *th = tcp_hdr(skb); 1873 1874 if (!th->syn) 1875 sk = cookie_v4_check(sk, skb); 1876 #endif 1877 return sk; 1878 } 1879 1880 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph, 1881 struct tcphdr *th, u32 *cookie) 1882 { 1883 u16 mss = 0; 1884 #ifdef CONFIG_SYN_COOKIES 1885 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops, 1886 &tcp_request_sock_ipv4_ops, sk, th); 1887 if (mss) { 1888 *cookie = __cookie_v4_init_sequence(iph, th, &mss); 1889 tcp_synq_overflow(sk); 1890 } 1891 #endif 1892 return mss; 1893 } 1894 1895 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, 1896 u32)); 1897 /* The socket must have it's spinlock held when we get 1898 * here, unless it is a TCP_LISTEN socket. 1899 * 1900 * We have a potential double-lock case here, so even when 1901 * doing backlog processing we use the BH locking scheme. 1902 * This is because we cannot sleep with the original spinlock 1903 * held. 
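 * ESTABLISHED sockets take the fast path into tcp_rcv_established(); other
 * states fall through to tcp_rcv_state_process() after the LISTEN/syncookie
 * handling below.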
1904 */ 1905 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) 1906 { 1907 enum skb_drop_reason reason; 1908 struct sock *rsk; 1909 1910 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ 1911 struct dst_entry *dst; 1912 1913 dst = rcu_dereference_protected(sk->sk_rx_dst, 1914 lockdep_sock_is_held(sk)); 1915 1916 sock_rps_save_rxhash(sk, skb); 1917 sk_mark_napi_id(sk, skb); 1918 if (dst) { 1919 if (sk->sk_rx_dst_ifindex != skb->skb_iif || 1920 !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check, 1921 dst, 0)) { 1922 RCU_INIT_POINTER(sk->sk_rx_dst, NULL); 1923 dst_release(dst); 1924 } 1925 } 1926 tcp_rcv_established(sk, skb); 1927 return 0; 1928 } 1929 1930 if (tcp_checksum_complete(skb)) 1931 goto csum_err; 1932 1933 if (sk->sk_state == TCP_LISTEN) { 1934 struct sock *nsk = tcp_v4_cookie_check(sk, skb); 1935 1936 if (!nsk) 1937 return 0; 1938 if (nsk != sk) { 1939 reason = tcp_child_process(sk, nsk, skb); 1940 if (reason) { 1941 rsk = nsk; 1942 goto reset; 1943 } 1944 return 0; 1945 } 1946 } else 1947 sock_rps_save_rxhash(sk, skb); 1948 1949 reason = tcp_rcv_state_process(sk, skb); 1950 if (reason) { 1951 rsk = sk; 1952 goto reset; 1953 } 1954 return 0; 1955 1956 reset: 1957 tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason)); 1958 discard: 1959 sk_skb_reason_drop(sk, skb, reason); 1960 /* Be careful here. If this function gets more complicated and 1961 * gcc suffers from register pressure on the x86, sk (in %ebx) 1962 * might be destroyed here. This current version compiles correctly, 1963 * but you have been warned. 1964 */ 1965 return 0; 1966 1967 csum_err: 1968 reason = SKB_DROP_REASON_TCP_CSUM; 1969 trace_tcp_bad_csum(skb); 1970 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 1971 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 1972 goto discard; 1973 } 1974 EXPORT_SYMBOL(tcp_v4_do_rcv); 1975 1976 int tcp_v4_early_demux(struct sk_buff *skb) 1977 { 1978 struct net *net = dev_net_rcu(skb->dev); 1979 const struct iphdr *iph; 1980 const struct tcphdr *th; 1981 struct sock *sk; 1982 1983 if (skb->pkt_type != PACKET_HOST) 1984 return 0; 1985 1986 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) 1987 return 0; 1988 1989 iph = ip_hdr(skb); 1990 th = tcp_hdr(skb); 1991 1992 if (th->doff < sizeof(struct tcphdr) / 4) 1993 return 0; 1994 1995 sk = __inet_lookup_established(net, iph->saddr, th->source, 1996 iph->daddr, ntohs(th->dest), 1997 skb->skb_iif, inet_sdif(skb)); 1998 if (sk) { 1999 skb->sk = sk; 2000 skb->destructor = sock_edemux; 2001 if (sk_fullsock(sk)) { 2002 struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst); 2003 2004 if (dst) 2005 dst = dst_check(dst, 0); 2006 if (dst && 2007 sk->sk_rx_dst_ifindex == skb->skb_iif) 2008 skb_dst_set_noref(skb, dst); 2009 } 2010 } 2011 return 0; 2012 } 2013 2014 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, 2015 enum skb_drop_reason *reason) 2016 { 2017 u32 tail_gso_size, tail_gso_segs; 2018 struct skb_shared_info *shinfo; 2019 const struct tcphdr *th; 2020 struct tcphdr *thtail; 2021 struct sk_buff *tail; 2022 unsigned int hdrlen; 2023 bool fragstolen; 2024 u32 gso_segs; 2025 u32 gso_size; 2026 u64 limit; 2027 int delta; 2028 int err; 2029 2030 /* In case all data was pulled from skb frags (in __pskb_pull_tail()), 2031 * we can fix skb->truesize to its real value to avoid future drops. 2032 * This is valid because skb is not yet charged to the socket. 2033 * It has been noticed pure SACK packets were sometimes dropped 2034 * (if cooked by drivers without copybreak feature). 
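 * skb_condense() therefore trims skb->truesize before the skb is
 * charged against the backlog limit computed further below.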
2035 */ 2036 skb_condense(skb); 2037 2038 tcp_cleanup_skb(skb); 2039 2040 if (unlikely(tcp_checksum_complete(skb))) { 2041 bh_unlock_sock(sk); 2042 trace_tcp_bad_csum(skb); 2043 *reason = SKB_DROP_REASON_TCP_CSUM; 2044 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 2045 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 2046 return true; 2047 } 2048 2049 /* Attempt coalescing to last skb in backlog, even if we are 2050 * above the limits. 2051 * This is okay because skb capacity is limited to MAX_SKB_FRAGS. 2052 */ 2053 th = (const struct tcphdr *)skb->data; 2054 hdrlen = th->doff * 4; 2055 2056 tail = sk->sk_backlog.tail; 2057 if (!tail) 2058 goto no_coalesce; 2059 thtail = (struct tcphdr *)tail->data; 2060 2061 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq || 2062 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield || 2063 ((TCP_SKB_CB(tail)->tcp_flags | 2064 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) || 2065 !((TCP_SKB_CB(tail)->tcp_flags & 2066 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) || 2067 ((TCP_SKB_CB(tail)->tcp_flags ^ 2068 TCP_SKB_CB(skb)->tcp_flags) & 2069 (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE)) || 2070 !tcp_skb_can_collapse_rx(tail, skb) || 2071 thtail->doff != th->doff || 2072 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th))) 2073 goto no_coalesce; 2074 2075 __skb_pull(skb, hdrlen); 2076 2077 shinfo = skb_shinfo(skb); 2078 gso_size = shinfo->gso_size ?: skb->len; 2079 gso_segs = shinfo->gso_segs ?: 1; 2080 2081 shinfo = skb_shinfo(tail); 2082 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen); 2083 tail_gso_segs = shinfo->gso_segs ?: 1; 2084 2085 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) { 2086 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq; 2087 2088 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) { 2089 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq; 2090 thtail->window = th->window; 2091 } 2092 2093 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and 2094 * thtail->fin, so that the fast path in tcp_rcv_established() 2095 * is not entered if we append a packet with a FIN. 2096 * SYN, RST, URG are not present. 2097 * ACK is set on both packets. 2098 * PSH : we do not really care in TCP stack, 2099 * at least for 'GRO' packets. 2100 */ 2101 thtail->fin |= th->fin; 2102 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; 2103 2104 if (TCP_SKB_CB(skb)->has_rxtstamp) { 2105 TCP_SKB_CB(tail)->has_rxtstamp = true; 2106 tail->tstamp = skb->tstamp; 2107 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp; 2108 } 2109 2110 /* Not as strict as GRO. We only need to carry mss max value */ 2111 shinfo->gso_size = max(gso_size, tail_gso_size); 2112 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF); 2113 2114 sk->sk_backlog.len += delta; 2115 __NET_INC_STATS(sock_net(sk), 2116 LINUX_MIB_TCPBACKLOGCOALESCE); 2117 kfree_skb_partial(skb, fragstolen); 2118 return false; 2119 } 2120 __skb_push(skb, hdrlen); 2121 2122 no_coalesce: 2123 /* sk->sk_backlog.len is reset only at the end of __release_sock(). 2124 * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach 2125 * sk_rcvbuf in normal conditions. 2126 */ 2127 limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1; 2128 2129 limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1; 2130 2131 /* Only socket owner can try to collapse/prune rx queues 2132 * to reduce memory overhead, so add a little headroom here. 2133 * Few sockets backlog are possibly concurrently non empty. 
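 * The resulting cap is roughly 2 * sk_rcvbuf + sk_sndbuf / 2 + 64KB,
 * clamped to UINT_MAX just below.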
2134 */ 2135 limit += 64 * 1024; 2136 2137 limit = min_t(u64, limit, UINT_MAX); 2138 2139 err = sk_add_backlog(sk, skb, limit); 2140 if (unlikely(err)) { 2141 bh_unlock_sock(sk); 2142 if (err == -ENOMEM) { 2143 *reason = SKB_DROP_REASON_PFMEMALLOC; 2144 __NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP); 2145 } else { 2146 *reason = SKB_DROP_REASON_SOCKET_BACKLOG; 2147 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP); 2148 } 2149 return true; 2150 } 2151 return false; 2152 } 2153 EXPORT_IPV6_MOD(tcp_add_backlog); 2154 2155 int tcp_filter(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason *reason) 2156 { 2157 struct tcphdr *th = (struct tcphdr *)skb->data; 2158 2159 return sk_filter_trim_cap(sk, skb, th->doff * 4, reason); 2160 } 2161 EXPORT_IPV6_MOD(tcp_filter); 2162 2163 static void tcp_v4_restore_cb(struct sk_buff *skb) 2164 { 2165 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4, 2166 sizeof(struct inet_skb_parm)); 2167 } 2168 2169 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, 2170 const struct tcphdr *th) 2171 { 2172 /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB() 2173 * barrier() makes sure compiler wont play fool^Waliasing games. 2174 */ 2175 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb), 2176 sizeof(struct inet_skb_parm)); 2177 barrier(); 2178 2179 TCP_SKB_CB(skb)->seq = ntohl(th->seq); 2180 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + 2181 skb->len - th->doff * 4); 2182 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); 2183 TCP_SKB_CB(skb)->tcp_flags = tcp_flags_ntohs(th); 2184 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); 2185 TCP_SKB_CB(skb)->sacked = 0; 2186 TCP_SKB_CB(skb)->has_rxtstamp = 2187 skb->tstamp || skb_hwtstamps(skb)->hwtstamp; 2188 } 2189 2190 /* 2191 * From tcp_input.c 2192 */ 2193 2194 int tcp_v4_rcv(struct sk_buff *skb) 2195 { 2196 struct net *net = dev_net_rcu(skb->dev); 2197 enum skb_drop_reason drop_reason; 2198 enum tcp_tw_status tw_status; 2199 int sdif = inet_sdif(skb); 2200 int dif = inet_iif(skb); 2201 const struct iphdr *iph; 2202 const struct tcphdr *th; 2203 struct sock *sk = NULL; 2204 bool refcounted; 2205 int ret; 2206 u32 isn; 2207 2208 drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; 2209 if (skb->pkt_type != PACKET_HOST) 2210 goto discard_it; 2211 2212 /* Count it even if it's bad */ 2213 __TCP_INC_STATS(net, TCP_MIB_INSEGS); 2214 2215 if (!pskb_may_pull(skb, sizeof(struct tcphdr))) 2216 goto discard_it; 2217 2218 th = (const struct tcphdr *)skb->data; 2219 2220 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) { 2221 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL; 2222 goto bad_packet; 2223 } 2224 if (!pskb_may_pull(skb, th->doff * 4)) 2225 goto discard_it; 2226 2227 /* An explanation is required here, I think. 2228 * Packet length and doff are validated by header prediction, 2229 * provided case of th->doff==0 is eliminated. 2230 * So, we defer the checks. 
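 * (A header whose doff is below the minimal TCP header size was
 * already rejected above as SKB_DROP_REASON_PKT_TOO_SMALL.)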
*/ 2231 2232 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo)) 2233 goto csum_error; 2234 2235 th = (const struct tcphdr *)skb->data; 2236 iph = ip_hdr(skb); 2237 lookup: 2238 sk = __inet_lookup_skb(skb, __tcp_hdrlen(th), th->source, 2239 th->dest, sdif, &refcounted); 2240 if (!sk) 2241 goto no_tcp_socket; 2242 2243 if (sk->sk_state == TCP_TIME_WAIT) 2244 goto do_time_wait; 2245 2246 if (sk->sk_state == TCP_NEW_SYN_RECV) { 2247 struct request_sock *req = inet_reqsk(sk); 2248 bool req_stolen = false; 2249 struct sock *nsk; 2250 2251 sk = req->rsk_listener; 2252 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) 2253 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2254 else 2255 drop_reason = tcp_inbound_hash(sk, req, skb, 2256 &iph->saddr, &iph->daddr, 2257 AF_INET, dif, sdif); 2258 if (unlikely(drop_reason)) { 2259 sk_drops_skbadd(sk, skb); 2260 reqsk_put(req); 2261 goto discard_it; 2262 } 2263 if (tcp_checksum_complete(skb)) { 2264 reqsk_put(req); 2265 goto csum_error; 2266 } 2267 if (unlikely(sk->sk_state != TCP_LISTEN)) { 2268 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb); 2269 if (!nsk) { 2270 inet_csk_reqsk_queue_drop_and_put(sk, req); 2271 goto lookup; 2272 } 2273 sk = nsk; 2274 /* reuseport_migrate_sock() has already held one sk_refcnt 2275 * before returning. 2276 */ 2277 } else { 2278 /* We own a reference on the listener, increase it again 2279 * as we might lose it too soon. 2280 */ 2281 sock_hold(sk); 2282 } 2283 refcounted = true; 2284 nsk = NULL; 2285 if (!tcp_filter(sk, skb, &drop_reason)) { 2286 th = (const struct tcphdr *)skb->data; 2287 iph = ip_hdr(skb); 2288 tcp_v4_fill_cb(skb, iph, th); 2289 nsk = tcp_check_req(sk, skb, req, false, &req_stolen, 2290 &drop_reason); 2291 } 2292 if (!nsk) { 2293 reqsk_put(req); 2294 if (req_stolen) { 2295 /* Another cpu got exclusive access to req 2296 * and created a full blown socket. 2297 * Try to feed this packet to this socket 2298 * instead of discarding it. 
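 * Restoring the IP control block and jumping back to the lookup
 * lets the packet find that freshly created socket.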
2299 */ 2300 tcp_v4_restore_cb(skb); 2301 sock_put(sk); 2302 goto lookup; 2303 } 2304 goto discard_and_relse; 2305 } 2306 nf_reset_ct(skb); 2307 if (nsk == sk) { 2308 reqsk_put(req); 2309 tcp_v4_restore_cb(skb); 2310 } else { 2311 drop_reason = tcp_child_process(sk, nsk, skb); 2312 if (drop_reason) { 2313 enum sk_rst_reason rst_reason; 2314 2315 rst_reason = sk_rst_convert_drop_reason(drop_reason); 2316 tcp_v4_send_reset(nsk, skb, rst_reason); 2317 goto discard_and_relse; 2318 } 2319 sock_put(sk); 2320 return 0; 2321 } 2322 } 2323 2324 process: 2325 if (static_branch_unlikely(&ip4_min_ttl)) { 2326 /* min_ttl can be changed concurrently from do_ip_setsockopt() */ 2327 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) { 2328 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 2329 drop_reason = SKB_DROP_REASON_TCP_MINTTL; 2330 goto discard_and_relse; 2331 } 2332 } 2333 2334 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { 2335 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2336 goto discard_and_relse; 2337 } 2338 2339 drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr, 2340 AF_INET, dif, sdif); 2341 if (drop_reason) 2342 goto discard_and_relse; 2343 2344 nf_reset_ct(skb); 2345 2346 if (tcp_filter(sk, skb, &drop_reason)) 2347 goto discard_and_relse; 2348 2349 th = (const struct tcphdr *)skb->data; 2350 iph = ip_hdr(skb); 2351 tcp_v4_fill_cb(skb, iph, th); 2352 2353 skb->dev = NULL; 2354 2355 if (sk->sk_state == TCP_LISTEN) { 2356 ret = tcp_v4_do_rcv(sk, skb); 2357 goto put_and_return; 2358 } 2359 2360 sk_incoming_cpu_update(sk); 2361 2362 bh_lock_sock_nested(sk); 2363 tcp_segs_in(tcp_sk(sk), skb); 2364 ret = 0; 2365 if (!sock_owned_by_user(sk)) { 2366 ret = tcp_v4_do_rcv(sk, skb); 2367 } else { 2368 if (tcp_add_backlog(sk, skb, &drop_reason)) 2369 goto discard_and_relse; 2370 } 2371 bh_unlock_sock(sk); 2372 2373 put_and_return: 2374 if (refcounted) 2375 sock_put(sk); 2376 2377 return ret; 2378 2379 no_tcp_socket: 2380 drop_reason = SKB_DROP_REASON_NO_SOCKET; 2381 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) 2382 goto discard_it; 2383 2384 tcp_v4_fill_cb(skb, iph, th); 2385 2386 if (tcp_checksum_complete(skb)) { 2387 csum_error: 2388 drop_reason = SKB_DROP_REASON_TCP_CSUM; 2389 trace_tcp_bad_csum(skb); 2390 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); 2391 bad_packet: 2392 __TCP_INC_STATS(net, TCP_MIB_INERRS); 2393 } else { 2394 tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason)); 2395 } 2396 2397 discard_it: 2398 SKB_DR_OR(drop_reason, NOT_SPECIFIED); 2399 /* Discard frame. 
*/ 2400 sk_skb_reason_drop(sk, skb, drop_reason); 2401 return 0; 2402 2403 discard_and_relse: 2404 sk_drops_skbadd(sk, skb); 2405 if (refcounted) 2406 sock_put(sk); 2407 goto discard_it; 2408 2409 do_time_wait: 2410 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { 2411 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2412 inet_twsk_put(inet_twsk(sk)); 2413 goto discard_it; 2414 } 2415 2416 tcp_v4_fill_cb(skb, iph, th); 2417 2418 if (tcp_checksum_complete(skb)) { 2419 inet_twsk_put(inet_twsk(sk)); 2420 goto csum_error; 2421 } 2422 2423 tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn, 2424 &drop_reason); 2425 switch (tw_status) { 2426 case TCP_TW_SYN: { 2427 struct sock *sk2 = inet_lookup_listener(net, skb, __tcp_hdrlen(th), 2428 iph->saddr, th->source, 2429 iph->daddr, th->dest, 2430 inet_iif(skb), 2431 sdif); 2432 if (sk2) { 2433 inet_twsk_deschedule_put(inet_twsk(sk)); 2434 sk = sk2; 2435 tcp_v4_restore_cb(skb); 2436 refcounted = false; 2437 __this_cpu_write(tcp_tw_isn, isn); 2438 goto process; 2439 } 2440 } 2441 /* to ACK */ 2442 fallthrough; 2443 case TCP_TW_ACK: 2444 case TCP_TW_ACK_OOW: 2445 tcp_v4_timewait_ack(sk, skb, tw_status); 2446 break; 2447 case TCP_TW_RST: 2448 tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET); 2449 inet_twsk_deschedule_put(inet_twsk(sk)); 2450 goto discard_it; 2451 case TCP_TW_SUCCESS:; 2452 } 2453 goto discard_it; 2454 } 2455 2456 static struct timewait_sock_ops tcp_timewait_sock_ops = { 2457 .twsk_obj_size = sizeof(struct tcp_timewait_sock), 2458 }; 2459 2460 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) 2461 { 2462 struct dst_entry *dst = skb_dst(skb); 2463 2464 if (dst && dst_hold_safe(dst)) { 2465 rcu_assign_pointer(sk->sk_rx_dst, dst); 2466 sk->sk_rx_dst_ifindex = skb->skb_iif; 2467 } 2468 } 2469 EXPORT_IPV6_MOD(inet_sk_rx_dst_set); 2470 2471 const struct inet_connection_sock_af_ops ipv4_specific = { 2472 .queue_xmit = ip_queue_xmit, 2473 .send_check = tcp_v4_send_check, 2474 .rebuild_header = inet_sk_rebuild_header, 2475 .sk_rx_dst_set = inet_sk_rx_dst_set, 2476 .conn_request = tcp_v4_conn_request, 2477 .syn_recv_sock = tcp_v4_syn_recv_sock, 2478 .net_header_len = sizeof(struct iphdr), 2479 .setsockopt = ip_setsockopt, 2480 .getsockopt = ip_getsockopt, 2481 .mtu_reduced = tcp_v4_mtu_reduced, 2482 }; 2483 EXPORT_IPV6_MOD(ipv4_specific); 2484 2485 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) 2486 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { 2487 #ifdef CONFIG_TCP_MD5SIG 2488 .md5_lookup = tcp_v4_md5_lookup, 2489 .calc_md5_hash = tcp_v4_md5_hash_skb, 2490 .md5_parse = tcp_v4_parse_md5_keys, 2491 #endif 2492 #ifdef CONFIG_TCP_AO 2493 .ao_lookup = tcp_v4_ao_lookup, 2494 .calc_ao_hash = tcp_v4_ao_hash_skb, 2495 .ao_parse = tcp_v4_parse_ao, 2496 .ao_calc_key_sk = tcp_v4_ao_calc_key_sk, 2497 #endif 2498 }; 2499 2500 static void tcp4_destruct_sock(struct sock *sk) 2501 { 2502 tcp_md5_destruct_sock(sk); 2503 tcp_ao_destroy_sock(sk, false); 2504 inet_sock_destruct(sk); 2505 } 2506 #endif 2507 2508 /* NOTE: A lot of things set to zero explicitly by call to 2509 * sk_alloc() so need not be done here. 
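 * tcp_v4_init_sock() only wires up the IPv4-specific af_ops (and,
 * when MD5/AO are built, the af_specific ops and the destructor);
 * the generic TCP state is initialized by tcp_init_sock().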
2510 */ 2511 static int tcp_v4_init_sock(struct sock *sk) 2512 { 2513 struct inet_connection_sock *icsk = inet_csk(sk); 2514 2515 tcp_init_sock(sk); 2516 2517 icsk->icsk_af_ops = &ipv4_specific; 2518 2519 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) 2520 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific; 2521 sk->sk_destruct = tcp4_destruct_sock; 2522 #endif 2523 2524 return 0; 2525 } 2526 2527 static void tcp_release_user_frags(struct sock *sk) 2528 { 2529 #ifdef CONFIG_PAGE_POOL 2530 unsigned long index; 2531 void *netmem; 2532 2533 xa_for_each(&sk->sk_user_frags, index, netmem) 2534 WARN_ON_ONCE(!napi_pp_put_page((__force netmem_ref)netmem)); 2535 #endif 2536 } 2537 2538 void tcp_v4_destroy_sock(struct sock *sk) 2539 { 2540 struct tcp_sock *tp = tcp_sk(sk); 2541 2542 tcp_release_user_frags(sk); 2543 2544 xa_destroy(&sk->sk_user_frags); 2545 2546 trace_tcp_destroy_sock(sk); 2547 2548 tcp_clear_xmit_timers(sk); 2549 2550 tcp_cleanup_congestion_control(sk); 2551 2552 tcp_cleanup_ulp(sk); 2553 2554 /* Cleanup up the write buffer. */ 2555 tcp_write_queue_purge(sk); 2556 2557 /* Check if we want to disable active TFO */ 2558 tcp_fastopen_active_disable_ofo_check(sk); 2559 2560 /* Cleans up our, hopefully empty, out_of_order_queue. */ 2561 skb_rbtree_purge(&tp->out_of_order_queue); 2562 2563 /* Clean up a referenced TCP bind bucket. */ 2564 if (inet_csk(sk)->icsk_bind_hash) 2565 inet_put_port(sk); 2566 2567 BUG_ON(rcu_access_pointer(tp->fastopen_rsk)); 2568 2569 /* If socket is aborted during connect operation */ 2570 tcp_free_fastopen_req(tp); 2571 tcp_fastopen_destroy_cipher(sk); 2572 tcp_saved_syn_free(tp); 2573 2574 sk_sockets_allocated_dec(sk); 2575 } 2576 EXPORT_IPV6_MOD(tcp_v4_destroy_sock); 2577 2578 #ifdef CONFIG_PROC_FS 2579 /* Proc filesystem TCP sock list dumping. */ 2580 2581 static unsigned short seq_file_family(const struct seq_file *seq); 2582 2583 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk) 2584 { 2585 unsigned short family = seq_file_family(seq); 2586 2587 /* AF_UNSPEC is used as a match all */ 2588 return ((family == AF_UNSPEC || family == sk->sk_family) && 2589 net_eq(sock_net(sk), seq_file_net(seq))); 2590 } 2591 2592 /* Find a non empty bucket (starting from st->bucket) 2593 * and return the first sk from it. 2594 */ 2595 static void *listening_get_first(struct seq_file *seq) 2596 { 2597 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2598 struct tcp_iter_state *st = seq->private; 2599 2600 st->offset = 0; 2601 for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) { 2602 struct inet_listen_hashbucket *ilb2; 2603 struct hlist_nulls_node *node; 2604 struct sock *sk; 2605 2606 ilb2 = &hinfo->lhash2[st->bucket]; 2607 if (hlist_nulls_empty(&ilb2->nulls_head)) 2608 continue; 2609 2610 spin_lock(&ilb2->lock); 2611 sk_nulls_for_each(sk, node, &ilb2->nulls_head) { 2612 if (seq_sk_match(seq, sk)) 2613 return sk; 2614 } 2615 spin_unlock(&ilb2->lock); 2616 } 2617 2618 return NULL; 2619 } 2620 2621 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket). 2622 * If "cur" is the last one in the st->bucket, 2623 * call listening_get_first() to return the first sk of the next 2624 * non empty bucket. 
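 * The bucket lock taken in listening_get_first() is dropped here
 * before advancing to the next bucket.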
2625 */ 2626 static void *listening_get_next(struct seq_file *seq, void *cur) 2627 { 2628 struct tcp_iter_state *st = seq->private; 2629 struct inet_listen_hashbucket *ilb2; 2630 struct hlist_nulls_node *node; 2631 struct inet_hashinfo *hinfo; 2632 struct sock *sk = cur; 2633 2634 ++st->num; 2635 ++st->offset; 2636 2637 sk = sk_nulls_next(sk); 2638 sk_nulls_for_each_from(sk, node) { 2639 if (seq_sk_match(seq, sk)) 2640 return sk; 2641 } 2642 2643 hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2644 ilb2 = &hinfo->lhash2[st->bucket]; 2645 spin_unlock(&ilb2->lock); 2646 ++st->bucket; 2647 return listening_get_first(seq); 2648 } 2649 2650 static void *listening_get_idx(struct seq_file *seq, loff_t *pos) 2651 { 2652 struct tcp_iter_state *st = seq->private; 2653 void *rc; 2654 2655 st->bucket = 0; 2656 st->offset = 0; 2657 rc = listening_get_first(seq); 2658 2659 while (rc && *pos) { 2660 rc = listening_get_next(seq, rc); 2661 --*pos; 2662 } 2663 return rc; 2664 } 2665 2666 static inline bool empty_bucket(struct inet_hashinfo *hinfo, 2667 const struct tcp_iter_state *st) 2668 { 2669 return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain); 2670 } 2671 2672 /* 2673 * Get first established socket starting from bucket given in st->bucket. 2674 * If st->bucket is zero, the very first socket in the hash is returned. 2675 */ 2676 static void *established_get_first(struct seq_file *seq) 2677 { 2678 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2679 struct tcp_iter_state *st = seq->private; 2680 2681 st->offset = 0; 2682 for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) { 2683 struct sock *sk; 2684 struct hlist_nulls_node *node; 2685 spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket); 2686 2687 cond_resched(); 2688 2689 /* Lockless fast path for the common case of empty buckets */ 2690 if (empty_bucket(hinfo, st)) 2691 continue; 2692 2693 spin_lock_bh(lock); 2694 sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) { 2695 if (seq_sk_match(seq, sk)) 2696 return sk; 2697 } 2698 spin_unlock_bh(lock); 2699 } 2700 2701 return NULL; 2702 } 2703 2704 static void *established_get_next(struct seq_file *seq, void *cur) 2705 { 2706 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2707 struct tcp_iter_state *st = seq->private; 2708 struct hlist_nulls_node *node; 2709 struct sock *sk = cur; 2710 2711 ++st->num; 2712 ++st->offset; 2713 2714 sk = sk_nulls_next(sk); 2715 2716 sk_nulls_for_each_from(sk, node) { 2717 if (seq_sk_match(seq, sk)) 2718 return sk; 2719 } 2720 2721 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 2722 ++st->bucket; 2723 return established_get_first(seq); 2724 } 2725 2726 static void *established_get_idx(struct seq_file *seq, loff_t pos) 2727 { 2728 struct tcp_iter_state *st = seq->private; 2729 void *rc; 2730 2731 st->bucket = 0; 2732 rc = established_get_first(seq); 2733 2734 while (rc && pos) { 2735 rc = established_get_next(seq, rc); 2736 --pos; 2737 } 2738 return rc; 2739 } 2740 2741 static void *tcp_get_idx(struct seq_file *seq, loff_t pos) 2742 { 2743 void *rc; 2744 struct tcp_iter_state *st = seq->private; 2745 2746 st->state = TCP_SEQ_STATE_LISTENING; 2747 rc = listening_get_idx(seq, &pos); 2748 2749 if (!rc) { 2750 st->state = TCP_SEQ_STATE_ESTABLISHED; 2751 rc = established_get_idx(seq, pos); 2752 } 2753 2754 return rc; 2755 } 2756 2757 static void *tcp_seek_last_pos(struct seq_file *seq) 2758 { 2759 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2760 struct 
tcp_iter_state *st = seq->private; 2761 int bucket = st->bucket; 2762 int offset = st->offset; 2763 int orig_num = st->num; 2764 void *rc = NULL; 2765 2766 switch (st->state) { 2767 case TCP_SEQ_STATE_LISTENING: 2768 if (st->bucket > hinfo->lhash2_mask) 2769 break; 2770 rc = listening_get_first(seq); 2771 while (offset-- && rc && bucket == st->bucket) 2772 rc = listening_get_next(seq, rc); 2773 if (rc) 2774 break; 2775 st->bucket = 0; 2776 st->state = TCP_SEQ_STATE_ESTABLISHED; 2777 fallthrough; 2778 case TCP_SEQ_STATE_ESTABLISHED: 2779 if (st->bucket > hinfo->ehash_mask) 2780 break; 2781 rc = established_get_first(seq); 2782 while (offset-- && rc && bucket == st->bucket) 2783 rc = established_get_next(seq, rc); 2784 } 2785 2786 st->num = orig_num; 2787 2788 return rc; 2789 } 2790 2791 void *tcp_seq_start(struct seq_file *seq, loff_t *pos) 2792 { 2793 struct tcp_iter_state *st = seq->private; 2794 void *rc; 2795 2796 if (*pos && *pos == st->last_pos) { 2797 rc = tcp_seek_last_pos(seq); 2798 if (rc) 2799 goto out; 2800 } 2801 2802 st->state = TCP_SEQ_STATE_LISTENING; 2803 st->num = 0; 2804 st->bucket = 0; 2805 st->offset = 0; 2806 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; 2807 2808 out: 2809 st->last_pos = *pos; 2810 return rc; 2811 } 2812 EXPORT_IPV6_MOD(tcp_seq_start); 2813 2814 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2815 { 2816 struct tcp_iter_state *st = seq->private; 2817 void *rc = NULL; 2818 2819 if (v == SEQ_START_TOKEN) { 2820 rc = tcp_get_idx(seq, 0); 2821 goto out; 2822 } 2823 2824 switch (st->state) { 2825 case TCP_SEQ_STATE_LISTENING: 2826 rc = listening_get_next(seq, v); 2827 if (!rc) { 2828 st->state = TCP_SEQ_STATE_ESTABLISHED; 2829 st->bucket = 0; 2830 st->offset = 0; 2831 rc = established_get_first(seq); 2832 } 2833 break; 2834 case TCP_SEQ_STATE_ESTABLISHED: 2835 rc = established_get_next(seq, v); 2836 break; 2837 } 2838 out: 2839 ++*pos; 2840 st->last_pos = *pos; 2841 return rc; 2842 } 2843 EXPORT_IPV6_MOD(tcp_seq_next); 2844 2845 void tcp_seq_stop(struct seq_file *seq, void *v) 2846 { 2847 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2848 struct tcp_iter_state *st = seq->private; 2849 2850 switch (st->state) { 2851 case TCP_SEQ_STATE_LISTENING: 2852 if (v != SEQ_START_TOKEN) 2853 spin_unlock(&hinfo->lhash2[st->bucket].lock); 2854 break; 2855 case TCP_SEQ_STATE_ESTABLISHED: 2856 if (v) 2857 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 2858 break; 2859 } 2860 } 2861 EXPORT_IPV6_MOD(tcp_seq_stop); 2862 2863 static void get_openreq4(const struct request_sock *req, 2864 struct seq_file *f, int i) 2865 { 2866 const struct inet_request_sock *ireq = inet_rsk(req); 2867 long delta = req->rsk_timer.expires - jiffies; 2868 2869 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2870 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK", 2871 i, 2872 ireq->ir_loc_addr, 2873 ireq->ir_num, 2874 ireq->ir_rmt_addr, 2875 ntohs(ireq->ir_rmt_port), 2876 TCP_SYN_RECV, 2877 0, 0, /* could print option size, but that is af dependent. 
*/ 2878 1, /* timers active (only the expire timer) */ 2879 jiffies_delta_to_clock_t(delta), 2880 req->num_timeout, 2881 from_kuid_munged(seq_user_ns(f), 2882 sk_uid(req->rsk_listener)), 2883 0, /* non standard timer */ 2884 0, /* open_requests have no inode */ 2885 0, 2886 req); 2887 } 2888 2889 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) 2890 { 2891 int timer_active; 2892 unsigned long timer_expires; 2893 const struct tcp_sock *tp = tcp_sk(sk); 2894 const struct inet_connection_sock *icsk = inet_csk(sk); 2895 const struct inet_sock *inet = inet_sk(sk); 2896 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; 2897 __be32 dest = inet->inet_daddr; 2898 __be32 src = inet->inet_rcv_saddr; 2899 __u16 destp = ntohs(inet->inet_dport); 2900 __u16 srcp = ntohs(inet->inet_sport); 2901 u8 icsk_pending; 2902 int rx_queue; 2903 int state; 2904 2905 icsk_pending = smp_load_acquire(&icsk->icsk_pending); 2906 if (icsk_pending == ICSK_TIME_RETRANS || 2907 icsk_pending == ICSK_TIME_REO_TIMEOUT || 2908 icsk_pending == ICSK_TIME_LOSS_PROBE) { 2909 timer_active = 1; 2910 timer_expires = icsk_timeout(icsk); 2911 } else if (icsk_pending == ICSK_TIME_PROBE0) { 2912 timer_active = 4; 2913 timer_expires = icsk_timeout(icsk); 2914 } else if (timer_pending(&sk->sk_timer)) { 2915 timer_active = 2; 2916 timer_expires = sk->sk_timer.expires; 2917 } else { 2918 timer_active = 0; 2919 timer_expires = jiffies; 2920 } 2921 2922 state = inet_sk_state_load(sk); 2923 if (state == TCP_LISTEN) 2924 rx_queue = READ_ONCE(sk->sk_ack_backlog); 2925 else 2926 /* Because we don't lock the socket, 2927 * we might find a transient negative value. 2928 */ 2929 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) - 2930 READ_ONCE(tp->copied_seq), 0); 2931 2932 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " 2933 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d", 2934 i, src, srcp, dest, destp, state, 2935 READ_ONCE(tp->write_seq) - tp->snd_una, 2936 rx_queue, 2937 timer_active, 2938 jiffies_delta_to_clock_t(timer_expires - jiffies), 2939 READ_ONCE(icsk->icsk_retransmits), 2940 from_kuid_munged(seq_user_ns(f), sk_uid(sk)), 2941 READ_ONCE(icsk->icsk_probes_out), 2942 sock_i_ino(sk), 2943 refcount_read(&sk->sk_refcnt), sk, 2944 jiffies_to_clock_t(icsk->icsk_rto), 2945 jiffies_to_clock_t(icsk->icsk_ack.ato), 2946 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk), 2947 tcp_snd_cwnd(tp), 2948 state == TCP_LISTEN ? 2949 fastopenq->max_qlen : 2950 (tcp_in_initial_slowstart(tp) ? 
-1 : tp->snd_ssthresh)); 2951 } 2952 2953 static void get_timewait4_sock(const struct inet_timewait_sock *tw, 2954 struct seq_file *f, int i) 2955 { 2956 long delta = tw->tw_timer.expires - jiffies; 2957 __be32 dest, src; 2958 __u16 destp, srcp; 2959 2960 dest = tw->tw_daddr; 2961 src = tw->tw_rcv_saddr; 2962 destp = ntohs(tw->tw_dport); 2963 srcp = ntohs(tw->tw_sport); 2964 2965 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2966 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK", 2967 i, src, srcp, dest, destp, READ_ONCE(tw->tw_substate), 0, 0, 2968 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, 2969 refcount_read(&tw->tw_refcnt), tw); 2970 } 2971 2972 #define TMPSZ 150 2973 2974 static int tcp4_seq_show(struct seq_file *seq, void *v) 2975 { 2976 struct tcp_iter_state *st; 2977 struct sock *sk = v; 2978 2979 seq_setwidth(seq, TMPSZ - 1); 2980 if (v == SEQ_START_TOKEN) { 2981 seq_puts(seq, " sl local_address rem_address st tx_queue " 2982 "rx_queue tr tm->when retrnsmt uid timeout " 2983 "inode"); 2984 goto out; 2985 } 2986 st = seq->private; 2987 2988 if (sk->sk_state == TCP_TIME_WAIT) 2989 get_timewait4_sock(v, seq, st->num); 2990 else if (sk->sk_state == TCP_NEW_SYN_RECV) 2991 get_openreq4(v, seq, st->num); 2992 else 2993 get_tcp4_sock(v, seq, st->num); 2994 out: 2995 seq_pad(seq, '\n'); 2996 return 0; 2997 } 2998 2999 #ifdef CONFIG_BPF_SYSCALL 3000 union bpf_tcp_iter_batch_item { 3001 struct sock *sk; 3002 __u64 cookie; 3003 }; 3004 3005 struct bpf_tcp_iter_state { 3006 struct tcp_iter_state state; 3007 unsigned int cur_sk; 3008 unsigned int end_sk; 3009 unsigned int max_sk; 3010 union bpf_tcp_iter_batch_item *batch; 3011 }; 3012 3013 struct bpf_iter__tcp { 3014 __bpf_md_ptr(struct bpf_iter_meta *, meta); 3015 __bpf_md_ptr(struct sock_common *, sk_common); 3016 uid_t uid __aligned(8); 3017 }; 3018 3019 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, 3020 struct sock_common *sk_common, uid_t uid) 3021 { 3022 struct bpf_iter__tcp ctx; 3023 3024 meta->seq_num--; /* skip SEQ_START_TOKEN */ 3025 ctx.meta = meta; 3026 ctx.sk_common = sk_common; 3027 ctx.uid = uid; 3028 return bpf_iter_run_prog(prog, &ctx); 3029 } 3030 3031 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter) 3032 { 3033 union bpf_tcp_iter_batch_item *item; 3034 unsigned int cur_sk = iter->cur_sk; 3035 __u64 cookie; 3036 3037 /* Remember the cookies of the sockets we haven't seen yet, so we can 3038 * pick up where we left off next time around. 
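 * sock_gen_put() also drops the reference that was taken when the
 * socket was added to the batch.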
3039 */ 3040 while (cur_sk < iter->end_sk) { 3041 item = &iter->batch[cur_sk++]; 3042 cookie = sock_gen_cookie(item->sk); 3043 sock_gen_put(item->sk); 3044 item->cookie = cookie; 3045 } 3046 } 3047 3048 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter, 3049 unsigned int new_batch_sz, gfp_t flags) 3050 { 3051 union bpf_tcp_iter_batch_item *new_batch; 3052 3053 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, 3054 flags | __GFP_NOWARN); 3055 if (!new_batch) 3056 return -ENOMEM; 3057 3058 memcpy(new_batch, iter->batch, sizeof(*iter->batch) * iter->end_sk); 3059 kvfree(iter->batch); 3060 iter->batch = new_batch; 3061 iter->max_sk = new_batch_sz; 3062 3063 return 0; 3064 } 3065 3066 static struct sock *bpf_iter_tcp_resume_bucket(struct sock *first_sk, 3067 union bpf_tcp_iter_batch_item *cookies, 3068 int n_cookies) 3069 { 3070 struct hlist_nulls_node *node; 3071 struct sock *sk; 3072 int i; 3073 3074 for (i = 0; i < n_cookies; i++) { 3075 sk = first_sk; 3076 sk_nulls_for_each_from(sk, node) 3077 if (cookies[i].cookie == atomic64_read(&sk->sk_cookie)) 3078 return sk; 3079 } 3080 3081 return NULL; 3082 } 3083 3084 static struct sock *bpf_iter_tcp_resume_listening(struct seq_file *seq) 3085 { 3086 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 3087 struct bpf_tcp_iter_state *iter = seq->private; 3088 struct tcp_iter_state *st = &iter->state; 3089 unsigned int find_cookie = iter->cur_sk; 3090 unsigned int end_cookie = iter->end_sk; 3091 int resume_bucket = st->bucket; 3092 struct sock *sk; 3093 3094 if (end_cookie && find_cookie == end_cookie) 3095 ++st->bucket; 3096 3097 sk = listening_get_first(seq); 3098 iter->cur_sk = 0; 3099 iter->end_sk = 0; 3100 3101 if (sk && st->bucket == resume_bucket && end_cookie) { 3102 sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie], 3103 end_cookie - find_cookie); 3104 if (!sk) { 3105 spin_unlock(&hinfo->lhash2[st->bucket].lock); 3106 ++st->bucket; 3107 sk = listening_get_first(seq); 3108 } 3109 } 3110 3111 return sk; 3112 } 3113 3114 static struct sock *bpf_iter_tcp_resume_established(struct seq_file *seq) 3115 { 3116 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 3117 struct bpf_tcp_iter_state *iter = seq->private; 3118 struct tcp_iter_state *st = &iter->state; 3119 unsigned int find_cookie = iter->cur_sk; 3120 unsigned int end_cookie = iter->end_sk; 3121 int resume_bucket = st->bucket; 3122 struct sock *sk; 3123 3124 if (end_cookie && find_cookie == end_cookie) 3125 ++st->bucket; 3126 3127 sk = established_get_first(seq); 3128 iter->cur_sk = 0; 3129 iter->end_sk = 0; 3130 3131 if (sk && st->bucket == resume_bucket && end_cookie) { 3132 sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie], 3133 end_cookie - find_cookie); 3134 if (!sk) { 3135 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 3136 ++st->bucket; 3137 sk = established_get_first(seq); 3138 } 3139 } 3140 3141 return sk; 3142 } 3143 3144 static struct sock *bpf_iter_tcp_resume(struct seq_file *seq) 3145 { 3146 struct bpf_tcp_iter_state *iter = seq->private; 3147 struct tcp_iter_state *st = &iter->state; 3148 struct sock *sk = NULL; 3149 3150 switch (st->state) { 3151 case TCP_SEQ_STATE_LISTENING: 3152 sk = bpf_iter_tcp_resume_listening(seq); 3153 if (sk) 3154 break; 3155 st->bucket = 0; 3156 st->state = TCP_SEQ_STATE_ESTABLISHED; 3157 fallthrough; 3158 case TCP_SEQ_STATE_ESTABLISHED: 3159 sk = bpf_iter_tcp_resume_established(seq); 3160 break; 3161 } 3162 3163 return sk; 3164 } 3165 3166 
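/* Fill the batch with the remaining sockets of the current listening
 * bucket, taking a reference on each matching socket. Returns the
 * number of matching sockets seen so the caller can detect a batch
 * that was too small; *start_sk is set to the first socket that did
 * not fit.
 */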
static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq, 3167 struct sock **start_sk) 3168 { 3169 struct bpf_tcp_iter_state *iter = seq->private; 3170 struct hlist_nulls_node *node; 3171 unsigned int expected = 1; 3172 struct sock *sk; 3173 3174 sock_hold(*start_sk); 3175 iter->batch[iter->end_sk++].sk = *start_sk; 3176 3177 sk = sk_nulls_next(*start_sk); 3178 *start_sk = NULL; 3179 sk_nulls_for_each_from(sk, node) { 3180 if (seq_sk_match(seq, sk)) { 3181 if (iter->end_sk < iter->max_sk) { 3182 sock_hold(sk); 3183 iter->batch[iter->end_sk++].sk = sk; 3184 } else if (!*start_sk) { 3185 /* Remember where we left off. */ 3186 *start_sk = sk; 3187 } 3188 expected++; 3189 } 3190 } 3191 3192 return expected; 3193 } 3194 3195 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq, 3196 struct sock **start_sk) 3197 { 3198 struct bpf_tcp_iter_state *iter = seq->private; 3199 struct hlist_nulls_node *node; 3200 unsigned int expected = 1; 3201 struct sock *sk; 3202 3203 sock_hold(*start_sk); 3204 iter->batch[iter->end_sk++].sk = *start_sk; 3205 3206 sk = sk_nulls_next(*start_sk); 3207 *start_sk = NULL; 3208 sk_nulls_for_each_from(sk, node) { 3209 if (seq_sk_match(seq, sk)) { 3210 if (iter->end_sk < iter->max_sk) { 3211 sock_hold(sk); 3212 iter->batch[iter->end_sk++].sk = sk; 3213 } else if (!*start_sk) { 3214 /* Remember where we left off. */ 3215 *start_sk = sk; 3216 } 3217 expected++; 3218 } 3219 } 3220 3221 return expected; 3222 } 3223 3224 static unsigned int bpf_iter_fill_batch(struct seq_file *seq, 3225 struct sock **start_sk) 3226 { 3227 struct bpf_tcp_iter_state *iter = seq->private; 3228 struct tcp_iter_state *st = &iter->state; 3229 3230 if (st->state == TCP_SEQ_STATE_LISTENING) 3231 return bpf_iter_tcp_listening_batch(seq, start_sk); 3232 else 3233 return bpf_iter_tcp_established_batch(seq, start_sk); 3234 } 3235 3236 static void bpf_iter_tcp_unlock_bucket(struct seq_file *seq) 3237 { 3238 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 3239 struct bpf_tcp_iter_state *iter = seq->private; 3240 struct tcp_iter_state *st = &iter->state; 3241 3242 if (st->state == TCP_SEQ_STATE_LISTENING) 3243 spin_unlock(&hinfo->lhash2[st->bucket].lock); 3244 else 3245 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 3246 } 3247 3248 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq) 3249 { 3250 struct bpf_tcp_iter_state *iter = seq->private; 3251 unsigned int expected; 3252 struct sock *sk; 3253 int err; 3254 3255 sk = bpf_iter_tcp_resume(seq); 3256 if (!sk) 3257 return NULL; /* Done */ 3258 3259 expected = bpf_iter_fill_batch(seq, &sk); 3260 if (likely(iter->end_sk == expected)) 3261 goto done; 3262 3263 /* Batch size was too small. */ 3264 bpf_iter_tcp_unlock_bucket(seq); 3265 bpf_iter_tcp_put_batch(iter); 3266 err = bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2, 3267 GFP_USER); 3268 if (err) 3269 return ERR_PTR(err); 3270 3271 sk = bpf_iter_tcp_resume(seq); 3272 if (!sk) 3273 return NULL; /* Done */ 3274 3275 expected = bpf_iter_fill_batch(seq, &sk); 3276 if (likely(iter->end_sk == expected)) 3277 goto done; 3278 3279 /* Batch size was still too small. Hold onto the lock while we try 3280 * again with a larger batch to make sure the current bucket's size 3281 * does not change in the meantime. 
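 * GFP_NOWAIT is used for this retry precisely because the bucket
 * lock is still held at this point.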
3282 */ 3283 err = bpf_iter_tcp_realloc_batch(iter, expected, GFP_NOWAIT); 3284 if (err) { 3285 bpf_iter_tcp_unlock_bucket(seq); 3286 return ERR_PTR(err); 3287 } 3288 3289 expected = bpf_iter_fill_batch(seq, &sk); 3290 WARN_ON_ONCE(iter->end_sk != expected); 3291 done: 3292 bpf_iter_tcp_unlock_bucket(seq); 3293 return iter->batch[0].sk; 3294 } 3295 3296 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos) 3297 { 3298 /* bpf iter does not support lseek, so it always 3299 * continue from where it was stop()-ped. 3300 */ 3301 if (*pos) 3302 return bpf_iter_tcp_batch(seq); 3303 3304 return SEQ_START_TOKEN; 3305 } 3306 3307 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3308 { 3309 struct bpf_tcp_iter_state *iter = seq->private; 3310 struct tcp_iter_state *st = &iter->state; 3311 struct sock *sk; 3312 3313 /* Whenever seq_next() is called, the iter->cur_sk is 3314 * done with seq_show(), so advance to the next sk in 3315 * the batch. 3316 */ 3317 if (iter->cur_sk < iter->end_sk) { 3318 /* Keeping st->num consistent in tcp_iter_state. 3319 * bpf_iter_tcp does not use st->num. 3320 * meta.seq_num is used instead. 3321 */ 3322 st->num++; 3323 sock_gen_put(iter->batch[iter->cur_sk++].sk); 3324 } 3325 3326 if (iter->cur_sk < iter->end_sk) 3327 sk = iter->batch[iter->cur_sk].sk; 3328 else 3329 sk = bpf_iter_tcp_batch(seq); 3330 3331 ++*pos; 3332 /* Keeping st->last_pos consistent in tcp_iter_state. 3333 * bpf iter does not do lseek, so st->last_pos always equals to *pos. 3334 */ 3335 st->last_pos = *pos; 3336 return sk; 3337 } 3338 3339 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v) 3340 { 3341 struct bpf_iter_meta meta; 3342 struct bpf_prog *prog; 3343 struct sock *sk = v; 3344 uid_t uid; 3345 int ret; 3346 3347 if (v == SEQ_START_TOKEN) 3348 return 0; 3349 3350 if (sk_fullsock(sk)) 3351 lock_sock(sk); 3352 3353 if (unlikely(sk_unhashed(sk))) { 3354 ret = SEQ_SKIP; 3355 goto unlock; 3356 } 3357 3358 if (sk->sk_state == TCP_TIME_WAIT) { 3359 uid = 0; 3360 } else if (sk->sk_state == TCP_NEW_SYN_RECV) { 3361 const struct request_sock *req = v; 3362 3363 uid = from_kuid_munged(seq_user_ns(seq), 3364 sk_uid(req->rsk_listener)); 3365 } else { 3366 uid = from_kuid_munged(seq_user_ns(seq), sk_uid(sk)); 3367 } 3368 3369 meta.seq = seq; 3370 prog = bpf_iter_get_info(&meta, false); 3371 ret = tcp_prog_seq_show(prog, &meta, v, uid); 3372 3373 unlock: 3374 if (sk_fullsock(sk)) 3375 release_sock(sk); 3376 return ret; 3377 3378 } 3379 3380 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v) 3381 { 3382 struct bpf_tcp_iter_state *iter = seq->private; 3383 struct bpf_iter_meta meta; 3384 struct bpf_prog *prog; 3385 3386 if (!v) { 3387 meta.seq = seq; 3388 prog = bpf_iter_get_info(&meta, true); 3389 if (prog) 3390 (void)tcp_prog_seq_show(prog, &meta, v, 0); 3391 } 3392 3393 if (iter->cur_sk < iter->end_sk) 3394 bpf_iter_tcp_put_batch(iter); 3395 } 3396 3397 static const struct seq_operations bpf_iter_tcp_seq_ops = { 3398 .show = bpf_iter_tcp_seq_show, 3399 .start = bpf_iter_tcp_seq_start, 3400 .next = bpf_iter_tcp_seq_next, 3401 .stop = bpf_iter_tcp_seq_stop, 3402 }; 3403 #endif 3404 static unsigned short seq_file_family(const struct seq_file *seq) 3405 { 3406 const struct tcp_seq_afinfo *afinfo; 3407 3408 #ifdef CONFIG_BPF_SYSCALL 3409 /* Iterated from bpf_iter. Let the bpf prog to filter instead. 
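 * (Returning AF_UNSPEC makes seq_sk_match() accept every family.)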
*/ 3410 if (seq->op == &bpf_iter_tcp_seq_ops) 3411 return AF_UNSPEC; 3412 #endif 3413 3414 /* Iterated from proc fs */ 3415 afinfo = pde_data(file_inode(seq->file)); 3416 return afinfo->family; 3417 } 3418 3419 static const struct seq_operations tcp4_seq_ops = { 3420 .show = tcp4_seq_show, 3421 .start = tcp_seq_start, 3422 .next = tcp_seq_next, 3423 .stop = tcp_seq_stop, 3424 }; 3425 3426 static struct tcp_seq_afinfo tcp4_seq_afinfo = { 3427 .family = AF_INET, 3428 }; 3429 3430 static int __net_init tcp4_proc_init_net(struct net *net) 3431 { 3432 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops, 3433 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo)) 3434 return -ENOMEM; 3435 return 0; 3436 } 3437 3438 static void __net_exit tcp4_proc_exit_net(struct net *net) 3439 { 3440 remove_proc_entry("tcp", net->proc_net); 3441 } 3442 3443 static struct pernet_operations tcp4_net_ops = { 3444 .init = tcp4_proc_init_net, 3445 .exit = tcp4_proc_exit_net, 3446 }; 3447 3448 int __init tcp4_proc_init(void) 3449 { 3450 return register_pernet_subsys(&tcp4_net_ops); 3451 } 3452 3453 void tcp4_proc_exit(void) 3454 { 3455 unregister_pernet_subsys(&tcp4_net_ops); 3456 } 3457 #endif /* CONFIG_PROC_FS */ 3458 3459 /* @wake is one when sk_stream_write_space() calls us. 3460 * This sends EPOLLOUT only if notsent_bytes is half the limit. 3461 * This mimics the strategy used in sock_def_write_space(). 3462 */ 3463 bool tcp_stream_memory_free(const struct sock *sk, int wake) 3464 { 3465 const struct tcp_sock *tp = tcp_sk(sk); 3466 u32 notsent_bytes = READ_ONCE(tp->write_seq) - 3467 READ_ONCE(tp->snd_nxt); 3468 3469 return (notsent_bytes << wake) < tcp_notsent_lowat(tp); 3470 } 3471 EXPORT_SYMBOL(tcp_stream_memory_free); 3472 3473 struct proto tcp_prot = { 3474 .name = "TCP", 3475 .owner = THIS_MODULE, 3476 .close = tcp_close, 3477 .pre_connect = tcp_v4_pre_connect, 3478 .connect = tcp_v4_connect, 3479 .disconnect = tcp_disconnect, 3480 .accept = inet_csk_accept, 3481 .ioctl = tcp_ioctl, 3482 .init = tcp_v4_init_sock, 3483 .destroy = tcp_v4_destroy_sock, 3484 .shutdown = tcp_shutdown, 3485 .setsockopt = tcp_setsockopt, 3486 .getsockopt = tcp_getsockopt, 3487 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt, 3488 .keepalive = tcp_set_keepalive, 3489 .recvmsg = tcp_recvmsg, 3490 .sendmsg = tcp_sendmsg, 3491 .splice_eof = tcp_splice_eof, 3492 .backlog_rcv = tcp_v4_do_rcv, 3493 .release_cb = tcp_release_cb, 3494 .hash = inet_hash, 3495 .unhash = inet_unhash, 3496 .get_port = inet_csk_get_port, 3497 .put_port = inet_put_port, 3498 #ifdef CONFIG_BPF_SYSCALL 3499 .psock_update_sk_prot = tcp_bpf_update_proto, 3500 #endif 3501 .enter_memory_pressure = tcp_enter_memory_pressure, 3502 .leave_memory_pressure = tcp_leave_memory_pressure, 3503 .stream_memory_free = tcp_stream_memory_free, 3504 .sockets_allocated = &tcp_sockets_allocated, 3505 3506 .memory_allocated = &net_aligned_data.tcp_memory_allocated, 3507 .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, 3508 3509 .memory_pressure = &tcp_memory_pressure, 3510 .sysctl_mem = sysctl_tcp_mem, 3511 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), 3512 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), 3513 .max_header = MAX_TCP_HEADER, 3514 .obj_size = sizeof(struct tcp_sock), 3515 .slab_flags = SLAB_TYPESAFE_BY_RCU, 3516 .twsk_prot = &tcp_timewait_sock_ops, 3517 .rsk_prot = &tcp_request_sock_ops, 3518 .h.hashinfo = NULL, 3519 .no_autobind = true, 3520 .diag_destroy = tcp_abort, 3521 }; 3522 EXPORT_SYMBOL(tcp_prot); 3523 3524 
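/* Per-netns counterpart of tcp_sk_init(): release the reference taken
 * on the inherited congestion control module, if any.
 */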
static void __net_exit tcp_sk_exit(struct net *net) 3525 { 3526 if (net->ipv4.tcp_congestion_control) 3527 bpf_module_put(net->ipv4.tcp_congestion_control, 3528 net->ipv4.tcp_congestion_control->owner); 3529 } 3530 3531 static void __net_init tcp_set_hashinfo(struct net *net) 3532 { 3533 struct inet_hashinfo *hinfo; 3534 unsigned int ehash_entries; 3535 struct net *old_net; 3536 3537 if (net_eq(net, &init_net)) 3538 goto fallback; 3539 3540 old_net = current->nsproxy->net_ns; 3541 ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries); 3542 if (!ehash_entries) 3543 goto fallback; 3544 3545 ehash_entries = roundup_pow_of_two(ehash_entries); 3546 hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries); 3547 if (!hinfo) { 3548 pr_warn("Failed to allocate TCP ehash (entries: %u) " 3549 "for a netns, fallback to the global one\n", 3550 ehash_entries); 3551 fallback: 3552 hinfo = &tcp_hashinfo; 3553 ehash_entries = tcp_hashinfo.ehash_mask + 1; 3554 } 3555 3556 net->ipv4.tcp_death_row.hashinfo = hinfo; 3557 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2; 3558 net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128); 3559 } 3560 3561 static int __net_init tcp_sk_init(struct net *net) 3562 { 3563 net->ipv4.sysctl_tcp_ecn = TCP_ECN_IN_ECN_OUT_NOECN; 3564 net->ipv4.sysctl_tcp_ecn_option = TCP_ACCECN_OPTION_FULL; 3565 net->ipv4.sysctl_tcp_ecn_option_beacon = TCP_ACCECN_OPTION_BEACON; 3566 net->ipv4.sysctl_tcp_ecn_fallback = 1; 3567 3568 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; 3569 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS; 3570 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; 3571 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL; 3572 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS; 3573 3574 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME; 3575 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES; 3576 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL; 3577 3578 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES; 3579 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES; 3580 net->ipv4.sysctl_tcp_syncookies = 1; 3581 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH; 3582 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1; 3583 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2; 3584 net->ipv4.sysctl_tcp_orphan_retries = 0; 3585 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; 3586 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; 3587 net->ipv4.sysctl_tcp_tw_reuse = 2; 3588 net->ipv4.sysctl_tcp_tw_reuse_delay = 1 * MSEC_PER_SEC; 3589 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1; 3590 3591 refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1); 3592 tcp_set_hashinfo(net); 3593 3594 net->ipv4.sysctl_tcp_sack = 1; 3595 net->ipv4.sysctl_tcp_window_scaling = 1; 3596 net->ipv4.sysctl_tcp_timestamps = 1; 3597 net->ipv4.sysctl_tcp_early_retrans = 3; 3598 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION; 3599 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */ 3600 net->ipv4.sysctl_tcp_retrans_collapse = 1; 3601 net->ipv4.sysctl_tcp_max_reordering = 300; 3602 net->ipv4.sysctl_tcp_dsack = 1; 3603 net->ipv4.sysctl_tcp_app_win = 31; 3604 net->ipv4.sysctl_tcp_adv_win_scale = 1; 3605 net->ipv4.sysctl_tcp_frto = 2; 3606 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1; 3607 /* This limits the percentage of the congestion window which we 3608 * will allow a single TSO frame to consume. Building TSO frames 3609 * which are too large can cause TCP streams to be bursty. 
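 * A divisor of 3 caps a single TSO burst at one third of the
 * congestion window.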
3610 */ 3611 net->ipv4.sysctl_tcp_tso_win_divisor = 3; 3612 /* Default TSQ limit of 4 MB */ 3613 net->ipv4.sysctl_tcp_limit_output_bytes = 4 << 20; 3614 3615 /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */ 3616 net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX; 3617 3618 net->ipv4.sysctl_tcp_min_tso_segs = 2; 3619 net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */ 3620 net->ipv4.sysctl_tcp_min_rtt_wlen = 300; 3621 net->ipv4.sysctl_tcp_autocorking = 1; 3622 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; 3623 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; 3624 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120; 3625 if (net != &init_net) { 3626 memcpy(net->ipv4.sysctl_tcp_rmem, 3627 init_net.ipv4.sysctl_tcp_rmem, 3628 sizeof(init_net.ipv4.sysctl_tcp_rmem)); 3629 memcpy(net->ipv4.sysctl_tcp_wmem, 3630 init_net.ipv4.sysctl_tcp_wmem, 3631 sizeof(init_net.ipv4.sysctl_tcp_wmem)); 3632 } 3633 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC; 3634 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC; 3635 net->ipv4.sysctl_tcp_comp_sack_nr = 44; 3636 net->ipv4.sysctl_tcp_backlog_ack_defer = 1; 3637 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; 3638 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0; 3639 atomic_set(&net->ipv4.tfo_active_disable_times, 0); 3640 3641 /* Set default values for PLB */ 3642 net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */ 3643 net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3; 3644 net->ipv4.sysctl_tcp_plb_rehash_rounds = 12; 3645 net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60; 3646 /* Default congestion threshold for PLB to mark a round is 50% */ 3647 net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2; 3648 3649 /* Reno is always built in */ 3650 if (!net_eq(net, &init_net) && 3651 bpf_try_module_get(init_net.ipv4.tcp_congestion_control, 3652 init_net.ipv4.tcp_congestion_control->owner)) 3653 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control; 3654 else 3655 net->ipv4.tcp_congestion_control = &tcp_reno; 3656 3657 net->ipv4.sysctl_tcp_syn_linear_timeouts = 4; 3658 net->ipv4.sysctl_tcp_shrink_window = 0; 3659 3660 net->ipv4.sysctl_tcp_pingpong_thresh = 1; 3661 net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN); 3662 net->ipv4.sysctl_tcp_rto_max_ms = TCP_RTO_MAX_SEC * MSEC_PER_SEC; 3663 3664 return 0; 3665 } 3666 3667 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) 3668 { 3669 struct net *net; 3670 3671 /* make sure concurrent calls to tcp_sk_exit_batch from net_cleanup_work 3672 * and failed setup_net error unwinding path are serialized. 3673 * 3674 * tcp_twsk_purge() handles twsk in any dead netns, not just those in 3675 * net_exit_list, the thread that dismantles a particular twsk must 3676 * do so without other thread progressing to refcount_dec_and_test() of 3677 * tcp_death_row.tw_refcount. 
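 * Hence the global tcp_exit_batch_mutex taken just below.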
3678 */ 3679 mutex_lock(&tcp_exit_batch_mutex); 3680 3681 tcp_twsk_purge(net_exit_list); 3682 3683 list_for_each_entry(net, net_exit_list, exit_list) { 3684 inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo); 3685 WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount)); 3686 tcp_fastopen_ctx_destroy(net); 3687 } 3688 3689 mutex_unlock(&tcp_exit_batch_mutex); 3690 } 3691 3692 static struct pernet_operations __net_initdata tcp_sk_ops = { 3693 .init = tcp_sk_init, 3694 .exit = tcp_sk_exit, 3695 .exit_batch = tcp_sk_exit_batch, 3696 }; 3697 3698 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3699 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta, 3700 struct sock_common *sk_common, uid_t uid) 3701 3702 #define INIT_BATCH_SZ 16 3703 3704 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux) 3705 { 3706 struct bpf_tcp_iter_state *iter = priv_data; 3707 int err; 3708 3709 err = bpf_iter_init_seq_net(priv_data, aux); 3710 if (err) 3711 return err; 3712 3713 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ, GFP_USER); 3714 if (err) { 3715 bpf_iter_fini_seq_net(priv_data); 3716 return err; 3717 } 3718 3719 return 0; 3720 } 3721 3722 static void bpf_iter_fini_tcp(void *priv_data) 3723 { 3724 struct bpf_tcp_iter_state *iter = priv_data; 3725 3726 bpf_iter_fini_seq_net(priv_data); 3727 kvfree(iter->batch); 3728 } 3729 3730 static const struct bpf_iter_seq_info tcp_seq_info = { 3731 .seq_ops = &bpf_iter_tcp_seq_ops, 3732 .init_seq_private = bpf_iter_init_tcp, 3733 .fini_seq_private = bpf_iter_fini_tcp, 3734 .seq_priv_size = sizeof(struct bpf_tcp_iter_state), 3735 }; 3736 3737 static const struct bpf_func_proto * 3738 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id, 3739 const struct bpf_prog *prog) 3740 { 3741 switch (func_id) { 3742 case BPF_FUNC_setsockopt: 3743 return &bpf_sk_setsockopt_proto; 3744 case BPF_FUNC_getsockopt: 3745 return &bpf_sk_getsockopt_proto; 3746 default: 3747 return NULL; 3748 } 3749 } 3750 3751 static struct bpf_iter_reg tcp_reg_info = { 3752 .target = "tcp", 3753 .ctx_arg_info_size = 1, 3754 .ctx_arg_info = { 3755 { offsetof(struct bpf_iter__tcp, sk_common), 3756 PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED }, 3757 }, 3758 .get_func_proto = bpf_iter_tcp_get_func_proto, 3759 .seq_info = &tcp_seq_info, 3760 }; 3761 3762 static void __init bpf_iter_register(void) 3763 { 3764 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON]; 3765 if (bpf_iter_reg_target(&tcp_reg_info)) 3766 pr_warn("Warning: could not register bpf iterator tcp\n"); 3767 } 3768 3769 #endif 3770 3771 void __init tcp_v4_init(void) 3772 { 3773 int cpu, res; 3774 3775 for_each_possible_cpu(cpu) { 3776 struct sock *sk; 3777 3778 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW, 3779 IPPROTO_TCP, &init_net); 3780 if (res) 3781 panic("Failed to create the TCP control socket.\n"); 3782 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); 3783 3784 /* Please enforce IP_DF and IPID==0 for RST and 3785 * ACK sent in SYN-RECV and TIME-WAIT state. 3786 */ 3787 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO; 3788 3789 sk->sk_clockid = CLOCK_MONOTONIC; 3790 3791 per_cpu(ipv4_tcp_sk.sock, cpu) = sk; 3792 } 3793 if (register_pernet_subsys(&tcp_sk_ops)) 3794 panic("Failed to create the TCP control socket.\n"); 3795 3796 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3797 bpf_iter_register(); 3798 #endif 3799 } 3800