// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *		Juan Jose Ciarlante:	ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *		Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *		Andi Kleen		:	Fix new listen.
 *		Andi Kleen		:	Fix accept error reporting.
 *		YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *		Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *						a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sock_diag.h>

#include <net/aligned_data.h>
#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/inet_ecn.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>
#include <net/rstreason.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/inetdevice.h>
#include <linux/btf_ids.h>
#include <linux/skbuff_ref.h>

#include <crypto/hash.h>
#include <linux/scatterlist.h>

#include <trace/events/tcp.h>

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;

static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = {
	.bh_lock = INIT_LOCAL_LOCK(bh_lock),
};

static DEFINE_MUTEX(tcp_exit_batch_mutex);

static u32 tcp_v4_init_seq(const struct sk_buff *skb)
{
	return secure_tcp_seq(ip_hdr(skb)->daddr,
			      ip_hdr(skb)->saddr,
			      tcp_hdr(skb)->dest,
			      tcp_hdr(skb)->source);
}

static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
{
	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
}

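/* Called from the connect() path when the chosen 4-tuple collides with an
 * existing TIME-WAIT bucket: decide whether that bucket can safely be reused
 * for the new connection.  Returns 1 (and takes a reference on the timewait
 * socket) if the caller may proceed, 0 otherwise.
 */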
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
	const struct inet_timewait_sock *tw = inet_twsk(sktw);
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);
	int ts_recent_stamp;
	u32 reuse_thresh;

	if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2)
		reuse = 0;

	if (reuse == 2) {
		/* Still does not detect *everything* that goes through
		 * lo, since we require a loopback src or dst address
		 * or direct binding to 'lo' interface.
		 */
		bool loopback = false;
		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
			loopback = true;
#if IS_ENABLED(CONFIG_IPV6)
		if (tw->tw_family == AF_INET6) {
			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
				loopback = true;
		} else
#endif
		{
			if (ipv4_is_loopback(tw->tw_daddr) ||
			    ipv4_is_loopback(tw->tw_rcv_saddr))
				loopback = true;
		}
		if (!loopback)
			reuse = 0;
	}

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp);
	reuse_thresh = READ_ONCE(tw->tw_entry_stamp) +
		       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse_delay);
	if (ts_recent_stamp &&
	    (!twp || (reuse && time_after32(tcp_clock_ms(), reuse_thresh)))) {
		/* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk
		 * and releasing the bucket lock.
		 */
		if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
			return 0;

		/* In case of repair and re-using TIME-WAIT sockets we still
		 * want to be sure that it is safe as above but honor the
		 * sequence numbers and time stamps set as part of the repair
		 * process.
		 *
		 * Without this check re-using a TIME-WAIT socket with TCP
		 * repair would accumulate a -1 on the repair assigned
		 * sequence number. The first time it is reused the sequence
		 * is -1, the second time -2, etc. This fixes that issue
		 * without appearing to create any others.
		 */
		if (likely(!tp->repair)) {
			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;

			if (!seq)
				seq = 1;
			WRITE_ONCE(tp->write_seq, seq);
			tp->rx_opt.ts_recent = READ_ONCE(tcptw->tw_ts_recent);
			tp->rx_opt.ts_recent_stamp = ts_recent_stamp;
		}

		return 1;
	}

	return 0;
}
EXPORT_IPV6_MOD_GPL(tcp_twsk_unique);

static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
			      int addr_len)
{
	/* This check is replicated from tcp_v4_connect() and intended to
	 * prevent BPF program called below from accessing bytes that are out
	 * of the bound specified by user in addr_len.
	 */
	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	sock_owned_by_me(sk);

	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
}

/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_timewait_death_row *tcp_death_row;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct ip_options_rcu *inet_opt;
	struct net *net = sock_net(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
			      orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

	if (!inet->inet_saddr) {
		err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET);
		if (err) {
			ip_rt_put(rt);
			return err;
		}
	} else {
		sk_rcv_saddr_set(sk, inet->inet_saddr);
	}

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			WRITE_ONCE(tp->write_seq, 0);
	}

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the socket
	 * lock, select a source port, enter ourselves into the hash tables
	 * and complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst);
	/* OK, now commit destination to socket. */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	rt = NULL;

	if (likely(!tp->repair)) {
		if (!tp->write_seq)
			WRITE_ONCE(tp->write_seq,
				   secure_tcp_seq(inet->inet_saddr,
						  inet->inet_daddr,
						  inet->inet_sport,
						  usin->sin_port));
		WRITE_ONCE(tp->tsoffset,
			   secure_tcp_ts_off(net, inet->inet_saddr,
					     inet->inet_daddr));
	}

	atomic_set(&inet->inet_id, get_random_u16());

	if (tcp_fastopen_defer_connect(sk, &err))
		return err;
	if (err)
		goto failure;

	err = tcp_connect(sk);

	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	inet_bhash2_reset_saddr(sk);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_IPV6_MOD(tcp_v4_connect);

/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct dst_entry *dst;
	u32 mtu;

	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
		return;
	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to go wrong... Remember the soft error
	 * in case this connection is not able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_IPV6_MOD(tcp_v4_mtu_reduced);

static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}


/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);

	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	 */
	if (seq != tcp_rsk(req)->snt_isn) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
	} else if (abort) {
		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		tcp_listendrop(req->rsk_listener);
	}
	reqsk_put(req);
}
EXPORT_IPV6_MOD(tcp_req_err);

/* TCP-LD (RFC 6069) logic */
void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	s32 remaining;
	u32 delta_us;

	if (sock_owned_by_user(sk))
		return;

	if (seq != tp->snd_una || !icsk->icsk_retransmits ||
	    !icsk->icsk_backoff)
		return;

	skb = tcp_rtx_queue_head(sk);
	if (WARN_ON_ONCE(!skb))
		return;

	icsk->icsk_backoff--;
	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
	icsk->icsk_rto = inet_csk_rto_backoff(icsk, tcp_rto_max(sk));

	tcp_mstamp_refresh(tp);
	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);

	if (remaining > 0) {
		tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, remaining, false);
	} else {
		/* RTO revert clocked out retransmission.
		 * Will retransmit now.
		 */
		tcp_retransmit_timer(sk);
	}
}
EXPORT_IPV6_MOD(tcp_ld_RTO_revert);

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

int tcp_v4_err(struct sk_buff *skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct net *net = dev_net_rcu(skb->dev);
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct request_sock *fastopen;
	struct tcp_sock *tp;
	u32 seq, snd_una;
	struct sock *sk;
	int err;

	sk = __inet_lookup_established(net, iph->daddr, th->dest, iph->saddr,
				       ntohs(th->source), inet_iif(skb), 0);
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return -ENOENT;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		/* To increase the counter of ignored icmps for TCP-AO */
		tcp_ao_ignore_icmp(sk, AF_INET, type, code);
		inet_twsk_put(inet_twsk(sk));
		return 0;
	}
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
				     type == ICMP_TIME_EXCEEDED ||
				     (type == ICMP_DEST_UNREACH &&
				      (code == ICMP_NET_UNREACH ||
				       code == ICMP_HOST_UNREACH)));
		return 0;
	}

	if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) {
		sock_put(sk);
		return 0;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case:
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (static_branch_unlikely(&ip4_min_ttl)) {
		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
			goto out;
		}
	}

	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = rcu_dereference(tp->fastopen_rsk);
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		if (!sock_owned_by_user(sk))
			do_redirect(skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs sent out by Linux are always <576 bytes, so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			WRITE_ONCE(tp->mtu_info, info);
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if this ICMP message allows revert of backoff.
		 * (see RFC 6069)
		 */
		if (!fastopen &&
		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
			tcp_ld_RTO_revert(sk, seq);
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket is
		 * already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);

		if (!sock_owned_by_user(sk))
			tcp_done_with_error(sk, err);
		else
			WRITE_ONCE(sk->sk_err_soft, err);
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows considering only PROTO_UNREACH and
	 * PORT_UNREACH as hard errors (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters, even these two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 * --ANK (980905)
	 */

	if (!sock_owned_by_user(sk) &&
	    inet_test_bit(RECVERR, sk)) {
		WRITE_ONCE(sk->sk_err, err);
		sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		WRITE_ONCE(sk->sk_err_soft, err);
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
	return 0;
}

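/* Prepare @skb for checksum offload: seed th->check with the pseudo-header
 * checksum and record where the device (or software fallback) must write the
 * final TCP checksum.
 */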
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
	skb->csum_start = skb_transport_header(skb) - skb->head;
	skb->csum_offset = offsetof(struct tcphdr, check);
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_IPV6_MOD(tcp_v4_send_check);

#define REPLY_OPTIONS_LEN      (MAX_TCP_OPTION_SPACE / sizeof(__be32))

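/* Append a TCP-AO option to an outgoing RST and sign it with the key matching
 * the incoming segment.  Returns true if the RST must be dropped (no usable
 * key, or hashing failed), false once the option has been filled in.
 */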
static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb,
				 const struct tcp_ao_hdr *aoh,
				 struct ip_reply_arg *arg, struct tcphdr *reply,
				 __be32 reply_options[REPLY_OPTIONS_LEN])
{
#ifdef CONFIG_TCP_AO
	int sdif = tcp_v4_sdif(skb);
	int dif = inet_iif(skb);
	int l3index = sdif ? dif : 0;
	bool allocated_traffic_key;
	struct tcp_ao_key *key;
	char *traffic_key;
	bool drop = true;
	u32 ao_sne = 0;
	u8 keyid;

	rcu_read_lock();
	if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq),
				 &key, &traffic_key, &allocated_traffic_key,
				 &keyid, &ao_sne))
		goto out;

	reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) |
				 (aoh->rnext_keyid << 8) | keyid);
	arg->iov[0].iov_len += tcp_ao_len_aligned(key);
	reply->doff = arg->iov[0].iov_len / 4;

	if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1],
			    key, traffic_key,
			    (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
			    (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
			    reply, ao_sne))
		goto out;
	drop = false;
out:
	rcu_read_unlock();
	if (allocated_traffic_key)
		kfree(traffic_key);
	return drop;
#else
	return true;
#endif
}

/*
 * This routine will send an RST to the other tcp.
 *
 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		 for reset.
 * Answer: if a packet caused RST, it is not for a socket
 *	   existing in our system, if it is matched to a socket,
 *	   it is just duplicate segment or bug in other side's TCP.
 * So we build the reply based only on the parameters
 * that arrived with the segment.
 * Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
			      enum sk_rst_reason reason)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[REPLY_OPTIONS_LEN];
	} rep;
	const __u8 *md5_hash_location = NULL;
	const struct tcp_ao_hdr *aoh;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	unsigned char newhash[16];
	struct sock *sk1 = NULL;
	int genhash;
#endif
	u64 transmit_time = 0;
	struct sock *ctl_sk;
	struct net *net;
	u32 txhash = 0;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest = th->source;
	rep.th.source = th->dest;
	rep.th.doff = sizeof(struct tcphdr) / 4;
	rep.th.rst = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

	net = sk ? sock_net(sk) : skb_dst_dev_net_rcu(skb);

	/* Invalid TCP option size or twice included auth */
	if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh))
		return;

	if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt))
		return;

#ifdef CONFIG_TCP_MD5SIG
	rcu_read_lock();
	if (sk && sk_fullsock(sk)) {
		const union tcp_md5_addr *addr;
		int l3index;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and inet_iif is set to it.
		 */
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	} else if (md5_hash_location) {
		const union tcp_md5_addr *addr;
		int sdif = tcp_v4_sdif(skb);
		int dif = inet_iif(skb);
		int l3index;

		/*
		 * active side is lost. Try to find listening socket through
		 * source port, and then find md5 key through listening socket.
		 * we do not lose security here:
		 * Incoming packet is checked with md5 hash with finding key,
		 * no RST generated if md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(net, NULL, 0, ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), dif, sdif);
		/* don't send rst if it can't find key */
		if (!sk1)
			goto out;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and dif is set to it.
		 */
		l3index = sdif ? dif : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
		if (!key)
			goto out;


		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (genhash || memcmp(md5_hash_location, newhash, 16) != 0)
			goto out;

	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
	if (rep.opt[0] == 0) {
		__be32 mrst = mptcp_reset_option(skb);

		if (mrst) {
			rep.opt[0] = mrst;
			arg.iov[0].iov_len += sizeof(mrst);
			rep.th.doff = arg.iov[0].iov_len / 4;
		}
	}

	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
	if (sk)
		arg.bound_dev_if = sk->sk_bound_dev_if;

	trace_tcp_send_reset(sk, skb, reason);

	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

	/* ECN bits of TW reset are cleared */
	arg.tos = ip_hdr(skb)->tos & ~INET_ECN_MASK;
	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);

	sock_net_set(ctl_sk, net);
	if (sk) {
		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
		transmit_time = tcp_transmit_time(sk);
		xfrm_sk_clone_policy(ctl_sk, sk);
		txhash = (sk->sk_state == TCP_TIME_WAIT) ?
			 inet_twsk(sk)->tw_txhash : sk->sk_txhash;
	} else {
		ctl_sk->sk_mark = 0;
		ctl_sk->sk_priority = 0;
	}
	ip_send_unicast_reply(ctl_sk, sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time, txhash);

	xfrm_sk_free_policy(ctl_sk);
	sock_net_set(ctl_sk, &init_net);
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
	local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
	rcu_read_unlock();
#endif
}

/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
 * outside socket context, is certainly ugly. What can I do?
 */

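/* Build and send a bare ACK (optionally carrying timestamp and MD5/AO
 * options) on the per-CPU control socket, echoing the addressing of @skb.
 * Used for ACKs sent on behalf of TIME-WAIT and request sockets.
 */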
static void tcp_v4_send_ack(const struct sock *sk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_key *key,
			    int reply_flags, u8 tos, u32 txhash)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(MAX_TCP_OPTION_SPACE >> 2)];
	} rep;
	struct net *net = sock_net(sk);
	struct ip_reply_arg arg;
	struct sock *ctl_sk;
	u64 transmit_time;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest = th->source;
	rep.th.source = th->dest;
	rep.th.doff = arg.iov[0].iov_len / 4;
	rep.th.seq = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack = 1;
	rep.th.window = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (tcp_key_is_md5(key)) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key->md5_key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
#ifdef CONFIG_TCP_AO
	if (tcp_key_is_ao(key)) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_AO << 24) |
					  (tcp_ao_len(key->ao_key) << 16) |
					  (key->ao_key->sndid << 8) |
					  key->rcv_next);
		arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key);
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset],
				key->ao_key, key->traffic_key,
				(union tcp_ao_addr *)&ip_hdr(skb)->saddr,
				(union tcp_ao_addr *)&ip_hdr(skb)->daddr,
				&rep.th, key->sne);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
	sock_net_set(ctl_sk, net);
	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
	transmit_time = tcp_transmit_time(sk);
	ip_send_unicast_reply(ctl_sk, sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time, txhash);

	sock_net_set(ctl_sk, &init_net);
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
	local_bh_enable();
}

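/* Answer a segment that hit a TIME-WAIT socket: acknowledge it using the
 * sequence/timestamp state kept in the timewait bucket, signing with TCP-AO
 * or MD5 when the timewait socket has a matching key.
 */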
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb,
				enum tcp_tw_status tw_status)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
	struct tcp_key key = {};
	u8 tos = tw->tw_tos;

	/* Clean only the ECN bits of TW ACKs for out-of-window data or
	 * PAWS-rejected segments, while not cleaning the ECN bits of other
	 * TW ACKs, to avoid these ACKs being placed in a different service
	 * queue (Classic rather than L4S).
	 */
	if (tw_status == TCP_TW_ACK_OOW)
		tos &= ~INET_ECN_MASK;

#ifdef CONFIG_TCP_AO
	struct tcp_ao_info *ao_info;

	if (static_branch_unlikely(&tcp_ao_needed.key)) {
		/* FIXME: the segment to-be-acked is not verified yet */
		ao_info = rcu_dereference(tcptw->ao_info);
		if (ao_info) {
			const struct tcp_ao_hdr *aoh;

			if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) {
				inet_twsk_put(tw);
				return;
			}

			if (aoh)
				key.ao_key = tcp_ao_established_key(sk, ao_info,
								    aoh->rnext_keyid, -1);
		}
	}
	if (key.ao_key) {
		struct tcp_ao_key *rnext_key;

		key.traffic_key = snd_other_key(key.ao_key);
		key.sne = READ_ONCE(ao_info->snd_sne);
		rnext_key = READ_ONCE(ao_info->rnext_key);
		key.rcv_next = rnext_key->rcvid;
		key.type = TCP_KEY_AO;
#else
	if (0) {
#endif
	} else if (static_branch_tcp_md5()) {
		key.md5_key = tcp_twsk_md5_key(tcptw);
		if (key.md5_key)
			key.type = TCP_KEY_MD5;
	}

	tcp_v4_send_ack(sk, skb,
			tcptw->tw_snd_nxt, READ_ONCE(tcptw->tw_rcv_nxt),
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_tw_tsval(tcptw),
			READ_ONCE(tcptw->tw_ts_recent),
			tw->tw_bound_dev_if, &key,
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tos,
			tw->tw_txhash);

	inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	struct tcp_key key = {};

	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
						 tcp_sk(sk)->snd_nxt;

#ifdef CONFIG_TCP_AO
	if (static_branch_unlikely(&tcp_ao_needed.key) &&
	    tcp_rsk_used_ao(req)) {
		const union tcp_md5_addr *addr;
		const struct tcp_ao_hdr *aoh;
		int l3index;

		/* Invalid TCP option size or twice included auth */
		if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
			return;
		if (!aoh)
			return;

		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET,
					      aoh->rnext_keyid, -1);
		if (unlikely(!key.ao_key)) {
			/* Send ACK with any matching MKT for the peer */
			key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1);
			/* Matching key disappeared (user removed the key?)
			 * let the handshake timeout.
			 */
			if (!key.ao_key) {
				net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n",
						     addr,
						     ntohs(tcp_hdr(skb)->source),
						     &ip_hdr(skb)->daddr,
						     ntohs(tcp_hdr(skb)->dest));
				return;
			}
		}
		key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC);
		if (!key.traffic_key)
			return;

		key.type = TCP_KEY_AO;
		key.rcv_next = aoh->keyid;
		tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req);
#else
	if (0) {
#endif
	} else if (static_branch_tcp_md5()) {
		const union tcp_md5_addr *addr;
		int l3index;

		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
		if (key.md5_key)
			key.type = TCP_KEY_MD5;
	}

	/* Clean the ECN bits of ACKs for out-of-window data or PAWS-rejected segments */
	tcp_v4_send_ack(sk, skb, seq,
			tcp_rsk(req)->rcv_nxt,
			tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale,
			tcp_rsk_tsval(tcp_rsk(req)),
			req->ts_recent,
			0, &key,
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos & ~INET_ECN_MASK,
			READ_ONCE(tcp_rsk(req)->txhash));
	if (tcp_key_is_ao(&key))
		kfree(key.traffic_key);
}

/*
 * Send a SYN-ACK after having received a SYN.
 * This still operates on a request_sock only, not on a big
 * socket.
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
			      struct flowi *fl,
			      struct request_sock *req,
			      struct tcp_fastopen_cookie *foc,
			      enum tcp_synack_type synack_type,
			      struct sk_buff *syn_skb)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;
	u8 tos;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

		tos = READ_ONCE(inet_sk(sk)->tos);

		if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
			tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
			      (tos & INET_ECN_MASK);

		if (!INET_ECN_is_capable(tos) &&
		    tcp_bpf_ca_needs_ecn((struct sock *)req))
			tos |= INET_ECN_ECT_0;

		rcu_read_lock();
		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
					    rcu_dereference(ireq->ireq_opt),
					    tos);
		rcu_read_unlock();
		err = net_xmit_eval(err);
	}

	return err;
}

/*
 * IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
EXPORT_IPV6_MOD(tcp_md5_needed);

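/* When several configured keys match a peer address, prefer a key bound to an
 * L3 (VRF) device over an unbound one, and among equally bound keys prefer the
 * one with the longest prefix.
 */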
static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
{
	if (!old)
		return true;

	/* l3index always overrides non-l3index */
	if (old->l3index && new->l3index == 0)
		return false;
	if (old->l3index == 0 && new->l3index)
		return true;

	return old->prefixlen < new->prefixlen;
}

/* Find the Key structure for an address.  */
struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
					   const union tcp_md5_addr *addr,
					   int family, bool any_l3index)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	const struct tcp_md5sig_info *md5sig;
	__be32 mask;
	struct tcp_md5sig_key *best_match = NULL;
	bool match;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;

	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX &&
		    key->l3index != l3index)
			continue;
		if (family == AF_INET) {
			mask = inet_make_mask(key->prefixlen);
			match = (key->addr.a4.s_addr & mask) ==
				(addr->a4.s_addr & mask);
#if IS_ENABLED(CONFIG_IPV6)
		} else if (family == AF_INET6) {
			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
						  key->prefixlen);
#endif
		} else {
			match = false;
		}

		if (match && better_md5_match(best_match, key))
			best_match = key;
	}
	return best_match;
}
EXPORT_IPV6_MOD(__tcp_md5_do_lookup);

static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
						      const union tcp_md5_addr *addr,
						      int family, u8 prefixlen,
						      int l3index, u8 flags)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	const struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
			continue;
		if (key->l3index != l3index)
			continue;
		if (!memcmp(&key->addr, addr, size) &&
		    key->prefixlen == prefixlen)
			return key;
	}
	return NULL;
}

struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
					 const struct sock *addr_sk)
{
	const union tcp_md5_addr *addr;
	int l3index;

	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
						 addr_sk->sk_bound_dev_if);
	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
}
EXPORT_IPV6_MOD(tcp_v4_md5_lookup);

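/* Allocate the per-socket MD5 key list head (tp->md5sig_info) on first use.
 * GSO is disabled here since each outgoing segment must carry its own MD5
 * signature.
 */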
static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	md5sig = kmalloc(sizeof(*md5sig), gfp);
	if (!md5sig)
		return -ENOMEM;

	sk_gso_disable(sk);
	INIT_HLIST_HEAD(&md5sig->head);
	rcu_assign_pointer(tp->md5sig_info, md5sig);
	return 0;
}

/* This can be called on a newly created socket, from other files */
static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
			    int family, u8 prefixlen, int l3index, u8 flags,
			    const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
	if (key) {
		/* Pre-existing entry - just update that one.
		 * Note that the key might be used concurrently.
		 * data_race() is telling kcsan that we do not care about
		 * key mismatches, since changing MD5 key on live flows
		 * can lead to packet drops.
		 */
		data_race(memcpy(key->key, newkey, newkeylen));

		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
		 * Also note that a reader could catch new key->keylen value
		 * but old key->key[], this is the reason we use __GFP_ZERO
		 * at sock_kmalloc() time below these lines.
		 */
		WRITE_ONCE(key->keylen, newkeylen);

		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   lockdep_sock_is_held(sk));

	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
	if (!key)
		return -ENOMEM;

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	key->prefixlen = prefixlen;
	key->l3index = l3index;
	key->flags = flags;
	memcpy(&key->addr, addr,
	       (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
								 sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}

int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, u8 prefixlen, int l3index, u8 flags,
		   const u8 *newkey, u8 newkeylen)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
		if (tcp_md5_alloc_sigpool())
			return -ENOMEM;

		if (tcp_md5sig_info_add(sk, GFP_KERNEL)) {
			tcp_md5_release_sigpool();
			return -ENOMEM;
		}

		if (!static_branch_inc(&tcp_md5_needed.key)) {
			struct tcp_md5sig_info *md5sig;

			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
			rcu_assign_pointer(tp->md5sig_info, NULL);
			kfree_rcu(md5sig, rcu);
			tcp_md5_release_sigpool();
			return -EUSERS;
		}
	}

	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
				newkey, newkeylen, GFP_KERNEL);
}
EXPORT_IPV6_MOD(tcp_md5_do_add);

int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
		     int family, u8 prefixlen, int l3index,
		     struct tcp_md5sig_key *key)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
		tcp_md5_add_sigpool();

		if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC))) {
			tcp_md5_release_sigpool();
			return -ENOMEM;
		}

		if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
			struct tcp_md5sig_info *md5sig;

			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
			net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
			rcu_assign_pointer(tp->md5sig_info, NULL);
			kfree_rcu(md5sig, rcu);
			tcp_md5_release_sigpool();
			return -EUSERS;
		}
	}

	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
				key->flags, key->key, key->keylen,
				sk_gfp_mask(sk, GFP_ATOMIC));
}
EXPORT_IPV6_MOD(tcp_md5_key_copy);

int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
		   u8 prefixlen, int l3index, u8 flags)
{
	struct tcp_md5sig_key *key;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	return 0;
}
EXPORT_IPV6_MOD(tcp_md5_do_del);

void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
	}
}

static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
				 sockptr_t optval, int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
	const union tcp_md5_addr *addr;
	u8 prefixlen = 32;
	int l3index = 0;
	bool l3flag;
	u8 flags;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
	l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;

	if (optname == TCP_MD5SIG_EXT &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
		prefixlen = cmd.tcpm_prefixlen;
		if (prefixlen > 32)
			return -EINVAL;
	}

	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
		if (dev && netif_is_l3_master(dev))
			l3index = dev->ifindex;

		rcu_read_unlock();

		/* ok to reference set/not set outside of rcu;
		 * right now device MUST be an L3 master
		 */
		if (!dev || !l3index)
			return -EINVAL;
	}

	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;

	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	/* Don't allow keys for peers that have a matching TCP-AO key.
	 * See the comment in tcp_ao_add_cmd()
	 */
	if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false))
		return -EKEYREJECTED;

	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
			      cmd.tcpm_key, cmd.tcpm_keylen);
}

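/* Feed the IPv4 pseudo-header plus a copy of the TCP header (with its checksum
 * field zeroed) into the in-progress MD5 digest held in @hp.
 */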
static int tcp_v4_md5_hash_headers(struct tcp_sigpool *hp,
				   __be32 daddr, __be32 saddr,
				   const struct tcphdr *th, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;
	struct tcphdr *_th;

	bp = hp->scratch;
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	_th = (struct tcphdr *)(bp + 1);
	memcpy(_th, th, sizeof(*th));
	_th->check = 0;

	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
	ahash_request_set_crypt(hp->req, &sg, NULL,
				sizeof(*bp) + sizeof(*th));
	return crypto_ahash_update(hp->req);
}

static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_sigpool hp;

	if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
		goto clear_hash_nostart;

	if (crypto_ahash_init(hp.req))
		goto clear_hash;
	if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(&hp, key))
		goto clear_hash;
	ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
	if (crypto_ahash_final(hp.req))
		goto clear_hash;

	tcp_sigpool_end(&hp);
	return 0;

clear_hash:
	tcp_sigpool_end(&hp);
clear_hash_nostart:
	memset(md5_hash, 0, 16);
	return 1;
}

int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
			const struct sock *sk,
			const struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct tcp_sigpool hp;
	__be32 saddr, daddr;

	if (sk) { /* valid for establish/request sockets */
		saddr = sk->sk_rcv_saddr;
		daddr = sk->sk_daddr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
		goto clear_hash_nostart;

	if (crypto_ahash_init(hp.req))
		goto clear_hash;

	if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, skb->len))
		goto clear_hash;
	if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(&hp, key))
		goto clear_hash;
	ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
	if (crypto_ahash_final(hp.req))
		goto clear_hash;

	tcp_sigpool_end(&hp);
	return 0;

clear_hash:
	tcp_sigpool_end(&hp);
clear_hash_nostart:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_IPV6_MOD(tcp_v4_md5_hash_skb);

#endif

static void tcp_v4_init_req(struct request_sock *req,
			    const struct sock *sk_listener,
			    struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	struct net *net = sock_net(sk_listener);

	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
}

static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
					  struct sk_buff *skb,
					  struct flowi *fl,
					  struct request_sock *req,
					  u32 tw_isn)
{
	tcp_v4_init_req(req, sk, skb);

	if (security_inet_conn_request(sk, skb, req))
		return NULL;

	return inet_csk_route_req(sk, &fl->u.ip4, req);
}

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};

const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.req_md5_lookup	=	tcp_v4_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#endif
#ifdef CONFIG_TCP_AO
	.ao_lookup	=	tcp_v4_ao_lookup_rsk,
	.ao_calc_key	=	tcp_v4_ao_calc_key_rsk,
	.ao_synack_hash	=	tcp_v4_ao_synack_hash,
#endif
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
#endif
	.route_req	=	tcp_v4_route_req,
	.init_seq	=	tcp_v4_init_seq,
	.init_ts_off	=	tcp_v4_init_ts_off,
	.send_synack	=	tcp_v4_send_synack,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&tcp_request_sock_ops,
				&tcp_request_sock_ipv4_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0;
}
EXPORT_IPV6_MOD(tcp_v4_conn_request);


/*
 * The three way handshake has completed - we got a valid ACK from the
 * remote peer - now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req)
{
	struct inet_request_sock *ireq;
	bool found_dup_sk = false;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	const union tcp_md5_addr *addr;
	struct tcp_md5sig_key *key;
	int l3index;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp = tcp_sk(newsk);
	newinet = inet_sk(newsk);
	ireq = inet_rsk(req);
	inet_opt = rcu_dereference(ireq->ireq_opt);
	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
	newinet->mc_index = inet_iif(skb);
	newinet->mc_ttl = ip_hdr(skb)->ttl;
	newinet->rcv_tos = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	atomic_set(&newinet->inet_id, get_random_u16());

	/* Set ToS of the new socket based upon the value of incoming SYN.
	 * ECT bits are set later in tcp_init_transfer().
	 */
	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
	/* Copy over the MD5 key from the original socket */
	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	if (key && !tcp_rsk_used_ao(req)) {
		if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
			goto put_and_exit;
		sk_gso_disable(newsk);
	}
#endif
#ifdef CONFIG_TCP_AO
	if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET))
		goto put_and_exit; /* OOM, release back memory */
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
				       &found_dup_sk);
	if (likely(*own_req)) {
		tcp_move_syn(newtp, req);
		ireq->ireq_opt = NULL;
	} else {
		newinet->inet_opt = NULL;

		if (!req_unhash && found_dup_sk) {
			/* This code path should only be executed in the
			 * syncookie case
			 */
			bh_unlock_sock(newsk);
			sock_put(newsk);
			newsk = NULL;
		}
	}
	return newsk;

exit_overflow:
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	tcp_listendrop(sk);
	return NULL;
put_and_exit:
	newinet->inet_opt = NULL;
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_IPV6_MOD(tcp_v4_syn_recv_sock);

static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	const struct tcphdr *th = tcp_hdr(skb);

	if (!th->syn)
		sk = cookie_v4_check(sk, skb);
#endif
	return sk;
}

u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
			 struct tcphdr *th, u32 *cookie)
{
	u16 mss = 0;
#ifdef CONFIG_SYN_COOKIES
	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
				    &tcp_request_sock_ipv4_ops, sk, th);
	if (mss) {
		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
		tcp_synq_overflow(sk);
	}
#endif
	return mss;
}

INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
							   u32));
/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	enum skb_drop_reason reason;
	struct sock *rsk;

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst;

		dst = rcu_dereference_protected(sk->sk_rx_dst,
						lockdep_sock_is_held(sk));

		sock_rps_save_rxhash(sk, skb);
		sk_mark_napi_id(sk, skb);
		if (dst) {
			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
					     dst, 0)) {
				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
				dst_release(dst);
			}
		}
		tcp_rcv_established(sk, skb);
		return 0;
	}

	if (tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_cookie_check(sk, skb);

		if (!nsk)
			return 0;
		if (nsk != sk) {
			reason = tcp_child_process(sk, nsk, skb);
			if (reason) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	reason = tcp_rcv_state_process(sk, skb);
	if (reason) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason));
discard:
	sk_skb_reason_drop(sk, skb, reason);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	reason = SKB_DROP_REASON_TCP_CSUM;
	trace_tcp_bad_csum(skb);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);

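/* Early demux: look up an established socket for the incoming packet before
 * the routing decision, so that its cached receive dst (if still valid) can
 * be reused and a later full lookup avoided.
 */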
1962 */ 1963 return 0; 1964 1965 csum_err: 1966 reason = SKB_DROP_REASON_TCP_CSUM; 1967 trace_tcp_bad_csum(skb); 1968 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 1969 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 1970 goto discard; 1971 } 1972 EXPORT_SYMBOL(tcp_v4_do_rcv); 1973 1974 int tcp_v4_early_demux(struct sk_buff *skb) 1975 { 1976 struct net *net = dev_net_rcu(skb->dev); 1977 const struct iphdr *iph; 1978 const struct tcphdr *th; 1979 struct sock *sk; 1980 1981 if (skb->pkt_type != PACKET_HOST) 1982 return 0; 1983 1984 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) 1985 return 0; 1986 1987 iph = ip_hdr(skb); 1988 th = tcp_hdr(skb); 1989 1990 if (th->doff < sizeof(struct tcphdr) / 4) 1991 return 0; 1992 1993 sk = __inet_lookup_established(net, iph->saddr, th->source, 1994 iph->daddr, ntohs(th->dest), 1995 skb->skb_iif, inet_sdif(skb)); 1996 if (sk) { 1997 skb->sk = sk; 1998 skb->destructor = sock_edemux; 1999 if (sk_fullsock(sk)) { 2000 struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst); 2001 2002 if (dst) 2003 dst = dst_check(dst, 0); 2004 if (dst && 2005 sk->sk_rx_dst_ifindex == skb->skb_iif) 2006 skb_dst_set_noref(skb, dst); 2007 } 2008 } 2009 return 0; 2010 } 2011 2012 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, 2013 enum skb_drop_reason *reason) 2014 { 2015 u32 tail_gso_size, tail_gso_segs; 2016 struct skb_shared_info *shinfo; 2017 const struct tcphdr *th; 2018 struct tcphdr *thtail; 2019 struct sk_buff *tail; 2020 unsigned int hdrlen; 2021 bool fragstolen; 2022 u32 gso_segs; 2023 u32 gso_size; 2024 u64 limit; 2025 int delta; 2026 int err; 2027 2028 /* In case all data was pulled from skb frags (in __pskb_pull_tail()), 2029 * we can fix skb->truesize to its real value to avoid future drops. 2030 * This is valid because skb is not yet charged to the socket. 2031 * It has been noticed pure SACK packets were sometimes dropped 2032 * (if cooked by drivers without copybreak feature). 2033 */ 2034 skb_condense(skb); 2035 2036 tcp_cleanup_skb(skb); 2037 2038 if (unlikely(tcp_checksum_complete(skb))) { 2039 bh_unlock_sock(sk); 2040 trace_tcp_bad_csum(skb); 2041 *reason = SKB_DROP_REASON_TCP_CSUM; 2042 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 2043 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 2044 return true; 2045 } 2046 2047 /* Attempt coalescing to last skb in backlog, even if we are 2048 * above the limits. 2049 * This is okay because skb capacity is limited to MAX_SKB_FRAGS. 
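 * Coalescing only happens when the new segment starts exactly at the
 * tail's end_seq, carries the same DS field and the same TCP options,
 * has ACK set on both segments with matching ECE/CWR/AE bits, and has
 * no SYN/RST/URG bit set; anything else falls through to no_coalesce.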
2050 */ 2051 th = (const struct tcphdr *)skb->data; 2052 hdrlen = th->doff * 4; 2053 2054 tail = sk->sk_backlog.tail; 2055 if (!tail) 2056 goto no_coalesce; 2057 thtail = (struct tcphdr *)tail->data; 2058 2059 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq || 2060 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield || 2061 ((TCP_SKB_CB(tail)->tcp_flags | 2062 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) || 2063 !((TCP_SKB_CB(tail)->tcp_flags & 2064 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) || 2065 ((TCP_SKB_CB(tail)->tcp_flags ^ 2066 TCP_SKB_CB(skb)->tcp_flags) & 2067 (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE)) || 2068 !tcp_skb_can_collapse_rx(tail, skb) || 2069 thtail->doff != th->doff || 2070 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th))) 2071 goto no_coalesce; 2072 2073 __skb_pull(skb, hdrlen); 2074 2075 shinfo = skb_shinfo(skb); 2076 gso_size = shinfo->gso_size ?: skb->len; 2077 gso_segs = shinfo->gso_segs ?: 1; 2078 2079 shinfo = skb_shinfo(tail); 2080 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen); 2081 tail_gso_segs = shinfo->gso_segs ?: 1; 2082 2083 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) { 2084 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq; 2085 2086 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) { 2087 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq; 2088 thtail->window = th->window; 2089 } 2090 2091 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and 2092 * thtail->fin, so that the fast path in tcp_rcv_established() 2093 * is not entered if we append a packet with a FIN. 2094 * SYN, RST, URG are not present. 2095 * ACK is set on both packets. 2096 * PSH : we do not really care in TCP stack, 2097 * at least for 'GRO' packets. 2098 */ 2099 thtail->fin |= th->fin; 2100 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; 2101 2102 if (TCP_SKB_CB(skb)->has_rxtstamp) { 2103 TCP_SKB_CB(tail)->has_rxtstamp = true; 2104 tail->tstamp = skb->tstamp; 2105 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp; 2106 } 2107 2108 /* Not as strict as GRO. We only need to carry mss max value */ 2109 shinfo->gso_size = max(gso_size, tail_gso_size); 2110 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF); 2111 2112 sk->sk_backlog.len += delta; 2113 __NET_INC_STATS(sock_net(sk), 2114 LINUX_MIB_TCPBACKLOGCOALESCE); 2115 kfree_skb_partial(skb, fragstolen); 2116 return false; 2117 } 2118 __skb_push(skb, hdrlen); 2119 2120 no_coalesce: 2121 /* sk->sk_backlog.len is reset only at the end of __release_sock(). 2122 * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach 2123 * sk_rcvbuf in normal conditions. 2124 */ 2125 limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1; 2126 2127 limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1; 2128 2129 /* Only socket owner can try to collapse/prune rx queues 2130 * to reduce memory overhead, so add a little headroom here. 2131 * Few sockets backlog are possibly concurrently non empty. 
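 * The resulting limit is roughly 2 * sk_rcvbuf + sk_sndbuf / 2 plus
 * the 64KB of headroom added below, clamped to UINT_MAX.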
2132 */ 2133 limit += 64 * 1024; 2134 2135 limit = min_t(u64, limit, UINT_MAX); 2136 2137 err = sk_add_backlog(sk, skb, limit); 2138 if (unlikely(err)) { 2139 bh_unlock_sock(sk); 2140 if (err == -ENOMEM) { 2141 *reason = SKB_DROP_REASON_PFMEMALLOC; 2142 __NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP); 2143 } else { 2144 *reason = SKB_DROP_REASON_SOCKET_BACKLOG; 2145 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP); 2146 } 2147 return true; 2148 } 2149 return false; 2150 } 2151 EXPORT_IPV6_MOD(tcp_add_backlog); 2152 2153 int tcp_filter(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason *reason) 2154 { 2155 struct tcphdr *th = (struct tcphdr *)skb->data; 2156 2157 return sk_filter_trim_cap(sk, skb, th->doff * 4, reason); 2158 } 2159 EXPORT_IPV6_MOD(tcp_filter); 2160 2161 static void tcp_v4_restore_cb(struct sk_buff *skb) 2162 { 2163 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4, 2164 sizeof(struct inet_skb_parm)); 2165 } 2166 2167 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, 2168 const struct tcphdr *th) 2169 { 2170 /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB() 2171 * barrier() makes sure compiler wont play fool^Waliasing games. 2172 */ 2173 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb), 2174 sizeof(struct inet_skb_parm)); 2175 barrier(); 2176 2177 TCP_SKB_CB(skb)->seq = ntohl(th->seq); 2178 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + 2179 skb->len - th->doff * 4); 2180 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); 2181 TCP_SKB_CB(skb)->tcp_flags = tcp_flags_ntohs(th); 2182 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); 2183 TCP_SKB_CB(skb)->sacked = 0; 2184 TCP_SKB_CB(skb)->has_rxtstamp = 2185 skb->tstamp || skb_hwtstamps(skb)->hwtstamp; 2186 } 2187 2188 /* 2189 * From tcp_input.c 2190 */ 2191 2192 int tcp_v4_rcv(struct sk_buff *skb) 2193 { 2194 struct net *net = dev_net_rcu(skb->dev); 2195 enum skb_drop_reason drop_reason; 2196 enum tcp_tw_status tw_status; 2197 int sdif = inet_sdif(skb); 2198 int dif = inet_iif(skb); 2199 const struct iphdr *iph; 2200 const struct tcphdr *th; 2201 struct sock *sk = NULL; 2202 bool refcounted; 2203 int ret; 2204 u32 isn; 2205 2206 drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; 2207 if (skb->pkt_type != PACKET_HOST) 2208 goto discard_it; 2209 2210 /* Count it even if it's bad */ 2211 __TCP_INC_STATS(net, TCP_MIB_INSEGS); 2212 2213 if (!pskb_may_pull(skb, sizeof(struct tcphdr))) 2214 goto discard_it; 2215 2216 th = (const struct tcphdr *)skb->data; 2217 2218 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) { 2219 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL; 2220 goto bad_packet; 2221 } 2222 if (!pskb_may_pull(skb, th->doff * 4)) 2223 goto discard_it; 2224 2225 /* An explanation is required here, I think. 2226 * Packet length and doff are validated by header prediction, 2227 * provided case of th->doff==0 is eliminated. 2228 * So, we defer the checks. 
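 * Only skb_checksum_init() is done below; the deferred checks are
 * performed once the owning socket has been looked up.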
*/ 2229 2230 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo)) 2231 goto csum_error; 2232 2233 th = (const struct tcphdr *)skb->data; 2234 iph = ip_hdr(skb); 2235 lookup: 2236 sk = __inet_lookup_skb(skb, __tcp_hdrlen(th), th->source, 2237 th->dest, sdif, &refcounted); 2238 if (!sk) 2239 goto no_tcp_socket; 2240 2241 if (sk->sk_state == TCP_TIME_WAIT) 2242 goto do_time_wait; 2243 2244 if (sk->sk_state == TCP_NEW_SYN_RECV) { 2245 struct request_sock *req = inet_reqsk(sk); 2246 bool req_stolen = false; 2247 struct sock *nsk; 2248 2249 sk = req->rsk_listener; 2250 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) 2251 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2252 else 2253 drop_reason = tcp_inbound_hash(sk, req, skb, 2254 &iph->saddr, &iph->daddr, 2255 AF_INET, dif, sdif); 2256 if (unlikely(drop_reason)) { 2257 sk_drops_skbadd(sk, skb); 2258 reqsk_put(req); 2259 goto discard_it; 2260 } 2261 if (tcp_checksum_complete(skb)) { 2262 reqsk_put(req); 2263 goto csum_error; 2264 } 2265 if (unlikely(sk->sk_state != TCP_LISTEN)) { 2266 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb); 2267 if (!nsk) { 2268 inet_csk_reqsk_queue_drop_and_put(sk, req); 2269 goto lookup; 2270 } 2271 sk = nsk; 2272 /* reuseport_migrate_sock() has already held one sk_refcnt 2273 * before returning. 2274 */ 2275 } else { 2276 /* We own a reference on the listener, increase it again 2277 * as we might lose it too soon. 2278 */ 2279 sock_hold(sk); 2280 } 2281 refcounted = true; 2282 nsk = NULL; 2283 if (!tcp_filter(sk, skb, &drop_reason)) { 2284 th = (const struct tcphdr *)skb->data; 2285 iph = ip_hdr(skb); 2286 tcp_v4_fill_cb(skb, iph, th); 2287 nsk = tcp_check_req(sk, skb, req, false, &req_stolen, 2288 &drop_reason); 2289 } 2290 if (!nsk) { 2291 reqsk_put(req); 2292 if (req_stolen) { 2293 /* Another cpu got exclusive access to req 2294 * and created a full blown socket. 2295 * Try to feed this packet to this socket 2296 * instead of discarding it. 
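 * Restore the IP control block saved by tcp_v4_fill_cb(), drop the
 * reference we hold on the listener and redo the socket lookup.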
2297 */ 2298 tcp_v4_restore_cb(skb); 2299 sock_put(sk); 2300 goto lookup; 2301 } 2302 goto discard_and_relse; 2303 } 2304 nf_reset_ct(skb); 2305 if (nsk == sk) { 2306 reqsk_put(req); 2307 tcp_v4_restore_cb(skb); 2308 } else { 2309 drop_reason = tcp_child_process(sk, nsk, skb); 2310 if (drop_reason) { 2311 enum sk_rst_reason rst_reason; 2312 2313 rst_reason = sk_rst_convert_drop_reason(drop_reason); 2314 tcp_v4_send_reset(nsk, skb, rst_reason); 2315 goto discard_and_relse; 2316 } 2317 sock_put(sk); 2318 return 0; 2319 } 2320 } 2321 2322 process: 2323 if (static_branch_unlikely(&ip4_min_ttl)) { 2324 /* min_ttl can be changed concurrently from do_ip_setsockopt() */ 2325 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) { 2326 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 2327 drop_reason = SKB_DROP_REASON_TCP_MINTTL; 2328 goto discard_and_relse; 2329 } 2330 } 2331 2332 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { 2333 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2334 goto discard_and_relse; 2335 } 2336 2337 drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr, 2338 AF_INET, dif, sdif); 2339 if (drop_reason) 2340 goto discard_and_relse; 2341 2342 nf_reset_ct(skb); 2343 2344 if (tcp_filter(sk, skb, &drop_reason)) 2345 goto discard_and_relse; 2346 2347 th = (const struct tcphdr *)skb->data; 2348 iph = ip_hdr(skb); 2349 tcp_v4_fill_cb(skb, iph, th); 2350 2351 skb->dev = NULL; 2352 2353 if (sk->sk_state == TCP_LISTEN) { 2354 ret = tcp_v4_do_rcv(sk, skb); 2355 goto put_and_return; 2356 } 2357 2358 sk_incoming_cpu_update(sk); 2359 2360 bh_lock_sock_nested(sk); 2361 tcp_segs_in(tcp_sk(sk), skb); 2362 ret = 0; 2363 if (!sock_owned_by_user(sk)) { 2364 ret = tcp_v4_do_rcv(sk, skb); 2365 } else { 2366 if (tcp_add_backlog(sk, skb, &drop_reason)) 2367 goto discard_and_relse; 2368 } 2369 bh_unlock_sock(sk); 2370 2371 put_and_return: 2372 if (refcounted) 2373 sock_put(sk); 2374 2375 return ret; 2376 2377 no_tcp_socket: 2378 drop_reason = SKB_DROP_REASON_NO_SOCKET; 2379 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) 2380 goto discard_it; 2381 2382 tcp_v4_fill_cb(skb, iph, th); 2383 2384 if (tcp_checksum_complete(skb)) { 2385 csum_error: 2386 drop_reason = SKB_DROP_REASON_TCP_CSUM; 2387 trace_tcp_bad_csum(skb); 2388 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); 2389 bad_packet: 2390 __TCP_INC_STATS(net, TCP_MIB_INERRS); 2391 } else { 2392 tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason)); 2393 } 2394 2395 discard_it: 2396 SKB_DR_OR(drop_reason, NOT_SPECIFIED); 2397 /* Discard frame. 
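 * The reason chosen above is passed to sk_skb_reason_drop() so drop
 * tracepoints can report why the packet was freed.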
*/ 2398 sk_skb_reason_drop(sk, skb, drop_reason); 2399 return 0; 2400 2401 discard_and_relse: 2402 sk_drops_skbadd(sk, skb); 2403 if (refcounted) 2404 sock_put(sk); 2405 goto discard_it; 2406 2407 do_time_wait: 2408 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { 2409 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2410 inet_twsk_put(inet_twsk(sk)); 2411 goto discard_it; 2412 } 2413 2414 tcp_v4_fill_cb(skb, iph, th); 2415 2416 if (tcp_checksum_complete(skb)) { 2417 inet_twsk_put(inet_twsk(sk)); 2418 goto csum_error; 2419 } 2420 2421 tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn, 2422 &drop_reason); 2423 switch (tw_status) { 2424 case TCP_TW_SYN: { 2425 struct sock *sk2 = inet_lookup_listener(net, skb, __tcp_hdrlen(th), 2426 iph->saddr, th->source, 2427 iph->daddr, th->dest, 2428 inet_iif(skb), 2429 sdif); 2430 if (sk2) { 2431 inet_twsk_deschedule_put(inet_twsk(sk)); 2432 sk = sk2; 2433 tcp_v4_restore_cb(skb); 2434 refcounted = false; 2435 __this_cpu_write(tcp_tw_isn, isn); 2436 goto process; 2437 } 2438 } 2439 /* to ACK */ 2440 fallthrough; 2441 case TCP_TW_ACK: 2442 case TCP_TW_ACK_OOW: 2443 tcp_v4_timewait_ack(sk, skb, tw_status); 2444 break; 2445 case TCP_TW_RST: 2446 tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET); 2447 inet_twsk_deschedule_put(inet_twsk(sk)); 2448 goto discard_it; 2449 case TCP_TW_SUCCESS:; 2450 } 2451 goto discard_it; 2452 } 2453 2454 static struct timewait_sock_ops tcp_timewait_sock_ops = { 2455 .twsk_obj_size = sizeof(struct tcp_timewait_sock), 2456 }; 2457 2458 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) 2459 { 2460 struct dst_entry *dst = skb_dst(skb); 2461 2462 if (dst && dst_hold_safe(dst)) { 2463 rcu_assign_pointer(sk->sk_rx_dst, dst); 2464 sk->sk_rx_dst_ifindex = skb->skb_iif; 2465 } 2466 } 2467 EXPORT_IPV6_MOD(inet_sk_rx_dst_set); 2468 2469 const struct inet_connection_sock_af_ops ipv4_specific = { 2470 .queue_xmit = ip_queue_xmit, 2471 .send_check = tcp_v4_send_check, 2472 .rebuild_header = inet_sk_rebuild_header, 2473 .sk_rx_dst_set = inet_sk_rx_dst_set, 2474 .conn_request = tcp_v4_conn_request, 2475 .syn_recv_sock = tcp_v4_syn_recv_sock, 2476 .net_header_len = sizeof(struct iphdr), 2477 .setsockopt = ip_setsockopt, 2478 .getsockopt = ip_getsockopt, 2479 .mtu_reduced = tcp_v4_mtu_reduced, 2480 }; 2481 EXPORT_IPV6_MOD(ipv4_specific); 2482 2483 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) 2484 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { 2485 #ifdef CONFIG_TCP_MD5SIG 2486 .md5_lookup = tcp_v4_md5_lookup, 2487 .calc_md5_hash = tcp_v4_md5_hash_skb, 2488 .md5_parse = tcp_v4_parse_md5_keys, 2489 #endif 2490 #ifdef CONFIG_TCP_AO 2491 .ao_lookup = tcp_v4_ao_lookup, 2492 .calc_ao_hash = tcp_v4_ao_hash_skb, 2493 .ao_parse = tcp_v4_parse_ao, 2494 .ao_calc_key_sk = tcp_v4_ao_calc_key_sk, 2495 #endif 2496 }; 2497 #endif 2498 2499 /* NOTE: A lot of things set to zero explicitly by call to 2500 * sk_alloc() so need not be done here. 
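 * tcp_v4_init_sock() only wires up the IPv4 af_ops (and, when MD5/AO
 * support is built in, the af_specific ops); the rest of the
 * initialization happens in tcp_init_sock().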
2501 */ 2502 static int tcp_v4_init_sock(struct sock *sk) 2503 { 2504 struct inet_connection_sock *icsk = inet_csk(sk); 2505 2506 tcp_init_sock(sk); 2507 2508 icsk->icsk_af_ops = &ipv4_specific; 2509 2510 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) 2511 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific; 2512 #endif 2513 2514 return 0; 2515 } 2516 2517 #ifdef CONFIG_TCP_MD5SIG 2518 static void tcp_md5sig_info_free_rcu(struct rcu_head *head) 2519 { 2520 struct tcp_md5sig_info *md5sig; 2521 2522 md5sig = container_of(head, struct tcp_md5sig_info, rcu); 2523 kfree(md5sig); 2524 static_branch_slow_dec_deferred(&tcp_md5_needed); 2525 tcp_md5_release_sigpool(); 2526 } 2527 #endif 2528 2529 static void tcp_release_user_frags(struct sock *sk) 2530 { 2531 #ifdef CONFIG_PAGE_POOL 2532 unsigned long index; 2533 void *netmem; 2534 2535 xa_for_each(&sk->sk_user_frags, index, netmem) 2536 WARN_ON_ONCE(!napi_pp_put_page((__force netmem_ref)netmem)); 2537 #endif 2538 } 2539 2540 void tcp_v4_destroy_sock(struct sock *sk) 2541 { 2542 struct tcp_sock *tp = tcp_sk(sk); 2543 2544 tcp_release_user_frags(sk); 2545 2546 xa_destroy(&sk->sk_user_frags); 2547 2548 trace_tcp_destroy_sock(sk); 2549 2550 tcp_clear_xmit_timers(sk); 2551 2552 tcp_cleanup_congestion_control(sk); 2553 2554 tcp_cleanup_ulp(sk); 2555 2556 /* Cleanup up the write buffer. */ 2557 tcp_write_queue_purge(sk); 2558 2559 /* Check if we want to disable active TFO */ 2560 tcp_fastopen_active_disable_ofo_check(sk); 2561 2562 /* Cleans up our, hopefully empty, out_of_order_queue. */ 2563 skb_rbtree_purge(&tp->out_of_order_queue); 2564 2565 #ifdef CONFIG_TCP_MD5SIG 2566 /* Clean up the MD5 key list, if any */ 2567 if (tp->md5sig_info) { 2568 struct tcp_md5sig_info *md5sig; 2569 2570 md5sig = rcu_dereference_protected(tp->md5sig_info, 1); 2571 tcp_clear_md5_list(sk); 2572 call_rcu(&md5sig->rcu, tcp_md5sig_info_free_rcu); 2573 rcu_assign_pointer(tp->md5sig_info, NULL); 2574 } 2575 #endif 2576 tcp_ao_destroy_sock(sk, false); 2577 2578 /* Clean up a referenced TCP bind bucket. */ 2579 if (inet_csk(sk)->icsk_bind_hash) 2580 inet_put_port(sk); 2581 2582 BUG_ON(rcu_access_pointer(tp->fastopen_rsk)); 2583 2584 /* If socket is aborted during connect operation */ 2585 tcp_free_fastopen_req(tp); 2586 tcp_fastopen_destroy_cipher(sk); 2587 tcp_saved_syn_free(tp); 2588 2589 sk_sockets_allocated_dec(sk); 2590 } 2591 EXPORT_IPV6_MOD(tcp_v4_destroy_sock); 2592 2593 #ifdef CONFIG_PROC_FS 2594 /* Proc filesystem TCP sock list dumping. */ 2595 2596 static unsigned short seq_file_family(const struct seq_file *seq); 2597 2598 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk) 2599 { 2600 unsigned short family = seq_file_family(seq); 2601 2602 /* AF_UNSPEC is used as a match all */ 2603 return ((family == AF_UNSPEC || family == sk->sk_family) && 2604 net_eq(sock_net(sk), seq_file_net(seq))); 2605 } 2606 2607 /* Find a non empty bucket (starting from st->bucket) 2608 * and return the first sk from it. 
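 * The bucket lock (lhash2[st->bucket].lock) is still held when a
 * socket is returned; it is released by listening_get_next() when
 * moving on to the next bucket, or by tcp_seq_stop().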
2609 */ 2610 static void *listening_get_first(struct seq_file *seq) 2611 { 2612 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2613 struct tcp_iter_state *st = seq->private; 2614 2615 st->offset = 0; 2616 for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) { 2617 struct inet_listen_hashbucket *ilb2; 2618 struct hlist_nulls_node *node; 2619 struct sock *sk; 2620 2621 ilb2 = &hinfo->lhash2[st->bucket]; 2622 if (hlist_nulls_empty(&ilb2->nulls_head)) 2623 continue; 2624 2625 spin_lock(&ilb2->lock); 2626 sk_nulls_for_each(sk, node, &ilb2->nulls_head) { 2627 if (seq_sk_match(seq, sk)) 2628 return sk; 2629 } 2630 spin_unlock(&ilb2->lock); 2631 } 2632 2633 return NULL; 2634 } 2635 2636 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket). 2637 * If "cur" is the last one in the st->bucket, 2638 * call listening_get_first() to return the first sk of the next 2639 * non empty bucket. 2640 */ 2641 static void *listening_get_next(struct seq_file *seq, void *cur) 2642 { 2643 struct tcp_iter_state *st = seq->private; 2644 struct inet_listen_hashbucket *ilb2; 2645 struct hlist_nulls_node *node; 2646 struct inet_hashinfo *hinfo; 2647 struct sock *sk = cur; 2648 2649 ++st->num; 2650 ++st->offset; 2651 2652 sk = sk_nulls_next(sk); 2653 sk_nulls_for_each_from(sk, node) { 2654 if (seq_sk_match(seq, sk)) 2655 return sk; 2656 } 2657 2658 hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2659 ilb2 = &hinfo->lhash2[st->bucket]; 2660 spin_unlock(&ilb2->lock); 2661 ++st->bucket; 2662 return listening_get_first(seq); 2663 } 2664 2665 static void *listening_get_idx(struct seq_file *seq, loff_t *pos) 2666 { 2667 struct tcp_iter_state *st = seq->private; 2668 void *rc; 2669 2670 st->bucket = 0; 2671 st->offset = 0; 2672 rc = listening_get_first(seq); 2673 2674 while (rc && *pos) { 2675 rc = listening_get_next(seq, rc); 2676 --*pos; 2677 } 2678 return rc; 2679 } 2680 2681 static inline bool empty_bucket(struct inet_hashinfo *hinfo, 2682 const struct tcp_iter_state *st) 2683 { 2684 return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain); 2685 } 2686 2687 /* 2688 * Get first established socket starting from bucket given in st->bucket. 2689 * If st->bucket is zero, the very first socket in the hash is returned. 
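 * As in the listening case, the ehash bucket lock is held (with BHs
 * disabled) when a socket is returned and released once iteration
 * leaves the bucket or stops.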
2690 */ 2691 static void *established_get_first(struct seq_file *seq) 2692 { 2693 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2694 struct tcp_iter_state *st = seq->private; 2695 2696 st->offset = 0; 2697 for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) { 2698 struct sock *sk; 2699 struct hlist_nulls_node *node; 2700 spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket); 2701 2702 cond_resched(); 2703 2704 /* Lockless fast path for the common case of empty buckets */ 2705 if (empty_bucket(hinfo, st)) 2706 continue; 2707 2708 spin_lock_bh(lock); 2709 sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) { 2710 if (seq_sk_match(seq, sk)) 2711 return sk; 2712 } 2713 spin_unlock_bh(lock); 2714 } 2715 2716 return NULL; 2717 } 2718 2719 static void *established_get_next(struct seq_file *seq, void *cur) 2720 { 2721 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2722 struct tcp_iter_state *st = seq->private; 2723 struct hlist_nulls_node *node; 2724 struct sock *sk = cur; 2725 2726 ++st->num; 2727 ++st->offset; 2728 2729 sk = sk_nulls_next(sk); 2730 2731 sk_nulls_for_each_from(sk, node) { 2732 if (seq_sk_match(seq, sk)) 2733 return sk; 2734 } 2735 2736 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 2737 ++st->bucket; 2738 return established_get_first(seq); 2739 } 2740 2741 static void *established_get_idx(struct seq_file *seq, loff_t pos) 2742 { 2743 struct tcp_iter_state *st = seq->private; 2744 void *rc; 2745 2746 st->bucket = 0; 2747 rc = established_get_first(seq); 2748 2749 while (rc && pos) { 2750 rc = established_get_next(seq, rc); 2751 --pos; 2752 } 2753 return rc; 2754 } 2755 2756 static void *tcp_get_idx(struct seq_file *seq, loff_t pos) 2757 { 2758 void *rc; 2759 struct tcp_iter_state *st = seq->private; 2760 2761 st->state = TCP_SEQ_STATE_LISTENING; 2762 rc = listening_get_idx(seq, &pos); 2763 2764 if (!rc) { 2765 st->state = TCP_SEQ_STATE_ESTABLISHED; 2766 rc = established_get_idx(seq, pos); 2767 } 2768 2769 return rc; 2770 } 2771 2772 static void *tcp_seek_last_pos(struct seq_file *seq) 2773 { 2774 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2775 struct tcp_iter_state *st = seq->private; 2776 int bucket = st->bucket; 2777 int offset = st->offset; 2778 int orig_num = st->num; 2779 void *rc = NULL; 2780 2781 switch (st->state) { 2782 case TCP_SEQ_STATE_LISTENING: 2783 if (st->bucket > hinfo->lhash2_mask) 2784 break; 2785 rc = listening_get_first(seq); 2786 while (offset-- && rc && bucket == st->bucket) 2787 rc = listening_get_next(seq, rc); 2788 if (rc) 2789 break; 2790 st->bucket = 0; 2791 st->state = TCP_SEQ_STATE_ESTABLISHED; 2792 fallthrough; 2793 case TCP_SEQ_STATE_ESTABLISHED: 2794 if (st->bucket > hinfo->ehash_mask) 2795 break; 2796 rc = established_get_first(seq); 2797 while (offset-- && rc && bucket == st->bucket) 2798 rc = established_get_next(seq, rc); 2799 } 2800 2801 st->num = orig_num; 2802 2803 return rc; 2804 } 2805 2806 void *tcp_seq_start(struct seq_file *seq, loff_t *pos) 2807 { 2808 struct tcp_iter_state *st = seq->private; 2809 void *rc; 2810 2811 if (*pos && *pos == st->last_pos) { 2812 rc = tcp_seek_last_pos(seq); 2813 if (rc) 2814 goto out; 2815 } 2816 2817 st->state = TCP_SEQ_STATE_LISTENING; 2818 st->num = 0; 2819 st->bucket = 0; 2820 st->offset = 0; 2821 rc = *pos ? 
tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; 2822 2823 out: 2824 st->last_pos = *pos; 2825 return rc; 2826 } 2827 EXPORT_IPV6_MOD(tcp_seq_start); 2828 2829 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2830 { 2831 struct tcp_iter_state *st = seq->private; 2832 void *rc = NULL; 2833 2834 if (v == SEQ_START_TOKEN) { 2835 rc = tcp_get_idx(seq, 0); 2836 goto out; 2837 } 2838 2839 switch (st->state) { 2840 case TCP_SEQ_STATE_LISTENING: 2841 rc = listening_get_next(seq, v); 2842 if (!rc) { 2843 st->state = TCP_SEQ_STATE_ESTABLISHED; 2844 st->bucket = 0; 2845 st->offset = 0; 2846 rc = established_get_first(seq); 2847 } 2848 break; 2849 case TCP_SEQ_STATE_ESTABLISHED: 2850 rc = established_get_next(seq, v); 2851 break; 2852 } 2853 out: 2854 ++*pos; 2855 st->last_pos = *pos; 2856 return rc; 2857 } 2858 EXPORT_IPV6_MOD(tcp_seq_next); 2859 2860 void tcp_seq_stop(struct seq_file *seq, void *v) 2861 { 2862 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2863 struct tcp_iter_state *st = seq->private; 2864 2865 switch (st->state) { 2866 case TCP_SEQ_STATE_LISTENING: 2867 if (v != SEQ_START_TOKEN) 2868 spin_unlock(&hinfo->lhash2[st->bucket].lock); 2869 break; 2870 case TCP_SEQ_STATE_ESTABLISHED: 2871 if (v) 2872 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 2873 break; 2874 } 2875 } 2876 EXPORT_IPV6_MOD(tcp_seq_stop); 2877 2878 static void get_openreq4(const struct request_sock *req, 2879 struct seq_file *f, int i) 2880 { 2881 const struct inet_request_sock *ireq = inet_rsk(req); 2882 long delta = req->rsk_timer.expires - jiffies; 2883 2884 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2885 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK", 2886 i, 2887 ireq->ir_loc_addr, 2888 ireq->ir_num, 2889 ireq->ir_rmt_addr, 2890 ntohs(ireq->ir_rmt_port), 2891 TCP_SYN_RECV, 2892 0, 0, /* could print option size, but that is af dependent. 
*/ 2893 1, /* timers active (only the expire timer) */ 2894 jiffies_delta_to_clock_t(delta), 2895 req->num_timeout, 2896 from_kuid_munged(seq_user_ns(f), 2897 sk_uid(req->rsk_listener)), 2898 0, /* non standard timer */ 2899 0, /* open_requests have no inode */ 2900 0, 2901 req); 2902 } 2903 2904 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) 2905 { 2906 int timer_active; 2907 unsigned long timer_expires; 2908 const struct tcp_sock *tp = tcp_sk(sk); 2909 const struct inet_connection_sock *icsk = inet_csk(sk); 2910 const struct inet_sock *inet = inet_sk(sk); 2911 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; 2912 __be32 dest = inet->inet_daddr; 2913 __be32 src = inet->inet_rcv_saddr; 2914 __u16 destp = ntohs(inet->inet_dport); 2915 __u16 srcp = ntohs(inet->inet_sport); 2916 u8 icsk_pending; 2917 int rx_queue; 2918 int state; 2919 2920 icsk_pending = smp_load_acquire(&icsk->icsk_pending); 2921 if (icsk_pending == ICSK_TIME_RETRANS || 2922 icsk_pending == ICSK_TIME_REO_TIMEOUT || 2923 icsk_pending == ICSK_TIME_LOSS_PROBE) { 2924 timer_active = 1; 2925 timer_expires = icsk_timeout(icsk); 2926 } else if (icsk_pending == ICSK_TIME_PROBE0) { 2927 timer_active = 4; 2928 timer_expires = icsk_timeout(icsk); 2929 } else if (timer_pending(&sk->sk_timer)) { 2930 timer_active = 2; 2931 timer_expires = sk->sk_timer.expires; 2932 } else { 2933 timer_active = 0; 2934 timer_expires = jiffies; 2935 } 2936 2937 state = inet_sk_state_load(sk); 2938 if (state == TCP_LISTEN) 2939 rx_queue = READ_ONCE(sk->sk_ack_backlog); 2940 else 2941 /* Because we don't lock the socket, 2942 * we might find a transient negative value. 2943 */ 2944 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) - 2945 READ_ONCE(tp->copied_seq), 0); 2946 2947 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " 2948 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d", 2949 i, src, srcp, dest, destp, state, 2950 READ_ONCE(tp->write_seq) - tp->snd_una, 2951 rx_queue, 2952 timer_active, 2953 jiffies_delta_to_clock_t(timer_expires - jiffies), 2954 READ_ONCE(icsk->icsk_retransmits), 2955 from_kuid_munged(seq_user_ns(f), sk_uid(sk)), 2956 READ_ONCE(icsk->icsk_probes_out), 2957 sock_i_ino(sk), 2958 refcount_read(&sk->sk_refcnt), sk, 2959 jiffies_to_clock_t(icsk->icsk_rto), 2960 jiffies_to_clock_t(icsk->icsk_ack.ato), 2961 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk), 2962 tcp_snd_cwnd(tp), 2963 state == TCP_LISTEN ? 2964 fastopenq->max_qlen : 2965 (tcp_in_initial_slowstart(tp) ? 
-1 : tp->snd_ssthresh)); 2966 } 2967 2968 static void get_timewait4_sock(const struct inet_timewait_sock *tw, 2969 struct seq_file *f, int i) 2970 { 2971 long delta = tw->tw_timer.expires - jiffies; 2972 __be32 dest, src; 2973 __u16 destp, srcp; 2974 2975 dest = tw->tw_daddr; 2976 src = tw->tw_rcv_saddr; 2977 destp = ntohs(tw->tw_dport); 2978 srcp = ntohs(tw->tw_sport); 2979 2980 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2981 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK", 2982 i, src, srcp, dest, destp, READ_ONCE(tw->tw_substate), 0, 0, 2983 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, 2984 refcount_read(&tw->tw_refcnt), tw); 2985 } 2986 2987 #define TMPSZ 150 2988 2989 static int tcp4_seq_show(struct seq_file *seq, void *v) 2990 { 2991 struct tcp_iter_state *st; 2992 struct sock *sk = v; 2993 2994 seq_setwidth(seq, TMPSZ - 1); 2995 if (v == SEQ_START_TOKEN) { 2996 seq_puts(seq, " sl local_address rem_address st tx_queue " 2997 "rx_queue tr tm->when retrnsmt uid timeout " 2998 "inode"); 2999 goto out; 3000 } 3001 st = seq->private; 3002 3003 if (sk->sk_state == TCP_TIME_WAIT) 3004 get_timewait4_sock(v, seq, st->num); 3005 else if (sk->sk_state == TCP_NEW_SYN_RECV) 3006 get_openreq4(v, seq, st->num); 3007 else 3008 get_tcp4_sock(v, seq, st->num); 3009 out: 3010 seq_pad(seq, '\n'); 3011 return 0; 3012 } 3013 3014 #ifdef CONFIG_BPF_SYSCALL 3015 union bpf_tcp_iter_batch_item { 3016 struct sock *sk; 3017 __u64 cookie; 3018 }; 3019 3020 struct bpf_tcp_iter_state { 3021 struct tcp_iter_state state; 3022 unsigned int cur_sk; 3023 unsigned int end_sk; 3024 unsigned int max_sk; 3025 union bpf_tcp_iter_batch_item *batch; 3026 }; 3027 3028 struct bpf_iter__tcp { 3029 __bpf_md_ptr(struct bpf_iter_meta *, meta); 3030 __bpf_md_ptr(struct sock_common *, sk_common); 3031 uid_t uid __aligned(8); 3032 }; 3033 3034 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, 3035 struct sock_common *sk_common, uid_t uid) 3036 { 3037 struct bpf_iter__tcp ctx; 3038 3039 meta->seq_num--; /* skip SEQ_START_TOKEN */ 3040 ctx.meta = meta; 3041 ctx.sk_common = sk_common; 3042 ctx.uid = uid; 3043 return bpf_iter_run_prog(prog, &ctx); 3044 } 3045 3046 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter) 3047 { 3048 union bpf_tcp_iter_batch_item *item; 3049 unsigned int cur_sk = iter->cur_sk; 3050 __u64 cookie; 3051 3052 /* Remember the cookies of the sockets we haven't seen yet, so we can 3053 * pick up where we left off next time around. 
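 * Each of these sockets is converted to its cookie with
 * sock_gen_cookie() and its reference is dropped via sock_gen_put().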
3054 */ 3055 while (cur_sk < iter->end_sk) { 3056 item = &iter->batch[cur_sk++]; 3057 cookie = sock_gen_cookie(item->sk); 3058 sock_gen_put(item->sk); 3059 item->cookie = cookie; 3060 } 3061 } 3062 3063 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter, 3064 unsigned int new_batch_sz, gfp_t flags) 3065 { 3066 union bpf_tcp_iter_batch_item *new_batch; 3067 3068 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, 3069 flags | __GFP_NOWARN); 3070 if (!new_batch) 3071 return -ENOMEM; 3072 3073 memcpy(new_batch, iter->batch, sizeof(*iter->batch) * iter->end_sk); 3074 kvfree(iter->batch); 3075 iter->batch = new_batch; 3076 iter->max_sk = new_batch_sz; 3077 3078 return 0; 3079 } 3080 3081 static struct sock *bpf_iter_tcp_resume_bucket(struct sock *first_sk, 3082 union bpf_tcp_iter_batch_item *cookies, 3083 int n_cookies) 3084 { 3085 struct hlist_nulls_node *node; 3086 struct sock *sk; 3087 int i; 3088 3089 for (i = 0; i < n_cookies; i++) { 3090 sk = first_sk; 3091 sk_nulls_for_each_from(sk, node) 3092 if (cookies[i].cookie == atomic64_read(&sk->sk_cookie)) 3093 return sk; 3094 } 3095 3096 return NULL; 3097 } 3098 3099 static struct sock *bpf_iter_tcp_resume_listening(struct seq_file *seq) 3100 { 3101 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 3102 struct bpf_tcp_iter_state *iter = seq->private; 3103 struct tcp_iter_state *st = &iter->state; 3104 unsigned int find_cookie = iter->cur_sk; 3105 unsigned int end_cookie = iter->end_sk; 3106 int resume_bucket = st->bucket; 3107 struct sock *sk; 3108 3109 if (end_cookie && find_cookie == end_cookie) 3110 ++st->bucket; 3111 3112 sk = listening_get_first(seq); 3113 iter->cur_sk = 0; 3114 iter->end_sk = 0; 3115 3116 if (sk && st->bucket == resume_bucket && end_cookie) { 3117 sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie], 3118 end_cookie - find_cookie); 3119 if (!sk) { 3120 spin_unlock(&hinfo->lhash2[st->bucket].lock); 3121 ++st->bucket; 3122 sk = listening_get_first(seq); 3123 } 3124 } 3125 3126 return sk; 3127 } 3128 3129 static struct sock *bpf_iter_tcp_resume_established(struct seq_file *seq) 3130 { 3131 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 3132 struct bpf_tcp_iter_state *iter = seq->private; 3133 struct tcp_iter_state *st = &iter->state; 3134 unsigned int find_cookie = iter->cur_sk; 3135 unsigned int end_cookie = iter->end_sk; 3136 int resume_bucket = st->bucket; 3137 struct sock *sk; 3138 3139 if (end_cookie && find_cookie == end_cookie) 3140 ++st->bucket; 3141 3142 sk = established_get_first(seq); 3143 iter->cur_sk = 0; 3144 iter->end_sk = 0; 3145 3146 if (sk && st->bucket == resume_bucket && end_cookie) { 3147 sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie], 3148 end_cookie - find_cookie); 3149 if (!sk) { 3150 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 3151 ++st->bucket; 3152 sk = established_get_first(seq); 3153 } 3154 } 3155 3156 return sk; 3157 } 3158 3159 static struct sock *bpf_iter_tcp_resume(struct seq_file *seq) 3160 { 3161 struct bpf_tcp_iter_state *iter = seq->private; 3162 struct tcp_iter_state *st = &iter->state; 3163 struct sock *sk = NULL; 3164 3165 switch (st->state) { 3166 case TCP_SEQ_STATE_LISTENING: 3167 sk = bpf_iter_tcp_resume_listening(seq); 3168 if (sk) 3169 break; 3170 st->bucket = 0; 3171 st->state = TCP_SEQ_STATE_ESTABLISHED; 3172 fallthrough; 3173 case TCP_SEQ_STATE_ESTABLISHED: 3174 sk = bpf_iter_tcp_resume_established(seq); 3175 break; 3176 } 3177 3178 return sk; 3179 } 3180 3181 
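/* Walk the rest of the current listening bucket and stash every
 * matching socket in iter->batch, taking a reference on each one.
 * When the batch array is full, the first socket that did not fit is
 * remembered in *start_sk and counting continues, so the caller can
 * tell how large the batch needs to be.
 */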
static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq, 3182 struct sock **start_sk) 3183 { 3184 struct bpf_tcp_iter_state *iter = seq->private; 3185 struct hlist_nulls_node *node; 3186 unsigned int expected = 1; 3187 struct sock *sk; 3188 3189 sock_hold(*start_sk); 3190 iter->batch[iter->end_sk++].sk = *start_sk; 3191 3192 sk = sk_nulls_next(*start_sk); 3193 *start_sk = NULL; 3194 sk_nulls_for_each_from(sk, node) { 3195 if (seq_sk_match(seq, sk)) { 3196 if (iter->end_sk < iter->max_sk) { 3197 sock_hold(sk); 3198 iter->batch[iter->end_sk++].sk = sk; 3199 } else if (!*start_sk) { 3200 /* Remember where we left off. */ 3201 *start_sk = sk; 3202 } 3203 expected++; 3204 } 3205 } 3206 3207 return expected; 3208 } 3209 3210 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq, 3211 struct sock **start_sk) 3212 { 3213 struct bpf_tcp_iter_state *iter = seq->private; 3214 struct hlist_nulls_node *node; 3215 unsigned int expected = 1; 3216 struct sock *sk; 3217 3218 sock_hold(*start_sk); 3219 iter->batch[iter->end_sk++].sk = *start_sk; 3220 3221 sk = sk_nulls_next(*start_sk); 3222 *start_sk = NULL; 3223 sk_nulls_for_each_from(sk, node) { 3224 if (seq_sk_match(seq, sk)) { 3225 if (iter->end_sk < iter->max_sk) { 3226 sock_hold(sk); 3227 iter->batch[iter->end_sk++].sk = sk; 3228 } else if (!*start_sk) { 3229 /* Remember where we left off. */ 3230 *start_sk = sk; 3231 } 3232 expected++; 3233 } 3234 } 3235 3236 return expected; 3237 } 3238 3239 static unsigned int bpf_iter_fill_batch(struct seq_file *seq, 3240 struct sock **start_sk) 3241 { 3242 struct bpf_tcp_iter_state *iter = seq->private; 3243 struct tcp_iter_state *st = &iter->state; 3244 3245 if (st->state == TCP_SEQ_STATE_LISTENING) 3246 return bpf_iter_tcp_listening_batch(seq, start_sk); 3247 else 3248 return bpf_iter_tcp_established_batch(seq, start_sk); 3249 } 3250 3251 static void bpf_iter_tcp_unlock_bucket(struct seq_file *seq) 3252 { 3253 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 3254 struct bpf_tcp_iter_state *iter = seq->private; 3255 struct tcp_iter_state *st = &iter->state; 3256 3257 if (st->state == TCP_SEQ_STATE_LISTENING) 3258 spin_unlock(&hinfo->lhash2[st->bucket].lock); 3259 else 3260 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 3261 } 3262 3263 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq) 3264 { 3265 struct bpf_tcp_iter_state *iter = seq->private; 3266 unsigned int expected; 3267 struct sock *sk; 3268 int err; 3269 3270 sk = bpf_iter_tcp_resume(seq); 3271 if (!sk) 3272 return NULL; /* Done */ 3273 3274 expected = bpf_iter_fill_batch(seq, &sk); 3275 if (likely(iter->end_sk == expected)) 3276 goto done; 3277 3278 /* Batch size was too small. */ 3279 bpf_iter_tcp_unlock_bucket(seq); 3280 bpf_iter_tcp_put_batch(iter); 3281 err = bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2, 3282 GFP_USER); 3283 if (err) 3284 return ERR_PTR(err); 3285 3286 sk = bpf_iter_tcp_resume(seq); 3287 if (!sk) 3288 return NULL; /* Done */ 3289 3290 expected = bpf_iter_fill_batch(seq, &sk); 3291 if (likely(iter->end_sk == expected)) 3292 goto done; 3293 3294 /* Batch size was still too small. Hold onto the lock while we try 3295 * again with a larger batch to make sure the current bucket's size 3296 * does not change in the meantime. 
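 * GFP_NOWAIT is used here because the bucket lock is still held and
 * the allocation must not sleep.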
3297 */ 3298 err = bpf_iter_tcp_realloc_batch(iter, expected, GFP_NOWAIT); 3299 if (err) { 3300 bpf_iter_tcp_unlock_bucket(seq); 3301 return ERR_PTR(err); 3302 } 3303 3304 expected = bpf_iter_fill_batch(seq, &sk); 3305 WARN_ON_ONCE(iter->end_sk != expected); 3306 done: 3307 bpf_iter_tcp_unlock_bucket(seq); 3308 return iter->batch[0].sk; 3309 } 3310 3311 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos) 3312 { 3313 /* bpf iter does not support lseek, so it always 3314 * continue from where it was stop()-ped. 3315 */ 3316 if (*pos) 3317 return bpf_iter_tcp_batch(seq); 3318 3319 return SEQ_START_TOKEN; 3320 } 3321 3322 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3323 { 3324 struct bpf_tcp_iter_state *iter = seq->private; 3325 struct tcp_iter_state *st = &iter->state; 3326 struct sock *sk; 3327 3328 /* Whenever seq_next() is called, the iter->cur_sk is 3329 * done with seq_show(), so advance to the next sk in 3330 * the batch. 3331 */ 3332 if (iter->cur_sk < iter->end_sk) { 3333 /* Keeping st->num consistent in tcp_iter_state. 3334 * bpf_iter_tcp does not use st->num. 3335 * meta.seq_num is used instead. 3336 */ 3337 st->num++; 3338 sock_gen_put(iter->batch[iter->cur_sk++].sk); 3339 } 3340 3341 if (iter->cur_sk < iter->end_sk) 3342 sk = iter->batch[iter->cur_sk].sk; 3343 else 3344 sk = bpf_iter_tcp_batch(seq); 3345 3346 ++*pos; 3347 /* Keeping st->last_pos consistent in tcp_iter_state. 3348 * bpf iter does not do lseek, so st->last_pos always equals to *pos. 3349 */ 3350 st->last_pos = *pos; 3351 return sk; 3352 } 3353 3354 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v) 3355 { 3356 struct bpf_iter_meta meta; 3357 struct bpf_prog *prog; 3358 struct sock *sk = v; 3359 uid_t uid; 3360 int ret; 3361 3362 if (v == SEQ_START_TOKEN) 3363 return 0; 3364 3365 if (sk_fullsock(sk)) 3366 lock_sock(sk); 3367 3368 if (unlikely(sk_unhashed(sk))) { 3369 ret = SEQ_SKIP; 3370 goto unlock; 3371 } 3372 3373 if (sk->sk_state == TCP_TIME_WAIT) { 3374 uid = 0; 3375 } else if (sk->sk_state == TCP_NEW_SYN_RECV) { 3376 const struct request_sock *req = v; 3377 3378 uid = from_kuid_munged(seq_user_ns(seq), 3379 sk_uid(req->rsk_listener)); 3380 } else { 3381 uid = from_kuid_munged(seq_user_ns(seq), sk_uid(sk)); 3382 } 3383 3384 meta.seq = seq; 3385 prog = bpf_iter_get_info(&meta, false); 3386 ret = tcp_prog_seq_show(prog, &meta, v, uid); 3387 3388 unlock: 3389 if (sk_fullsock(sk)) 3390 release_sock(sk); 3391 return ret; 3392 3393 } 3394 3395 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v) 3396 { 3397 struct bpf_tcp_iter_state *iter = seq->private; 3398 struct bpf_iter_meta meta; 3399 struct bpf_prog *prog; 3400 3401 if (!v) { 3402 meta.seq = seq; 3403 prog = bpf_iter_get_info(&meta, true); 3404 if (prog) 3405 (void)tcp_prog_seq_show(prog, &meta, v, 0); 3406 } 3407 3408 if (iter->cur_sk < iter->end_sk) 3409 bpf_iter_tcp_put_batch(iter); 3410 } 3411 3412 static const struct seq_operations bpf_iter_tcp_seq_ops = { 3413 .show = bpf_iter_tcp_seq_show, 3414 .start = bpf_iter_tcp_seq_start, 3415 .next = bpf_iter_tcp_seq_next, 3416 .stop = bpf_iter_tcp_seq_stop, 3417 }; 3418 #endif 3419 static unsigned short seq_file_family(const struct seq_file *seq) 3420 { 3421 const struct tcp_seq_afinfo *afinfo; 3422 3423 #ifdef CONFIG_BPF_SYSCALL 3424 /* Iterated from bpf_iter. Let the bpf prog to filter instead. 
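 * Returning AF_UNSPEC makes seq_sk_match() accept sockets of any
 * family; the per-netns check still applies.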
*/ 3425 if (seq->op == &bpf_iter_tcp_seq_ops) 3426 return AF_UNSPEC; 3427 #endif 3428 3429 /* Iterated from proc fs */ 3430 afinfo = pde_data(file_inode(seq->file)); 3431 return afinfo->family; 3432 } 3433 3434 static const struct seq_operations tcp4_seq_ops = { 3435 .show = tcp4_seq_show, 3436 .start = tcp_seq_start, 3437 .next = tcp_seq_next, 3438 .stop = tcp_seq_stop, 3439 }; 3440 3441 static struct tcp_seq_afinfo tcp4_seq_afinfo = { 3442 .family = AF_INET, 3443 }; 3444 3445 static int __net_init tcp4_proc_init_net(struct net *net) 3446 { 3447 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops, 3448 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo)) 3449 return -ENOMEM; 3450 return 0; 3451 } 3452 3453 static void __net_exit tcp4_proc_exit_net(struct net *net) 3454 { 3455 remove_proc_entry("tcp", net->proc_net); 3456 } 3457 3458 static struct pernet_operations tcp4_net_ops = { 3459 .init = tcp4_proc_init_net, 3460 .exit = tcp4_proc_exit_net, 3461 }; 3462 3463 int __init tcp4_proc_init(void) 3464 { 3465 return register_pernet_subsys(&tcp4_net_ops); 3466 } 3467 3468 void tcp4_proc_exit(void) 3469 { 3470 unregister_pernet_subsys(&tcp4_net_ops); 3471 } 3472 #endif /* CONFIG_PROC_FS */ 3473 3474 /* @wake is one when sk_stream_write_space() calls us. 3475 * This sends EPOLLOUT only if notsent_bytes is half the limit. 3476 * This mimics the strategy used in sock_def_write_space(). 3477 */ 3478 bool tcp_stream_memory_free(const struct sock *sk, int wake) 3479 { 3480 const struct tcp_sock *tp = tcp_sk(sk); 3481 u32 notsent_bytes = READ_ONCE(tp->write_seq) - 3482 READ_ONCE(tp->snd_nxt); 3483 3484 return (notsent_bytes << wake) < tcp_notsent_lowat(tp); 3485 } 3486 EXPORT_SYMBOL(tcp_stream_memory_free); 3487 3488 struct proto tcp_prot = { 3489 .name = "TCP", 3490 .owner = THIS_MODULE, 3491 .close = tcp_close, 3492 .pre_connect = tcp_v4_pre_connect, 3493 .connect = tcp_v4_connect, 3494 .disconnect = tcp_disconnect, 3495 .accept = inet_csk_accept, 3496 .ioctl = tcp_ioctl, 3497 .init = tcp_v4_init_sock, 3498 .destroy = tcp_v4_destroy_sock, 3499 .shutdown = tcp_shutdown, 3500 .setsockopt = tcp_setsockopt, 3501 .getsockopt = tcp_getsockopt, 3502 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt, 3503 .keepalive = tcp_set_keepalive, 3504 .recvmsg = tcp_recvmsg, 3505 .sendmsg = tcp_sendmsg, 3506 .splice_eof = tcp_splice_eof, 3507 .backlog_rcv = tcp_v4_do_rcv, 3508 .release_cb = tcp_release_cb, 3509 .hash = inet_hash, 3510 .unhash = inet_unhash, 3511 .get_port = inet_csk_get_port, 3512 .put_port = inet_put_port, 3513 #ifdef CONFIG_BPF_SYSCALL 3514 .psock_update_sk_prot = tcp_bpf_update_proto, 3515 #endif 3516 .enter_memory_pressure = tcp_enter_memory_pressure, 3517 .leave_memory_pressure = tcp_leave_memory_pressure, 3518 .stream_memory_free = tcp_stream_memory_free, 3519 .sockets_allocated = &tcp_sockets_allocated, 3520 .orphan_count = &tcp_orphan_count, 3521 3522 .memory_allocated = &net_aligned_data.tcp_memory_allocated, 3523 .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, 3524 3525 .memory_pressure = &tcp_memory_pressure, 3526 .sysctl_mem = sysctl_tcp_mem, 3527 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), 3528 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), 3529 .max_header = MAX_TCP_HEADER, 3530 .obj_size = sizeof(struct tcp_sock), 3531 .slab_flags = SLAB_TYPESAFE_BY_RCU, 3532 .twsk_prot = &tcp_timewait_sock_ops, 3533 .rsk_prot = &tcp_request_sock_ops, 3534 .h.hashinfo = NULL, 3535 .no_autobind = true, 3536 .diag_destroy = tcp_abort, 3537 }; 3538 
EXPORT_SYMBOL(tcp_prot); 3539 3540 static void __net_exit tcp_sk_exit(struct net *net) 3541 { 3542 if (net->ipv4.tcp_congestion_control) 3543 bpf_module_put(net->ipv4.tcp_congestion_control, 3544 net->ipv4.tcp_congestion_control->owner); 3545 } 3546 3547 static void __net_init tcp_set_hashinfo(struct net *net) 3548 { 3549 struct inet_hashinfo *hinfo; 3550 unsigned int ehash_entries; 3551 struct net *old_net; 3552 3553 if (net_eq(net, &init_net)) 3554 goto fallback; 3555 3556 old_net = current->nsproxy->net_ns; 3557 ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries); 3558 if (!ehash_entries) 3559 goto fallback; 3560 3561 ehash_entries = roundup_pow_of_two(ehash_entries); 3562 hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries); 3563 if (!hinfo) { 3564 pr_warn("Failed to allocate TCP ehash (entries: %u) " 3565 "for a netns, fallback to the global one\n", 3566 ehash_entries); 3567 fallback: 3568 hinfo = &tcp_hashinfo; 3569 ehash_entries = tcp_hashinfo.ehash_mask + 1; 3570 } 3571 3572 net->ipv4.tcp_death_row.hashinfo = hinfo; 3573 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2; 3574 net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128); 3575 } 3576 3577 static int __net_init tcp_sk_init(struct net *net) 3578 { 3579 net->ipv4.sysctl_tcp_ecn = 2; 3580 net->ipv4.sysctl_tcp_ecn_fallback = 1; 3581 3582 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; 3583 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS; 3584 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; 3585 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL; 3586 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS; 3587 3588 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME; 3589 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES; 3590 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL; 3591 3592 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES; 3593 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES; 3594 net->ipv4.sysctl_tcp_syncookies = 1; 3595 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH; 3596 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1; 3597 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2; 3598 net->ipv4.sysctl_tcp_orphan_retries = 0; 3599 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; 3600 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; 3601 net->ipv4.sysctl_tcp_tw_reuse = 2; 3602 net->ipv4.sysctl_tcp_tw_reuse_delay = 1 * MSEC_PER_SEC; 3603 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1; 3604 3605 refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1); 3606 tcp_set_hashinfo(net); 3607 3608 net->ipv4.sysctl_tcp_sack = 1; 3609 net->ipv4.sysctl_tcp_window_scaling = 1; 3610 net->ipv4.sysctl_tcp_timestamps = 1; 3611 net->ipv4.sysctl_tcp_early_retrans = 3; 3612 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION; 3613 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */ 3614 net->ipv4.sysctl_tcp_retrans_collapse = 1; 3615 net->ipv4.sysctl_tcp_max_reordering = 300; 3616 net->ipv4.sysctl_tcp_dsack = 1; 3617 net->ipv4.sysctl_tcp_app_win = 31; 3618 net->ipv4.sysctl_tcp_adv_win_scale = 1; 3619 net->ipv4.sysctl_tcp_frto = 2; 3620 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1; 3621 /* This limits the percentage of the congestion window which we 3622 * will allow a single TSO frame to consume. Building TSO frames 3623 * which are too large can cause TCP streams to be bursty. 
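 * The default divisor of 3 set below allows a single TSO frame to
 * consume roughly one third of the congestion window at most.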
3624 */ 3625 net->ipv4.sysctl_tcp_tso_win_divisor = 3; 3626 /* Default TSQ limit of 4 MB */ 3627 net->ipv4.sysctl_tcp_limit_output_bytes = 4 << 20; 3628 3629 /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */ 3630 net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX; 3631 3632 net->ipv4.sysctl_tcp_min_tso_segs = 2; 3633 net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */ 3634 net->ipv4.sysctl_tcp_min_rtt_wlen = 300; 3635 net->ipv4.sysctl_tcp_autocorking = 1; 3636 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; 3637 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; 3638 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120; 3639 if (net != &init_net) { 3640 memcpy(net->ipv4.sysctl_tcp_rmem, 3641 init_net.ipv4.sysctl_tcp_rmem, 3642 sizeof(init_net.ipv4.sysctl_tcp_rmem)); 3643 memcpy(net->ipv4.sysctl_tcp_wmem, 3644 init_net.ipv4.sysctl_tcp_wmem, 3645 sizeof(init_net.ipv4.sysctl_tcp_wmem)); 3646 } 3647 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC; 3648 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC; 3649 net->ipv4.sysctl_tcp_comp_sack_nr = 44; 3650 net->ipv4.sysctl_tcp_backlog_ack_defer = 1; 3651 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; 3652 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0; 3653 atomic_set(&net->ipv4.tfo_active_disable_times, 0); 3654 3655 /* Set default values for PLB */ 3656 net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */ 3657 net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3; 3658 net->ipv4.sysctl_tcp_plb_rehash_rounds = 12; 3659 net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60; 3660 /* Default congestion threshold for PLB to mark a round is 50% */ 3661 net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2; 3662 3663 /* Reno is always built in */ 3664 if (!net_eq(net, &init_net) && 3665 bpf_try_module_get(init_net.ipv4.tcp_congestion_control, 3666 init_net.ipv4.tcp_congestion_control->owner)) 3667 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control; 3668 else 3669 net->ipv4.tcp_congestion_control = &tcp_reno; 3670 3671 net->ipv4.sysctl_tcp_syn_linear_timeouts = 4; 3672 net->ipv4.sysctl_tcp_shrink_window = 0; 3673 3674 net->ipv4.sysctl_tcp_pingpong_thresh = 1; 3675 net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN); 3676 net->ipv4.sysctl_tcp_rto_max_ms = TCP_RTO_MAX_SEC * MSEC_PER_SEC; 3677 3678 return 0; 3679 } 3680 3681 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) 3682 { 3683 struct net *net; 3684 3685 /* make sure concurrent calls to tcp_sk_exit_batch from net_cleanup_work 3686 * and failed setup_net error unwinding path are serialized. 3687 * 3688 * tcp_twsk_purge() handles twsk in any dead netns, not just those in 3689 * net_exit_list, the thread that dismantles a particular twsk must 3690 * do so without other thread progressing to refcount_dec_and_test() of 3691 * tcp_death_row.tw_refcount. 
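 * The tcp_exit_batch_mutex taken below provides that serialization.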
3692 */ 3693 mutex_lock(&tcp_exit_batch_mutex); 3694 3695 tcp_twsk_purge(net_exit_list); 3696 3697 list_for_each_entry(net, net_exit_list, exit_list) { 3698 inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo); 3699 WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount)); 3700 tcp_fastopen_ctx_destroy(net); 3701 } 3702 3703 mutex_unlock(&tcp_exit_batch_mutex); 3704 } 3705 3706 static struct pernet_operations __net_initdata tcp_sk_ops = { 3707 .init = tcp_sk_init, 3708 .exit = tcp_sk_exit, 3709 .exit_batch = tcp_sk_exit_batch, 3710 }; 3711 3712 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3713 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta, 3714 struct sock_common *sk_common, uid_t uid) 3715 3716 #define INIT_BATCH_SZ 16 3717 3718 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux) 3719 { 3720 struct bpf_tcp_iter_state *iter = priv_data; 3721 int err; 3722 3723 err = bpf_iter_init_seq_net(priv_data, aux); 3724 if (err) 3725 return err; 3726 3727 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ, GFP_USER); 3728 if (err) { 3729 bpf_iter_fini_seq_net(priv_data); 3730 return err; 3731 } 3732 3733 return 0; 3734 } 3735 3736 static void bpf_iter_fini_tcp(void *priv_data) 3737 { 3738 struct bpf_tcp_iter_state *iter = priv_data; 3739 3740 bpf_iter_fini_seq_net(priv_data); 3741 kvfree(iter->batch); 3742 } 3743 3744 static const struct bpf_iter_seq_info tcp_seq_info = { 3745 .seq_ops = &bpf_iter_tcp_seq_ops, 3746 .init_seq_private = bpf_iter_init_tcp, 3747 .fini_seq_private = bpf_iter_fini_tcp, 3748 .seq_priv_size = sizeof(struct bpf_tcp_iter_state), 3749 }; 3750 3751 static const struct bpf_func_proto * 3752 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id, 3753 const struct bpf_prog *prog) 3754 { 3755 switch (func_id) { 3756 case BPF_FUNC_setsockopt: 3757 return &bpf_sk_setsockopt_proto; 3758 case BPF_FUNC_getsockopt: 3759 return &bpf_sk_getsockopt_proto; 3760 default: 3761 return NULL; 3762 } 3763 } 3764 3765 static struct bpf_iter_reg tcp_reg_info = { 3766 .target = "tcp", 3767 .ctx_arg_info_size = 1, 3768 .ctx_arg_info = { 3769 { offsetof(struct bpf_iter__tcp, sk_common), 3770 PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED }, 3771 }, 3772 .get_func_proto = bpf_iter_tcp_get_func_proto, 3773 .seq_info = &tcp_seq_info, 3774 }; 3775 3776 static void __init bpf_iter_register(void) 3777 { 3778 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON]; 3779 if (bpf_iter_reg_target(&tcp_reg_info)) 3780 pr_warn("Warning: could not register bpf iterator tcp\n"); 3781 } 3782 3783 #endif 3784 3785 void __init tcp_v4_init(void) 3786 { 3787 int cpu, res; 3788 3789 for_each_possible_cpu(cpu) { 3790 struct sock *sk; 3791 3792 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW, 3793 IPPROTO_TCP, &init_net); 3794 if (res) 3795 panic("Failed to create the TCP control socket.\n"); 3796 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); 3797 3798 /* Please enforce IP_DF and IPID==0 for RST and 3799 * ACK sent in SYN-RECV and TIME-WAIT state. 3800 */ 3801 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO; 3802 3803 sk->sk_clockid = CLOCK_MONOTONIC; 3804 3805 per_cpu(ipv4_tcp_sk.sock, cpu) = sk; 3806 } 3807 if (register_pernet_subsys(&tcp_sk_ops)) 3808 panic("Failed to create the TCP control socket.\n"); 3809 3810 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3811 bpf_iter_register(); 3812 #endif 3813 } 3814