// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 */

/*
 * Changes:
 *	David S. Miller		:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *	David S. Miller		:	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *	Andi Kleen		:	Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *	Andi Kleen		:	Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *	Mike McLagan		:	Routing by source
 *	Juan Jose Ciarlante	:	ip_dynaddr bits
 *	Andi Kleen		:	various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sock_diag.h>

#include <net/aligned_data.h>
#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/inet_ecn.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>
#include <net/rstreason.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/inetdevice.h>
#include <linux/btf_ids.h>
#include <linux/skbuff_ref.h>

#include <crypto/hash.h>
#include <linux/scatterlist.h>

#include <trace/events/tcp.h>

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;

static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = {
	.bh_lock = INIT_LOCAL_LOCK(bh_lock),
};

static DEFINE_MUTEX(tcp_exit_batch_mutex);

static u32 tcp_v4_init_seq(const struct sk_buff *skb)
{
	return secure_tcp_seq(ip_hdr(skb)->daddr,
			      ip_hdr(skb)->saddr,
			      tcp_hdr(skb)->dest,
			      tcp_hdr(skb)->source);
}

static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
{
	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
}

int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
	const struct inet_timewait_sock *tw = inet_twsk(sktw);
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);
	int ts_recent_stamp;
	u32 reuse_thresh;

	if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2)
		reuse = 0;

	if (reuse == 2) {
		/* Still does not detect *everything* that goes through
		 * lo, since we require a loopback src or dst address
		 * or direct binding to 'lo' interface.
		 */
		bool loopback = false;
		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
			loopback = true;
#if IS_ENABLED(CONFIG_IPV6)
		if (tw->tw_family == AF_INET6) {
			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
				loopback = true;
		} else
#endif
		{
			if (ipv4_is_loopback(tw->tw_daddr) ||
			    ipv4_is_loopback(tw->tw_rcv_saddr))
				loopback = true;
		}
		if (!loopback)
			reuse = 0;
	}

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp);
	reuse_thresh = READ_ONCE(tw->tw_entry_stamp) +
		       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse_delay);
	if (ts_recent_stamp &&
	    (!twp || (reuse && time_after32(tcp_clock_ms(), reuse_thresh)))) {
		/* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk
		 * and releasing the bucket lock.
		 */
		if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
			return 0;

		/* In case of repair and re-using TIME-WAIT sockets we still
		 * want to be sure that it is safe as above but honor the
		 * sequence numbers and time stamps set as part of the repair
		 * process.
		 *
		 * Without this check re-using a TIME-WAIT socket with TCP
		 * repair would accumulate a -1 on the repair assigned
		 * sequence number. The first time it is reused the sequence
		 * is -1, the second time -2, etc. This fixes that issue
		 * without appearing to create any others.
		 */
		if (likely(!tp->repair)) {
			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;

			if (!seq)
				seq = 1;
			WRITE_ONCE(tp->write_seq, seq);
			tp->rx_opt.ts_recent = READ_ONCE(tcptw->tw_ts_recent);
			tp->rx_opt.ts_recent_stamp = ts_recent_stamp;
		}

		return 1;
	}

	return 0;
}
EXPORT_IPV6_MOD_GPL(tcp_twsk_unique);

static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
			      int addr_len)
{
	/* This check is replicated from tcp_v4_connect() and intended to
	 * prevent BPF program called below from accessing bytes that are out
	 * of the bound specified by user in addr_len.
	 */
	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	sock_owned_by_me(sk);

	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
}
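
/*
 * Illustrative sketch (user space, not part of this file): tcp_v4_pre_connect()
 * above and tcp_v4_connect() below are normally reached through an ordinary
 * connect() on an IPv4 TCP socket; the descriptor and destination used here
 * are hypothetical.
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_port = htons(443),
 *	};
 *
 *	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *	if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0)
 *		perror("connect");
 *
 * The connect() path invokes the ->pre_connect() hook (the BPF cgroup check
 * above) before handing the address to tcp_v4_connect() below.
 */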

/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_timewait_death_row *tcp_death_row;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct ip_options_rcu *inet_opt;
	struct net *net = sock_net(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
			      orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

	if (!inet->inet_saddr) {
		err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET);
		if (err) {
			ip_rt_put(rt);
			return err;
		}
	} else {
		sk_rcv_saddr_set(sk, inet->inet_saddr);
	}

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			WRITE_ONCE(tp->write_seq, 0);
	}

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and do not release the socket
	 * lock; we select a source port, enter ourselves into the hash
	 * tables and complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst);
	/* OK, now commit destination to socket. */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	rt = NULL;

	if (likely(!tp->repair)) {
		if (!tp->write_seq)
			WRITE_ONCE(tp->write_seq,
				   secure_tcp_seq(inet->inet_saddr,
						  inet->inet_daddr,
						  inet->inet_sport,
						  usin->sin_port));
		WRITE_ONCE(tp->tsoffset,
			   secure_tcp_ts_off(net, inet->inet_saddr,
					     inet->inet_daddr));
	}

	atomic_set(&inet->inet_id, get_random_u16());

	if (tcp_fastopen_defer_connect(sk, &err))
		return err;
	if (err)
		goto failure;

	err = tcp_connect(sk);

	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	inet_bhash2_reset_saddr(sk);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_IPV6_MOD(tcp_v4_connect);

/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct dst_entry *dst;
	u32 mtu;

	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
		return;
	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to be wrong... Remember soft error
	 * for the case, if this connection will not be able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_IPV6_MOD(tcp_v4_mtu_reduced);

static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}


/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);

	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	 */
	if (seq != tcp_rsk(req)->snt_isn) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
	} else if (abort) {
		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		tcp_listendrop(req->rsk_listener);
	}
	reqsk_put(req);
}
EXPORT_IPV6_MOD(tcp_req_err);

/* TCP-LD (RFC 6069) logic */
void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	s32 remaining;
	u32 delta_us;

	if (sock_owned_by_user(sk))
		return;

	if (seq != tp->snd_una || !icsk->icsk_retransmits ||
	    !icsk->icsk_backoff)
		return;

	skb = tcp_rtx_queue_head(sk);
	if (WARN_ON_ONCE(!skb))
		return;

	icsk->icsk_backoff--;
	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
	icsk->icsk_rto = inet_csk_rto_backoff(icsk, tcp_rto_max(sk));

	tcp_mstamp_refresh(tp);
	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);

	if (remaining > 0) {
		tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, remaining, false);
	} else {
		/* RTO revert clocked out retransmission.
		 * Will retransmit now.
		 */
		tcp_retransmit_timer(sk);
	}
}
EXPORT_IPV6_MOD(tcp_ld_RTO_revert);

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

int tcp_v4_err(struct sk_buff *skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct net *net = dev_net_rcu(skb->dev);
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct request_sock *fastopen;
	struct tcp_sock *tp;
	u32 seq, snd_una;
	struct sock *sk;
	int err;

	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
				       iph->daddr, th->dest, iph->saddr,
				       ntohs(th->source), inet_iif(skb), 0);
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return -ENOENT;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		/* To increase the counter of ignored icmps for TCP-AO */
		tcp_ao_ignore_icmp(sk, AF_INET, type, code);
		inet_twsk_put(inet_twsk(sk));
		return 0;
	}
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
				     type == ICMP_TIME_EXCEEDED ||
				     (type == ICMP_DEST_UNREACH &&
				      (code == ICMP_NET_UNREACH ||
				       code == ICMP_HOST_UNREACH)));
		return 0;
	}

	if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) {
		sock_put(sk);
		return 0;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (static_branch_unlikely(&ip4_min_ttl)) {
		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
			goto out;
		}
	}

	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = rcu_dereference(tp->fastopen_rsk);
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		if (!sock_owned_by_user(sk))
			do_redirect(skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs sent out by Linux are always < 576 bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			WRITE_ONCE(tp->mtu_info, info);
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if this ICMP message allows revert of backoff.
		 * (see RFC 6069)
		 */
		if (!fastopen &&
		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
			tcp_ld_RTO_revert(sk, seq);
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket is
		 * already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);

		if (!sock_owned_by_user(sk))
			tcp_done_with_error(sk, err);
		else
			WRITE_ONCE(sk->sk_err_soft, err);
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows us to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note that in the modern internet, where routing is unreliable
	 * and broken firewalls sit in every dark corner sending random
	 * errors ordered by their masters, even these two messages finally
	 * lose their original sense (even Linux sends invalid PORT_UNREACHs).
	 *
	 * Now we are in compliance with RFCs.
	 * --ANK (980905)
	 */

	if (!sock_owned_by_user(sk) &&
	    inet_test_bit(RECVERR, sk)) {
		WRITE_ONCE(sk->sk_err, err);
		sk_error_report(sk);
	} else {	/* Only an error on timeout */
		WRITE_ONCE(sk->sk_err_soft, err);
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
	return 0;
}
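
/*
 * Illustrative sketch (user space, not part of this file): errors recorded by
 * tcp_v4_err() above surface either as a hard error on the socket or, for a
 * connecting socket with IP_RECVERR enabled, on the socket error queue.  A
 * hypothetical reader of that queue could look like:
 *
 *	int on = 1;
 *	char cbuf[512];
 *	struct msghdr msg = {
 *		.msg_control = cbuf,
 *		.msg_controllen = sizeof(cbuf),
 *	};
 *
 *	setsockopt(fd, SOL_IP, IP_RECVERR, &on, sizeof(on));
 *	if (recvmsg(fd, &msg, MSG_ERRQUEUE) >= 0) {
 *		// Walk the cmsg entries; IPPROTO_IP/IP_RECVERR entries carry
 *		// a struct sock_extended_err describing the ICMP error.
 *	}
 */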

void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
	skb->csum_start = skb_transport_header(skb) - skb->head;
	skb->csum_offset = offsetof(struct tcphdr, check);
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_IPV6_MOD(tcp_v4_send_check);

#define REPLY_OPTIONS_LEN (MAX_TCP_OPTION_SPACE / sizeof(__be32))

static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb,
				 const struct tcp_ao_hdr *aoh,
				 struct ip_reply_arg *arg, struct tcphdr *reply,
				 __be32 reply_options[REPLY_OPTIONS_LEN])
{
#ifdef CONFIG_TCP_AO
	int sdif = tcp_v4_sdif(skb);
	int dif = inet_iif(skb);
	int l3index = sdif ? dif : 0;
	bool allocated_traffic_key;
	struct tcp_ao_key *key;
	char *traffic_key;
	bool drop = true;
	u32 ao_sne = 0;
	u8 keyid;

	rcu_read_lock();
	if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq),
				 &key, &traffic_key, &allocated_traffic_key,
				 &keyid, &ao_sne))
		goto out;

	reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) |
				 (aoh->rnext_keyid << 8) | keyid);
	arg->iov[0].iov_len += tcp_ao_len_aligned(key);
	reply->doff = arg->iov[0].iov_len / 4;

	if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1],
			    key, traffic_key,
			    (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
			    (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
			    reply, ao_sne))
		goto out;
	drop = false;
out:
	rcu_read_unlock();
	if (allocated_traffic_key)
		kfree(traffic_key);
	return drop;
#else
	return true;
#endif
}

/*
 * This routine will send an RST to the other tcp.
 *
 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		 for reset.
 * Answer: if a packet caused RST, it is not for a socket
 *	   existing in our system, if it is matched to a socket,
 *	   it is just duplicate segment or bug in other side's TCP.
 *	   So we build the reply based only on the parameters that
 *	   arrived with the segment.
 * Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
			      enum sk_rst_reason reason)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[REPLY_OPTIONS_LEN];
	} rep;
	const __u8 *md5_hash_location = NULL;
	const struct tcp_ao_hdr *aoh;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	unsigned char newhash[16];
	struct sock *sk1 = NULL;
	int genhash;
#endif
	u64 transmit_time = 0;
	struct sock *ctl_sk;
	struct net *net;
	u32 txhash = 0;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest = th->source;
	rep.th.source = th->dest;
	rep.th.doff = sizeof(struct tcphdr) / 4;
	rep.th.rst = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len = sizeof(rep.th);

	net = sk ? sock_net(sk) : skb_dst_dev_net_rcu(skb);

	/* Invalid TCP option size or twice included auth */
	if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh))
		return;

	if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt))
		return;

#ifdef CONFIG_TCP_MD5SIG
	rcu_read_lock();
	if (sk && sk_fullsock(sk)) {
		const union tcp_md5_addr *addr;
		int l3index;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and inet_iif is set to it.
		 */
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	} else if (md5_hash_location) {
		const union tcp_md5_addr *addr;
		int sdif = tcp_v4_sdif(skb);
		int dif = inet_iif(skb);
		int l3index;

		/*
		 * active side is lost. Try to find listening socket through
		 * source port, and then find md5 key through listening socket.
		 * We do not loosen security here:
		 * the incoming packet is checked with the md5 hash of the found
		 * key; no RST is generated if the md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
					     NULL, 0, ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), dif, sdif);
		/* don't send rst if it can't find key */
		if (!sk1)
			goto out;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and dif is set to it.
		 */
		l3index = sdif ? dif : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
		if (!key)
			goto out;


		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (genhash || memcmp(md5_hash_location, newhash, 16) != 0)
			goto out;

	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
	if (rep.opt[0] == 0) {
		__be32 mrst = mptcp_reset_option(skb);

		if (mrst) {
			rep.opt[0] = mrst;
			arg.iov[0].iov_len += sizeof(mrst);
			rep.th.doff = arg.iov[0].iov_len / 4;
		}
	}

	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
	if (sk)
		arg.bound_dev_if = sk->sk_bound_dev_if;

	trace_tcp_send_reset(sk, skb, reason);

	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

	/* ECN bits of TW reset are cleared */
	arg.tos = ip_hdr(skb)->tos & ~INET_ECN_MASK;
	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);

	sock_net_set(ctl_sk, net);
	if (sk) {
		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
		transmit_time = tcp_transmit_time(sk);
		xfrm_sk_clone_policy(ctl_sk, sk);
		txhash = (sk->sk_state == TCP_TIME_WAIT) ?
			 inet_twsk(sk)->tw_txhash : sk->sk_txhash;
	} else {
		ctl_sk->sk_mark = 0;
		ctl_sk->sk_priority = 0;
	}
	ip_send_unicast_reply(ctl_sk, sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time, txhash);

	xfrm_sk_free_policy(ctl_sk);
	sock_net_set(ctl_sk, &init_net);
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
	local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
	rcu_read_unlock();
#endif
}

/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
 * outside socket context, is certainly ugly. What can I do?
 */

static void tcp_v4_send_ack(const struct sock *sk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_key *key,
			    int reply_flags, u8 tos, u32 txhash)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(MAX_TCP_OPTION_SPACE >> 2)];
	} rep;
	struct net *net = sock_net(sk);
	struct ip_reply_arg arg;
	struct sock *ctl_sk;
	u64 transmit_time;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len = sizeof(rep.th);
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest = th->source;
	rep.th.source = th->dest;
	rep.th.doff = arg.iov[0].iov_len / 4;
	rep.th.seq = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack = 1;
	rep.th.window = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (tcp_key_is_md5(key)) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key->md5_key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
#ifdef CONFIG_TCP_AO
	if (tcp_key_is_ao(key)) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_AO << 24) |
					  (tcp_ao_len(key->ao_key) << 16) |
					  (key->ao_key->sndid << 8) |
					  key->rcv_next);
		arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key);
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset],
				key->ao_key, key->traffic_key,
				(union tcp_ao_addr *)&ip_hdr(skb)->saddr,
				(union tcp_ao_addr *)&ip_hdr(skb)->daddr,
				&rep.th, key->sne);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
	sock_net_set(ctl_sk, net);
	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
	transmit_time = tcp_transmit_time(sk);
	ip_send_unicast_reply(ctl_sk, sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time, txhash);

	sock_net_set(ctl_sk, &init_net);
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
	local_bh_enable();
}

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb,
				enum tcp_tw_status tw_status)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
	struct tcp_key key = {};
	u8 tos = tw->tw_tos;

	/* Clean only the ECN bits of TW ACKs for out-of-window data or
	 * PAWS-rejected segments, while not cleaning the ECN bits of other
	 * TW ACKs, to avoid those ACKs being placed in a different service
	 * queue (Classic rather than L4S).
	 */
	if (tw_status == TCP_TW_ACK_OOW)
		tos &= ~INET_ECN_MASK;

#ifdef CONFIG_TCP_AO
	struct tcp_ao_info *ao_info;

	if (static_branch_unlikely(&tcp_ao_needed.key)) {
		/* FIXME: the segment to-be-acked is not verified yet */
		ao_info = rcu_dereference(tcptw->ao_info);
		if (ao_info) {
			const struct tcp_ao_hdr *aoh;

			if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) {
				inet_twsk_put(tw);
				return;
			}

			if (aoh)
				key.ao_key = tcp_ao_established_key(sk, ao_info,
								    aoh->rnext_keyid, -1);
		}
	}
	if (key.ao_key) {
		struct tcp_ao_key *rnext_key;

		key.traffic_key = snd_other_key(key.ao_key);
		key.sne = READ_ONCE(ao_info->snd_sne);
		rnext_key = READ_ONCE(ao_info->rnext_key);
		key.rcv_next = rnext_key->rcvid;
		key.type = TCP_KEY_AO;
#else
	if (0) {
#endif
	} else if (static_branch_tcp_md5()) {
		key.md5_key = tcp_twsk_md5_key(tcptw);
		if (key.md5_key)
			key.type = TCP_KEY_MD5;
	}

	tcp_v4_send_ack(sk, skb,
			tcptw->tw_snd_nxt, READ_ONCE(tcptw->tw_rcv_nxt),
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_tw_tsval(tcptw),
			READ_ONCE(tcptw->tw_ts_recent),
			tw->tw_bound_dev_if, &key,
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tos,
			tw->tw_txhash);

	inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	struct tcp_key key = {};

	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
						 tcp_sk(sk)->snd_nxt;

#ifdef CONFIG_TCP_AO
	if (static_branch_unlikely(&tcp_ao_needed.key) &&
	    tcp_rsk_used_ao(req)) {
		const union tcp_md5_addr *addr;
		const struct tcp_ao_hdr *aoh;
		int l3index;

		/* Invalid TCP option size or twice included auth */
		if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
			return;
		if (!aoh)
			return;

		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET,
					      aoh->rnext_keyid, -1);
		if (unlikely(!key.ao_key)) {
			/* Send ACK with any matching MKT for the peer */
			key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1);
			/* Matching key disappeared (user removed the key?)
			 * let the handshake timeout.
			 */
			if (!key.ao_key) {
				net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n",
						     addr,
						     ntohs(tcp_hdr(skb)->source),
						     &ip_hdr(skb)->daddr,
						     ntohs(tcp_hdr(skb)->dest));
				return;
			}
		}
		key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC);
		if (!key.traffic_key)
			return;

		key.type = TCP_KEY_AO;
		key.rcv_next = aoh->keyid;
		tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req);
#else
	if (0) {
#endif
	} else if (static_branch_tcp_md5()) {
		const union tcp_md5_addr *addr;
		int l3index;

		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
		if (key.md5_key)
			key.type = TCP_KEY_MD5;
	}

	/* Clean ECN bits of ACKs sent for out-of-window data or PAWS-rejected segments */
	tcp_v4_send_ack(sk, skb, seq,
			tcp_rsk(req)->rcv_nxt,
			tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale,
			tcp_rsk_tsval(tcp_rsk(req)),
			req->ts_recent,
			0, &key,
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos & ~INET_ECN_MASK,
			READ_ONCE(tcp_rsk(req)->txhash));
	if (tcp_key_is_ao(&key))
		kfree(key.traffic_key);
}

/*
 * Send a SYN-ACK after having received a SYN.
 * This still operates on a request_sock only, not on a big
 * socket.
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
			      struct flowi *fl,
			      struct request_sock *req,
			      struct tcp_fastopen_cookie *foc,
			      enum tcp_synack_type synack_type,
			      struct sk_buff *syn_skb)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;
	u8 tos;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

		tos = READ_ONCE(inet_sk(sk)->tos);

		if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
			tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
			      (tos & INET_ECN_MASK);

		if (!INET_ECN_is_capable(tos) &&
		    tcp_bpf_ca_needs_ecn((struct sock *)req))
			tos |= INET_ECN_ECT_0;

		rcu_read_lock();
		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
					    rcu_dereference(ireq->ireq_opt),
					    tos);
		rcu_read_unlock();
		err = net_xmit_eval(err);
	}

	return err;
}

/*
 * IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
EXPORT_IPV6_MOD(tcp_md5_needed);

static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
{
	if (!old)
		return true;

	/* l3index always overrides non-l3index */
	if (old->l3index && new->l3index == 0)
		return false;
	if (old->l3index == 0 && new->l3index)
		return true;

	return old->prefixlen < new->prefixlen;
}

/* Find the Key structure for an address. */
struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
					   const union tcp_md5_addr *addr,
					   int family, bool any_l3index)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	const struct tcp_md5sig_info *md5sig;
	__be32 mask;
	struct tcp_md5sig_key *best_match = NULL;
	bool match;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;

	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX &&
		    key->l3index != l3index)
			continue;
		if (family == AF_INET) {
			mask = inet_make_mask(key->prefixlen);
			match = (key->addr.a4.s_addr & mask) ==
				(addr->a4.s_addr & mask);
#if IS_ENABLED(CONFIG_IPV6)
		} else if (family == AF_INET6) {
			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
						  key->prefixlen);
#endif
		} else {
			match = false;
		}

		if (match && better_md5_match(best_match, key))
			best_match = key;
	}
	return best_match;
}
EXPORT_IPV6_MOD(__tcp_md5_do_lookup);

static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
						      const union tcp_md5_addr *addr,
						      int family, u8 prefixlen,
						      int l3index, u8 flags)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	const struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
			continue;
		if (key->l3index != l3index)
			continue;
		if (!memcmp(&key->addr, addr, size) &&
		    key->prefixlen == prefixlen)
			return key;
	}
	return NULL;
}

struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
					 const struct sock *addr_sk)
{
	const union tcp_md5_addr *addr;
	int l3index;

	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
						 addr_sk->sk_bound_dev_if);
	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
}
EXPORT_IPV6_MOD(tcp_v4_md5_lookup);

static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	md5sig = kmalloc(sizeof(*md5sig), gfp);
	if (!md5sig)
		return -ENOMEM;

	sk_gso_disable(sk);
	INIT_HLIST_HEAD(&md5sig->head);
	rcu_assign_pointer(tp->md5sig_info, md5sig);
	return 0;
}

/* This can be called on a newly created socket, from other files */
static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
			    int family, u8 prefixlen, int l3index, u8 flags,
			    const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
	if (key) {
		/* Pre-existing entry - just update that one.
		 * Note that the key might be used concurrently.
		 * data_race() is telling kcsan that we do not care about
		 * key mismatches, since changing MD5 key on live flows
		 * can lead to packet drops.
		 */
		data_race(memcpy(key->key, newkey, newkeylen));

		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
		 * Also note that a reader could catch new key->keylen value
		 * but old key->key[], this is the reason we use __GFP_ZERO
		 * at sock_kmalloc() time below these lines.
		 */
		WRITE_ONCE(key->keylen, newkeylen);

		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   lockdep_sock_is_held(sk));

	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
	if (!key)
		return -ENOMEM;

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	key->prefixlen = prefixlen;
	key->l3index = l3index;
	key->flags = flags;
	memcpy(&key->addr, addr,
	       (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
								 sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}

int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, u8 prefixlen, int l3index, u8 flags,
		   const u8 *newkey, u8 newkeylen)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
		if (tcp_md5_alloc_sigpool())
			return -ENOMEM;

		if (tcp_md5sig_info_add(sk, GFP_KERNEL)) {
			tcp_md5_release_sigpool();
			return -ENOMEM;
		}

		if (!static_branch_inc(&tcp_md5_needed.key)) {
			struct tcp_md5sig_info *md5sig;

			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
			rcu_assign_pointer(tp->md5sig_info, NULL);
			kfree_rcu(md5sig, rcu);
			tcp_md5_release_sigpool();
			return -EUSERS;
		}
	}

	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
				newkey, newkeylen, GFP_KERNEL);
}
EXPORT_IPV6_MOD(tcp_md5_do_add);

int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
		     int family, u8 prefixlen, int l3index,
		     struct tcp_md5sig_key *key)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
		tcp_md5_add_sigpool();

		if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC))) {
			tcp_md5_release_sigpool();
			return -ENOMEM;
		}

		if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
			struct tcp_md5sig_info *md5sig;

			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
			net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
			rcu_assign_pointer(tp->md5sig_info, NULL);
			kfree_rcu(md5sig, rcu);
			tcp_md5_release_sigpool();
			return -EUSERS;
		}
	}

	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
				key->flags, key->key, key->keylen,
				sk_gfp_mask(sk, GFP_ATOMIC));
}
EXPORT_IPV6_MOD(tcp_md5_key_copy);

int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
		   u8 prefixlen, int l3index, u8 flags)
{
	struct tcp_md5sig_key *key;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	return 0;
}
EXPORT_IPV6_MOD(tcp_md5_do_del);

void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
	}
}
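
/*
 * Illustrative sketch (user space, not part of this file): the key list
 * managed above is populated through the TCP_MD5SIG/TCP_MD5SIG_EXT socket
 * options, which tcp_v4_parse_md5_keys() below decodes.  The peer address
 * and key bytes used here are hypothetical.
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *	struct sockaddr_in *peer = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	peer->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &peer->sin_addr);
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */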

static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
				 sockptr_t optval, int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
	const union tcp_md5_addr *addr;
	u8 prefixlen = 32;
	int l3index = 0;
	bool l3flag;
	u8 flags;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
	l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;

	if (optname == TCP_MD5SIG_EXT &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
		prefixlen = cmd.tcpm_prefixlen;
		if (prefixlen > 32)
			return -EINVAL;
	}

	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
		if (dev && netif_is_l3_master(dev))
			l3index = dev->ifindex;

		rcu_read_unlock();

		/* ok to reference set/not set outside of rcu;
		 * right now device MUST be an L3 master
		 */
		if (!dev || !l3index)
			return -EINVAL;
	}

	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;

	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	/* Don't allow keys for peers that have a matching TCP-AO key.
	 * See the comment in tcp_ao_add_cmd()
	 */
	if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false))
		return -EKEYREJECTED;

	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
			      cmd.tcpm_key, cmd.tcpm_keylen);
}

static int tcp_v4_md5_hash_headers(struct tcp_sigpool *hp,
				   __be32 daddr, __be32 saddr,
				   const struct tcphdr *th, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;
	struct tcphdr *_th;

	bp = hp->scratch;
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	_th = (struct tcphdr *)(bp + 1);
	memcpy(_th, th, sizeof(*th));
	_th->check = 0;

	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
	ahash_request_set_crypt(hp->req, &sg, NULL,
				sizeof(*bp) + sizeof(*th));
	return crypto_ahash_update(hp->req);
}

static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_sigpool hp;

	if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
		goto clear_hash_nostart;

	if (crypto_ahash_init(hp.req))
		goto clear_hash;
	if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(&hp, key))
		goto clear_hash;
	ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
	if (crypto_ahash_final(hp.req))
		goto clear_hash;

	tcp_sigpool_end(&hp);
	return 0;

clear_hash:
	tcp_sigpool_end(&hp);
clear_hash_nostart:
	memset(md5_hash, 0, 16);
	return 1;
}

int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
			const struct sock *sk,
			const struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct tcp_sigpool hp;
	__be32 saddr, daddr;

	if (sk) { /* valid for establish/request sockets */
		saddr = sk->sk_rcv_saddr;
		daddr = sk->sk_daddr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
		goto clear_hash_nostart;

	if (crypto_ahash_init(hp.req))
		goto clear_hash;

	if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, skb->len))
		goto clear_hash;
	if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(&hp, key))
		goto clear_hash;
	ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
	if (crypto_ahash_final(hp.req))
		goto clear_hash;

	tcp_sigpool_end(&hp);
	return 0;

clear_hash:
	tcp_sigpool_end(&hp);
clear_hash_nostart:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_IPV6_MOD(tcp_v4_md5_hash_skb);

#endif

static void tcp_v4_init_req(struct request_sock *req,
			    const struct sock *sk_listener,
			    struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	struct net *net = sock_net(sk_listener);

	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
}

static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
					  struct sk_buff *skb,
					  struct flowi *fl,
					  struct request_sock *req,
					  u32 tw_isn)
{
	tcp_v4_init_req(req, sk, skb);

	if (security_inet_conn_request(sk, skb, req))
		return NULL;

	return inet_csk_route_req(sk, &fl->u.ip4, req);
}

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family = PF_INET,
	.obj_size = sizeof(struct tcp_request_sock),
	.send_ack = tcp_v4_reqsk_send_ack,
	.destructor = tcp_v4_reqsk_destructor,
	.send_reset = tcp_v4_send_reset,
	.syn_ack_timeout = tcp_syn_ack_timeout,
};

const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp = TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.req_md5_lookup = tcp_v4_md5_lookup,
	.calc_md5_hash = tcp_v4_md5_hash_skb,
#endif
#ifdef CONFIG_TCP_AO
	.ao_lookup = tcp_v4_ao_lookup_rsk,
	.ao_calc_key = tcp_v4_ao_calc_key_rsk,
	.ao_synack_hash = tcp_v4_ao_synack_hash,
#endif
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq = cookie_v4_init_sequence,
#endif
	.route_req = tcp_v4_route_req,
	.init_seq = tcp_v4_init_seq,
	.init_ts_off = tcp_v4_init_ts_off,
	.send_synack = tcp_v4_send_synack,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&tcp_request_sock_ops,
				&tcp_request_sock_ipv4_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0;
}
EXPORT_IPV6_MOD(tcp_v4_conn_request);


/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req)
{
	struct inet_request_sock *ireq;
	bool found_dup_sk = false;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	const union tcp_md5_addr *addr;
	struct tcp_md5sig_key *key;
	int l3index;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp = tcp_sk(newsk);
	newinet = inet_sk(newsk);
	ireq = inet_rsk(req);
	inet_opt = rcu_dereference(ireq->ireq_opt);
	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
	newinet->mc_index = inet_iif(skb);
	newinet->mc_ttl = ip_hdr(skb)->ttl;
	newinet->rcv_tos = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	atomic_set(&newinet->inet_id, get_random_u16());

	/* Set ToS of the new socket based upon the value of incoming SYN.
	 * ECT bits are set later in tcp_init_transfer().
	 */
	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
	/* Copy over the MD5 key from the original socket */
	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	if (key && !tcp_rsk_used_ao(req)) {
		if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
			goto put_and_exit;
		sk_gso_disable(newsk);
	}
#endif
#ifdef CONFIG_TCP_AO
	if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET))
		goto put_and_exit; /* OOM, release back memory */
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
				       &found_dup_sk);
	if (likely(*own_req)) {
		tcp_move_syn(newtp, req);
		ireq->ireq_opt = NULL;
	} else {
		newinet->inet_opt = NULL;

		if (!req_unhash && found_dup_sk) {
			/* This code path should only be executed in the
			 * syncookie case
			 */
			bh_unlock_sock(newsk);
			sock_put(newsk);
			newsk = NULL;
		}
	}
	return newsk;

exit_overflow:
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	tcp_listendrop(sk);
	return NULL;
put_and_exit:
	newinet->inet_opt = NULL;
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_IPV6_MOD(tcp_v4_syn_recv_sock);

static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	const struct tcphdr *th = tcp_hdr(skb);

	if (!th->syn)
		sk = cookie_v4_check(sk, skb);
#endif
	return sk;
}

u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
			 struct tcphdr *th, u32 *cookie)
{
	u16 mss = 0;
#ifdef CONFIG_SYN_COOKIES
	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
				    &tcp_request_sock_ipv4_ops, sk, th);
	if (mss) {
		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
		tcp_synq_overflow(sk);
	}
#endif
	return mss;
}

INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
							   u32));
/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	enum skb_drop_reason reason;
	struct sock *rsk;

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst;

		dst = rcu_dereference_protected(sk->sk_rx_dst,
						lockdep_sock_is_held(sk));

		sock_rps_save_rxhash(sk, skb);
		sk_mark_napi_id(sk, skb);
		if (dst) {
			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
					     dst, 0)) {
				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
				dst_release(dst);
			}
		}
		tcp_rcv_established(sk, skb);
		return 0;
	}

	if (tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_cookie_check(sk, skb);

		if (!nsk)
			return 0;
		if (nsk != sk) {
			reason = tcp_child_process(sk, nsk, skb);
			if (reason) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	reason = tcp_rcv_state_process(sk, skb);
	if (reason) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason));
discard:
	sk_skb_reason_drop(sk, skb, reason);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
1964 */ 1965 return 0; 1966 1967 csum_err: 1968 reason = SKB_DROP_REASON_TCP_CSUM; 1969 trace_tcp_bad_csum(skb); 1970 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 1971 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 1972 goto discard; 1973 } 1974 EXPORT_SYMBOL(tcp_v4_do_rcv); 1975 1976 int tcp_v4_early_demux(struct sk_buff *skb) 1977 { 1978 struct net *net = dev_net_rcu(skb->dev); 1979 const struct iphdr *iph; 1980 const struct tcphdr *th; 1981 struct sock *sk; 1982 1983 if (skb->pkt_type != PACKET_HOST) 1984 return 0; 1985 1986 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) 1987 return 0; 1988 1989 iph = ip_hdr(skb); 1990 th = tcp_hdr(skb); 1991 1992 if (th->doff < sizeof(struct tcphdr) / 4) 1993 return 0; 1994 1995 sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, 1996 iph->saddr, th->source, 1997 iph->daddr, ntohs(th->dest), 1998 skb->skb_iif, inet_sdif(skb)); 1999 if (sk) { 2000 skb->sk = sk; 2001 skb->destructor = sock_edemux; 2002 if (sk_fullsock(sk)) { 2003 struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst); 2004 2005 if (dst) 2006 dst = dst_check(dst, 0); 2007 if (dst && 2008 sk->sk_rx_dst_ifindex == skb->skb_iif) 2009 skb_dst_set_noref(skb, dst); 2010 } 2011 } 2012 return 0; 2013 } 2014 2015 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, 2016 enum skb_drop_reason *reason) 2017 { 2018 u32 tail_gso_size, tail_gso_segs; 2019 struct skb_shared_info *shinfo; 2020 const struct tcphdr *th; 2021 struct tcphdr *thtail; 2022 struct sk_buff *tail; 2023 unsigned int hdrlen; 2024 bool fragstolen; 2025 u32 gso_segs; 2026 u32 gso_size; 2027 u64 limit; 2028 int delta; 2029 2030 /* In case all data was pulled from skb frags (in __pskb_pull_tail()), 2031 * we can fix skb->truesize to its real value to avoid future drops. 2032 * This is valid because skb is not yet charged to the socket. 2033 * It has been noticed pure SACK packets were sometimes dropped 2034 * (if cooked by drivers without copybreak feature). 2035 */ 2036 skb_condense(skb); 2037 2038 tcp_cleanup_skb(skb); 2039 2040 if (unlikely(tcp_checksum_complete(skb))) { 2041 bh_unlock_sock(sk); 2042 trace_tcp_bad_csum(skb); 2043 *reason = SKB_DROP_REASON_TCP_CSUM; 2044 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 2045 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 2046 return true; 2047 } 2048 2049 /* Attempt coalescing to last skb in backlog, even if we are 2050 * above the limits. 2051 * This is okay because skb capacity is limited to MAX_SKB_FRAGS. 
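 * Coalescing is only attempted when the new skb is contiguous in
 * sequence space with the backlog tail, the IP DS field, the TCP
 * options and the ECN/AE related flags all match, both segments carry
 * ACK and neither carries SYN, RST or URG (see the checks below).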
2052 */ 2053 th = (const struct tcphdr *)skb->data; 2054 hdrlen = th->doff * 4; 2055 2056 tail = sk->sk_backlog.tail; 2057 if (!tail) 2058 goto no_coalesce; 2059 thtail = (struct tcphdr *)tail->data; 2060 2061 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq || 2062 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield || 2063 ((TCP_SKB_CB(tail)->tcp_flags | 2064 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) || 2065 !((TCP_SKB_CB(tail)->tcp_flags & 2066 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) || 2067 ((TCP_SKB_CB(tail)->tcp_flags ^ 2068 TCP_SKB_CB(skb)->tcp_flags) & 2069 (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE)) || 2070 !tcp_skb_can_collapse_rx(tail, skb) || 2071 thtail->doff != th->doff || 2072 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th))) 2073 goto no_coalesce; 2074 2075 __skb_pull(skb, hdrlen); 2076 2077 shinfo = skb_shinfo(skb); 2078 gso_size = shinfo->gso_size ?: skb->len; 2079 gso_segs = shinfo->gso_segs ?: 1; 2080 2081 shinfo = skb_shinfo(tail); 2082 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen); 2083 tail_gso_segs = shinfo->gso_segs ?: 1; 2084 2085 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) { 2086 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq; 2087 2088 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) { 2089 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq; 2090 thtail->window = th->window; 2091 } 2092 2093 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and 2094 * thtail->fin, so that the fast path in tcp_rcv_established() 2095 * is not entered if we append a packet with a FIN. 2096 * SYN, RST, URG are not present. 2097 * ACK is set on both packets. 2098 * PSH : we do not really care in TCP stack, 2099 * at least for 'GRO' packets. 2100 */ 2101 thtail->fin |= th->fin; 2102 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; 2103 2104 if (TCP_SKB_CB(skb)->has_rxtstamp) { 2105 TCP_SKB_CB(tail)->has_rxtstamp = true; 2106 tail->tstamp = skb->tstamp; 2107 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp; 2108 } 2109 2110 /* Not as strict as GRO. We only need to carry mss max value */ 2111 shinfo->gso_size = max(gso_size, tail_gso_size); 2112 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF); 2113 2114 sk->sk_backlog.len += delta; 2115 __NET_INC_STATS(sock_net(sk), 2116 LINUX_MIB_TCPBACKLOGCOALESCE); 2117 kfree_skb_partial(skb, fragstolen); 2118 return false; 2119 } 2120 __skb_push(skb, hdrlen); 2121 2122 no_coalesce: 2123 /* sk->sk_backlog.len is reset only at the end of __release_sock(). 2124 * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach 2125 * sk_rcvbuf in normal conditions. 2126 */ 2127 limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1; 2128 2129 limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1; 2130 2131 /* Only socket owner can try to collapse/prune rx queues 2132 * to reduce memory overhead, so add a little headroom here. 2133 * Few sockets backlog are possibly concurrently non empty. 
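 * The resulting budget is roughly 2 * sk_rcvbuf + sk_sndbuf / 2 plus
 * the 64KB of slack added below, clamped to UINT_MAX.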
2134 */ 2135 limit += 64 * 1024; 2136 2137 limit = min_t(u64, limit, UINT_MAX); 2138 2139 if (unlikely(sk_add_backlog(sk, skb, limit))) { 2140 bh_unlock_sock(sk); 2141 *reason = SKB_DROP_REASON_SOCKET_BACKLOG; 2142 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP); 2143 return true; 2144 } 2145 return false; 2146 } 2147 EXPORT_IPV6_MOD(tcp_add_backlog); 2148 2149 int tcp_filter(struct sock *sk, struct sk_buff *skb) 2150 { 2151 struct tcphdr *th = (struct tcphdr *)skb->data; 2152 2153 return sk_filter_trim_cap(sk, skb, th->doff * 4); 2154 } 2155 EXPORT_IPV6_MOD(tcp_filter); 2156 2157 static void tcp_v4_restore_cb(struct sk_buff *skb) 2158 { 2159 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4, 2160 sizeof(struct inet_skb_parm)); 2161 } 2162 2163 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, 2164 const struct tcphdr *th) 2165 { 2166 /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB() 2167 * barrier() makes sure compiler wont play fool^Waliasing games. 2168 */ 2169 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb), 2170 sizeof(struct inet_skb_parm)); 2171 barrier(); 2172 2173 TCP_SKB_CB(skb)->seq = ntohl(th->seq); 2174 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + 2175 skb->len - th->doff * 4); 2176 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); 2177 TCP_SKB_CB(skb)->tcp_flags = tcp_flags_ntohs(th); 2178 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); 2179 TCP_SKB_CB(skb)->sacked = 0; 2180 TCP_SKB_CB(skb)->has_rxtstamp = 2181 skb->tstamp || skb_hwtstamps(skb)->hwtstamp; 2182 } 2183 2184 /* 2185 * From tcp_input.c 2186 */ 2187 2188 int tcp_v4_rcv(struct sk_buff *skb) 2189 { 2190 struct net *net = dev_net_rcu(skb->dev); 2191 enum skb_drop_reason drop_reason; 2192 enum tcp_tw_status tw_status; 2193 int sdif = inet_sdif(skb); 2194 int dif = inet_iif(skb); 2195 const struct iphdr *iph; 2196 const struct tcphdr *th; 2197 struct sock *sk = NULL; 2198 bool refcounted; 2199 int ret; 2200 u32 isn; 2201 2202 drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; 2203 if (skb->pkt_type != PACKET_HOST) 2204 goto discard_it; 2205 2206 /* Count it even if it's bad */ 2207 __TCP_INC_STATS(net, TCP_MIB_INSEGS); 2208 2209 if (!pskb_may_pull(skb, sizeof(struct tcphdr))) 2210 goto discard_it; 2211 2212 th = (const struct tcphdr *)skb->data; 2213 2214 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) { 2215 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL; 2216 goto bad_packet; 2217 } 2218 if (!pskb_may_pull(skb, th->doff * 4)) 2219 goto discard_it; 2220 2221 /* An explanation is required here, I think. 2222 * Packet length and doff are validated by header prediction, 2223 * provided case of th->doff==0 is eliminated. 2224 * So, we defer the checks. 
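 * (The pseudo-header checksum is set up just below; full verification
 * is typically deferred to tcp_checksum_complete() once we know which
 * socket will consume the skb.)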
*/ 2225 2226 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo)) 2227 goto csum_error; 2228 2229 th = (const struct tcphdr *)skb->data; 2230 iph = ip_hdr(skb); 2231 lookup: 2232 sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo, 2233 skb, __tcp_hdrlen(th), th->source, 2234 th->dest, sdif, &refcounted); 2235 if (!sk) 2236 goto no_tcp_socket; 2237 2238 if (sk->sk_state == TCP_TIME_WAIT) 2239 goto do_time_wait; 2240 2241 if (sk->sk_state == TCP_NEW_SYN_RECV) { 2242 struct request_sock *req = inet_reqsk(sk); 2243 bool req_stolen = false; 2244 struct sock *nsk; 2245 2246 sk = req->rsk_listener; 2247 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) 2248 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2249 else 2250 drop_reason = tcp_inbound_hash(sk, req, skb, 2251 &iph->saddr, &iph->daddr, 2252 AF_INET, dif, sdif); 2253 if (unlikely(drop_reason)) { 2254 sk_drops_add(sk, skb); 2255 reqsk_put(req); 2256 goto discard_it; 2257 } 2258 if (tcp_checksum_complete(skb)) { 2259 reqsk_put(req); 2260 goto csum_error; 2261 } 2262 if (unlikely(sk->sk_state != TCP_LISTEN)) { 2263 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb); 2264 if (!nsk) { 2265 inet_csk_reqsk_queue_drop_and_put(sk, req); 2266 goto lookup; 2267 } 2268 sk = nsk; 2269 /* reuseport_migrate_sock() has already held one sk_refcnt 2270 * before returning. 2271 */ 2272 } else { 2273 /* We own a reference on the listener, increase it again 2274 * as we might lose it too soon. 2275 */ 2276 sock_hold(sk); 2277 } 2278 refcounted = true; 2279 nsk = NULL; 2280 if (!tcp_filter(sk, skb)) { 2281 th = (const struct tcphdr *)skb->data; 2282 iph = ip_hdr(skb); 2283 tcp_v4_fill_cb(skb, iph, th); 2284 nsk = tcp_check_req(sk, skb, req, false, &req_stolen, 2285 &drop_reason); 2286 } else { 2287 drop_reason = SKB_DROP_REASON_SOCKET_FILTER; 2288 } 2289 if (!nsk) { 2290 reqsk_put(req); 2291 if (req_stolen) { 2292 /* Another cpu got exclusive access to req 2293 * and created a full blown socket. 2294 * Try to feed this packet to this socket 2295 * instead of discarding it. 
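 * To do so we restore the original IP control block, drop our
 * listener reference and redo the socket lookup.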
2296 */ 2297 tcp_v4_restore_cb(skb); 2298 sock_put(sk); 2299 goto lookup; 2300 } 2301 goto discard_and_relse; 2302 } 2303 nf_reset_ct(skb); 2304 if (nsk == sk) { 2305 reqsk_put(req); 2306 tcp_v4_restore_cb(skb); 2307 } else { 2308 drop_reason = tcp_child_process(sk, nsk, skb); 2309 if (drop_reason) { 2310 enum sk_rst_reason rst_reason; 2311 2312 rst_reason = sk_rst_convert_drop_reason(drop_reason); 2313 tcp_v4_send_reset(nsk, skb, rst_reason); 2314 goto discard_and_relse; 2315 } 2316 sock_put(sk); 2317 return 0; 2318 } 2319 } 2320 2321 process: 2322 if (static_branch_unlikely(&ip4_min_ttl)) { 2323 /* min_ttl can be changed concurrently from do_ip_setsockopt() */ 2324 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) { 2325 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 2326 drop_reason = SKB_DROP_REASON_TCP_MINTTL; 2327 goto discard_and_relse; 2328 } 2329 } 2330 2331 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { 2332 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2333 goto discard_and_relse; 2334 } 2335 2336 drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr, 2337 AF_INET, dif, sdif); 2338 if (drop_reason) 2339 goto discard_and_relse; 2340 2341 nf_reset_ct(skb); 2342 2343 if (tcp_filter(sk, skb)) { 2344 drop_reason = SKB_DROP_REASON_SOCKET_FILTER; 2345 goto discard_and_relse; 2346 } 2347 th = (const struct tcphdr *)skb->data; 2348 iph = ip_hdr(skb); 2349 tcp_v4_fill_cb(skb, iph, th); 2350 2351 skb->dev = NULL; 2352 2353 if (sk->sk_state == TCP_LISTEN) { 2354 ret = tcp_v4_do_rcv(sk, skb); 2355 goto put_and_return; 2356 } 2357 2358 sk_incoming_cpu_update(sk); 2359 2360 bh_lock_sock_nested(sk); 2361 tcp_segs_in(tcp_sk(sk), skb); 2362 ret = 0; 2363 if (!sock_owned_by_user(sk)) { 2364 ret = tcp_v4_do_rcv(sk, skb); 2365 } else { 2366 if (tcp_add_backlog(sk, skb, &drop_reason)) 2367 goto discard_and_relse; 2368 } 2369 bh_unlock_sock(sk); 2370 2371 put_and_return: 2372 if (refcounted) 2373 sock_put(sk); 2374 2375 return ret; 2376 2377 no_tcp_socket: 2378 drop_reason = SKB_DROP_REASON_NO_SOCKET; 2379 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) 2380 goto discard_it; 2381 2382 tcp_v4_fill_cb(skb, iph, th); 2383 2384 if (tcp_checksum_complete(skb)) { 2385 csum_error: 2386 drop_reason = SKB_DROP_REASON_TCP_CSUM; 2387 trace_tcp_bad_csum(skb); 2388 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); 2389 bad_packet: 2390 __TCP_INC_STATS(net, TCP_MIB_INERRS); 2391 } else { 2392 tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason)); 2393 } 2394 2395 discard_it: 2396 SKB_DR_OR(drop_reason, NOT_SPECIFIED); 2397 /* Discard frame. 
*/ 2398 sk_skb_reason_drop(sk, skb, drop_reason); 2399 return 0; 2400 2401 discard_and_relse: 2402 sk_drops_add(sk, skb); 2403 if (refcounted) 2404 sock_put(sk); 2405 goto discard_it; 2406 2407 do_time_wait: 2408 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { 2409 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2410 inet_twsk_put(inet_twsk(sk)); 2411 goto discard_it; 2412 } 2413 2414 tcp_v4_fill_cb(skb, iph, th); 2415 2416 if (tcp_checksum_complete(skb)) { 2417 inet_twsk_put(inet_twsk(sk)); 2418 goto csum_error; 2419 } 2420 2421 tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn, 2422 &drop_reason); 2423 switch (tw_status) { 2424 case TCP_TW_SYN: { 2425 struct sock *sk2 = inet_lookup_listener(net, 2426 net->ipv4.tcp_death_row.hashinfo, 2427 skb, __tcp_hdrlen(th), 2428 iph->saddr, th->source, 2429 iph->daddr, th->dest, 2430 inet_iif(skb), 2431 sdif); 2432 if (sk2) { 2433 inet_twsk_deschedule_put(inet_twsk(sk)); 2434 sk = sk2; 2435 tcp_v4_restore_cb(skb); 2436 refcounted = false; 2437 __this_cpu_write(tcp_tw_isn, isn); 2438 goto process; 2439 } 2440 } 2441 /* to ACK */ 2442 fallthrough; 2443 case TCP_TW_ACK: 2444 case TCP_TW_ACK_OOW: 2445 tcp_v4_timewait_ack(sk, skb, tw_status); 2446 break; 2447 case TCP_TW_RST: 2448 tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET); 2449 inet_twsk_deschedule_put(inet_twsk(sk)); 2450 goto discard_it; 2451 case TCP_TW_SUCCESS:; 2452 } 2453 goto discard_it; 2454 } 2455 2456 static struct timewait_sock_ops tcp_timewait_sock_ops = { 2457 .twsk_obj_size = sizeof(struct tcp_timewait_sock), 2458 .twsk_destructor= tcp_twsk_destructor, 2459 }; 2460 2461 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) 2462 { 2463 struct dst_entry *dst = skb_dst(skb); 2464 2465 if (dst && dst_hold_safe(dst)) { 2466 rcu_assign_pointer(sk->sk_rx_dst, dst); 2467 sk->sk_rx_dst_ifindex = skb->skb_iif; 2468 } 2469 } 2470 EXPORT_IPV6_MOD(inet_sk_rx_dst_set); 2471 2472 const struct inet_connection_sock_af_ops ipv4_specific = { 2473 .queue_xmit = ip_queue_xmit, 2474 .send_check = tcp_v4_send_check, 2475 .rebuild_header = inet_sk_rebuild_header, 2476 .sk_rx_dst_set = inet_sk_rx_dst_set, 2477 .conn_request = tcp_v4_conn_request, 2478 .syn_recv_sock = tcp_v4_syn_recv_sock, 2479 .net_header_len = sizeof(struct iphdr), 2480 .setsockopt = ip_setsockopt, 2481 .getsockopt = ip_getsockopt, 2482 .mtu_reduced = tcp_v4_mtu_reduced, 2483 }; 2484 EXPORT_IPV6_MOD(ipv4_specific); 2485 2486 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) 2487 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { 2488 #ifdef CONFIG_TCP_MD5SIG 2489 .md5_lookup = tcp_v4_md5_lookup, 2490 .calc_md5_hash = tcp_v4_md5_hash_skb, 2491 .md5_parse = tcp_v4_parse_md5_keys, 2492 #endif 2493 #ifdef CONFIG_TCP_AO 2494 .ao_lookup = tcp_v4_ao_lookup, 2495 .calc_ao_hash = tcp_v4_ao_hash_skb, 2496 .ao_parse = tcp_v4_parse_ao, 2497 .ao_calc_key_sk = tcp_v4_ao_calc_key_sk, 2498 #endif 2499 }; 2500 #endif 2501 2502 /* NOTE: A lot of things set to zero explicitly by call to 2503 * sk_alloc() so need not be done here. 
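 * tcp_init_sock() below performs the protocol-generic initialisation;
 * this function only wires up the IPv4-specific af_ops (and, when
 * configured, the MD5/AO specific ops).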
2504 */ 2505 static int tcp_v4_init_sock(struct sock *sk) 2506 { 2507 struct inet_connection_sock *icsk = inet_csk(sk); 2508 2509 tcp_init_sock(sk); 2510 2511 icsk->icsk_af_ops = &ipv4_specific; 2512 2513 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) 2514 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific; 2515 #endif 2516 2517 return 0; 2518 } 2519 2520 #ifdef CONFIG_TCP_MD5SIG 2521 static void tcp_md5sig_info_free_rcu(struct rcu_head *head) 2522 { 2523 struct tcp_md5sig_info *md5sig; 2524 2525 md5sig = container_of(head, struct tcp_md5sig_info, rcu); 2526 kfree(md5sig); 2527 static_branch_slow_dec_deferred(&tcp_md5_needed); 2528 tcp_md5_release_sigpool(); 2529 } 2530 #endif 2531 2532 static void tcp_release_user_frags(struct sock *sk) 2533 { 2534 #ifdef CONFIG_PAGE_POOL 2535 unsigned long index; 2536 void *netmem; 2537 2538 xa_for_each(&sk->sk_user_frags, index, netmem) 2539 WARN_ON_ONCE(!napi_pp_put_page((__force netmem_ref)netmem)); 2540 #endif 2541 } 2542 2543 void tcp_v4_destroy_sock(struct sock *sk) 2544 { 2545 struct tcp_sock *tp = tcp_sk(sk); 2546 2547 tcp_release_user_frags(sk); 2548 2549 xa_destroy(&sk->sk_user_frags); 2550 2551 trace_tcp_destroy_sock(sk); 2552 2553 tcp_clear_xmit_timers(sk); 2554 2555 tcp_cleanup_congestion_control(sk); 2556 2557 tcp_cleanup_ulp(sk); 2558 2559 /* Cleanup up the write buffer. */ 2560 tcp_write_queue_purge(sk); 2561 2562 /* Check if we want to disable active TFO */ 2563 tcp_fastopen_active_disable_ofo_check(sk); 2564 2565 /* Cleans up our, hopefully empty, out_of_order_queue. */ 2566 skb_rbtree_purge(&tp->out_of_order_queue); 2567 2568 #ifdef CONFIG_TCP_MD5SIG 2569 /* Clean up the MD5 key list, if any */ 2570 if (tp->md5sig_info) { 2571 struct tcp_md5sig_info *md5sig; 2572 2573 md5sig = rcu_dereference_protected(tp->md5sig_info, 1); 2574 tcp_clear_md5_list(sk); 2575 call_rcu(&md5sig->rcu, tcp_md5sig_info_free_rcu); 2576 rcu_assign_pointer(tp->md5sig_info, NULL); 2577 } 2578 #endif 2579 tcp_ao_destroy_sock(sk, false); 2580 2581 /* Clean up a referenced TCP bind bucket. */ 2582 if (inet_csk(sk)->icsk_bind_hash) 2583 inet_put_port(sk); 2584 2585 BUG_ON(rcu_access_pointer(tp->fastopen_rsk)); 2586 2587 /* If socket is aborted during connect operation */ 2588 tcp_free_fastopen_req(tp); 2589 tcp_fastopen_destroy_cipher(sk); 2590 tcp_saved_syn_free(tp); 2591 2592 sk_sockets_allocated_dec(sk); 2593 } 2594 EXPORT_IPV6_MOD(tcp_v4_destroy_sock); 2595 2596 #ifdef CONFIG_PROC_FS 2597 /* Proc filesystem TCP sock list dumping. */ 2598 2599 static unsigned short seq_file_family(const struct seq_file *seq); 2600 2601 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk) 2602 { 2603 unsigned short family = seq_file_family(seq); 2604 2605 /* AF_UNSPEC is used as a match all */ 2606 return ((family == AF_UNSPEC || family == sk->sk_family) && 2607 net_eq(sock_net(sk), seq_file_net(seq))); 2608 } 2609 2610 /* Find a non empty bucket (starting from st->bucket) 2611 * and return the first sk from it. 
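 * On success the matching bucket's lhash2 lock is left held; it is
 * released by listening_get_next() when moving on, or by tcp_seq_stop().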
2612 */ 2613 static void *listening_get_first(struct seq_file *seq) 2614 { 2615 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2616 struct tcp_iter_state *st = seq->private; 2617 2618 st->offset = 0; 2619 for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) { 2620 struct inet_listen_hashbucket *ilb2; 2621 struct hlist_nulls_node *node; 2622 struct sock *sk; 2623 2624 ilb2 = &hinfo->lhash2[st->bucket]; 2625 if (hlist_nulls_empty(&ilb2->nulls_head)) 2626 continue; 2627 2628 spin_lock(&ilb2->lock); 2629 sk_nulls_for_each(sk, node, &ilb2->nulls_head) { 2630 if (seq_sk_match(seq, sk)) 2631 return sk; 2632 } 2633 spin_unlock(&ilb2->lock); 2634 } 2635 2636 return NULL; 2637 } 2638 2639 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket). 2640 * If "cur" is the last one in the st->bucket, 2641 * call listening_get_first() to return the first sk of the next 2642 * non empty bucket. 2643 */ 2644 static void *listening_get_next(struct seq_file *seq, void *cur) 2645 { 2646 struct tcp_iter_state *st = seq->private; 2647 struct inet_listen_hashbucket *ilb2; 2648 struct hlist_nulls_node *node; 2649 struct inet_hashinfo *hinfo; 2650 struct sock *sk = cur; 2651 2652 ++st->num; 2653 ++st->offset; 2654 2655 sk = sk_nulls_next(sk); 2656 sk_nulls_for_each_from(sk, node) { 2657 if (seq_sk_match(seq, sk)) 2658 return sk; 2659 } 2660 2661 hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2662 ilb2 = &hinfo->lhash2[st->bucket]; 2663 spin_unlock(&ilb2->lock); 2664 ++st->bucket; 2665 return listening_get_first(seq); 2666 } 2667 2668 static void *listening_get_idx(struct seq_file *seq, loff_t *pos) 2669 { 2670 struct tcp_iter_state *st = seq->private; 2671 void *rc; 2672 2673 st->bucket = 0; 2674 st->offset = 0; 2675 rc = listening_get_first(seq); 2676 2677 while (rc && *pos) { 2678 rc = listening_get_next(seq, rc); 2679 --*pos; 2680 } 2681 return rc; 2682 } 2683 2684 static inline bool empty_bucket(struct inet_hashinfo *hinfo, 2685 const struct tcp_iter_state *st) 2686 { 2687 return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain); 2688 } 2689 2690 /* 2691 * Get first established socket starting from bucket given in st->bucket. 2692 * If st->bucket is zero, the very first socket in the hash is returned. 
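 * As in the listening walk, the ehash bucket lock is held (with BHs
 * disabled) when a socket is returned; established_get_next() or
 * tcp_seq_stop() drops it.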
2693 */ 2694 static void *established_get_first(struct seq_file *seq) 2695 { 2696 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2697 struct tcp_iter_state *st = seq->private; 2698 2699 st->offset = 0; 2700 for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) { 2701 struct sock *sk; 2702 struct hlist_nulls_node *node; 2703 spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket); 2704 2705 cond_resched(); 2706 2707 /* Lockless fast path for the common case of empty buckets */ 2708 if (empty_bucket(hinfo, st)) 2709 continue; 2710 2711 spin_lock_bh(lock); 2712 sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) { 2713 if (seq_sk_match(seq, sk)) 2714 return sk; 2715 } 2716 spin_unlock_bh(lock); 2717 } 2718 2719 return NULL; 2720 } 2721 2722 static void *established_get_next(struct seq_file *seq, void *cur) 2723 { 2724 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2725 struct tcp_iter_state *st = seq->private; 2726 struct hlist_nulls_node *node; 2727 struct sock *sk = cur; 2728 2729 ++st->num; 2730 ++st->offset; 2731 2732 sk = sk_nulls_next(sk); 2733 2734 sk_nulls_for_each_from(sk, node) { 2735 if (seq_sk_match(seq, sk)) 2736 return sk; 2737 } 2738 2739 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 2740 ++st->bucket; 2741 return established_get_first(seq); 2742 } 2743 2744 static void *established_get_idx(struct seq_file *seq, loff_t pos) 2745 { 2746 struct tcp_iter_state *st = seq->private; 2747 void *rc; 2748 2749 st->bucket = 0; 2750 rc = established_get_first(seq); 2751 2752 while (rc && pos) { 2753 rc = established_get_next(seq, rc); 2754 --pos; 2755 } 2756 return rc; 2757 } 2758 2759 static void *tcp_get_idx(struct seq_file *seq, loff_t pos) 2760 { 2761 void *rc; 2762 struct tcp_iter_state *st = seq->private; 2763 2764 st->state = TCP_SEQ_STATE_LISTENING; 2765 rc = listening_get_idx(seq, &pos); 2766 2767 if (!rc) { 2768 st->state = TCP_SEQ_STATE_ESTABLISHED; 2769 rc = established_get_idx(seq, pos); 2770 } 2771 2772 return rc; 2773 } 2774 2775 static void *tcp_seek_last_pos(struct seq_file *seq) 2776 { 2777 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2778 struct tcp_iter_state *st = seq->private; 2779 int bucket = st->bucket; 2780 int offset = st->offset; 2781 int orig_num = st->num; 2782 void *rc = NULL; 2783 2784 switch (st->state) { 2785 case TCP_SEQ_STATE_LISTENING: 2786 if (st->bucket > hinfo->lhash2_mask) 2787 break; 2788 rc = listening_get_first(seq); 2789 while (offset-- && rc && bucket == st->bucket) 2790 rc = listening_get_next(seq, rc); 2791 if (rc) 2792 break; 2793 st->bucket = 0; 2794 st->state = TCP_SEQ_STATE_ESTABLISHED; 2795 fallthrough; 2796 case TCP_SEQ_STATE_ESTABLISHED: 2797 if (st->bucket > hinfo->ehash_mask) 2798 break; 2799 rc = established_get_first(seq); 2800 while (offset-- && rc && bucket == st->bucket) 2801 rc = established_get_next(seq, rc); 2802 } 2803 2804 st->num = orig_num; 2805 2806 return rc; 2807 } 2808 2809 void *tcp_seq_start(struct seq_file *seq, loff_t *pos) 2810 { 2811 struct tcp_iter_state *st = seq->private; 2812 void *rc; 2813 2814 if (*pos && *pos == st->last_pos) { 2815 rc = tcp_seek_last_pos(seq); 2816 if (rc) 2817 goto out; 2818 } 2819 2820 st->state = TCP_SEQ_STATE_LISTENING; 2821 st->num = 0; 2822 st->bucket = 0; 2823 st->offset = 0; 2824 rc = *pos ? 
tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; 2825 2826 out: 2827 st->last_pos = *pos; 2828 return rc; 2829 } 2830 EXPORT_IPV6_MOD(tcp_seq_start); 2831 2832 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2833 { 2834 struct tcp_iter_state *st = seq->private; 2835 void *rc = NULL; 2836 2837 if (v == SEQ_START_TOKEN) { 2838 rc = tcp_get_idx(seq, 0); 2839 goto out; 2840 } 2841 2842 switch (st->state) { 2843 case TCP_SEQ_STATE_LISTENING: 2844 rc = listening_get_next(seq, v); 2845 if (!rc) { 2846 st->state = TCP_SEQ_STATE_ESTABLISHED; 2847 st->bucket = 0; 2848 st->offset = 0; 2849 rc = established_get_first(seq); 2850 } 2851 break; 2852 case TCP_SEQ_STATE_ESTABLISHED: 2853 rc = established_get_next(seq, v); 2854 break; 2855 } 2856 out: 2857 ++*pos; 2858 st->last_pos = *pos; 2859 return rc; 2860 } 2861 EXPORT_IPV6_MOD(tcp_seq_next); 2862 2863 void tcp_seq_stop(struct seq_file *seq, void *v) 2864 { 2865 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2866 struct tcp_iter_state *st = seq->private; 2867 2868 switch (st->state) { 2869 case TCP_SEQ_STATE_LISTENING: 2870 if (v != SEQ_START_TOKEN) 2871 spin_unlock(&hinfo->lhash2[st->bucket].lock); 2872 break; 2873 case TCP_SEQ_STATE_ESTABLISHED: 2874 if (v) 2875 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 2876 break; 2877 } 2878 } 2879 EXPORT_IPV6_MOD(tcp_seq_stop); 2880 2881 static void get_openreq4(const struct request_sock *req, 2882 struct seq_file *f, int i) 2883 { 2884 const struct inet_request_sock *ireq = inet_rsk(req); 2885 long delta = req->rsk_timer.expires - jiffies; 2886 2887 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2888 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK", 2889 i, 2890 ireq->ir_loc_addr, 2891 ireq->ir_num, 2892 ireq->ir_rmt_addr, 2893 ntohs(ireq->ir_rmt_port), 2894 TCP_SYN_RECV, 2895 0, 0, /* could print option size, but that is af dependent. 
*/ 2896 1, /* timers active (only the expire timer) */ 2897 jiffies_delta_to_clock_t(delta), 2898 req->num_timeout, 2899 from_kuid_munged(seq_user_ns(f), 2900 sk_uid(req->rsk_listener)), 2901 0, /* non standard timer */ 2902 0, /* open_requests have no inode */ 2903 0, 2904 req); 2905 } 2906 2907 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) 2908 { 2909 int timer_active; 2910 unsigned long timer_expires; 2911 const struct tcp_sock *tp = tcp_sk(sk); 2912 const struct inet_connection_sock *icsk = inet_csk(sk); 2913 const struct inet_sock *inet = inet_sk(sk); 2914 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; 2915 __be32 dest = inet->inet_daddr; 2916 __be32 src = inet->inet_rcv_saddr; 2917 __u16 destp = ntohs(inet->inet_dport); 2918 __u16 srcp = ntohs(inet->inet_sport); 2919 u8 icsk_pending; 2920 int rx_queue; 2921 int state; 2922 2923 icsk_pending = smp_load_acquire(&icsk->icsk_pending); 2924 if (icsk_pending == ICSK_TIME_RETRANS || 2925 icsk_pending == ICSK_TIME_REO_TIMEOUT || 2926 icsk_pending == ICSK_TIME_LOSS_PROBE) { 2927 timer_active = 1; 2928 timer_expires = icsk_timeout(icsk); 2929 } else if (icsk_pending == ICSK_TIME_PROBE0) { 2930 timer_active = 4; 2931 timer_expires = icsk_timeout(icsk); 2932 } else if (timer_pending(&sk->sk_timer)) { 2933 timer_active = 2; 2934 timer_expires = sk->sk_timer.expires; 2935 } else { 2936 timer_active = 0; 2937 timer_expires = jiffies; 2938 } 2939 2940 state = inet_sk_state_load(sk); 2941 if (state == TCP_LISTEN) 2942 rx_queue = READ_ONCE(sk->sk_ack_backlog); 2943 else 2944 /* Because we don't lock the socket, 2945 * we might find a transient negative value. 2946 */ 2947 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) - 2948 READ_ONCE(tp->copied_seq), 0); 2949 2950 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " 2951 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d", 2952 i, src, srcp, dest, destp, state, 2953 READ_ONCE(tp->write_seq) - tp->snd_una, 2954 rx_queue, 2955 timer_active, 2956 jiffies_delta_to_clock_t(timer_expires - jiffies), 2957 icsk->icsk_retransmits, 2958 from_kuid_munged(seq_user_ns(f), sk_uid(sk)), 2959 icsk->icsk_probes_out, 2960 sock_i_ino(sk), 2961 refcount_read(&sk->sk_refcnt), sk, 2962 jiffies_to_clock_t(icsk->icsk_rto), 2963 jiffies_to_clock_t(icsk->icsk_ack.ato), 2964 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk), 2965 tcp_snd_cwnd(tp), 2966 state == TCP_LISTEN ? 2967 fastopenq->max_qlen : 2968 (tcp_in_initial_slowstart(tp) ? 
-1 : tp->snd_ssthresh)); 2969 } 2970 2971 static void get_timewait4_sock(const struct inet_timewait_sock *tw, 2972 struct seq_file *f, int i) 2973 { 2974 long delta = tw->tw_timer.expires - jiffies; 2975 __be32 dest, src; 2976 __u16 destp, srcp; 2977 2978 dest = tw->tw_daddr; 2979 src = tw->tw_rcv_saddr; 2980 destp = ntohs(tw->tw_dport); 2981 srcp = ntohs(tw->tw_sport); 2982 2983 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2984 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK", 2985 i, src, srcp, dest, destp, READ_ONCE(tw->tw_substate), 0, 0, 2986 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, 2987 refcount_read(&tw->tw_refcnt), tw); 2988 } 2989 2990 #define TMPSZ 150 2991 2992 static int tcp4_seq_show(struct seq_file *seq, void *v) 2993 { 2994 struct tcp_iter_state *st; 2995 struct sock *sk = v; 2996 2997 seq_setwidth(seq, TMPSZ - 1); 2998 if (v == SEQ_START_TOKEN) { 2999 seq_puts(seq, " sl local_address rem_address st tx_queue " 3000 "rx_queue tr tm->when retrnsmt uid timeout " 3001 "inode"); 3002 goto out; 3003 } 3004 st = seq->private; 3005 3006 if (sk->sk_state == TCP_TIME_WAIT) 3007 get_timewait4_sock(v, seq, st->num); 3008 else if (sk->sk_state == TCP_NEW_SYN_RECV) 3009 get_openreq4(v, seq, st->num); 3010 else 3011 get_tcp4_sock(v, seq, st->num); 3012 out: 3013 seq_pad(seq, '\n'); 3014 return 0; 3015 } 3016 3017 #ifdef CONFIG_BPF_SYSCALL 3018 union bpf_tcp_iter_batch_item { 3019 struct sock *sk; 3020 __u64 cookie; 3021 }; 3022 3023 struct bpf_tcp_iter_state { 3024 struct tcp_iter_state state; 3025 unsigned int cur_sk; 3026 unsigned int end_sk; 3027 unsigned int max_sk; 3028 union bpf_tcp_iter_batch_item *batch; 3029 }; 3030 3031 struct bpf_iter__tcp { 3032 __bpf_md_ptr(struct bpf_iter_meta *, meta); 3033 __bpf_md_ptr(struct sock_common *, sk_common); 3034 uid_t uid __aligned(8); 3035 }; 3036 3037 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, 3038 struct sock_common *sk_common, uid_t uid) 3039 { 3040 struct bpf_iter__tcp ctx; 3041 3042 meta->seq_num--; /* skip SEQ_START_TOKEN */ 3043 ctx.meta = meta; 3044 ctx.sk_common = sk_common; 3045 ctx.uid = uid; 3046 return bpf_iter_run_prog(prog, &ctx); 3047 } 3048 3049 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter) 3050 { 3051 union bpf_tcp_iter_batch_item *item; 3052 unsigned int cur_sk = iter->cur_sk; 3053 __u64 cookie; 3054 3055 /* Remember the cookies of the sockets we haven't seen yet, so we can 3056 * pick up where we left off next time around. 
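 * Each batch entry is a union of sk pointer and cookie, so the slot is
 * simply rewritten in place once its socket reference has been dropped.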
3057 */ 3058 while (cur_sk < iter->end_sk) { 3059 item = &iter->batch[cur_sk++]; 3060 cookie = sock_gen_cookie(item->sk); 3061 sock_gen_put(item->sk); 3062 item->cookie = cookie; 3063 } 3064 } 3065 3066 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter, 3067 unsigned int new_batch_sz, gfp_t flags) 3068 { 3069 union bpf_tcp_iter_batch_item *new_batch; 3070 3071 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, 3072 flags | __GFP_NOWARN); 3073 if (!new_batch) 3074 return -ENOMEM; 3075 3076 memcpy(new_batch, iter->batch, sizeof(*iter->batch) * iter->end_sk); 3077 kvfree(iter->batch); 3078 iter->batch = new_batch; 3079 iter->max_sk = new_batch_sz; 3080 3081 return 0; 3082 } 3083 3084 static struct sock *bpf_iter_tcp_resume_bucket(struct sock *first_sk, 3085 union bpf_tcp_iter_batch_item *cookies, 3086 int n_cookies) 3087 { 3088 struct hlist_nulls_node *node; 3089 struct sock *sk; 3090 int i; 3091 3092 for (i = 0; i < n_cookies; i++) { 3093 sk = first_sk; 3094 sk_nulls_for_each_from(sk, node) 3095 if (cookies[i].cookie == atomic64_read(&sk->sk_cookie)) 3096 return sk; 3097 } 3098 3099 return NULL; 3100 } 3101 3102 static struct sock *bpf_iter_tcp_resume_listening(struct seq_file *seq) 3103 { 3104 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 3105 struct bpf_tcp_iter_state *iter = seq->private; 3106 struct tcp_iter_state *st = &iter->state; 3107 unsigned int find_cookie = iter->cur_sk; 3108 unsigned int end_cookie = iter->end_sk; 3109 int resume_bucket = st->bucket; 3110 struct sock *sk; 3111 3112 if (end_cookie && find_cookie == end_cookie) 3113 ++st->bucket; 3114 3115 sk = listening_get_first(seq); 3116 iter->cur_sk = 0; 3117 iter->end_sk = 0; 3118 3119 if (sk && st->bucket == resume_bucket && end_cookie) { 3120 sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie], 3121 end_cookie - find_cookie); 3122 if (!sk) { 3123 spin_unlock(&hinfo->lhash2[st->bucket].lock); 3124 ++st->bucket; 3125 sk = listening_get_first(seq); 3126 } 3127 } 3128 3129 return sk; 3130 } 3131 3132 static struct sock *bpf_iter_tcp_resume_established(struct seq_file *seq) 3133 { 3134 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 3135 struct bpf_tcp_iter_state *iter = seq->private; 3136 struct tcp_iter_state *st = &iter->state; 3137 unsigned int find_cookie = iter->cur_sk; 3138 unsigned int end_cookie = iter->end_sk; 3139 int resume_bucket = st->bucket; 3140 struct sock *sk; 3141 3142 if (end_cookie && find_cookie == end_cookie) 3143 ++st->bucket; 3144 3145 sk = established_get_first(seq); 3146 iter->cur_sk = 0; 3147 iter->end_sk = 0; 3148 3149 if (sk && st->bucket == resume_bucket && end_cookie) { 3150 sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie], 3151 end_cookie - find_cookie); 3152 if (!sk) { 3153 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 3154 ++st->bucket; 3155 sk = established_get_first(seq); 3156 } 3157 } 3158 3159 return sk; 3160 } 3161 3162 static struct sock *bpf_iter_tcp_resume(struct seq_file *seq) 3163 { 3164 struct bpf_tcp_iter_state *iter = seq->private; 3165 struct tcp_iter_state *st = &iter->state; 3166 struct sock *sk = NULL; 3167 3168 switch (st->state) { 3169 case TCP_SEQ_STATE_LISTENING: 3170 sk = bpf_iter_tcp_resume_listening(seq); 3171 if (sk) 3172 break; 3173 st->bucket = 0; 3174 st->state = TCP_SEQ_STATE_ESTABLISHED; 3175 fallthrough; 3176 case TCP_SEQ_STATE_ESTABLISHED: 3177 sk = bpf_iter_tcp_resume_established(seq); 3178 break; 3179 } 3180 3181 return sk; 3182 } 3183 3184 
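/* The two helpers below walk one listening / established bucket and fill
 * iter->batch with a reference on every matching socket. They return the
 * number of matching sockets found in the bucket, which may exceed what was
 * actually batched; when the batch runs out of room, *start_sk remembers the
 * first socket that did not fit so the caller can resize and retry.
 */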
static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq, 3185 struct sock **start_sk) 3186 { 3187 struct bpf_tcp_iter_state *iter = seq->private; 3188 struct hlist_nulls_node *node; 3189 unsigned int expected = 1; 3190 struct sock *sk; 3191 3192 sock_hold(*start_sk); 3193 iter->batch[iter->end_sk++].sk = *start_sk; 3194 3195 sk = sk_nulls_next(*start_sk); 3196 *start_sk = NULL; 3197 sk_nulls_for_each_from(sk, node) { 3198 if (seq_sk_match(seq, sk)) { 3199 if (iter->end_sk < iter->max_sk) { 3200 sock_hold(sk); 3201 iter->batch[iter->end_sk++].sk = sk; 3202 } else if (!*start_sk) { 3203 /* Remember where we left off. */ 3204 *start_sk = sk; 3205 } 3206 expected++; 3207 } 3208 } 3209 3210 return expected; 3211 } 3212 3213 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq, 3214 struct sock **start_sk) 3215 { 3216 struct bpf_tcp_iter_state *iter = seq->private; 3217 struct hlist_nulls_node *node; 3218 unsigned int expected = 1; 3219 struct sock *sk; 3220 3221 sock_hold(*start_sk); 3222 iter->batch[iter->end_sk++].sk = *start_sk; 3223 3224 sk = sk_nulls_next(*start_sk); 3225 *start_sk = NULL; 3226 sk_nulls_for_each_from(sk, node) { 3227 if (seq_sk_match(seq, sk)) { 3228 if (iter->end_sk < iter->max_sk) { 3229 sock_hold(sk); 3230 iter->batch[iter->end_sk++].sk = sk; 3231 } else if (!*start_sk) { 3232 /* Remember where we left off. */ 3233 *start_sk = sk; 3234 } 3235 expected++; 3236 } 3237 } 3238 3239 return expected; 3240 } 3241 3242 static unsigned int bpf_iter_fill_batch(struct seq_file *seq, 3243 struct sock **start_sk) 3244 { 3245 struct bpf_tcp_iter_state *iter = seq->private; 3246 struct tcp_iter_state *st = &iter->state; 3247 3248 if (st->state == TCP_SEQ_STATE_LISTENING) 3249 return bpf_iter_tcp_listening_batch(seq, start_sk); 3250 else 3251 return bpf_iter_tcp_established_batch(seq, start_sk); 3252 } 3253 3254 static void bpf_iter_tcp_unlock_bucket(struct seq_file *seq) 3255 { 3256 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 3257 struct bpf_tcp_iter_state *iter = seq->private; 3258 struct tcp_iter_state *st = &iter->state; 3259 3260 if (st->state == TCP_SEQ_STATE_LISTENING) 3261 spin_unlock(&hinfo->lhash2[st->bucket].lock); 3262 else 3263 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 3264 } 3265 3266 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq) 3267 { 3268 struct bpf_tcp_iter_state *iter = seq->private; 3269 unsigned int expected; 3270 struct sock *sk; 3271 int err; 3272 3273 sk = bpf_iter_tcp_resume(seq); 3274 if (!sk) 3275 return NULL; /* Done */ 3276 3277 expected = bpf_iter_fill_batch(seq, &sk); 3278 if (likely(iter->end_sk == expected)) 3279 goto done; 3280 3281 /* Batch size was too small. */ 3282 bpf_iter_tcp_unlock_bucket(seq); 3283 bpf_iter_tcp_put_batch(iter); 3284 err = bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2, 3285 GFP_USER); 3286 if (err) 3287 return ERR_PTR(err); 3288 3289 sk = bpf_iter_tcp_resume(seq); 3290 if (!sk) 3291 return NULL; /* Done */ 3292 3293 expected = bpf_iter_fill_batch(seq, &sk); 3294 if (likely(iter->end_sk == expected)) 3295 goto done; 3296 3297 /* Batch size was still too small. Hold onto the lock while we try 3298 * again with a larger batch to make sure the current bucket's size 3299 * does not change in the meantime. 
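 * GFP_NOWAIT is used for this retry because the bucket lock is still
 * held, so the allocation must not sleep.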
3300 */ 3301 err = bpf_iter_tcp_realloc_batch(iter, expected, GFP_NOWAIT); 3302 if (err) { 3303 bpf_iter_tcp_unlock_bucket(seq); 3304 return ERR_PTR(err); 3305 } 3306 3307 expected = bpf_iter_fill_batch(seq, &sk); 3308 WARN_ON_ONCE(iter->end_sk != expected); 3309 done: 3310 bpf_iter_tcp_unlock_bucket(seq); 3311 return iter->batch[0].sk; 3312 } 3313 3314 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos) 3315 { 3316 /* bpf iter does not support lseek, so it always 3317 * continue from where it was stop()-ped. 3318 */ 3319 if (*pos) 3320 return bpf_iter_tcp_batch(seq); 3321 3322 return SEQ_START_TOKEN; 3323 } 3324 3325 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3326 { 3327 struct bpf_tcp_iter_state *iter = seq->private; 3328 struct tcp_iter_state *st = &iter->state; 3329 struct sock *sk; 3330 3331 /* Whenever seq_next() is called, the iter->cur_sk is 3332 * done with seq_show(), so advance to the next sk in 3333 * the batch. 3334 */ 3335 if (iter->cur_sk < iter->end_sk) { 3336 /* Keeping st->num consistent in tcp_iter_state. 3337 * bpf_iter_tcp does not use st->num. 3338 * meta.seq_num is used instead. 3339 */ 3340 st->num++; 3341 sock_gen_put(iter->batch[iter->cur_sk++].sk); 3342 } 3343 3344 if (iter->cur_sk < iter->end_sk) 3345 sk = iter->batch[iter->cur_sk].sk; 3346 else 3347 sk = bpf_iter_tcp_batch(seq); 3348 3349 ++*pos; 3350 /* Keeping st->last_pos consistent in tcp_iter_state. 3351 * bpf iter does not do lseek, so st->last_pos always equals to *pos. 3352 */ 3353 st->last_pos = *pos; 3354 return sk; 3355 } 3356 3357 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v) 3358 { 3359 struct bpf_iter_meta meta; 3360 struct bpf_prog *prog; 3361 struct sock *sk = v; 3362 uid_t uid; 3363 int ret; 3364 3365 if (v == SEQ_START_TOKEN) 3366 return 0; 3367 3368 if (sk_fullsock(sk)) 3369 lock_sock(sk); 3370 3371 if (unlikely(sk_unhashed(sk))) { 3372 ret = SEQ_SKIP; 3373 goto unlock; 3374 } 3375 3376 if (sk->sk_state == TCP_TIME_WAIT) { 3377 uid = 0; 3378 } else if (sk->sk_state == TCP_NEW_SYN_RECV) { 3379 const struct request_sock *req = v; 3380 3381 uid = from_kuid_munged(seq_user_ns(seq), 3382 sk_uid(req->rsk_listener)); 3383 } else { 3384 uid = from_kuid_munged(seq_user_ns(seq), sk_uid(sk)); 3385 } 3386 3387 meta.seq = seq; 3388 prog = bpf_iter_get_info(&meta, false); 3389 ret = tcp_prog_seq_show(prog, &meta, v, uid); 3390 3391 unlock: 3392 if (sk_fullsock(sk)) 3393 release_sock(sk); 3394 return ret; 3395 3396 } 3397 3398 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v) 3399 { 3400 struct bpf_tcp_iter_state *iter = seq->private; 3401 struct bpf_iter_meta meta; 3402 struct bpf_prog *prog; 3403 3404 if (!v) { 3405 meta.seq = seq; 3406 prog = bpf_iter_get_info(&meta, true); 3407 if (prog) 3408 (void)tcp_prog_seq_show(prog, &meta, v, 0); 3409 } 3410 3411 if (iter->cur_sk < iter->end_sk) 3412 bpf_iter_tcp_put_batch(iter); 3413 } 3414 3415 static const struct seq_operations bpf_iter_tcp_seq_ops = { 3416 .show = bpf_iter_tcp_seq_show, 3417 .start = bpf_iter_tcp_seq_start, 3418 .next = bpf_iter_tcp_seq_next, 3419 .stop = bpf_iter_tcp_seq_stop, 3420 }; 3421 #endif 3422 static unsigned short seq_file_family(const struct seq_file *seq) 3423 { 3424 const struct tcp_seq_afinfo *afinfo; 3425 3426 #ifdef CONFIG_BPF_SYSCALL 3427 /* Iterated from bpf_iter. Let the bpf prog to filter instead. 
*/ 3428 if (seq->op == &bpf_iter_tcp_seq_ops) 3429 return AF_UNSPEC; 3430 #endif 3431 3432 /* Iterated from proc fs */ 3433 afinfo = pde_data(file_inode(seq->file)); 3434 return afinfo->family; 3435 } 3436 3437 static const struct seq_operations tcp4_seq_ops = { 3438 .show = tcp4_seq_show, 3439 .start = tcp_seq_start, 3440 .next = tcp_seq_next, 3441 .stop = tcp_seq_stop, 3442 }; 3443 3444 static struct tcp_seq_afinfo tcp4_seq_afinfo = { 3445 .family = AF_INET, 3446 }; 3447 3448 static int __net_init tcp4_proc_init_net(struct net *net) 3449 { 3450 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops, 3451 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo)) 3452 return -ENOMEM; 3453 return 0; 3454 } 3455 3456 static void __net_exit tcp4_proc_exit_net(struct net *net) 3457 { 3458 remove_proc_entry("tcp", net->proc_net); 3459 } 3460 3461 static struct pernet_operations tcp4_net_ops = { 3462 .init = tcp4_proc_init_net, 3463 .exit = tcp4_proc_exit_net, 3464 }; 3465 3466 int __init tcp4_proc_init(void) 3467 { 3468 return register_pernet_subsys(&tcp4_net_ops); 3469 } 3470 3471 void tcp4_proc_exit(void) 3472 { 3473 unregister_pernet_subsys(&tcp4_net_ops); 3474 } 3475 #endif /* CONFIG_PROC_FS */ 3476 3477 /* @wake is one when sk_stream_write_space() calls us. 3478 * This sends EPOLLOUT only if notsent_bytes is half the limit. 3479 * This mimics the strategy used in sock_def_write_space(). 3480 */ 3481 bool tcp_stream_memory_free(const struct sock *sk, int wake) 3482 { 3483 const struct tcp_sock *tp = tcp_sk(sk); 3484 u32 notsent_bytes = READ_ONCE(tp->write_seq) - 3485 READ_ONCE(tp->snd_nxt); 3486 3487 return (notsent_bytes << wake) < tcp_notsent_lowat(tp); 3488 } 3489 EXPORT_SYMBOL(tcp_stream_memory_free); 3490 3491 struct proto tcp_prot = { 3492 .name = "TCP", 3493 .owner = THIS_MODULE, 3494 .close = tcp_close, 3495 .pre_connect = tcp_v4_pre_connect, 3496 .connect = tcp_v4_connect, 3497 .disconnect = tcp_disconnect, 3498 .accept = inet_csk_accept, 3499 .ioctl = tcp_ioctl, 3500 .init = tcp_v4_init_sock, 3501 .destroy = tcp_v4_destroy_sock, 3502 .shutdown = tcp_shutdown, 3503 .setsockopt = tcp_setsockopt, 3504 .getsockopt = tcp_getsockopt, 3505 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt, 3506 .keepalive = tcp_set_keepalive, 3507 .recvmsg = tcp_recvmsg, 3508 .sendmsg = tcp_sendmsg, 3509 .splice_eof = tcp_splice_eof, 3510 .backlog_rcv = tcp_v4_do_rcv, 3511 .release_cb = tcp_release_cb, 3512 .hash = inet_hash, 3513 .unhash = inet_unhash, 3514 .get_port = inet_csk_get_port, 3515 .put_port = inet_put_port, 3516 #ifdef CONFIG_BPF_SYSCALL 3517 .psock_update_sk_prot = tcp_bpf_update_proto, 3518 #endif 3519 .enter_memory_pressure = tcp_enter_memory_pressure, 3520 .leave_memory_pressure = tcp_leave_memory_pressure, 3521 .stream_memory_free = tcp_stream_memory_free, 3522 .sockets_allocated = &tcp_sockets_allocated, 3523 .orphan_count = &tcp_orphan_count, 3524 3525 .memory_allocated = &net_aligned_data.tcp_memory_allocated, 3526 .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, 3527 3528 .memory_pressure = &tcp_memory_pressure, 3529 .sysctl_mem = sysctl_tcp_mem, 3530 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), 3531 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), 3532 .max_header = MAX_TCP_HEADER, 3533 .obj_size = sizeof(struct tcp_sock), 3534 .slab_flags = SLAB_TYPESAFE_BY_RCU, 3535 .twsk_prot = &tcp_timewait_sock_ops, 3536 .rsk_prot = &tcp_request_sock_ops, 3537 .h.hashinfo = NULL, 3538 .no_autobind = true, 3539 .diag_destroy = tcp_abort, 3540 }; 3541 
EXPORT_SYMBOL(tcp_prot); 3542 3543 static void __net_exit tcp_sk_exit(struct net *net) 3544 { 3545 if (net->ipv4.tcp_congestion_control) 3546 bpf_module_put(net->ipv4.tcp_congestion_control, 3547 net->ipv4.tcp_congestion_control->owner); 3548 } 3549 3550 static void __net_init tcp_set_hashinfo(struct net *net) 3551 { 3552 struct inet_hashinfo *hinfo; 3553 unsigned int ehash_entries; 3554 struct net *old_net; 3555 3556 if (net_eq(net, &init_net)) 3557 goto fallback; 3558 3559 old_net = current->nsproxy->net_ns; 3560 ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries); 3561 if (!ehash_entries) 3562 goto fallback; 3563 3564 ehash_entries = roundup_pow_of_two(ehash_entries); 3565 hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries); 3566 if (!hinfo) { 3567 pr_warn("Failed to allocate TCP ehash (entries: %u) " 3568 "for a netns, fallback to the global one\n", 3569 ehash_entries); 3570 fallback: 3571 hinfo = &tcp_hashinfo; 3572 ehash_entries = tcp_hashinfo.ehash_mask + 1; 3573 } 3574 3575 net->ipv4.tcp_death_row.hashinfo = hinfo; 3576 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2; 3577 net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128); 3578 } 3579 3580 static int __net_init tcp_sk_init(struct net *net) 3581 { 3582 net->ipv4.sysctl_tcp_ecn = 2; 3583 net->ipv4.sysctl_tcp_ecn_fallback = 1; 3584 3585 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; 3586 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS; 3587 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; 3588 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL; 3589 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS; 3590 3591 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME; 3592 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES; 3593 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL; 3594 3595 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES; 3596 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES; 3597 net->ipv4.sysctl_tcp_syncookies = 1; 3598 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH; 3599 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1; 3600 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2; 3601 net->ipv4.sysctl_tcp_orphan_retries = 0; 3602 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; 3603 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; 3604 net->ipv4.sysctl_tcp_tw_reuse = 2; 3605 net->ipv4.sysctl_tcp_tw_reuse_delay = 1 * MSEC_PER_SEC; 3606 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1; 3607 3608 refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1); 3609 tcp_set_hashinfo(net); 3610 3611 net->ipv4.sysctl_tcp_sack = 1; 3612 net->ipv4.sysctl_tcp_window_scaling = 1; 3613 net->ipv4.sysctl_tcp_timestamps = 1; 3614 net->ipv4.sysctl_tcp_early_retrans = 3; 3615 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION; 3616 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */ 3617 net->ipv4.sysctl_tcp_retrans_collapse = 1; 3618 net->ipv4.sysctl_tcp_max_reordering = 300; 3619 net->ipv4.sysctl_tcp_dsack = 1; 3620 net->ipv4.sysctl_tcp_app_win = 31; 3621 net->ipv4.sysctl_tcp_adv_win_scale = 1; 3622 net->ipv4.sysctl_tcp_frto = 2; 3623 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1; 3624 /* This limits the percentage of the congestion window which we 3625 * will allow a single TSO frame to consume. Building TSO frames 3626 * which are too large can cause TCP streams to be bursty. 
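 * With the default divisor of 3 set below, a single TSO frame is
 * allowed to cover roughly one third of the current congestion window.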
3627 */ 3628 net->ipv4.sysctl_tcp_tso_win_divisor = 3; 3629 /* Default TSQ limit of 4 MB */ 3630 net->ipv4.sysctl_tcp_limit_output_bytes = 4 << 20; 3631 3632 /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */ 3633 net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX; 3634 3635 net->ipv4.sysctl_tcp_min_tso_segs = 2; 3636 net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */ 3637 net->ipv4.sysctl_tcp_min_rtt_wlen = 300; 3638 net->ipv4.sysctl_tcp_autocorking = 1; 3639 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; 3640 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; 3641 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120; 3642 if (net != &init_net) { 3643 memcpy(net->ipv4.sysctl_tcp_rmem, 3644 init_net.ipv4.sysctl_tcp_rmem, 3645 sizeof(init_net.ipv4.sysctl_tcp_rmem)); 3646 memcpy(net->ipv4.sysctl_tcp_wmem, 3647 init_net.ipv4.sysctl_tcp_wmem, 3648 sizeof(init_net.ipv4.sysctl_tcp_wmem)); 3649 } 3650 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC; 3651 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC; 3652 net->ipv4.sysctl_tcp_comp_sack_nr = 44; 3653 net->ipv4.sysctl_tcp_backlog_ack_defer = 1; 3654 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; 3655 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0; 3656 atomic_set(&net->ipv4.tfo_active_disable_times, 0); 3657 3658 /* Set default values for PLB */ 3659 net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */ 3660 net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3; 3661 net->ipv4.sysctl_tcp_plb_rehash_rounds = 12; 3662 net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60; 3663 /* Default congestion threshold for PLB to mark a round is 50% */ 3664 net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2; 3665 3666 /* Reno is always built in */ 3667 if (!net_eq(net, &init_net) && 3668 bpf_try_module_get(init_net.ipv4.tcp_congestion_control, 3669 init_net.ipv4.tcp_congestion_control->owner)) 3670 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control; 3671 else 3672 net->ipv4.tcp_congestion_control = &tcp_reno; 3673 3674 net->ipv4.sysctl_tcp_syn_linear_timeouts = 4; 3675 net->ipv4.sysctl_tcp_shrink_window = 0; 3676 3677 net->ipv4.sysctl_tcp_pingpong_thresh = 1; 3678 net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN); 3679 net->ipv4.sysctl_tcp_rto_max_ms = TCP_RTO_MAX_SEC * MSEC_PER_SEC; 3680 3681 return 0; 3682 } 3683 3684 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) 3685 { 3686 struct net *net; 3687 3688 /* make sure concurrent calls to tcp_sk_exit_batch from net_cleanup_work 3689 * and failed setup_net error unwinding path are serialized. 3690 * 3691 * tcp_twsk_purge() handles twsk in any dead netns, not just those in 3692 * net_exit_list, the thread that dismantles a particular twsk must 3693 * do so without other thread progressing to refcount_dec_and_test() of 3694 * tcp_death_row.tw_refcount. 
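 * The mutex below provides that serialization: it keeps tcp_twsk_purge()
 * and the final tw_refcount drop in the loop from racing with another
 * exit batch.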
3695 */ 3696 mutex_lock(&tcp_exit_batch_mutex); 3697 3698 tcp_twsk_purge(net_exit_list); 3699 3700 list_for_each_entry(net, net_exit_list, exit_list) { 3701 inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo); 3702 WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount)); 3703 tcp_fastopen_ctx_destroy(net); 3704 } 3705 3706 mutex_unlock(&tcp_exit_batch_mutex); 3707 } 3708 3709 static struct pernet_operations __net_initdata tcp_sk_ops = { 3710 .init = tcp_sk_init, 3711 .exit = tcp_sk_exit, 3712 .exit_batch = tcp_sk_exit_batch, 3713 }; 3714 3715 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3716 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta, 3717 struct sock_common *sk_common, uid_t uid) 3718 3719 #define INIT_BATCH_SZ 16 3720 3721 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux) 3722 { 3723 struct bpf_tcp_iter_state *iter = priv_data; 3724 int err; 3725 3726 err = bpf_iter_init_seq_net(priv_data, aux); 3727 if (err) 3728 return err; 3729 3730 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ, GFP_USER); 3731 if (err) { 3732 bpf_iter_fini_seq_net(priv_data); 3733 return err; 3734 } 3735 3736 return 0; 3737 } 3738 3739 static void bpf_iter_fini_tcp(void *priv_data) 3740 { 3741 struct bpf_tcp_iter_state *iter = priv_data; 3742 3743 bpf_iter_fini_seq_net(priv_data); 3744 kvfree(iter->batch); 3745 } 3746 3747 static const struct bpf_iter_seq_info tcp_seq_info = { 3748 .seq_ops = &bpf_iter_tcp_seq_ops, 3749 .init_seq_private = bpf_iter_init_tcp, 3750 .fini_seq_private = bpf_iter_fini_tcp, 3751 .seq_priv_size = sizeof(struct bpf_tcp_iter_state), 3752 }; 3753 3754 static const struct bpf_func_proto * 3755 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id, 3756 const struct bpf_prog *prog) 3757 { 3758 switch (func_id) { 3759 case BPF_FUNC_setsockopt: 3760 return &bpf_sk_setsockopt_proto; 3761 case BPF_FUNC_getsockopt: 3762 return &bpf_sk_getsockopt_proto; 3763 default: 3764 return NULL; 3765 } 3766 } 3767 3768 static struct bpf_iter_reg tcp_reg_info = { 3769 .target = "tcp", 3770 .ctx_arg_info_size = 1, 3771 .ctx_arg_info = { 3772 { offsetof(struct bpf_iter__tcp, sk_common), 3773 PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED }, 3774 }, 3775 .get_func_proto = bpf_iter_tcp_get_func_proto, 3776 .seq_info = &tcp_seq_info, 3777 }; 3778 3779 static void __init bpf_iter_register(void) 3780 { 3781 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON]; 3782 if (bpf_iter_reg_target(&tcp_reg_info)) 3783 pr_warn("Warning: could not register bpf iterator tcp\n"); 3784 } 3785 3786 #endif 3787 3788 void __init tcp_v4_init(void) 3789 { 3790 int cpu, res; 3791 3792 for_each_possible_cpu(cpu) { 3793 struct sock *sk; 3794 3795 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW, 3796 IPPROTO_TCP, &init_net); 3797 if (res) 3798 panic("Failed to create the TCP control socket.\n"); 3799 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); 3800 3801 /* Please enforce IP_DF and IPID==0 for RST and 3802 * ACK sent in SYN-RECV and TIME-WAIT state. 3803 */ 3804 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO; 3805 3806 sk->sk_clockid = CLOCK_MONOTONIC; 3807 3808 per_cpu(ipv4_tcp_sk.sock, cpu) = sk; 3809 } 3810 if (register_pernet_subsys(&tcp_sk_ops)) 3811 panic("Failed to create the TCP control socket.\n"); 3812 3813 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3814 bpf_iter_register(); 3815 #endif 3816 } 3817