1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * INET An implementation of the TCP/IP protocol suite for the LINUX 4 * operating system. INET is implemented using the BSD Socket 5 * interface as the means of communication with the user level. 6 * 7 * Implementation of the Transmission Control Protocol(TCP). 8 * 9 * IPv4 specific functions 10 * 11 * code split from: 12 * linux/ipv4/tcp.c 13 * linux/ipv4/tcp_input.c 14 * linux/ipv4/tcp_output.c 15 * 16 * See tcp.c for author information 17 */ 18 19 /* 20 * Changes: 21 * David S. Miller : New socket lookup architecture. 22 * This code is dedicated to John Dyson. 23 * David S. Miller : Change semantics of established hash, 24 * half is devoted to TIME_WAIT sockets 25 * and the rest go in the other half. 26 * Andi Kleen : Add support for syncookies and fixed 27 * some bugs: ip options weren't passed to 28 * the TCP layer, missed a check for an 29 * ACK bit. 30 * Andi Kleen : Implemented fast path mtu discovery. 31 * Fixed many serious bugs in the 32 * request_sock handling and moved 33 * most of it into the af independent code. 34 * Added tail drop and some other bugfixes. 35 * Added new listen semantics. 36 * Mike McLagan : Routing by source 37 * Juan Jose Ciarlante: ip_dynaddr bits 38 * Andi Kleen: various fixes. 39 * Vitaly E. Lavrov : Transparent proxy revived after year 40 * coma. 41 * Andi Kleen : Fix new listen. 42 * Andi Kleen : Fix accept error reporting. 43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which 44 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind 45 * a single port at the same time. 46 */ 47 48 #define pr_fmt(fmt) "TCP: " fmt 49 50 #include <linux/bottom_half.h> 51 #include <linux/types.h> 52 #include <linux/fcntl.h> 53 #include <linux/module.h> 54 #include <linux/random.h> 55 #include <linux/cache.h> 56 #include <linux/jhash.h> 57 #include <linux/init.h> 58 #include <linux/times.h> 59 #include <linux/slab.h> 60 #include <linux/sched.h> 61 62 #include <net/aligned_data.h> 63 #include <net/net_namespace.h> 64 #include <net/icmp.h> 65 #include <net/inet_hashtables.h> 66 #include <net/tcp.h> 67 #include <net/transp_v6.h> 68 #include <net/ipv6.h> 69 #include <net/inet_common.h> 70 #include <net/inet_ecn.h> 71 #include <net/timewait_sock.h> 72 #include <net/xfrm.h> 73 #include <net/secure_seq.h> 74 #include <net/busy_poll.h> 75 #include <net/rstreason.h> 76 77 #include <linux/inet.h> 78 #include <linux/ipv6.h> 79 #include <linux/stddef.h> 80 #include <linux/proc_fs.h> 81 #include <linux/seq_file.h> 82 #include <linux/inetdevice.h> 83 #include <linux/btf_ids.h> 84 #include <linux/skbuff_ref.h> 85 86 #include <crypto/hash.h> 87 #include <linux/scatterlist.h> 88 89 #include <trace/events/tcp.h> 90 91 #ifdef CONFIG_TCP_MD5SIG 92 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, 93 __be32 daddr, __be32 saddr, const struct tcphdr *th); 94 #endif 95 96 struct inet_hashinfo tcp_hashinfo; 97 98 static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = { 99 .bh_lock = INIT_LOCAL_LOCK(bh_lock), 100 }; 101 102 static DEFINE_MUTEX(tcp_exit_batch_mutex); 103 104 static u32 tcp_v4_init_seq(const struct sk_buff *skb) 105 { 106 return secure_tcp_seq(ip_hdr(skb)->daddr, 107 ip_hdr(skb)->saddr, 108 tcp_hdr(skb)->dest, 109 tcp_hdr(skb)->source); 110 } 111 112 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb) 113 { 114 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr); 115 } 116 117 int tcp_twsk_unique(struct sock *sk, struct 
sock *sktw, void *twp) 118 { 119 int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse); 120 const struct inet_timewait_sock *tw = inet_twsk(sktw); 121 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw); 122 struct tcp_sock *tp = tcp_sk(sk); 123 int ts_recent_stamp; 124 u32 reuse_thresh; 125 126 if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2) 127 reuse = 0; 128 129 if (reuse == 2) { 130 /* Still does not detect *everything* that goes through 131 * lo, since we require a loopback src or dst address 132 * or direct binding to 'lo' interface. 133 */ 134 bool loopback = false; 135 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX) 136 loopback = true; 137 #if IS_ENABLED(CONFIG_IPV6) 138 if (tw->tw_family == AF_INET6) { 139 if (ipv6_addr_loopback(&tw->tw_v6_daddr) || 140 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) || 141 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) || 142 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr)) 143 loopback = true; 144 } else 145 #endif 146 { 147 if (ipv4_is_loopback(tw->tw_daddr) || 148 ipv4_is_loopback(tw->tw_rcv_saddr)) 149 loopback = true; 150 } 151 if (!loopback) 152 reuse = 0; 153 } 154 155 /* With PAWS, it is safe from the viewpoint 156 of data integrity. Even without PAWS it is safe provided sequence 157 spaces do not overlap i.e. at data rates <= 80Mbit/sec. 158 159 Actually, the idea is close to VJ's one, only timestamp cache is 160 held not per host, but per port pair and TW bucket is used as state 161 holder. 162 163 If TW bucket has been already destroyed we fall back to VJ's scheme 164 and use initial timestamp retrieved from peer table. 165 */ 166 ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp); 167 reuse_thresh = READ_ONCE(tw->tw_entry_stamp) + 168 READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse_delay); 169 if (ts_recent_stamp && 170 (!twp || (reuse && time_after32(tcp_clock_ms(), reuse_thresh)))) { 171 /* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk 172 * and releasing the bucket lock. 173 */ 174 if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt))) 175 return 0; 176 177 /* In case of repair and re-using TIME-WAIT sockets we still 178 * want to be sure that it is safe as above but honor the 179 * sequence numbers and time stamps set as part of the repair 180 * process. 181 * 182 * Without this check re-using a TIME-WAIT socket with TCP 183 * repair would accumulate a -1 on the repair assigned 184 * sequence number. The first time it is reused the sequence 185 * is -1, the second time -2, etc. This fixes that issue 186 * without appearing to create any others. 187 */ 188 if (likely(!tp->repair)) { 189 u32 seq = tcptw->tw_snd_nxt + 65535 + 2; 190 191 if (!seq) 192 seq = 1; 193 WRITE_ONCE(tp->write_seq, seq); 194 tp->rx_opt.ts_recent = READ_ONCE(tcptw->tw_ts_recent); 195 tp->rx_opt.ts_recent_stamp = ts_recent_stamp; 196 } 197 198 return 1; 199 } 200 201 return 0; 202 } 203 EXPORT_IPV6_MOD_GPL(tcp_twsk_unique); 204 205 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr, 206 int addr_len) 207 { 208 /* This check is replicated from tcp_v4_connect() and intended to 209 * prevent BPF program called below from accessing bytes that are out 210 * of the bound specified by user in addr_len. 211 */ 212 if (addr_len < sizeof(struct sockaddr_in)) 213 return -EINVAL; 214 215 sock_owned_by_me(sk); 216 217 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len); 218 } 219 220 /* This will initiate an outgoing connection. 
*/ 221 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) 222 { 223 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; 224 struct inet_timewait_death_row *tcp_death_row; 225 struct inet_sock *inet = inet_sk(sk); 226 struct tcp_sock *tp = tcp_sk(sk); 227 struct ip_options_rcu *inet_opt; 228 struct net *net = sock_net(sk); 229 __be16 orig_sport, orig_dport; 230 __be32 daddr, nexthop; 231 struct flowi4 *fl4; 232 struct rtable *rt; 233 int err; 234 235 if (addr_len < sizeof(struct sockaddr_in)) 236 return -EINVAL; 237 238 if (usin->sin_family != AF_INET) 239 return -EAFNOSUPPORT; 240 241 nexthop = daddr = usin->sin_addr.s_addr; 242 inet_opt = rcu_dereference_protected(inet->inet_opt, 243 lockdep_sock_is_held(sk)); 244 if (inet_opt && inet_opt->opt.srr) { 245 if (!daddr) 246 return -EINVAL; 247 nexthop = inet_opt->opt.faddr; 248 } 249 250 orig_sport = inet->inet_sport; 251 orig_dport = usin->sin_port; 252 fl4 = &inet->cork.fl.u.ip4; 253 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr, 254 sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport, 255 orig_dport, sk); 256 if (IS_ERR(rt)) { 257 err = PTR_ERR(rt); 258 if (err == -ENETUNREACH) 259 IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); 260 return err; 261 } 262 263 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { 264 ip_rt_put(rt); 265 return -ENETUNREACH; 266 } 267 268 if (!inet_opt || !inet_opt->opt.srr) 269 daddr = fl4->daddr; 270 271 tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; 272 273 if (!inet->inet_saddr) { 274 err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET); 275 if (err) { 276 ip_rt_put(rt); 277 return err; 278 } 279 } else { 280 sk_rcv_saddr_set(sk, inet->inet_saddr); 281 } 282 283 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) { 284 /* Reset inherited state */ 285 tp->rx_opt.ts_recent = 0; 286 tp->rx_opt.ts_recent_stamp = 0; 287 if (likely(!tp->repair)) 288 WRITE_ONCE(tp->write_seq, 0); 289 } 290 291 inet->inet_dport = usin->sin_port; 292 sk_daddr_set(sk, daddr); 293 294 inet_csk(sk)->icsk_ext_hdr_len = 0; 295 if (inet_opt) 296 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 297 298 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT; 299 300 /* Socket identity is still unknown (sport may be zero). 301 * However we set state to SYN-SENT and not releasing socket 302 * lock select source port, enter ourselves into the hash tables and 303 * complete initialization after this. 304 */ 305 tcp_set_state(sk, TCP_SYN_SENT); 306 err = inet_hash_connect(tcp_death_row, sk); 307 if (err) 308 goto failure; 309 310 sk_set_txhash(sk); 311 312 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport, 313 inet->inet_sport, inet->inet_dport, sk); 314 if (IS_ERR(rt)) { 315 err = PTR_ERR(rt); 316 rt = NULL; 317 goto failure; 318 } 319 tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst); 320 /* OK, now commit destination to socket. 
*/ 321 sk->sk_gso_type = SKB_GSO_TCPV4; 322 sk_setup_caps(sk, &rt->dst); 323 rt = NULL; 324 325 if (likely(!tp->repair)) { 326 if (!tp->write_seq) 327 WRITE_ONCE(tp->write_seq, 328 secure_tcp_seq(inet->inet_saddr, 329 inet->inet_daddr, 330 inet->inet_sport, 331 usin->sin_port)); 332 WRITE_ONCE(tp->tsoffset, 333 secure_tcp_ts_off(net, inet->inet_saddr, 334 inet->inet_daddr)); 335 } 336 337 atomic_set(&inet->inet_id, get_random_u16()); 338 339 if (tcp_fastopen_defer_connect(sk, &err)) 340 return err; 341 if (err) 342 goto failure; 343 344 err = tcp_connect(sk); 345 346 if (err) 347 goto failure; 348 349 return 0; 350 351 failure: 352 /* 353 * This unhashes the socket and releases the local port, 354 * if necessary. 355 */ 356 tcp_set_state(sk, TCP_CLOSE); 357 inet_bhash2_reset_saddr(sk); 358 ip_rt_put(rt); 359 sk->sk_route_caps = 0; 360 inet->inet_dport = 0; 361 return err; 362 } 363 EXPORT_IPV6_MOD(tcp_v4_connect); 364 365 /* 366 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191. 367 * It can be called through tcp_release_cb() if socket was owned by user 368 * at the time tcp_v4_err() was called to handle ICMP message. 369 */ 370 void tcp_v4_mtu_reduced(struct sock *sk) 371 { 372 struct inet_sock *inet = inet_sk(sk); 373 struct dst_entry *dst; 374 u32 mtu; 375 376 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) 377 return; 378 mtu = READ_ONCE(tcp_sk(sk)->mtu_info); 379 dst = inet_csk_update_pmtu(sk, mtu); 380 if (!dst) 381 return; 382 383 /* Something is about to be wrong... Remember soft error 384 * for the case, if this connection will not able to recover. 385 */ 386 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst)) 387 WRITE_ONCE(sk->sk_err_soft, EMSGSIZE); 388 389 mtu = dst_mtu(dst); 390 391 if (inet->pmtudisc != IP_PMTUDISC_DONT && 392 ip_sk_accept_pmtu(sk) && 393 inet_csk(sk)->icsk_pmtu_cookie > mtu) { 394 tcp_sync_mss(sk, mtu); 395 396 /* Resend the TCP packet because it's 397 * clear that the old packet has been 398 * dropped. This is the new "fast" path mtu 399 * discovery. 400 */ 401 tcp_simple_retransmit(sk); 402 } /* else let the usual retransmit timer handle it */ 403 } 404 EXPORT_IPV6_MOD(tcp_v4_mtu_reduced); 405 406 static void do_redirect(struct sk_buff *skb, struct sock *sk) 407 { 408 struct dst_entry *dst = __sk_dst_check(sk, 0); 409 410 if (dst) 411 dst->ops->redirect(dst, sk, skb); 412 } 413 414 415 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */ 416 void tcp_req_err(struct sock *sk, u32 seq, bool abort) 417 { 418 struct request_sock *req = inet_reqsk(sk); 419 struct net *net = sock_net(sk); 420 421 /* ICMPs are not backlogged, hence we cannot get 422 * an established socket here. 423 */ 424 if (seq != tcp_rsk(req)->snt_isn) { 425 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); 426 } else if (abort) { 427 /* 428 * Still in SYN_RECV, just remove it silently. 429 * There is no good way to pass the error to the newly 430 * created socket, and POSIX does not want network 431 * errors returned from accept(). 
432 */ 433 inet_csk_reqsk_queue_drop(req->rsk_listener, req); 434 tcp_listendrop(req->rsk_listener); 435 } 436 reqsk_put(req); 437 } 438 EXPORT_IPV6_MOD(tcp_req_err); 439 440 /* TCP-LD (RFC 6069) logic */ 441 void tcp_ld_RTO_revert(struct sock *sk, u32 seq) 442 { 443 struct inet_connection_sock *icsk = inet_csk(sk); 444 struct tcp_sock *tp = tcp_sk(sk); 445 struct sk_buff *skb; 446 s32 remaining; 447 u32 delta_us; 448 449 if (sock_owned_by_user(sk)) 450 return; 451 452 if (seq != tp->snd_una || !icsk->icsk_retransmits || 453 !icsk->icsk_backoff) 454 return; 455 456 skb = tcp_rtx_queue_head(sk); 457 if (WARN_ON_ONCE(!skb)) 458 return; 459 460 icsk->icsk_backoff--; 461 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT; 462 icsk->icsk_rto = inet_csk_rto_backoff(icsk, tcp_rto_max(sk)); 463 464 tcp_mstamp_refresh(tp); 465 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb)); 466 remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us); 467 468 if (remaining > 0) { 469 tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, remaining, false); 470 } else { 471 /* RTO revert clocked out retransmission. 472 * Will retransmit now. 473 */ 474 tcp_retransmit_timer(sk); 475 } 476 } 477 EXPORT_IPV6_MOD(tcp_ld_RTO_revert); 478 479 /* 480 * This routine is called by the ICMP module when it gets some 481 * sort of error condition. If err < 0 then the socket should 482 * be closed and the error returned to the user. If err > 0 483 * it's just the icmp type << 8 | icmp code. After adjustment 484 * header points to the first 8 bytes of the tcp header. We need 485 * to find the appropriate port. 486 * 487 * The locking strategy used here is very "optimistic". When 488 * someone else accesses the socket the ICMP is just dropped 489 * and for some paths there is no check at all. 490 * A more general error queue to queue errors for later handling 491 * is probably better. 492 * 493 */ 494 495 int tcp_v4_err(struct sk_buff *skb, u32 info) 496 { 497 const struct iphdr *iph = (const struct iphdr *)skb->data; 498 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2)); 499 struct net *net = dev_net_rcu(skb->dev); 500 const int type = icmp_hdr(skb)->type; 501 const int code = icmp_hdr(skb)->code; 502 struct request_sock *fastopen; 503 struct tcp_sock *tp; 504 u32 seq, snd_una; 505 struct sock *sk; 506 int err; 507 508 sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, 509 iph->daddr, th->dest, iph->saddr, 510 ntohs(th->source), inet_iif(skb), 0); 511 if (!sk) { 512 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); 513 return -ENOENT; 514 } 515 if (sk->sk_state == TCP_TIME_WAIT) { 516 /* To increase the counter of ignored icmps for TCP-AO */ 517 tcp_ao_ignore_icmp(sk, AF_INET, type, code); 518 inet_twsk_put(inet_twsk(sk)); 519 return 0; 520 } 521 seq = ntohl(th->seq); 522 if (sk->sk_state == TCP_NEW_SYN_RECV) { 523 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB || 524 type == ICMP_TIME_EXCEEDED || 525 (type == ICMP_DEST_UNREACH && 526 (code == ICMP_NET_UNREACH || 527 code == ICMP_HOST_UNREACH))); 528 return 0; 529 } 530 531 if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) { 532 sock_put(sk); 533 return 0; 534 } 535 536 bh_lock_sock(sk); 537 /* If too many ICMPs get dropped on busy 538 * servers this needs to be solved differently. 539 * We do take care of PMTU discovery (RFC1191) special case : 540 * we can receive locally generated ICMP messages while socket is held. 
541 */ 542 if (sock_owned_by_user(sk)) { 543 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)) 544 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS); 545 } 546 if (sk->sk_state == TCP_CLOSE) 547 goto out; 548 549 if (static_branch_unlikely(&ip4_min_ttl)) { 550 /* min_ttl can be changed concurrently from do_ip_setsockopt() */ 551 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) { 552 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 553 goto out; 554 } 555 } 556 557 tp = tcp_sk(sk); 558 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ 559 fastopen = rcu_dereference(tp->fastopen_rsk); 560 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una; 561 if (sk->sk_state != TCP_LISTEN && 562 !between(seq, snd_una, tp->snd_nxt)) { 563 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); 564 goto out; 565 } 566 567 switch (type) { 568 case ICMP_REDIRECT: 569 if (!sock_owned_by_user(sk)) 570 do_redirect(skb, sk); 571 goto out; 572 case ICMP_SOURCE_QUENCH: 573 /* Just silently ignore these. */ 574 goto out; 575 case ICMP_PARAMETERPROB: 576 err = EPROTO; 577 break; 578 case ICMP_DEST_UNREACH: 579 if (code > NR_ICMP_UNREACH) 580 goto out; 581 582 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ 583 /* We are not interested in TCP_LISTEN and open_requests 584 * (SYN-ACKs send out by Linux are always <576bytes so 585 * they should go through unfragmented). 586 */ 587 if (sk->sk_state == TCP_LISTEN) 588 goto out; 589 590 WRITE_ONCE(tp->mtu_info, info); 591 if (!sock_owned_by_user(sk)) { 592 tcp_v4_mtu_reduced(sk); 593 } else { 594 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags)) 595 sock_hold(sk); 596 } 597 goto out; 598 } 599 600 err = icmp_err_convert[code].errno; 601 /* check if this ICMP message allows revert of backoff. 602 * (see RFC 6069) 603 */ 604 if (!fastopen && 605 (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH)) 606 tcp_ld_RTO_revert(sk, seq); 607 break; 608 case ICMP_TIME_EXCEEDED: 609 err = EHOSTUNREACH; 610 break; 611 default: 612 goto out; 613 } 614 615 switch (sk->sk_state) { 616 case TCP_SYN_SENT: 617 case TCP_SYN_RECV: 618 /* Only in fast or simultaneous open. If a fast open socket is 619 * already accepted it is treated as a connected one below. 620 */ 621 if (fastopen && !fastopen->sk) 622 break; 623 624 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th); 625 626 if (!sock_owned_by_user(sk)) 627 tcp_done_with_error(sk, err); 628 else 629 WRITE_ONCE(sk->sk_err_soft, err); 630 goto out; 631 } 632 633 /* If we've already connected we will keep trying 634 * until we time out, or the user gives up. 635 * 636 * rfc1122 4.2.3.9 allows to consider as hard errors 637 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too, 638 * but it is obsoleted by pmtu discovery). 639 * 640 * Note, that in modern internet, where routing is unreliable 641 * and in each dark corner broken firewalls sit, sending random 642 * errors ordered by their masters even this two messages finally lose 643 * their original sense (even Linux sends invalid PORT_UNREACHs) 644 * 645 * Now we are in compliance with RFCs. 
646 * --ANK (980905) 647 */ 648 649 if (!sock_owned_by_user(sk) && 650 inet_test_bit(RECVERR, sk)) { 651 WRITE_ONCE(sk->sk_err, err); 652 sk_error_report(sk); 653 } else { /* Only an error on timeout */ 654 WRITE_ONCE(sk->sk_err_soft, err); 655 } 656 657 out: 658 bh_unlock_sock(sk); 659 sock_put(sk); 660 return 0; 661 } 662 663 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr) 664 { 665 struct tcphdr *th = tcp_hdr(skb); 666 667 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0); 668 skb->csum_start = skb_transport_header(skb) - skb->head; 669 skb->csum_offset = offsetof(struct tcphdr, check); 670 } 671 672 /* This routine computes an IPv4 TCP checksum. */ 673 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb) 674 { 675 const struct inet_sock *inet = inet_sk(sk); 676 677 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr); 678 } 679 EXPORT_IPV6_MOD(tcp_v4_send_check); 680 681 #define REPLY_OPTIONS_LEN (MAX_TCP_OPTION_SPACE / sizeof(__be32)) 682 683 static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb, 684 const struct tcp_ao_hdr *aoh, 685 struct ip_reply_arg *arg, struct tcphdr *reply, 686 __be32 reply_options[REPLY_OPTIONS_LEN]) 687 { 688 #ifdef CONFIG_TCP_AO 689 int sdif = tcp_v4_sdif(skb); 690 int dif = inet_iif(skb); 691 int l3index = sdif ? dif : 0; 692 bool allocated_traffic_key; 693 struct tcp_ao_key *key; 694 char *traffic_key; 695 bool drop = true; 696 u32 ao_sne = 0; 697 u8 keyid; 698 699 rcu_read_lock(); 700 if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq), 701 &key, &traffic_key, &allocated_traffic_key, 702 &keyid, &ao_sne)) 703 goto out; 704 705 reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) | 706 (aoh->rnext_keyid << 8) | keyid); 707 arg->iov[0].iov_len += tcp_ao_len_aligned(key); 708 reply->doff = arg->iov[0].iov_len / 4; 709 710 if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1], 711 key, traffic_key, 712 (union tcp_ao_addr *)&ip_hdr(skb)->saddr, 713 (union tcp_ao_addr *)&ip_hdr(skb)->daddr, 714 reply, ao_sne)) 715 goto out; 716 drop = false; 717 out: 718 rcu_read_unlock(); 719 if (allocated_traffic_key) 720 kfree(traffic_key); 721 return drop; 722 #else 723 return true; 724 #endif 725 } 726 727 /* 728 * This routine will send an RST to the other tcp. 729 * 730 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.) 731 * for reset. 732 * Answer: if a packet caused RST, it is not for a socket 733 * existing in our system, if it is matched to a socket, 734 * it is just duplicate segment or bug in other side's TCP. 735 * So that we build reply only basing on parameters 736 * arrived with segment. 737 * Exception: precedence violation. We do not implement it in any case. 738 */ 739 740 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb, 741 enum sk_rst_reason reason) 742 { 743 const struct tcphdr *th = tcp_hdr(skb); 744 struct { 745 struct tcphdr th; 746 __be32 opt[REPLY_OPTIONS_LEN]; 747 } rep; 748 const __u8 *md5_hash_location = NULL; 749 const struct tcp_ao_hdr *aoh; 750 struct ip_reply_arg arg; 751 #ifdef CONFIG_TCP_MD5SIG 752 struct tcp_md5sig_key *key = NULL; 753 unsigned char newhash[16]; 754 struct sock *sk1 = NULL; 755 int genhash; 756 #endif 757 u64 transmit_time = 0; 758 struct sock *ctl_sk; 759 struct net *net; 760 u32 txhash = 0; 761 762 /* Never send a reset in response to a reset. */ 763 if (th->rst) 764 return; 765 766 /* If sk not NULL, it means we did a successful lookup and incoming 767 * route had to be correct. 
prequeue might have dropped our dst. 768 */ 769 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL) 770 return; 771 772 /* Swap the send and the receive. */ 773 memset(&rep, 0, sizeof(rep)); 774 rep.th.dest = th->source; 775 rep.th.source = th->dest; 776 rep.th.doff = sizeof(struct tcphdr) / 4; 777 rep.th.rst = 1; 778 779 if (th->ack) { 780 rep.th.seq = th->ack_seq; 781 } else { 782 rep.th.ack = 1; 783 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin + 784 skb->len - (th->doff << 2)); 785 } 786 787 memset(&arg, 0, sizeof(arg)); 788 arg.iov[0].iov_base = (unsigned char *)&rep; 789 arg.iov[0].iov_len = sizeof(rep.th); 790 791 net = sk ? sock_net(sk) : skb_dst_dev_net_rcu(skb); 792 793 /* Invalid TCP option size or twice included auth */ 794 if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh)) 795 return; 796 797 if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt)) 798 return; 799 800 #ifdef CONFIG_TCP_MD5SIG 801 rcu_read_lock(); 802 if (sk && sk_fullsock(sk)) { 803 const union tcp_md5_addr *addr; 804 int l3index; 805 806 /* sdif set, means packet ingressed via a device 807 * in an L3 domain and inet_iif is set to it. 808 */ 809 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; 810 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 811 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 812 } else if (md5_hash_location) { 813 const union tcp_md5_addr *addr; 814 int sdif = tcp_v4_sdif(skb); 815 int dif = inet_iif(skb); 816 int l3index; 817 818 /* 819 * active side is lost. Try to find listening socket through 820 * source port, and then find md5 key through listening socket. 821 * we are not loose security here: 822 * Incoming packet is checked with md5 hash with finding key, 823 * no RST generated if md5 hash doesn't match. 824 */ 825 sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo, 826 NULL, 0, ip_hdr(skb)->saddr, 827 th->source, ip_hdr(skb)->daddr, 828 ntohs(th->source), dif, sdif); 829 /* don't send rst if it can't find key */ 830 if (!sk1) 831 goto out; 832 833 /* sdif set, means packet ingressed via a device 834 * in an L3 domain and dif is set to it. 835 */ 836 l3index = sdif ? dif : 0; 837 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 838 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET); 839 if (!key) 840 goto out; 841 842 843 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb); 844 if (genhash || memcmp(md5_hash_location, newhash, 16) != 0) 845 goto out; 846 847 } 848 849 if (key) { 850 rep.opt[0] = htonl((TCPOPT_NOP << 24) | 851 (TCPOPT_NOP << 16) | 852 (TCPOPT_MD5SIG << 8) | 853 TCPOLEN_MD5SIG); 854 /* Update length and the length the header thinks exists */ 855 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; 856 rep.th.doff = arg.iov[0].iov_len / 4; 857 858 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1], 859 key, ip_hdr(skb)->saddr, 860 ip_hdr(skb)->daddr, &rep.th); 861 } 862 #endif 863 /* Can't co-exist with TCPMD5, hence check rep.opt[0] */ 864 if (rep.opt[0] == 0) { 865 __be32 mrst = mptcp_reset_option(skb); 866 867 if (mrst) { 868 rep.opt[0] = mrst; 869 arg.iov[0].iov_len += sizeof(mrst); 870 rep.th.doff = arg.iov[0].iov_len / 4; 871 } 872 } 873 874 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 875 ip_hdr(skb)->saddr, /* XXX */ 876 arg.iov[0].iov_len, IPPROTO_TCP, 0); 877 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 878 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0; 879 880 /* When socket is gone, all binding information is lost. 881 * routing might fail in this case. 
No choice here, if we choose to force 882 * input interface, we will misroute in case of asymmetric route. 883 */ 884 if (sk) 885 arg.bound_dev_if = sk->sk_bound_dev_if; 886 887 trace_tcp_send_reset(sk, skb, reason); 888 889 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) != 890 offsetof(struct inet_timewait_sock, tw_bound_dev_if)); 891 892 /* ECN bits of TW reset are cleared */ 893 arg.tos = ip_hdr(skb)->tos & ~INET_ECN_MASK; 894 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); 895 local_bh_disable(); 896 local_lock_nested_bh(&ipv4_tcp_sk.bh_lock); 897 ctl_sk = this_cpu_read(ipv4_tcp_sk.sock); 898 899 sock_net_set(ctl_sk, net); 900 if (sk) { 901 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 902 inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark); 903 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? 904 inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority); 905 transmit_time = tcp_transmit_time(sk); 906 xfrm_sk_clone_policy(ctl_sk, sk); 907 txhash = (sk->sk_state == TCP_TIME_WAIT) ? 908 inet_twsk(sk)->tw_txhash : sk->sk_txhash; 909 } else { 910 ctl_sk->sk_mark = 0; 911 ctl_sk->sk_priority = 0; 912 } 913 ip_send_unicast_reply(ctl_sk, sk, 914 skb, &TCP_SKB_CB(skb)->header.h4.opt, 915 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 916 &arg, arg.iov[0].iov_len, 917 transmit_time, txhash); 918 919 xfrm_sk_free_policy(ctl_sk); 920 sock_net_set(ctl_sk, &init_net); 921 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); 922 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS); 923 local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock); 924 local_bh_enable(); 925 926 #ifdef CONFIG_TCP_MD5SIG 927 out: 928 rcu_read_unlock(); 929 #endif 930 } 931 932 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states 933 outside socket context is ugly, certainly. What can I do? 934 */ 935 936 static void tcp_v4_send_ack(const struct sock *sk, 937 struct sk_buff *skb, u32 seq, u32 ack, 938 u32 win, u32 tsval, u32 tsecr, int oif, 939 struct tcp_key *key, 940 int reply_flags, u8 tos, u32 txhash) 941 { 942 const struct tcphdr *th = tcp_hdr(skb); 943 struct { 944 struct tcphdr th; 945 __be32 opt[(MAX_TCP_OPTION_SPACE >> 2)]; 946 } rep; 947 struct net *net = sock_net(sk); 948 struct ip_reply_arg arg; 949 struct sock *ctl_sk; 950 u64 transmit_time; 951 952 memset(&rep.th, 0, sizeof(struct tcphdr)); 953 memset(&arg, 0, sizeof(arg)); 954 955 arg.iov[0].iov_base = (unsigned char *)&rep; 956 arg.iov[0].iov_len = sizeof(rep.th); 957 if (tsecr) { 958 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | 959 (TCPOPT_TIMESTAMP << 8) | 960 TCPOLEN_TIMESTAMP); 961 rep.opt[1] = htonl(tsval); 962 rep.opt[2] = htonl(tsecr); 963 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED; 964 } 965 966 /* Swap the send and the receive. */ 967 rep.th.dest = th->source; 968 rep.th.source = th->dest; 969 rep.th.doff = arg.iov[0].iov_len / 4; 970 rep.th.seq = htonl(seq); 971 rep.th.ack_seq = htonl(ack); 972 rep.th.ack = 1; 973 rep.th.window = htons(win); 974 975 #ifdef CONFIG_TCP_MD5SIG 976 if (tcp_key_is_md5(key)) { 977 int offset = (tsecr) ? 3 : 0; 978 979 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) | 980 (TCPOPT_NOP << 16) | 981 (TCPOPT_MD5SIG << 8) | 982 TCPOLEN_MD5SIG); 983 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; 984 rep.th.doff = arg.iov[0].iov_len/4; 985 986 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset], 987 key->md5_key, ip_hdr(skb)->saddr, 988 ip_hdr(skb)->daddr, &rep.th); 989 } 990 #endif 991 #ifdef CONFIG_TCP_AO 992 if (tcp_key_is_ao(key)) { 993 int offset = (tsecr) ? 
3 : 0; 994 995 rep.opt[offset++] = htonl((TCPOPT_AO << 24) | 996 (tcp_ao_len(key->ao_key) << 16) | 997 (key->ao_key->sndid << 8) | 998 key->rcv_next); 999 arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key); 1000 rep.th.doff = arg.iov[0].iov_len / 4; 1001 1002 tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset], 1003 key->ao_key, key->traffic_key, 1004 (union tcp_ao_addr *)&ip_hdr(skb)->saddr, 1005 (union tcp_ao_addr *)&ip_hdr(skb)->daddr, 1006 &rep.th, key->sne); 1007 } 1008 #endif 1009 arg.flags = reply_flags; 1010 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 1011 ip_hdr(skb)->saddr, /* XXX */ 1012 arg.iov[0].iov_len, IPPROTO_TCP, 0); 1013 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 1014 if (oif) 1015 arg.bound_dev_if = oif; 1016 arg.tos = tos; 1017 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL); 1018 local_bh_disable(); 1019 local_lock_nested_bh(&ipv4_tcp_sk.bh_lock); 1020 ctl_sk = this_cpu_read(ipv4_tcp_sk.sock); 1021 sock_net_set(ctl_sk, net); 1022 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 1023 inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark); 1024 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? 1025 inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority); 1026 transmit_time = tcp_transmit_time(sk); 1027 ip_send_unicast_reply(ctl_sk, sk, 1028 skb, &TCP_SKB_CB(skb)->header.h4.opt, 1029 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 1030 &arg, arg.iov[0].iov_len, 1031 transmit_time, txhash); 1032 1033 sock_net_set(ctl_sk, &init_net); 1034 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); 1035 local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock); 1036 local_bh_enable(); 1037 } 1038 1039 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb, 1040 enum tcp_tw_status tw_status) 1041 { 1042 struct inet_timewait_sock *tw = inet_twsk(sk); 1043 struct tcp_timewait_sock *tcptw = tcp_twsk(sk); 1044 struct tcp_key key = {}; 1045 u8 tos = tw->tw_tos; 1046 1047 /* Cleaning only ECN bits of TW ACKs of oow data or is paws_reject, 1048 * while not cleaning ECN bits of other TW ACKs to avoid these ACKs 1049 * being placed in a different service queues (Classic rather than L4S) 1050 */ 1051 if (tw_status == TCP_TW_ACK_OOW) 1052 tos &= ~INET_ECN_MASK; 1053 1054 #ifdef CONFIG_TCP_AO 1055 struct tcp_ao_info *ao_info; 1056 1057 if (static_branch_unlikely(&tcp_ao_needed.key)) { 1058 /* FIXME: the segment to-be-acked is not verified yet */ 1059 ao_info = rcu_dereference(tcptw->ao_info); 1060 if (ao_info) { 1061 const struct tcp_ao_hdr *aoh; 1062 1063 if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) { 1064 inet_twsk_put(tw); 1065 return; 1066 } 1067 1068 if (aoh) 1069 key.ao_key = tcp_ao_established_key(sk, ao_info, 1070 aoh->rnext_keyid, -1); 1071 } 1072 } 1073 if (key.ao_key) { 1074 struct tcp_ao_key *rnext_key; 1075 1076 key.traffic_key = snd_other_key(key.ao_key); 1077 key.sne = READ_ONCE(ao_info->snd_sne); 1078 rnext_key = READ_ONCE(ao_info->rnext_key); 1079 key.rcv_next = rnext_key->rcvid; 1080 key.type = TCP_KEY_AO; 1081 #else 1082 if (0) { 1083 #endif 1084 } else if (static_branch_tcp_md5()) { 1085 key.md5_key = tcp_twsk_md5_key(tcptw); 1086 if (key.md5_key) 1087 key.type = TCP_KEY_MD5; 1088 } 1089 1090 tcp_v4_send_ack(sk, skb, 1091 tcptw->tw_snd_nxt, READ_ONCE(tcptw->tw_rcv_nxt), 1092 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, 1093 tcp_tw_tsval(tcptw), 1094 READ_ONCE(tcptw->tw_ts_recent), 1095 tw->tw_bound_dev_if, &key, 1096 tw->tw_transparent ? 
IP_REPLY_ARG_NOSRCCHECK : 0, 1097 tos, 1098 tw->tw_txhash); 1099 1100 inet_twsk_put(tw); 1101 } 1102 1103 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, 1104 struct request_sock *req) 1105 { 1106 struct tcp_key key = {}; 1107 1108 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV 1109 * sk->sk_state == TCP_SYN_RECV -> for Fast Open. 1110 */ 1111 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 : 1112 tcp_sk(sk)->snd_nxt; 1113 1114 #ifdef CONFIG_TCP_AO 1115 if (static_branch_unlikely(&tcp_ao_needed.key) && 1116 tcp_rsk_used_ao(req)) { 1117 const union tcp_md5_addr *addr; 1118 const struct tcp_ao_hdr *aoh; 1119 int l3index; 1120 1121 /* Invalid TCP option size or twice included auth */ 1122 if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) 1123 return; 1124 if (!aoh) 1125 return; 1126 1127 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 1128 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; 1129 key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, 1130 aoh->rnext_keyid, -1); 1131 if (unlikely(!key.ao_key)) { 1132 /* Send ACK with any matching MKT for the peer */ 1133 key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1); 1134 /* Matching key disappeared (user removed the key?) 1135 * let the handshake timeout. 1136 */ 1137 if (!key.ao_key) { 1138 net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n", 1139 addr, 1140 ntohs(tcp_hdr(skb)->source), 1141 &ip_hdr(skb)->daddr, 1142 ntohs(tcp_hdr(skb)->dest)); 1143 return; 1144 } 1145 } 1146 key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC); 1147 if (!key.traffic_key) 1148 return; 1149 1150 key.type = TCP_KEY_AO; 1151 key.rcv_next = aoh->keyid; 1152 tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req); 1153 #else 1154 if (0) { 1155 #endif 1156 } else if (static_branch_tcp_md5()) { 1157 const union tcp_md5_addr *addr; 1158 int l3index; 1159 1160 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 1161 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; 1162 key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1163 if (key.md5_key) 1164 key.type = TCP_KEY_MD5; 1165 } 1166 1167 /* Cleaning ECN bits of TW ACKs of oow data or is paws_reject */ 1168 tcp_v4_send_ack(sk, skb, seq, 1169 tcp_rsk(req)->rcv_nxt, 1170 tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale, 1171 tcp_rsk_tsval(tcp_rsk(req)), 1172 req->ts_recent, 1173 0, &key, 1174 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0, 1175 ip_hdr(skb)->tos & ~INET_ECN_MASK, 1176 READ_ONCE(tcp_rsk(req)->txhash)); 1177 if (tcp_key_is_ao(&key)) 1178 kfree(key.traffic_key); 1179 } 1180 1181 /* 1182 * Send a SYN-ACK after having received a SYN. 1183 * This still operates on a request_sock only, not on a big 1184 * socket. 1185 */ 1186 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, 1187 struct flowi *fl, 1188 struct request_sock *req, 1189 struct tcp_fastopen_cookie *foc, 1190 enum tcp_synack_type synack_type, 1191 struct sk_buff *syn_skb) 1192 { 1193 const struct inet_request_sock *ireq = inet_rsk(req); 1194 struct flowi4 fl4; 1195 int err = -1; 1196 struct sk_buff *skb; 1197 u8 tos; 1198 1199 /* First, grab a route. 
*/ 1200 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) 1201 return -1; 1202 1203 skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb); 1204 1205 if (skb) { 1206 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); 1207 1208 tos = READ_ONCE(inet_sk(sk)->tos); 1209 1210 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)) 1211 tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) | 1212 (tos & INET_ECN_MASK); 1213 1214 if (!INET_ECN_is_capable(tos) && 1215 tcp_bpf_ca_needs_ecn((struct sock *)req)) 1216 tos |= INET_ECN_ECT_0; 1217 1218 rcu_read_lock(); 1219 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, 1220 ireq->ir_rmt_addr, 1221 rcu_dereference(ireq->ireq_opt), 1222 tos); 1223 rcu_read_unlock(); 1224 err = net_xmit_eval(err); 1225 } 1226 1227 return err; 1228 } 1229 1230 /* 1231 * IPv4 request_sock destructor. 1232 */ 1233 static void tcp_v4_reqsk_destructor(struct request_sock *req) 1234 { 1235 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1)); 1236 } 1237 1238 #ifdef CONFIG_TCP_MD5SIG 1239 /* 1240 * RFC2385 MD5 checksumming requires a mapping of 1241 * IP address->MD5 Key. 1242 * We need to maintain these in the sk structure. 1243 */ 1244 1245 DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ); 1246 EXPORT_IPV6_MOD(tcp_md5_needed); 1247 1248 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new) 1249 { 1250 if (!old) 1251 return true; 1252 1253 /* l3index always overrides non-l3index */ 1254 if (old->l3index && new->l3index == 0) 1255 return false; 1256 if (old->l3index == 0 && new->l3index) 1257 return true; 1258 1259 return old->prefixlen < new->prefixlen; 1260 } 1261 1262 /* Find the Key structure for an address. */ 1263 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index, 1264 const union tcp_md5_addr *addr, 1265 int family, bool any_l3index) 1266 { 1267 const struct tcp_sock *tp = tcp_sk(sk); 1268 struct tcp_md5sig_key *key; 1269 const struct tcp_md5sig_info *md5sig; 1270 __be32 mask; 1271 struct tcp_md5sig_key *best_match = NULL; 1272 bool match; 1273 1274 /* caller either holds rcu_read_lock() or socket lock */ 1275 md5sig = rcu_dereference_check(tp->md5sig_info, 1276 lockdep_sock_is_held(sk)); 1277 if (!md5sig) 1278 return NULL; 1279 1280 hlist_for_each_entry_rcu(key, &md5sig->head, node, 1281 lockdep_sock_is_held(sk)) { 1282 if (key->family != family) 1283 continue; 1284 if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX && 1285 key->l3index != l3index) 1286 continue; 1287 if (family == AF_INET) { 1288 mask = inet_make_mask(key->prefixlen); 1289 match = (key->addr.a4.s_addr & mask) == 1290 (addr->a4.s_addr & mask); 1291 #if IS_ENABLED(CONFIG_IPV6) 1292 } else if (family == AF_INET6) { 1293 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6, 1294 key->prefixlen); 1295 #endif 1296 } else { 1297 match = false; 1298 } 1299 1300 if (match && better_md5_match(best_match, key)) 1301 best_match = key; 1302 } 1303 return best_match; 1304 } 1305 EXPORT_IPV6_MOD(__tcp_md5_do_lookup); 1306 1307 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk, 1308 const union tcp_md5_addr *addr, 1309 int family, u8 prefixlen, 1310 int l3index, u8 flags) 1311 { 1312 const struct tcp_sock *tp = tcp_sk(sk); 1313 struct tcp_md5sig_key *key; 1314 unsigned int size = sizeof(struct in_addr); 1315 const struct tcp_md5sig_info *md5sig; 1316 1317 /* caller either holds rcu_read_lock() or socket lock */ 1318 md5sig = rcu_dereference_check(tp->md5sig_info, 1319 
lockdep_sock_is_held(sk)); 1320 if (!md5sig) 1321 return NULL; 1322 #if IS_ENABLED(CONFIG_IPV6) 1323 if (family == AF_INET6) 1324 size = sizeof(struct in6_addr); 1325 #endif 1326 hlist_for_each_entry_rcu(key, &md5sig->head, node, 1327 lockdep_sock_is_held(sk)) { 1328 if (key->family != family) 1329 continue; 1330 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX)) 1331 continue; 1332 if (key->l3index != l3index) 1333 continue; 1334 if (!memcmp(&key->addr, addr, size) && 1335 key->prefixlen == prefixlen) 1336 return key; 1337 } 1338 return NULL; 1339 } 1340 1341 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, 1342 const struct sock *addr_sk) 1343 { 1344 const union tcp_md5_addr *addr; 1345 int l3index; 1346 1347 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), 1348 addr_sk->sk_bound_dev_if); 1349 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr; 1350 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1351 } 1352 EXPORT_IPV6_MOD(tcp_v4_md5_lookup); 1353 1354 static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp) 1355 { 1356 struct tcp_sock *tp = tcp_sk(sk); 1357 struct tcp_md5sig_info *md5sig; 1358 1359 md5sig = kmalloc(sizeof(*md5sig), gfp); 1360 if (!md5sig) 1361 return -ENOMEM; 1362 1363 sk_gso_disable(sk); 1364 INIT_HLIST_HEAD(&md5sig->head); 1365 rcu_assign_pointer(tp->md5sig_info, md5sig); 1366 return 0; 1367 } 1368 1369 /* This can be called on a newly created socket, from other files */ 1370 static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, 1371 int family, u8 prefixlen, int l3index, u8 flags, 1372 const u8 *newkey, u8 newkeylen, gfp_t gfp) 1373 { 1374 /* Add Key to the list */ 1375 struct tcp_md5sig_key *key; 1376 struct tcp_sock *tp = tcp_sk(sk); 1377 struct tcp_md5sig_info *md5sig; 1378 1379 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags); 1380 if (key) { 1381 /* Pre-existing entry - just update that one. 1382 * Note that the key might be used concurrently. 1383 * data_race() is telling kcsan that we do not care of 1384 * key mismatches, since changing MD5 key on live flows 1385 * can lead to packet drops. 1386 */ 1387 data_race(memcpy(key->key, newkey, newkeylen)); 1388 1389 /* Pairs with READ_ONCE() in tcp_md5_hash_key(). 1390 * Also note that a reader could catch new key->keylen value 1391 * but old key->key[], this is the reason we use __GFP_ZERO 1392 * at sock_kmalloc() time below these lines. 1393 */ 1394 WRITE_ONCE(key->keylen, newkeylen); 1395 1396 return 0; 1397 } 1398 1399 md5sig = rcu_dereference_protected(tp->md5sig_info, 1400 lockdep_sock_is_held(sk)); 1401 1402 key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO); 1403 if (!key) 1404 return -ENOMEM; 1405 1406 memcpy(key->key, newkey, newkeylen); 1407 key->keylen = newkeylen; 1408 key->family = family; 1409 key->prefixlen = prefixlen; 1410 key->l3index = l3index; 1411 key->flags = flags; 1412 memcpy(&key->addr, addr, 1413 (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? 
sizeof(struct in6_addr) : 1414 sizeof(struct in_addr)); 1415 hlist_add_head_rcu(&key->node, &md5sig->head); 1416 return 0; 1417 } 1418 1419 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, 1420 int family, u8 prefixlen, int l3index, u8 flags, 1421 const u8 *newkey, u8 newkeylen) 1422 { 1423 struct tcp_sock *tp = tcp_sk(sk); 1424 1425 if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) { 1426 if (tcp_md5_alloc_sigpool()) 1427 return -ENOMEM; 1428 1429 if (tcp_md5sig_info_add(sk, GFP_KERNEL)) { 1430 tcp_md5_release_sigpool(); 1431 return -ENOMEM; 1432 } 1433 1434 if (!static_branch_inc(&tcp_md5_needed.key)) { 1435 struct tcp_md5sig_info *md5sig; 1436 1437 md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk)); 1438 rcu_assign_pointer(tp->md5sig_info, NULL); 1439 kfree_rcu(md5sig, rcu); 1440 tcp_md5_release_sigpool(); 1441 return -EUSERS; 1442 } 1443 } 1444 1445 return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags, 1446 newkey, newkeylen, GFP_KERNEL); 1447 } 1448 EXPORT_IPV6_MOD(tcp_md5_do_add); 1449 1450 int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr, 1451 int family, u8 prefixlen, int l3index, 1452 struct tcp_md5sig_key *key) 1453 { 1454 struct tcp_sock *tp = tcp_sk(sk); 1455 1456 if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) { 1457 tcp_md5_add_sigpool(); 1458 1459 if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC))) { 1460 tcp_md5_release_sigpool(); 1461 return -ENOMEM; 1462 } 1463 1464 if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) { 1465 struct tcp_md5sig_info *md5sig; 1466 1467 md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk)); 1468 net_warn_ratelimited("Too many TCP-MD5 keys in the system\n"); 1469 rcu_assign_pointer(tp->md5sig_info, NULL); 1470 kfree_rcu(md5sig, rcu); 1471 tcp_md5_release_sigpool(); 1472 return -EUSERS; 1473 } 1474 } 1475 1476 return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, 1477 key->flags, key->key, key->keylen, 1478 sk_gfp_mask(sk, GFP_ATOMIC)); 1479 } 1480 EXPORT_IPV6_MOD(tcp_md5_key_copy); 1481 1482 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family, 1483 u8 prefixlen, int l3index, u8 flags) 1484 { 1485 struct tcp_md5sig_key *key; 1486 1487 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags); 1488 if (!key) 1489 return -ENOENT; 1490 hlist_del_rcu(&key->node); 1491 atomic_sub(sizeof(*key), &sk->sk_omem_alloc); 1492 kfree_rcu(key, rcu); 1493 return 0; 1494 } 1495 EXPORT_IPV6_MOD(tcp_md5_do_del); 1496 1497 void tcp_clear_md5_list(struct sock *sk) 1498 { 1499 struct tcp_sock *tp = tcp_sk(sk); 1500 struct tcp_md5sig_key *key; 1501 struct hlist_node *n; 1502 struct tcp_md5sig_info *md5sig; 1503 1504 md5sig = rcu_dereference_protected(tp->md5sig_info, 1); 1505 1506 hlist_for_each_entry_safe(key, n, &md5sig->head, node) { 1507 hlist_del_rcu(&key->node); 1508 atomic_sub(sizeof(*key), &sk->sk_omem_alloc); 1509 kfree_rcu(key, rcu); 1510 } 1511 } 1512 1513 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, 1514 sockptr_t optval, int optlen) 1515 { 1516 struct tcp_md5sig cmd; 1517 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr; 1518 const union tcp_md5_addr *addr; 1519 u8 prefixlen = 32; 1520 int l3index = 0; 1521 bool l3flag; 1522 u8 flags; 1523 1524 if (optlen < sizeof(cmd)) 1525 return -EINVAL; 1526 1527 if (copy_from_sockptr(&cmd, optval, sizeof(cmd))) 1528 return -EFAULT; 1529 1530 if (sin->sin_family 
!= AF_INET) 1531 return -EINVAL; 1532 1533 flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX; 1534 l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX; 1535 1536 if (optname == TCP_MD5SIG_EXT && 1537 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) { 1538 prefixlen = cmd.tcpm_prefixlen; 1539 if (prefixlen > 32) 1540 return -EINVAL; 1541 } 1542 1543 if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex && 1544 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) { 1545 struct net_device *dev; 1546 1547 rcu_read_lock(); 1548 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex); 1549 if (dev && netif_is_l3_master(dev)) 1550 l3index = dev->ifindex; 1551 1552 rcu_read_unlock(); 1553 1554 /* ok to reference set/not set outside of rcu; 1555 * right now device MUST be an L3 master 1556 */ 1557 if (!dev || !l3index) 1558 return -EINVAL; 1559 } 1560 1561 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr; 1562 1563 if (!cmd.tcpm_keylen) 1564 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags); 1565 1566 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) 1567 return -EINVAL; 1568 1569 /* Don't allow keys for peers that have a matching TCP-AO key. 1570 * See the comment in tcp_ao_add_cmd() 1571 */ 1572 if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false)) 1573 return -EKEYREJECTED; 1574 1575 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags, 1576 cmd.tcpm_key, cmd.tcpm_keylen); 1577 } 1578 1579 static int tcp_v4_md5_hash_headers(struct tcp_sigpool *hp, 1580 __be32 daddr, __be32 saddr, 1581 const struct tcphdr *th, int nbytes) 1582 { 1583 struct tcp4_pseudohdr *bp; 1584 struct scatterlist sg; 1585 struct tcphdr *_th; 1586 1587 bp = hp->scratch; 1588 bp->saddr = saddr; 1589 bp->daddr = daddr; 1590 bp->pad = 0; 1591 bp->protocol = IPPROTO_TCP; 1592 bp->len = cpu_to_be16(nbytes); 1593 1594 _th = (struct tcphdr *)(bp + 1); 1595 memcpy(_th, th, sizeof(*th)); 1596 _th->check = 0; 1597 1598 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th)); 1599 ahash_request_set_crypt(hp->req, &sg, NULL, 1600 sizeof(*bp) + sizeof(*th)); 1601 return crypto_ahash_update(hp->req); 1602 } 1603 1604 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, 1605 __be32 daddr, __be32 saddr, const struct tcphdr *th) 1606 { 1607 struct tcp_sigpool hp; 1608 1609 if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp)) 1610 goto clear_hash_nostart; 1611 1612 if (crypto_ahash_init(hp.req)) 1613 goto clear_hash; 1614 if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, th->doff << 2)) 1615 goto clear_hash; 1616 if (tcp_md5_hash_key(&hp, key)) 1617 goto clear_hash; 1618 ahash_request_set_crypt(hp.req, NULL, md5_hash, 0); 1619 if (crypto_ahash_final(hp.req)) 1620 goto clear_hash; 1621 1622 tcp_sigpool_end(&hp); 1623 return 0; 1624 1625 clear_hash: 1626 tcp_sigpool_end(&hp); 1627 clear_hash_nostart: 1628 memset(md5_hash, 0, 16); 1629 return 1; 1630 } 1631 1632 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, 1633 const struct sock *sk, 1634 const struct sk_buff *skb) 1635 { 1636 const struct tcphdr *th = tcp_hdr(skb); 1637 struct tcp_sigpool hp; 1638 __be32 saddr, daddr; 1639 1640 if (sk) { /* valid for establish/request sockets */ 1641 saddr = sk->sk_rcv_saddr; 1642 daddr = sk->sk_daddr; 1643 } else { 1644 const struct iphdr *iph = ip_hdr(skb); 1645 saddr = iph->saddr; 1646 daddr = iph->daddr; 1647 } 1648 1649 if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp)) 1650 goto clear_hash_nostart; 1651 1652 if (crypto_ahash_init(hp.req)) 1653 goto clear_hash; 1654 1655 if 
(tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, skb->len)) 1656 goto clear_hash; 1657 if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2)) 1658 goto clear_hash; 1659 if (tcp_md5_hash_key(&hp, key)) 1660 goto clear_hash; 1661 ahash_request_set_crypt(hp.req, NULL, md5_hash, 0); 1662 if (crypto_ahash_final(hp.req)) 1663 goto clear_hash; 1664 1665 tcp_sigpool_end(&hp); 1666 return 0; 1667 1668 clear_hash: 1669 tcp_sigpool_end(&hp); 1670 clear_hash_nostart: 1671 memset(md5_hash, 0, 16); 1672 return 1; 1673 } 1674 EXPORT_IPV6_MOD(tcp_v4_md5_hash_skb); 1675 1676 #endif 1677 1678 static void tcp_v4_init_req(struct request_sock *req, 1679 const struct sock *sk_listener, 1680 struct sk_buff *skb) 1681 { 1682 struct inet_request_sock *ireq = inet_rsk(req); 1683 struct net *net = sock_net(sk_listener); 1684 1685 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); 1686 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); 1687 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb)); 1688 } 1689 1690 static struct dst_entry *tcp_v4_route_req(const struct sock *sk, 1691 struct sk_buff *skb, 1692 struct flowi *fl, 1693 struct request_sock *req, 1694 u32 tw_isn) 1695 { 1696 tcp_v4_init_req(req, sk, skb); 1697 1698 if (security_inet_conn_request(sk, skb, req)) 1699 return NULL; 1700 1701 return inet_csk_route_req(sk, &fl->u.ip4, req); 1702 } 1703 1704 struct request_sock_ops tcp_request_sock_ops __read_mostly = { 1705 .family = PF_INET, 1706 .obj_size = sizeof(struct tcp_request_sock), 1707 .send_ack = tcp_v4_reqsk_send_ack, 1708 .destructor = tcp_v4_reqsk_destructor, 1709 .send_reset = tcp_v4_send_reset, 1710 .syn_ack_timeout = tcp_syn_ack_timeout, 1711 }; 1712 1713 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { 1714 .mss_clamp = TCP_MSS_DEFAULT, 1715 #ifdef CONFIG_TCP_MD5SIG 1716 .req_md5_lookup = tcp_v4_md5_lookup, 1717 .calc_md5_hash = tcp_v4_md5_hash_skb, 1718 #endif 1719 #ifdef CONFIG_TCP_AO 1720 .ao_lookup = tcp_v4_ao_lookup_rsk, 1721 .ao_calc_key = tcp_v4_ao_calc_key_rsk, 1722 .ao_synack_hash = tcp_v4_ao_synack_hash, 1723 #endif 1724 #ifdef CONFIG_SYN_COOKIES 1725 .cookie_init_seq = cookie_v4_init_sequence, 1726 #endif 1727 .route_req = tcp_v4_route_req, 1728 .init_seq = tcp_v4_init_seq, 1729 .init_ts_off = tcp_v4_init_ts_off, 1730 .send_synack = tcp_v4_send_synack, 1731 }; 1732 1733 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) 1734 { 1735 /* Never answer to SYNs send to broadcast or multicast */ 1736 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) 1737 goto drop; 1738 1739 return tcp_conn_request(&tcp_request_sock_ops, 1740 &tcp_request_sock_ipv4_ops, sk, skb); 1741 1742 drop: 1743 tcp_listendrop(sk); 1744 return 0; 1745 } 1746 EXPORT_IPV6_MOD(tcp_v4_conn_request); 1747 1748 1749 /* 1750 * The three way handshake has completed - we got a valid synack - 1751 * now create the new socket. 
1752 */ 1753 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, 1754 struct request_sock *req, 1755 struct dst_entry *dst, 1756 struct request_sock *req_unhash, 1757 bool *own_req) 1758 { 1759 struct inet_request_sock *ireq; 1760 bool found_dup_sk = false; 1761 struct inet_sock *newinet; 1762 struct tcp_sock *newtp; 1763 struct sock *newsk; 1764 #ifdef CONFIG_TCP_MD5SIG 1765 const union tcp_md5_addr *addr; 1766 struct tcp_md5sig_key *key; 1767 int l3index; 1768 #endif 1769 struct ip_options_rcu *inet_opt; 1770 1771 if (sk_acceptq_is_full(sk)) 1772 goto exit_overflow; 1773 1774 newsk = tcp_create_openreq_child(sk, req, skb); 1775 if (!newsk) 1776 goto exit_nonewsk; 1777 1778 newsk->sk_gso_type = SKB_GSO_TCPV4; 1779 inet_sk_rx_dst_set(newsk, skb); 1780 1781 newtp = tcp_sk(newsk); 1782 newinet = inet_sk(newsk); 1783 ireq = inet_rsk(req); 1784 inet_opt = rcu_dereference(ireq->ireq_opt); 1785 RCU_INIT_POINTER(newinet->inet_opt, inet_opt); 1786 newinet->mc_index = inet_iif(skb); 1787 newinet->mc_ttl = ip_hdr(skb)->ttl; 1788 newinet->rcv_tos = ip_hdr(skb)->tos; 1789 inet_csk(newsk)->icsk_ext_hdr_len = 0; 1790 if (inet_opt) 1791 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 1792 atomic_set(&newinet->inet_id, get_random_u16()); 1793 1794 /* Set ToS of the new socket based upon the value of incoming SYN. 1795 * ECT bits are set later in tcp_init_transfer(). 1796 */ 1797 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)) 1798 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK; 1799 1800 if (!dst) { 1801 dst = inet_csk_route_child_sock(sk, newsk, req); 1802 if (!dst) 1803 goto put_and_exit; 1804 } else { 1805 /* syncookie case : see end of cookie_v4_check() */ 1806 } 1807 sk_setup_caps(newsk, dst); 1808 1809 tcp_ca_openreq_child(newsk, dst); 1810 1811 tcp_sync_mss(newsk, dst_mtu(dst)); 1812 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst)); 1813 1814 tcp_initialize_rcv_mss(newsk); 1815 1816 #ifdef CONFIG_TCP_MD5SIG 1817 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif); 1818 /* Copy over the MD5 key from the original socket */ 1819 addr = (union tcp_md5_addr *)&newinet->inet_daddr; 1820 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1821 if (key && !tcp_rsk_used_ao(req)) { 1822 if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key)) 1823 goto put_and_exit; 1824 sk_gso_disable(newsk); 1825 } 1826 #endif 1827 #ifdef CONFIG_TCP_AO 1828 if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET)) 1829 goto put_and_exit; /* OOM, release back memory */ 1830 #endif 1831 1832 if (__inet_inherit_port(sk, newsk) < 0) 1833 goto put_and_exit; 1834 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), 1835 &found_dup_sk); 1836 if (likely(*own_req)) { 1837 tcp_move_syn(newtp, req); 1838 ireq->ireq_opt = NULL; 1839 } else { 1840 newinet->inet_opt = NULL; 1841 1842 if (!req_unhash && found_dup_sk) { 1843 /* This code path should only be executed in the 1844 * syncookie case only 1845 */ 1846 bh_unlock_sock(newsk); 1847 sock_put(newsk); 1848 newsk = NULL; 1849 } 1850 } 1851 return newsk; 1852 1853 exit_overflow: 1854 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); 1855 exit_nonewsk: 1856 dst_release(dst); 1857 exit: 1858 tcp_listendrop(sk); 1859 return NULL; 1860 put_and_exit: 1861 newinet->inet_opt = NULL; 1862 inet_csk_prepare_forced_close(newsk); 1863 tcp_done(newsk); 1864 goto exit; 1865 } 1866 EXPORT_IPV6_MOD(tcp_v4_syn_recv_sock); 1867 1868 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct 
sk_buff *skb) 1869 { 1870 #ifdef CONFIG_SYN_COOKIES 1871 const struct tcphdr *th = tcp_hdr(skb); 1872 1873 if (!th->syn) 1874 sk = cookie_v4_check(sk, skb); 1875 #endif 1876 return sk; 1877 } 1878 1879 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph, 1880 struct tcphdr *th, u32 *cookie) 1881 { 1882 u16 mss = 0; 1883 #ifdef CONFIG_SYN_COOKIES 1884 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops, 1885 &tcp_request_sock_ipv4_ops, sk, th); 1886 if (mss) { 1887 *cookie = __cookie_v4_init_sequence(iph, th, &mss); 1888 tcp_synq_overflow(sk); 1889 } 1890 #endif 1891 return mss; 1892 } 1893 1894 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, 1895 u32)); 1896 /* The socket must have it's spinlock held when we get 1897 * here, unless it is a TCP_LISTEN socket. 1898 * 1899 * We have a potential double-lock case here, so even when 1900 * doing backlog processing we use the BH locking scheme. 1901 * This is because we cannot sleep with the original spinlock 1902 * held. 1903 */ 1904 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) 1905 { 1906 enum skb_drop_reason reason; 1907 struct sock *rsk; 1908 1909 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ 1910 struct dst_entry *dst; 1911 1912 dst = rcu_dereference_protected(sk->sk_rx_dst, 1913 lockdep_sock_is_held(sk)); 1914 1915 sock_rps_save_rxhash(sk, skb); 1916 sk_mark_napi_id(sk, skb); 1917 if (dst) { 1918 if (sk->sk_rx_dst_ifindex != skb->skb_iif || 1919 !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check, 1920 dst, 0)) { 1921 RCU_INIT_POINTER(sk->sk_rx_dst, NULL); 1922 dst_release(dst); 1923 } 1924 } 1925 tcp_rcv_established(sk, skb); 1926 return 0; 1927 } 1928 1929 if (tcp_checksum_complete(skb)) 1930 goto csum_err; 1931 1932 if (sk->sk_state == TCP_LISTEN) { 1933 struct sock *nsk = tcp_v4_cookie_check(sk, skb); 1934 1935 if (!nsk) 1936 return 0; 1937 if (nsk != sk) { 1938 reason = tcp_child_process(sk, nsk, skb); 1939 if (reason) { 1940 rsk = nsk; 1941 goto reset; 1942 } 1943 return 0; 1944 } 1945 } else 1946 sock_rps_save_rxhash(sk, skb); 1947 1948 reason = tcp_rcv_state_process(sk, skb); 1949 if (reason) { 1950 rsk = sk; 1951 goto reset; 1952 } 1953 return 0; 1954 1955 reset: 1956 tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason)); 1957 discard: 1958 sk_skb_reason_drop(sk, skb, reason); 1959 /* Be careful here. If this function gets more complicated and 1960 * gcc suffers from register pressure on the x86, sk (in %ebx) 1961 * might be destroyed here. This current version compiles correctly, 1962 * but you have been warned. 
1963 */ 1964 return 0; 1965 1966 csum_err: 1967 reason = SKB_DROP_REASON_TCP_CSUM; 1968 trace_tcp_bad_csum(skb); 1969 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 1970 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 1971 goto discard; 1972 } 1973 EXPORT_SYMBOL(tcp_v4_do_rcv); 1974 1975 int tcp_v4_early_demux(struct sk_buff *skb) 1976 { 1977 struct net *net = dev_net_rcu(skb->dev); 1978 const struct iphdr *iph; 1979 const struct tcphdr *th; 1980 struct sock *sk; 1981 1982 if (skb->pkt_type != PACKET_HOST) 1983 return 0; 1984 1985 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) 1986 return 0; 1987 1988 iph = ip_hdr(skb); 1989 th = tcp_hdr(skb); 1990 1991 if (th->doff < sizeof(struct tcphdr) / 4) 1992 return 0; 1993 1994 sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, 1995 iph->saddr, th->source, 1996 iph->daddr, ntohs(th->dest), 1997 skb->skb_iif, inet_sdif(skb)); 1998 if (sk) { 1999 skb->sk = sk; 2000 skb->destructor = sock_edemux; 2001 if (sk_fullsock(sk)) { 2002 struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst); 2003 2004 if (dst) 2005 dst = dst_check(dst, 0); 2006 if (dst && 2007 sk->sk_rx_dst_ifindex == skb->skb_iif) 2008 skb_dst_set_noref(skb, dst); 2009 } 2010 } 2011 return 0; 2012 } 2013 2014 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, 2015 enum skb_drop_reason *reason) 2016 { 2017 u32 tail_gso_size, tail_gso_segs; 2018 struct skb_shared_info *shinfo; 2019 const struct tcphdr *th; 2020 struct tcphdr *thtail; 2021 struct sk_buff *tail; 2022 unsigned int hdrlen; 2023 bool fragstolen; 2024 u32 gso_segs; 2025 u32 gso_size; 2026 u64 limit; 2027 int delta; 2028 2029 /* In case all data was pulled from skb frags (in __pskb_pull_tail()), 2030 * we can fix skb->truesize to its real value to avoid future drops. 2031 * This is valid because skb is not yet charged to the socket. 2032 * It has been noticed pure SACK packets were sometimes dropped 2033 * (if cooked by drivers without copybreak feature). 2034 */ 2035 skb_condense(skb); 2036 2037 tcp_cleanup_skb(skb); 2038 2039 if (unlikely(tcp_checksum_complete(skb))) { 2040 bh_unlock_sock(sk); 2041 trace_tcp_bad_csum(skb); 2042 *reason = SKB_DROP_REASON_TCP_CSUM; 2043 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 2044 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 2045 return true; 2046 } 2047 2048 /* Attempt coalescing to last skb in backlog, even if we are 2049 * above the limits. 2050 * This is okay because skb capacity is limited to MAX_SKB_FRAGS. 
2051 */ 2052 th = (const struct tcphdr *)skb->data; 2053 hdrlen = th->doff * 4; 2054 2055 tail = sk->sk_backlog.tail; 2056 if (!tail) 2057 goto no_coalesce; 2058 thtail = (struct tcphdr *)tail->data; 2059 2060 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq || 2061 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield || 2062 ((TCP_SKB_CB(tail)->tcp_flags | 2063 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) || 2064 !((TCP_SKB_CB(tail)->tcp_flags & 2065 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) || 2066 ((TCP_SKB_CB(tail)->tcp_flags ^ 2067 TCP_SKB_CB(skb)->tcp_flags) & 2068 (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE)) || 2069 !tcp_skb_can_collapse_rx(tail, skb) || 2070 thtail->doff != th->doff || 2071 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th))) 2072 goto no_coalesce; 2073 2074 __skb_pull(skb, hdrlen); 2075 2076 shinfo = skb_shinfo(skb); 2077 gso_size = shinfo->gso_size ?: skb->len; 2078 gso_segs = shinfo->gso_segs ?: 1; 2079 2080 shinfo = skb_shinfo(tail); 2081 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen); 2082 tail_gso_segs = shinfo->gso_segs ?: 1; 2083 2084 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) { 2085 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq; 2086 2087 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) { 2088 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq; 2089 thtail->window = th->window; 2090 } 2091 2092 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and 2093 * thtail->fin, so that the fast path in tcp_rcv_established() 2094 * is not entered if we append a packet with a FIN. 2095 * SYN, RST, URG are not present. 2096 * ACK is set on both packets. 2097 * PSH: we do not really care in the TCP stack, 2098 * at least for 'GRO' packets. 2099 */ 2100 thtail->fin |= th->fin; 2101 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; 2102 2103 if (TCP_SKB_CB(skb)->has_rxtstamp) { 2104 TCP_SKB_CB(tail)->has_rxtstamp = true; 2105 tail->tstamp = skb->tstamp; 2106 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp; 2107 } 2108 2109 /* Not as strict as GRO. We only need to carry the max mss value */ 2110 shinfo->gso_size = max(gso_size, tail_gso_size); 2111 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF); 2112 2113 sk->sk_backlog.len += delta; 2114 __NET_INC_STATS(sock_net(sk), 2115 LINUX_MIB_TCPBACKLOGCOALESCE); 2116 kfree_skb_partial(skb, fragstolen); 2117 return false; 2118 } 2119 __skb_push(skb, hdrlen); 2120 2121 no_coalesce: 2122 /* sk->sk_backlog.len is reset only at the end of __release_sock(). 2123 * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach 2124 * sk_rcvbuf in normal conditions. 2125 */ 2126 limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1; 2127 2128 limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1; 2129 2130 /* Only the socket owner can try to collapse/prune rx queues 2131 * to reduce memory overhead, so add a little headroom here. 2132 * Only a few socket backlogs are likely to be non-empty at the same time. 
2133 */ 2134 limit += 64 * 1024; 2135 2136 limit = min_t(u64, limit, UINT_MAX); 2137 2138 if (unlikely(sk_add_backlog(sk, skb, limit))) { 2139 bh_unlock_sock(sk); 2140 *reason = SKB_DROP_REASON_SOCKET_BACKLOG; 2141 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP); 2142 return true; 2143 } 2144 return false; 2145 } 2146 EXPORT_IPV6_MOD(tcp_add_backlog); 2147 2148 int tcp_filter(struct sock *sk, struct sk_buff *skb) 2149 { 2150 struct tcphdr *th = (struct tcphdr *)skb->data; 2151 2152 return sk_filter_trim_cap(sk, skb, th->doff * 4); 2153 } 2154 EXPORT_IPV6_MOD(tcp_filter); 2155 2156 static void tcp_v4_restore_cb(struct sk_buff *skb) 2157 { 2158 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4, 2159 sizeof(struct inet_skb_parm)); 2160 } 2161 2162 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, 2163 const struct tcphdr *th) 2164 { 2165 /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB(). 2166 * barrier() makes sure the compiler won't play fool^Waliasing games. 2167 */ 2168 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb), 2169 sizeof(struct inet_skb_parm)); 2170 barrier(); 2171 2172 TCP_SKB_CB(skb)->seq = ntohl(th->seq); 2173 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + 2174 skb->len - th->doff * 4); 2175 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); 2176 TCP_SKB_CB(skb)->tcp_flags = tcp_flags_ntohs(th); 2177 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); 2178 TCP_SKB_CB(skb)->sacked = 0; 2179 TCP_SKB_CB(skb)->has_rxtstamp = 2180 skb->tstamp || skb_hwtstamps(skb)->hwtstamp; 2181 } 2182 2183 /* 2184 * From tcp_input.c 2185 */ 2186 2187 int tcp_v4_rcv(struct sk_buff *skb) 2188 { 2189 struct net *net = dev_net_rcu(skb->dev); 2190 enum skb_drop_reason drop_reason; 2191 enum tcp_tw_status tw_status; 2192 int sdif = inet_sdif(skb); 2193 int dif = inet_iif(skb); 2194 const struct iphdr *iph; 2195 const struct tcphdr *th; 2196 struct sock *sk = NULL; 2197 bool refcounted; 2198 int ret; 2199 u32 isn; 2200 2201 drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; 2202 if (skb->pkt_type != PACKET_HOST) 2203 goto discard_it; 2204 2205 /* Count it even if it's bad */ 2206 __TCP_INC_STATS(net, TCP_MIB_INSEGS); 2207 2208 if (!pskb_may_pull(skb, sizeof(struct tcphdr))) 2209 goto discard_it; 2210 2211 th = (const struct tcphdr *)skb->data; 2212 2213 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) { 2214 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL; 2215 goto bad_packet; 2216 } 2217 if (!pskb_may_pull(skb, th->doff * 4)) 2218 goto discard_it; 2219 2220 /* An explanation is required here, I think. 2221 * Packet length and doff are validated by header prediction, 2222 * provided the case of th->doff==0 is eliminated. 2223 * So, we defer the checks. 
*/ 2224 2225 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo)) 2226 goto csum_error; 2227 2228 th = (const struct tcphdr *)skb->data; 2229 iph = ip_hdr(skb); 2230 lookup: 2231 sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo, 2232 skb, __tcp_hdrlen(th), th->source, 2233 th->dest, sdif, &refcounted); 2234 if (!sk) 2235 goto no_tcp_socket; 2236 2237 if (sk->sk_state == TCP_TIME_WAIT) 2238 goto do_time_wait; 2239 2240 if (sk->sk_state == TCP_NEW_SYN_RECV) { 2241 struct request_sock *req = inet_reqsk(sk); 2242 bool req_stolen = false; 2243 struct sock *nsk; 2244 2245 sk = req->rsk_listener; 2246 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) 2247 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2248 else 2249 drop_reason = tcp_inbound_hash(sk, req, skb, 2250 &iph->saddr, &iph->daddr, 2251 AF_INET, dif, sdif); 2252 if (unlikely(drop_reason)) { 2253 sk_drops_add(sk, skb); 2254 reqsk_put(req); 2255 goto discard_it; 2256 } 2257 if (tcp_checksum_complete(skb)) { 2258 reqsk_put(req); 2259 goto csum_error; 2260 } 2261 if (unlikely(sk->sk_state != TCP_LISTEN)) { 2262 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb); 2263 if (!nsk) { 2264 inet_csk_reqsk_queue_drop_and_put(sk, req); 2265 goto lookup; 2266 } 2267 sk = nsk; 2268 /* reuseport_migrate_sock() has already held one sk_refcnt 2269 * before returning. 2270 */ 2271 } else { 2272 /* We own a reference on the listener, increase it again 2273 * as we might lose it too soon. 2274 */ 2275 sock_hold(sk); 2276 } 2277 refcounted = true; 2278 nsk = NULL; 2279 if (!tcp_filter(sk, skb)) { 2280 th = (const struct tcphdr *)skb->data; 2281 iph = ip_hdr(skb); 2282 tcp_v4_fill_cb(skb, iph, th); 2283 nsk = tcp_check_req(sk, skb, req, false, &req_stolen, 2284 &drop_reason); 2285 } else { 2286 drop_reason = SKB_DROP_REASON_SOCKET_FILTER; 2287 } 2288 if (!nsk) { 2289 reqsk_put(req); 2290 if (req_stolen) { 2291 /* Another cpu got exclusive access to req 2292 * and created a full blown socket. 2293 * Try to feed this packet to this socket 2294 * instead of discarding it. 
2295 */ 2296 tcp_v4_restore_cb(skb); 2297 sock_put(sk); 2298 goto lookup; 2299 } 2300 goto discard_and_relse; 2301 } 2302 nf_reset_ct(skb); 2303 if (nsk == sk) { 2304 reqsk_put(req); 2305 tcp_v4_restore_cb(skb); 2306 } else { 2307 drop_reason = tcp_child_process(sk, nsk, skb); 2308 if (drop_reason) { 2309 enum sk_rst_reason rst_reason; 2310 2311 rst_reason = sk_rst_convert_drop_reason(drop_reason); 2312 tcp_v4_send_reset(nsk, skb, rst_reason); 2313 goto discard_and_relse; 2314 } 2315 sock_put(sk); 2316 return 0; 2317 } 2318 } 2319 2320 process: 2321 if (static_branch_unlikely(&ip4_min_ttl)) { 2322 /* min_ttl can be changed concurrently from do_ip_setsockopt() */ 2323 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) { 2324 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 2325 drop_reason = SKB_DROP_REASON_TCP_MINTTL; 2326 goto discard_and_relse; 2327 } 2328 } 2329 2330 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { 2331 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2332 goto discard_and_relse; 2333 } 2334 2335 drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr, 2336 AF_INET, dif, sdif); 2337 if (drop_reason) 2338 goto discard_and_relse; 2339 2340 nf_reset_ct(skb); 2341 2342 if (tcp_filter(sk, skb)) { 2343 drop_reason = SKB_DROP_REASON_SOCKET_FILTER; 2344 goto discard_and_relse; 2345 } 2346 th = (const struct tcphdr *)skb->data; 2347 iph = ip_hdr(skb); 2348 tcp_v4_fill_cb(skb, iph, th); 2349 2350 skb->dev = NULL; 2351 2352 if (sk->sk_state == TCP_LISTEN) { 2353 ret = tcp_v4_do_rcv(sk, skb); 2354 goto put_and_return; 2355 } 2356 2357 sk_incoming_cpu_update(sk); 2358 2359 bh_lock_sock_nested(sk); 2360 tcp_segs_in(tcp_sk(sk), skb); 2361 ret = 0; 2362 if (!sock_owned_by_user(sk)) { 2363 ret = tcp_v4_do_rcv(sk, skb); 2364 } else { 2365 if (tcp_add_backlog(sk, skb, &drop_reason)) 2366 goto discard_and_relse; 2367 } 2368 bh_unlock_sock(sk); 2369 2370 put_and_return: 2371 if (refcounted) 2372 sock_put(sk); 2373 2374 return ret; 2375 2376 no_tcp_socket: 2377 drop_reason = SKB_DROP_REASON_NO_SOCKET; 2378 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) 2379 goto discard_it; 2380 2381 tcp_v4_fill_cb(skb, iph, th); 2382 2383 if (tcp_checksum_complete(skb)) { 2384 csum_error: 2385 drop_reason = SKB_DROP_REASON_TCP_CSUM; 2386 trace_tcp_bad_csum(skb); 2387 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); 2388 bad_packet: 2389 __TCP_INC_STATS(net, TCP_MIB_INERRS); 2390 } else { 2391 tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason)); 2392 } 2393 2394 discard_it: 2395 SKB_DR_OR(drop_reason, NOT_SPECIFIED); 2396 /* Discard frame. 
*/ 2397 sk_skb_reason_drop(sk, skb, drop_reason); 2398 return 0; 2399 2400 discard_and_relse: 2401 sk_drops_add(sk, skb); 2402 if (refcounted) 2403 sock_put(sk); 2404 goto discard_it; 2405 2406 do_time_wait: 2407 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { 2408 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2409 inet_twsk_put(inet_twsk(sk)); 2410 goto discard_it; 2411 } 2412 2413 tcp_v4_fill_cb(skb, iph, th); 2414 2415 if (tcp_checksum_complete(skb)) { 2416 inet_twsk_put(inet_twsk(sk)); 2417 goto csum_error; 2418 } 2419 2420 tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn, 2421 &drop_reason); 2422 switch (tw_status) { 2423 case TCP_TW_SYN: { 2424 struct sock *sk2 = inet_lookup_listener(net, 2425 net->ipv4.tcp_death_row.hashinfo, 2426 skb, __tcp_hdrlen(th), 2427 iph->saddr, th->source, 2428 iph->daddr, th->dest, 2429 inet_iif(skb), 2430 sdif); 2431 if (sk2) { 2432 inet_twsk_deschedule_put(inet_twsk(sk)); 2433 sk = sk2; 2434 tcp_v4_restore_cb(skb); 2435 refcounted = false; 2436 __this_cpu_write(tcp_tw_isn, isn); 2437 goto process; 2438 } 2439 } 2440 /* to ACK */ 2441 fallthrough; 2442 case TCP_TW_ACK: 2443 case TCP_TW_ACK_OOW: 2444 tcp_v4_timewait_ack(sk, skb, tw_status); 2445 break; 2446 case TCP_TW_RST: 2447 tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET); 2448 inet_twsk_deschedule_put(inet_twsk(sk)); 2449 goto discard_it; 2450 case TCP_TW_SUCCESS:; 2451 } 2452 goto discard_it; 2453 } 2454 2455 static struct timewait_sock_ops tcp_timewait_sock_ops = { 2456 .twsk_obj_size = sizeof(struct tcp_timewait_sock), 2457 .twsk_destructor= tcp_twsk_destructor, 2458 }; 2459 2460 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) 2461 { 2462 struct dst_entry *dst = skb_dst(skb); 2463 2464 if (dst && dst_hold_safe(dst)) { 2465 rcu_assign_pointer(sk->sk_rx_dst, dst); 2466 sk->sk_rx_dst_ifindex = skb->skb_iif; 2467 } 2468 } 2469 EXPORT_IPV6_MOD(inet_sk_rx_dst_set); 2470 2471 const struct inet_connection_sock_af_ops ipv4_specific = { 2472 .queue_xmit = ip_queue_xmit, 2473 .send_check = tcp_v4_send_check, 2474 .rebuild_header = inet_sk_rebuild_header, 2475 .sk_rx_dst_set = inet_sk_rx_dst_set, 2476 .conn_request = tcp_v4_conn_request, 2477 .syn_recv_sock = tcp_v4_syn_recv_sock, 2478 .net_header_len = sizeof(struct iphdr), 2479 .setsockopt = ip_setsockopt, 2480 .getsockopt = ip_getsockopt, 2481 .mtu_reduced = tcp_v4_mtu_reduced, 2482 }; 2483 EXPORT_IPV6_MOD(ipv4_specific); 2484 2485 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) 2486 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { 2487 #ifdef CONFIG_TCP_MD5SIG 2488 .md5_lookup = tcp_v4_md5_lookup, 2489 .calc_md5_hash = tcp_v4_md5_hash_skb, 2490 .md5_parse = tcp_v4_parse_md5_keys, 2491 #endif 2492 #ifdef CONFIG_TCP_AO 2493 .ao_lookup = tcp_v4_ao_lookup, 2494 .calc_ao_hash = tcp_v4_ao_hash_skb, 2495 .ao_parse = tcp_v4_parse_ao, 2496 .ao_calc_key_sk = tcp_v4_ao_calc_key_sk, 2497 #endif 2498 }; 2499 #endif 2500 2501 /* NOTE: A lot of things set to zero explicitly by call to 2502 * sk_alloc() so need not be done here. 
2503 */ 2504 static int tcp_v4_init_sock(struct sock *sk) 2505 { 2506 struct inet_connection_sock *icsk = inet_csk(sk); 2507 2508 tcp_init_sock(sk); 2509 2510 icsk->icsk_af_ops = &ipv4_specific; 2511 2512 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) 2513 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific; 2514 #endif 2515 2516 return 0; 2517 } 2518 2519 #ifdef CONFIG_TCP_MD5SIG 2520 static void tcp_md5sig_info_free_rcu(struct rcu_head *head) 2521 { 2522 struct tcp_md5sig_info *md5sig; 2523 2524 md5sig = container_of(head, struct tcp_md5sig_info, rcu); 2525 kfree(md5sig); 2526 static_branch_slow_dec_deferred(&tcp_md5_needed); 2527 tcp_md5_release_sigpool(); 2528 } 2529 #endif 2530 2531 static void tcp_release_user_frags(struct sock *sk) 2532 { 2533 #ifdef CONFIG_PAGE_POOL 2534 unsigned long index; 2535 void *netmem; 2536 2537 xa_for_each(&sk->sk_user_frags, index, netmem) 2538 WARN_ON_ONCE(!napi_pp_put_page((__force netmem_ref)netmem)); 2539 #endif 2540 } 2541 2542 void tcp_v4_destroy_sock(struct sock *sk) 2543 { 2544 struct tcp_sock *tp = tcp_sk(sk); 2545 2546 tcp_release_user_frags(sk); 2547 2548 xa_destroy(&sk->sk_user_frags); 2549 2550 trace_tcp_destroy_sock(sk); 2551 2552 tcp_clear_xmit_timers(sk); 2553 2554 tcp_cleanup_congestion_control(sk); 2555 2556 tcp_cleanup_ulp(sk); 2557 2558 /* Cleanup up the write buffer. */ 2559 tcp_write_queue_purge(sk); 2560 2561 /* Check if we want to disable active TFO */ 2562 tcp_fastopen_active_disable_ofo_check(sk); 2563 2564 /* Cleans up our, hopefully empty, out_of_order_queue. */ 2565 skb_rbtree_purge(&tp->out_of_order_queue); 2566 2567 #ifdef CONFIG_TCP_MD5SIG 2568 /* Clean up the MD5 key list, if any */ 2569 if (tp->md5sig_info) { 2570 struct tcp_md5sig_info *md5sig; 2571 2572 md5sig = rcu_dereference_protected(tp->md5sig_info, 1); 2573 tcp_clear_md5_list(sk); 2574 call_rcu(&md5sig->rcu, tcp_md5sig_info_free_rcu); 2575 rcu_assign_pointer(tp->md5sig_info, NULL); 2576 } 2577 #endif 2578 tcp_ao_destroy_sock(sk, false); 2579 2580 /* Clean up a referenced TCP bind bucket. */ 2581 if (inet_csk(sk)->icsk_bind_hash) 2582 inet_put_port(sk); 2583 2584 BUG_ON(rcu_access_pointer(tp->fastopen_rsk)); 2585 2586 /* If socket is aborted during connect operation */ 2587 tcp_free_fastopen_req(tp); 2588 tcp_fastopen_destroy_cipher(sk); 2589 tcp_saved_syn_free(tp); 2590 2591 sk_sockets_allocated_dec(sk); 2592 } 2593 EXPORT_IPV6_MOD(tcp_v4_destroy_sock); 2594 2595 #ifdef CONFIG_PROC_FS 2596 /* Proc filesystem TCP sock list dumping. */ 2597 2598 static unsigned short seq_file_family(const struct seq_file *seq); 2599 2600 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk) 2601 { 2602 unsigned short family = seq_file_family(seq); 2603 2604 /* AF_UNSPEC is used as a match all */ 2605 return ((family == AF_UNSPEC || family == sk->sk_family) && 2606 net_eq(sock_net(sk), seq_file_net(seq))); 2607 } 2608 2609 /* Find a non empty bucket (starting from st->bucket) 2610 * and return the first sk from it. 
2611 */ 2612 static void *listening_get_first(struct seq_file *seq) 2613 { 2614 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2615 struct tcp_iter_state *st = seq->private; 2616 2617 st->offset = 0; 2618 for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) { 2619 struct inet_listen_hashbucket *ilb2; 2620 struct hlist_nulls_node *node; 2621 struct sock *sk; 2622 2623 ilb2 = &hinfo->lhash2[st->bucket]; 2624 if (hlist_nulls_empty(&ilb2->nulls_head)) 2625 continue; 2626 2627 spin_lock(&ilb2->lock); 2628 sk_nulls_for_each(sk, node, &ilb2->nulls_head) { 2629 if (seq_sk_match(seq, sk)) 2630 return sk; 2631 } 2632 spin_unlock(&ilb2->lock); 2633 } 2634 2635 return NULL; 2636 } 2637 2638 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket). 2639 * If "cur" is the last one in the st->bucket, 2640 * call listening_get_first() to return the first sk of the next 2641 * non empty bucket. 2642 */ 2643 static void *listening_get_next(struct seq_file *seq, void *cur) 2644 { 2645 struct tcp_iter_state *st = seq->private; 2646 struct inet_listen_hashbucket *ilb2; 2647 struct hlist_nulls_node *node; 2648 struct inet_hashinfo *hinfo; 2649 struct sock *sk = cur; 2650 2651 ++st->num; 2652 ++st->offset; 2653 2654 sk = sk_nulls_next(sk); 2655 sk_nulls_for_each_from(sk, node) { 2656 if (seq_sk_match(seq, sk)) 2657 return sk; 2658 } 2659 2660 hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2661 ilb2 = &hinfo->lhash2[st->bucket]; 2662 spin_unlock(&ilb2->lock); 2663 ++st->bucket; 2664 return listening_get_first(seq); 2665 } 2666 2667 static void *listening_get_idx(struct seq_file *seq, loff_t *pos) 2668 { 2669 struct tcp_iter_state *st = seq->private; 2670 void *rc; 2671 2672 st->bucket = 0; 2673 st->offset = 0; 2674 rc = listening_get_first(seq); 2675 2676 while (rc && *pos) { 2677 rc = listening_get_next(seq, rc); 2678 --*pos; 2679 } 2680 return rc; 2681 } 2682 2683 static inline bool empty_bucket(struct inet_hashinfo *hinfo, 2684 const struct tcp_iter_state *st) 2685 { 2686 return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain); 2687 } 2688 2689 /* 2690 * Get first established socket starting from bucket given in st->bucket. 2691 * If st->bucket is zero, the very first socket in the hash is returned. 
2692 */ 2693 static void *established_get_first(struct seq_file *seq) 2694 { 2695 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2696 struct tcp_iter_state *st = seq->private; 2697 2698 st->offset = 0; 2699 for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) { 2700 struct sock *sk; 2701 struct hlist_nulls_node *node; 2702 spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket); 2703 2704 cond_resched(); 2705 2706 /* Lockless fast path for the common case of empty buckets */ 2707 if (empty_bucket(hinfo, st)) 2708 continue; 2709 2710 spin_lock_bh(lock); 2711 sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) { 2712 if (seq_sk_match(seq, sk)) 2713 return sk; 2714 } 2715 spin_unlock_bh(lock); 2716 } 2717 2718 return NULL; 2719 } 2720 2721 static void *established_get_next(struct seq_file *seq, void *cur) 2722 { 2723 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2724 struct tcp_iter_state *st = seq->private; 2725 struct hlist_nulls_node *node; 2726 struct sock *sk = cur; 2727 2728 ++st->num; 2729 ++st->offset; 2730 2731 sk = sk_nulls_next(sk); 2732 2733 sk_nulls_for_each_from(sk, node) { 2734 if (seq_sk_match(seq, sk)) 2735 return sk; 2736 } 2737 2738 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 2739 ++st->bucket; 2740 return established_get_first(seq); 2741 } 2742 2743 static void *established_get_idx(struct seq_file *seq, loff_t pos) 2744 { 2745 struct tcp_iter_state *st = seq->private; 2746 void *rc; 2747 2748 st->bucket = 0; 2749 rc = established_get_first(seq); 2750 2751 while (rc && pos) { 2752 rc = established_get_next(seq, rc); 2753 --pos; 2754 } 2755 return rc; 2756 } 2757 2758 static void *tcp_get_idx(struct seq_file *seq, loff_t pos) 2759 { 2760 void *rc; 2761 struct tcp_iter_state *st = seq->private; 2762 2763 st->state = TCP_SEQ_STATE_LISTENING; 2764 rc = listening_get_idx(seq, &pos); 2765 2766 if (!rc) { 2767 st->state = TCP_SEQ_STATE_ESTABLISHED; 2768 rc = established_get_idx(seq, pos); 2769 } 2770 2771 return rc; 2772 } 2773 2774 static void *tcp_seek_last_pos(struct seq_file *seq) 2775 { 2776 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2777 struct tcp_iter_state *st = seq->private; 2778 int bucket = st->bucket; 2779 int offset = st->offset; 2780 int orig_num = st->num; 2781 void *rc = NULL; 2782 2783 switch (st->state) { 2784 case TCP_SEQ_STATE_LISTENING: 2785 if (st->bucket > hinfo->lhash2_mask) 2786 break; 2787 rc = listening_get_first(seq); 2788 while (offset-- && rc && bucket == st->bucket) 2789 rc = listening_get_next(seq, rc); 2790 if (rc) 2791 break; 2792 st->bucket = 0; 2793 st->state = TCP_SEQ_STATE_ESTABLISHED; 2794 fallthrough; 2795 case TCP_SEQ_STATE_ESTABLISHED: 2796 if (st->bucket > hinfo->ehash_mask) 2797 break; 2798 rc = established_get_first(seq); 2799 while (offset-- && rc && bucket == st->bucket) 2800 rc = established_get_next(seq, rc); 2801 } 2802 2803 st->num = orig_num; 2804 2805 return rc; 2806 } 2807 2808 void *tcp_seq_start(struct seq_file *seq, loff_t *pos) 2809 { 2810 struct tcp_iter_state *st = seq->private; 2811 void *rc; 2812 2813 if (*pos && *pos == st->last_pos) { 2814 rc = tcp_seek_last_pos(seq); 2815 if (rc) 2816 goto out; 2817 } 2818 2819 st->state = TCP_SEQ_STATE_LISTENING; 2820 st->num = 0; 2821 st->bucket = 0; 2822 st->offset = 0; 2823 rc = *pos ? 
tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; 2824 2825 out: 2826 st->last_pos = *pos; 2827 return rc; 2828 } 2829 EXPORT_IPV6_MOD(tcp_seq_start); 2830 2831 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2832 { 2833 struct tcp_iter_state *st = seq->private; 2834 void *rc = NULL; 2835 2836 if (v == SEQ_START_TOKEN) { 2837 rc = tcp_get_idx(seq, 0); 2838 goto out; 2839 } 2840 2841 switch (st->state) { 2842 case TCP_SEQ_STATE_LISTENING: 2843 rc = listening_get_next(seq, v); 2844 if (!rc) { 2845 st->state = TCP_SEQ_STATE_ESTABLISHED; 2846 st->bucket = 0; 2847 st->offset = 0; 2848 rc = established_get_first(seq); 2849 } 2850 break; 2851 case TCP_SEQ_STATE_ESTABLISHED: 2852 rc = established_get_next(seq, v); 2853 break; 2854 } 2855 out: 2856 ++*pos; 2857 st->last_pos = *pos; 2858 return rc; 2859 } 2860 EXPORT_IPV6_MOD(tcp_seq_next); 2861 2862 void tcp_seq_stop(struct seq_file *seq, void *v) 2863 { 2864 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2865 struct tcp_iter_state *st = seq->private; 2866 2867 switch (st->state) { 2868 case TCP_SEQ_STATE_LISTENING: 2869 if (v != SEQ_START_TOKEN) 2870 spin_unlock(&hinfo->lhash2[st->bucket].lock); 2871 break; 2872 case TCP_SEQ_STATE_ESTABLISHED: 2873 if (v) 2874 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 2875 break; 2876 } 2877 } 2878 EXPORT_IPV6_MOD(tcp_seq_stop); 2879 2880 static void get_openreq4(const struct request_sock *req, 2881 struct seq_file *f, int i) 2882 { 2883 const struct inet_request_sock *ireq = inet_rsk(req); 2884 long delta = req->rsk_timer.expires - jiffies; 2885 2886 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2887 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK", 2888 i, 2889 ireq->ir_loc_addr, 2890 ireq->ir_num, 2891 ireq->ir_rmt_addr, 2892 ntohs(ireq->ir_rmt_port), 2893 TCP_SYN_RECV, 2894 0, 0, /* could print option size, but that is af dependent. 
*/ 2895 1, /* timers active (only the expire timer) */ 2896 jiffies_delta_to_clock_t(delta), 2897 req->num_timeout, 2898 from_kuid_munged(seq_user_ns(f), 2899 sk_uid(req->rsk_listener)), 2900 0, /* non standard timer */ 2901 0, /* open_requests have no inode */ 2902 0, 2903 req); 2904 } 2905 2906 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) 2907 { 2908 int timer_active; 2909 unsigned long timer_expires; 2910 const struct tcp_sock *tp = tcp_sk(sk); 2911 const struct inet_connection_sock *icsk = inet_csk(sk); 2912 const struct inet_sock *inet = inet_sk(sk); 2913 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; 2914 __be32 dest = inet->inet_daddr; 2915 __be32 src = inet->inet_rcv_saddr; 2916 __u16 destp = ntohs(inet->inet_dport); 2917 __u16 srcp = ntohs(inet->inet_sport); 2918 u8 icsk_pending; 2919 int rx_queue; 2920 int state; 2921 2922 icsk_pending = smp_load_acquire(&icsk->icsk_pending); 2923 if (icsk_pending == ICSK_TIME_RETRANS || 2924 icsk_pending == ICSK_TIME_REO_TIMEOUT || 2925 icsk_pending == ICSK_TIME_LOSS_PROBE) { 2926 timer_active = 1; 2927 timer_expires = icsk_timeout(icsk); 2928 } else if (icsk_pending == ICSK_TIME_PROBE0) { 2929 timer_active = 4; 2930 timer_expires = icsk_timeout(icsk); 2931 } else if (timer_pending(&sk->sk_timer)) { 2932 timer_active = 2; 2933 timer_expires = sk->sk_timer.expires; 2934 } else { 2935 timer_active = 0; 2936 timer_expires = jiffies; 2937 } 2938 2939 state = inet_sk_state_load(sk); 2940 if (state == TCP_LISTEN) 2941 rx_queue = READ_ONCE(sk->sk_ack_backlog); 2942 else 2943 /* Because we don't lock the socket, 2944 * we might find a transient negative value. 2945 */ 2946 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) - 2947 READ_ONCE(tp->copied_seq), 0); 2948 2949 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " 2950 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d", 2951 i, src, srcp, dest, destp, state, 2952 READ_ONCE(tp->write_seq) - tp->snd_una, 2953 rx_queue, 2954 timer_active, 2955 jiffies_delta_to_clock_t(timer_expires - jiffies), 2956 icsk->icsk_retransmits, 2957 from_kuid_munged(seq_user_ns(f), sk_uid(sk)), 2958 icsk->icsk_probes_out, 2959 sock_i_ino(sk), 2960 refcount_read(&sk->sk_refcnt), sk, 2961 jiffies_to_clock_t(icsk->icsk_rto), 2962 jiffies_to_clock_t(icsk->icsk_ack.ato), 2963 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk), 2964 tcp_snd_cwnd(tp), 2965 state == TCP_LISTEN ? 2966 fastopenq->max_qlen : 2967 (tcp_in_initial_slowstart(tp) ? 
-1 : tp->snd_ssthresh)); 2968 } 2969 2970 static void get_timewait4_sock(const struct inet_timewait_sock *tw, 2971 struct seq_file *f, int i) 2972 { 2973 long delta = tw->tw_timer.expires - jiffies; 2974 __be32 dest, src; 2975 __u16 destp, srcp; 2976 2977 dest = tw->tw_daddr; 2978 src = tw->tw_rcv_saddr; 2979 destp = ntohs(tw->tw_dport); 2980 srcp = ntohs(tw->tw_sport); 2981 2982 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2983 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK", 2984 i, src, srcp, dest, destp, READ_ONCE(tw->tw_substate), 0, 0, 2985 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, 2986 refcount_read(&tw->tw_refcnt), tw); 2987 } 2988 2989 #define TMPSZ 150 2990 2991 static int tcp4_seq_show(struct seq_file *seq, void *v) 2992 { 2993 struct tcp_iter_state *st; 2994 struct sock *sk = v; 2995 2996 seq_setwidth(seq, TMPSZ - 1); 2997 if (v == SEQ_START_TOKEN) { 2998 seq_puts(seq, " sl local_address rem_address st tx_queue " 2999 "rx_queue tr tm->when retrnsmt uid timeout " 3000 "inode"); 3001 goto out; 3002 } 3003 st = seq->private; 3004 3005 if (sk->sk_state == TCP_TIME_WAIT) 3006 get_timewait4_sock(v, seq, st->num); 3007 else if (sk->sk_state == TCP_NEW_SYN_RECV) 3008 get_openreq4(v, seq, st->num); 3009 else 3010 get_tcp4_sock(v, seq, st->num); 3011 out: 3012 seq_pad(seq, '\n'); 3013 return 0; 3014 } 3015 3016 #ifdef CONFIG_BPF_SYSCALL 3017 struct bpf_tcp_iter_state { 3018 struct tcp_iter_state state; 3019 unsigned int cur_sk; 3020 unsigned int end_sk; 3021 unsigned int max_sk; 3022 struct sock **batch; 3023 bool st_bucket_done; 3024 }; 3025 3026 struct bpf_iter__tcp { 3027 __bpf_md_ptr(struct bpf_iter_meta *, meta); 3028 __bpf_md_ptr(struct sock_common *, sk_common); 3029 uid_t uid __aligned(8); 3030 }; 3031 3032 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, 3033 struct sock_common *sk_common, uid_t uid) 3034 { 3035 struct bpf_iter__tcp ctx; 3036 3037 meta->seq_num--; /* skip SEQ_START_TOKEN */ 3038 ctx.meta = meta; 3039 ctx.sk_common = sk_common; 3040 ctx.uid = uid; 3041 return bpf_iter_run_prog(prog, &ctx); 3042 } 3043 3044 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter) 3045 { 3046 while (iter->cur_sk < iter->end_sk) 3047 sock_gen_put(iter->batch[iter->cur_sk++]); 3048 } 3049 3050 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter, 3051 unsigned int new_batch_sz) 3052 { 3053 struct sock **new_batch; 3054 3055 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, 3056 GFP_USER | __GFP_NOWARN); 3057 if (!new_batch) 3058 return -ENOMEM; 3059 3060 bpf_iter_tcp_put_batch(iter); 3061 kvfree(iter->batch); 3062 iter->batch = new_batch; 3063 iter->max_sk = new_batch_sz; 3064 3065 return 0; 3066 } 3067 3068 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq, 3069 struct sock *start_sk) 3070 { 3071 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 3072 struct bpf_tcp_iter_state *iter = seq->private; 3073 struct tcp_iter_state *st = &iter->state; 3074 struct hlist_nulls_node *node; 3075 unsigned int expected = 1; 3076 struct sock *sk; 3077 3078 sock_hold(start_sk); 3079 iter->batch[iter->end_sk++] = start_sk; 3080 3081 sk = sk_nulls_next(start_sk); 3082 sk_nulls_for_each_from(sk, node) { 3083 if (seq_sk_match(seq, sk)) { 3084 if (iter->end_sk < iter->max_sk) { 3085 sock_hold(sk); 3086 iter->batch[iter->end_sk++] = sk; 3087 } 3088 expected++; 3089 } 3090 } 3091 spin_unlock(&hinfo->lhash2[st->bucket].lock); 3092 3093 return expected; 3094 } 3095 3096 static unsigned int 
bpf_iter_tcp_established_batch(struct seq_file *seq, 3097 struct sock *start_sk) 3098 { 3099 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 3100 struct bpf_tcp_iter_state *iter = seq->private; 3101 struct tcp_iter_state *st = &iter->state; 3102 struct hlist_nulls_node *node; 3103 unsigned int expected = 1; 3104 struct sock *sk; 3105 3106 sock_hold(start_sk); 3107 iter->batch[iter->end_sk++] = start_sk; 3108 3109 sk = sk_nulls_next(start_sk); 3110 sk_nulls_for_each_from(sk, node) { 3111 if (seq_sk_match(seq, sk)) { 3112 if (iter->end_sk < iter->max_sk) { 3113 sock_hold(sk); 3114 iter->batch[iter->end_sk++] = sk; 3115 } 3116 expected++; 3117 } 3118 } 3119 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 3120 3121 return expected; 3122 } 3123 3124 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq) 3125 { 3126 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 3127 struct bpf_tcp_iter_state *iter = seq->private; 3128 struct tcp_iter_state *st = &iter->state; 3129 unsigned int expected; 3130 bool resized = false; 3131 struct sock *sk; 3132 3133 /* The st->bucket is done. Directly advance to the next 3134 * bucket instead of having tcp_seek_last_pos() skip sockets 3135 * one by one in the current bucket and eventually find out 3136 * it has to advance to the next bucket. 3137 */ 3138 if (iter->st_bucket_done) { 3139 st->offset = 0; 3140 st->bucket++; 3141 if (st->state == TCP_SEQ_STATE_LISTENING && 3142 st->bucket > hinfo->lhash2_mask) { 3143 st->state = TCP_SEQ_STATE_ESTABLISHED; 3144 st->bucket = 0; 3145 } 3146 } 3147 3148 again: 3149 /* Get a new batch */ 3150 iter->cur_sk = 0; 3151 iter->end_sk = 0; 3152 iter->st_bucket_done = false; 3153 3154 sk = tcp_seek_last_pos(seq); 3155 if (!sk) 3156 return NULL; /* Done */ 3157 3158 if (st->state == TCP_SEQ_STATE_LISTENING) 3159 expected = bpf_iter_tcp_listening_batch(seq, sk); 3160 else 3161 expected = bpf_iter_tcp_established_batch(seq, sk); 3162 3163 if (iter->end_sk == expected) { 3164 iter->st_bucket_done = true; 3165 return sk; 3166 } 3167 3168 if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) { 3169 resized = true; 3170 goto again; 3171 } 3172 3173 return sk; 3174 } 3175 3176 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos) 3177 { 3178 /* bpf iter does not support lseek, so it always 3179 * continues from where it was stop()-ped. 3180 */ 3181 if (*pos) 3182 return bpf_iter_tcp_batch(seq); 3183 3184 return SEQ_START_TOKEN; 3185 } 3186 3187 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3188 { 3189 struct bpf_tcp_iter_state *iter = seq->private; 3190 struct tcp_iter_state *st = &iter->state; 3191 struct sock *sk; 3192 3193 /* Whenever seq_next() is called, the iter->cur_sk is 3194 * done with seq_show(), so advance to the next sk in 3195 * the batch. 3196 */ 3197 if (iter->cur_sk < iter->end_sk) { 3198 /* Keeping st->num consistent in tcp_iter_state. 3199 * bpf_iter_tcp does not use st->num. 3200 * meta.seq_num is used instead. 3201 */ 3202 st->num++; 3203 /* Move st->offset to the next sk in the bucket such that 3204 * the future start() will resume at st->offset in 3205 * st->bucket. See tcp_seek_last_pos(). 3206 */ 3207 st->offset++; 3208 sock_gen_put(iter->batch[iter->cur_sk++]); 3209 } 3210 3211 if (iter->cur_sk < iter->end_sk) 3212 sk = iter->batch[iter->cur_sk]; 3213 else 3214 sk = bpf_iter_tcp_batch(seq); 3215 3216 ++*pos; 3217 /* Keeping st->last_pos consistent in tcp_iter_state. 
3218 * bpf iter does not do lseek, so st->last_pos always equals *pos. 3219 */ 3220 st->last_pos = *pos; 3221 return sk; 3222 } 3223 3224 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v) 3225 { 3226 struct bpf_iter_meta meta; 3227 struct bpf_prog *prog; 3228 struct sock *sk = v; 3229 uid_t uid; 3230 int ret; 3231 3232 if (v == SEQ_START_TOKEN) 3233 return 0; 3234 3235 if (sk_fullsock(sk)) 3236 lock_sock(sk); 3237 3238 if (unlikely(sk_unhashed(sk))) { 3239 ret = SEQ_SKIP; 3240 goto unlock; 3241 } 3242 3243 if (sk->sk_state == TCP_TIME_WAIT) { 3244 uid = 0; 3245 } else if (sk->sk_state == TCP_NEW_SYN_RECV) { 3246 const struct request_sock *req = v; 3247 3248 uid = from_kuid_munged(seq_user_ns(seq), 3249 sk_uid(req->rsk_listener)); 3250 } else { 3251 uid = from_kuid_munged(seq_user_ns(seq), sk_uid(sk)); 3252 } 3253 3254 meta.seq = seq; 3255 prog = bpf_iter_get_info(&meta, false); 3256 ret = tcp_prog_seq_show(prog, &meta, v, uid); 3257 3258 unlock: 3259 if (sk_fullsock(sk)) 3260 release_sock(sk); 3261 return ret; 3262 3263 } 3264 3265 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v) 3266 { 3267 struct bpf_tcp_iter_state *iter = seq->private; 3268 struct bpf_iter_meta meta; 3269 struct bpf_prog *prog; 3270 3271 if (!v) { 3272 meta.seq = seq; 3273 prog = bpf_iter_get_info(&meta, true); 3274 if (prog) 3275 (void)tcp_prog_seq_show(prog, &meta, v, 0); 3276 } 3277 3278 if (iter->cur_sk < iter->end_sk) { 3279 bpf_iter_tcp_put_batch(iter); 3280 iter->st_bucket_done = false; 3281 } 3282 } 3283 3284 static const struct seq_operations bpf_iter_tcp_seq_ops = { 3285 .show = bpf_iter_tcp_seq_show, 3286 .start = bpf_iter_tcp_seq_start, 3287 .next = bpf_iter_tcp_seq_next, 3288 .stop = bpf_iter_tcp_seq_stop, 3289 }; 3290 #endif 3291 static unsigned short seq_file_family(const struct seq_file *seq) 3292 { 3293 const struct tcp_seq_afinfo *afinfo; 3294 3295 #ifdef CONFIG_BPF_SYSCALL 3296 /* Iterated from bpf_iter. Let the bpf prog filter instead. */ 3297 if (seq->op == &bpf_iter_tcp_seq_ops) 3298 return AF_UNSPEC; 3299 #endif 3300 3301 /* Iterated from proc fs */ 3302 afinfo = pde_data(file_inode(seq->file)); 3303 return afinfo->family; 3304 } 3305 3306 static const struct seq_operations tcp4_seq_ops = { 3307 .show = tcp4_seq_show, 3308 .start = tcp_seq_start, 3309 .next = tcp_seq_next, 3310 .stop = tcp_seq_stop, 3311 }; 3312 3313 static struct tcp_seq_afinfo tcp4_seq_afinfo = { 3314 .family = AF_INET, 3315 }; 3316 3317 static int __net_init tcp4_proc_init_net(struct net *net) 3318 { 3319 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops, 3320 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo)) 3321 return -ENOMEM; 3322 return 0; 3323 } 3324 3325 static void __net_exit tcp4_proc_exit_net(struct net *net) 3326 { 3327 remove_proc_entry("tcp", net->proc_net); 3328 } 3329 3330 static struct pernet_operations tcp4_net_ops = { 3331 .init = tcp4_proc_init_net, 3332 .exit = tcp4_proc_exit_net, 3333 }; 3334 3335 int __init tcp4_proc_init(void) 3336 { 3337 return register_pernet_subsys(&tcp4_net_ops); 3338 } 3339 3340 void tcp4_proc_exit(void) 3341 { 3342 unregister_pernet_subsys(&tcp4_net_ops); 3343 } 3344 #endif /* CONFIG_PROC_FS */ 3345 3346 /* @wake is one when sk_stream_write_space() calls us. 3347 * This sends EPOLLOUT only if notsent_bytes is less than half the limit. 3348 * This mimics the strategy used in sock_def_write_space(). 
3349 */ 3350 bool tcp_stream_memory_free(const struct sock *sk, int wake) 3351 { 3352 const struct tcp_sock *tp = tcp_sk(sk); 3353 u32 notsent_bytes = READ_ONCE(tp->write_seq) - 3354 READ_ONCE(tp->snd_nxt); 3355 3356 return (notsent_bytes << wake) < tcp_notsent_lowat(tp); 3357 } 3358 EXPORT_SYMBOL(tcp_stream_memory_free); 3359 3360 struct proto tcp_prot = { 3361 .name = "TCP", 3362 .owner = THIS_MODULE, 3363 .close = tcp_close, 3364 .pre_connect = tcp_v4_pre_connect, 3365 .connect = tcp_v4_connect, 3366 .disconnect = tcp_disconnect, 3367 .accept = inet_csk_accept, 3368 .ioctl = tcp_ioctl, 3369 .init = tcp_v4_init_sock, 3370 .destroy = tcp_v4_destroy_sock, 3371 .shutdown = tcp_shutdown, 3372 .setsockopt = tcp_setsockopt, 3373 .getsockopt = tcp_getsockopt, 3374 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt, 3375 .keepalive = tcp_set_keepalive, 3376 .recvmsg = tcp_recvmsg, 3377 .sendmsg = tcp_sendmsg, 3378 .splice_eof = tcp_splice_eof, 3379 .backlog_rcv = tcp_v4_do_rcv, 3380 .release_cb = tcp_release_cb, 3381 .hash = inet_hash, 3382 .unhash = inet_unhash, 3383 .get_port = inet_csk_get_port, 3384 .put_port = inet_put_port, 3385 #ifdef CONFIG_BPF_SYSCALL 3386 .psock_update_sk_prot = tcp_bpf_update_proto, 3387 #endif 3388 .enter_memory_pressure = tcp_enter_memory_pressure, 3389 .leave_memory_pressure = tcp_leave_memory_pressure, 3390 .stream_memory_free = tcp_stream_memory_free, 3391 .sockets_allocated = &tcp_sockets_allocated, 3392 .orphan_count = &tcp_orphan_count, 3393 3394 .memory_allocated = &net_aligned_data.tcp_memory_allocated, 3395 .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, 3396 3397 .memory_pressure = &tcp_memory_pressure, 3398 .sysctl_mem = sysctl_tcp_mem, 3399 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), 3400 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), 3401 .max_header = MAX_TCP_HEADER, 3402 .obj_size = sizeof(struct tcp_sock), 3403 .slab_flags = SLAB_TYPESAFE_BY_RCU, 3404 .twsk_prot = &tcp_timewait_sock_ops, 3405 .rsk_prot = &tcp_request_sock_ops, 3406 .h.hashinfo = NULL, 3407 .no_autobind = true, 3408 .diag_destroy = tcp_abort, 3409 }; 3410 EXPORT_SYMBOL(tcp_prot); 3411 3412 static void __net_exit tcp_sk_exit(struct net *net) 3413 { 3414 if (net->ipv4.tcp_congestion_control) 3415 bpf_module_put(net->ipv4.tcp_congestion_control, 3416 net->ipv4.tcp_congestion_control->owner); 3417 } 3418 3419 static void __net_init tcp_set_hashinfo(struct net *net) 3420 { 3421 struct inet_hashinfo *hinfo; 3422 unsigned int ehash_entries; 3423 struct net *old_net; 3424 3425 if (net_eq(net, &init_net)) 3426 goto fallback; 3427 3428 old_net = current->nsproxy->net_ns; 3429 ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries); 3430 if (!ehash_entries) 3431 goto fallback; 3432 3433 ehash_entries = roundup_pow_of_two(ehash_entries); 3434 hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries); 3435 if (!hinfo) { 3436 pr_warn("Failed to allocate TCP ehash (entries: %u) " 3437 "for a netns, fallback to the global one\n", 3438 ehash_entries); 3439 fallback: 3440 hinfo = &tcp_hashinfo; 3441 ehash_entries = tcp_hashinfo.ehash_mask + 1; 3442 } 3443 3444 net->ipv4.tcp_death_row.hashinfo = hinfo; 3445 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2; 3446 net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128); 3447 } 3448 3449 static int __net_init tcp_sk_init(struct net *net) 3450 { 3451 net->ipv4.sysctl_tcp_ecn = 2; 3452 net->ipv4.sysctl_tcp_ecn_fallback = 1; 3453 3454 net->ipv4.sysctl_tcp_base_mss = 
TCP_BASE_MSS; 3455 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS; 3456 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; 3457 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL; 3458 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS; 3459 3460 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME; 3461 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES; 3462 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL; 3463 3464 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES; 3465 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES; 3466 net->ipv4.sysctl_tcp_syncookies = 1; 3467 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH; 3468 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1; 3469 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2; 3470 net->ipv4.sysctl_tcp_orphan_retries = 0; 3471 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; 3472 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; 3473 net->ipv4.sysctl_tcp_tw_reuse = 2; 3474 net->ipv4.sysctl_tcp_tw_reuse_delay = 1 * MSEC_PER_SEC; 3475 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1; 3476 3477 refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1); 3478 tcp_set_hashinfo(net); 3479 3480 net->ipv4.sysctl_tcp_sack = 1; 3481 net->ipv4.sysctl_tcp_window_scaling = 1; 3482 net->ipv4.sysctl_tcp_timestamps = 1; 3483 net->ipv4.sysctl_tcp_early_retrans = 3; 3484 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION; 3485 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */ 3486 net->ipv4.sysctl_tcp_retrans_collapse = 1; 3487 net->ipv4.sysctl_tcp_max_reordering = 300; 3488 net->ipv4.sysctl_tcp_dsack = 1; 3489 net->ipv4.sysctl_tcp_app_win = 31; 3490 net->ipv4.sysctl_tcp_adv_win_scale = 1; 3491 net->ipv4.sysctl_tcp_frto = 2; 3492 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1; 3493 /* This limits the percentage of the congestion window which we 3494 * will allow a single TSO frame to consume. Building TSO frames 3495 * which are too large can cause TCP streams to be bursty. 3496 */ 3497 net->ipv4.sysctl_tcp_tso_win_divisor = 3; 3498 /* Default TSQ limit of 4 MB */ 3499 net->ipv4.sysctl_tcp_limit_output_bytes = 4 << 20; 3500 3501 /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. 
*/ 3502 net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX; 3503 3504 net->ipv4.sysctl_tcp_min_tso_segs = 2; 3505 net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */ 3506 net->ipv4.sysctl_tcp_min_rtt_wlen = 300; 3507 net->ipv4.sysctl_tcp_autocorking = 1; 3508 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; 3509 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; 3510 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120; 3511 if (net != &init_net) { 3512 memcpy(net->ipv4.sysctl_tcp_rmem, 3513 init_net.ipv4.sysctl_tcp_rmem, 3514 sizeof(init_net.ipv4.sysctl_tcp_rmem)); 3515 memcpy(net->ipv4.sysctl_tcp_wmem, 3516 init_net.ipv4.sysctl_tcp_wmem, 3517 sizeof(init_net.ipv4.sysctl_tcp_wmem)); 3518 } 3519 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC; 3520 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC; 3521 net->ipv4.sysctl_tcp_comp_sack_nr = 44; 3522 net->ipv4.sysctl_tcp_backlog_ack_defer = 1; 3523 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; 3524 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0; 3525 atomic_set(&net->ipv4.tfo_active_disable_times, 0); 3526 3527 /* Set default values for PLB */ 3528 net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */ 3529 net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3; 3530 net->ipv4.sysctl_tcp_plb_rehash_rounds = 12; 3531 net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60; 3532 /* Default congestion threshold for PLB to mark a round is 50% */ 3533 net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2; 3534 3535 /* Reno is always built in */ 3536 if (!net_eq(net, &init_net) && 3537 bpf_try_module_get(init_net.ipv4.tcp_congestion_control, 3538 init_net.ipv4.tcp_congestion_control->owner)) 3539 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control; 3540 else 3541 net->ipv4.tcp_congestion_control = &tcp_reno; 3542 3543 net->ipv4.sysctl_tcp_syn_linear_timeouts = 4; 3544 net->ipv4.sysctl_tcp_shrink_window = 0; 3545 3546 net->ipv4.sysctl_tcp_pingpong_thresh = 1; 3547 net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN); 3548 net->ipv4.sysctl_tcp_rto_max_ms = TCP_RTO_MAX_SEC * MSEC_PER_SEC; 3549 3550 return 0; 3551 } 3552 3553 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) 3554 { 3555 struct net *net; 3556 3557 /* Make sure concurrent calls to tcp_sk_exit_batch from net_cleanup_work 3558 * and the failed setup_net error unwinding path are serialized. 3559 * 3560 * tcp_twsk_purge() handles twsk in any dead netns, not just those in 3561 * net_exit_list, so the thread that dismantles a particular twsk must 3562 * do so without another thread progressing to refcount_dec_and_test() of 3563 * tcp_death_row.tw_refcount. 
3564 */ 3565 mutex_lock(&tcp_exit_batch_mutex); 3566 3567 tcp_twsk_purge(net_exit_list); 3568 3569 list_for_each_entry(net, net_exit_list, exit_list) { 3570 inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo); 3571 WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount)); 3572 tcp_fastopen_ctx_destroy(net); 3573 } 3574 3575 mutex_unlock(&tcp_exit_batch_mutex); 3576 } 3577 3578 static struct pernet_operations __net_initdata tcp_sk_ops = { 3579 .init = tcp_sk_init, 3580 .exit = tcp_sk_exit, 3581 .exit_batch = tcp_sk_exit_batch, 3582 }; 3583 3584 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3585 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta, 3586 struct sock_common *sk_common, uid_t uid) 3587 3588 #define INIT_BATCH_SZ 16 3589 3590 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux) 3591 { 3592 struct bpf_tcp_iter_state *iter = priv_data; 3593 int err; 3594 3595 err = bpf_iter_init_seq_net(priv_data, aux); 3596 if (err) 3597 return err; 3598 3599 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ); 3600 if (err) { 3601 bpf_iter_fini_seq_net(priv_data); 3602 return err; 3603 } 3604 3605 return 0; 3606 } 3607 3608 static void bpf_iter_fini_tcp(void *priv_data) 3609 { 3610 struct bpf_tcp_iter_state *iter = priv_data; 3611 3612 bpf_iter_fini_seq_net(priv_data); 3613 kvfree(iter->batch); 3614 } 3615 3616 static const struct bpf_iter_seq_info tcp_seq_info = { 3617 .seq_ops = &bpf_iter_tcp_seq_ops, 3618 .init_seq_private = bpf_iter_init_tcp, 3619 .fini_seq_private = bpf_iter_fini_tcp, 3620 .seq_priv_size = sizeof(struct bpf_tcp_iter_state), 3621 }; 3622 3623 static const struct bpf_func_proto * 3624 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id, 3625 const struct bpf_prog *prog) 3626 { 3627 switch (func_id) { 3628 case BPF_FUNC_setsockopt: 3629 return &bpf_sk_setsockopt_proto; 3630 case BPF_FUNC_getsockopt: 3631 return &bpf_sk_getsockopt_proto; 3632 default: 3633 return NULL; 3634 } 3635 } 3636 3637 static struct bpf_iter_reg tcp_reg_info = { 3638 .target = "tcp", 3639 .ctx_arg_info_size = 1, 3640 .ctx_arg_info = { 3641 { offsetof(struct bpf_iter__tcp, sk_common), 3642 PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED }, 3643 }, 3644 .get_func_proto = bpf_iter_tcp_get_func_proto, 3645 .seq_info = &tcp_seq_info, 3646 }; 3647 3648 static void __init bpf_iter_register(void) 3649 { 3650 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON]; 3651 if (bpf_iter_reg_target(&tcp_reg_info)) 3652 pr_warn("Warning: could not register bpf iterator tcp\n"); 3653 } 3654 3655 #endif 3656 3657 void __init tcp_v4_init(void) 3658 { 3659 int cpu, res; 3660 3661 for_each_possible_cpu(cpu) { 3662 struct sock *sk; 3663 3664 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW, 3665 IPPROTO_TCP, &init_net); 3666 if (res) 3667 panic("Failed to create the TCP control socket.\n"); 3668 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); 3669 3670 /* Please enforce IP_DF and IPID==0 for RST and 3671 * ACK sent in SYN-RECV and TIME-WAIT state. 3672 */ 3673 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO; 3674 3675 sk->sk_clockid = CLOCK_MONOTONIC; 3676 3677 per_cpu(ipv4_tcp_sk.sock, cpu) = sk; 3678 } 3679 if (register_pernet_subsys(&tcp_sk_ops)) 3680 panic("Failed to create the TCP control socket.\n"); 3681 3682 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3683 bpf_iter_register(); 3684 #endif 3685 } 3686