1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * INET An implementation of the TCP/IP protocol suite for the LINUX 4 * operating system. INET is implemented using the BSD Socket 5 * interface as the means of communication with the user level. 6 * 7 * Implementation of the Transmission Control Protocol(TCP). 8 * 9 * IPv4 specific functions 10 * 11 * code split from: 12 * linux/ipv4/tcp.c 13 * linux/ipv4/tcp_input.c 14 * linux/ipv4/tcp_output.c 15 * 16 * See tcp.c for author information 17 */ 18 19 /* 20 * Changes: 21 * David S. Miller : New socket lookup architecture. 22 * This code is dedicated to John Dyson. 23 * David S. Miller : Change semantics of established hash, 24 * half is devoted to TIME_WAIT sockets 25 * and the rest go in the other half. 26 * Andi Kleen : Add support for syncookies and fixed 27 * some bugs: ip options weren't passed to 28 * the TCP layer, missed a check for an 29 * ACK bit. 30 * Andi Kleen : Implemented fast path mtu discovery. 31 * Fixed many serious bugs in the 32 * request_sock handling and moved 33 * most of it into the af independent code. 34 * Added tail drop and some other bugfixes. 35 * Added new listen semantics. 36 * Mike McLagan : Routing by source 37 * Juan Jose Ciarlante: ip_dynaddr bits 38 * Andi Kleen: various fixes. 39 * Vitaly E. Lavrov : Transparent proxy revived after year 40 * coma. 41 * Andi Kleen : Fix new listen. 42 * Andi Kleen : Fix accept error reporting. 43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which 44 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind 45 * a single port at the same time. 46 */ 47 48 #define pr_fmt(fmt) "TCP: " fmt 49 50 #include <linux/bottom_half.h> 51 #include <linux/types.h> 52 #include <linux/fcntl.h> 53 #include <linux/module.h> 54 #include <linux/random.h> 55 #include <linux/cache.h> 56 #include <linux/jhash.h> 57 #include <linux/init.h> 58 #include <linux/times.h> 59 #include <linux/slab.h> 60 #include <linux/sched.h> 61 62 #include <net/net_namespace.h> 63 #include <net/icmp.h> 64 #include <net/inet_hashtables.h> 65 #include <net/tcp.h> 66 #include <net/transp_v6.h> 67 #include <net/ipv6.h> 68 #include <net/inet_common.h> 69 #include <net/timewait_sock.h> 70 #include <net/xfrm.h> 71 #include <net/secure_seq.h> 72 #include <net/busy_poll.h> 73 #include <net/rstreason.h> 74 75 #include <linux/inet.h> 76 #include <linux/ipv6.h> 77 #include <linux/stddef.h> 78 #include <linux/proc_fs.h> 79 #include <linux/seq_file.h> 80 #include <linux/inetdevice.h> 81 #include <linux/btf_ids.h> 82 #include <linux/skbuff_ref.h> 83 84 #include <crypto/hash.h> 85 #include <linux/scatterlist.h> 86 87 #include <trace/events/tcp.h> 88 89 #ifdef CONFIG_TCP_MD5SIG 90 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, 91 __be32 daddr, __be32 saddr, const struct tcphdr *th); 92 #endif 93 94 struct inet_hashinfo tcp_hashinfo; 95 EXPORT_SYMBOL(tcp_hashinfo); 96 97 static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = { 98 .bh_lock = INIT_LOCAL_LOCK(bh_lock), 99 }; 100 101 static DEFINE_MUTEX(tcp_exit_batch_mutex); 102 103 static u32 tcp_v4_init_seq(const struct sk_buff *skb) 104 { 105 return secure_tcp_seq(ip_hdr(skb)->daddr, 106 ip_hdr(skb)->saddr, 107 tcp_hdr(skb)->dest, 108 tcp_hdr(skb)->source); 109 } 110 111 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb) 112 { 113 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr); 114 } 115 116 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) 117 { 118 
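/* Decide whether the TIME-WAIT socket @sktw may be reused for a new
 * outgoing connection from @sk.  In outline (the checks below are
 * authoritative): a remembered PAWS timestamp is always required; when
 * the caller passes @twp, net.ipv4.tcp_tw_reuse must also permit reuse
 * (mode 2 restricts it to loopback traffic) and tcp_tw_reuse_delay
 * milliseconds must have elapsed since the socket entered TIME-WAIT.
 * Returns 1 if the TIME-WAIT socket can be reused, 0 otherwise.
 */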
int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse); 119 const struct inet_timewait_sock *tw = inet_twsk(sktw); 120 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw); 121 struct tcp_sock *tp = tcp_sk(sk); 122 int ts_recent_stamp; 123 u32 reuse_thresh; 124 125 if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2) 126 reuse = 0; 127 128 if (reuse == 2) { 129 /* Still does not detect *everything* that goes through 130 * lo, since we require a loopback src or dst address 131 * or direct binding to 'lo' interface. 132 */ 133 bool loopback = false; 134 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX) 135 loopback = true; 136 #if IS_ENABLED(CONFIG_IPV6) 137 if (tw->tw_family == AF_INET6) { 138 if (ipv6_addr_loopback(&tw->tw_v6_daddr) || 139 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) || 140 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) || 141 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr)) 142 loopback = true; 143 } else 144 #endif 145 { 146 if (ipv4_is_loopback(tw->tw_daddr) || 147 ipv4_is_loopback(tw->tw_rcv_saddr)) 148 loopback = true; 149 } 150 if (!loopback) 151 reuse = 0; 152 } 153 154 /* With PAWS, it is safe from the viewpoint 155 of data integrity. Even without PAWS it is safe provided sequence 156 spaces do not overlap i.e. at data rates <= 80Mbit/sec. 157 158 Actually, the idea is close to VJ's one, only timestamp cache is 159 held not per host, but per port pair and TW bucket is used as state 160 holder. 161 162 If TW bucket has been already destroyed we fall back to VJ's scheme 163 and use initial timestamp retrieved from peer table. 164 */ 165 ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp); 166 reuse_thresh = READ_ONCE(tw->tw_entry_stamp) + 167 READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse_delay); 168 if (ts_recent_stamp && 169 (!twp || (reuse && time_after32(tcp_clock_ms(), reuse_thresh)))) { 170 /* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk 171 * and releasing the bucket lock. 172 */ 173 if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt))) 174 return 0; 175 176 /* In case of repair and re-using TIME-WAIT sockets we still 177 * want to be sure that it is safe as above but honor the 178 * sequence numbers and time stamps set as part of the repair 179 * process. 180 * 181 * Without this check re-using a TIME-WAIT socket with TCP 182 * repair would accumulate a -1 on the repair assigned 183 * sequence number. The first time it is reused the sequence 184 * is -1, the second time -2, etc. This fixes that issue 185 * without appearing to create any others. 186 */ 187 if (likely(!tp->repair)) { 188 u32 seq = tcptw->tw_snd_nxt + 65535 + 2; 189 190 if (!seq) 191 seq = 1; 192 WRITE_ONCE(tp->write_seq, seq); 193 tp->rx_opt.ts_recent = READ_ONCE(tcptw->tw_ts_recent); 194 tp->rx_opt.ts_recent_stamp = ts_recent_stamp; 195 } 196 197 return 1; 198 } 199 200 return 0; 201 } 202 EXPORT_SYMBOL_GPL(tcp_twsk_unique); 203 204 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr, 205 int addr_len) 206 { 207 /* This check is replicated from tcp_v4_connect() and intended to 208 * prevent BPF program called below from accessing bytes that are out 209 * of the bound specified by user in addr_len. 210 */ 211 if (addr_len < sizeof(struct sockaddr_in)) 212 return -EINVAL; 213 214 sock_owned_by_me(sk); 215 216 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len); 217 } 218 219 /* This will initiate an outgoing connection. 
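 *
 * Roughly: validate the destination address, resolve a route, pick a
 * source address and port via inet_hash_connect(), choose the initial
 * sequence number and timestamp offset, then send the SYN through
 * tcp_connect().  On failure the socket is moved back to TCP_CLOSE.
 *
 * Illustrative user-space trigger only (not part of this file): a
 * connect() on an AF_INET stream socket ends up here, e.g.
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *	};
 *	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *	connect(fd, (struct sockaddr *)&dst, sizeof(dst));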
*/ 220 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) 221 { 222 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; 223 struct inet_timewait_death_row *tcp_death_row; 224 struct inet_sock *inet = inet_sk(sk); 225 struct tcp_sock *tp = tcp_sk(sk); 226 struct ip_options_rcu *inet_opt; 227 struct net *net = sock_net(sk); 228 __be16 orig_sport, orig_dport; 229 __be32 daddr, nexthop; 230 struct flowi4 *fl4; 231 struct rtable *rt; 232 int err; 233 234 if (addr_len < sizeof(struct sockaddr_in)) 235 return -EINVAL; 236 237 if (usin->sin_family != AF_INET) 238 return -EAFNOSUPPORT; 239 240 nexthop = daddr = usin->sin_addr.s_addr; 241 inet_opt = rcu_dereference_protected(inet->inet_opt, 242 lockdep_sock_is_held(sk)); 243 if (inet_opt && inet_opt->opt.srr) { 244 if (!daddr) 245 return -EINVAL; 246 nexthop = inet_opt->opt.faddr; 247 } 248 249 orig_sport = inet->inet_sport; 250 orig_dport = usin->sin_port; 251 fl4 = &inet->cork.fl.u.ip4; 252 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr, 253 sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport, 254 orig_dport, sk); 255 if (IS_ERR(rt)) { 256 err = PTR_ERR(rt); 257 if (err == -ENETUNREACH) 258 IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); 259 return err; 260 } 261 262 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { 263 ip_rt_put(rt); 264 return -ENETUNREACH; 265 } 266 267 if (!inet_opt || !inet_opt->opt.srr) 268 daddr = fl4->daddr; 269 270 tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; 271 272 if (!inet->inet_saddr) { 273 err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET); 274 if (err) { 275 ip_rt_put(rt); 276 return err; 277 } 278 } else { 279 sk_rcv_saddr_set(sk, inet->inet_saddr); 280 } 281 282 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) { 283 /* Reset inherited state */ 284 tp->rx_opt.ts_recent = 0; 285 tp->rx_opt.ts_recent_stamp = 0; 286 if (likely(!tp->repair)) 287 WRITE_ONCE(tp->write_seq, 0); 288 } 289 290 inet->inet_dport = usin->sin_port; 291 sk_daddr_set(sk, daddr); 292 293 inet_csk(sk)->icsk_ext_hdr_len = 0; 294 if (inet_opt) 295 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 296 297 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT; 298 299 /* Socket identity is still unknown (sport may be zero). 300 * However we set state to SYN-SENT and not releasing socket 301 * lock select source port, enter ourselves into the hash tables and 302 * complete initialization after this. 303 */ 304 tcp_set_state(sk, TCP_SYN_SENT); 305 err = inet_hash_connect(tcp_death_row, sk); 306 if (err) 307 goto failure; 308 309 sk_set_txhash(sk); 310 311 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport, 312 inet->inet_sport, inet->inet_dport, sk); 313 if (IS_ERR(rt)) { 314 err = PTR_ERR(rt); 315 rt = NULL; 316 goto failure; 317 } 318 tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst); 319 /* OK, now commit destination to socket. 
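 *
 * "Commit" here means: mark the socket for TCPv4 GSO, cache the route
 * via sk_setup_caps(), and (unless TCP repair is in progress) derive
 * the initial sequence number and timestamp offset from the now-final
 * 4-tuple before the SYN is sent.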
*/ 320 sk->sk_gso_type = SKB_GSO_TCPV4; 321 sk_setup_caps(sk, &rt->dst); 322 rt = NULL; 323 324 if (likely(!tp->repair)) { 325 if (!tp->write_seq) 326 WRITE_ONCE(tp->write_seq, 327 secure_tcp_seq(inet->inet_saddr, 328 inet->inet_daddr, 329 inet->inet_sport, 330 usin->sin_port)); 331 WRITE_ONCE(tp->tsoffset, 332 secure_tcp_ts_off(net, inet->inet_saddr, 333 inet->inet_daddr)); 334 } 335 336 atomic_set(&inet->inet_id, get_random_u16()); 337 338 if (tcp_fastopen_defer_connect(sk, &err)) 339 return err; 340 if (err) 341 goto failure; 342 343 err = tcp_connect(sk); 344 345 if (err) 346 goto failure; 347 348 return 0; 349 350 failure: 351 /* 352 * This unhashes the socket and releases the local port, 353 * if necessary. 354 */ 355 tcp_set_state(sk, TCP_CLOSE); 356 inet_bhash2_reset_saddr(sk); 357 ip_rt_put(rt); 358 sk->sk_route_caps = 0; 359 inet->inet_dport = 0; 360 return err; 361 } 362 EXPORT_SYMBOL(tcp_v4_connect); 363 364 /* 365 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191. 366 * It can be called through tcp_release_cb() if socket was owned by user 367 * at the time tcp_v4_err() was called to handle ICMP message. 368 */ 369 void tcp_v4_mtu_reduced(struct sock *sk) 370 { 371 struct inet_sock *inet = inet_sk(sk); 372 struct dst_entry *dst; 373 u32 mtu; 374 375 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) 376 return; 377 mtu = READ_ONCE(tcp_sk(sk)->mtu_info); 378 dst = inet_csk_update_pmtu(sk, mtu); 379 if (!dst) 380 return; 381 382 /* Something is about to be wrong... Remember soft error 383 * for the case, if this connection will not able to recover. 384 */ 385 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst)) 386 WRITE_ONCE(sk->sk_err_soft, EMSGSIZE); 387 388 mtu = dst_mtu(dst); 389 390 if (inet->pmtudisc != IP_PMTUDISC_DONT && 391 ip_sk_accept_pmtu(sk) && 392 inet_csk(sk)->icsk_pmtu_cookie > mtu) { 393 tcp_sync_mss(sk, mtu); 394 395 /* Resend the TCP packet because it's 396 * clear that the old packet has been 397 * dropped. This is the new "fast" path mtu 398 * discovery. 399 */ 400 tcp_simple_retransmit(sk); 401 } /* else let the usual retransmit timer handle it */ 402 } 403 EXPORT_SYMBOL(tcp_v4_mtu_reduced); 404 405 static void do_redirect(struct sk_buff *skb, struct sock *sk) 406 { 407 struct dst_entry *dst = __sk_dst_check(sk, 0); 408 409 if (dst) 410 dst->ops->redirect(dst, sk, skb); 411 } 412 413 414 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */ 415 void tcp_req_err(struct sock *sk, u32 seq, bool abort) 416 { 417 struct request_sock *req = inet_reqsk(sk); 418 struct net *net = sock_net(sk); 419 420 /* ICMPs are not backlogged, hence we cannot get 421 * an established socket here. 422 */ 423 if (seq != tcp_rsk(req)->snt_isn) { 424 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); 425 } else if (abort) { 426 /* 427 * Still in SYN_RECV, just remove it silently. 428 * There is no good way to pass the error to the newly 429 * created socket, and POSIX does not want network 430 * errors returned from accept(). 
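 * Dropping the request socket just makes the half-open connection
 * disappear; the peer will retransmit its SYN or give up on its own.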
431 */ 432 inet_csk_reqsk_queue_drop(req->rsk_listener, req); 433 tcp_listendrop(req->rsk_listener); 434 } 435 reqsk_put(req); 436 } 437 EXPORT_SYMBOL(tcp_req_err); 438 439 /* TCP-LD (RFC 6069) logic */ 440 void tcp_ld_RTO_revert(struct sock *sk, u32 seq) 441 { 442 struct inet_connection_sock *icsk = inet_csk(sk); 443 struct tcp_sock *tp = tcp_sk(sk); 444 struct sk_buff *skb; 445 s32 remaining; 446 u32 delta_us; 447 448 if (sock_owned_by_user(sk)) 449 return; 450 451 if (seq != tp->snd_una || !icsk->icsk_retransmits || 452 !icsk->icsk_backoff) 453 return; 454 455 skb = tcp_rtx_queue_head(sk); 456 if (WARN_ON_ONCE(!skb)) 457 return; 458 459 icsk->icsk_backoff--; 460 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT; 461 icsk->icsk_rto = inet_csk_rto_backoff(icsk, tcp_rto_max(sk)); 462 463 tcp_mstamp_refresh(tp); 464 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb)); 465 remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us); 466 467 if (remaining > 0) { 468 tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, remaining, false); 469 } else { 470 /* RTO revert clocked out retransmission. 471 * Will retransmit now. 472 */ 473 tcp_retransmit_timer(sk); 474 } 475 } 476 EXPORT_SYMBOL(tcp_ld_RTO_revert); 477 478 /* 479 * This routine is called by the ICMP module when it gets some 480 * sort of error condition. If err < 0 then the socket should 481 * be closed and the error returned to the user. If err > 0 482 * it's just the icmp type << 8 | icmp code. After adjustment 483 * header points to the first 8 bytes of the tcp header. We need 484 * to find the appropriate port. 485 * 486 * The locking strategy used here is very "optimistic". When 487 * someone else accesses the socket the ICMP is just dropped 488 * and for some paths there is no check at all. 489 * A more general error queue to queue errors for later handling 490 * is probably better. 491 * 492 */ 493 494 int tcp_v4_err(struct sk_buff *skb, u32 info) 495 { 496 const struct iphdr *iph = (const struct iphdr *)skb->data; 497 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2)); 498 struct tcp_sock *tp; 499 const int type = icmp_hdr(skb)->type; 500 const int code = icmp_hdr(skb)->code; 501 struct sock *sk; 502 struct request_sock *fastopen; 503 u32 seq, snd_una; 504 int err; 505 struct net *net = dev_net(skb->dev); 506 507 sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, 508 iph->daddr, th->dest, iph->saddr, 509 ntohs(th->source), inet_iif(skb), 0); 510 if (!sk) { 511 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); 512 return -ENOENT; 513 } 514 if (sk->sk_state == TCP_TIME_WAIT) { 515 /* To increase the counter of ignored icmps for TCP-AO */ 516 tcp_ao_ignore_icmp(sk, AF_INET, type, code); 517 inet_twsk_put(inet_twsk(sk)); 518 return 0; 519 } 520 seq = ntohl(th->seq); 521 if (sk->sk_state == TCP_NEW_SYN_RECV) { 522 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB || 523 type == ICMP_TIME_EXCEEDED || 524 (type == ICMP_DEST_UNREACH && 525 (code == ICMP_NET_UNREACH || 526 code == ICMP_HOST_UNREACH))); 527 return 0; 528 } 529 530 if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) { 531 sock_put(sk); 532 return 0; 533 } 534 535 bh_lock_sock(sk); 536 /* If too many ICMPs get dropped on busy 537 * servers this needs to be solved differently. 538 * We do take care of PMTU discovery (RFC1191) special case : 539 * we can receive locally generated ICMP messages while socket is held. 
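 * That special case is handled by stashing the new MTU in tp->mtu_info
 * and setting TCP_MTU_REDUCED_DEFERRED, so that tcp_release_cb() calls
 * tcp_v4_mtu_reduced() once the owner releases the socket lock.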
540 */ 541 if (sock_owned_by_user(sk)) { 542 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)) 543 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS); 544 } 545 if (sk->sk_state == TCP_CLOSE) 546 goto out; 547 548 if (static_branch_unlikely(&ip4_min_ttl)) { 549 /* min_ttl can be changed concurrently from do_ip_setsockopt() */ 550 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) { 551 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 552 goto out; 553 } 554 } 555 556 tp = tcp_sk(sk); 557 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ 558 fastopen = rcu_dereference(tp->fastopen_rsk); 559 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una; 560 if (sk->sk_state != TCP_LISTEN && 561 !between(seq, snd_una, tp->snd_nxt)) { 562 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); 563 goto out; 564 } 565 566 switch (type) { 567 case ICMP_REDIRECT: 568 if (!sock_owned_by_user(sk)) 569 do_redirect(skb, sk); 570 goto out; 571 case ICMP_SOURCE_QUENCH: 572 /* Just silently ignore these. */ 573 goto out; 574 case ICMP_PARAMETERPROB: 575 err = EPROTO; 576 break; 577 case ICMP_DEST_UNREACH: 578 if (code > NR_ICMP_UNREACH) 579 goto out; 580 581 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ 582 /* We are not interested in TCP_LISTEN and open_requests 583 * (SYN-ACKs send out by Linux are always <576bytes so 584 * they should go through unfragmented). 585 */ 586 if (sk->sk_state == TCP_LISTEN) 587 goto out; 588 589 WRITE_ONCE(tp->mtu_info, info); 590 if (!sock_owned_by_user(sk)) { 591 tcp_v4_mtu_reduced(sk); 592 } else { 593 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags)) 594 sock_hold(sk); 595 } 596 goto out; 597 } 598 599 err = icmp_err_convert[code].errno; 600 /* check if this ICMP message allows revert of backoff. 601 * (see RFC 6069) 602 */ 603 if (!fastopen && 604 (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH)) 605 tcp_ld_RTO_revert(sk, seq); 606 break; 607 case ICMP_TIME_EXCEEDED: 608 err = EHOSTUNREACH; 609 break; 610 default: 611 goto out; 612 } 613 614 switch (sk->sk_state) { 615 case TCP_SYN_SENT: 616 case TCP_SYN_RECV: 617 /* Only in fast or simultaneous open. If a fast open socket is 618 * already accepted it is treated as a connected one below. 619 */ 620 if (fastopen && !fastopen->sk) 621 break; 622 623 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th); 624 625 if (!sock_owned_by_user(sk)) 626 tcp_done_with_error(sk, err); 627 else 628 WRITE_ONCE(sk->sk_err_soft, err); 629 goto out; 630 } 631 632 /* If we've already connected we will keep trying 633 * until we time out, or the user gives up. 634 * 635 * rfc1122 4.2.3.9 allows to consider as hard errors 636 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too, 637 * but it is obsoleted by pmtu discovery). 638 * 639 * Note, that in modern internet, where routing is unreliable 640 * and in each dark corner broken firewalls sit, sending random 641 * errors ordered by their masters even this two messages finally lose 642 * their original sense (even Linux sends invalid PORT_UNREACHs) 643 * 644 * Now we are in compliance with RFCs. 
645 * --ANK (980905) 646 */ 647 648 if (!sock_owned_by_user(sk) && 649 inet_test_bit(RECVERR, sk)) { 650 WRITE_ONCE(sk->sk_err, err); 651 sk_error_report(sk); 652 } else { /* Only an error on timeout */ 653 WRITE_ONCE(sk->sk_err_soft, err); 654 } 655 656 out: 657 bh_unlock_sock(sk); 658 sock_put(sk); 659 return 0; 660 } 661 662 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr) 663 { 664 struct tcphdr *th = tcp_hdr(skb); 665 666 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0); 667 skb->csum_start = skb_transport_header(skb) - skb->head; 668 skb->csum_offset = offsetof(struct tcphdr, check); 669 } 670 671 /* This routine computes an IPv4 TCP checksum. */ 672 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb) 673 { 674 const struct inet_sock *inet = inet_sk(sk); 675 676 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr); 677 } 678 EXPORT_SYMBOL(tcp_v4_send_check); 679 680 #define REPLY_OPTIONS_LEN (MAX_TCP_OPTION_SPACE / sizeof(__be32)) 681 682 static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb, 683 const struct tcp_ao_hdr *aoh, 684 struct ip_reply_arg *arg, struct tcphdr *reply, 685 __be32 reply_options[REPLY_OPTIONS_LEN]) 686 { 687 #ifdef CONFIG_TCP_AO 688 int sdif = tcp_v4_sdif(skb); 689 int dif = inet_iif(skb); 690 int l3index = sdif ? dif : 0; 691 bool allocated_traffic_key; 692 struct tcp_ao_key *key; 693 char *traffic_key; 694 bool drop = true; 695 u32 ao_sne = 0; 696 u8 keyid; 697 698 rcu_read_lock(); 699 if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq), 700 &key, &traffic_key, &allocated_traffic_key, 701 &keyid, &ao_sne)) 702 goto out; 703 704 reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) | 705 (aoh->rnext_keyid << 8) | keyid); 706 arg->iov[0].iov_len += tcp_ao_len_aligned(key); 707 reply->doff = arg->iov[0].iov_len / 4; 708 709 if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1], 710 key, traffic_key, 711 (union tcp_ao_addr *)&ip_hdr(skb)->saddr, 712 (union tcp_ao_addr *)&ip_hdr(skb)->daddr, 713 reply, ao_sne)) 714 goto out; 715 drop = false; 716 out: 717 rcu_read_unlock(); 718 if (allocated_traffic_key) 719 kfree(traffic_key); 720 return drop; 721 #else 722 return true; 723 #endif 724 } 725 726 /* 727 * This routine will send an RST to the other tcp. 728 * 729 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.) 730 * for reset. 731 * Answer: if a packet caused RST, it is not for a socket 732 * existing in our system, if it is matched to a socket, 733 * it is just duplicate segment or bug in other side's TCP. 734 * So that we build reply only basing on parameters 735 * arrived with segment. 736 * Exception: precedence violation. We do not implement it in any case. 737 */ 738 739 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb, 740 enum sk_rst_reason reason) 741 { 742 const struct tcphdr *th = tcp_hdr(skb); 743 struct { 744 struct tcphdr th; 745 __be32 opt[REPLY_OPTIONS_LEN]; 746 } rep; 747 const __u8 *md5_hash_location = NULL; 748 const struct tcp_ao_hdr *aoh; 749 struct ip_reply_arg arg; 750 #ifdef CONFIG_TCP_MD5SIG 751 struct tcp_md5sig_key *key = NULL; 752 unsigned char newhash[16]; 753 struct sock *sk1 = NULL; 754 int genhash; 755 #endif 756 u64 transmit_time = 0; 757 struct sock *ctl_sk; 758 struct net *net; 759 u32 txhash = 0; 760 761 /* Never send a reset in response to a reset. */ 762 if (th->rst) 763 return; 764 765 /* If sk not NULL, it means we did a successful lookup and incoming 766 * route had to be correct. 
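 * (Without a socket we only reply to segments that were addressed to a
 * local address - see the RTN_LOCAL check below.)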
prequeue might have dropped our dst. 767 */ 768 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL) 769 return; 770 771 /* Swap the send and the receive. */ 772 memset(&rep, 0, sizeof(rep)); 773 rep.th.dest = th->source; 774 rep.th.source = th->dest; 775 rep.th.doff = sizeof(struct tcphdr) / 4; 776 rep.th.rst = 1; 777 778 if (th->ack) { 779 rep.th.seq = th->ack_seq; 780 } else { 781 rep.th.ack = 1; 782 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin + 783 skb->len - (th->doff << 2)); 784 } 785 786 memset(&arg, 0, sizeof(arg)); 787 arg.iov[0].iov_base = (unsigned char *)&rep; 788 arg.iov[0].iov_len = sizeof(rep.th); 789 790 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev); 791 792 /* Invalid TCP option size or twice included auth */ 793 if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh)) 794 return; 795 796 if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt)) 797 return; 798 799 #ifdef CONFIG_TCP_MD5SIG 800 rcu_read_lock(); 801 if (sk && sk_fullsock(sk)) { 802 const union tcp_md5_addr *addr; 803 int l3index; 804 805 /* sdif set, means packet ingressed via a device 806 * in an L3 domain and inet_iif is set to it. 807 */ 808 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; 809 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 810 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 811 } else if (md5_hash_location) { 812 const union tcp_md5_addr *addr; 813 int sdif = tcp_v4_sdif(skb); 814 int dif = inet_iif(skb); 815 int l3index; 816 817 /* 818 * active side is lost. Try to find listening socket through 819 * source port, and then find md5 key through listening socket. 820 * we are not loose security here: 821 * Incoming packet is checked with md5 hash with finding key, 822 * no RST generated if md5 hash doesn't match. 823 */ 824 sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo, 825 NULL, 0, ip_hdr(skb)->saddr, 826 th->source, ip_hdr(skb)->daddr, 827 ntohs(th->source), dif, sdif); 828 /* don't send rst if it can't find key */ 829 if (!sk1) 830 goto out; 831 832 /* sdif set, means packet ingressed via a device 833 * in an L3 domain and dif is set to it. 834 */ 835 l3index = sdif ? dif : 0; 836 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 837 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET); 838 if (!key) 839 goto out; 840 841 842 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb); 843 if (genhash || memcmp(md5_hash_location, newhash, 16) != 0) 844 goto out; 845 846 } 847 848 if (key) { 849 rep.opt[0] = htonl((TCPOPT_NOP << 24) | 850 (TCPOPT_NOP << 16) | 851 (TCPOPT_MD5SIG << 8) | 852 TCPOLEN_MD5SIG); 853 /* Update length and the length the header thinks exists */ 854 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; 855 rep.th.doff = arg.iov[0].iov_len / 4; 856 857 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1], 858 key, ip_hdr(skb)->saddr, 859 ip_hdr(skb)->daddr, &rep.th); 860 } 861 #endif 862 /* Can't co-exist with TCPMD5, hence check rep.opt[0] */ 863 if (rep.opt[0] == 0) { 864 __be32 mrst = mptcp_reset_option(skb); 865 866 if (mrst) { 867 rep.opt[0] = mrst; 868 arg.iov[0].iov_len += sizeof(mrst); 869 rep.th.doff = arg.iov[0].iov_len / 4; 870 } 871 } 872 873 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 874 ip_hdr(skb)->saddr, /* XXX */ 875 arg.iov[0].iov_len, IPPROTO_TCP, 0); 876 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 877 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0; 878 879 /* When socket is gone, all binding information is lost. 880 * routing might fail in this case. 
No choice here, if we choose to force 881 * input interface, we will misroute in case of asymmetric route. 882 */ 883 if (sk) 884 arg.bound_dev_if = sk->sk_bound_dev_if; 885 886 trace_tcp_send_reset(sk, skb, reason); 887 888 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) != 889 offsetof(struct inet_timewait_sock, tw_bound_dev_if)); 890 891 arg.tos = ip_hdr(skb)->tos; 892 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); 893 local_bh_disable(); 894 local_lock_nested_bh(&ipv4_tcp_sk.bh_lock); 895 ctl_sk = this_cpu_read(ipv4_tcp_sk.sock); 896 897 sock_net_set(ctl_sk, net); 898 if (sk) { 899 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 900 inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark); 901 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? 902 inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority); 903 transmit_time = tcp_transmit_time(sk); 904 xfrm_sk_clone_policy(ctl_sk, sk); 905 txhash = (sk->sk_state == TCP_TIME_WAIT) ? 906 inet_twsk(sk)->tw_txhash : sk->sk_txhash; 907 } else { 908 ctl_sk->sk_mark = 0; 909 ctl_sk->sk_priority = 0; 910 } 911 ip_send_unicast_reply(ctl_sk, sk, 912 skb, &TCP_SKB_CB(skb)->header.h4.opt, 913 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 914 &arg, arg.iov[0].iov_len, 915 transmit_time, txhash); 916 917 xfrm_sk_free_policy(ctl_sk); 918 sock_net_set(ctl_sk, &init_net); 919 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); 920 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS); 921 local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock); 922 local_bh_enable(); 923 924 #ifdef CONFIG_TCP_MD5SIG 925 out: 926 rcu_read_unlock(); 927 #endif 928 } 929 930 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states 931 outside socket context is ugly, certainly. What can I do? 932 */ 933 934 static void tcp_v4_send_ack(const struct sock *sk, 935 struct sk_buff *skb, u32 seq, u32 ack, 936 u32 win, u32 tsval, u32 tsecr, int oif, 937 struct tcp_key *key, 938 int reply_flags, u8 tos, u32 txhash) 939 { 940 const struct tcphdr *th = tcp_hdr(skb); 941 struct { 942 struct tcphdr th; 943 __be32 opt[(MAX_TCP_OPTION_SPACE >> 2)]; 944 } rep; 945 struct net *net = sock_net(sk); 946 struct ip_reply_arg arg; 947 struct sock *ctl_sk; 948 u64 transmit_time; 949 950 memset(&rep.th, 0, sizeof(struct tcphdr)); 951 memset(&arg, 0, sizeof(arg)); 952 953 arg.iov[0].iov_base = (unsigned char *)&rep; 954 arg.iov[0].iov_len = sizeof(rep.th); 955 if (tsecr) { 956 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | 957 (TCPOPT_TIMESTAMP << 8) | 958 TCPOLEN_TIMESTAMP); 959 rep.opt[1] = htonl(tsval); 960 rep.opt[2] = htonl(tsecr); 961 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED; 962 } 963 964 /* Swap the send and the receive. */ 965 rep.th.dest = th->source; 966 rep.th.source = th->dest; 967 rep.th.doff = arg.iov[0].iov_len / 4; 968 rep.th.seq = htonl(seq); 969 rep.th.ack_seq = htonl(ack); 970 rep.th.ack = 1; 971 rep.th.window = htons(win); 972 973 #ifdef CONFIG_TCP_MD5SIG 974 if (tcp_key_is_md5(key)) { 975 int offset = (tsecr) ? 3 : 0; 976 977 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) | 978 (TCPOPT_NOP << 16) | 979 (TCPOPT_MD5SIG << 8) | 980 TCPOLEN_MD5SIG); 981 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; 982 rep.th.doff = arg.iov[0].iov_len/4; 983 984 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset], 985 key->md5_key, ip_hdr(skb)->saddr, 986 ip_hdr(skb)->daddr, &rep.th); 987 } 988 #endif 989 #ifdef CONFIG_TCP_AO 990 if (tcp_key_is_ao(key)) { 991 int offset = (tsecr) ? 
3 : 0; 992 993 rep.opt[offset++] = htonl((TCPOPT_AO << 24) | 994 (tcp_ao_len(key->ao_key) << 16) | 995 (key->ao_key->sndid << 8) | 996 key->rcv_next); 997 arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key); 998 rep.th.doff = arg.iov[0].iov_len / 4; 999 1000 tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset], 1001 key->ao_key, key->traffic_key, 1002 (union tcp_ao_addr *)&ip_hdr(skb)->saddr, 1003 (union tcp_ao_addr *)&ip_hdr(skb)->daddr, 1004 &rep.th, key->sne); 1005 } 1006 #endif 1007 arg.flags = reply_flags; 1008 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 1009 ip_hdr(skb)->saddr, /* XXX */ 1010 arg.iov[0].iov_len, IPPROTO_TCP, 0); 1011 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 1012 if (oif) 1013 arg.bound_dev_if = oif; 1014 arg.tos = tos; 1015 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL); 1016 local_bh_disable(); 1017 local_lock_nested_bh(&ipv4_tcp_sk.bh_lock); 1018 ctl_sk = this_cpu_read(ipv4_tcp_sk.sock); 1019 sock_net_set(ctl_sk, net); 1020 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 1021 inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark); 1022 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? 1023 inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority); 1024 transmit_time = tcp_transmit_time(sk); 1025 ip_send_unicast_reply(ctl_sk, sk, 1026 skb, &TCP_SKB_CB(skb)->header.h4.opt, 1027 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 1028 &arg, arg.iov[0].iov_len, 1029 transmit_time, txhash); 1030 1031 sock_net_set(ctl_sk, &init_net); 1032 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); 1033 local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock); 1034 local_bh_enable(); 1035 } 1036 1037 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) 1038 { 1039 struct inet_timewait_sock *tw = inet_twsk(sk); 1040 struct tcp_timewait_sock *tcptw = tcp_twsk(sk); 1041 struct tcp_key key = {}; 1042 #ifdef CONFIG_TCP_AO 1043 struct tcp_ao_info *ao_info; 1044 1045 if (static_branch_unlikely(&tcp_ao_needed.key)) { 1046 /* FIXME: the segment to-be-acked is not verified yet */ 1047 ao_info = rcu_dereference(tcptw->ao_info); 1048 if (ao_info) { 1049 const struct tcp_ao_hdr *aoh; 1050 1051 if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) { 1052 inet_twsk_put(tw); 1053 return; 1054 } 1055 1056 if (aoh) 1057 key.ao_key = tcp_ao_established_key(sk, ao_info, 1058 aoh->rnext_keyid, -1); 1059 } 1060 } 1061 if (key.ao_key) { 1062 struct tcp_ao_key *rnext_key; 1063 1064 key.traffic_key = snd_other_key(key.ao_key); 1065 key.sne = READ_ONCE(ao_info->snd_sne); 1066 rnext_key = READ_ONCE(ao_info->rnext_key); 1067 key.rcv_next = rnext_key->rcvid; 1068 key.type = TCP_KEY_AO; 1069 #else 1070 if (0) { 1071 #endif 1072 } else if (static_branch_tcp_md5()) { 1073 key.md5_key = tcp_twsk_md5_key(tcptw); 1074 if (key.md5_key) 1075 key.type = TCP_KEY_MD5; 1076 } 1077 1078 tcp_v4_send_ack(sk, skb, 1079 tcptw->tw_snd_nxt, READ_ONCE(tcptw->tw_rcv_nxt), 1080 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, 1081 tcp_tw_tsval(tcptw), 1082 READ_ONCE(tcptw->tw_ts_recent), 1083 tw->tw_bound_dev_if, &key, 1084 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0, 1085 tw->tw_tos, 1086 tw->tw_txhash); 1087 1088 inet_twsk_put(tw); 1089 } 1090 1091 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, 1092 struct request_sock *req) 1093 { 1094 struct tcp_key key = {}; 1095 1096 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV 1097 * sk->sk_state == TCP_SYN_RECV -> for Fast Open. 1098 */ 1099 u32 seq = (sk->sk_state == TCP_LISTEN) ? 
tcp_rsk(req)->snt_isn + 1 : 1100 tcp_sk(sk)->snd_nxt; 1101 1102 #ifdef CONFIG_TCP_AO 1103 if (static_branch_unlikely(&tcp_ao_needed.key) && 1104 tcp_rsk_used_ao(req)) { 1105 const union tcp_md5_addr *addr; 1106 const struct tcp_ao_hdr *aoh; 1107 int l3index; 1108 1109 /* Invalid TCP option size or twice included auth */ 1110 if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) 1111 return; 1112 if (!aoh) 1113 return; 1114 1115 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 1116 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; 1117 key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, 1118 aoh->rnext_keyid, -1); 1119 if (unlikely(!key.ao_key)) { 1120 /* Send ACK with any matching MKT for the peer */ 1121 key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1); 1122 /* Matching key disappeared (user removed the key?) 1123 * let the handshake timeout. 1124 */ 1125 if (!key.ao_key) { 1126 net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n", 1127 addr, 1128 ntohs(tcp_hdr(skb)->source), 1129 &ip_hdr(skb)->daddr, 1130 ntohs(tcp_hdr(skb)->dest)); 1131 return; 1132 } 1133 } 1134 key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC); 1135 if (!key.traffic_key) 1136 return; 1137 1138 key.type = TCP_KEY_AO; 1139 key.rcv_next = aoh->keyid; 1140 tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req); 1141 #else 1142 if (0) { 1143 #endif 1144 } else if (static_branch_tcp_md5()) { 1145 const union tcp_md5_addr *addr; 1146 int l3index; 1147 1148 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 1149 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; 1150 key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1151 if (key.md5_key) 1152 key.type = TCP_KEY_MD5; 1153 } 1154 1155 tcp_v4_send_ack(sk, skb, seq, 1156 tcp_rsk(req)->rcv_nxt, 1157 tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale, 1158 tcp_rsk_tsval(tcp_rsk(req)), 1159 READ_ONCE(req->ts_recent), 1160 0, &key, 1161 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0, 1162 ip_hdr(skb)->tos, 1163 READ_ONCE(tcp_rsk(req)->txhash)); 1164 if (tcp_key_is_ao(&key)) 1165 kfree(key.traffic_key); 1166 } 1167 1168 /* 1169 * Send a SYN-ACK after having received a SYN. 1170 * This still operates on a request_sock only, not on a big 1171 * socket. 1172 */ 1173 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, 1174 struct flowi *fl, 1175 struct request_sock *req, 1176 struct tcp_fastopen_cookie *foc, 1177 enum tcp_synack_type synack_type, 1178 struct sk_buff *syn_skb) 1179 { 1180 const struct inet_request_sock *ireq = inet_rsk(req); 1181 struct flowi4 fl4; 1182 int err = -1; 1183 struct sk_buff *skb; 1184 u8 tos; 1185 1186 /* First, grab a route. 
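 * Then build the SYN-ACK with tcp_make_synack(), checksum it for the
 * request's addresses, derive the TOS (optionally reflecting the
 * client's SYN when sysctl_tcp_reflect_tos is set) and hand the packet
 * to ip_build_and_send_pkt().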
*/ 1187 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) 1188 return -1; 1189 1190 skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb); 1191 1192 if (skb) { 1193 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); 1194 1195 tos = READ_ONCE(inet_sk(sk)->tos); 1196 1197 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)) 1198 tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) | 1199 (tos & INET_ECN_MASK); 1200 1201 if (!INET_ECN_is_capable(tos) && 1202 tcp_bpf_ca_needs_ecn((struct sock *)req)) 1203 tos |= INET_ECN_ECT_0; 1204 1205 rcu_read_lock(); 1206 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, 1207 ireq->ir_rmt_addr, 1208 rcu_dereference(ireq->ireq_opt), 1209 tos); 1210 rcu_read_unlock(); 1211 err = net_xmit_eval(err); 1212 } 1213 1214 return err; 1215 } 1216 1217 /* 1218 * IPv4 request_sock destructor. 1219 */ 1220 static void tcp_v4_reqsk_destructor(struct request_sock *req) 1221 { 1222 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1)); 1223 } 1224 1225 #ifdef CONFIG_TCP_MD5SIG 1226 /* 1227 * RFC2385 MD5 checksumming requires a mapping of 1228 * IP address->MD5 Key. 1229 * We need to maintain these in the sk structure. 1230 */ 1231 1232 DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ); 1233 EXPORT_SYMBOL(tcp_md5_needed); 1234 1235 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new) 1236 { 1237 if (!old) 1238 return true; 1239 1240 /* l3index always overrides non-l3index */ 1241 if (old->l3index && new->l3index == 0) 1242 return false; 1243 if (old->l3index == 0 && new->l3index) 1244 return true; 1245 1246 return old->prefixlen < new->prefixlen; 1247 } 1248 1249 /* Find the Key structure for an address. */ 1250 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index, 1251 const union tcp_md5_addr *addr, 1252 int family, bool any_l3index) 1253 { 1254 const struct tcp_sock *tp = tcp_sk(sk); 1255 struct tcp_md5sig_key *key; 1256 const struct tcp_md5sig_info *md5sig; 1257 __be32 mask; 1258 struct tcp_md5sig_key *best_match = NULL; 1259 bool match; 1260 1261 /* caller either holds rcu_read_lock() or socket lock */ 1262 md5sig = rcu_dereference_check(tp->md5sig_info, 1263 lockdep_sock_is_held(sk)); 1264 if (!md5sig) 1265 return NULL; 1266 1267 hlist_for_each_entry_rcu(key, &md5sig->head, node, 1268 lockdep_sock_is_held(sk)) { 1269 if (key->family != family) 1270 continue; 1271 if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX && 1272 key->l3index != l3index) 1273 continue; 1274 if (family == AF_INET) { 1275 mask = inet_make_mask(key->prefixlen); 1276 match = (key->addr.a4.s_addr & mask) == 1277 (addr->a4.s_addr & mask); 1278 #if IS_ENABLED(CONFIG_IPV6) 1279 } else if (family == AF_INET6) { 1280 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6, 1281 key->prefixlen); 1282 #endif 1283 } else { 1284 match = false; 1285 } 1286 1287 if (match && better_md5_match(best_match, key)) 1288 best_match = key; 1289 } 1290 return best_match; 1291 } 1292 EXPORT_SYMBOL(__tcp_md5_do_lookup); 1293 1294 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk, 1295 const union tcp_md5_addr *addr, 1296 int family, u8 prefixlen, 1297 int l3index, u8 flags) 1298 { 1299 const struct tcp_sock *tp = tcp_sk(sk); 1300 struct tcp_md5sig_key *key; 1301 unsigned int size = sizeof(struct in_addr); 1302 const struct tcp_md5sig_info *md5sig; 1303 1304 /* caller either holds rcu_read_lock() or socket lock */ 1305 md5sig = rcu_dereference_check(tp->md5sig_info, 1306 
lockdep_sock_is_held(sk)); 1307 if (!md5sig) 1308 return NULL; 1309 #if IS_ENABLED(CONFIG_IPV6) 1310 if (family == AF_INET6) 1311 size = sizeof(struct in6_addr); 1312 #endif 1313 hlist_for_each_entry_rcu(key, &md5sig->head, node, 1314 lockdep_sock_is_held(sk)) { 1315 if (key->family != family) 1316 continue; 1317 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX)) 1318 continue; 1319 if (key->l3index != l3index) 1320 continue; 1321 if (!memcmp(&key->addr, addr, size) && 1322 key->prefixlen == prefixlen) 1323 return key; 1324 } 1325 return NULL; 1326 } 1327 1328 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, 1329 const struct sock *addr_sk) 1330 { 1331 const union tcp_md5_addr *addr; 1332 int l3index; 1333 1334 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), 1335 addr_sk->sk_bound_dev_if); 1336 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr; 1337 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1338 } 1339 EXPORT_SYMBOL(tcp_v4_md5_lookup); 1340 1341 static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp) 1342 { 1343 struct tcp_sock *tp = tcp_sk(sk); 1344 struct tcp_md5sig_info *md5sig; 1345 1346 md5sig = kmalloc(sizeof(*md5sig), gfp); 1347 if (!md5sig) 1348 return -ENOMEM; 1349 1350 sk_gso_disable(sk); 1351 INIT_HLIST_HEAD(&md5sig->head); 1352 rcu_assign_pointer(tp->md5sig_info, md5sig); 1353 return 0; 1354 } 1355 1356 /* This can be called on a newly created socket, from other files */ 1357 static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, 1358 int family, u8 prefixlen, int l3index, u8 flags, 1359 const u8 *newkey, u8 newkeylen, gfp_t gfp) 1360 { 1361 /* Add Key to the list */ 1362 struct tcp_md5sig_key *key; 1363 struct tcp_sock *tp = tcp_sk(sk); 1364 struct tcp_md5sig_info *md5sig; 1365 1366 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags); 1367 if (key) { 1368 /* Pre-existing entry - just update that one. 1369 * Note that the key might be used concurrently. 1370 * data_race() is telling kcsan that we do not care of 1371 * key mismatches, since changing MD5 key on live flows 1372 * can lead to packet drops. 1373 */ 1374 data_race(memcpy(key->key, newkey, newkeylen)); 1375 1376 /* Pairs with READ_ONCE() in tcp_md5_hash_key(). 1377 * Also note that a reader could catch new key->keylen value 1378 * but old key->key[], this is the reason we use __GFP_ZERO 1379 * at sock_kmalloc() time below these lines. 1380 */ 1381 WRITE_ONCE(key->keylen, newkeylen); 1382 1383 return 0; 1384 } 1385 1386 md5sig = rcu_dereference_protected(tp->md5sig_info, 1387 lockdep_sock_is_held(sk)); 1388 1389 key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO); 1390 if (!key) 1391 return -ENOMEM; 1392 1393 memcpy(key->key, newkey, newkeylen); 1394 key->keylen = newkeylen; 1395 key->family = family; 1396 key->prefixlen = prefixlen; 1397 key->l3index = l3index; 1398 key->flags = flags; 1399 memcpy(&key->addr, addr, 1400 (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? 
sizeof(struct in6_addr) : 1401 sizeof(struct in_addr)); 1402 hlist_add_head_rcu(&key->node, &md5sig->head); 1403 return 0; 1404 } 1405 1406 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, 1407 int family, u8 prefixlen, int l3index, u8 flags, 1408 const u8 *newkey, u8 newkeylen) 1409 { 1410 struct tcp_sock *tp = tcp_sk(sk); 1411 1412 if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) { 1413 if (tcp_md5_alloc_sigpool()) 1414 return -ENOMEM; 1415 1416 if (tcp_md5sig_info_add(sk, GFP_KERNEL)) { 1417 tcp_md5_release_sigpool(); 1418 return -ENOMEM; 1419 } 1420 1421 if (!static_branch_inc(&tcp_md5_needed.key)) { 1422 struct tcp_md5sig_info *md5sig; 1423 1424 md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk)); 1425 rcu_assign_pointer(tp->md5sig_info, NULL); 1426 kfree_rcu(md5sig, rcu); 1427 tcp_md5_release_sigpool(); 1428 return -EUSERS; 1429 } 1430 } 1431 1432 return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags, 1433 newkey, newkeylen, GFP_KERNEL); 1434 } 1435 EXPORT_SYMBOL(tcp_md5_do_add); 1436 1437 int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr, 1438 int family, u8 prefixlen, int l3index, 1439 struct tcp_md5sig_key *key) 1440 { 1441 struct tcp_sock *tp = tcp_sk(sk); 1442 1443 if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) { 1444 tcp_md5_add_sigpool(); 1445 1446 if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC))) { 1447 tcp_md5_release_sigpool(); 1448 return -ENOMEM; 1449 } 1450 1451 if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) { 1452 struct tcp_md5sig_info *md5sig; 1453 1454 md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk)); 1455 net_warn_ratelimited("Too many TCP-MD5 keys in the system\n"); 1456 rcu_assign_pointer(tp->md5sig_info, NULL); 1457 kfree_rcu(md5sig, rcu); 1458 tcp_md5_release_sigpool(); 1459 return -EUSERS; 1460 } 1461 } 1462 1463 return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, 1464 key->flags, key->key, key->keylen, 1465 sk_gfp_mask(sk, GFP_ATOMIC)); 1466 } 1467 EXPORT_SYMBOL(tcp_md5_key_copy); 1468 1469 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family, 1470 u8 prefixlen, int l3index, u8 flags) 1471 { 1472 struct tcp_md5sig_key *key; 1473 1474 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags); 1475 if (!key) 1476 return -ENOENT; 1477 hlist_del_rcu(&key->node); 1478 atomic_sub(sizeof(*key), &sk->sk_omem_alloc); 1479 kfree_rcu(key, rcu); 1480 return 0; 1481 } 1482 EXPORT_SYMBOL(tcp_md5_do_del); 1483 1484 void tcp_clear_md5_list(struct sock *sk) 1485 { 1486 struct tcp_sock *tp = tcp_sk(sk); 1487 struct tcp_md5sig_key *key; 1488 struct hlist_node *n; 1489 struct tcp_md5sig_info *md5sig; 1490 1491 md5sig = rcu_dereference_protected(tp->md5sig_info, 1); 1492 1493 hlist_for_each_entry_safe(key, n, &md5sig->head, node) { 1494 hlist_del_rcu(&key->node); 1495 atomic_sub(sizeof(*key), &sk->sk_omem_alloc); 1496 kfree_rcu(key, rcu); 1497 } 1498 } 1499 1500 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, 1501 sockptr_t optval, int optlen) 1502 { 1503 struct tcp_md5sig cmd; 1504 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr; 1505 const union tcp_md5_addr *addr; 1506 u8 prefixlen = 32; 1507 int l3index = 0; 1508 bool l3flag; 1509 u8 flags; 1510 1511 if (optlen < sizeof(cmd)) 1512 return -EINVAL; 1513 1514 if (copy_from_sockptr(&cmd, optval, sizeof(cmd))) 1515 return -EFAULT; 1516 1517 if (sin->sin_family != 
AF_INET) 1518 return -EINVAL; 1519 1520 flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX; 1521 l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX; 1522 1523 if (optname == TCP_MD5SIG_EXT && 1524 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) { 1525 prefixlen = cmd.tcpm_prefixlen; 1526 if (prefixlen > 32) 1527 return -EINVAL; 1528 } 1529 1530 if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex && 1531 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) { 1532 struct net_device *dev; 1533 1534 rcu_read_lock(); 1535 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex); 1536 if (dev && netif_is_l3_master(dev)) 1537 l3index = dev->ifindex; 1538 1539 rcu_read_unlock(); 1540 1541 /* ok to reference set/not set outside of rcu; 1542 * right now device MUST be an L3 master 1543 */ 1544 if (!dev || !l3index) 1545 return -EINVAL; 1546 } 1547 1548 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr; 1549 1550 if (!cmd.tcpm_keylen) 1551 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags); 1552 1553 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) 1554 return -EINVAL; 1555 1556 /* Don't allow keys for peers that have a matching TCP-AO key. 1557 * See the comment in tcp_ao_add_cmd() 1558 */ 1559 if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false)) 1560 return -EKEYREJECTED; 1561 1562 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags, 1563 cmd.tcpm_key, cmd.tcpm_keylen); 1564 } 1565 1566 static int tcp_v4_md5_hash_headers(struct tcp_sigpool *hp, 1567 __be32 daddr, __be32 saddr, 1568 const struct tcphdr *th, int nbytes) 1569 { 1570 struct tcp4_pseudohdr *bp; 1571 struct scatterlist sg; 1572 struct tcphdr *_th; 1573 1574 bp = hp->scratch; 1575 bp->saddr = saddr; 1576 bp->daddr = daddr; 1577 bp->pad = 0; 1578 bp->protocol = IPPROTO_TCP; 1579 bp->len = cpu_to_be16(nbytes); 1580 1581 _th = (struct tcphdr *)(bp + 1); 1582 memcpy(_th, th, sizeof(*th)); 1583 _th->check = 0; 1584 1585 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th)); 1586 ahash_request_set_crypt(hp->req, &sg, NULL, 1587 sizeof(*bp) + sizeof(*th)); 1588 return crypto_ahash_update(hp->req); 1589 } 1590 1591 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, 1592 __be32 daddr, __be32 saddr, const struct tcphdr *th) 1593 { 1594 struct tcp_sigpool hp; 1595 1596 if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp)) 1597 goto clear_hash_nostart; 1598 1599 if (crypto_ahash_init(hp.req)) 1600 goto clear_hash; 1601 if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, th->doff << 2)) 1602 goto clear_hash; 1603 if (tcp_md5_hash_key(&hp, key)) 1604 goto clear_hash; 1605 ahash_request_set_crypt(hp.req, NULL, md5_hash, 0); 1606 if (crypto_ahash_final(hp.req)) 1607 goto clear_hash; 1608 1609 tcp_sigpool_end(&hp); 1610 return 0; 1611 1612 clear_hash: 1613 tcp_sigpool_end(&hp); 1614 clear_hash_nostart: 1615 memset(md5_hash, 0, 16); 1616 return 1; 1617 } 1618 1619 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, 1620 const struct sock *sk, 1621 const struct sk_buff *skb) 1622 { 1623 const struct tcphdr *th = tcp_hdr(skb); 1624 struct tcp_sigpool hp; 1625 __be32 saddr, daddr; 1626 1627 if (sk) { /* valid for establish/request sockets */ 1628 saddr = sk->sk_rcv_saddr; 1629 daddr = sk->sk_daddr; 1630 } else { 1631 const struct iphdr *iph = ip_hdr(skb); 1632 saddr = iph->saddr; 1633 daddr = iph->daddr; 1634 } 1635 1636 if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp)) 1637 goto clear_hash_nostart; 1638 1639 if (crypto_ahash_init(hp.req)) 1640 goto clear_hash; 1641 1642 if 
(tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, skb->len)) 1643 goto clear_hash; 1644 if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2)) 1645 goto clear_hash; 1646 if (tcp_md5_hash_key(&hp, key)) 1647 goto clear_hash; 1648 ahash_request_set_crypt(hp.req, NULL, md5_hash, 0); 1649 if (crypto_ahash_final(hp.req)) 1650 goto clear_hash; 1651 1652 tcp_sigpool_end(&hp); 1653 return 0; 1654 1655 clear_hash: 1656 tcp_sigpool_end(&hp); 1657 clear_hash_nostart: 1658 memset(md5_hash, 0, 16); 1659 return 1; 1660 } 1661 EXPORT_SYMBOL(tcp_v4_md5_hash_skb); 1662 1663 #endif 1664 1665 static void tcp_v4_init_req(struct request_sock *req, 1666 const struct sock *sk_listener, 1667 struct sk_buff *skb) 1668 { 1669 struct inet_request_sock *ireq = inet_rsk(req); 1670 struct net *net = sock_net(sk_listener); 1671 1672 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); 1673 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); 1674 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb)); 1675 } 1676 1677 static struct dst_entry *tcp_v4_route_req(const struct sock *sk, 1678 struct sk_buff *skb, 1679 struct flowi *fl, 1680 struct request_sock *req, 1681 u32 tw_isn) 1682 { 1683 tcp_v4_init_req(req, sk, skb); 1684 1685 if (security_inet_conn_request(sk, skb, req)) 1686 return NULL; 1687 1688 return inet_csk_route_req(sk, &fl->u.ip4, req); 1689 } 1690 1691 struct request_sock_ops tcp_request_sock_ops __read_mostly = { 1692 .family = PF_INET, 1693 .obj_size = sizeof(struct tcp_request_sock), 1694 .rtx_syn_ack = tcp_rtx_synack, 1695 .send_ack = tcp_v4_reqsk_send_ack, 1696 .destructor = tcp_v4_reqsk_destructor, 1697 .send_reset = tcp_v4_send_reset, 1698 .syn_ack_timeout = tcp_syn_ack_timeout, 1699 }; 1700 1701 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { 1702 .mss_clamp = TCP_MSS_DEFAULT, 1703 #ifdef CONFIG_TCP_MD5SIG 1704 .req_md5_lookup = tcp_v4_md5_lookup, 1705 .calc_md5_hash = tcp_v4_md5_hash_skb, 1706 #endif 1707 #ifdef CONFIG_TCP_AO 1708 .ao_lookup = tcp_v4_ao_lookup_rsk, 1709 .ao_calc_key = tcp_v4_ao_calc_key_rsk, 1710 .ao_synack_hash = tcp_v4_ao_synack_hash, 1711 #endif 1712 #ifdef CONFIG_SYN_COOKIES 1713 .cookie_init_seq = cookie_v4_init_sequence, 1714 #endif 1715 .route_req = tcp_v4_route_req, 1716 .init_seq = tcp_v4_init_seq, 1717 .init_ts_off = tcp_v4_init_ts_off, 1718 .send_synack = tcp_v4_send_synack, 1719 }; 1720 1721 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) 1722 { 1723 /* Never answer to SYNs send to broadcast or multicast */ 1724 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) 1725 goto drop; 1726 1727 return tcp_conn_request(&tcp_request_sock_ops, 1728 &tcp_request_sock_ipv4_ops, sk, skb); 1729 1730 drop: 1731 tcp_listendrop(sk); 1732 return 0; 1733 } 1734 EXPORT_SYMBOL(tcp_v4_conn_request); 1735 1736 1737 /* 1738 * The three way handshake has completed - we got a valid synack - 1739 * now create the new socket. 
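 *
 * In outline: clone the listener via tcp_create_openreq_child(), copy
 * addressing and IP options from the request, route the child and set
 * up its caps, copy any matching MD5/AO keys, inherit the bound port
 * and insert the child into the established hash with
 * inet_ehash_nolisten().  Any failure tears the child down again.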
1740 */ 1741 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, 1742 struct request_sock *req, 1743 struct dst_entry *dst, 1744 struct request_sock *req_unhash, 1745 bool *own_req) 1746 { 1747 struct inet_request_sock *ireq; 1748 bool found_dup_sk = false; 1749 struct inet_sock *newinet; 1750 struct tcp_sock *newtp; 1751 struct sock *newsk; 1752 #ifdef CONFIG_TCP_MD5SIG 1753 const union tcp_md5_addr *addr; 1754 struct tcp_md5sig_key *key; 1755 int l3index; 1756 #endif 1757 struct ip_options_rcu *inet_opt; 1758 1759 if (sk_acceptq_is_full(sk)) 1760 goto exit_overflow; 1761 1762 newsk = tcp_create_openreq_child(sk, req, skb); 1763 if (!newsk) 1764 goto exit_nonewsk; 1765 1766 newsk->sk_gso_type = SKB_GSO_TCPV4; 1767 inet_sk_rx_dst_set(newsk, skb); 1768 1769 newtp = tcp_sk(newsk); 1770 newinet = inet_sk(newsk); 1771 ireq = inet_rsk(req); 1772 sk_daddr_set(newsk, ireq->ir_rmt_addr); 1773 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr); 1774 newsk->sk_bound_dev_if = ireq->ir_iif; 1775 newinet->inet_saddr = ireq->ir_loc_addr; 1776 inet_opt = rcu_dereference(ireq->ireq_opt); 1777 RCU_INIT_POINTER(newinet->inet_opt, inet_opt); 1778 newinet->mc_index = inet_iif(skb); 1779 newinet->mc_ttl = ip_hdr(skb)->ttl; 1780 newinet->rcv_tos = ip_hdr(skb)->tos; 1781 inet_csk(newsk)->icsk_ext_hdr_len = 0; 1782 if (inet_opt) 1783 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 1784 atomic_set(&newinet->inet_id, get_random_u16()); 1785 1786 /* Set ToS of the new socket based upon the value of incoming SYN. 1787 * ECT bits are set later in tcp_init_transfer(). 1788 */ 1789 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)) 1790 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK; 1791 1792 if (!dst) { 1793 dst = inet_csk_route_child_sock(sk, newsk, req); 1794 if (!dst) 1795 goto put_and_exit; 1796 } else { 1797 /* syncookie case : see end of cookie_v4_check() */ 1798 } 1799 sk_setup_caps(newsk, dst); 1800 1801 tcp_ca_openreq_child(newsk, dst); 1802 1803 tcp_sync_mss(newsk, dst_mtu(dst)); 1804 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst)); 1805 1806 tcp_initialize_rcv_mss(newsk); 1807 1808 #ifdef CONFIG_TCP_MD5SIG 1809 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif); 1810 /* Copy over the MD5 key from the original socket */ 1811 addr = (union tcp_md5_addr *)&newinet->inet_daddr; 1812 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1813 if (key && !tcp_rsk_used_ao(req)) { 1814 if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key)) 1815 goto put_and_exit; 1816 sk_gso_disable(newsk); 1817 } 1818 #endif 1819 #ifdef CONFIG_TCP_AO 1820 if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET)) 1821 goto put_and_exit; /* OOM, release back memory */ 1822 #endif 1823 1824 if (__inet_inherit_port(sk, newsk) < 0) 1825 goto put_and_exit; 1826 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), 1827 &found_dup_sk); 1828 if (likely(*own_req)) { 1829 tcp_move_syn(newtp, req); 1830 ireq->ireq_opt = NULL; 1831 } else { 1832 newinet->inet_opt = NULL; 1833 1834 if (!req_unhash && found_dup_sk) { 1835 /* This code path should only be executed in the 1836 * syncookie case only 1837 */ 1838 bh_unlock_sock(newsk); 1839 sock_put(newsk); 1840 newsk = NULL; 1841 } 1842 } 1843 return newsk; 1844 1845 exit_overflow: 1846 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); 1847 exit_nonewsk: 1848 dst_release(dst); 1849 exit: 1850 tcp_listendrop(sk); 1851 return NULL; 1852 put_and_exit: 1853 newinet->inet_opt = NULL; 1854 
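/* The child socket was partially set up but never became visible to
 * user space; undo the initialization and dispose of it.
 */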
inet_csk_prepare_forced_close(newsk); 1855 tcp_done(newsk); 1856 goto exit; 1857 } 1858 EXPORT_SYMBOL(tcp_v4_syn_recv_sock); 1859 1860 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb) 1861 { 1862 #ifdef CONFIG_SYN_COOKIES 1863 const struct tcphdr *th = tcp_hdr(skb); 1864 1865 if (!th->syn) 1866 sk = cookie_v4_check(sk, skb); 1867 #endif 1868 return sk; 1869 } 1870 1871 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph, 1872 struct tcphdr *th, u32 *cookie) 1873 { 1874 u16 mss = 0; 1875 #ifdef CONFIG_SYN_COOKIES 1876 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops, 1877 &tcp_request_sock_ipv4_ops, sk, th); 1878 if (mss) { 1879 *cookie = __cookie_v4_init_sequence(iph, th, &mss); 1880 tcp_synq_overflow(sk); 1881 } 1882 #endif 1883 return mss; 1884 } 1885 1886 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, 1887 u32)); 1888 /* The socket must have it's spinlock held when we get 1889 * here, unless it is a TCP_LISTEN socket. 1890 * 1891 * We have a potential double-lock case here, so even when 1892 * doing backlog processing we use the BH locking scheme. 1893 * This is because we cannot sleep with the original spinlock 1894 * held. 1895 */ 1896 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) 1897 { 1898 enum skb_drop_reason reason; 1899 struct sock *rsk; 1900 1901 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ 1902 struct dst_entry *dst; 1903 1904 dst = rcu_dereference_protected(sk->sk_rx_dst, 1905 lockdep_sock_is_held(sk)); 1906 1907 sock_rps_save_rxhash(sk, skb); 1908 sk_mark_napi_id(sk, skb); 1909 if (dst) { 1910 if (sk->sk_rx_dst_ifindex != skb->skb_iif || 1911 !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check, 1912 dst, 0)) { 1913 RCU_INIT_POINTER(sk->sk_rx_dst, NULL); 1914 dst_release(dst); 1915 } 1916 } 1917 tcp_rcv_established(sk, skb); 1918 return 0; 1919 } 1920 1921 if (tcp_checksum_complete(skb)) 1922 goto csum_err; 1923 1924 if (sk->sk_state == TCP_LISTEN) { 1925 struct sock *nsk = tcp_v4_cookie_check(sk, skb); 1926 1927 if (!nsk) 1928 return 0; 1929 if (nsk != sk) { 1930 reason = tcp_child_process(sk, nsk, skb); 1931 if (reason) { 1932 rsk = nsk; 1933 goto reset; 1934 } 1935 return 0; 1936 } 1937 } else 1938 sock_rps_save_rxhash(sk, skb); 1939 1940 reason = tcp_rcv_state_process(sk, skb); 1941 if (reason) { 1942 rsk = sk; 1943 goto reset; 1944 } 1945 return 0; 1946 1947 reset: 1948 tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason)); 1949 discard: 1950 sk_skb_reason_drop(sk, skb, reason); 1951 /* Be careful here. If this function gets more complicated and 1952 * gcc suffers from register pressure on the x86, sk (in %ebx) 1953 * might be destroyed here. This current version compiles correctly, 1954 * but you have been warned. 
1955 */ 1956 return 0; 1957 1958 csum_err: 1959 reason = SKB_DROP_REASON_TCP_CSUM; 1960 trace_tcp_bad_csum(skb); 1961 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 1962 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 1963 goto discard; 1964 } 1965 EXPORT_SYMBOL(tcp_v4_do_rcv); 1966 1967 int tcp_v4_early_demux(struct sk_buff *skb) 1968 { 1969 struct net *net = dev_net(skb->dev); 1970 const struct iphdr *iph; 1971 const struct tcphdr *th; 1972 struct sock *sk; 1973 1974 if (skb->pkt_type != PACKET_HOST) 1975 return 0; 1976 1977 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) 1978 return 0; 1979 1980 iph = ip_hdr(skb); 1981 th = tcp_hdr(skb); 1982 1983 if (th->doff < sizeof(struct tcphdr) / 4) 1984 return 0; 1985 1986 sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, 1987 iph->saddr, th->source, 1988 iph->daddr, ntohs(th->dest), 1989 skb->skb_iif, inet_sdif(skb)); 1990 if (sk) { 1991 skb->sk = sk; 1992 skb->destructor = sock_edemux; 1993 if (sk_fullsock(sk)) { 1994 struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst); 1995 1996 if (dst) 1997 dst = dst_check(dst, 0); 1998 if (dst && 1999 sk->sk_rx_dst_ifindex == skb->skb_iif) 2000 skb_dst_set_noref(skb, dst); 2001 } 2002 } 2003 return 0; 2004 } 2005 2006 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, 2007 enum skb_drop_reason *reason) 2008 { 2009 u32 tail_gso_size, tail_gso_segs; 2010 struct skb_shared_info *shinfo; 2011 const struct tcphdr *th; 2012 struct tcphdr *thtail; 2013 struct sk_buff *tail; 2014 unsigned int hdrlen; 2015 bool fragstolen; 2016 u32 gso_segs; 2017 u32 gso_size; 2018 u64 limit; 2019 int delta; 2020 2021 /* In case all data was pulled from skb frags (in __pskb_pull_tail()), 2022 * we can fix skb->truesize to its real value to avoid future drops. 2023 * This is valid because skb is not yet charged to the socket. 2024 * It has been noticed pure SACK packets were sometimes dropped 2025 * (if cooked by drivers without copybreak feature). 2026 */ 2027 skb_condense(skb); 2028 2029 skb_dst_drop(skb); 2030 2031 if (unlikely(tcp_checksum_complete(skb))) { 2032 bh_unlock_sock(sk); 2033 trace_tcp_bad_csum(skb); 2034 *reason = SKB_DROP_REASON_TCP_CSUM; 2035 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 2036 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 2037 return true; 2038 } 2039 2040 /* Attempt coalescing to last skb in backlog, even if we are 2041 * above the limits. 2042 * This is okay because skb capacity is limited to MAX_SKB_FRAGS. 
2043 */ 2044 th = (const struct tcphdr *)skb->data; 2045 hdrlen = th->doff * 4; 2046 2047 tail = sk->sk_backlog.tail; 2048 if (!tail) 2049 goto no_coalesce; 2050 thtail = (struct tcphdr *)tail->data; 2051 2052 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq || 2053 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield || 2054 ((TCP_SKB_CB(tail)->tcp_flags | 2055 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) || 2056 !((TCP_SKB_CB(tail)->tcp_flags & 2057 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) || 2058 ((TCP_SKB_CB(tail)->tcp_flags ^ 2059 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) || 2060 !tcp_skb_can_collapse_rx(tail, skb) || 2061 thtail->doff != th->doff || 2062 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th))) 2063 goto no_coalesce; 2064 2065 __skb_pull(skb, hdrlen); 2066 2067 shinfo = skb_shinfo(skb); 2068 gso_size = shinfo->gso_size ?: skb->len; 2069 gso_segs = shinfo->gso_segs ?: 1; 2070 2071 shinfo = skb_shinfo(tail); 2072 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen); 2073 tail_gso_segs = shinfo->gso_segs ?: 1; 2074 2075 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) { 2076 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq; 2077 2078 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) { 2079 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq; 2080 thtail->window = th->window; 2081 } 2082 2083 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and 2084 * thtail->fin, so that the fast path in tcp_rcv_established() 2085 * is not entered if we append a packet with a FIN. 2086 * SYN, RST, URG are not present. 2087 * ACK is set on both packets. 2088 * PSH : we do not really care in TCP stack, 2089 * at least for 'GRO' packets. 2090 */ 2091 thtail->fin |= th->fin; 2092 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; 2093 2094 if (TCP_SKB_CB(skb)->has_rxtstamp) { 2095 TCP_SKB_CB(tail)->has_rxtstamp = true; 2096 tail->tstamp = skb->tstamp; 2097 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp; 2098 } 2099 2100 /* Not as strict as GRO. We only need to carry mss max value */ 2101 shinfo->gso_size = max(gso_size, tail_gso_size); 2102 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF); 2103 2104 sk->sk_backlog.len += delta; 2105 __NET_INC_STATS(sock_net(sk), 2106 LINUX_MIB_TCPBACKLOGCOALESCE); 2107 kfree_skb_partial(skb, fragstolen); 2108 return false; 2109 } 2110 __skb_push(skb, hdrlen); 2111 2112 no_coalesce: 2113 /* sk->sk_backlog.len is reset only at the end of __release_sock(). 2114 * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach 2115 * sk_rcvbuf in normal conditions. 2116 */ 2117 limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1; 2118 2119 limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1; 2120 2121 /* Only socket owner can try to collapse/prune rx queues 2122 * to reduce memory overhead, so add a little headroom here. 2123 * Few sockets backlog are possibly concurrently non empty. 
2124 */ 2125 limit += 64 * 1024; 2126 2127 limit = min_t(u64, limit, UINT_MAX); 2128 2129 if (unlikely(sk_add_backlog(sk, skb, limit))) { 2130 bh_unlock_sock(sk); 2131 *reason = SKB_DROP_REASON_SOCKET_BACKLOG; 2132 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP); 2133 return true; 2134 } 2135 return false; 2136 } 2137 EXPORT_SYMBOL(tcp_add_backlog); 2138 2139 int tcp_filter(struct sock *sk, struct sk_buff *skb) 2140 { 2141 struct tcphdr *th = (struct tcphdr *)skb->data; 2142 2143 return sk_filter_trim_cap(sk, skb, th->doff * 4); 2144 } 2145 EXPORT_SYMBOL(tcp_filter); 2146 2147 static void tcp_v4_restore_cb(struct sk_buff *skb) 2148 { 2149 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4, 2150 sizeof(struct inet_skb_parm)); 2151 } 2152 2153 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, 2154 const struct tcphdr *th) 2155 { 2156 /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB() 2157 * barrier() makes sure compiler wont play fool^Waliasing games. 2158 */ 2159 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb), 2160 sizeof(struct inet_skb_parm)); 2161 barrier(); 2162 2163 TCP_SKB_CB(skb)->seq = ntohl(th->seq); 2164 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + 2165 skb->len - th->doff * 4); 2166 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); 2167 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); 2168 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); 2169 TCP_SKB_CB(skb)->sacked = 0; 2170 TCP_SKB_CB(skb)->has_rxtstamp = 2171 skb->tstamp || skb_hwtstamps(skb)->hwtstamp; 2172 } 2173 2174 /* 2175 * From tcp_input.c 2176 */ 2177 2178 int tcp_v4_rcv(struct sk_buff *skb) 2179 { 2180 struct net *net = dev_net(skb->dev); 2181 enum skb_drop_reason drop_reason; 2182 int sdif = inet_sdif(skb); 2183 int dif = inet_iif(skb); 2184 const struct iphdr *iph; 2185 const struct tcphdr *th; 2186 struct sock *sk = NULL; 2187 bool refcounted; 2188 int ret; 2189 u32 isn; 2190 2191 drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; 2192 if (skb->pkt_type != PACKET_HOST) 2193 goto discard_it; 2194 2195 /* Count it even if it's bad */ 2196 __TCP_INC_STATS(net, TCP_MIB_INSEGS); 2197 2198 if (!pskb_may_pull(skb, sizeof(struct tcphdr))) 2199 goto discard_it; 2200 2201 th = (const struct tcphdr *)skb->data; 2202 2203 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) { 2204 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL; 2205 goto bad_packet; 2206 } 2207 if (!pskb_may_pull(skb, th->doff * 4)) 2208 goto discard_it; 2209 2210 /* An explanation is required here, I think. 2211 * Packet length and doff are validated by header prediction, 2212 * provided case of th->doff==0 is eliminated. 2213 * So, we defer the checks. 
*/ 2214 2215 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo)) 2216 goto csum_error; 2217 2218 th = (const struct tcphdr *)skb->data; 2219 iph = ip_hdr(skb); 2220 lookup: 2221 sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo, 2222 skb, __tcp_hdrlen(th), th->source, 2223 th->dest, sdif, &refcounted); 2224 if (!sk) 2225 goto no_tcp_socket; 2226 2227 if (sk->sk_state == TCP_TIME_WAIT) 2228 goto do_time_wait; 2229 2230 if (sk->sk_state == TCP_NEW_SYN_RECV) { 2231 struct request_sock *req = inet_reqsk(sk); 2232 bool req_stolen = false; 2233 struct sock *nsk; 2234 2235 sk = req->rsk_listener; 2236 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) 2237 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2238 else 2239 drop_reason = tcp_inbound_hash(sk, req, skb, 2240 &iph->saddr, &iph->daddr, 2241 AF_INET, dif, sdif); 2242 if (unlikely(drop_reason)) { 2243 sk_drops_add(sk, skb); 2244 reqsk_put(req); 2245 goto discard_it; 2246 } 2247 if (tcp_checksum_complete(skb)) { 2248 reqsk_put(req); 2249 goto csum_error; 2250 } 2251 if (unlikely(sk->sk_state != TCP_LISTEN)) { 2252 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb); 2253 if (!nsk) { 2254 inet_csk_reqsk_queue_drop_and_put(sk, req); 2255 goto lookup; 2256 } 2257 sk = nsk; 2258 /* reuseport_migrate_sock() has already held one sk_refcnt 2259 * before returning. 2260 */ 2261 } else { 2262 /* We own a reference on the listener, increase it again 2263 * as we might lose it too soon. 2264 */ 2265 sock_hold(sk); 2266 } 2267 refcounted = true; 2268 nsk = NULL; 2269 if (!tcp_filter(sk, skb)) { 2270 th = (const struct tcphdr *)skb->data; 2271 iph = ip_hdr(skb); 2272 tcp_v4_fill_cb(skb, iph, th); 2273 nsk = tcp_check_req(sk, skb, req, false, &req_stolen); 2274 } else { 2275 drop_reason = SKB_DROP_REASON_SOCKET_FILTER; 2276 } 2277 if (!nsk) { 2278 reqsk_put(req); 2279 if (req_stolen) { 2280 /* Another cpu got exclusive access to req 2281 * and created a full blown socket. 2282 * Try to feed this packet to this socket 2283 * instead of discarding it. 
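				 * tcp_v4_restore_cb() below puts IPCB() back
				 * in place before the lookup is retried.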
2284 */ 2285 tcp_v4_restore_cb(skb); 2286 sock_put(sk); 2287 goto lookup; 2288 } 2289 goto discard_and_relse; 2290 } 2291 nf_reset_ct(skb); 2292 if (nsk == sk) { 2293 reqsk_put(req); 2294 tcp_v4_restore_cb(skb); 2295 } else { 2296 drop_reason = tcp_child_process(sk, nsk, skb); 2297 if (drop_reason) { 2298 enum sk_rst_reason rst_reason; 2299 2300 rst_reason = sk_rst_convert_drop_reason(drop_reason); 2301 tcp_v4_send_reset(nsk, skb, rst_reason); 2302 goto discard_and_relse; 2303 } 2304 sock_put(sk); 2305 return 0; 2306 } 2307 } 2308 2309 process: 2310 if (static_branch_unlikely(&ip4_min_ttl)) { 2311 /* min_ttl can be changed concurrently from do_ip_setsockopt() */ 2312 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) { 2313 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 2314 drop_reason = SKB_DROP_REASON_TCP_MINTTL; 2315 goto discard_and_relse; 2316 } 2317 } 2318 2319 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { 2320 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2321 goto discard_and_relse; 2322 } 2323 2324 drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr, 2325 AF_INET, dif, sdif); 2326 if (drop_reason) 2327 goto discard_and_relse; 2328 2329 nf_reset_ct(skb); 2330 2331 if (tcp_filter(sk, skb)) { 2332 drop_reason = SKB_DROP_REASON_SOCKET_FILTER; 2333 goto discard_and_relse; 2334 } 2335 th = (const struct tcphdr *)skb->data; 2336 iph = ip_hdr(skb); 2337 tcp_v4_fill_cb(skb, iph, th); 2338 2339 skb->dev = NULL; 2340 2341 if (sk->sk_state == TCP_LISTEN) { 2342 ret = tcp_v4_do_rcv(sk, skb); 2343 goto put_and_return; 2344 } 2345 2346 sk_incoming_cpu_update(sk); 2347 2348 bh_lock_sock_nested(sk); 2349 tcp_segs_in(tcp_sk(sk), skb); 2350 ret = 0; 2351 if (!sock_owned_by_user(sk)) { 2352 ret = tcp_v4_do_rcv(sk, skb); 2353 } else { 2354 if (tcp_add_backlog(sk, skb, &drop_reason)) 2355 goto discard_and_relse; 2356 } 2357 bh_unlock_sock(sk); 2358 2359 put_and_return: 2360 if (refcounted) 2361 sock_put(sk); 2362 2363 return ret; 2364 2365 no_tcp_socket: 2366 drop_reason = SKB_DROP_REASON_NO_SOCKET; 2367 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) 2368 goto discard_it; 2369 2370 tcp_v4_fill_cb(skb, iph, th); 2371 2372 if (tcp_checksum_complete(skb)) { 2373 csum_error: 2374 drop_reason = SKB_DROP_REASON_TCP_CSUM; 2375 trace_tcp_bad_csum(skb); 2376 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); 2377 bad_packet: 2378 __TCP_INC_STATS(net, TCP_MIB_INERRS); 2379 } else { 2380 tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason)); 2381 } 2382 2383 discard_it: 2384 SKB_DR_OR(drop_reason, NOT_SPECIFIED); 2385 /* Discard frame. 
*/ 2386 sk_skb_reason_drop(sk, skb, drop_reason); 2387 return 0; 2388 2389 discard_and_relse: 2390 sk_drops_add(sk, skb); 2391 if (refcounted) 2392 sock_put(sk); 2393 goto discard_it; 2394 2395 do_time_wait: 2396 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { 2397 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2398 inet_twsk_put(inet_twsk(sk)); 2399 goto discard_it; 2400 } 2401 2402 tcp_v4_fill_cb(skb, iph, th); 2403 2404 if (tcp_checksum_complete(skb)) { 2405 inet_twsk_put(inet_twsk(sk)); 2406 goto csum_error; 2407 } 2408 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn)) { 2409 case TCP_TW_SYN: { 2410 struct sock *sk2 = inet_lookup_listener(net, 2411 net->ipv4.tcp_death_row.hashinfo, 2412 skb, __tcp_hdrlen(th), 2413 iph->saddr, th->source, 2414 iph->daddr, th->dest, 2415 inet_iif(skb), 2416 sdif); 2417 if (sk2) { 2418 inet_twsk_deschedule_put(inet_twsk(sk)); 2419 sk = sk2; 2420 tcp_v4_restore_cb(skb); 2421 refcounted = false; 2422 __this_cpu_write(tcp_tw_isn, isn); 2423 goto process; 2424 } 2425 } 2426 /* to ACK */ 2427 fallthrough; 2428 case TCP_TW_ACK: 2429 tcp_v4_timewait_ack(sk, skb); 2430 break; 2431 case TCP_TW_RST: 2432 tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET); 2433 inet_twsk_deschedule_put(inet_twsk(sk)); 2434 goto discard_it; 2435 case TCP_TW_SUCCESS:; 2436 } 2437 goto discard_it; 2438 } 2439 2440 static struct timewait_sock_ops tcp_timewait_sock_ops = { 2441 .twsk_obj_size = sizeof(struct tcp_timewait_sock), 2442 .twsk_destructor= tcp_twsk_destructor, 2443 }; 2444 2445 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) 2446 { 2447 struct dst_entry *dst = skb_dst(skb); 2448 2449 if (dst && dst_hold_safe(dst)) { 2450 rcu_assign_pointer(sk->sk_rx_dst, dst); 2451 sk->sk_rx_dst_ifindex = skb->skb_iif; 2452 } 2453 } 2454 EXPORT_SYMBOL(inet_sk_rx_dst_set); 2455 2456 const struct inet_connection_sock_af_ops ipv4_specific = { 2457 .queue_xmit = ip_queue_xmit, 2458 .send_check = tcp_v4_send_check, 2459 .rebuild_header = inet_sk_rebuild_header, 2460 .sk_rx_dst_set = inet_sk_rx_dst_set, 2461 .conn_request = tcp_v4_conn_request, 2462 .syn_recv_sock = tcp_v4_syn_recv_sock, 2463 .net_header_len = sizeof(struct iphdr), 2464 .setsockopt = ip_setsockopt, 2465 .getsockopt = ip_getsockopt, 2466 .addr2sockaddr = inet_csk_addr2sockaddr, 2467 .sockaddr_len = sizeof(struct sockaddr_in), 2468 .mtu_reduced = tcp_v4_mtu_reduced, 2469 }; 2470 EXPORT_SYMBOL(ipv4_specific); 2471 2472 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) 2473 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { 2474 #ifdef CONFIG_TCP_MD5SIG 2475 .md5_lookup = tcp_v4_md5_lookup, 2476 .calc_md5_hash = tcp_v4_md5_hash_skb, 2477 .md5_parse = tcp_v4_parse_md5_keys, 2478 #endif 2479 #ifdef CONFIG_TCP_AO 2480 .ao_lookup = tcp_v4_ao_lookup, 2481 .calc_ao_hash = tcp_v4_ao_hash_skb, 2482 .ao_parse = tcp_v4_parse_ao, 2483 .ao_calc_key_sk = tcp_v4_ao_calc_key_sk, 2484 #endif 2485 }; 2486 #endif 2487 2488 /* NOTE: A lot of things set to zero explicitly by call to 2489 * sk_alloc() so need not be done here. 
2490 */ 2491 static int tcp_v4_init_sock(struct sock *sk) 2492 { 2493 struct inet_connection_sock *icsk = inet_csk(sk); 2494 2495 tcp_init_sock(sk); 2496 2497 icsk->icsk_af_ops = &ipv4_specific; 2498 2499 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) 2500 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific; 2501 #endif 2502 2503 return 0; 2504 } 2505 2506 #ifdef CONFIG_TCP_MD5SIG 2507 static void tcp_md5sig_info_free_rcu(struct rcu_head *head) 2508 { 2509 struct tcp_md5sig_info *md5sig; 2510 2511 md5sig = container_of(head, struct tcp_md5sig_info, rcu); 2512 kfree(md5sig); 2513 static_branch_slow_dec_deferred(&tcp_md5_needed); 2514 tcp_md5_release_sigpool(); 2515 } 2516 #endif 2517 2518 static void tcp_release_user_frags(struct sock *sk) 2519 { 2520 #ifdef CONFIG_PAGE_POOL 2521 unsigned long index; 2522 void *netmem; 2523 2524 xa_for_each(&sk->sk_user_frags, index, netmem) 2525 WARN_ON_ONCE(!napi_pp_put_page((__force netmem_ref)netmem)); 2526 #endif 2527 } 2528 2529 void tcp_v4_destroy_sock(struct sock *sk) 2530 { 2531 struct tcp_sock *tp = tcp_sk(sk); 2532 2533 tcp_release_user_frags(sk); 2534 2535 xa_destroy(&sk->sk_user_frags); 2536 2537 trace_tcp_destroy_sock(sk); 2538 2539 tcp_clear_xmit_timers(sk); 2540 2541 tcp_cleanup_congestion_control(sk); 2542 2543 tcp_cleanup_ulp(sk); 2544 2545 /* Cleanup up the write buffer. */ 2546 tcp_write_queue_purge(sk); 2547 2548 /* Check if we want to disable active TFO */ 2549 tcp_fastopen_active_disable_ofo_check(sk); 2550 2551 /* Cleans up our, hopefully empty, out_of_order_queue. */ 2552 skb_rbtree_purge(&tp->out_of_order_queue); 2553 2554 #ifdef CONFIG_TCP_MD5SIG 2555 /* Clean up the MD5 key list, if any */ 2556 if (tp->md5sig_info) { 2557 struct tcp_md5sig_info *md5sig; 2558 2559 md5sig = rcu_dereference_protected(tp->md5sig_info, 1); 2560 tcp_clear_md5_list(sk); 2561 call_rcu(&md5sig->rcu, tcp_md5sig_info_free_rcu); 2562 rcu_assign_pointer(tp->md5sig_info, NULL); 2563 } 2564 #endif 2565 tcp_ao_destroy_sock(sk, false); 2566 2567 /* Clean up a referenced TCP bind bucket. */ 2568 if (inet_csk(sk)->icsk_bind_hash) 2569 inet_put_port(sk); 2570 2571 BUG_ON(rcu_access_pointer(tp->fastopen_rsk)); 2572 2573 /* If socket is aborted during connect operation */ 2574 tcp_free_fastopen_req(tp); 2575 tcp_fastopen_destroy_cipher(sk); 2576 tcp_saved_syn_free(tp); 2577 2578 sk_sockets_allocated_dec(sk); 2579 } 2580 EXPORT_SYMBOL(tcp_v4_destroy_sock); 2581 2582 #ifdef CONFIG_PROC_FS 2583 /* Proc filesystem TCP sock list dumping. */ 2584 2585 static unsigned short seq_file_family(const struct seq_file *seq); 2586 2587 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk) 2588 { 2589 unsigned short family = seq_file_family(seq); 2590 2591 /* AF_UNSPEC is used as a match all */ 2592 return ((family == AF_UNSPEC || family == sk->sk_family) && 2593 net_eq(sock_net(sk), seq_file_net(seq))); 2594 } 2595 2596 /* Find a non empty bucket (starting from st->bucket) 2597 * and return the first sk from it. 
2598 */ 2599 static void *listening_get_first(struct seq_file *seq) 2600 { 2601 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2602 struct tcp_iter_state *st = seq->private; 2603 2604 st->offset = 0; 2605 for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) { 2606 struct inet_listen_hashbucket *ilb2; 2607 struct hlist_nulls_node *node; 2608 struct sock *sk; 2609 2610 ilb2 = &hinfo->lhash2[st->bucket]; 2611 if (hlist_nulls_empty(&ilb2->nulls_head)) 2612 continue; 2613 2614 spin_lock(&ilb2->lock); 2615 sk_nulls_for_each(sk, node, &ilb2->nulls_head) { 2616 if (seq_sk_match(seq, sk)) 2617 return sk; 2618 } 2619 spin_unlock(&ilb2->lock); 2620 } 2621 2622 return NULL; 2623 } 2624 2625 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket). 2626 * If "cur" is the last one in the st->bucket, 2627 * call listening_get_first() to return the first sk of the next 2628 * non empty bucket. 2629 */ 2630 static void *listening_get_next(struct seq_file *seq, void *cur) 2631 { 2632 struct tcp_iter_state *st = seq->private; 2633 struct inet_listen_hashbucket *ilb2; 2634 struct hlist_nulls_node *node; 2635 struct inet_hashinfo *hinfo; 2636 struct sock *sk = cur; 2637 2638 ++st->num; 2639 ++st->offset; 2640 2641 sk = sk_nulls_next(sk); 2642 sk_nulls_for_each_from(sk, node) { 2643 if (seq_sk_match(seq, sk)) 2644 return sk; 2645 } 2646 2647 hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2648 ilb2 = &hinfo->lhash2[st->bucket]; 2649 spin_unlock(&ilb2->lock); 2650 ++st->bucket; 2651 return listening_get_first(seq); 2652 } 2653 2654 static void *listening_get_idx(struct seq_file *seq, loff_t *pos) 2655 { 2656 struct tcp_iter_state *st = seq->private; 2657 void *rc; 2658 2659 st->bucket = 0; 2660 st->offset = 0; 2661 rc = listening_get_first(seq); 2662 2663 while (rc && *pos) { 2664 rc = listening_get_next(seq, rc); 2665 --*pos; 2666 } 2667 return rc; 2668 } 2669 2670 static inline bool empty_bucket(struct inet_hashinfo *hinfo, 2671 const struct tcp_iter_state *st) 2672 { 2673 return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain); 2674 } 2675 2676 /* 2677 * Get first established socket starting from bucket given in st->bucket. 2678 * If st->bucket is zero, the very first socket in the hash is returned. 
2679 */ 2680 static void *established_get_first(struct seq_file *seq) 2681 { 2682 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2683 struct tcp_iter_state *st = seq->private; 2684 2685 st->offset = 0; 2686 for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) { 2687 struct sock *sk; 2688 struct hlist_nulls_node *node; 2689 spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket); 2690 2691 cond_resched(); 2692 2693 /* Lockless fast path for the common case of empty buckets */ 2694 if (empty_bucket(hinfo, st)) 2695 continue; 2696 2697 spin_lock_bh(lock); 2698 sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) { 2699 if (seq_sk_match(seq, sk)) 2700 return sk; 2701 } 2702 spin_unlock_bh(lock); 2703 } 2704 2705 return NULL; 2706 } 2707 2708 static void *established_get_next(struct seq_file *seq, void *cur) 2709 { 2710 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2711 struct tcp_iter_state *st = seq->private; 2712 struct hlist_nulls_node *node; 2713 struct sock *sk = cur; 2714 2715 ++st->num; 2716 ++st->offset; 2717 2718 sk = sk_nulls_next(sk); 2719 2720 sk_nulls_for_each_from(sk, node) { 2721 if (seq_sk_match(seq, sk)) 2722 return sk; 2723 } 2724 2725 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 2726 ++st->bucket; 2727 return established_get_first(seq); 2728 } 2729 2730 static void *established_get_idx(struct seq_file *seq, loff_t pos) 2731 { 2732 struct tcp_iter_state *st = seq->private; 2733 void *rc; 2734 2735 st->bucket = 0; 2736 rc = established_get_first(seq); 2737 2738 while (rc && pos) { 2739 rc = established_get_next(seq, rc); 2740 --pos; 2741 } 2742 return rc; 2743 } 2744 2745 static void *tcp_get_idx(struct seq_file *seq, loff_t pos) 2746 { 2747 void *rc; 2748 struct tcp_iter_state *st = seq->private; 2749 2750 st->state = TCP_SEQ_STATE_LISTENING; 2751 rc = listening_get_idx(seq, &pos); 2752 2753 if (!rc) { 2754 st->state = TCP_SEQ_STATE_ESTABLISHED; 2755 rc = established_get_idx(seq, pos); 2756 } 2757 2758 return rc; 2759 } 2760 2761 static void *tcp_seek_last_pos(struct seq_file *seq) 2762 { 2763 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2764 struct tcp_iter_state *st = seq->private; 2765 int bucket = st->bucket; 2766 int offset = st->offset; 2767 int orig_num = st->num; 2768 void *rc = NULL; 2769 2770 switch (st->state) { 2771 case TCP_SEQ_STATE_LISTENING: 2772 if (st->bucket > hinfo->lhash2_mask) 2773 break; 2774 rc = listening_get_first(seq); 2775 while (offset-- && rc && bucket == st->bucket) 2776 rc = listening_get_next(seq, rc); 2777 if (rc) 2778 break; 2779 st->bucket = 0; 2780 st->state = TCP_SEQ_STATE_ESTABLISHED; 2781 fallthrough; 2782 case TCP_SEQ_STATE_ESTABLISHED: 2783 if (st->bucket > hinfo->ehash_mask) 2784 break; 2785 rc = established_get_first(seq); 2786 while (offset-- && rc && bucket == st->bucket) 2787 rc = established_get_next(seq, rc); 2788 } 2789 2790 st->num = orig_num; 2791 2792 return rc; 2793 } 2794 2795 void *tcp_seq_start(struct seq_file *seq, loff_t *pos) 2796 { 2797 struct tcp_iter_state *st = seq->private; 2798 void *rc; 2799 2800 if (*pos && *pos == st->last_pos) { 2801 rc = tcp_seek_last_pos(seq); 2802 if (rc) 2803 goto out; 2804 } 2805 2806 st->state = TCP_SEQ_STATE_LISTENING; 2807 st->num = 0; 2808 st->bucket = 0; 2809 st->offset = 0; 2810 rc = *pos ? 
tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; 2811 2812 out: 2813 st->last_pos = *pos; 2814 return rc; 2815 } 2816 EXPORT_SYMBOL(tcp_seq_start); 2817 2818 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2819 { 2820 struct tcp_iter_state *st = seq->private; 2821 void *rc = NULL; 2822 2823 if (v == SEQ_START_TOKEN) { 2824 rc = tcp_get_idx(seq, 0); 2825 goto out; 2826 } 2827 2828 switch (st->state) { 2829 case TCP_SEQ_STATE_LISTENING: 2830 rc = listening_get_next(seq, v); 2831 if (!rc) { 2832 st->state = TCP_SEQ_STATE_ESTABLISHED; 2833 st->bucket = 0; 2834 st->offset = 0; 2835 rc = established_get_first(seq); 2836 } 2837 break; 2838 case TCP_SEQ_STATE_ESTABLISHED: 2839 rc = established_get_next(seq, v); 2840 break; 2841 } 2842 out: 2843 ++*pos; 2844 st->last_pos = *pos; 2845 return rc; 2846 } 2847 EXPORT_SYMBOL(tcp_seq_next); 2848 2849 void tcp_seq_stop(struct seq_file *seq, void *v) 2850 { 2851 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2852 struct tcp_iter_state *st = seq->private; 2853 2854 switch (st->state) { 2855 case TCP_SEQ_STATE_LISTENING: 2856 if (v != SEQ_START_TOKEN) 2857 spin_unlock(&hinfo->lhash2[st->bucket].lock); 2858 break; 2859 case TCP_SEQ_STATE_ESTABLISHED: 2860 if (v) 2861 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 2862 break; 2863 } 2864 } 2865 EXPORT_SYMBOL(tcp_seq_stop); 2866 2867 static void get_openreq4(const struct request_sock *req, 2868 struct seq_file *f, int i) 2869 { 2870 const struct inet_request_sock *ireq = inet_rsk(req); 2871 long delta = req->rsk_timer.expires - jiffies; 2872 2873 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2874 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK", 2875 i, 2876 ireq->ir_loc_addr, 2877 ireq->ir_num, 2878 ireq->ir_rmt_addr, 2879 ntohs(ireq->ir_rmt_port), 2880 TCP_SYN_RECV, 2881 0, 0, /* could print option size, but that is af dependent. 
*/ 2882 1, /* timers active (only the expire timer) */ 2883 jiffies_delta_to_clock_t(delta), 2884 req->num_timeout, 2885 from_kuid_munged(seq_user_ns(f), 2886 sock_i_uid(req->rsk_listener)), 2887 0, /* non standard timer */ 2888 0, /* open_requests have no inode */ 2889 0, 2890 req); 2891 } 2892 2893 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) 2894 { 2895 int timer_active; 2896 unsigned long timer_expires; 2897 const struct tcp_sock *tp = tcp_sk(sk); 2898 const struct inet_connection_sock *icsk = inet_csk(sk); 2899 const struct inet_sock *inet = inet_sk(sk); 2900 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; 2901 __be32 dest = inet->inet_daddr; 2902 __be32 src = inet->inet_rcv_saddr; 2903 __u16 destp = ntohs(inet->inet_dport); 2904 __u16 srcp = ntohs(inet->inet_sport); 2905 u8 icsk_pending; 2906 int rx_queue; 2907 int state; 2908 2909 icsk_pending = smp_load_acquire(&icsk->icsk_pending); 2910 if (icsk_pending == ICSK_TIME_RETRANS || 2911 icsk_pending == ICSK_TIME_REO_TIMEOUT || 2912 icsk_pending == ICSK_TIME_LOSS_PROBE) { 2913 timer_active = 1; 2914 timer_expires = icsk->icsk_timeout; 2915 } else if (icsk_pending == ICSK_TIME_PROBE0) { 2916 timer_active = 4; 2917 timer_expires = icsk->icsk_timeout; 2918 } else if (timer_pending(&sk->sk_timer)) { 2919 timer_active = 2; 2920 timer_expires = sk->sk_timer.expires; 2921 } else { 2922 timer_active = 0; 2923 timer_expires = jiffies; 2924 } 2925 2926 state = inet_sk_state_load(sk); 2927 if (state == TCP_LISTEN) 2928 rx_queue = READ_ONCE(sk->sk_ack_backlog); 2929 else 2930 /* Because we don't lock the socket, 2931 * we might find a transient negative value. 2932 */ 2933 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) - 2934 READ_ONCE(tp->copied_seq), 0); 2935 2936 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " 2937 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d", 2938 i, src, srcp, dest, destp, state, 2939 READ_ONCE(tp->write_seq) - tp->snd_una, 2940 rx_queue, 2941 timer_active, 2942 jiffies_delta_to_clock_t(timer_expires - jiffies), 2943 icsk->icsk_retransmits, 2944 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)), 2945 icsk->icsk_probes_out, 2946 sock_i_ino(sk), 2947 refcount_read(&sk->sk_refcnt), sk, 2948 jiffies_to_clock_t(icsk->icsk_rto), 2949 jiffies_to_clock_t(icsk->icsk_ack.ato), 2950 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk), 2951 tcp_snd_cwnd(tp), 2952 state == TCP_LISTEN ? 2953 fastopenq->max_qlen : 2954 (tcp_in_initial_slowstart(tp) ? 
-1 : tp->snd_ssthresh)); 2955 } 2956 2957 static void get_timewait4_sock(const struct inet_timewait_sock *tw, 2958 struct seq_file *f, int i) 2959 { 2960 long delta = tw->tw_timer.expires - jiffies; 2961 __be32 dest, src; 2962 __u16 destp, srcp; 2963 2964 dest = tw->tw_daddr; 2965 src = tw->tw_rcv_saddr; 2966 destp = ntohs(tw->tw_dport); 2967 srcp = ntohs(tw->tw_sport); 2968 2969 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2970 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK", 2971 i, src, srcp, dest, destp, READ_ONCE(tw->tw_substate), 0, 0, 2972 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, 2973 refcount_read(&tw->tw_refcnt), tw); 2974 } 2975 2976 #define TMPSZ 150 2977 2978 static int tcp4_seq_show(struct seq_file *seq, void *v) 2979 { 2980 struct tcp_iter_state *st; 2981 struct sock *sk = v; 2982 2983 seq_setwidth(seq, TMPSZ - 1); 2984 if (v == SEQ_START_TOKEN) { 2985 seq_puts(seq, " sl local_address rem_address st tx_queue " 2986 "rx_queue tr tm->when retrnsmt uid timeout " 2987 "inode"); 2988 goto out; 2989 } 2990 st = seq->private; 2991 2992 if (sk->sk_state == TCP_TIME_WAIT) 2993 get_timewait4_sock(v, seq, st->num); 2994 else if (sk->sk_state == TCP_NEW_SYN_RECV) 2995 get_openreq4(v, seq, st->num); 2996 else 2997 get_tcp4_sock(v, seq, st->num); 2998 out: 2999 seq_pad(seq, '\n'); 3000 return 0; 3001 } 3002 3003 #ifdef CONFIG_BPF_SYSCALL 3004 struct bpf_tcp_iter_state { 3005 struct tcp_iter_state state; 3006 unsigned int cur_sk; 3007 unsigned int end_sk; 3008 unsigned int max_sk; 3009 struct sock **batch; 3010 bool st_bucket_done; 3011 }; 3012 3013 struct bpf_iter__tcp { 3014 __bpf_md_ptr(struct bpf_iter_meta *, meta); 3015 __bpf_md_ptr(struct sock_common *, sk_common); 3016 uid_t uid __aligned(8); 3017 }; 3018 3019 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, 3020 struct sock_common *sk_common, uid_t uid) 3021 { 3022 struct bpf_iter__tcp ctx; 3023 3024 meta->seq_num--; /* skip SEQ_START_TOKEN */ 3025 ctx.meta = meta; 3026 ctx.sk_common = sk_common; 3027 ctx.uid = uid; 3028 return bpf_iter_run_prog(prog, &ctx); 3029 } 3030 3031 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter) 3032 { 3033 while (iter->cur_sk < iter->end_sk) 3034 sock_gen_put(iter->batch[iter->cur_sk++]); 3035 } 3036 3037 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter, 3038 unsigned int new_batch_sz) 3039 { 3040 struct sock **new_batch; 3041 3042 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, 3043 GFP_USER | __GFP_NOWARN); 3044 if (!new_batch) 3045 return -ENOMEM; 3046 3047 bpf_iter_tcp_put_batch(iter); 3048 kvfree(iter->batch); 3049 iter->batch = new_batch; 3050 iter->max_sk = new_batch_sz; 3051 3052 return 0; 3053 } 3054 3055 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq, 3056 struct sock *start_sk) 3057 { 3058 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 3059 struct bpf_tcp_iter_state *iter = seq->private; 3060 struct tcp_iter_state *st = &iter->state; 3061 struct hlist_nulls_node *node; 3062 unsigned int expected = 1; 3063 struct sock *sk; 3064 3065 sock_hold(start_sk); 3066 iter->batch[iter->end_sk++] = start_sk; 3067 3068 sk = sk_nulls_next(start_sk); 3069 sk_nulls_for_each_from(sk, node) { 3070 if (seq_sk_match(seq, sk)) { 3071 if (iter->end_sk < iter->max_sk) { 3072 sock_hold(sk); 3073 iter->batch[iter->end_sk++] = sk; 3074 } 3075 expected++; 3076 } 3077 } 3078 spin_unlock(&hinfo->lhash2[st->bucket].lock); 3079 3080 return expected; 3081 } 3082 3083 static unsigned int 
bpf_iter_tcp_established_batch(struct seq_file *seq, 3084 struct sock *start_sk) 3085 { 3086 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 3087 struct bpf_tcp_iter_state *iter = seq->private; 3088 struct tcp_iter_state *st = &iter->state; 3089 struct hlist_nulls_node *node; 3090 unsigned int expected = 1; 3091 struct sock *sk; 3092 3093 sock_hold(start_sk); 3094 iter->batch[iter->end_sk++] = start_sk; 3095 3096 sk = sk_nulls_next(start_sk); 3097 sk_nulls_for_each_from(sk, node) { 3098 if (seq_sk_match(seq, sk)) { 3099 if (iter->end_sk < iter->max_sk) { 3100 sock_hold(sk); 3101 iter->batch[iter->end_sk++] = sk; 3102 } 3103 expected++; 3104 } 3105 } 3106 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 3107 3108 return expected; 3109 } 3110 3111 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq) 3112 { 3113 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 3114 struct bpf_tcp_iter_state *iter = seq->private; 3115 struct tcp_iter_state *st = &iter->state; 3116 unsigned int expected; 3117 bool resized = false; 3118 struct sock *sk; 3119 3120 /* The st->bucket is done. Directly advance to the next 3121 * bucket instead of having the tcp_seek_last_pos() to skip 3122 * one by one in the current bucket and eventually find out 3123 * it has to advance to the next bucket. 3124 */ 3125 if (iter->st_bucket_done) { 3126 st->offset = 0; 3127 st->bucket++; 3128 if (st->state == TCP_SEQ_STATE_LISTENING && 3129 st->bucket > hinfo->lhash2_mask) { 3130 st->state = TCP_SEQ_STATE_ESTABLISHED; 3131 st->bucket = 0; 3132 } 3133 } 3134 3135 again: 3136 /* Get a new batch */ 3137 iter->cur_sk = 0; 3138 iter->end_sk = 0; 3139 iter->st_bucket_done = false; 3140 3141 sk = tcp_seek_last_pos(seq); 3142 if (!sk) 3143 return NULL; /* Done */ 3144 3145 if (st->state == TCP_SEQ_STATE_LISTENING) 3146 expected = bpf_iter_tcp_listening_batch(seq, sk); 3147 else 3148 expected = bpf_iter_tcp_established_batch(seq, sk); 3149 3150 if (iter->end_sk == expected) { 3151 iter->st_bucket_done = true; 3152 return sk; 3153 } 3154 3155 if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) { 3156 resized = true; 3157 goto again; 3158 } 3159 3160 return sk; 3161 } 3162 3163 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos) 3164 { 3165 /* bpf iter does not support lseek, so it always 3166 * continue from where it was stop()-ped. 3167 */ 3168 if (*pos) 3169 return bpf_iter_tcp_batch(seq); 3170 3171 return SEQ_START_TOKEN; 3172 } 3173 3174 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3175 { 3176 struct bpf_tcp_iter_state *iter = seq->private; 3177 struct tcp_iter_state *st = &iter->state; 3178 struct sock *sk; 3179 3180 /* Whenever seq_next() is called, the iter->cur_sk is 3181 * done with seq_show(), so advance to the next sk in 3182 * the batch. 3183 */ 3184 if (iter->cur_sk < iter->end_sk) { 3185 /* Keeping st->num consistent in tcp_iter_state. 3186 * bpf_iter_tcp does not use st->num. 3187 * meta.seq_num is used instead. 3188 */ 3189 st->num++; 3190 /* Move st->offset to the next sk in the bucket such that 3191 * the future start() will resume at st->offset in 3192 * st->bucket. See tcp_seek_last_pos(). 3193 */ 3194 st->offset++; 3195 sock_gen_put(iter->batch[iter->cur_sk++]); 3196 } 3197 3198 if (iter->cur_sk < iter->end_sk) 3199 sk = iter->batch[iter->cur_sk]; 3200 else 3201 sk = bpf_iter_tcp_batch(seq); 3202 3203 ++*pos; 3204 /* Keeping st->last_pos consistent in tcp_iter_state. 
3205 * bpf iter does not do lseek, so st->last_pos always equals to *pos. 3206 */ 3207 st->last_pos = *pos; 3208 return sk; 3209 } 3210 3211 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v) 3212 { 3213 struct bpf_iter_meta meta; 3214 struct bpf_prog *prog; 3215 struct sock *sk = v; 3216 uid_t uid; 3217 int ret; 3218 3219 if (v == SEQ_START_TOKEN) 3220 return 0; 3221 3222 if (sk_fullsock(sk)) 3223 lock_sock(sk); 3224 3225 if (unlikely(sk_unhashed(sk))) { 3226 ret = SEQ_SKIP; 3227 goto unlock; 3228 } 3229 3230 if (sk->sk_state == TCP_TIME_WAIT) { 3231 uid = 0; 3232 } else if (sk->sk_state == TCP_NEW_SYN_RECV) { 3233 const struct request_sock *req = v; 3234 3235 uid = from_kuid_munged(seq_user_ns(seq), 3236 sock_i_uid(req->rsk_listener)); 3237 } else { 3238 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); 3239 } 3240 3241 meta.seq = seq; 3242 prog = bpf_iter_get_info(&meta, false); 3243 ret = tcp_prog_seq_show(prog, &meta, v, uid); 3244 3245 unlock: 3246 if (sk_fullsock(sk)) 3247 release_sock(sk); 3248 return ret; 3249 3250 } 3251 3252 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v) 3253 { 3254 struct bpf_tcp_iter_state *iter = seq->private; 3255 struct bpf_iter_meta meta; 3256 struct bpf_prog *prog; 3257 3258 if (!v) { 3259 meta.seq = seq; 3260 prog = bpf_iter_get_info(&meta, true); 3261 if (prog) 3262 (void)tcp_prog_seq_show(prog, &meta, v, 0); 3263 } 3264 3265 if (iter->cur_sk < iter->end_sk) { 3266 bpf_iter_tcp_put_batch(iter); 3267 iter->st_bucket_done = false; 3268 } 3269 } 3270 3271 static const struct seq_operations bpf_iter_tcp_seq_ops = { 3272 .show = bpf_iter_tcp_seq_show, 3273 .start = bpf_iter_tcp_seq_start, 3274 .next = bpf_iter_tcp_seq_next, 3275 .stop = bpf_iter_tcp_seq_stop, 3276 }; 3277 #endif 3278 static unsigned short seq_file_family(const struct seq_file *seq) 3279 { 3280 const struct tcp_seq_afinfo *afinfo; 3281 3282 #ifdef CONFIG_BPF_SYSCALL 3283 /* Iterated from bpf_iter. Let the bpf prog to filter instead. */ 3284 if (seq->op == &bpf_iter_tcp_seq_ops) 3285 return AF_UNSPEC; 3286 #endif 3287 3288 /* Iterated from proc fs */ 3289 afinfo = pde_data(file_inode(seq->file)); 3290 return afinfo->family; 3291 } 3292 3293 static const struct seq_operations tcp4_seq_ops = { 3294 .show = tcp4_seq_show, 3295 .start = tcp_seq_start, 3296 .next = tcp_seq_next, 3297 .stop = tcp_seq_stop, 3298 }; 3299 3300 static struct tcp_seq_afinfo tcp4_seq_afinfo = { 3301 .family = AF_INET, 3302 }; 3303 3304 static int __net_init tcp4_proc_init_net(struct net *net) 3305 { 3306 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops, 3307 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo)) 3308 return -ENOMEM; 3309 return 0; 3310 } 3311 3312 static void __net_exit tcp4_proc_exit_net(struct net *net) 3313 { 3314 remove_proc_entry("tcp", net->proc_net); 3315 } 3316 3317 static struct pernet_operations tcp4_net_ops = { 3318 .init = tcp4_proc_init_net, 3319 .exit = tcp4_proc_exit_net, 3320 }; 3321 3322 int __init tcp4_proc_init(void) 3323 { 3324 return register_pernet_subsys(&tcp4_net_ops); 3325 } 3326 3327 void tcp4_proc_exit(void) 3328 { 3329 unregister_pernet_subsys(&tcp4_net_ops); 3330 } 3331 #endif /* CONFIG_PROC_FS */ 3332 3333 /* @wake is one when sk_stream_write_space() calls us. 3334 * This sends EPOLLOUT only if notsent_bytes is half the limit. 3335 * This mimics the strategy used in sock_def_write_space(). 
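 * In other words, the socket counts as writable while
 * (notsent_bytes << wake) stays below tcp_notsent_lowat(tp).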
3336 */ 3337 bool tcp_stream_memory_free(const struct sock *sk, int wake) 3338 { 3339 const struct tcp_sock *tp = tcp_sk(sk); 3340 u32 notsent_bytes = READ_ONCE(tp->write_seq) - 3341 READ_ONCE(tp->snd_nxt); 3342 3343 return (notsent_bytes << wake) < tcp_notsent_lowat(tp); 3344 } 3345 EXPORT_SYMBOL(tcp_stream_memory_free); 3346 3347 struct proto tcp_prot = { 3348 .name = "TCP", 3349 .owner = THIS_MODULE, 3350 .close = tcp_close, 3351 .pre_connect = tcp_v4_pre_connect, 3352 .connect = tcp_v4_connect, 3353 .disconnect = tcp_disconnect, 3354 .accept = inet_csk_accept, 3355 .ioctl = tcp_ioctl, 3356 .init = tcp_v4_init_sock, 3357 .destroy = tcp_v4_destroy_sock, 3358 .shutdown = tcp_shutdown, 3359 .setsockopt = tcp_setsockopt, 3360 .getsockopt = tcp_getsockopt, 3361 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt, 3362 .keepalive = tcp_set_keepalive, 3363 .recvmsg = tcp_recvmsg, 3364 .sendmsg = tcp_sendmsg, 3365 .splice_eof = tcp_splice_eof, 3366 .backlog_rcv = tcp_v4_do_rcv, 3367 .release_cb = tcp_release_cb, 3368 .hash = inet_hash, 3369 .unhash = inet_unhash, 3370 .get_port = inet_csk_get_port, 3371 .put_port = inet_put_port, 3372 #ifdef CONFIG_BPF_SYSCALL 3373 .psock_update_sk_prot = tcp_bpf_update_proto, 3374 #endif 3375 .enter_memory_pressure = tcp_enter_memory_pressure, 3376 .leave_memory_pressure = tcp_leave_memory_pressure, 3377 .stream_memory_free = tcp_stream_memory_free, 3378 .sockets_allocated = &tcp_sockets_allocated, 3379 .orphan_count = &tcp_orphan_count, 3380 3381 .memory_allocated = &tcp_memory_allocated, 3382 .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, 3383 3384 .memory_pressure = &tcp_memory_pressure, 3385 .sysctl_mem = sysctl_tcp_mem, 3386 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), 3387 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), 3388 .max_header = MAX_TCP_HEADER, 3389 .obj_size = sizeof(struct tcp_sock), 3390 .slab_flags = SLAB_TYPESAFE_BY_RCU, 3391 .twsk_prot = &tcp_timewait_sock_ops, 3392 .rsk_prot = &tcp_request_sock_ops, 3393 .h.hashinfo = NULL, 3394 .no_autobind = true, 3395 .diag_destroy = tcp_abort, 3396 }; 3397 EXPORT_SYMBOL(tcp_prot); 3398 3399 static void __net_exit tcp_sk_exit(struct net *net) 3400 { 3401 if (net->ipv4.tcp_congestion_control) 3402 bpf_module_put(net->ipv4.tcp_congestion_control, 3403 net->ipv4.tcp_congestion_control->owner); 3404 } 3405 3406 static void __net_init tcp_set_hashinfo(struct net *net) 3407 { 3408 struct inet_hashinfo *hinfo; 3409 unsigned int ehash_entries; 3410 struct net *old_net; 3411 3412 if (net_eq(net, &init_net)) 3413 goto fallback; 3414 3415 old_net = current->nsproxy->net_ns; 3416 ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries); 3417 if (!ehash_entries) 3418 goto fallback; 3419 3420 ehash_entries = roundup_pow_of_two(ehash_entries); 3421 hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries); 3422 if (!hinfo) { 3423 pr_warn("Failed to allocate TCP ehash (entries: %u) " 3424 "for a netns, fallback to the global one\n", 3425 ehash_entries); 3426 fallback: 3427 hinfo = &tcp_hashinfo; 3428 ehash_entries = tcp_hashinfo.ehash_mask + 1; 3429 } 3430 3431 net->ipv4.tcp_death_row.hashinfo = hinfo; 3432 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2; 3433 net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128); 3434 } 3435 3436 static int __net_init tcp_sk_init(struct net *net) 3437 { 3438 net->ipv4.sysctl_tcp_ecn = 2; 3439 net->ipv4.sysctl_tcp_ecn_fallback = 1; 3440 3441 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; 3442 
net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS; 3443 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; 3444 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL; 3445 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS; 3446 3447 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME; 3448 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES; 3449 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL; 3450 3451 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES; 3452 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES; 3453 net->ipv4.sysctl_tcp_syncookies = 1; 3454 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH; 3455 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1; 3456 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2; 3457 net->ipv4.sysctl_tcp_orphan_retries = 0; 3458 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; 3459 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; 3460 net->ipv4.sysctl_tcp_tw_reuse = 2; 3461 net->ipv4.sysctl_tcp_tw_reuse_delay = 1 * MSEC_PER_SEC; 3462 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1; 3463 3464 refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1); 3465 tcp_set_hashinfo(net); 3466 3467 net->ipv4.sysctl_tcp_sack = 1; 3468 net->ipv4.sysctl_tcp_window_scaling = 1; 3469 net->ipv4.sysctl_tcp_timestamps = 1; 3470 net->ipv4.sysctl_tcp_early_retrans = 3; 3471 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION; 3472 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */ 3473 net->ipv4.sysctl_tcp_retrans_collapse = 1; 3474 net->ipv4.sysctl_tcp_max_reordering = 300; 3475 net->ipv4.sysctl_tcp_dsack = 1; 3476 net->ipv4.sysctl_tcp_app_win = 31; 3477 net->ipv4.sysctl_tcp_adv_win_scale = 1; 3478 net->ipv4.sysctl_tcp_frto = 2; 3479 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1; 3480 /* This limits the percentage of the congestion window which we 3481 * will allow a single TSO frame to consume. Building TSO frames 3482 * which are too large can cause TCP streams to be bursty. 3483 */ 3484 net->ipv4.sysctl_tcp_tso_win_divisor = 3; 3485 /* Default TSQ limit of 16 TSO segments */ 3486 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536; 3487 3488 /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. 
*/ 3489 net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX; 3490 3491 net->ipv4.sysctl_tcp_min_tso_segs = 2; 3492 net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */ 3493 net->ipv4.sysctl_tcp_min_rtt_wlen = 300; 3494 net->ipv4.sysctl_tcp_autocorking = 1; 3495 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; 3496 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; 3497 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120; 3498 if (net != &init_net) { 3499 memcpy(net->ipv4.sysctl_tcp_rmem, 3500 init_net.ipv4.sysctl_tcp_rmem, 3501 sizeof(init_net.ipv4.sysctl_tcp_rmem)); 3502 memcpy(net->ipv4.sysctl_tcp_wmem, 3503 init_net.ipv4.sysctl_tcp_wmem, 3504 sizeof(init_net.ipv4.sysctl_tcp_wmem)); 3505 } 3506 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC; 3507 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC; 3508 net->ipv4.sysctl_tcp_comp_sack_nr = 44; 3509 net->ipv4.sysctl_tcp_backlog_ack_defer = 1; 3510 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; 3511 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0; 3512 atomic_set(&net->ipv4.tfo_active_disable_times, 0); 3513 3514 /* Set default values for PLB */ 3515 net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */ 3516 net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3; 3517 net->ipv4.sysctl_tcp_plb_rehash_rounds = 12; 3518 net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60; 3519 /* Default congestion threshold for PLB to mark a round is 50% */ 3520 net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2; 3521 3522 /* Reno is always built in */ 3523 if (!net_eq(net, &init_net) && 3524 bpf_try_module_get(init_net.ipv4.tcp_congestion_control, 3525 init_net.ipv4.tcp_congestion_control->owner)) 3526 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control; 3527 else 3528 net->ipv4.tcp_congestion_control = &tcp_reno; 3529 3530 net->ipv4.sysctl_tcp_syn_linear_timeouts = 4; 3531 net->ipv4.sysctl_tcp_shrink_window = 0; 3532 3533 net->ipv4.sysctl_tcp_pingpong_thresh = 1; 3534 net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN); 3535 net->ipv4.sysctl_tcp_rto_max_ms = TCP_RTO_MAX_SEC * MSEC_PER_SEC; 3536 3537 return 0; 3538 } 3539 3540 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) 3541 { 3542 struct net *net; 3543 3544 /* make sure concurrent calls to tcp_sk_exit_batch from net_cleanup_work 3545 * and failed setup_net error unwinding path are serialized. 3546 * 3547 * tcp_twsk_purge() handles twsk in any dead netns, not just those in 3548 * net_exit_list, the thread that dismantles a particular twsk must 3549 * do so without other thread progressing to refcount_dec_and_test() of 3550 * tcp_death_row.tw_refcount. 
3551 */ 3552 mutex_lock(&tcp_exit_batch_mutex); 3553 3554 tcp_twsk_purge(net_exit_list); 3555 3556 list_for_each_entry(net, net_exit_list, exit_list) { 3557 inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo); 3558 WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount)); 3559 tcp_fastopen_ctx_destroy(net); 3560 } 3561 3562 mutex_unlock(&tcp_exit_batch_mutex); 3563 } 3564 3565 static struct pernet_operations __net_initdata tcp_sk_ops = { 3566 .init = tcp_sk_init, 3567 .exit = tcp_sk_exit, 3568 .exit_batch = tcp_sk_exit_batch, 3569 }; 3570 3571 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3572 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta, 3573 struct sock_common *sk_common, uid_t uid) 3574 3575 #define INIT_BATCH_SZ 16 3576 3577 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux) 3578 { 3579 struct bpf_tcp_iter_state *iter = priv_data; 3580 int err; 3581 3582 err = bpf_iter_init_seq_net(priv_data, aux); 3583 if (err) 3584 return err; 3585 3586 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ); 3587 if (err) { 3588 bpf_iter_fini_seq_net(priv_data); 3589 return err; 3590 } 3591 3592 return 0; 3593 } 3594 3595 static void bpf_iter_fini_tcp(void *priv_data) 3596 { 3597 struct bpf_tcp_iter_state *iter = priv_data; 3598 3599 bpf_iter_fini_seq_net(priv_data); 3600 kvfree(iter->batch); 3601 } 3602 3603 static const struct bpf_iter_seq_info tcp_seq_info = { 3604 .seq_ops = &bpf_iter_tcp_seq_ops, 3605 .init_seq_private = bpf_iter_init_tcp, 3606 .fini_seq_private = bpf_iter_fini_tcp, 3607 .seq_priv_size = sizeof(struct bpf_tcp_iter_state), 3608 }; 3609 3610 static const struct bpf_func_proto * 3611 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id, 3612 const struct bpf_prog *prog) 3613 { 3614 switch (func_id) { 3615 case BPF_FUNC_setsockopt: 3616 return &bpf_sk_setsockopt_proto; 3617 case BPF_FUNC_getsockopt: 3618 return &bpf_sk_getsockopt_proto; 3619 default: 3620 return NULL; 3621 } 3622 } 3623 3624 static struct bpf_iter_reg tcp_reg_info = { 3625 .target = "tcp", 3626 .ctx_arg_info_size = 1, 3627 .ctx_arg_info = { 3628 { offsetof(struct bpf_iter__tcp, sk_common), 3629 PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED }, 3630 }, 3631 .get_func_proto = bpf_iter_tcp_get_func_proto, 3632 .seq_info = &tcp_seq_info, 3633 }; 3634 3635 static void __init bpf_iter_register(void) 3636 { 3637 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON]; 3638 if (bpf_iter_reg_target(&tcp_reg_info)) 3639 pr_warn("Warning: could not register bpf iterator tcp\n"); 3640 } 3641 3642 #endif 3643 3644 void __init tcp_v4_init(void) 3645 { 3646 int cpu, res; 3647 3648 for_each_possible_cpu(cpu) { 3649 struct sock *sk; 3650 3651 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW, 3652 IPPROTO_TCP, &init_net); 3653 if (res) 3654 panic("Failed to create the TCP control socket.\n"); 3655 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); 3656 3657 /* Please enforce IP_DF and IPID==0 for RST and 3658 * ACK sent in SYN-RECV and TIME-WAIT state. 3659 */ 3660 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO; 3661 3662 sk->sk_clockid = CLOCK_MONOTONIC; 3663 3664 per_cpu(ipv4_tcp_sk.sock, cpu) = sk; 3665 } 3666 if (register_pernet_subsys(&tcp_sk_ops)) 3667 panic("Failed to create the TCP control socket.\n"); 3668 3669 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3670 bpf_iter_register(); 3671 #endif 3672 } 3673