// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/fips.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sock_diag.h>

#include <net/aligned_data.h>
#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/tcp_ecn.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/inet_ecn.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>
#include <net/rstreason.h>
#include <net/psp.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/inetdevice.h>
#include <linux/btf_ids.h>
#include <linux/skbuff_ref.h>

#include <crypto/md5.h>
#include <crypto/utils.h>

#include <trace/events/tcp.h>

#ifdef CONFIG_TCP_MD5SIG
static void tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
				__be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;

static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = {
	.bh_lock = INIT_LOCAL_LOCK(bh_lock),
};

static DEFINE_MUTEX(tcp_exit_batch_mutex);

static union tcp_seq_and_ts_off
tcp_v4_init_seq_and_ts_off(const struct net *net, const struct sk_buff *skb)
{
	return secure_tcp_seq_and_ts_off(net,
					 ip_hdr(skb)->daddr,
					 ip_hdr(skb)->saddr,
					 tcp_hdr(skb)->dest,
					 tcp_hdr(skb)->source);
}
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
	const struct inet_timewait_sock *tw = inet_twsk(sktw);
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);
	int ts_recent_stamp;
	u32 reuse_thresh;

	if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2)
		reuse = 0;

	if (reuse == 2) {
		/* Still does not detect *everything* that goes through
		 * lo, since we require a loopback src or dst address
		 * or direct binding to 'lo' interface.
		 */
		bool loopback = false;

		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
			loopback = true;
#if IS_ENABLED(CONFIG_IPV6)
		if (tw->tw_family == AF_INET6) {
			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
				loopback = true;
		} else
#endif
		{
			if (ipv4_is_loopback(tw->tw_daddr) ||
			    ipv4_is_loopback(tw->tw_rcv_saddr))
				loopback = true;
		}
		if (!loopback)
			reuse = 0;
	}

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp);
	reuse_thresh = READ_ONCE(tw->tw_entry_stamp) +
		       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse_delay);
	if (ts_recent_stamp &&
	    (!twp || (reuse && time_after32(tcp_clock_ms(), reuse_thresh)))) {
		/* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk
		 * and releasing the bucket lock.
		 */
		if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
			return 0;

		/* In case of repair and re-using TIME-WAIT sockets we still
		 * want to be sure that it is safe as above but honor the
		 * sequence numbers and time stamps set as part of the repair
		 * process.
		 *
		 * Without this check re-using a TIME-WAIT socket with TCP
		 * repair would accumulate a -1 on the repair assigned
		 * sequence number. The first time it is reused the sequence
		 * is -1, the second time -2, etc. This fixes that issue
		 * without appearing to create any others.
		 */
		if (likely(!tp->repair)) {
			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;

			if (!seq)
				seq = 1;
			WRITE_ONCE(tp->write_seq, seq);
			tp->rx_opt.ts_recent = READ_ONCE(tcptw->tw_ts_recent);
			tp->rx_opt.ts_recent_stamp = ts_recent_stamp;
		}

		return 1;
	}

	return 0;
}

static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
			      int addr_len)
{
	/* This check is replicated from tcp_v4_connect() and intended to
	 * prevent BPF program called below from accessing bytes that are out
	 * of the bound specified by user in addr_len.
	 */
	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	sock_owned_by_me(sk);

	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
}

/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_timewait_death_row *tcp_death_row;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct ip_options_rcu *inet_opt;
	struct net *net = sock_net(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
			      orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

	if (!inet->inet_saddr) {
		err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET);
		if (err) {
			ip_rt_put(rt);
			return err;
		}
	} else {
		sk_rcv_saddr_set(sk, inet->inet_saddr);
	}

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			WRITE_ONCE(tp->write_seq, 0);
	}

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = psp_sk_overhead(sk);
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len += inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the socket
	 * lock, select a source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst);
	/* OK, now commit destination to socket. */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	rt = NULL;

	if (likely(!tp->repair)) {
		union tcp_seq_and_ts_off st;

		st = secure_tcp_seq_and_ts_off(net,
					       inet->inet_saddr,
					       inet->inet_daddr,
					       inet->inet_sport,
					       usin->sin_port);
		if (!tp->write_seq)
			WRITE_ONCE(tp->write_seq, st.seq);
		WRITE_ONCE(tp->tsoffset, st.ts_off);
	}

	atomic_set(&inet->inet_id, get_random_u16());

	if (tcp_fastopen_defer_connect(sk, &err))
		return err;
	if (err)
		goto failure;

	err = tcp_connect(sk);

	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	inet_bhash2_reset_saddr(sk);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}

/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct dst_entry *dst;
	u32 mtu, dmtu;

	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
		return;
	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to go wrong... Remember the soft error
	 * for the case where this connection is not able to recover.
	 */
	dmtu = dst4_mtu(dst);
	if (mtu < dmtu && ip_dont_fragment(sk, dst))
		WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > dmtu) {
		tcp_sync_mss(sk, dmtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}

static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}


/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);

	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	 */
	if (seq != tcp_rsk(req)->snt_isn) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
	} else if (abort) {
		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		tcp_listendrop(req->rsk_listener);
	}
	reqsk_put(req);
}

/* TCP-LD (RFC 6069) logic */
void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	s32 remaining;
	u32 delta_us;

	if (sock_owned_by_user(sk))
		return;

	if (seq != tp->snd_una || !icsk->icsk_retransmits ||
	    !icsk->icsk_backoff)
		return;

	skb = tcp_rtx_queue_head(sk);
	if (WARN_ON_ONCE(!skb))
		return;

	icsk->icsk_backoff--;
	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
	icsk->icsk_rto = inet_csk_rto_backoff(icsk, tcp_rto_max(sk));

	tcp_mstamp_refresh(tp);
	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);

	if (remaining > 0) {
		tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, remaining, false);
	} else {
		/* RTO revert clocked out retransmission.
		 * Will retransmit now.
		 */
		tcp_retransmit_timer(sk);
	}
}

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

int tcp_v4_err(struct sk_buff *skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct net *net = dev_net_rcu(skb->dev);
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct request_sock *fastopen;
	struct tcp_sock *tp;
	u32 seq, snd_una;
	struct sock *sk;
	int err;

	sk = __inet_lookup_established(net, iph->daddr, th->dest, iph->saddr,
				       ntohs(th->source), inet_iif(skb), 0);
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return -ENOENT;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		/* To increase the counter of ignored icmps for TCP-AO */
		tcp_ao_ignore_icmp(sk, AF_INET, type, code);
		inet_twsk_put(inet_twsk(sk));
		return 0;
	}
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
				     type == ICMP_TIME_EXCEEDED ||
				     (type == ICMP_DEST_UNREACH &&
				      (code == ICMP_NET_UNREACH ||
				       code == ICMP_HOST_UNREACH)));
		return 0;
	}

	if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) {
		sock_put(sk);
		return 0;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (static_branch_unlikely(&ip4_min_ttl)) {
		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
			goto out;
		}
	}

	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = rcu_dereference(tp->fastopen_rsk);
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		if (!sock_owned_by_user(sk))
			do_redirect(skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs sent out by Linux are always <576bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			WRITE_ONCE(tp->mtu_info, info);
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if this ICMP message allows revert of backoff.
		 * (see RFC 6069)
		 */
		if (!fastopen &&
		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
			tcp_ld_RTO_revert(sk, seq);
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket is
		 * already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);

		if (!sock_owned_by_user(sk))
			tcp_done_with_error(sk, err);
		else
			WRITE_ONCE(sk->sk_err_soft, err);
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even these two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 * --ANK (980905)
	 */

	if (!sock_owned_by_user(sk) &&
	    inet_test_bit(RECVERR, sk)) {
		WRITE_ONCE(sk->sk_err, err);
		sk_error_report(sk);
	} else { /* Only an error on timeout */
		WRITE_ONCE(sk->sk_err_soft, err);
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
	return 0;
}

#define REPLY_OPTIONS_LEN	(MAX_TCP_OPTION_SPACE / sizeof(__be32))

static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb,
				 const struct tcp_ao_hdr *aoh,
				 struct ip_reply_arg *arg, struct tcphdr *reply,
				 __be32 reply_options[REPLY_OPTIONS_LEN])
{
#ifdef CONFIG_TCP_AO
	int sdif = tcp_v4_sdif(skb);
	int dif = inet_iif(skb);
	int l3index = sdif ? dif : 0;
	bool allocated_traffic_key;
	struct tcp_ao_key *key;
	char *traffic_key;
	bool drop = true;
	u32 ao_sne = 0;
	u8 keyid;

	rcu_read_lock();
	if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq),
				 &key, &traffic_key, &allocated_traffic_key,
				 &keyid, &ao_sne))
		goto out;

	reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) |
				 (aoh->rnext_keyid << 8) | keyid);
	arg->iov[0].iov_len += tcp_ao_len_aligned(key);
	reply->doff = arg->iov[0].iov_len / 4;

	if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1],
			    key, traffic_key,
			    (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
			    (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
			    reply, ao_sne))
		goto out;
	drop = false;
out:
	rcu_read_unlock();
	if (allocated_traffic_key)
		kfree(traffic_key);
	return drop;
#else
	return true;
#endif
}

/*
 * This routine will send an RST to the other tcp.
 *
 * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
 *		 for the reset?
 * Answer: if a packet caused the RST, it is not for a socket
 *	   existing in our system; if it is matched to a socket,
 *	   it is just a duplicate segment or a bug in the other side's TCP.
 *	   So we build the reply based only on parameters that
 *	   arrived with the segment.
 * Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
			      enum sk_rst_reason reason)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[REPLY_OPTIONS_LEN];
	} rep;
	const __u8 *md5_hash_location = NULL;
	const struct tcp_ao_hdr *aoh;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	unsigned char newhash[16];
	struct sock *sk1 = NULL;
#endif
	u64 transmit_time = 0;
	struct sock *ctl_sk;
	struct net *net;
	u32 txhash = 0;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest = th->source;
	rep.th.source = th->dest;
	rep.th.doff = sizeof(struct tcphdr) / 4;
	rep.th.rst = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len = sizeof(rep.th);

	net = sk ? sock_net(sk) : skb_dst_dev_net_rcu(skb);

	/* Invalid TCP option size or twice included auth */
	if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh))
		return;

	if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt))
		return;

#ifdef CONFIG_TCP_MD5SIG
	rcu_read_lock();
	if (sk && sk_fullsock(sk)) {
		const union tcp_md5_addr *addr;
		int l3index;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and inet_iif is set to it.
		 */
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	} else if (md5_hash_location) {
		const union tcp_md5_addr *addr;
		int sdif = tcp_v4_sdif(skb);
		int dif = inet_iif(skb);
		int l3index;

		/*
		 * The active side is lost. Try to find the listening socket
		 * through the source port, and then find the md5 key through
		 * the listening socket.
		 * We do not loosen security here:
		 * the incoming packet is checked against the md5 hash of the
		 * key found, and no RST is generated if the md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(net, NULL, 0, ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), dif, sdif);
		/* don't send rst if it can't find key */
		if (!sk1)
			goto out;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and dif is set to it.
		 */
		l3index = sdif ? dif : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
		if (!key)
			goto out;

		tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (crypto_memneq(md5_hash_location, newhash, 16))
			goto out;
	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
	if (rep.opt[0] == 0) {
		__be32 mrst = mptcp_reset_option(skb);

		if (mrst) {
			rep.opt[0] = mrst;
			arg.iov[0].iov_len += sizeof(mrst);
			rep.th.doff = arg.iov[0].iov_len / 4;
		}
	}

	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When the socket is gone, all binding information is lost.
	 * Routing might fail in this case. No choice here: if we choose to force
	 * the input interface, we will misroute in case of an asymmetric route.
	 */
	if (sk)
		arg.bound_dev_if = sk->sk_bound_dev_if;

	trace_tcp_send_reset(sk, skb, reason);

	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

	/* ECN bits of TW reset are cleared */
	arg.tos = ip_hdr(skb)->tos & ~INET_ECN_MASK;
	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);

	sock_net_set(ctl_sk, net);
	if (sk) {
		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
		transmit_time = tcp_transmit_time(sk);
		xfrm_sk_clone_policy(ctl_sk, sk);
		txhash = (sk->sk_state == TCP_TIME_WAIT) ?
			 inet_twsk(sk)->tw_txhash : sk->sk_txhash;
	} else {
		ctl_sk->sk_mark = 0;
		ctl_sk->sk_priority = 0;
	}
	ip_send_unicast_reply(ctl_sk, sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time, txhash);

	xfrm_sk_free_policy(ctl_sk);
	sock_net_set(ctl_sk, &init_net);
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
	local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
	rcu_read_unlock();
#endif
}

/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
 */

static void tcp_v4_send_ack(const struct sock *sk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_key *key,
			    int reply_flags, u8 tos, u32 txhash)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(MAX_TCP_OPTION_SPACE >> 2)];
	} rep;
	struct net *net = sock_net(sk);
	struct ip_reply_arg arg;
	struct sock *ctl_sk;
	u64 transmit_time;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len = sizeof(rep.th);
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest = th->source;
	rep.th.source = th->dest;
	rep.th.doff = arg.iov[0].iov_len / 4;
	rep.th.seq = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack = 1;
	rep.th.window = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (tcp_key_is_md5(key)) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key->md5_key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
#ifdef CONFIG_TCP_AO
	if (tcp_key_is_ao(key)) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_AO << 24) |
					  (tcp_ao_len(key->ao_key) << 16) |
					  (key->ao_key->sndid << 8) |
					  key->rcv_next);
		arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key);
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset],
				key->ao_key, key->traffic_key,
				(union tcp_ao_addr *)&ip_hdr(skb)->saddr,
				(union tcp_ao_addr *)&ip_hdr(skb)->daddr,
				&rep.th, key->sne);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
	sock_net_set(ctl_sk, net);
	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
	transmit_time = tcp_transmit_time(sk);
	ip_send_unicast_reply(ctl_sk, sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time, txhash);

	sock_net_set(ctl_sk, &init_net);
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
	local_bh_enable();
}

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb,
				enum tcp_tw_status tw_status)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
	struct tcp_key key = {};
	u8 tos = tw->tw_tos;

	/* Clean only the ECN bits of TW ACKs for oow data or paws_reject,
	 * while leaving the ECN bits of other TW ACKs intact, so that those
	 * ACKs are not placed in different service queues (Classic rather
	 * than L4S).
	 */
	if (tw_status == TCP_TW_ACK_OOW)
		tos &= ~INET_ECN_MASK;

#ifdef CONFIG_TCP_AO
	struct tcp_ao_info *ao_info;

	if (static_branch_unlikely(&tcp_ao_needed.key)) {
		/* FIXME: the segment to-be-acked is not verified yet */
		ao_info = rcu_dereference(tcptw->ao_info);
		if (ao_info) {
			const struct tcp_ao_hdr *aoh;

			if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) {
				inet_twsk_put(tw);
				return;
			}

			if (aoh)
				key.ao_key = tcp_ao_established_key(sk, ao_info,
								    aoh->rnext_keyid, -1);
		}
	}
	if (key.ao_key) {
		struct tcp_ao_key *rnext_key;

		key.traffic_key = snd_other_key(key.ao_key);
		key.sne = READ_ONCE(ao_info->snd_sne);
		rnext_key = READ_ONCE(ao_info->rnext_key);
		key.rcv_next = rnext_key->rcvid;
		key.type = TCP_KEY_AO;
#else
	if (0) {
#endif
	} else if (static_branch_tcp_md5()) {
		key.md5_key = tcp_twsk_md5_key(tcptw);
		if (key.md5_key)
			key.type = TCP_KEY_MD5;
	}

	tcp_v4_send_ack(sk, skb,
			tcptw->tw_snd_nxt, READ_ONCE(tcptw->tw_rcv_nxt),
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_tw_tsval(tcptw),
			READ_ONCE(tcptw->tw_ts_recent),
			tw->tw_bound_dev_if, &key,
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tos,
			tw->tw_txhash);

	inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	struct tcp_key key = {};

	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
						 tcp_sk(sk)->snd_nxt;

#ifdef CONFIG_TCP_AO
	if (static_branch_unlikely(&tcp_ao_needed.key) &&
	    tcp_rsk_used_ao(req)) {
		const union tcp_md5_addr *addr;
		const struct tcp_ao_hdr *aoh;
		int l3index;

		/* Invalid TCP option size or twice included auth */
		if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
			return;
		if (!aoh)
			return;

		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET,
					      aoh->rnext_keyid, -1);
		if (unlikely(!key.ao_key)) {
			/* Send ACK with any matching MKT for the peer */
			key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1);
			/* Matching key disappeared (user removed the key?),
			 * let the handshake time out.
			 */
			if (!key.ao_key) {
				net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n",
						     addr,
						     ntohs(tcp_hdr(skb)->source),
						     &ip_hdr(skb)->daddr,
						     ntohs(tcp_hdr(skb)->dest));
				return;
			}
		}
		key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC);
		if (!key.traffic_key)
			return;

		key.type = TCP_KEY_AO;
		key.rcv_next = aoh->keyid;
		tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req);
#else
	if (0) {
#endif
	} else if (static_branch_tcp_md5()) {
		const union tcp_md5_addr *addr;
		int l3index;

		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
		if (key.md5_key)
			key.type = TCP_KEY_MD5;
	}

	/* ECN bits are cleaned for ACKs of oow data or paws_reject */
	tcp_v4_send_ack(sk, skb, seq,
			tcp_rsk(req)->rcv_nxt,
			tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale,
			tcp_rsk_tsval(tcp_rsk(req)),
			req->ts_recent,
			0, &key,
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos & ~INET_ECN_MASK,
			READ_ONCE(tcp_rsk(req)->txhash));
	if (tcp_key_is_ao(&key))
		kfree(key.traffic_key);
}

/*
 * Send a SYN-ACK after having received a SYN.
 * This still operates on a request_sock only, not on a big
 * socket.
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
			      struct flowi *fl,
			      struct request_sock *req,
			      struct tcp_fastopen_cookie *foc,
			      enum tcp_synack_type synack_type,
			      struct sk_buff *syn_skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;
	u8 tos;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);

	if (skb) {
		tcp_rsk(req)->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK;
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

		tos = READ_ONCE(inet_sk(sk)->tos);

		if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
			tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
			      (tos & INET_ECN_MASK);

		if (!INET_ECN_is_capable(tos) &&
		    tcp_bpf_ca_needs_ecn((struct sock *)req))
			tos |= INET_ECN_ECT_0;

		rcu_read_lock();
		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
					    rcu_dereference(ireq->ireq_opt),
					    tos);
		rcu_read_unlock();
		err = net_xmit_eval(err);
	}

	return err;
}

/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);

static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
{
	if (!old)
		return true;

	/* l3index always overrides non-l3index */
	if (old->l3index && new->l3index == 0)
		return false;
	if (old->l3index == 0 && new->l3index)
		return true;

	return old->prefixlen < new->prefixlen;
}

/* Find the Key structure for an address.  */
struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
					   const union tcp_md5_addr *addr,
					   int family, bool any_l3index)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	const struct tcp_md5sig_info *md5sig;
	__be32 mask;
	struct tcp_md5sig_key *best_match = NULL;
	bool match;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;

	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX &&
		    key->l3index != l3index)
			continue;
		if (family == AF_INET) {
			mask = inet_make_mask(key->prefixlen);
			match = (key->addr.a4.s_addr & mask) ==
				(addr->a4.s_addr & mask);
#if IS_ENABLED(CONFIG_IPV6)
		} else if (family == AF_INET6) {
			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
						  key->prefixlen);
#endif
		} else {
			match = false;
		}

		if (match && better_md5_match(best_match, key))
			best_match = key;
	}
	return best_match;
}

static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
						      const union tcp_md5_addr *addr,
						      int family, u8 prefixlen,
						      int l3index, u8 flags)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	const struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
			continue;
		if (key->l3index != l3index)
			continue;
		if (!memcmp(&key->addr, addr, size) &&
		    key->prefixlen == prefixlen)
			return key;
	}
	return NULL;
}

struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
					 const struct sock *addr_sk)
{
	const union tcp_md5_addr *addr;
	int l3index;

	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
						 addr_sk->sk_bound_dev_if);
	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
}

static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	md5sig = kmalloc_obj(*md5sig, gfp);
	if (!md5sig)
		return -ENOMEM;

	sk_gso_disable(sk);
	INIT_HLIST_HEAD(&md5sig->head);
	rcu_assign_pointer(tp->md5sig_info, md5sig);
	return 0;
}

/* This can be called on a newly created socket, from other files */
static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
			    int family, u8 prefixlen, int l3index, u8 flags,
			    const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
	if (key) {
		/* Pre-existing entry - just update that one.
		 * Note that the key might be used concurrently.
		 * data_race() is telling kcsan that we do not care of
		 * key mismatches, since changing MD5 key on live flows
		 * can lead to packet drops.
		 */
		data_race(memcpy(key->key, newkey, newkeylen));

		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
		 * Also note that a reader could catch new key->keylen value
		 * but old key->key[], this is the reason we use __GFP_ZERO
		 * at sock_kmalloc() time below these lines.
		 */
		WRITE_ONCE(key->keylen, newkeylen);

		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   lockdep_sock_is_held(sk));

	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
	if (!key)
		return -ENOMEM;

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	key->prefixlen = prefixlen;
	key->l3index = l3index;
	key->flags = flags;
	memcpy(&key->addr, addr,
	       (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ?
	       sizeof(struct in6_addr) : sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}

int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, u8 prefixlen, int l3index, u8 flags,
		   const u8 *newkey, u8 newkeylen)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
		if (fips_enabled) {
			pr_warn_once("TCP-MD5 support is disabled due to FIPS\n");
			return -EOPNOTSUPP;
		}

		if (tcp_md5sig_info_add(sk, GFP_KERNEL))
			return -ENOMEM;

		if (!static_branch_inc(&tcp_md5_needed.key)) {
			struct tcp_md5sig_info *md5sig;

			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
			rcu_assign_pointer(tp->md5sig_info, NULL);
			kfree_rcu(md5sig, rcu);
			return -EUSERS;
		}
	}

	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
				newkey, newkeylen, GFP_KERNEL);
}

int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
		     int family, u8 prefixlen, int l3index,
		     struct tcp_md5sig_key *key)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {

		if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC)))
			return -ENOMEM;

		if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
			struct tcp_md5sig_info *md5sig;

			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
			net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
			rcu_assign_pointer(tp->md5sig_info, NULL);
			kfree_rcu(md5sig, rcu);
			return -EUSERS;
		}
	}

	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
				key->flags, key->key, key->keylen,
				sk_gfp_mask(sk, GFP_ATOMIC));
}

int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
		   u8 prefixlen, int l3index, u8 flags)
{
	struct tcp_md5sig_key *key;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	return 0;
}

void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree(key);
	}
}

static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
				 sockptr_t optval, int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
	const union tcp_md5_addr *addr;
	u8 prefixlen = 32;
	int l3index = 0;
	bool l3flag;
	u8 flags;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
	l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;

	if (optname == TCP_MD5SIG_EXT &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
		prefixlen = cmd.tcpm_prefixlen;
		if (prefixlen > 32)
			return -EINVAL;
	}

	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
		if (dev && netif_is_l3_master(dev))
			l3index = dev->ifindex;

		rcu_read_unlock();

		/* ok to reference set/not set outside of rcu;
		 * right now device MUST be an L3 master
		 */
		if (!dev || !l3index)
			return -EINVAL;
	}

	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;

	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	/* Don't allow keys for peers that have a matching TCP-AO key.
	 * See the comment in tcp_ao_add_cmd()
	 */
	if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false))
		return -EKEYREJECTED;

	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
			      cmd.tcpm_key, cmd.tcpm_keylen);
}

static void tcp_v4_md5_hash_headers(struct md5_ctx *ctx,
				    __be32 daddr, __be32 saddr,
				    const struct tcphdr *th, int nbytes)
{
	struct {
		struct tcp4_pseudohdr ip;
		struct tcphdr tcp;
	} h;

	h.ip.saddr = saddr;
	h.ip.daddr = daddr;
	h.ip.pad = 0;
	h.ip.protocol = IPPROTO_TCP;
	h.ip.len = cpu_to_be16(nbytes);
	h.tcp = *th;
	h.tcp.check = 0;
	md5_update(ctx, (const u8 *)&h, sizeof(h.ip) + sizeof(h.tcp));
}

static noinline_for_stack void
tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
		    __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct md5_ctx ctx;

	md5_init(&ctx);
	tcp_v4_md5_hash_headers(&ctx, daddr, saddr, th, th->doff << 2);
	tcp_md5_hash_key(&ctx, key);
	md5_final(&ctx, md5_hash);
}

noinline_for_stack void
tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
		    const struct sock *sk, const struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;
	struct md5_ctx ctx;

	if (sk) { /* valid for establish/request sockets */
		saddr = sk->sk_rcv_saddr;
		daddr = sk->sk_daddr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);

		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	md5_init(&ctx);
	tcp_v4_md5_hash_headers(&ctx, daddr, saddr, th, skb->len);
	tcp_md5_hash_skb_data(&ctx, skb, th->doff << 2);
	tcp_md5_hash_key(&ctx, key);
	md5_final(&ctx, md5_hash);
}

#endif

static void tcp_v4_init_req(struct request_sock *req,
			    const struct sock *sk_listener,
			    struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	struct net *net = sock_net(sk_listener);

	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
}

static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
					  struct sk_buff *skb,
					  struct flowi *fl,
					  struct request_sock *req,
					  u32 tw_isn)
{
	tcp_v4_init_req(req, sk, skb);

	if (security_inet_conn_request(sk, skb, req))
		return NULL;

	return inet_csk_route_req(sk, &fl->u.ip4, req);
}

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		= PF_INET,
	.obj_size	= sizeof(struct tcp_request_sock),
	.send_ack	= tcp_v4_reqsk_send_ack,
	.destructor	= tcp_v4_reqsk_destructor,
	.send_reset	= tcp_v4_send_reset,
};

const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	= TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.req_md5_lookup	= tcp_v4_md5_lookup,
	.calc_md5_hash	= tcp_v4_md5_hash_skb,
#endif
#ifdef CONFIG_TCP_AO
	.ao_lookup	= tcp_v4_ao_lookup_rsk,
	.ao_calc_key	= tcp_v4_ao_calc_key_rsk,
	.ao_synack_hash	= tcp_v4_ao_synack_hash,
#endif
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq = cookie_v4_init_sequence,
#endif
	.route_req	= tcp_v4_route_req,
	.init_seq_and_ts_off = tcp_v4_init_seq_and_ts_off,
	.send_synack	= tcp_v4_send_synack,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&tcp_request_sock_ops,
				&tcp_request_sock_ipv4_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0;
}


/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req,
				  void (*opt_child_init)(struct sock *newsk,
							 const struct sock *sk))
{
	struct inet_request_sock *ireq;
	bool found_dup_sk = false;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	const union tcp_md5_addr *addr;
	struct tcp_md5sig_key *key;
	int l3index;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp = tcp_sk(newsk);
	newinet = inet_sk(newsk);
	ireq = inet_rsk(req);
	inet_opt = rcu_dereference(ireq->ireq_opt);
	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
	newinet->mc_index = inet_iif(skb);
	newinet->mc_ttl = ip_hdr(skb)->ttl;
	newinet->rcv_tos = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	atomic_set(&newinet->inet_id, get_random_u16());

	/* Set ToS of the new socket based upon the value of incoming SYN.
	 * ECT bits are set later in tcp_init_transfer().
	 */
	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

#if IS_ENABLED(CONFIG_IPV6)
	if (opt_child_init)
		opt_child_init(newsk, sk);
#endif
	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst4_mtu(dst));
	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
	/* Copy over the MD5 key from the original socket */
	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	if (key && !tcp_rsk_used_ao(req)) {
		if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
			goto put_and_exit;
		sk_gso_disable(newsk);
	}
#endif
#ifdef CONFIG_TCP_AO
	if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET))
		goto put_and_exit; /* OOM, release back memory */
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
				       &found_dup_sk);
	if (likely(*own_req)) {
		tcp_move_syn(newtp, req);
		ireq->ireq_opt = NULL;
	} else {
		newinet->inet_opt = NULL;

		if (!req_unhash && found_dup_sk) {
			/* This code path should only be executed in the
			 * syncookie case
			 */
			bh_unlock_sock(newsk);
			sock_put(newsk);
			newsk = NULL;
		}
	}
	return newsk;

exit_overflow:
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	tcp_listendrop(sk);
	return NULL;
put_and_exit:
	newinet->inet_opt = NULL;
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}

static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	const struct tcphdr *th = tcp_hdr(skb);

	if (!th->syn)
		sk = cookie_v4_check(sk, skb);
#endif
	return sk;
}

u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
			 struct tcphdr *th, u32 *cookie)
{
	u16 mss = 0;
#ifdef CONFIG_SYN_COOKIES
	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
				    &tcp_request_sock_ipv4_ops, sk, th);
	if (mss) {
		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
		tcp_synq_overflow(sk);
	}
#endif
	return mss;
}

INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
							    u32));
/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	enum skb_drop_reason reason;
	struct sock *rsk;

	reason = psp_sk_rx_policy_check(sk, skb);
	if (reason)
		goto err_discard;

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst;

		dst = rcu_dereference_protected(sk->sk_rx_dst,
						lockdep_sock_is_held(sk));

		sock_rps_save_rxhash(sk, skb);
		sk_mark_napi_id(sk, skb);
		if (dst && unlikely(dst != skb_dst(skb))) {
			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
					     dst, 0)) {
				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
				dst_release(dst);
			}
		}
		tcp_rcv_established(sk, skb);
		return 0;
	}

	if (tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_cookie_check(sk, skb);

		if (!nsk)
			return 0;
		if (nsk != sk) {
			reason = tcp_child_process(sk, nsk, skb);
			if (reason) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	reason = tcp_rcv_state_process(sk, skb);
	if (reason) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason));
discard:
	sk_skb_reason_drop(sk, skb, reason);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	reason = SKB_DROP_REASON_TCP_CSUM;
	trace_tcp_bad_csum(skb);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
err_discard:
	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);

bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
		     enum skb_drop_reason *reason)
{
	u32 tail_gso_size, tail_gso_segs;
	struct skb_shared_info *shinfo;
	const struct tcphdr *th;
	struct tcphdr *thtail;
	struct sk_buff *tail;
	unsigned int hdrlen;
	bool fragstolen;
	u32 gso_segs;
	u32 gso_size;
	u64 limit;
	int delta;
	int err;

	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
	 * we can fix skb->truesize to its real value to avoid future drops.
	 * This is valid because skb is not yet charged to the socket.
	 * It has been noticed pure SACK packets were sometimes dropped
	 * (if cooked by drivers without copybreak feature).
	 */
	skb_condense(skb);

	tcp_cleanup_skb(skb);

	if (unlikely(tcp_checksum_complete(skb))) {
		bh_unlock_sock(sk);
		trace_tcp_bad_csum(skb);
		*reason = SKB_DROP_REASON_TCP_CSUM;
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
		return true;
	}

	/* Attempt coalescing to last skb in backlog, even if we are
	 * above the limits.
	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
	 */
1941 */ 1942 th = (const struct tcphdr *)skb->data; 1943 hdrlen = th->doff * 4; 1944 1945 tail = sk->sk_backlog.tail; 1946 if (!tail) 1947 goto no_coalesce; 1948 thtail = (struct tcphdr *)tail->data; 1949 1950 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq || 1951 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield || 1952 ((TCP_SKB_CB(tail)->tcp_flags | 1953 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) || 1954 !((TCP_SKB_CB(tail)->tcp_flags & 1955 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) || 1956 ((TCP_SKB_CB(tail)->tcp_flags ^ 1957 TCP_SKB_CB(skb)->tcp_flags) & 1958 (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE)) || 1959 !tcp_skb_can_collapse_rx(tail, skb) || 1960 thtail->doff != th->doff || 1961 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)) || 1962 /* prior to PSP Rx policy check, retain exact PSP metadata */ 1963 psp_skb_coalesce_diff(tail, skb)) 1964 goto no_coalesce; 1965 1966 __skb_pull(skb, hdrlen); 1967 1968 shinfo = skb_shinfo(skb); 1969 gso_size = shinfo->gso_size ?: skb->len; 1970 gso_segs = shinfo->gso_segs ?: 1; 1971 1972 shinfo = skb_shinfo(tail); 1973 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen); 1974 tail_gso_segs = shinfo->gso_segs ?: 1; 1975 1976 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) { 1977 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq; 1978 1979 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) { 1980 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq; 1981 thtail->window = th->window; 1982 } 1983 1984 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and 1985 * thtail->fin, so that the fast path in tcp_rcv_established() 1986 * is not entered if we append a packet with a FIN. 1987 * SYN, RST, URG are not present. 1988 * ACK is set on both packets. 1989 * PSH : we do not really care in TCP stack, 1990 * at least for 'GRO' packets. 1991 */ 1992 thtail->fin |= th->fin; 1993 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; 1994 1995 if (TCP_SKB_CB(skb)->has_rxtstamp) { 1996 TCP_SKB_CB(tail)->has_rxtstamp = true; 1997 tail->tstamp = skb->tstamp; 1998 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp; 1999 } 2000 2001 /* Not as strict as GRO. We only need to carry mss max value */ 2002 shinfo->gso_size = max(gso_size, tail_gso_size); 2003 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF); 2004 2005 sk->sk_backlog.len += delta; 2006 __NET_INC_STATS(sock_net(sk), 2007 LINUX_MIB_TCPBACKLOGCOALESCE); 2008 kfree_skb_partial(skb, fragstolen); 2009 return false; 2010 } 2011 __skb_push(skb, hdrlen); 2012 2013 no_coalesce: 2014 /* sk->sk_backlog.len is reset only at the end of __release_sock(). 2015 * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach 2016 * sk_rcvbuf in normal conditions. 2017 */ 2018 limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1; 2019 2020 limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1; 2021 2022 /* Only socket owner can try to collapse/prune rx queues 2023 * to reduce memory overhead, so add a little headroom here. 2024 * Only a few socket backlogs are likely to be non-empty at the same time.
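 * Illustration (example values, not defaults): with sk_rcvbuf = 1 MB and
 * sk_sndbuf = 512 KB, the cap works out to 2 MB + 256 KB plus the 64 KB
 * headroom added just below, i.e. roughly 2.3 MB, then clamped to UINT_MAX.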
2025 */ 2026 limit += 64 * 1024; 2027 2028 limit = min_t(u64, limit, UINT_MAX); 2029 2030 err = sk_add_backlog(sk, skb, limit); 2031 if (unlikely(err)) { 2032 bh_unlock_sock(sk); 2033 if (err == -ENOMEM) { 2034 *reason = SKB_DROP_REASON_PFMEMALLOC; 2035 __NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP); 2036 } else { 2037 *reason = SKB_DROP_REASON_SOCKET_BACKLOG; 2038 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP); 2039 } 2040 return true; 2041 } 2042 return false; 2043 } 2044 2045 static void tcp_v4_restore_cb(struct sk_buff *skb) 2046 { 2047 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4, 2048 sizeof(struct inet_skb_parm)); 2049 } 2050 2051 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, 2052 const struct tcphdr *th) 2053 { 2054 /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB(). 2055 * barrier() makes sure the compiler won't play fool^Waliasing games. 2056 */ 2057 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb), 2058 sizeof(struct inet_skb_parm)); 2059 barrier(); 2060 2061 TCP_SKB_CB(skb)->seq = ntohl(th->seq); 2062 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + 2063 skb->len - th->doff * 4); 2064 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); 2065 TCP_SKB_CB(skb)->tcp_flags = tcp_flags_ntohs(th); 2066 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); 2067 TCP_SKB_CB(skb)->sacked = 0; 2068 TCP_SKB_CB(skb)->has_rxtstamp = 2069 skb->tstamp || skb_hwtstamps(skb)->hwtstamp; 2070 } 2071 2072 /* 2073 * From tcp_input.c 2074 */ 2075 2076 int tcp_v4_rcv(struct sk_buff *skb) 2077 { 2078 struct net *net = dev_net_rcu(skb->dev); 2079 enum skb_drop_reason drop_reason; 2080 enum tcp_tw_status tw_status; 2081 int sdif = inet_sdif(skb); 2082 int dif = inet_iif(skb); 2083 const struct iphdr *iph; 2084 const struct tcphdr *th; 2085 struct sock *sk = NULL; 2086 bool refcounted; 2087 int ret; 2088 u32 isn; 2089 2090 drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; 2091 if (skb->pkt_type != PACKET_HOST) 2092 goto discard_it; 2093 2094 /* Count it even if it's bad */ 2095 __TCP_INC_STATS(net, TCP_MIB_INSEGS); 2096 2097 if (!pskb_may_pull(skb, sizeof(struct tcphdr))) 2098 goto discard_it; 2099 2100 th = (const struct tcphdr *)skb->data; 2101 2102 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) { 2103 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL; 2104 goto bad_packet; 2105 } 2106 if (!pskb_may_pull(skb, th->doff * 4)) 2107 goto discard_it; 2108 2109 /* An explanation is required here, I think. 2110 * Packet length and doff are validated by header prediction, 2111 * provided case of th->doff==0 is eliminated. 2112 * So, we defer the checks.
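 * Likewise, skb_checksum_init() below only folds in the pseudo-header; the full
 * checksum over the payload is verified later, via tcp_checksum_complete(), once
 * a socket has been looked up.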
*/ 2113 2114 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo)) 2115 goto csum_error; 2116 2117 th = (const struct tcphdr *)skb->data; 2118 iph = ip_hdr(skb); 2119 lookup: 2120 sk = __inet_lookup_skb(skb, __tcp_hdrlen(th), th->source, 2121 th->dest, sdif, &refcounted); 2122 if (!sk) 2123 goto no_tcp_socket; 2124 2125 if (sk->sk_state == TCP_TIME_WAIT) 2126 goto do_time_wait; 2127 2128 if (sk->sk_state == TCP_NEW_SYN_RECV) { 2129 struct request_sock *req = inet_reqsk(sk); 2130 bool req_stolen = false; 2131 struct sock *nsk; 2132 2133 sk = req->rsk_listener; 2134 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) 2135 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2136 else 2137 drop_reason = tcp_inbound_hash(sk, req, skb, 2138 &iph->saddr, &iph->daddr, 2139 AF_INET, dif, sdif); 2140 if (unlikely(drop_reason)) { 2141 sk_drops_skbadd(sk, skb); 2142 reqsk_put(req); 2143 goto discard_it; 2144 } 2145 if (tcp_checksum_complete(skb)) { 2146 reqsk_put(req); 2147 goto csum_error; 2148 } 2149 if (unlikely(sk->sk_state != TCP_LISTEN)) { 2150 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb); 2151 if (!nsk) { 2152 inet_csk_reqsk_queue_drop_and_put(sk, req); 2153 goto lookup; 2154 } 2155 sk = nsk; 2156 /* reuseport_migrate_sock() has already held one sk_refcnt 2157 * before returning. 2158 */ 2159 } else { 2160 /* We own a reference on the listener, increase it again 2161 * as we might lose it too soon. 2162 */ 2163 sock_hold(sk); 2164 } 2165 refcounted = true; 2166 nsk = NULL; 2167 if (!tcp_filter(sk, skb, &drop_reason)) { 2168 th = (const struct tcphdr *)skb->data; 2169 iph = ip_hdr(skb); 2170 tcp_v4_fill_cb(skb, iph, th); 2171 nsk = tcp_check_req(sk, skb, req, false, &req_stolen, 2172 &drop_reason); 2173 } 2174 if (!nsk) { 2175 reqsk_put(req); 2176 if (req_stolen) { 2177 /* Another cpu got exclusive access to req 2178 * and created a full blown socket. 2179 * Try to feed this packet to this socket 2180 * instead of discarding it. 
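 * (Redoing the lookup lets us find that newly created, fully established
 * socket in the ehash table and deliver the packet to it.)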
2181 */ 2182 tcp_v4_restore_cb(skb); 2183 sock_put(sk); 2184 goto lookup; 2185 } 2186 goto discard_and_relse; 2187 } 2188 nf_reset_ct(skb); 2189 if (nsk == sk) { 2190 reqsk_put(req); 2191 tcp_v4_restore_cb(skb); 2192 } else { 2193 drop_reason = tcp_child_process(sk, nsk, skb); 2194 if (drop_reason) { 2195 enum sk_rst_reason rst_reason; 2196 2197 rst_reason = sk_rst_convert_drop_reason(drop_reason); 2198 tcp_v4_send_reset(nsk, skb, rst_reason); 2199 goto discard_and_relse; 2200 } 2201 sock_put(sk); 2202 return 0; 2203 } 2204 } 2205 2206 process: 2207 if (static_branch_unlikely(&ip4_min_ttl)) { 2208 /* min_ttl can be changed concurrently from do_ip_setsockopt() */ 2209 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) { 2210 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 2211 drop_reason = SKB_DROP_REASON_TCP_MINTTL; 2212 goto discard_and_relse; 2213 } 2214 } 2215 2216 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { 2217 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2218 goto discard_and_relse; 2219 } 2220 2221 drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr, 2222 AF_INET, dif, sdif); 2223 if (drop_reason) 2224 goto discard_and_relse; 2225 2226 nf_reset_ct(skb); 2227 2228 if (tcp_filter(sk, skb, &drop_reason)) 2229 goto discard_and_relse; 2230 2231 th = (const struct tcphdr *)skb->data; 2232 iph = ip_hdr(skb); 2233 tcp_v4_fill_cb(skb, iph, th); 2234 2235 skb->dev = NULL; 2236 2237 if (sk->sk_state == TCP_LISTEN) { 2238 ret = tcp_v4_do_rcv(sk, skb); 2239 goto put_and_return; 2240 } 2241 2242 sk_incoming_cpu_update(sk); 2243 2244 bh_lock_sock_nested(sk); 2245 tcp_segs_in(tcp_sk(sk), skb); 2246 ret = 0; 2247 if (!sock_owned_by_user(sk)) { 2248 ret = tcp_v4_do_rcv(sk, skb); 2249 } else { 2250 if (tcp_add_backlog(sk, skb, &drop_reason)) 2251 goto discard_and_relse; 2252 } 2253 bh_unlock_sock(sk); 2254 2255 put_and_return: 2256 if (refcounted) 2257 sock_put(sk); 2258 2259 return ret; 2260 2261 no_tcp_socket: 2262 drop_reason = SKB_DROP_REASON_NO_SOCKET; 2263 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) 2264 goto discard_it; 2265 2266 tcp_v4_fill_cb(skb, iph, th); 2267 2268 if (tcp_checksum_complete(skb)) { 2269 csum_error: 2270 drop_reason = SKB_DROP_REASON_TCP_CSUM; 2271 trace_tcp_bad_csum(skb); 2272 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); 2273 bad_packet: 2274 __TCP_INC_STATS(net, TCP_MIB_INERRS); 2275 } else { 2276 tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason)); 2277 } 2278 2279 discard_it: 2280 SKB_DR_OR(drop_reason, NOT_SPECIFIED); 2281 /* Discard frame. 
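 * sk may still be NULL here (e.g. the no_tcp_socket path); sk_skb_reason_drop()
 * copes with that and reports drop_reason through the kfree_skb tracepoint.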
*/ 2282 sk_skb_reason_drop(sk, skb, drop_reason); 2283 return 0; 2284 2285 discard_and_relse: 2286 sk_drops_skbadd(sk, skb); 2287 if (refcounted) 2288 sock_put(sk); 2289 goto discard_it; 2290 2291 do_time_wait: 2292 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { 2293 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2294 inet_twsk_put(inet_twsk(sk)); 2295 goto discard_it; 2296 } 2297 2298 tcp_v4_fill_cb(skb, iph, th); 2299 2300 if (tcp_checksum_complete(skb)) { 2301 inet_twsk_put(inet_twsk(sk)); 2302 goto csum_error; 2303 } 2304 2305 tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn, 2306 &drop_reason); 2307 switch (tw_status) { 2308 case TCP_TW_SYN: { 2309 struct sock *sk2 = inet_lookup_listener(net, skb, __tcp_hdrlen(th), 2310 iph->saddr, th->source, 2311 iph->daddr, th->dest, 2312 inet_iif(skb), 2313 sdif); 2314 if (sk2) { 2315 inet_twsk_deschedule_put(inet_twsk(sk)); 2316 sk = sk2; 2317 tcp_v4_restore_cb(skb); 2318 refcounted = false; 2319 __this_cpu_write(tcp_tw_isn, isn); 2320 goto process; 2321 } 2322 2323 drop_reason = psp_twsk_rx_policy_check(inet_twsk(sk), skb); 2324 if (drop_reason) 2325 break; 2326 } 2327 /* to ACK */ 2328 fallthrough; 2329 case TCP_TW_ACK: 2330 case TCP_TW_ACK_OOW: 2331 tcp_v4_timewait_ack(sk, skb, tw_status); 2332 break; 2333 case TCP_TW_RST: 2334 tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET); 2335 inet_twsk_deschedule_put(inet_twsk(sk)); 2336 goto discard_it; 2337 case TCP_TW_SUCCESS:; 2338 } 2339 goto discard_it; 2340 } 2341 2342 static struct timewait_sock_ops tcp_timewait_sock_ops = { 2343 .twsk_obj_size = sizeof(struct tcp_timewait_sock), 2344 }; 2345 2346 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) 2347 { 2348 struct dst_entry *dst = skb_dst(skb); 2349 2350 if (dst && dst_hold_safe(dst)) { 2351 rcu_assign_pointer(sk->sk_rx_dst, dst); 2352 sk->sk_rx_dst_ifindex = skb->skb_iif; 2353 } 2354 } 2355 2356 const struct inet_connection_sock_af_ops ipv4_specific = { 2357 .queue_xmit = ip_queue_xmit, 2358 .rebuild_header = inet_sk_rebuild_header, 2359 .sk_rx_dst_set = inet_sk_rx_dst_set, 2360 .conn_request = tcp_v4_conn_request, 2361 .syn_recv_sock = tcp_v4_syn_recv_sock, 2362 .net_header_len = sizeof(struct iphdr), 2363 .setsockopt = ip_setsockopt, 2364 .getsockopt = ip_getsockopt, 2365 .mtu_reduced = tcp_v4_mtu_reduced, 2366 }; 2367 2368 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) 2369 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { 2370 #ifdef CONFIG_TCP_MD5SIG 2371 .md5_lookup = tcp_v4_md5_lookup, 2372 .calc_md5_hash = tcp_v4_md5_hash_skb, 2373 .md5_parse = tcp_v4_parse_md5_keys, 2374 #endif 2375 #ifdef CONFIG_TCP_AO 2376 .ao_lookup = tcp_v4_ao_lookup, 2377 .calc_ao_hash = tcp_v4_ao_hash_skb, 2378 .ao_parse = tcp_v4_parse_ao, 2379 .ao_calc_key_sk = tcp_v4_ao_calc_key_sk, 2380 #endif 2381 }; 2382 2383 static void tcp4_destruct_sock(struct sock *sk) 2384 { 2385 tcp_md5_destruct_sock(sk); 2386 tcp_ao_destroy_sock(sk, false); 2387 inet_sock_destruct(sk); 2388 } 2389 #endif 2390 2391 /* NOTE: A lot of things set to zero explicitly by call to 2392 * sk_alloc() so need not be done here. 
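 * tcp_init_sock() performs the protocol-generic initialization; only the
 * IPv4-specific operation tables (and, when MD5/AO support is built in,
 * the destructor) are hooked up below.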
2393 */ 2394 static int tcp_v4_init_sock(struct sock *sk) 2395 { 2396 struct inet_connection_sock *icsk = inet_csk(sk); 2397 2398 tcp_init_sock(sk); 2399 2400 icsk->icsk_af_ops = &ipv4_specific; 2401 2402 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) 2403 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific; 2404 sk->sk_destruct = tcp4_destruct_sock; 2405 #endif 2406 2407 return 0; 2408 } 2409 2410 static void tcp_release_user_frags(struct sock *sk) 2411 { 2412 #ifdef CONFIG_PAGE_POOL 2413 unsigned long index; 2414 void *netmem; 2415 2416 xa_for_each(&sk->sk_user_frags, index, netmem) 2417 WARN_ON_ONCE(!napi_pp_put_page((__force netmem_ref)netmem)); 2418 #endif 2419 } 2420 2421 void tcp_v4_destroy_sock(struct sock *sk) 2422 { 2423 struct tcp_sock *tp = tcp_sk(sk); 2424 2425 tcp_release_user_frags(sk); 2426 2427 xa_destroy(&sk->sk_user_frags); 2428 2429 trace_tcp_destroy_sock(sk); 2430 2431 tcp_clear_xmit_timers(sk); 2432 2433 tcp_cleanup_congestion_control(sk); 2434 2435 tcp_cleanup_ulp(sk); 2436 2437 /* Cleanup up the write buffer. */ 2438 tcp_write_queue_purge(sk); 2439 2440 /* Check if we want to disable active TFO */ 2441 tcp_fastopen_active_disable_ofo_check(sk); 2442 2443 /* Cleans up our, hopefully empty, out_of_order_queue. */ 2444 skb_rbtree_purge(&tp->out_of_order_queue); 2445 2446 /* Clean up a referenced TCP bind bucket. */ 2447 if (inet_csk(sk)->icsk_bind_hash) 2448 inet_put_port(sk); 2449 2450 BUG_ON(rcu_access_pointer(tp->fastopen_rsk)); 2451 2452 /* If socket is aborted during connect operation */ 2453 tcp_free_fastopen_req(tp); 2454 tcp_fastopen_destroy_cipher(sk); 2455 tcp_saved_syn_free(tp); 2456 2457 sk_sockets_allocated_dec(sk); 2458 } 2459 2460 #ifdef CONFIG_PROC_FS 2461 /* Proc filesystem TCP sock list dumping. */ 2462 2463 static unsigned short seq_file_family(const struct seq_file *seq); 2464 2465 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk) 2466 { 2467 unsigned short family = seq_file_family(seq); 2468 2469 /* AF_UNSPEC is used as a match all */ 2470 return ((family == AF_UNSPEC || family == sk->sk_family) && 2471 net_eq(sock_net(sk), seq_file_net(seq))); 2472 } 2473 2474 /* Find a non empty bucket (starting from st->bucket) 2475 * and return the first sk from it. 2476 */ 2477 static void *listening_get_first(struct seq_file *seq) 2478 { 2479 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2480 struct tcp_iter_state *st = seq->private; 2481 2482 st->offset = 0; 2483 for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) { 2484 struct inet_listen_hashbucket *ilb2; 2485 struct hlist_nulls_node *node; 2486 struct sock *sk; 2487 2488 ilb2 = &hinfo->lhash2[st->bucket]; 2489 if (hlist_nulls_empty(&ilb2->nulls_head)) 2490 continue; 2491 2492 spin_lock(&ilb2->lock); 2493 sk_nulls_for_each(sk, node, &ilb2->nulls_head) { 2494 if (seq_sk_match(seq, sk)) 2495 return sk; 2496 } 2497 spin_unlock(&ilb2->lock); 2498 } 2499 2500 return NULL; 2501 } 2502 2503 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket). 2504 * If "cur" is the last one in the st->bucket, 2505 * call listening_get_first() to return the first sk of the next 2506 * non empty bucket. 
2507 */ 2508 static void *listening_get_next(struct seq_file *seq, void *cur) 2509 { 2510 struct tcp_iter_state *st = seq->private; 2511 struct inet_listen_hashbucket *ilb2; 2512 struct hlist_nulls_node *node; 2513 struct inet_hashinfo *hinfo; 2514 struct sock *sk = cur; 2515 2516 ++st->num; 2517 ++st->offset; 2518 2519 sk = sk_nulls_next(sk); 2520 sk_nulls_for_each_from(sk, node) { 2521 if (seq_sk_match(seq, sk)) 2522 return sk; 2523 } 2524 2525 hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2526 ilb2 = &hinfo->lhash2[st->bucket]; 2527 spin_unlock(&ilb2->lock); 2528 ++st->bucket; 2529 return listening_get_first(seq); 2530 } 2531 2532 static void *listening_get_idx(struct seq_file *seq, loff_t *pos) 2533 { 2534 struct tcp_iter_state *st = seq->private; 2535 void *rc; 2536 2537 st->bucket = 0; 2538 st->offset = 0; 2539 rc = listening_get_first(seq); 2540 2541 while (rc && *pos) { 2542 rc = listening_get_next(seq, rc); 2543 --*pos; 2544 } 2545 return rc; 2546 } 2547 2548 static inline bool empty_bucket(struct inet_hashinfo *hinfo, 2549 const struct tcp_iter_state *st) 2550 { 2551 return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain); 2552 } 2553 2554 /* 2555 * Get first established socket starting from bucket given in st->bucket. 2556 * If st->bucket is zero, the very first socket in the hash is returned. 2557 */ 2558 static void *established_get_first(struct seq_file *seq) 2559 { 2560 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2561 struct tcp_iter_state *st = seq->private; 2562 2563 st->offset = 0; 2564 for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) { 2565 struct sock *sk; 2566 struct hlist_nulls_node *node; 2567 spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket); 2568 2569 cond_resched(); 2570 2571 /* Lockless fast path for the common case of empty buckets */ 2572 if (empty_bucket(hinfo, st)) 2573 continue; 2574 2575 spin_lock_bh(lock); 2576 sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) { 2577 if (seq_sk_match(seq, sk)) 2578 return sk; 2579 } 2580 spin_unlock_bh(lock); 2581 } 2582 2583 return NULL; 2584 } 2585 2586 static void *established_get_next(struct seq_file *seq, void *cur) 2587 { 2588 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2589 struct tcp_iter_state *st = seq->private; 2590 struct hlist_nulls_node *node; 2591 struct sock *sk = cur; 2592 2593 ++st->num; 2594 ++st->offset; 2595 2596 sk = sk_nulls_next(sk); 2597 2598 sk_nulls_for_each_from(sk, node) { 2599 if (seq_sk_match(seq, sk)) 2600 return sk; 2601 } 2602 2603 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 2604 ++st->bucket; 2605 return established_get_first(seq); 2606 } 2607 2608 static void *established_get_idx(struct seq_file *seq, loff_t pos) 2609 { 2610 struct tcp_iter_state *st = seq->private; 2611 void *rc; 2612 2613 st->bucket = 0; 2614 rc = established_get_first(seq); 2615 2616 while (rc && pos) { 2617 rc = established_get_next(seq, rc); 2618 --pos; 2619 } 2620 return rc; 2621 } 2622 2623 static void *tcp_get_idx(struct seq_file *seq, loff_t pos) 2624 { 2625 void *rc; 2626 struct tcp_iter_state *st = seq->private; 2627 2628 st->state = TCP_SEQ_STATE_LISTENING; 2629 rc = listening_get_idx(seq, &pos); 2630 2631 if (!rc) { 2632 st->state = TCP_SEQ_STATE_ESTABLISHED; 2633 rc = established_get_idx(seq, pos); 2634 } 2635 2636 return rc; 2637 } 2638 2639 static void *tcp_seek_last_pos(struct seq_file *seq) 2640 { 2641 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2642 struct 
tcp_iter_state *st = seq->private; 2643 int bucket = st->bucket; 2644 int offset = st->offset; 2645 int orig_num = st->num; 2646 void *rc = NULL; 2647 2648 switch (st->state) { 2649 case TCP_SEQ_STATE_LISTENING: 2650 if (st->bucket > hinfo->lhash2_mask) 2651 break; 2652 rc = listening_get_first(seq); 2653 while (offset-- && rc && bucket == st->bucket) 2654 rc = listening_get_next(seq, rc); 2655 if (rc) 2656 break; 2657 st->bucket = 0; 2658 st->state = TCP_SEQ_STATE_ESTABLISHED; 2659 fallthrough; 2660 case TCP_SEQ_STATE_ESTABLISHED: 2661 if (st->bucket > hinfo->ehash_mask) 2662 break; 2663 rc = established_get_first(seq); 2664 while (offset-- && rc && bucket == st->bucket) 2665 rc = established_get_next(seq, rc); 2666 } 2667 2668 st->num = orig_num; 2669 2670 return rc; 2671 } 2672 2673 void *tcp_seq_start(struct seq_file *seq, loff_t *pos) 2674 { 2675 struct tcp_iter_state *st = seq->private; 2676 void *rc; 2677 2678 if (*pos && *pos == st->last_pos) { 2679 rc = tcp_seek_last_pos(seq); 2680 if (rc) 2681 goto out; 2682 } 2683 2684 st->state = TCP_SEQ_STATE_LISTENING; 2685 st->num = 0; 2686 st->bucket = 0; 2687 st->offset = 0; 2688 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; 2689 2690 out: 2691 st->last_pos = *pos; 2692 return rc; 2693 } 2694 2695 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2696 { 2697 struct tcp_iter_state *st = seq->private; 2698 void *rc = NULL; 2699 2700 if (v == SEQ_START_TOKEN) { 2701 rc = tcp_get_idx(seq, 0); 2702 goto out; 2703 } 2704 2705 switch (st->state) { 2706 case TCP_SEQ_STATE_LISTENING: 2707 rc = listening_get_next(seq, v); 2708 if (!rc) { 2709 st->state = TCP_SEQ_STATE_ESTABLISHED; 2710 st->bucket = 0; 2711 st->offset = 0; 2712 rc = established_get_first(seq); 2713 } 2714 break; 2715 case TCP_SEQ_STATE_ESTABLISHED: 2716 rc = established_get_next(seq, v); 2717 break; 2718 } 2719 out: 2720 ++*pos; 2721 st->last_pos = *pos; 2722 return rc; 2723 } 2724 2725 void tcp_seq_stop(struct seq_file *seq, void *v) 2726 { 2727 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2728 struct tcp_iter_state *st = seq->private; 2729 2730 switch (st->state) { 2731 case TCP_SEQ_STATE_LISTENING: 2732 if (v != SEQ_START_TOKEN) 2733 spin_unlock(&hinfo->lhash2[st->bucket].lock); 2734 break; 2735 case TCP_SEQ_STATE_ESTABLISHED: 2736 if (v) 2737 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 2738 break; 2739 } 2740 } 2741 2742 static void get_openreq4(const struct request_sock *req, 2743 struct seq_file *f, int i) 2744 { 2745 const struct inet_request_sock *ireq = inet_rsk(req); 2746 long delta = req->rsk_timer.expires - jiffies; 2747 2748 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2749 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK", 2750 i, 2751 ireq->ir_loc_addr, 2752 ireq->ir_num, 2753 ireq->ir_rmt_addr, 2754 ntohs(ireq->ir_rmt_port), 2755 TCP_SYN_RECV, 2756 0, 0, /* could print option size, but that is af dependent. 
*/ 2757 1, /* timers active (only the expire timer) */ 2758 jiffies_delta_to_clock_t(delta), 2759 req->num_timeout, 2760 from_kuid_munged(seq_user_ns(f), 2761 sk_uid(req->rsk_listener)), 2762 0, /* non standard timer */ 2763 0, /* open_requests have no inode */ 2764 0, 2765 req); 2766 } 2767 2768 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) 2769 { 2770 int timer_active; 2771 unsigned long timer_expires; 2772 const struct tcp_sock *tp = tcp_sk(sk); 2773 const struct inet_connection_sock *icsk = inet_csk(sk); 2774 const struct inet_sock *inet = inet_sk(sk); 2775 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; 2776 __be32 dest = inet->inet_daddr; 2777 __be32 src = inet->inet_rcv_saddr; 2778 __u16 destp = ntohs(inet->inet_dport); 2779 __u16 srcp = ntohs(inet->inet_sport); 2780 u8 icsk_pending; 2781 int rx_queue; 2782 int state; 2783 2784 icsk_pending = smp_load_acquire(&icsk->icsk_pending); 2785 if (icsk_pending == ICSK_TIME_RETRANS || 2786 icsk_pending == ICSK_TIME_REO_TIMEOUT || 2787 icsk_pending == ICSK_TIME_LOSS_PROBE) { 2788 timer_active = 1; 2789 timer_expires = tcp_timeout_expires(sk); 2790 } else if (icsk_pending == ICSK_TIME_PROBE0) { 2791 timer_active = 4; 2792 timer_expires = tcp_timeout_expires(sk); 2793 } else if (timer_pending(&icsk->icsk_keepalive_timer)) { 2794 timer_active = 2; 2795 timer_expires = icsk->icsk_keepalive_timer.expires; 2796 } else { 2797 timer_active = 0; 2798 timer_expires = jiffies; 2799 } 2800 2801 state = inet_sk_state_load(sk); 2802 if (state == TCP_LISTEN) 2803 rx_queue = READ_ONCE(sk->sk_ack_backlog); 2804 else 2805 /* Because we don't lock the socket, 2806 * we might find a transient negative value. 2807 */ 2808 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) - 2809 READ_ONCE(tp->copied_seq), 0); 2810 2811 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " 2812 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d", 2813 i, src, srcp, dest, destp, state, 2814 READ_ONCE(tp->write_seq) - tp->snd_una, 2815 rx_queue, 2816 timer_active, 2817 jiffies_delta_to_clock_t(timer_expires - jiffies), 2818 READ_ONCE(icsk->icsk_retransmits), 2819 from_kuid_munged(seq_user_ns(f), sk_uid(sk)), 2820 READ_ONCE(icsk->icsk_probes_out), 2821 sock_i_ino(sk), 2822 refcount_read(&sk->sk_refcnt), sk, 2823 jiffies_to_clock_t(icsk->icsk_rto), 2824 jiffies_to_clock_t(icsk->icsk_ack.ato), 2825 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk), 2826 tcp_snd_cwnd(tp), 2827 state == TCP_LISTEN ? 2828 fastopenq->max_qlen : 2829 (tcp_in_initial_slowstart(tp) ? 
-1 : tp->snd_ssthresh)); 2830 } 2831 2832 static void get_timewait4_sock(const struct inet_timewait_sock *tw, 2833 struct seq_file *f, int i) 2834 { 2835 long delta = tw->tw_timer.expires - jiffies; 2836 __be32 dest, src; 2837 __u16 destp, srcp; 2838 2839 dest = tw->tw_daddr; 2840 src = tw->tw_rcv_saddr; 2841 destp = ntohs(tw->tw_dport); 2842 srcp = ntohs(tw->tw_sport); 2843 2844 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2845 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK", 2846 i, src, srcp, dest, destp, READ_ONCE(tw->tw_substate), 0, 0, 2847 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, 2848 refcount_read(&tw->tw_refcnt), tw); 2849 } 2850 2851 #define TMPSZ 150 2852 2853 static int tcp4_seq_show(struct seq_file *seq, void *v) 2854 { 2855 struct tcp_iter_state *st; 2856 struct sock *sk = v; 2857 2858 seq_setwidth(seq, TMPSZ - 1); 2859 if (v == SEQ_START_TOKEN) { 2860 seq_puts(seq, " sl local_address rem_address st tx_queue " 2861 "rx_queue tr tm->when retrnsmt uid timeout " 2862 "inode"); 2863 goto out; 2864 } 2865 st = seq->private; 2866 2867 if (sk->sk_state == TCP_TIME_WAIT) 2868 get_timewait4_sock(v, seq, st->num); 2869 else if (sk->sk_state == TCP_NEW_SYN_RECV) 2870 get_openreq4(v, seq, st->num); 2871 else 2872 get_tcp4_sock(v, seq, st->num); 2873 out: 2874 seq_pad(seq, '\n'); 2875 return 0; 2876 } 2877 2878 #ifdef CONFIG_BPF_SYSCALL 2879 union bpf_tcp_iter_batch_item { 2880 struct sock *sk; 2881 __u64 cookie; 2882 }; 2883 2884 struct bpf_tcp_iter_state { 2885 struct tcp_iter_state state; 2886 unsigned int cur_sk; 2887 unsigned int end_sk; 2888 unsigned int max_sk; 2889 union bpf_tcp_iter_batch_item *batch; 2890 }; 2891 2892 struct bpf_iter__tcp { 2893 __bpf_md_ptr(struct bpf_iter_meta *, meta); 2894 __bpf_md_ptr(struct sock_common *, sk_common); 2895 uid_t uid __aligned(8); 2896 }; 2897 2898 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, 2899 struct sock_common *sk_common, uid_t uid) 2900 { 2901 struct bpf_iter__tcp ctx; 2902 2903 meta->seq_num--; /* skip SEQ_START_TOKEN */ 2904 ctx.meta = meta; 2905 ctx.sk_common = sk_common; 2906 ctx.uid = uid; 2907 return bpf_iter_run_prog(prog, &ctx); 2908 } 2909 2910 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter) 2911 { 2912 union bpf_tcp_iter_batch_item *item; 2913 unsigned int cur_sk = iter->cur_sk; 2914 __u64 cookie; 2915 2916 /* Remember the cookies of the sockets we haven't seen yet, so we can 2917 * pick up where we left off next time around. 
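 * Cookies (sock_gen_cookie()) are remembered rather than pointers because the
 * bucket lock is dropped between batches, so the sockets themselves may be
 * unhashed or freed before the iterator resumes.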
2918 */ 2919 while (cur_sk < iter->end_sk) { 2920 item = &iter->batch[cur_sk++]; 2921 cookie = sock_gen_cookie(item->sk); 2922 sock_gen_put(item->sk); 2923 item->cookie = cookie; 2924 } 2925 } 2926 2927 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter, 2928 unsigned int new_batch_sz, gfp_t flags) 2929 { 2930 union bpf_tcp_iter_batch_item *new_batch; 2931 2932 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, 2933 flags | __GFP_NOWARN); 2934 if (!new_batch) 2935 return -ENOMEM; 2936 2937 memcpy(new_batch, iter->batch, sizeof(*iter->batch) * iter->end_sk); 2938 kvfree(iter->batch); 2939 iter->batch = new_batch; 2940 iter->max_sk = new_batch_sz; 2941 2942 return 0; 2943 } 2944 2945 static struct sock *bpf_iter_tcp_resume_bucket(struct sock *first_sk, 2946 union bpf_tcp_iter_batch_item *cookies, 2947 int n_cookies) 2948 { 2949 struct hlist_nulls_node *node; 2950 struct sock *sk; 2951 int i; 2952 2953 for (i = 0; i < n_cookies; i++) { 2954 sk = first_sk; 2955 sk_nulls_for_each_from(sk, node) 2956 if (cookies[i].cookie == atomic64_read(&sk->sk_cookie)) 2957 return sk; 2958 } 2959 2960 return NULL; 2961 } 2962 2963 static struct sock *bpf_iter_tcp_resume_listening(struct seq_file *seq) 2964 { 2965 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2966 struct bpf_tcp_iter_state *iter = seq->private; 2967 struct tcp_iter_state *st = &iter->state; 2968 unsigned int find_cookie = iter->cur_sk; 2969 unsigned int end_cookie = iter->end_sk; 2970 int resume_bucket = st->bucket; 2971 struct sock *sk; 2972 2973 if (end_cookie && find_cookie == end_cookie) 2974 ++st->bucket; 2975 2976 sk = listening_get_first(seq); 2977 iter->cur_sk = 0; 2978 iter->end_sk = 0; 2979 2980 if (sk && st->bucket == resume_bucket && end_cookie) { 2981 sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie], 2982 end_cookie - find_cookie); 2983 if (!sk) { 2984 spin_unlock(&hinfo->lhash2[st->bucket].lock); 2985 ++st->bucket; 2986 sk = listening_get_first(seq); 2987 } 2988 } 2989 2990 return sk; 2991 } 2992 2993 static struct sock *bpf_iter_tcp_resume_established(struct seq_file *seq) 2994 { 2995 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2996 struct bpf_tcp_iter_state *iter = seq->private; 2997 struct tcp_iter_state *st = &iter->state; 2998 unsigned int find_cookie = iter->cur_sk; 2999 unsigned int end_cookie = iter->end_sk; 3000 int resume_bucket = st->bucket; 3001 struct sock *sk; 3002 3003 if (end_cookie && find_cookie == end_cookie) 3004 ++st->bucket; 3005 3006 sk = established_get_first(seq); 3007 iter->cur_sk = 0; 3008 iter->end_sk = 0; 3009 3010 if (sk && st->bucket == resume_bucket && end_cookie) { 3011 sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie], 3012 end_cookie - find_cookie); 3013 if (!sk) { 3014 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 3015 ++st->bucket; 3016 sk = established_get_first(seq); 3017 } 3018 } 3019 3020 return sk; 3021 } 3022 3023 static struct sock *bpf_iter_tcp_resume(struct seq_file *seq) 3024 { 3025 struct bpf_tcp_iter_state *iter = seq->private; 3026 struct tcp_iter_state *st = &iter->state; 3027 struct sock *sk = NULL; 3028 3029 switch (st->state) { 3030 case TCP_SEQ_STATE_LISTENING: 3031 sk = bpf_iter_tcp_resume_listening(seq); 3032 if (sk) 3033 break; 3034 st->bucket = 0; 3035 st->state = TCP_SEQ_STATE_ESTABLISHED; 3036 fallthrough; 3037 case TCP_SEQ_STATE_ESTABLISHED: 3038 sk = bpf_iter_tcp_resume_established(seq); 3039 break; 3040 } 3041 3042 return sk; 3043 } 3044 3045 
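/* Fill iter->batch with references to the sockets of the current bucket (whose
 * lock is already held by the caller via bpf_iter_tcp_resume()), starting at
 * *start_sk. The return value is how many sockets in the bucket matched; if not
 * all of them fit, *start_sk is left at the first one that did not fit so the
 * caller can grow the batch and retry.
 */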
static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq, 3046 struct sock **start_sk) 3047 { 3048 struct bpf_tcp_iter_state *iter = seq->private; 3049 struct hlist_nulls_node *node; 3050 unsigned int expected = 1; 3051 struct sock *sk; 3052 3053 sock_hold(*start_sk); 3054 iter->batch[iter->end_sk++].sk = *start_sk; 3055 3056 sk = sk_nulls_next(*start_sk); 3057 *start_sk = NULL; 3058 sk_nulls_for_each_from(sk, node) { 3059 if (seq_sk_match(seq, sk)) { 3060 if (iter->end_sk < iter->max_sk) { 3061 sock_hold(sk); 3062 iter->batch[iter->end_sk++].sk = sk; 3063 } else if (!*start_sk) { 3064 /* Remember where we left off. */ 3065 *start_sk = sk; 3066 } 3067 expected++; 3068 } 3069 } 3070 3071 return expected; 3072 } 3073 3074 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq, 3075 struct sock **start_sk) 3076 { 3077 struct bpf_tcp_iter_state *iter = seq->private; 3078 struct hlist_nulls_node *node; 3079 unsigned int expected = 1; 3080 struct sock *sk; 3081 3082 sock_hold(*start_sk); 3083 iter->batch[iter->end_sk++].sk = *start_sk; 3084 3085 sk = sk_nulls_next(*start_sk); 3086 *start_sk = NULL; 3087 sk_nulls_for_each_from(sk, node) { 3088 if (seq_sk_match(seq, sk)) { 3089 if (iter->end_sk < iter->max_sk) { 3090 sock_hold(sk); 3091 iter->batch[iter->end_sk++].sk = sk; 3092 } else if (!*start_sk) { 3093 /* Remember where we left off. */ 3094 *start_sk = sk; 3095 } 3096 expected++; 3097 } 3098 } 3099 3100 return expected; 3101 } 3102 3103 static unsigned int bpf_iter_fill_batch(struct seq_file *seq, 3104 struct sock **start_sk) 3105 { 3106 struct bpf_tcp_iter_state *iter = seq->private; 3107 struct tcp_iter_state *st = &iter->state; 3108 3109 if (st->state == TCP_SEQ_STATE_LISTENING) 3110 return bpf_iter_tcp_listening_batch(seq, start_sk); 3111 else 3112 return bpf_iter_tcp_established_batch(seq, start_sk); 3113 } 3114 3115 static void bpf_iter_tcp_unlock_bucket(struct seq_file *seq) 3116 { 3117 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 3118 struct bpf_tcp_iter_state *iter = seq->private; 3119 struct tcp_iter_state *st = &iter->state; 3120 3121 if (st->state == TCP_SEQ_STATE_LISTENING) 3122 spin_unlock(&hinfo->lhash2[st->bucket].lock); 3123 else 3124 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 3125 } 3126 3127 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq) 3128 { 3129 struct bpf_tcp_iter_state *iter = seq->private; 3130 unsigned int expected; 3131 struct sock *sk; 3132 int err; 3133 3134 sk = bpf_iter_tcp_resume(seq); 3135 if (!sk) 3136 return NULL; /* Done */ 3137 3138 expected = bpf_iter_fill_batch(seq, &sk); 3139 if (likely(iter->end_sk == expected)) 3140 goto done; 3141 3142 /* Batch size was too small. */ 3143 bpf_iter_tcp_unlock_bucket(seq); 3144 bpf_iter_tcp_put_batch(iter); 3145 err = bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2, 3146 GFP_USER); 3147 if (err) 3148 return ERR_PTR(err); 3149 3150 sk = bpf_iter_tcp_resume(seq); 3151 if (!sk) 3152 return NULL; /* Done */ 3153 3154 expected = bpf_iter_fill_batch(seq, &sk); 3155 if (likely(iter->end_sk == expected)) 3156 goto done; 3157 3158 /* Batch size was still too small. Hold onto the lock while we try 3159 * again with a larger batch to make sure the current bucket's size 3160 * does not change in the meantime. 
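 * GFP_NOWAIT is used for this reallocation because the bucket lock is still
 * held, so the allocator must not sleep.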
3161 */ 3162 err = bpf_iter_tcp_realloc_batch(iter, expected, GFP_NOWAIT); 3163 if (err) { 3164 bpf_iter_tcp_unlock_bucket(seq); 3165 return ERR_PTR(err); 3166 } 3167 3168 expected = bpf_iter_fill_batch(seq, &sk); 3169 WARN_ON_ONCE(iter->end_sk != expected); 3170 done: 3171 bpf_iter_tcp_unlock_bucket(seq); 3172 return iter->batch[0].sk; 3173 } 3174 3175 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos) 3176 { 3177 /* bpf iter does not support lseek, so it always 3178 * continues from where it was stop()-ped. 3179 */ 3180 if (*pos) 3181 return bpf_iter_tcp_batch(seq); 3182 3183 return SEQ_START_TOKEN; 3184 } 3185 3186 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3187 { 3188 struct bpf_tcp_iter_state *iter = seq->private; 3189 struct tcp_iter_state *st = &iter->state; 3190 struct sock *sk; 3191 3192 /* Whenever seq_next() is called, the iter->cur_sk is 3193 * done with seq_show(), so advance to the next sk in 3194 * the batch. 3195 */ 3196 if (iter->cur_sk < iter->end_sk) { 3197 /* Keeping st->num consistent in tcp_iter_state. 3198 * bpf_iter_tcp does not use st->num. 3199 * meta.seq_num is used instead. 3200 */ 3201 st->num++; 3202 sock_gen_put(iter->batch[iter->cur_sk++].sk); 3203 } 3204 3205 if (iter->cur_sk < iter->end_sk) 3206 sk = iter->batch[iter->cur_sk].sk; 3207 else 3208 sk = bpf_iter_tcp_batch(seq); 3209 3210 ++*pos; 3211 /* Keeping st->last_pos consistent in tcp_iter_state. 3212 * bpf iter does not do lseek, so st->last_pos always equals *pos. 3213 */ 3214 st->last_pos = *pos; 3215 return sk; 3216 } 3217 3218 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v) 3219 { 3220 struct bpf_iter_meta meta; 3221 struct bpf_prog *prog; 3222 struct sock *sk = v; 3223 uid_t uid; 3224 int ret; 3225 3226 if (v == SEQ_START_TOKEN) 3227 return 0; 3228 3229 if (sk_fullsock(sk)) 3230 lock_sock(sk); 3231 3232 if (unlikely(sk_unhashed(sk))) { 3233 ret = SEQ_SKIP; 3234 goto unlock; 3235 } 3236 3237 if (sk->sk_state == TCP_TIME_WAIT) { 3238 uid = 0; 3239 } else if (sk->sk_state == TCP_NEW_SYN_RECV) { 3240 const struct request_sock *req = v; 3241 3242 uid = from_kuid_munged(seq_user_ns(seq), 3243 sk_uid(req->rsk_listener)); 3244 } else { 3245 uid = from_kuid_munged(seq_user_ns(seq), sk_uid(sk)); 3246 } 3247 3248 meta.seq = seq; 3249 prog = bpf_iter_get_info(&meta, false); 3250 ret = tcp_prog_seq_show(prog, &meta, v, uid); 3251 3252 unlock: 3253 if (sk_fullsock(sk)) 3254 release_sock(sk); 3255 return ret; 3256 3257 } 3258 3259 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v) 3260 { 3261 struct bpf_tcp_iter_state *iter = seq->private; 3262 struct bpf_iter_meta meta; 3263 struct bpf_prog *prog; 3264 3265 if (!v) { 3266 meta.seq = seq; 3267 prog = bpf_iter_get_info(&meta, true); 3268 if (prog) 3269 (void)tcp_prog_seq_show(prog, &meta, v, 0); 3270 } 3271 3272 if (iter->cur_sk < iter->end_sk) 3273 bpf_iter_tcp_put_batch(iter); 3274 } 3275 3276 static const struct seq_operations bpf_iter_tcp_seq_ops = { 3277 .show = bpf_iter_tcp_seq_show, 3278 .start = bpf_iter_tcp_seq_start, 3279 .next = bpf_iter_tcp_seq_next, 3280 .stop = bpf_iter_tcp_seq_stop, 3281 }; 3282 #endif 3283 static unsigned short seq_file_family(const struct seq_file *seq) 3284 { 3285 const struct tcp_seq_afinfo *afinfo; 3286 3287 #ifdef CONFIG_BPF_SYSCALL 3288 /* Iterated from bpf_iter. Let the bpf prog filter instead.
*/ 3289 if (seq->op == &bpf_iter_tcp_seq_ops) 3290 return AF_UNSPEC; 3291 #endif 3292 3293 /* Iterated from proc fs */ 3294 afinfo = pde_data(file_inode(seq->file)); 3295 return afinfo->family; 3296 } 3297 3298 static const struct seq_operations tcp4_seq_ops = { 3299 .show = tcp4_seq_show, 3300 .start = tcp_seq_start, 3301 .next = tcp_seq_next, 3302 .stop = tcp_seq_stop, 3303 }; 3304 3305 static struct tcp_seq_afinfo tcp4_seq_afinfo = { 3306 .family = AF_INET, 3307 }; 3308 3309 static int __net_init tcp4_proc_init_net(struct net *net) 3310 { 3311 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops, 3312 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo)) 3313 return -ENOMEM; 3314 return 0; 3315 } 3316 3317 static void __net_exit tcp4_proc_exit_net(struct net *net) 3318 { 3319 remove_proc_entry("tcp", net->proc_net); 3320 } 3321 3322 static struct pernet_operations tcp4_net_ops = { 3323 .init = tcp4_proc_init_net, 3324 .exit = tcp4_proc_exit_net, 3325 }; 3326 3327 int __init tcp4_proc_init(void) 3328 { 3329 return register_pernet_subsys(&tcp4_net_ops); 3330 } 3331 3332 void tcp4_proc_exit(void) 3333 { 3334 unregister_pernet_subsys(&tcp4_net_ops); 3335 } 3336 #endif /* CONFIG_PROC_FS */ 3337 3338 struct proto tcp_prot = { 3339 .name = "TCP", 3340 .owner = THIS_MODULE, 3341 .close = tcp_close, 3342 .pre_connect = tcp_v4_pre_connect, 3343 .connect = tcp_v4_connect, 3344 .disconnect = tcp_disconnect, 3345 .accept = inet_csk_accept, 3346 .ioctl = tcp_ioctl, 3347 .init = tcp_v4_init_sock, 3348 .destroy = tcp_v4_destroy_sock, 3349 .shutdown = tcp_shutdown, 3350 .setsockopt = tcp_setsockopt, 3351 .getsockopt = tcp_getsockopt, 3352 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt, 3353 .keepalive = tcp_set_keepalive, 3354 .recvmsg = tcp_recvmsg, 3355 .sendmsg = tcp_sendmsg, 3356 .splice_eof = tcp_splice_eof, 3357 .backlog_rcv = tcp_v4_do_rcv, 3358 .release_cb = tcp_release_cb, 3359 .hash = inet_hash, 3360 .unhash = inet_unhash, 3361 .get_port = inet_csk_get_port, 3362 .put_port = inet_put_port, 3363 #ifdef CONFIG_BPF_SYSCALL 3364 .psock_update_sk_prot = tcp_bpf_update_proto, 3365 #endif 3366 .enter_memory_pressure = tcp_enter_memory_pressure, 3367 .leave_memory_pressure = tcp_leave_memory_pressure, 3368 .stream_memory_free = tcp_stream_memory_free, 3369 .sockets_allocated = &tcp_sockets_allocated, 3370 3371 .memory_allocated = &net_aligned_data.tcp_memory_allocated, 3372 .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, 3373 3374 .memory_pressure = &tcp_memory_pressure, 3375 .sysctl_mem = sysctl_tcp_mem, 3376 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), 3377 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), 3378 .max_header = MAX_TCP_HEADER, 3379 .obj_size = sizeof(struct tcp_sock), 3380 .freeptr_offset = offsetof(struct tcp_sock, 3381 inet_conn.icsk_inet.sk.sk_freeptr), 3382 .slab_flags = SLAB_TYPESAFE_BY_RCU, 3383 .twsk_prot = &tcp_timewait_sock_ops, 3384 .rsk_prot = &tcp_request_sock_ops, 3385 .h.hashinfo = NULL, 3386 .no_autobind = true, 3387 .diag_destroy = tcp_abort, 3388 }; 3389 EXPORT_SYMBOL(tcp_prot); 3390 3391 static void __net_exit tcp_sk_exit(struct net *net) 3392 { 3393 if (net->ipv4.tcp_congestion_control) 3394 bpf_module_put(net->ipv4.tcp_congestion_control, 3395 net->ipv4.tcp_congestion_control->owner); 3396 } 3397 3398 static void __net_init tcp_set_hashinfo(struct net *net) 3399 { 3400 struct inet_hashinfo *hinfo; 3401 unsigned int ehash_entries; 3402 struct net *old_net; 3403 3404 if (net_eq(net, &init_net)) 3405 goto fallback; 
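/* A newly created netns may get its own, smaller ehash, sized from the creating
 * netns's sysctl_tcp_child_ehash_entries (rounded up to a power of two below);
 * init_net, and any netns without that sysctl set, keeps using the global
 * tcp_hashinfo.
 */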
3406 3407 old_net = current->nsproxy->net_ns; 3408 ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries); 3409 if (!ehash_entries) 3410 goto fallback; 3411 3412 ehash_entries = roundup_pow_of_two(ehash_entries); 3413 hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries); 3414 if (!hinfo) { 3415 pr_warn("Failed to allocate TCP ehash (entries: %u) " 3416 "for a netns, fallback to the global one\n", 3417 ehash_entries); 3418 fallback: 3419 hinfo = &tcp_hashinfo; 3420 ehash_entries = tcp_hashinfo.ehash_mask + 1; 3421 } 3422 3423 net->ipv4.tcp_death_row.hashinfo = hinfo; 3424 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2; 3425 net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128); 3426 } 3427 3428 static int __net_init tcp_sk_init(struct net *net) 3429 { 3430 net->ipv4.sysctl_tcp_ecn = TCP_ECN_IN_ECN_OUT_NOECN; 3431 net->ipv4.sysctl_tcp_ecn_option = TCP_ACCECN_OPTION_FULL; 3432 net->ipv4.sysctl_tcp_ecn_option_beacon = TCP_ACCECN_OPTION_BEACON; 3433 net->ipv4.sysctl_tcp_ecn_fallback = 1; 3434 3435 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; 3436 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS; 3437 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; 3438 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL; 3439 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS; 3440 3441 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME; 3442 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES; 3443 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL; 3444 3445 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES; 3446 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES; 3447 net->ipv4.sysctl_tcp_syncookies = 1; 3448 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH; 3449 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1; 3450 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2; 3451 net->ipv4.sysctl_tcp_orphan_retries = 0; 3452 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; 3453 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; 3454 net->ipv4.sysctl_tcp_tw_reuse = 2; 3455 net->ipv4.sysctl_tcp_tw_reuse_delay = 1 * MSEC_PER_SEC; 3456 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1; 3457 3458 refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1); 3459 tcp_set_hashinfo(net); 3460 3461 net->ipv4.sysctl_tcp_sack = 1; 3462 net->ipv4.sysctl_tcp_window_scaling = 1; 3463 net->ipv4.sysctl_tcp_timestamps = 1; 3464 net->ipv4.sysctl_tcp_early_retrans = 3; 3465 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION; 3466 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */ 3467 net->ipv4.sysctl_tcp_retrans_collapse = 1; 3468 net->ipv4.sysctl_tcp_max_reordering = 300; 3469 net->ipv4.sysctl_tcp_dsack = 1; 3470 net->ipv4.sysctl_tcp_app_win = 31; 3471 net->ipv4.sysctl_tcp_adv_win_scale = 1; 3472 net->ipv4.sysctl_tcp_frto = 2; 3473 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1; 3474 net->ipv4.sysctl_tcp_rcvbuf_low_rtt = USEC_PER_MSEC; 3475 /* This limits the percentage of the congestion window which we 3476 * will allow a single TSO frame to consume. Building TSO frames 3477 * which are too large can cause TCP streams to be bursty. 3478 */ 3479 net->ipv4.sysctl_tcp_tso_win_divisor = 3; 3480 /* Default TSQ limit of 4 MB */ 3481 net->ipv4.sysctl_tcp_limit_output_bytes = 4 << 20; 3482 3483 /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. 
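 * (A limit of INT_MAX means the per-netns cap imposes no practical restriction.)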
*/ 3484 net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX; 3485 3486 net->ipv4.sysctl_tcp_min_tso_segs = 2; 3487 net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */ 3488 net->ipv4.sysctl_tcp_min_rtt_wlen = 300; 3489 net->ipv4.sysctl_tcp_autocorking = 1; 3490 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; 3491 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; 3492 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120; 3493 if (net != &init_net) { 3494 memcpy(net->ipv4.sysctl_tcp_rmem, 3495 init_net.ipv4.sysctl_tcp_rmem, 3496 sizeof(init_net.ipv4.sysctl_tcp_rmem)); 3497 memcpy(net->ipv4.sysctl_tcp_wmem, 3498 init_net.ipv4.sysctl_tcp_wmem, 3499 sizeof(init_net.ipv4.sysctl_tcp_wmem)); 3500 } 3501 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC; 3502 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 10 * NSEC_PER_USEC; 3503 net->ipv4.sysctl_tcp_comp_sack_nr = 44; 3504 net->ipv4.sysctl_tcp_comp_sack_rtt_percent = 33; 3505 net->ipv4.sysctl_tcp_backlog_ack_defer = 1; 3506 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; 3507 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0; 3508 atomic_set(&net->ipv4.tfo_active_disable_times, 0); 3509 3510 /* Set default values for PLB */ 3511 net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */ 3512 net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3; 3513 net->ipv4.sysctl_tcp_plb_rehash_rounds = 12; 3514 net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60; 3515 /* Default congestion threshold for PLB to mark a round is 50% */ 3516 net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2; 3517 3518 /* Reno is always built in */ 3519 if (!net_eq(net, &init_net) && 3520 bpf_try_module_get(init_net.ipv4.tcp_congestion_control, 3521 init_net.ipv4.tcp_congestion_control->owner)) 3522 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control; 3523 else 3524 net->ipv4.tcp_congestion_control = &tcp_reno; 3525 3526 net->ipv4.sysctl_tcp_syn_linear_timeouts = 4; 3527 net->ipv4.sysctl_tcp_shrink_window = 0; 3528 3529 net->ipv4.sysctl_tcp_pingpong_thresh = 1; 3530 net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN); 3531 net->ipv4.sysctl_tcp_rto_max_ms = TCP_RTO_MAX_SEC * MSEC_PER_SEC; 3532 3533 return 0; 3534 } 3535 3536 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) 3537 { 3538 struct net *net; 3539 3540 /* make sure concurrent calls to tcp_sk_exit_batch from net_cleanup_work 3541 * and failed setup_net error unwinding path are serialized. 3542 * 3543 * tcp_twsk_purge() handles twsk in any dead netns, not just those in 3544 * net_exit_list, the thread that dismantles a particular twsk must 3545 * do so without other thread progressing to refcount_dec_and_test() of 3546 * tcp_death_row.tw_refcount. 
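 * Holding tcp_exit_batch_mutex across both the purge and the refcount check
 * below keeps the WARN_ON_ONCE() from racing with a purge performed on behalf
 * of another exiting netns.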
3547 */ 3548 mutex_lock(&tcp_exit_batch_mutex); 3549 3550 tcp_twsk_purge(net_exit_list); 3551 3552 list_for_each_entry(net, net_exit_list, exit_list) { 3553 inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo); 3554 WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount)); 3555 tcp_fastopen_ctx_destroy(net); 3556 } 3557 3558 mutex_unlock(&tcp_exit_batch_mutex); 3559 } 3560 3561 static struct pernet_operations __net_initdata tcp_sk_ops = { 3562 .init = tcp_sk_init, 3563 .exit = tcp_sk_exit, 3564 .exit_batch = tcp_sk_exit_batch, 3565 }; 3566 3567 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3568 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta, 3569 struct sock_common *sk_common, uid_t uid) 3570 3571 #define INIT_BATCH_SZ 16 3572 3573 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux) 3574 { 3575 struct bpf_tcp_iter_state *iter = priv_data; 3576 int err; 3577 3578 err = bpf_iter_init_seq_net(priv_data, aux); 3579 if (err) 3580 return err; 3581 3582 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ, GFP_USER); 3583 if (err) { 3584 bpf_iter_fini_seq_net(priv_data); 3585 return err; 3586 } 3587 3588 return 0; 3589 } 3590 3591 static void bpf_iter_fini_tcp(void *priv_data) 3592 { 3593 struct bpf_tcp_iter_state *iter = priv_data; 3594 3595 bpf_iter_fini_seq_net(priv_data); 3596 kvfree(iter->batch); 3597 } 3598 3599 static const struct bpf_iter_seq_info tcp_seq_info = { 3600 .seq_ops = &bpf_iter_tcp_seq_ops, 3601 .init_seq_private = bpf_iter_init_tcp, 3602 .fini_seq_private = bpf_iter_fini_tcp, 3603 .seq_priv_size = sizeof(struct bpf_tcp_iter_state), 3604 }; 3605 3606 static const struct bpf_func_proto * 3607 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id, 3608 const struct bpf_prog *prog) 3609 { 3610 switch (func_id) { 3611 case BPF_FUNC_setsockopt: 3612 return &bpf_sk_setsockopt_proto; 3613 case BPF_FUNC_getsockopt: 3614 return &bpf_sk_getsockopt_proto; 3615 default: 3616 return NULL; 3617 } 3618 } 3619 3620 static struct bpf_iter_reg tcp_reg_info = { 3621 .target = "tcp", 3622 .ctx_arg_info_size = 1, 3623 .ctx_arg_info = { 3624 { offsetof(struct bpf_iter__tcp, sk_common), 3625 PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED }, 3626 }, 3627 .get_func_proto = bpf_iter_tcp_get_func_proto, 3628 .seq_info = &tcp_seq_info, 3629 }; 3630 3631 static void __init bpf_iter_register(void) 3632 { 3633 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON]; 3634 if (bpf_iter_reg_target(&tcp_reg_info)) 3635 pr_warn("Warning: could not register bpf iterator tcp\n"); 3636 } 3637 3638 #endif 3639 3640 void __init tcp_v4_init(void) 3641 { 3642 int cpu, res; 3643 3644 for_each_possible_cpu(cpu) { 3645 struct sock *sk; 3646 3647 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW, 3648 IPPROTO_TCP, &init_net); 3649 if (res) 3650 panic("Failed to create the TCP control socket.\n"); 3651 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); 3652 3653 /* Please enforce IP_DF and IPID==0 for RST and 3654 * ACK sent in SYN-RECV and TIME-WAIT state. 3655 */ 3656 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO; 3657 3658 sk->sk_clockid = CLOCK_MONOTONIC; 3659 3660 per_cpu(ipv4_tcp_sk.sock, cpu) = sk; 3661 } 3662 if (register_pernet_subsys(&tcp_sk_ops)) 3663 panic("Failed to create the TCP control socket.\n"); 3664 3665 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3666 bpf_iter_register(); 3667 #endif 3668 } 3669