1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * INET An implementation of the TCP/IP protocol suite for the LINUX 4 * operating system. INET is implemented using the BSD Socket 5 * interface as the means of communication with the user level. 6 * 7 * Implementation of the Transmission Control Protocol(TCP). 8 * 9 * IPv4 specific functions 10 * 11 * code split from: 12 * linux/ipv4/tcp.c 13 * linux/ipv4/tcp_input.c 14 * linux/ipv4/tcp_output.c 15 * 16 * See tcp.c for author information 17 */ 18 19 /* 20 * Changes: 21 * David S. Miller : New socket lookup architecture. 22 * This code is dedicated to John Dyson. 23 * David S. Miller : Change semantics of established hash, 24 * half is devoted to TIME_WAIT sockets 25 * and the rest go in the other half. 26 * Andi Kleen : Add support for syncookies and fixed 27 * some bugs: ip options weren't passed to 28 * the TCP layer, missed a check for an 29 * ACK bit. 30 * Andi Kleen : Implemented fast path mtu discovery. 31 * Fixed many serious bugs in the 32 * request_sock handling and moved 33 * most of it into the af independent code. 34 * Added tail drop and some other bugfixes. 35 * Added new listen semantics. 36 * Mike McLagan : Routing by source 37 * Juan Jose Ciarlante: ip_dynaddr bits 38 * Andi Kleen: various fixes. 39 * Vitaly E. Lavrov : Transparent proxy revived after year 40 * coma. 41 * Andi Kleen : Fix new listen. 42 * Andi Kleen : Fix accept error reporting. 43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which 44 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind 45 * a single port at the same time. 
46 */ 47 48 #define pr_fmt(fmt) "TCP: " fmt 49 50 #include <linux/bottom_half.h> 51 #include <linux/types.h> 52 #include <linux/fcntl.h> 53 #include <linux/module.h> 54 #include <linux/random.h> 55 #include <linux/cache.h> 56 #include <linux/jhash.h> 57 #include <linux/init.h> 58 #include <linux/times.h> 59 #include <linux/slab.h> 60 #include <linux/sched.h> 61 62 #include <net/net_namespace.h> 63 #include <net/icmp.h> 64 #include <net/inet_hashtables.h> 65 #include <net/tcp.h> 66 #include <net/transp_v6.h> 67 #include <net/ipv6.h> 68 #include <net/inet_common.h> 69 #include <net/timewait_sock.h> 70 #include <net/xfrm.h> 71 #include <net/secure_seq.h> 72 #include <net/busy_poll.h> 73 #include <net/rstreason.h> 74 75 #include <linux/inet.h> 76 #include <linux/ipv6.h> 77 #include <linux/stddef.h> 78 #include <linux/proc_fs.h> 79 #include <linux/seq_file.h> 80 #include <linux/inetdevice.h> 81 #include <linux/btf_ids.h> 82 #include <linux/skbuff_ref.h> 83 84 #include <crypto/hash.h> 85 #include <linux/scatterlist.h> 86 87 #include <trace/events/tcp.h> 88 89 #ifdef CONFIG_TCP_MD5SIG 90 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, 91 __be32 daddr, __be32 saddr, const struct tcphdr *th); 92 #endif 93 94 struct inet_hashinfo tcp_hashinfo; 95 96 static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = { 97 .bh_lock = INIT_LOCAL_LOCK(bh_lock), 98 }; 99 100 static DEFINE_MUTEX(tcp_exit_batch_mutex); 101 102 static u32 tcp_v4_init_seq(const struct sk_buff *skb) 103 { 104 return secure_tcp_seq(ip_hdr(skb)->daddr, 105 ip_hdr(skb)->saddr, 106 tcp_hdr(skb)->dest, 107 tcp_hdr(skb)->source); 108 } 109 110 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb) 111 { 112 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr); 113 } 114 115 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) 116 { 117 int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse); 118 const struct 
inet_timewait_sock *tw = inet_twsk(sktw); 119 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw); 120 struct tcp_sock *tp = tcp_sk(sk); 121 int ts_recent_stamp; 122 u32 reuse_thresh; 123 124 if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2) 125 reuse = 0; 126 127 if (reuse == 2) { 128 /* Still does not detect *everything* that goes through 129 * lo, since we require a loopback src or dst address 130 * or direct binding to 'lo' interface. 131 */ 132 bool loopback = false; 133 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX) 134 loopback = true; 135 #if IS_ENABLED(CONFIG_IPV6) 136 if (tw->tw_family == AF_INET6) { 137 if (ipv6_addr_loopback(&tw->tw_v6_daddr) || 138 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) || 139 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) || 140 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr)) 141 loopback = true; 142 } else 143 #endif 144 { 145 if (ipv4_is_loopback(tw->tw_daddr) || 146 ipv4_is_loopback(tw->tw_rcv_saddr)) 147 loopback = true; 148 } 149 if (!loopback) 150 reuse = 0; 151 } 152 153 /* With PAWS, it is safe from the viewpoint 154 of data integrity. Even without PAWS it is safe provided sequence 155 spaces do not overlap i.e. at data rates <= 80Mbit/sec. 156 157 Actually, the idea is close to VJ's one, only timestamp cache is 158 held not per host, but per port pair and TW bucket is used as state 159 holder. 160 161 If TW bucket has been already destroyed we fall back to VJ's scheme 162 and use initial timestamp retrieved from peer table. 163 */ 164 ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp); 165 reuse_thresh = READ_ONCE(tw->tw_entry_stamp) + 166 READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse_delay); 167 if (ts_recent_stamp && 168 (!twp || (reuse && time_after32(tcp_clock_ms(), reuse_thresh)))) { 169 /* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk 170 * and releasing the bucket lock. 
171 */ 172 if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt))) 173 return 0; 174 175 /* In case of repair and re-using TIME-WAIT sockets we still 176 * want to be sure that it is safe as above but honor the 177 * sequence numbers and time stamps set as part of the repair 178 * process. 179 * 180 * Without this check re-using a TIME-WAIT socket with TCP 181 * repair would accumulate a -1 on the repair assigned 182 * sequence number. The first time it is reused the sequence 183 * is -1, the second time -2, etc. This fixes that issue 184 * without appearing to create any others. 185 */ 186 if (likely(!tp->repair)) { 187 u32 seq = tcptw->tw_snd_nxt + 65535 + 2; 188 189 if (!seq) 190 seq = 1; 191 WRITE_ONCE(tp->write_seq, seq); 192 tp->rx_opt.ts_recent = READ_ONCE(tcptw->tw_ts_recent); 193 tp->rx_opt.ts_recent_stamp = ts_recent_stamp; 194 } 195 196 return 1; 197 } 198 199 return 0; 200 } 201 EXPORT_IPV6_MOD_GPL(tcp_twsk_unique); 202 203 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr, 204 int addr_len) 205 { 206 /* This check is replicated from tcp_v4_connect() and intended to 207 * prevent BPF program called below from accessing bytes that are out 208 * of the bound specified by user in addr_len. 209 */ 210 if (addr_len < sizeof(struct sockaddr_in)) 211 return -EINVAL; 212 213 sock_owned_by_me(sk); 214 215 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len); 216 } 217 218 /* This will initiate an outgoing connection. 
*/ 219 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) 220 { 221 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; 222 struct inet_timewait_death_row *tcp_death_row; 223 struct inet_sock *inet = inet_sk(sk); 224 struct tcp_sock *tp = tcp_sk(sk); 225 struct ip_options_rcu *inet_opt; 226 struct net *net = sock_net(sk); 227 __be16 orig_sport, orig_dport; 228 __be32 daddr, nexthop; 229 struct flowi4 *fl4; 230 struct rtable *rt; 231 int err; 232 233 if (addr_len < sizeof(struct sockaddr_in)) 234 return -EINVAL; 235 236 if (usin->sin_family != AF_INET) 237 return -EAFNOSUPPORT; 238 239 nexthop = daddr = usin->sin_addr.s_addr; 240 inet_opt = rcu_dereference_protected(inet->inet_opt, 241 lockdep_sock_is_held(sk)); 242 if (inet_opt && inet_opt->opt.srr) { 243 if (!daddr) 244 return -EINVAL; 245 nexthop = inet_opt->opt.faddr; 246 } 247 248 orig_sport = inet->inet_sport; 249 orig_dport = usin->sin_port; 250 fl4 = &inet->cork.fl.u.ip4; 251 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr, 252 sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport, 253 orig_dport, sk); 254 if (IS_ERR(rt)) { 255 err = PTR_ERR(rt); 256 if (err == -ENETUNREACH) 257 IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); 258 return err; 259 } 260 261 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { 262 ip_rt_put(rt); 263 return -ENETUNREACH; 264 } 265 266 if (!inet_opt || !inet_opt->opt.srr) 267 daddr = fl4->daddr; 268 269 tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; 270 271 if (!inet->inet_saddr) { 272 err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET); 273 if (err) { 274 ip_rt_put(rt); 275 return err; 276 } 277 } else { 278 sk_rcv_saddr_set(sk, inet->inet_saddr); 279 } 280 281 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) { 282 /* Reset inherited state */ 283 tp->rx_opt.ts_recent = 0; 284 tp->rx_opt.ts_recent_stamp = 0; 285 if (likely(!tp->repair)) 286 WRITE_ONCE(tp->write_seq, 0); 287 } 288 289 inet->inet_dport = usin->sin_port; 290 
sk_daddr_set(sk, daddr); 291 292 inet_csk(sk)->icsk_ext_hdr_len = 0; 293 if (inet_opt) 294 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 295 296 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT; 297 298 /* Socket identity is still unknown (sport may be zero). 299 * However we set state to SYN-SENT and not releasing socket 300 * lock select source port, enter ourselves into the hash tables and 301 * complete initialization after this. 302 */ 303 tcp_set_state(sk, TCP_SYN_SENT); 304 err = inet_hash_connect(tcp_death_row, sk); 305 if (err) 306 goto failure; 307 308 sk_set_txhash(sk); 309 310 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport, 311 inet->inet_sport, inet->inet_dport, sk); 312 if (IS_ERR(rt)) { 313 err = PTR_ERR(rt); 314 rt = NULL; 315 goto failure; 316 } 317 tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst); 318 /* OK, now commit destination to socket. */ 319 sk->sk_gso_type = SKB_GSO_TCPV4; 320 sk_setup_caps(sk, &rt->dst); 321 rt = NULL; 322 323 if (likely(!tp->repair)) { 324 if (!tp->write_seq) 325 WRITE_ONCE(tp->write_seq, 326 secure_tcp_seq(inet->inet_saddr, 327 inet->inet_daddr, 328 inet->inet_sport, 329 usin->sin_port)); 330 WRITE_ONCE(tp->tsoffset, 331 secure_tcp_ts_off(net, inet->inet_saddr, 332 inet->inet_daddr)); 333 } 334 335 atomic_set(&inet->inet_id, get_random_u16()); 336 337 if (tcp_fastopen_defer_connect(sk, &err)) 338 return err; 339 if (err) 340 goto failure; 341 342 err = tcp_connect(sk); 343 344 if (err) 345 goto failure; 346 347 return 0; 348 349 failure: 350 /* 351 * This unhashes the socket and releases the local port, 352 * if necessary. 353 */ 354 tcp_set_state(sk, TCP_CLOSE); 355 inet_bhash2_reset_saddr(sk); 356 ip_rt_put(rt); 357 sk->sk_route_caps = 0; 358 inet->inet_dport = 0; 359 return err; 360 } 361 EXPORT_IPV6_MOD(tcp_v4_connect); 362 363 /* 364 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191. 
365 * It can be called through tcp_release_cb() if socket was owned by user 366 * at the time tcp_v4_err() was called to handle ICMP message. 367 */ 368 void tcp_v4_mtu_reduced(struct sock *sk) 369 { 370 struct inet_sock *inet = inet_sk(sk); 371 struct dst_entry *dst; 372 u32 mtu; 373 374 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) 375 return; 376 mtu = READ_ONCE(tcp_sk(sk)->mtu_info); 377 dst = inet_csk_update_pmtu(sk, mtu); 378 if (!dst) 379 return; 380 381 /* Something is about to be wrong... Remember soft error 382 * for the case, if this connection will not able to recover. 383 */ 384 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst)) 385 WRITE_ONCE(sk->sk_err_soft, EMSGSIZE); 386 387 mtu = dst_mtu(dst); 388 389 if (inet->pmtudisc != IP_PMTUDISC_DONT && 390 ip_sk_accept_pmtu(sk) && 391 inet_csk(sk)->icsk_pmtu_cookie > mtu) { 392 tcp_sync_mss(sk, mtu); 393 394 /* Resend the TCP packet because it's 395 * clear that the old packet has been 396 * dropped. This is the new "fast" path mtu 397 * discovery. 398 */ 399 tcp_simple_retransmit(sk); 400 } /* else let the usual retransmit timer handle it */ 401 } 402 EXPORT_IPV6_MOD(tcp_v4_mtu_reduced); 403 404 static void do_redirect(struct sk_buff *skb, struct sock *sk) 405 { 406 struct dst_entry *dst = __sk_dst_check(sk, 0); 407 408 if (dst) 409 dst->ops->redirect(dst, sk, skb); 410 } 411 412 413 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */ 414 void tcp_req_err(struct sock *sk, u32 seq, bool abort) 415 { 416 struct request_sock *req = inet_reqsk(sk); 417 struct net *net = sock_net(sk); 418 419 /* ICMPs are not backlogged, hence we cannot get 420 * an established socket here. 421 */ 422 if (seq != tcp_rsk(req)->snt_isn) { 423 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); 424 } else if (abort) { 425 /* 426 * Still in SYN_RECV, just remove it silently. 
427 * There is no good way to pass the error to the newly 428 * created socket, and POSIX does not want network 429 * errors returned from accept(). 430 */ 431 inet_csk_reqsk_queue_drop(req->rsk_listener, req); 432 tcp_listendrop(req->rsk_listener); 433 } 434 reqsk_put(req); 435 } 436 EXPORT_IPV6_MOD(tcp_req_err); 437 438 /* TCP-LD (RFC 6069) logic */ 439 void tcp_ld_RTO_revert(struct sock *sk, u32 seq) 440 { 441 struct inet_connection_sock *icsk = inet_csk(sk); 442 struct tcp_sock *tp = tcp_sk(sk); 443 struct sk_buff *skb; 444 s32 remaining; 445 u32 delta_us; 446 447 if (sock_owned_by_user(sk)) 448 return; 449 450 if (seq != tp->snd_una || !icsk->icsk_retransmits || 451 !icsk->icsk_backoff) 452 return; 453 454 skb = tcp_rtx_queue_head(sk); 455 if (WARN_ON_ONCE(!skb)) 456 return; 457 458 icsk->icsk_backoff--; 459 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT; 460 icsk->icsk_rto = inet_csk_rto_backoff(icsk, tcp_rto_max(sk)); 461 462 tcp_mstamp_refresh(tp); 463 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb)); 464 remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us); 465 466 if (remaining > 0) { 467 tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, remaining, false); 468 } else { 469 /* RTO revert clocked out retransmission. 470 * Will retransmit now. 471 */ 472 tcp_retransmit_timer(sk); 473 } 474 } 475 EXPORT_IPV6_MOD(tcp_ld_RTO_revert); 476 477 /* 478 * This routine is called by the ICMP module when it gets some 479 * sort of error condition. If err < 0 then the socket should 480 * be closed and the error returned to the user. If err > 0 481 * it's just the icmp type << 8 | icmp code. After adjustment 482 * header points to the first 8 bytes of the tcp header. We need 483 * to find the appropriate port. 484 * 485 * The locking strategy used here is very "optimistic". When 486 * someone else accesses the socket the ICMP is just dropped 487 * and for some paths there is no check at all. 
* A more general error queue to queue errors for later handling
 * is probably better.
 *
 * skb->data is expected to point at the IP header quoted inside the ICMP
 * message; the quoted TCP header is located right behind it using ihl.
 * @info is stored as the new MTU hint for ICMP_FRAG_NEEDED and is passed
 * on to ip_icmp_error() for sockets that are still connecting.
 *
 * Returns -ENOENT when no socket matches the quoted header, 0 otherwise.
 */

int tcp_v4_err(struct sk_buff *skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct net *net = dev_net_rcu(skb->dev);
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct request_sock *fastopen;
	struct tcp_sock *tp;
	u32 seq, snd_una;
	struct sock *sk;
	int err;

	/* The quoted header is a packet we sent, so its destination is the
	 * remote end and its source is our local end of the connection.
	 */
	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
				       iph->daddr, th->dest, iph->saddr,
				       ntohs(th->source), inet_iif(skb), 0);
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return -ENOENT;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		/* To increase the counter of ignored icmps for TCP-AO */
		tcp_ao_ignore_icmp(sk, AF_INET, type, code);
		inet_twsk_put(inet_twsk(sk));
		return 0;
	}
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		/* Request sockets are handled (and released) by
		 * tcp_req_err(); the third argument selects which ICMP
		 * types/codes abort the pending request.
		 */
		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
				     type == ICMP_TIME_EXCEEDED ||
				     (type == ICMP_DEST_UNREACH &&
				      (code == ICMP_NET_UNREACH ||
				       code == ICMP_HOST_UNREACH)));
		return 0;
	}

	if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) {
		sock_put(sk);
		return 0;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (static_branch_unlikely(&ip4_min_ttl)) {
		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
			goto out;
		}
	}

	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = rcu_dereference(tp->fastopen_rsk);
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	/* Except for listeners, the quoted sequence number must lie within
	 * [snd_una, snd_nxt]; anything else is counted and ignored.
	 */
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	/* Translate the ICMP type/code into an errno value in err, or
	 * finish handling right here for types that carry no socket error.
	 */
	switch (type) {
	case ICMP_REDIRECT:
		if (!sock_owned_by_user(sk))
			do_redirect(skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		/* Codes above NR_ICMP_UNREACH are not used to index
		 * icmp_err_convert[] below.
		 */
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs send out by Linux are always <576bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			WRITE_ONCE(tp->mtu_info, info);
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				/* Socket is busy: flag the MTU reduction as
				 * deferred and take a socket reference the
				 * first time the flag gets set.
				 */
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if this ICMP message allows revert of backoff.
		 * (see RFC 6069)
		 */
		if (!fastopen &&
		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
			tcp_ld_RTO_revert(sk, seq);
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket is
		 * already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);

		/* Hard failure when we own the socket, soft error otherwise. */
		if (!sock_owned_by_user(sk))
			tcp_done_with_error(sk, err);
		else
			WRITE_ONCE(sk->sk_err_soft, err);
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even this two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 *							--ANK (980905)
	 */

	if (!sock_owned_by_user(sk) &&
	    inet_test_bit(RECVERR, sk)) {
		WRITE_ONCE(sk->sk_err, err);
		sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		WRITE_ONCE(sk->sk_err_soft, err);
	}

out:
	/* Common exit for every path that took bh_lock_sock(): unlock and
	 * drop a socket reference (presumably the one obtained by the
	 * lookup at the top - confirm against __inet_lookup_established()).
	 */
	bh_unlock_sock(sk);
	sock_put(sk);
	return 0;
}

/* Prepare the TCP checksum field of @skb for the given address pair.
 *
 * th->check is seeded with the complement of tcp_v4_check() over skb->len
 * and the addresses (with a zero partial sum), and csum_start/csum_offset
 * are set to describe where the checksum field lives in the packet.
 */
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
	skb->csum_start = skb_transport_header(skb) - skb->head;
	skb->csum_offset = offsetof(struct tcphdr, check);
}

/* This routine computes an IPv4 TCP checksum. */
/* It uses the source and destination addresses stored in the socket. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_IPV6_MOD(tcp_v4_send_check);

/* Number of 32-bit words available for TCP options in a reply header. */
#define REPLY_OPTIONS_LEN      (MAX_TCP_OPTION_SPACE / sizeof(__be32))

/* Add a TCP-AO option to an outgoing reset.
 *
 * @aoh is the AO option header of the incoming segment, @reply the TCP
 * header of the reset being built and @reply_options its option area.
 * On success the AO option word is written to reply_options[0], the hash
 * is written starting at reply_options[1], and both arg->iov[0].iov_len
 * and reply->doff are grown by the aligned AO option length.
 *
 * Returns true when the reset must NOT be sent (key preparation or hashing
 * failed, or the kernel was built without CONFIG_TCP_AO), false when the
 * reply was signed.
 */
static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb,
				 const struct tcp_ao_hdr *aoh,
				 struct ip_reply_arg *arg, struct tcphdr *reply,
				 __be32 reply_options[REPLY_OPTIONS_LEN])
{
#ifdef CONFIG_TCP_AO
	int sdif = tcp_v4_sdif(skb);
	int dif = inet_iif(skb);
	int l3index = sdif ? dif : 0;
	bool allocated_traffic_key;
	struct tcp_ao_key *key;
	char *traffic_key;
	bool drop = true;
	u32 ao_sne = 0;
	u8 keyid;

	rcu_read_lock();
	/* NOTE(review): allocated_traffic_key is read at "out" even when
	 * this call fails, so tcp_ao_prepare_reset() is presumably expected
	 * to set it on every path - confirm.
	 */
	if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq),
				 &key, &traffic_key, &allocated_traffic_key,
				 &keyid, &ao_sne))
		goto out;

	/* Option word layout: kind, length, then the peer's rnext_keyid and
	 * the keyid returned by tcp_ao_prepare_reset().
	 */
	reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) |
				 (aoh->rnext_keyid << 8) | keyid);
	arg->iov[0].iov_len += tcp_ao_len_aligned(key);
	reply->doff = arg->iov[0].iov_len / 4;

	if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1],
			    key, traffic_key,
			    (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
			    (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
			    reply, ao_sne))
		goto out;
	drop = false;
out:
	rcu_read_unlock();
	/* Free the traffic key only if it was allocated for this reset. */
	if (allocated_traffic_key)
		kfree(traffic_key);
	return drop;
#else
	return true;
#endif
}

/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *		So that we build reply only basing on parameters
 *		arrived with segment.
 *	Exception: precedence violation. We do not implement it in any case.
*/

/* @sk may be NULL (no socket matched the segment), a full socket, or a
 * non-full socket such as a TIME-WAIT one; the code below checks
 * sk_fullsock() and sk_state before touching socket-specific fields.
 * @skb is the segment being answered and @reason is passed only to the
 * trace point.
 *
 * Nothing is sent when the incoming segment is itself a reset, when there
 * is no socket and the route is not local, when the authentication options
 * are malformed, or when a required AO/MD5 signature cannot be produced.
 */
static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
			      enum sk_rst_reason reason)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[REPLY_OPTIONS_LEN];
	} rep;
	const __u8 *md5_hash_location = NULL;
	const struct tcp_ao_hdr *aoh;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	unsigned char newhash[16];
	struct sock *sk1 = NULL;
	int genhash;
#endif
	u64 transmit_time = 0;
	struct sock *ctl_sk;
	struct net *net;
	u32 txhash = 0;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest = th->source;
	rep.th.source = th->dest;
	rep.th.doff = sizeof(struct tcphdr) / 4;
	rep.th.rst = 1;

	/* If the segment carried an ACK, the reset takes its sequence number
	 * from that ACK field.  Otherwise the reset acknowledges everything
	 * the segment occupied: payload length plus one each for SYN and FIN.
	 */
	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len = sizeof(rep.th);

	net = sk ? sock_net(sk) : dev_net_rcu(skb_dst(skb)->dev);

	/* Invalid TCP option size or twice included auth */
	if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh))
		return;

	/* tcp_v4_ao_sign_reset() returns true when the reset must be dropped. */
	if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt))
		return;

#ifdef CONFIG_TCP_MD5SIG
	/* Held until the "out" label at the end of the function. */
	rcu_read_lock();
	if (sk && sk_fullsock(sk)) {
		const union tcp_md5_addr *addr;
		int l3index;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and inet_iif is set to it.
		 */
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	} else if (md5_hash_location) {
		const union tcp_md5_addr *addr;
		int sdif = tcp_v4_sdif(skb);
		int dif = inet_iif(skb);
		int l3index;

		/*
		 * active side is lost. Try to find listening socket through
		 * source port, and then find md5 key through listening socket.
		 * we are not loose security here:
		 * Incoming packet is checked with md5 hash with finding key,
		 * no RST generated if md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
					     NULL, 0, ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), dif, sdif);
		/* don't send rst if it can't find key */
		if (!sk1)
			goto out;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and dif is set to it.
		 */
		l3index = sdif ? dif : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
		if (!key)
			goto out;


		/* Verify the incoming segment's MD5 signature before replying. */
		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (genhash || memcmp(md5_hash_location, newhash, 16) != 0)
			goto out;

	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				     key, ip_hdr(skb)->saddr,
				     ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
	if (rep.opt[0] == 0) {
		__be32 mrst = mptcp_reset_option(skb);

		if (mrst) {
			rep.opt[0] = mrst;
			arg.iov[0].iov_len += sizeof(mrst);
			rep.th.doff = arg.iov[0].iov_len / 4;
		}
	}

	/* Pseudo-header sum with the addresses swapped for the reply. */
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	/* Offset of the checksum field, counted in 16-bit units. */
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
	if (sk)
		arg.bound_dev_if = sk->sk_bound_dev_if;

	trace_tcp_send_reset(sk, skb, reason);

	/* sk may be a TIME-WAIT socket; the read of sk_bound_dev_if above
	 * relies on both structures keeping the field at the same offset.
	 */
	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

	arg.tos = ip_hdr(skb)->tos;
	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
	/* Borrow this CPU's control socket for the duration of the reply. */
	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);

	sock_net_set(ctl_sk, net);
	if (sk) {
		/* Mark, priority and txhash come from the TIME-WAIT fields
		 * when sk is a TIME-WAIT socket, from struct sock otherwise.
		 */
		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
		transmit_time = tcp_transmit_time(sk);
		xfrm_sk_clone_policy(ctl_sk, sk);
		txhash = (sk->sk_state == TCP_TIME_WAIT) ?
			 inet_twsk(sk)->tw_txhash : sk->sk_txhash;
	} else {
		ctl_sk->sk_mark = 0;
		ctl_sk->sk_priority = 0;
	}
	ip_send_unicast_reply(ctl_sk, sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time, txhash);

	/* Undo the per-reply setup of the shared control socket. */
	xfrm_sk_free_policy(ctl_sk);
	sock_net_set(ctl_sk, &init_net);
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
	local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
	rcu_read_unlock();
#endif
}

/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
*/

/* Build and send a bare ACK in reply to @skb, using this CPU's control
 * socket.
 *
 * @sk:          socket the ACK is sent on behalf of; sk_state and
 *               sk_fullsock() are checked before socket-specific fields
 *               are used, so it may be a TIME-WAIT socket
 * @seq, @ack:   sequence and acknowledgment numbers (host byte order)
 * @win:         window value placed in the header
 * @tsval, @tsecr: timestamp option values; the option is only emitted when
 *               @tsecr is non-zero
 * @oif:         output interface to bind the reply to, 0 for none
 * @key:         selects optional MD5 or AO signing of the reply
 * @reply_flags: copied to the reply arguments' flags
 * @tos, @txhash: passed through to the IP reply
 */
static void tcp_v4_send_ack(const struct sock *sk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_key *key,
			    int reply_flags, u8 tos, u32 txhash)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(MAX_TCP_OPTION_SPACE >> 2)];
	} rep;
	struct net *net = sock_net(sk);
	struct ip_reply_arg arg;
	struct sock *ctl_sk;
	u64 transmit_time;

	/* Only the header is cleared; option words are written as needed. */
	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len = sizeof(rep.th);
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest = th->source;
	rep.th.source = th->dest;
	rep.th.doff = arg.iov[0].iov_len / 4;
	rep.th.seq = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack = 1;
	rep.th.window = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (tcp_key_is_md5(key)) {
		/* The signature option follows the three timestamp words,
		 * when those are present.
		 */
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key->md5_key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
#ifdef CONFIG_TCP_AO
	if (tcp_key_is_ao(key)) {
		/* Same placement rule as for the MD5 option above. */
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_AO << 24) |
					  (tcp_ao_len(key->ao_key) << 16) |
					  (key->ao_key->sndid << 8) |
					  key->rcv_next);
		arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key);
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset],
				key->ao_key, key->traffic_key,
				(union tcp_ao_addr *)&ip_hdr(skb)->saddr,
				(union tcp_ao_addr *)&ip_hdr(skb)->daddr,
				&rep.th, key->sne);
	}
#endif
	arg.flags = reply_flags;
	/* Pseudo-header sum with the addresses swapped for the reply. */
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	/* Offset of the checksum field, counted in 16-bit units. */
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
	/* Borrow this CPU's control socket for the duration of the reply. */
	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
	sock_net_set(ctl_sk, net);
	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
	transmit_time = tcp_transmit_time(sk);
	ip_send_unicast_reply(ctl_sk, sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time, txhash);

	sock_net_set(ctl_sk, &init_net);
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
	local_bh_enable();
}

/* Send an ACK on behalf of the TIME-WAIT socket @sk in reply to @skb.
 *
 * Sequence numbers, window, timestamps, bound device, TOS and txhash all
 * come from the TIME-WAIT socket.  A timewait reference is released with
 * inet_twsk_put() on every path, including the early return taken when the
 * authentication options of @skb cannot be parsed.
 */
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
	struct tcp_key key = {};
#ifdef CONFIG_TCP_AO
	struct tcp_ao_info *ao_info;

	if (static_branch_unlikely(&tcp_ao_needed.key)) {
		/* FIXME: the segment to-be-acked is not verified yet */
		ao_info = rcu_dereference(tcptw->ao_info);
		if (ao_info) {
			const struct tcp_ao_hdr *aoh;

			if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) {
				inet_twsk_put(tw);
				return;
			}

			if (aoh)
				key.ao_key = tcp_ao_established_key(sk, ao_info,
								    aoh->rnext_keyid, -1);
		}
	}
	/* key.ao_key is only ever set in the branch above where ao_info was
	 * assigned and non-NULL, so ao_info is valid in this block.
	 */
	if (key.ao_key) {
		struct tcp_ao_key *rnext_key;

		key.traffic_key = snd_other_key(key.ao_key);
		key.sne = READ_ONCE(ao_info->snd_sne);
		rnext_key = READ_ONCE(ao_info->rnext_key);
		key.rcv_next = rnext_key->rcvid;
		key.type = TCP_KEY_AO;
#else
	/* Without TCP-AO, a dummy branch keeps the else-if chain below valid. */
	if (0) {
#endif
	} else if (static_branch_tcp_md5()) {
		key.md5_key = tcp_twsk_md5_key(tcptw);
		if (key.md5_key)
			key.type = TCP_KEY_MD5;
	}

	tcp_v4_send_ack(sk, skb,
			tcptw->tw_snd_nxt, READ_ONCE(tcptw->tw_rcv_nxt),
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_tw_tsval(tcptw),
			READ_ONCE(tcptw->tw_ts_recent),
			tw->tw_bound_dev_if, &key,
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos,
			tw->tw_txhash);

	inet_twsk_put(tw);
}

/* Send an ACK for the request socket @req in reply to @skb.
 *
 * @sk is the socket the request hangs off; its state selects the sequence
 * number used (see the comment below).  When the request used TCP-AO, no
 * ACK is sent if the segment's auth options are invalid, if it carries no
 * AO option, if no matching key can be found, or if the traffic key cannot
 * be allocated.
 */
static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	struct tcp_key key = {};

	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
					     tcp_sk(sk)->snd_nxt;

#ifdef CONFIG_TCP_AO
	if (static_branch_unlikely(&tcp_ao_needed.key) &&
	    tcp_rsk_used_ao(req)) {
		const union tcp_md5_addr *addr;
		const struct tcp_ao_hdr *aoh;
		int l3index;

		/* Invalid TCP option size or twice included auth */
		if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
			return;
		if (!aoh)
			return;

		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET,
					      aoh->rnext_keyid, -1);
		if (unlikely(!key.ao_key)) {
			/* Send ACK with any matching MKT for the peer */
			key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1);
			/* Matching key disappeared (user removed the key?)
			 * let the handshake timeout.
			 */
			if (!key.ao_key) {
				net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n",
						     addr,
						     ntohs(tcp_hdr(skb)->source),
						     &ip_hdr(skb)->daddr,
						     ntohs(tcp_hdr(skb)->dest));
				return;
			}
		}
		/* Temporary traffic key; freed after the ACK has been sent. */
		key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC);
		if (!key.traffic_key)
			return;

		key.type = TCP_KEY_AO;
		key.rcv_next = aoh->keyid;
		tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req);
#else
	/* Without TCP-AO, a dummy branch keeps the else-if chain below valid. */
	if (0) {
#endif
	} else if (static_branch_tcp_md5()) {
		const union tcp_md5_addr *addr;
		int l3index;

		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
		if (key.md5_key)
			key.type = TCP_KEY_MD5;
	}

	tcp_v4_send_ack(sk, skb, seq,
			tcp_rsk(req)->rcv_nxt,
			tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale,
			tcp_rsk_tsval(tcp_rsk(req)),
			req->ts_recent,
			0, &key,
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos,
			READ_ONCE(tcp_rsk(req)->txhash));
	/* Only the AO path allocated a traffic key above. */
	if (tcp_key_is_ao(&key))
		kfree(key.traffic_key);
}

/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
			      struct flowi *fl,
			      struct request_sock *req,
			      struct tcp_fastopen_cookie *foc,
			      enum tcp_synack_type synack_type,
			      struct sk_buff *syn_skb)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;
	u8 tos;

	/* First, grab a route.
*/ 1186 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) 1187 return -1; 1188 1189 skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb); 1190 1191 if (skb) { 1192 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); 1193 1194 tos = READ_ONCE(inet_sk(sk)->tos); 1195 1196 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)) 1197 tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) | 1198 (tos & INET_ECN_MASK); 1199 1200 if (!INET_ECN_is_capable(tos) && 1201 tcp_bpf_ca_needs_ecn((struct sock *)req)) 1202 tos |= INET_ECN_ECT_0; 1203 1204 rcu_read_lock(); 1205 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, 1206 ireq->ir_rmt_addr, 1207 rcu_dereference(ireq->ireq_opt), 1208 tos); 1209 rcu_read_unlock(); 1210 err = net_xmit_eval(err); 1211 } 1212 1213 return err; 1214 } 1215 1216 /* 1217 * IPv4 request_sock destructor. 1218 */ 1219 static void tcp_v4_reqsk_destructor(struct request_sock *req) 1220 { 1221 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1)); 1222 } 1223 1224 #ifdef CONFIG_TCP_MD5SIG 1225 /* 1226 * RFC2385 MD5 checksumming requires a mapping of 1227 * IP address->MD5 Key. 1228 * We need to maintain these in the sk structure. 1229 */ 1230 1231 DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ); 1232 EXPORT_IPV6_MOD(tcp_md5_needed); 1233 1234 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new) 1235 { 1236 if (!old) 1237 return true; 1238 1239 /* l3index always overrides non-l3index */ 1240 if (old->l3index && new->l3index == 0) 1241 return false; 1242 if (old->l3index == 0 && new->l3index) 1243 return true; 1244 1245 return old->prefixlen < new->prefixlen; 1246 } 1247 1248 /* Find the Key structure for an address. 
*/ 1249 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index, 1250 const union tcp_md5_addr *addr, 1251 int family, bool any_l3index) 1252 { 1253 const struct tcp_sock *tp = tcp_sk(sk); 1254 struct tcp_md5sig_key *key; 1255 const struct tcp_md5sig_info *md5sig; 1256 __be32 mask; 1257 struct tcp_md5sig_key *best_match = NULL; 1258 bool match; 1259 1260 /* caller either holds rcu_read_lock() or socket lock */ 1261 md5sig = rcu_dereference_check(tp->md5sig_info, 1262 lockdep_sock_is_held(sk)); 1263 if (!md5sig) 1264 return NULL; 1265 1266 hlist_for_each_entry_rcu(key, &md5sig->head, node, 1267 lockdep_sock_is_held(sk)) { 1268 if (key->family != family) 1269 continue; 1270 if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX && 1271 key->l3index != l3index) 1272 continue; 1273 if (family == AF_INET) { 1274 mask = inet_make_mask(key->prefixlen); 1275 match = (key->addr.a4.s_addr & mask) == 1276 (addr->a4.s_addr & mask); 1277 #if IS_ENABLED(CONFIG_IPV6) 1278 } else if (family == AF_INET6) { 1279 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6, 1280 key->prefixlen); 1281 #endif 1282 } else { 1283 match = false; 1284 } 1285 1286 if (match && better_md5_match(best_match, key)) 1287 best_match = key; 1288 } 1289 return best_match; 1290 } 1291 EXPORT_IPV6_MOD(__tcp_md5_do_lookup); 1292 1293 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk, 1294 const union tcp_md5_addr *addr, 1295 int family, u8 prefixlen, 1296 int l3index, u8 flags) 1297 { 1298 const struct tcp_sock *tp = tcp_sk(sk); 1299 struct tcp_md5sig_key *key; 1300 unsigned int size = sizeof(struct in_addr); 1301 const struct tcp_md5sig_info *md5sig; 1302 1303 /* caller either holds rcu_read_lock() or socket lock */ 1304 md5sig = rcu_dereference_check(tp->md5sig_info, 1305 lockdep_sock_is_held(sk)); 1306 if (!md5sig) 1307 return NULL; 1308 #if IS_ENABLED(CONFIG_IPV6) 1309 if (family == AF_INET6) 1310 size = sizeof(struct in6_addr); 1311 #endif 1312 
hlist_for_each_entry_rcu(key, &md5sig->head, node, 1313 lockdep_sock_is_held(sk)) { 1314 if (key->family != family) 1315 continue; 1316 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX)) 1317 continue; 1318 if (key->l3index != l3index) 1319 continue; 1320 if (!memcmp(&key->addr, addr, size) && 1321 key->prefixlen == prefixlen) 1322 return key; 1323 } 1324 return NULL; 1325 } 1326 1327 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, 1328 const struct sock *addr_sk) 1329 { 1330 const union tcp_md5_addr *addr; 1331 int l3index; 1332 1333 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), 1334 addr_sk->sk_bound_dev_if); 1335 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr; 1336 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1337 } 1338 EXPORT_IPV6_MOD(tcp_v4_md5_lookup); 1339 1340 static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp) 1341 { 1342 struct tcp_sock *tp = tcp_sk(sk); 1343 struct tcp_md5sig_info *md5sig; 1344 1345 md5sig = kmalloc(sizeof(*md5sig), gfp); 1346 if (!md5sig) 1347 return -ENOMEM; 1348 1349 sk_gso_disable(sk); 1350 INIT_HLIST_HEAD(&md5sig->head); 1351 rcu_assign_pointer(tp->md5sig_info, md5sig); 1352 return 0; 1353 } 1354 1355 /* This can be called on a newly created socket, from other files */ 1356 static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, 1357 int family, u8 prefixlen, int l3index, u8 flags, 1358 const u8 *newkey, u8 newkeylen, gfp_t gfp) 1359 { 1360 /* Add Key to the list */ 1361 struct tcp_md5sig_key *key; 1362 struct tcp_sock *tp = tcp_sk(sk); 1363 struct tcp_md5sig_info *md5sig; 1364 1365 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags); 1366 if (key) { 1367 /* Pre-existing entry - just update that one. 1368 * Note that the key might be used concurrently. 1369 * data_race() is telling kcsan that we do not care of 1370 * key mismatches, since changing MD5 key on live flows 1371 * can lead to packet drops. 
1372 */ 1373 data_race(memcpy(key->key, newkey, newkeylen)); 1374 1375 /* Pairs with READ_ONCE() in tcp_md5_hash_key(). 1376 * Also note that a reader could catch new key->keylen value 1377 * but old key->key[], this is the reason we use __GFP_ZERO 1378 * at sock_kmalloc() time below these lines. 1379 */ 1380 WRITE_ONCE(key->keylen, newkeylen); 1381 1382 return 0; 1383 } 1384 1385 md5sig = rcu_dereference_protected(tp->md5sig_info, 1386 lockdep_sock_is_held(sk)); 1387 1388 key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO); 1389 if (!key) 1390 return -ENOMEM; 1391 1392 memcpy(key->key, newkey, newkeylen); 1393 key->keylen = newkeylen; 1394 key->family = family; 1395 key->prefixlen = prefixlen; 1396 key->l3index = l3index; 1397 key->flags = flags; 1398 memcpy(&key->addr, addr, 1399 (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) : 1400 sizeof(struct in_addr)); 1401 hlist_add_head_rcu(&key->node, &md5sig->head); 1402 return 0; 1403 } 1404 1405 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, 1406 int family, u8 prefixlen, int l3index, u8 flags, 1407 const u8 *newkey, u8 newkeylen) 1408 { 1409 struct tcp_sock *tp = tcp_sk(sk); 1410 1411 if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) { 1412 if (tcp_md5_alloc_sigpool()) 1413 return -ENOMEM; 1414 1415 if (tcp_md5sig_info_add(sk, GFP_KERNEL)) { 1416 tcp_md5_release_sigpool(); 1417 return -ENOMEM; 1418 } 1419 1420 if (!static_branch_inc(&tcp_md5_needed.key)) { 1421 struct tcp_md5sig_info *md5sig; 1422 1423 md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk)); 1424 rcu_assign_pointer(tp->md5sig_info, NULL); 1425 kfree_rcu(md5sig, rcu); 1426 tcp_md5_release_sigpool(); 1427 return -EUSERS; 1428 } 1429 } 1430 1431 return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags, 1432 newkey, newkeylen, GFP_KERNEL); 1433 } 1434 EXPORT_IPV6_MOD(tcp_md5_do_add); 1435 1436 int tcp_md5_key_copy(struct sock *sk, const union 
tcp_md5_addr *addr, 1437 int family, u8 prefixlen, int l3index, 1438 struct tcp_md5sig_key *key) 1439 { 1440 struct tcp_sock *tp = tcp_sk(sk); 1441 1442 if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) { 1443 tcp_md5_add_sigpool(); 1444 1445 if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC))) { 1446 tcp_md5_release_sigpool(); 1447 return -ENOMEM; 1448 } 1449 1450 if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) { 1451 struct tcp_md5sig_info *md5sig; 1452 1453 md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk)); 1454 net_warn_ratelimited("Too many TCP-MD5 keys in the system\n"); 1455 rcu_assign_pointer(tp->md5sig_info, NULL); 1456 kfree_rcu(md5sig, rcu); 1457 tcp_md5_release_sigpool(); 1458 return -EUSERS; 1459 } 1460 } 1461 1462 return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, 1463 key->flags, key->key, key->keylen, 1464 sk_gfp_mask(sk, GFP_ATOMIC)); 1465 } 1466 EXPORT_IPV6_MOD(tcp_md5_key_copy); 1467 1468 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family, 1469 u8 prefixlen, int l3index, u8 flags) 1470 { 1471 struct tcp_md5sig_key *key; 1472 1473 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags); 1474 if (!key) 1475 return -ENOENT; 1476 hlist_del_rcu(&key->node); 1477 atomic_sub(sizeof(*key), &sk->sk_omem_alloc); 1478 kfree_rcu(key, rcu); 1479 return 0; 1480 } 1481 EXPORT_IPV6_MOD(tcp_md5_do_del); 1482 1483 void tcp_clear_md5_list(struct sock *sk) 1484 { 1485 struct tcp_sock *tp = tcp_sk(sk); 1486 struct tcp_md5sig_key *key; 1487 struct hlist_node *n; 1488 struct tcp_md5sig_info *md5sig; 1489 1490 md5sig = rcu_dereference_protected(tp->md5sig_info, 1); 1491 1492 hlist_for_each_entry_safe(key, n, &md5sig->head, node) { 1493 hlist_del_rcu(&key->node); 1494 atomic_sub(sizeof(*key), &sk->sk_omem_alloc); 1495 kfree_rcu(key, rcu); 1496 } 1497 } 1498 1499 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, 1500 sockptr_t 
optval, int optlen) 1501 { 1502 struct tcp_md5sig cmd; 1503 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr; 1504 const union tcp_md5_addr *addr; 1505 u8 prefixlen = 32; 1506 int l3index = 0; 1507 bool l3flag; 1508 u8 flags; 1509 1510 if (optlen < sizeof(cmd)) 1511 return -EINVAL; 1512 1513 if (copy_from_sockptr(&cmd, optval, sizeof(cmd))) 1514 return -EFAULT; 1515 1516 if (sin->sin_family != AF_INET) 1517 return -EINVAL; 1518 1519 flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX; 1520 l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX; 1521 1522 if (optname == TCP_MD5SIG_EXT && 1523 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) { 1524 prefixlen = cmd.tcpm_prefixlen; 1525 if (prefixlen > 32) 1526 return -EINVAL; 1527 } 1528 1529 if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex && 1530 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) { 1531 struct net_device *dev; 1532 1533 rcu_read_lock(); 1534 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex); 1535 if (dev && netif_is_l3_master(dev)) 1536 l3index = dev->ifindex; 1537 1538 rcu_read_unlock(); 1539 1540 /* ok to reference set/not set outside of rcu; 1541 * right now device MUST be an L3 master 1542 */ 1543 if (!dev || !l3index) 1544 return -EINVAL; 1545 } 1546 1547 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr; 1548 1549 if (!cmd.tcpm_keylen) 1550 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags); 1551 1552 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) 1553 return -EINVAL; 1554 1555 /* Don't allow keys for peers that have a matching TCP-AO key. 1556 * See the comment in tcp_ao_add_cmd() 1557 */ 1558 if (tcp_ao_required(sk, addr, AF_INET, l3flag ? 
l3index : -1, false)) 1559 return -EKEYREJECTED; 1560 1561 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags, 1562 cmd.tcpm_key, cmd.tcpm_keylen); 1563 } 1564 1565 static int tcp_v4_md5_hash_headers(struct tcp_sigpool *hp, 1566 __be32 daddr, __be32 saddr, 1567 const struct tcphdr *th, int nbytes) 1568 { 1569 struct tcp4_pseudohdr *bp; 1570 struct scatterlist sg; 1571 struct tcphdr *_th; 1572 1573 bp = hp->scratch; 1574 bp->saddr = saddr; 1575 bp->daddr = daddr; 1576 bp->pad = 0; 1577 bp->protocol = IPPROTO_TCP; 1578 bp->len = cpu_to_be16(nbytes); 1579 1580 _th = (struct tcphdr *)(bp + 1); 1581 memcpy(_th, th, sizeof(*th)); 1582 _th->check = 0; 1583 1584 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th)); 1585 ahash_request_set_crypt(hp->req, &sg, NULL, 1586 sizeof(*bp) + sizeof(*th)); 1587 return crypto_ahash_update(hp->req); 1588 } 1589 1590 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, 1591 __be32 daddr, __be32 saddr, const struct tcphdr *th) 1592 { 1593 struct tcp_sigpool hp; 1594 1595 if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp)) 1596 goto clear_hash_nostart; 1597 1598 if (crypto_ahash_init(hp.req)) 1599 goto clear_hash; 1600 if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, th->doff << 2)) 1601 goto clear_hash; 1602 if (tcp_md5_hash_key(&hp, key)) 1603 goto clear_hash; 1604 ahash_request_set_crypt(hp.req, NULL, md5_hash, 0); 1605 if (crypto_ahash_final(hp.req)) 1606 goto clear_hash; 1607 1608 tcp_sigpool_end(&hp); 1609 return 0; 1610 1611 clear_hash: 1612 tcp_sigpool_end(&hp); 1613 clear_hash_nostart: 1614 memset(md5_hash, 0, 16); 1615 return 1; 1616 } 1617 1618 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, 1619 const struct sock *sk, 1620 const struct sk_buff *skb) 1621 { 1622 const struct tcphdr *th = tcp_hdr(skb); 1623 struct tcp_sigpool hp; 1624 __be32 saddr, daddr; 1625 1626 if (sk) { /* valid for establish/request sockets */ 1627 saddr = sk->sk_rcv_saddr; 1628 daddr = 
sk->sk_daddr; 1629 } else { 1630 const struct iphdr *iph = ip_hdr(skb); 1631 saddr = iph->saddr; 1632 daddr = iph->daddr; 1633 } 1634 1635 if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp)) 1636 goto clear_hash_nostart; 1637 1638 if (crypto_ahash_init(hp.req)) 1639 goto clear_hash; 1640 1641 if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, skb->len)) 1642 goto clear_hash; 1643 if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2)) 1644 goto clear_hash; 1645 if (tcp_md5_hash_key(&hp, key)) 1646 goto clear_hash; 1647 ahash_request_set_crypt(hp.req, NULL, md5_hash, 0); 1648 if (crypto_ahash_final(hp.req)) 1649 goto clear_hash; 1650 1651 tcp_sigpool_end(&hp); 1652 return 0; 1653 1654 clear_hash: 1655 tcp_sigpool_end(&hp); 1656 clear_hash_nostart: 1657 memset(md5_hash, 0, 16); 1658 return 1; 1659 } 1660 EXPORT_IPV6_MOD(tcp_v4_md5_hash_skb); 1661 1662 #endif 1663 1664 static void tcp_v4_init_req(struct request_sock *req, 1665 const struct sock *sk_listener, 1666 struct sk_buff *skb) 1667 { 1668 struct inet_request_sock *ireq = inet_rsk(req); 1669 struct net *net = sock_net(sk_listener); 1670 1671 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); 1672 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); 1673 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb)); 1674 } 1675 1676 static struct dst_entry *tcp_v4_route_req(const struct sock *sk, 1677 struct sk_buff *skb, 1678 struct flowi *fl, 1679 struct request_sock *req, 1680 u32 tw_isn) 1681 { 1682 tcp_v4_init_req(req, sk, skb); 1683 1684 if (security_inet_conn_request(sk, skb, req)) 1685 return NULL; 1686 1687 return inet_csk_route_req(sk, &fl->u.ip4, req); 1688 } 1689 1690 struct request_sock_ops tcp_request_sock_ops __read_mostly = { 1691 .family = PF_INET, 1692 .obj_size = sizeof(struct tcp_request_sock), 1693 .rtx_syn_ack = tcp_rtx_synack, 1694 .send_ack = tcp_v4_reqsk_send_ack, 1695 .destructor = tcp_v4_reqsk_destructor, 1696 .send_reset = tcp_v4_send_reset, 1697 .syn_ack_timeout = tcp_syn_ack_timeout, 
1698 }; 1699 1700 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { 1701 .mss_clamp = TCP_MSS_DEFAULT, 1702 #ifdef CONFIG_TCP_MD5SIG 1703 .req_md5_lookup = tcp_v4_md5_lookup, 1704 .calc_md5_hash = tcp_v4_md5_hash_skb, 1705 #endif 1706 #ifdef CONFIG_TCP_AO 1707 .ao_lookup = tcp_v4_ao_lookup_rsk, 1708 .ao_calc_key = tcp_v4_ao_calc_key_rsk, 1709 .ao_synack_hash = tcp_v4_ao_synack_hash, 1710 #endif 1711 #ifdef CONFIG_SYN_COOKIES 1712 .cookie_init_seq = cookie_v4_init_sequence, 1713 #endif 1714 .route_req = tcp_v4_route_req, 1715 .init_seq = tcp_v4_init_seq, 1716 .init_ts_off = tcp_v4_init_ts_off, 1717 .send_synack = tcp_v4_send_synack, 1718 }; 1719 1720 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) 1721 { 1722 /* Never answer to SYNs send to broadcast or multicast */ 1723 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) 1724 goto drop; 1725 1726 return tcp_conn_request(&tcp_request_sock_ops, 1727 &tcp_request_sock_ipv4_ops, sk, skb); 1728 1729 drop: 1730 tcp_listendrop(sk); 1731 return 0; 1732 } 1733 EXPORT_IPV6_MOD(tcp_v4_conn_request); 1734 1735 1736 /* 1737 * The three way handshake has completed - we got a valid synack - 1738 * now create the new socket. 
1739 */ 1740 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, 1741 struct request_sock *req, 1742 struct dst_entry *dst, 1743 struct request_sock *req_unhash, 1744 bool *own_req) 1745 { 1746 struct inet_request_sock *ireq; 1747 bool found_dup_sk = false; 1748 struct inet_sock *newinet; 1749 struct tcp_sock *newtp; 1750 struct sock *newsk; 1751 #ifdef CONFIG_TCP_MD5SIG 1752 const union tcp_md5_addr *addr; 1753 struct tcp_md5sig_key *key; 1754 int l3index; 1755 #endif 1756 struct ip_options_rcu *inet_opt; 1757 1758 if (sk_acceptq_is_full(sk)) 1759 goto exit_overflow; 1760 1761 newsk = tcp_create_openreq_child(sk, req, skb); 1762 if (!newsk) 1763 goto exit_nonewsk; 1764 1765 newsk->sk_gso_type = SKB_GSO_TCPV4; 1766 inet_sk_rx_dst_set(newsk, skb); 1767 1768 newtp = tcp_sk(newsk); 1769 newinet = inet_sk(newsk); 1770 ireq = inet_rsk(req); 1771 inet_opt = rcu_dereference(ireq->ireq_opt); 1772 RCU_INIT_POINTER(newinet->inet_opt, inet_opt); 1773 newinet->mc_index = inet_iif(skb); 1774 newinet->mc_ttl = ip_hdr(skb)->ttl; 1775 newinet->rcv_tos = ip_hdr(skb)->tos; 1776 inet_csk(newsk)->icsk_ext_hdr_len = 0; 1777 if (inet_opt) 1778 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 1779 atomic_set(&newinet->inet_id, get_random_u16()); 1780 1781 /* Set ToS of the new socket based upon the value of incoming SYN. 1782 * ECT bits are set later in tcp_init_transfer(). 
1783 */ 1784 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)) 1785 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK; 1786 1787 if (!dst) { 1788 dst = inet_csk_route_child_sock(sk, newsk, req); 1789 if (!dst) 1790 goto put_and_exit; 1791 } else { 1792 /* syncookie case : see end of cookie_v4_check() */ 1793 } 1794 sk_setup_caps(newsk, dst); 1795 1796 tcp_ca_openreq_child(newsk, dst); 1797 1798 tcp_sync_mss(newsk, dst_mtu(dst)); 1799 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst)); 1800 1801 tcp_initialize_rcv_mss(newsk); 1802 1803 #ifdef CONFIG_TCP_MD5SIG 1804 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif); 1805 /* Copy over the MD5 key from the original socket */ 1806 addr = (union tcp_md5_addr *)&newinet->inet_daddr; 1807 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1808 if (key && !tcp_rsk_used_ao(req)) { 1809 if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key)) 1810 goto put_and_exit; 1811 sk_gso_disable(newsk); 1812 } 1813 #endif 1814 #ifdef CONFIG_TCP_AO 1815 if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET)) 1816 goto put_and_exit; /* OOM, release back memory */ 1817 #endif 1818 1819 if (__inet_inherit_port(sk, newsk) < 0) 1820 goto put_and_exit; 1821 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), 1822 &found_dup_sk); 1823 if (likely(*own_req)) { 1824 tcp_move_syn(newtp, req); 1825 ireq->ireq_opt = NULL; 1826 } else { 1827 newinet->inet_opt = NULL; 1828 1829 if (!req_unhash && found_dup_sk) { 1830 /* This code path should only be executed in the 1831 * syncookie case only 1832 */ 1833 bh_unlock_sock(newsk); 1834 sock_put(newsk); 1835 newsk = NULL; 1836 } 1837 } 1838 return newsk; 1839 1840 exit_overflow: 1841 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); 1842 exit_nonewsk: 1843 dst_release(dst); 1844 exit: 1845 tcp_listendrop(sk); 1846 return NULL; 1847 put_and_exit: 1848 newinet->inet_opt = NULL; 1849 inet_csk_prepare_forced_close(newsk); 1850 
tcp_done(newsk); 1851 goto exit; 1852 } 1853 EXPORT_IPV6_MOD(tcp_v4_syn_recv_sock); 1854 1855 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb) 1856 { 1857 #ifdef CONFIG_SYN_COOKIES 1858 const struct tcphdr *th = tcp_hdr(skb); 1859 1860 if (!th->syn) 1861 sk = cookie_v4_check(sk, skb); 1862 #endif 1863 return sk; 1864 } 1865 1866 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph, 1867 struct tcphdr *th, u32 *cookie) 1868 { 1869 u16 mss = 0; 1870 #ifdef CONFIG_SYN_COOKIES 1871 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops, 1872 &tcp_request_sock_ipv4_ops, sk, th); 1873 if (mss) { 1874 *cookie = __cookie_v4_init_sequence(iph, th, &mss); 1875 tcp_synq_overflow(sk); 1876 } 1877 #endif 1878 return mss; 1879 } 1880 1881 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, 1882 u32)); 1883 /* The socket must have it's spinlock held when we get 1884 * here, unless it is a TCP_LISTEN socket. 1885 * 1886 * We have a potential double-lock case here, so even when 1887 * doing backlog processing we use the BH locking scheme. 1888 * This is because we cannot sleep with the original spinlock 1889 * held. 
1890 */ 1891 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) 1892 { 1893 enum skb_drop_reason reason; 1894 struct sock *rsk; 1895 1896 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ 1897 struct dst_entry *dst; 1898 1899 dst = rcu_dereference_protected(sk->sk_rx_dst, 1900 lockdep_sock_is_held(sk)); 1901 1902 sock_rps_save_rxhash(sk, skb); 1903 sk_mark_napi_id(sk, skb); 1904 if (dst) { 1905 if (sk->sk_rx_dst_ifindex != skb->skb_iif || 1906 !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check, 1907 dst, 0)) { 1908 RCU_INIT_POINTER(sk->sk_rx_dst, NULL); 1909 dst_release(dst); 1910 } 1911 } 1912 tcp_rcv_established(sk, skb); 1913 return 0; 1914 } 1915 1916 if (tcp_checksum_complete(skb)) 1917 goto csum_err; 1918 1919 if (sk->sk_state == TCP_LISTEN) { 1920 struct sock *nsk = tcp_v4_cookie_check(sk, skb); 1921 1922 if (!nsk) 1923 return 0; 1924 if (nsk != sk) { 1925 reason = tcp_child_process(sk, nsk, skb); 1926 if (reason) { 1927 rsk = nsk; 1928 goto reset; 1929 } 1930 return 0; 1931 } 1932 } else 1933 sock_rps_save_rxhash(sk, skb); 1934 1935 reason = tcp_rcv_state_process(sk, skb); 1936 if (reason) { 1937 rsk = sk; 1938 goto reset; 1939 } 1940 return 0; 1941 1942 reset: 1943 tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason)); 1944 discard: 1945 sk_skb_reason_drop(sk, skb, reason); 1946 /* Be careful here. If this function gets more complicated and 1947 * gcc suffers from register pressure on the x86, sk (in %ebx) 1948 * might be destroyed here. This current version compiles correctly, 1949 * but you have been warned. 
1950 */ 1951 return 0; 1952 1953 csum_err: 1954 reason = SKB_DROP_REASON_TCP_CSUM; 1955 trace_tcp_bad_csum(skb); 1956 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 1957 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 1958 goto discard; 1959 } 1960 EXPORT_SYMBOL(tcp_v4_do_rcv); 1961 1962 int tcp_v4_early_demux(struct sk_buff *skb) 1963 { 1964 struct net *net = dev_net_rcu(skb->dev); 1965 const struct iphdr *iph; 1966 const struct tcphdr *th; 1967 struct sock *sk; 1968 1969 if (skb->pkt_type != PACKET_HOST) 1970 return 0; 1971 1972 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) 1973 return 0; 1974 1975 iph = ip_hdr(skb); 1976 th = tcp_hdr(skb); 1977 1978 if (th->doff < sizeof(struct tcphdr) / 4) 1979 return 0; 1980 1981 sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, 1982 iph->saddr, th->source, 1983 iph->daddr, ntohs(th->dest), 1984 skb->skb_iif, inet_sdif(skb)); 1985 if (sk) { 1986 skb->sk = sk; 1987 skb->destructor = sock_edemux; 1988 if (sk_fullsock(sk)) { 1989 struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst); 1990 1991 if (dst) 1992 dst = dst_check(dst, 0); 1993 if (dst && 1994 sk->sk_rx_dst_ifindex == skb->skb_iif) 1995 skb_dst_set_noref(skb, dst); 1996 } 1997 } 1998 return 0; 1999 } 2000 2001 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, 2002 enum skb_drop_reason *reason) 2003 { 2004 u32 tail_gso_size, tail_gso_segs; 2005 struct skb_shared_info *shinfo; 2006 const struct tcphdr *th; 2007 struct tcphdr *thtail; 2008 struct sk_buff *tail; 2009 unsigned int hdrlen; 2010 bool fragstolen; 2011 u32 gso_segs; 2012 u32 gso_size; 2013 u64 limit; 2014 int delta; 2015 2016 /* In case all data was pulled from skb frags (in __pskb_pull_tail()), 2017 * we can fix skb->truesize to its real value to avoid future drops. 2018 * This is valid because skb is not yet charged to the socket. 2019 * It has been noticed pure SACK packets were sometimes dropped 2020 * (if cooked by drivers without copybreak feature). 
2021 */ 2022 skb_condense(skb); 2023 2024 tcp_cleanup_skb(skb); 2025 2026 if (unlikely(tcp_checksum_complete(skb))) { 2027 bh_unlock_sock(sk); 2028 trace_tcp_bad_csum(skb); 2029 *reason = SKB_DROP_REASON_TCP_CSUM; 2030 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 2031 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 2032 return true; 2033 } 2034 2035 /* Attempt coalescing to last skb in backlog, even if we are 2036 * above the limits. 2037 * This is okay because skb capacity is limited to MAX_SKB_FRAGS. 2038 */ 2039 th = (const struct tcphdr *)skb->data; 2040 hdrlen = th->doff * 4; 2041 2042 tail = sk->sk_backlog.tail; 2043 if (!tail) 2044 goto no_coalesce; 2045 thtail = (struct tcphdr *)tail->data; 2046 2047 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq || 2048 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield || 2049 ((TCP_SKB_CB(tail)->tcp_flags | 2050 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) || 2051 !((TCP_SKB_CB(tail)->tcp_flags & 2052 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) || 2053 ((TCP_SKB_CB(tail)->tcp_flags ^ 2054 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) || 2055 !tcp_skb_can_collapse_rx(tail, skb) || 2056 thtail->doff != th->doff || 2057 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th))) 2058 goto no_coalesce; 2059 2060 __skb_pull(skb, hdrlen); 2061 2062 shinfo = skb_shinfo(skb); 2063 gso_size = shinfo->gso_size ?: skb->len; 2064 gso_segs = shinfo->gso_segs ?: 1; 2065 2066 shinfo = skb_shinfo(tail); 2067 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen); 2068 tail_gso_segs = shinfo->gso_segs ?: 1; 2069 2070 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) { 2071 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq; 2072 2073 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) { 2074 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq; 2075 thtail->window = th->window; 2076 } 2077 2078 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and 2079 * 
thtail->fin, so that the fast path in tcp_rcv_established() 2080 * is not entered if we append a packet with a FIN. 2081 * SYN, RST, URG are not present. 2082 * ACK is set on both packets. 2083 * PSH : we do not really care in TCP stack, 2084 * at least for 'GRO' packets. 2085 */ 2086 thtail->fin |= th->fin; 2087 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; 2088 2089 if (TCP_SKB_CB(skb)->has_rxtstamp) { 2090 TCP_SKB_CB(tail)->has_rxtstamp = true; 2091 tail->tstamp = skb->tstamp; 2092 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp; 2093 } 2094 2095 /* Not as strict as GRO. We only need to carry mss max value */ 2096 shinfo->gso_size = max(gso_size, tail_gso_size); 2097 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF); 2098 2099 sk->sk_backlog.len += delta; 2100 __NET_INC_STATS(sock_net(sk), 2101 LINUX_MIB_TCPBACKLOGCOALESCE); 2102 kfree_skb_partial(skb, fragstolen); 2103 return false; 2104 } 2105 __skb_push(skb, hdrlen); 2106 2107 no_coalesce: 2108 /* sk->sk_backlog.len is reset only at the end of __release_sock(). 2109 * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach 2110 * sk_rcvbuf in normal conditions. 2111 */ 2112 limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1; 2113 2114 limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1; 2115 2116 /* Only socket owner can try to collapse/prune rx queues 2117 * to reduce memory overhead, so add a little headroom here. 2118 * Few sockets backlog are possibly concurrently non empty. 
2119 */ 2120 limit += 64 * 1024; 2121 2122 limit = min_t(u64, limit, UINT_MAX); 2123 2124 if (unlikely(sk_add_backlog(sk, skb, limit))) { 2125 bh_unlock_sock(sk); 2126 *reason = SKB_DROP_REASON_SOCKET_BACKLOG; 2127 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP); 2128 return true; 2129 } 2130 return false; 2131 } 2132 EXPORT_IPV6_MOD(tcp_add_backlog); 2133 2134 int tcp_filter(struct sock *sk, struct sk_buff *skb) 2135 { 2136 struct tcphdr *th = (struct tcphdr *)skb->data; 2137 2138 return sk_filter_trim_cap(sk, skb, th->doff * 4); 2139 } 2140 EXPORT_IPV6_MOD(tcp_filter); 2141 2142 static void tcp_v4_restore_cb(struct sk_buff *skb) 2143 { 2144 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4, 2145 sizeof(struct inet_skb_parm)); 2146 } 2147 2148 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, 2149 const struct tcphdr *th) 2150 { 2151 /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB() 2152 * barrier() makes sure compiler wont play fool^Waliasing games. 
2153 */ 2154 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb), 2155 sizeof(struct inet_skb_parm)); 2156 barrier(); 2157 2158 TCP_SKB_CB(skb)->seq = ntohl(th->seq); 2159 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + 2160 skb->len - th->doff * 4); 2161 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); 2162 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); 2163 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); 2164 TCP_SKB_CB(skb)->sacked = 0; 2165 TCP_SKB_CB(skb)->has_rxtstamp = 2166 skb->tstamp || skb_hwtstamps(skb)->hwtstamp; 2167 } 2168 2169 /* 2170 * From tcp_input.c 2171 */ 2172 2173 int tcp_v4_rcv(struct sk_buff *skb) 2174 { 2175 struct net *net = dev_net_rcu(skb->dev); 2176 enum skb_drop_reason drop_reason; 2177 int sdif = inet_sdif(skb); 2178 int dif = inet_iif(skb); 2179 const struct iphdr *iph; 2180 const struct tcphdr *th; 2181 struct sock *sk = NULL; 2182 bool refcounted; 2183 int ret; 2184 u32 isn; 2185 2186 drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; 2187 if (skb->pkt_type != PACKET_HOST) 2188 goto discard_it; 2189 2190 /* Count it even if it's bad */ 2191 __TCP_INC_STATS(net, TCP_MIB_INSEGS); 2192 2193 if (!pskb_may_pull(skb, sizeof(struct tcphdr))) 2194 goto discard_it; 2195 2196 th = (const struct tcphdr *)skb->data; 2197 2198 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) { 2199 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL; 2200 goto bad_packet; 2201 } 2202 if (!pskb_may_pull(skb, th->doff * 4)) 2203 goto discard_it; 2204 2205 /* An explanation is required here, I think. 2206 * Packet length and doff are validated by header prediction, 2207 * provided case of th->doff==0 is eliminated. 2208 * So, we defer the checks. 
*/ 2209 2210 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo)) 2211 goto csum_error; 2212 2213 th = (const struct tcphdr *)skb->data; 2214 iph = ip_hdr(skb); 2215 lookup: 2216 sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo, 2217 skb, __tcp_hdrlen(th), th->source, 2218 th->dest, sdif, &refcounted); 2219 if (!sk) 2220 goto no_tcp_socket; 2221 2222 if (sk->sk_state == TCP_TIME_WAIT) 2223 goto do_time_wait; 2224 2225 if (sk->sk_state == TCP_NEW_SYN_RECV) { 2226 struct request_sock *req = inet_reqsk(sk); 2227 bool req_stolen = false; 2228 struct sock *nsk; 2229 2230 sk = req->rsk_listener; 2231 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) 2232 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2233 else 2234 drop_reason = tcp_inbound_hash(sk, req, skb, 2235 &iph->saddr, &iph->daddr, 2236 AF_INET, dif, sdif); 2237 if (unlikely(drop_reason)) { 2238 sk_drops_add(sk, skb); 2239 reqsk_put(req); 2240 goto discard_it; 2241 } 2242 if (tcp_checksum_complete(skb)) { 2243 reqsk_put(req); 2244 goto csum_error; 2245 } 2246 if (unlikely(sk->sk_state != TCP_LISTEN)) { 2247 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb); 2248 if (!nsk) { 2249 inet_csk_reqsk_queue_drop_and_put(sk, req); 2250 goto lookup; 2251 } 2252 sk = nsk; 2253 /* reuseport_migrate_sock() has already held one sk_refcnt 2254 * before returning. 2255 */ 2256 } else { 2257 /* We own a reference on the listener, increase it again 2258 * as we might lose it too soon. 2259 */ 2260 sock_hold(sk); 2261 } 2262 refcounted = true; 2263 nsk = NULL; 2264 if (!tcp_filter(sk, skb)) { 2265 th = (const struct tcphdr *)skb->data; 2266 iph = ip_hdr(skb); 2267 tcp_v4_fill_cb(skb, iph, th); 2268 nsk = tcp_check_req(sk, skb, req, false, &req_stolen, 2269 &drop_reason); 2270 } else { 2271 drop_reason = SKB_DROP_REASON_SOCKET_FILTER; 2272 } 2273 if (!nsk) { 2274 reqsk_put(req); 2275 if (req_stolen) { 2276 /* Another cpu got exclusive access to req 2277 * and created a full blown socket. 
2278 * Try to feed this packet to this socket 2279 * instead of discarding it. 2280 */ 2281 tcp_v4_restore_cb(skb); 2282 sock_put(sk); 2283 goto lookup; 2284 } 2285 goto discard_and_relse; 2286 } 2287 nf_reset_ct(skb); 2288 if (nsk == sk) { 2289 reqsk_put(req); 2290 tcp_v4_restore_cb(skb); 2291 } else { 2292 drop_reason = tcp_child_process(sk, nsk, skb); 2293 if (drop_reason) { 2294 enum sk_rst_reason rst_reason; 2295 2296 rst_reason = sk_rst_convert_drop_reason(drop_reason); 2297 tcp_v4_send_reset(nsk, skb, rst_reason); 2298 goto discard_and_relse; 2299 } 2300 sock_put(sk); 2301 return 0; 2302 } 2303 } 2304 2305 process: 2306 if (static_branch_unlikely(&ip4_min_ttl)) { 2307 /* min_ttl can be changed concurrently from do_ip_setsockopt() */ 2308 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) { 2309 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 2310 drop_reason = SKB_DROP_REASON_TCP_MINTTL; 2311 goto discard_and_relse; 2312 } 2313 } 2314 2315 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { 2316 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2317 goto discard_and_relse; 2318 } 2319 2320 drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr, 2321 AF_INET, dif, sdif); 2322 if (drop_reason) 2323 goto discard_and_relse; 2324 2325 nf_reset_ct(skb); 2326 2327 if (tcp_filter(sk, skb)) { 2328 drop_reason = SKB_DROP_REASON_SOCKET_FILTER; 2329 goto discard_and_relse; 2330 } 2331 th = (const struct tcphdr *)skb->data; 2332 iph = ip_hdr(skb); 2333 tcp_v4_fill_cb(skb, iph, th); 2334 2335 skb->dev = NULL; 2336 2337 if (sk->sk_state == TCP_LISTEN) { 2338 ret = tcp_v4_do_rcv(sk, skb); 2339 goto put_and_return; 2340 } 2341 2342 sk_incoming_cpu_update(sk); 2343 2344 bh_lock_sock_nested(sk); 2345 tcp_segs_in(tcp_sk(sk), skb); 2346 ret = 0; 2347 if (!sock_owned_by_user(sk)) { 2348 ret = tcp_v4_do_rcv(sk, skb); 2349 } else { 2350 if (tcp_add_backlog(sk, skb, &drop_reason)) 2351 goto discard_and_relse; 2352 } 2353 bh_unlock_sock(sk); 2354 2355 put_and_return: 
2356 if (refcounted) 2357 sock_put(sk); 2358 2359 return ret; 2360 2361 no_tcp_socket: 2362 drop_reason = SKB_DROP_REASON_NO_SOCKET; 2363 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) 2364 goto discard_it; 2365 2366 tcp_v4_fill_cb(skb, iph, th); 2367 2368 if (tcp_checksum_complete(skb)) { 2369 csum_error: 2370 drop_reason = SKB_DROP_REASON_TCP_CSUM; 2371 trace_tcp_bad_csum(skb); 2372 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); 2373 bad_packet: 2374 __TCP_INC_STATS(net, TCP_MIB_INERRS); 2375 } else { 2376 tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason)); 2377 } 2378 2379 discard_it: 2380 SKB_DR_OR(drop_reason, NOT_SPECIFIED); 2381 /* Discard frame. */ 2382 sk_skb_reason_drop(sk, skb, drop_reason); 2383 return 0; 2384 2385 discard_and_relse: 2386 sk_drops_add(sk, skb); 2387 if (refcounted) 2388 sock_put(sk); 2389 goto discard_it; 2390 2391 do_time_wait: 2392 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { 2393 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2394 inet_twsk_put(inet_twsk(sk)); 2395 goto discard_it; 2396 } 2397 2398 tcp_v4_fill_cb(skb, iph, th); 2399 2400 if (tcp_checksum_complete(skb)) { 2401 inet_twsk_put(inet_twsk(sk)); 2402 goto csum_error; 2403 } 2404 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn)) { 2405 case TCP_TW_SYN: { 2406 struct sock *sk2 = inet_lookup_listener(net, 2407 net->ipv4.tcp_death_row.hashinfo, 2408 skb, __tcp_hdrlen(th), 2409 iph->saddr, th->source, 2410 iph->daddr, th->dest, 2411 inet_iif(skb), 2412 sdif); 2413 if (sk2) { 2414 inet_twsk_deschedule_put(inet_twsk(sk)); 2415 sk = sk2; 2416 tcp_v4_restore_cb(skb); 2417 refcounted = false; 2418 __this_cpu_write(tcp_tw_isn, isn); 2419 goto process; 2420 } 2421 } 2422 /* to ACK */ 2423 fallthrough; 2424 case TCP_TW_ACK: 2425 tcp_v4_timewait_ack(sk, skb); 2426 break; 2427 case TCP_TW_RST: 2428 tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET); 2429 inet_twsk_deschedule_put(inet_twsk(sk)); 2430 goto discard_it; 2431 case 
TCP_TW_SUCCESS:; 2432 } 2433 goto discard_it; 2434 } 2435 2436 static struct timewait_sock_ops tcp_timewait_sock_ops = { 2437 .twsk_obj_size = sizeof(struct tcp_timewait_sock), 2438 .twsk_destructor= tcp_twsk_destructor, 2439 }; 2440 2441 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) 2442 { 2443 struct dst_entry *dst = skb_dst(skb); 2444 2445 if (dst && dst_hold_safe(dst)) { 2446 rcu_assign_pointer(sk->sk_rx_dst, dst); 2447 sk->sk_rx_dst_ifindex = skb->skb_iif; 2448 } 2449 } 2450 EXPORT_IPV6_MOD(inet_sk_rx_dst_set); 2451 2452 const struct inet_connection_sock_af_ops ipv4_specific = { 2453 .queue_xmit = ip_queue_xmit, 2454 .send_check = tcp_v4_send_check, 2455 .rebuild_header = inet_sk_rebuild_header, 2456 .sk_rx_dst_set = inet_sk_rx_dst_set, 2457 .conn_request = tcp_v4_conn_request, 2458 .syn_recv_sock = tcp_v4_syn_recv_sock, 2459 .net_header_len = sizeof(struct iphdr), 2460 .setsockopt = ip_setsockopt, 2461 .getsockopt = ip_getsockopt, 2462 .addr2sockaddr = inet_csk_addr2sockaddr, 2463 .sockaddr_len = sizeof(struct sockaddr_in), 2464 .mtu_reduced = tcp_v4_mtu_reduced, 2465 }; 2466 EXPORT_IPV6_MOD(ipv4_specific); 2467 2468 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) 2469 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { 2470 #ifdef CONFIG_TCP_MD5SIG 2471 .md5_lookup = tcp_v4_md5_lookup, 2472 .calc_md5_hash = tcp_v4_md5_hash_skb, 2473 .md5_parse = tcp_v4_parse_md5_keys, 2474 #endif 2475 #ifdef CONFIG_TCP_AO 2476 .ao_lookup = tcp_v4_ao_lookup, 2477 .calc_ao_hash = tcp_v4_ao_hash_skb, 2478 .ao_parse = tcp_v4_parse_ao, 2479 .ao_calc_key_sk = tcp_v4_ao_calc_key_sk, 2480 #endif 2481 }; 2482 #endif 2483 2484 /* NOTE: A lot of things set to zero explicitly by call to 2485 * sk_alloc() so need not be done here. 
2486 */ 2487 static int tcp_v4_init_sock(struct sock *sk) 2488 { 2489 struct inet_connection_sock *icsk = inet_csk(sk); 2490 2491 tcp_init_sock(sk); 2492 2493 icsk->icsk_af_ops = &ipv4_specific; 2494 2495 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) 2496 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific; 2497 #endif 2498 2499 return 0; 2500 } 2501 2502 #ifdef CONFIG_TCP_MD5SIG 2503 static void tcp_md5sig_info_free_rcu(struct rcu_head *head) 2504 { 2505 struct tcp_md5sig_info *md5sig; 2506 2507 md5sig = container_of(head, struct tcp_md5sig_info, rcu); 2508 kfree(md5sig); 2509 static_branch_slow_dec_deferred(&tcp_md5_needed); 2510 tcp_md5_release_sigpool(); 2511 } 2512 #endif 2513 2514 static void tcp_release_user_frags(struct sock *sk) 2515 { 2516 #ifdef CONFIG_PAGE_POOL 2517 unsigned long index; 2518 void *netmem; 2519 2520 xa_for_each(&sk->sk_user_frags, index, netmem) 2521 WARN_ON_ONCE(!napi_pp_put_page((__force netmem_ref)netmem)); 2522 #endif 2523 } 2524 2525 void tcp_v4_destroy_sock(struct sock *sk) 2526 { 2527 struct tcp_sock *tp = tcp_sk(sk); 2528 2529 tcp_release_user_frags(sk); 2530 2531 xa_destroy(&sk->sk_user_frags); 2532 2533 trace_tcp_destroy_sock(sk); 2534 2535 tcp_clear_xmit_timers(sk); 2536 2537 tcp_cleanup_congestion_control(sk); 2538 2539 tcp_cleanup_ulp(sk); 2540 2541 /* Cleanup up the write buffer. */ 2542 tcp_write_queue_purge(sk); 2543 2544 /* Check if we want to disable active TFO */ 2545 tcp_fastopen_active_disable_ofo_check(sk); 2546 2547 /* Cleans up our, hopefully empty, out_of_order_queue. 
*/ 2548 skb_rbtree_purge(&tp->out_of_order_queue); 2549 2550 #ifdef CONFIG_TCP_MD5SIG 2551 /* Clean up the MD5 key list, if any */ 2552 if (tp->md5sig_info) { 2553 struct tcp_md5sig_info *md5sig; 2554 2555 md5sig = rcu_dereference_protected(tp->md5sig_info, 1); 2556 tcp_clear_md5_list(sk); 2557 call_rcu(&md5sig->rcu, tcp_md5sig_info_free_rcu); 2558 rcu_assign_pointer(tp->md5sig_info, NULL); 2559 } 2560 #endif 2561 tcp_ao_destroy_sock(sk, false); 2562 2563 /* Clean up a referenced TCP bind bucket. */ 2564 if (inet_csk(sk)->icsk_bind_hash) 2565 inet_put_port(sk); 2566 2567 BUG_ON(rcu_access_pointer(tp->fastopen_rsk)); 2568 2569 /* If socket is aborted during connect operation */ 2570 tcp_free_fastopen_req(tp); 2571 tcp_fastopen_destroy_cipher(sk); 2572 tcp_saved_syn_free(tp); 2573 2574 sk_sockets_allocated_dec(sk); 2575 } 2576 EXPORT_IPV6_MOD(tcp_v4_destroy_sock); 2577 2578 #ifdef CONFIG_PROC_FS 2579 /* Proc filesystem TCP sock list dumping. */ 2580 2581 static unsigned short seq_file_family(const struct seq_file *seq); 2582 2583 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk) 2584 { 2585 unsigned short family = seq_file_family(seq); 2586 2587 /* AF_UNSPEC is used as a match all */ 2588 return ((family == AF_UNSPEC || family == sk->sk_family) && 2589 net_eq(sock_net(sk), seq_file_net(seq))); 2590 } 2591 2592 /* Find a non empty bucket (starting from st->bucket) 2593 * and return the first sk from it. 
2594 */ 2595 static void *listening_get_first(struct seq_file *seq) 2596 { 2597 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2598 struct tcp_iter_state *st = seq->private; 2599 2600 st->offset = 0; 2601 for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) { 2602 struct inet_listen_hashbucket *ilb2; 2603 struct hlist_nulls_node *node; 2604 struct sock *sk; 2605 2606 ilb2 = &hinfo->lhash2[st->bucket]; 2607 if (hlist_nulls_empty(&ilb2->nulls_head)) 2608 continue; 2609 2610 spin_lock(&ilb2->lock); 2611 sk_nulls_for_each(sk, node, &ilb2->nulls_head) { 2612 if (seq_sk_match(seq, sk)) 2613 return sk; 2614 } 2615 spin_unlock(&ilb2->lock); 2616 } 2617 2618 return NULL; 2619 } 2620 2621 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket). 2622 * If "cur" is the last one in the st->bucket, 2623 * call listening_get_first() to return the first sk of the next 2624 * non empty bucket. 2625 */ 2626 static void *listening_get_next(struct seq_file *seq, void *cur) 2627 { 2628 struct tcp_iter_state *st = seq->private; 2629 struct inet_listen_hashbucket *ilb2; 2630 struct hlist_nulls_node *node; 2631 struct inet_hashinfo *hinfo; 2632 struct sock *sk = cur; 2633 2634 ++st->num; 2635 ++st->offset; 2636 2637 sk = sk_nulls_next(sk); 2638 sk_nulls_for_each_from(sk, node) { 2639 if (seq_sk_match(seq, sk)) 2640 return sk; 2641 } 2642 2643 hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2644 ilb2 = &hinfo->lhash2[st->bucket]; 2645 spin_unlock(&ilb2->lock); 2646 ++st->bucket; 2647 return listening_get_first(seq); 2648 } 2649 2650 static void *listening_get_idx(struct seq_file *seq, loff_t *pos) 2651 { 2652 struct tcp_iter_state *st = seq->private; 2653 void *rc; 2654 2655 st->bucket = 0; 2656 st->offset = 0; 2657 rc = listening_get_first(seq); 2658 2659 while (rc && *pos) { 2660 rc = listening_get_next(seq, rc); 2661 --*pos; 2662 } 2663 return rc; 2664 } 2665 2666 static inline bool empty_bucket(struct inet_hashinfo *hinfo, 2667 
const struct tcp_iter_state *st) 2668 { 2669 return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain); 2670 } 2671 2672 /* 2673 * Get first established socket starting from bucket given in st->bucket. 2674 * If st->bucket is zero, the very first socket in the hash is returned. 2675 */ 2676 static void *established_get_first(struct seq_file *seq) 2677 { 2678 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2679 struct tcp_iter_state *st = seq->private; 2680 2681 st->offset = 0; 2682 for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) { 2683 struct sock *sk; 2684 struct hlist_nulls_node *node; 2685 spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket); 2686 2687 cond_resched(); 2688 2689 /* Lockless fast path for the common case of empty buckets */ 2690 if (empty_bucket(hinfo, st)) 2691 continue; 2692 2693 spin_lock_bh(lock); 2694 sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) { 2695 if (seq_sk_match(seq, sk)) 2696 return sk; 2697 } 2698 spin_unlock_bh(lock); 2699 } 2700 2701 return NULL; 2702 } 2703 2704 static void *established_get_next(struct seq_file *seq, void *cur) 2705 { 2706 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2707 struct tcp_iter_state *st = seq->private; 2708 struct hlist_nulls_node *node; 2709 struct sock *sk = cur; 2710 2711 ++st->num; 2712 ++st->offset; 2713 2714 sk = sk_nulls_next(sk); 2715 2716 sk_nulls_for_each_from(sk, node) { 2717 if (seq_sk_match(seq, sk)) 2718 return sk; 2719 } 2720 2721 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 2722 ++st->bucket; 2723 return established_get_first(seq); 2724 } 2725 2726 static void *established_get_idx(struct seq_file *seq, loff_t pos) 2727 { 2728 struct tcp_iter_state *st = seq->private; 2729 void *rc; 2730 2731 st->bucket = 0; 2732 rc = established_get_first(seq); 2733 2734 while (rc && pos) { 2735 rc = established_get_next(seq, rc); 2736 --pos; 2737 } 2738 return rc; 2739 } 2740 2741 static void 
*tcp_get_idx(struct seq_file *seq, loff_t pos) 2742 { 2743 void *rc; 2744 struct tcp_iter_state *st = seq->private; 2745 2746 st->state = TCP_SEQ_STATE_LISTENING; 2747 rc = listening_get_idx(seq, &pos); 2748 2749 if (!rc) { 2750 st->state = TCP_SEQ_STATE_ESTABLISHED; 2751 rc = established_get_idx(seq, pos); 2752 } 2753 2754 return rc; 2755 } 2756 2757 static void *tcp_seek_last_pos(struct seq_file *seq) 2758 { 2759 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2760 struct tcp_iter_state *st = seq->private; 2761 int bucket = st->bucket; 2762 int offset = st->offset; 2763 int orig_num = st->num; 2764 void *rc = NULL; 2765 2766 switch (st->state) { 2767 case TCP_SEQ_STATE_LISTENING: 2768 if (st->bucket > hinfo->lhash2_mask) 2769 break; 2770 rc = listening_get_first(seq); 2771 while (offset-- && rc && bucket == st->bucket) 2772 rc = listening_get_next(seq, rc); 2773 if (rc) 2774 break; 2775 st->bucket = 0; 2776 st->state = TCP_SEQ_STATE_ESTABLISHED; 2777 fallthrough; 2778 case TCP_SEQ_STATE_ESTABLISHED: 2779 if (st->bucket > hinfo->ehash_mask) 2780 break; 2781 rc = established_get_first(seq); 2782 while (offset-- && rc && bucket == st->bucket) 2783 rc = established_get_next(seq, rc); 2784 } 2785 2786 st->num = orig_num; 2787 2788 return rc; 2789 } 2790 2791 void *tcp_seq_start(struct seq_file *seq, loff_t *pos) 2792 { 2793 struct tcp_iter_state *st = seq->private; 2794 void *rc; 2795 2796 if (*pos && *pos == st->last_pos) { 2797 rc = tcp_seek_last_pos(seq); 2798 if (rc) 2799 goto out; 2800 } 2801 2802 st->state = TCP_SEQ_STATE_LISTENING; 2803 st->num = 0; 2804 st->bucket = 0; 2805 st->offset = 0; 2806 rc = *pos ? 
tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; 2807 2808 out: 2809 st->last_pos = *pos; 2810 return rc; 2811 } 2812 EXPORT_IPV6_MOD(tcp_seq_start); 2813 2814 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2815 { 2816 struct tcp_iter_state *st = seq->private; 2817 void *rc = NULL; 2818 2819 if (v == SEQ_START_TOKEN) { 2820 rc = tcp_get_idx(seq, 0); 2821 goto out; 2822 } 2823 2824 switch (st->state) { 2825 case TCP_SEQ_STATE_LISTENING: 2826 rc = listening_get_next(seq, v); 2827 if (!rc) { 2828 st->state = TCP_SEQ_STATE_ESTABLISHED; 2829 st->bucket = 0; 2830 st->offset = 0; 2831 rc = established_get_first(seq); 2832 } 2833 break; 2834 case TCP_SEQ_STATE_ESTABLISHED: 2835 rc = established_get_next(seq, v); 2836 break; 2837 } 2838 out: 2839 ++*pos; 2840 st->last_pos = *pos; 2841 return rc; 2842 } 2843 EXPORT_IPV6_MOD(tcp_seq_next); 2844 2845 void tcp_seq_stop(struct seq_file *seq, void *v) 2846 { 2847 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2848 struct tcp_iter_state *st = seq->private; 2849 2850 switch (st->state) { 2851 case TCP_SEQ_STATE_LISTENING: 2852 if (v != SEQ_START_TOKEN) 2853 spin_unlock(&hinfo->lhash2[st->bucket].lock); 2854 break; 2855 case TCP_SEQ_STATE_ESTABLISHED: 2856 if (v) 2857 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 2858 break; 2859 } 2860 } 2861 EXPORT_IPV6_MOD(tcp_seq_stop); 2862 2863 static void get_openreq4(const struct request_sock *req, 2864 struct seq_file *f, int i) 2865 { 2866 const struct inet_request_sock *ireq = inet_rsk(req); 2867 long delta = req->rsk_timer.expires - jiffies; 2868 2869 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2870 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK", 2871 i, 2872 ireq->ir_loc_addr, 2873 ireq->ir_num, 2874 ireq->ir_rmt_addr, 2875 ntohs(ireq->ir_rmt_port), 2876 TCP_SYN_RECV, 2877 0, 0, /* could print option size, but that is af dependent. 
*/ 2878 1, /* timers active (only the expire timer) */ 2879 jiffies_delta_to_clock_t(delta), 2880 req->num_timeout, 2881 from_kuid_munged(seq_user_ns(f), 2882 sock_i_uid(req->rsk_listener)), 2883 0, /* non standard timer */ 2884 0, /* open_requests have no inode */ 2885 0, 2886 req); 2887 } 2888 2889 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) 2890 { 2891 int timer_active; 2892 unsigned long timer_expires; 2893 const struct tcp_sock *tp = tcp_sk(sk); 2894 const struct inet_connection_sock *icsk = inet_csk(sk); 2895 const struct inet_sock *inet = inet_sk(sk); 2896 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; 2897 __be32 dest = inet->inet_daddr; 2898 __be32 src = inet->inet_rcv_saddr; 2899 __u16 destp = ntohs(inet->inet_dport); 2900 __u16 srcp = ntohs(inet->inet_sport); 2901 u8 icsk_pending; 2902 int rx_queue; 2903 int state; 2904 2905 icsk_pending = smp_load_acquire(&icsk->icsk_pending); 2906 if (icsk_pending == ICSK_TIME_RETRANS || 2907 icsk_pending == ICSK_TIME_REO_TIMEOUT || 2908 icsk_pending == ICSK_TIME_LOSS_PROBE) { 2909 timer_active = 1; 2910 timer_expires = icsk->icsk_timeout; 2911 } else if (icsk_pending == ICSK_TIME_PROBE0) { 2912 timer_active = 4; 2913 timer_expires = icsk->icsk_timeout; 2914 } else if (timer_pending(&sk->sk_timer)) { 2915 timer_active = 2; 2916 timer_expires = sk->sk_timer.expires; 2917 } else { 2918 timer_active = 0; 2919 timer_expires = jiffies; 2920 } 2921 2922 state = inet_sk_state_load(sk); 2923 if (state == TCP_LISTEN) 2924 rx_queue = READ_ONCE(sk->sk_ack_backlog); 2925 else 2926 /* Because we don't lock the socket, 2927 * we might find a transient negative value. 
2928 */ 2929 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) - 2930 READ_ONCE(tp->copied_seq), 0); 2931 2932 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " 2933 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d", 2934 i, src, srcp, dest, destp, state, 2935 READ_ONCE(tp->write_seq) - tp->snd_una, 2936 rx_queue, 2937 timer_active, 2938 jiffies_delta_to_clock_t(timer_expires - jiffies), 2939 icsk->icsk_retransmits, 2940 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)), 2941 icsk->icsk_probes_out, 2942 sock_i_ino(sk), 2943 refcount_read(&sk->sk_refcnt), sk, 2944 jiffies_to_clock_t(icsk->icsk_rto), 2945 jiffies_to_clock_t(icsk->icsk_ack.ato), 2946 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk), 2947 tcp_snd_cwnd(tp), 2948 state == TCP_LISTEN ? 2949 fastopenq->max_qlen : 2950 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh)); 2951 } 2952 2953 static void get_timewait4_sock(const struct inet_timewait_sock *tw, 2954 struct seq_file *f, int i) 2955 { 2956 long delta = tw->tw_timer.expires - jiffies; 2957 __be32 dest, src; 2958 __u16 destp, srcp; 2959 2960 dest = tw->tw_daddr; 2961 src = tw->tw_rcv_saddr; 2962 destp = ntohs(tw->tw_dport); 2963 srcp = ntohs(tw->tw_sport); 2964 2965 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2966 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK", 2967 i, src, srcp, dest, destp, READ_ONCE(tw->tw_substate), 0, 0, 2968 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, 2969 refcount_read(&tw->tw_refcnt), tw); 2970 } 2971 2972 #define TMPSZ 150 2973 2974 static int tcp4_seq_show(struct seq_file *seq, void *v) 2975 { 2976 struct tcp_iter_state *st; 2977 struct sock *sk = v; 2978 2979 seq_setwidth(seq, TMPSZ - 1); 2980 if (v == SEQ_START_TOKEN) { 2981 seq_puts(seq, " sl local_address rem_address st tx_queue " 2982 "rx_queue tr tm->when retrnsmt uid timeout " 2983 "inode"); 2984 goto out; 2985 } 2986 st = seq->private; 2987 2988 if (sk->sk_state == TCP_TIME_WAIT) 2989 get_timewait4_sock(v, seq, st->num); 2990 else if 
(sk->sk_state == TCP_NEW_SYN_RECV) 2991 get_openreq4(v, seq, st->num); 2992 else 2993 get_tcp4_sock(v, seq, st->num); 2994 out: 2995 seq_pad(seq, '\n'); 2996 return 0; 2997 } 2998 2999 #ifdef CONFIG_BPF_SYSCALL 3000 struct bpf_tcp_iter_state { 3001 struct tcp_iter_state state; 3002 unsigned int cur_sk; 3003 unsigned int end_sk; 3004 unsigned int max_sk; 3005 struct sock **batch; 3006 bool st_bucket_done; 3007 }; 3008 3009 struct bpf_iter__tcp { 3010 __bpf_md_ptr(struct bpf_iter_meta *, meta); 3011 __bpf_md_ptr(struct sock_common *, sk_common); 3012 uid_t uid __aligned(8); 3013 }; 3014 3015 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, 3016 struct sock_common *sk_common, uid_t uid) 3017 { 3018 struct bpf_iter__tcp ctx; 3019 3020 meta->seq_num--; /* skip SEQ_START_TOKEN */ 3021 ctx.meta = meta; 3022 ctx.sk_common = sk_common; 3023 ctx.uid = uid; 3024 return bpf_iter_run_prog(prog, &ctx); 3025 } 3026 3027 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter) 3028 { 3029 while (iter->cur_sk < iter->end_sk) 3030 sock_gen_put(iter->batch[iter->cur_sk++]); 3031 } 3032 3033 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter, 3034 unsigned int new_batch_sz) 3035 { 3036 struct sock **new_batch; 3037 3038 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, 3039 GFP_USER | __GFP_NOWARN); 3040 if (!new_batch) 3041 return -ENOMEM; 3042 3043 bpf_iter_tcp_put_batch(iter); 3044 kvfree(iter->batch); 3045 iter->batch = new_batch; 3046 iter->max_sk = new_batch_sz; 3047 3048 return 0; 3049 } 3050 3051 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq, 3052 struct sock *start_sk) 3053 { 3054 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 3055 struct bpf_tcp_iter_state *iter = seq->private; 3056 struct tcp_iter_state *st = &iter->state; 3057 struct hlist_nulls_node *node; 3058 unsigned int expected = 1; 3059 struct sock *sk; 3060 3061 sock_hold(start_sk); 3062 
iter->batch[iter->end_sk++] = start_sk; 3063 3064 sk = sk_nulls_next(start_sk); 3065 sk_nulls_for_each_from(sk, node) { 3066 if (seq_sk_match(seq, sk)) { 3067 if (iter->end_sk < iter->max_sk) { 3068 sock_hold(sk); 3069 iter->batch[iter->end_sk++] = sk; 3070 } 3071 expected++; 3072 } 3073 } 3074 spin_unlock(&hinfo->lhash2[st->bucket].lock); 3075 3076 return expected; 3077 } 3078 3079 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq, 3080 struct sock *start_sk) 3081 { 3082 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 3083 struct bpf_tcp_iter_state *iter = seq->private; 3084 struct tcp_iter_state *st = &iter->state; 3085 struct hlist_nulls_node *node; 3086 unsigned int expected = 1; 3087 struct sock *sk; 3088 3089 sock_hold(start_sk); 3090 iter->batch[iter->end_sk++] = start_sk; 3091 3092 sk = sk_nulls_next(start_sk); 3093 sk_nulls_for_each_from(sk, node) { 3094 if (seq_sk_match(seq, sk)) { 3095 if (iter->end_sk < iter->max_sk) { 3096 sock_hold(sk); 3097 iter->batch[iter->end_sk++] = sk; 3098 } 3099 expected++; 3100 } 3101 } 3102 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 3103 3104 return expected; 3105 } 3106 3107 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq) 3108 { 3109 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 3110 struct bpf_tcp_iter_state *iter = seq->private; 3111 struct tcp_iter_state *st = &iter->state; 3112 unsigned int expected; 3113 bool resized = false; 3114 struct sock *sk; 3115 3116 /* The st->bucket is done. Directly advance to the next 3117 * bucket instead of having the tcp_seek_last_pos() to skip 3118 * one by one in the current bucket and eventually find out 3119 * it has to advance to the next bucket. 
3120 */ 3121 if (iter->st_bucket_done) { 3122 st->offset = 0; 3123 st->bucket++; 3124 if (st->state == TCP_SEQ_STATE_LISTENING && 3125 st->bucket > hinfo->lhash2_mask) { 3126 st->state = TCP_SEQ_STATE_ESTABLISHED; 3127 st->bucket = 0; 3128 } 3129 } 3130 3131 again: 3132 /* Get a new batch */ 3133 iter->cur_sk = 0; 3134 iter->end_sk = 0; 3135 iter->st_bucket_done = false; 3136 3137 sk = tcp_seek_last_pos(seq); 3138 if (!sk) 3139 return NULL; /* Done */ 3140 3141 if (st->state == TCP_SEQ_STATE_LISTENING) 3142 expected = bpf_iter_tcp_listening_batch(seq, sk); 3143 else 3144 expected = bpf_iter_tcp_established_batch(seq, sk); 3145 3146 if (iter->end_sk == expected) { 3147 iter->st_bucket_done = true; 3148 return sk; 3149 } 3150 3151 if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) { 3152 resized = true; 3153 goto again; 3154 } 3155 3156 return sk; 3157 } 3158 3159 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos) 3160 { 3161 /* bpf iter does not support lseek, so it always 3162 * continue from where it was stop()-ped. 3163 */ 3164 if (*pos) 3165 return bpf_iter_tcp_batch(seq); 3166 3167 return SEQ_START_TOKEN; 3168 } 3169 3170 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3171 { 3172 struct bpf_tcp_iter_state *iter = seq->private; 3173 struct tcp_iter_state *st = &iter->state; 3174 struct sock *sk; 3175 3176 /* Whenever seq_next() is called, the iter->cur_sk is 3177 * done with seq_show(), so advance to the next sk in 3178 * the batch. 3179 */ 3180 if (iter->cur_sk < iter->end_sk) { 3181 /* Keeping st->num consistent in tcp_iter_state. 3182 * bpf_iter_tcp does not use st->num. 3183 * meta.seq_num is used instead. 3184 */ 3185 st->num++; 3186 /* Move st->offset to the next sk in the bucket such that 3187 * the future start() will resume at st->offset in 3188 * st->bucket. See tcp_seek_last_pos(). 
3189 */ 3190 st->offset++; 3191 sock_gen_put(iter->batch[iter->cur_sk++]); 3192 } 3193 3194 if (iter->cur_sk < iter->end_sk) 3195 sk = iter->batch[iter->cur_sk]; 3196 else 3197 sk = bpf_iter_tcp_batch(seq); 3198 3199 ++*pos; 3200 /* Keeping st->last_pos consistent in tcp_iter_state. 3201 * bpf iter does not do lseek, so st->last_pos always equals to *pos. 3202 */ 3203 st->last_pos = *pos; 3204 return sk; 3205 } 3206 3207 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v) 3208 { 3209 struct bpf_iter_meta meta; 3210 struct bpf_prog *prog; 3211 struct sock *sk = v; 3212 uid_t uid; 3213 int ret; 3214 3215 if (v == SEQ_START_TOKEN) 3216 return 0; 3217 3218 if (sk_fullsock(sk)) 3219 lock_sock(sk); 3220 3221 if (unlikely(sk_unhashed(sk))) { 3222 ret = SEQ_SKIP; 3223 goto unlock; 3224 } 3225 3226 if (sk->sk_state == TCP_TIME_WAIT) { 3227 uid = 0; 3228 } else if (sk->sk_state == TCP_NEW_SYN_RECV) { 3229 const struct request_sock *req = v; 3230 3231 uid = from_kuid_munged(seq_user_ns(seq), 3232 sock_i_uid(req->rsk_listener)); 3233 } else { 3234 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); 3235 } 3236 3237 meta.seq = seq; 3238 prog = bpf_iter_get_info(&meta, false); 3239 ret = tcp_prog_seq_show(prog, &meta, v, uid); 3240 3241 unlock: 3242 if (sk_fullsock(sk)) 3243 release_sock(sk); 3244 return ret; 3245 3246 } 3247 3248 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v) 3249 { 3250 struct bpf_tcp_iter_state *iter = seq->private; 3251 struct bpf_iter_meta meta; 3252 struct bpf_prog *prog; 3253 3254 if (!v) { 3255 meta.seq = seq; 3256 prog = bpf_iter_get_info(&meta, true); 3257 if (prog) 3258 (void)tcp_prog_seq_show(prog, &meta, v, 0); 3259 } 3260 3261 if (iter->cur_sk < iter->end_sk) { 3262 bpf_iter_tcp_put_batch(iter); 3263 iter->st_bucket_done = false; 3264 } 3265 } 3266 3267 static const struct seq_operations bpf_iter_tcp_seq_ops = { 3268 .show = bpf_iter_tcp_seq_show, 3269 .start = bpf_iter_tcp_seq_start, 3270 .next = 
bpf_iter_tcp_seq_next, 3271 .stop = bpf_iter_tcp_seq_stop, 3272 }; 3273 #endif 3274 static unsigned short seq_file_family(const struct seq_file *seq) 3275 { 3276 const struct tcp_seq_afinfo *afinfo; 3277 3278 #ifdef CONFIG_BPF_SYSCALL 3279 /* Iterated from bpf_iter. Let the bpf prog to filter instead. */ 3280 if (seq->op == &bpf_iter_tcp_seq_ops) 3281 return AF_UNSPEC; 3282 #endif 3283 3284 /* Iterated from proc fs */ 3285 afinfo = pde_data(file_inode(seq->file)); 3286 return afinfo->family; 3287 } 3288 3289 static const struct seq_operations tcp4_seq_ops = { 3290 .show = tcp4_seq_show, 3291 .start = tcp_seq_start, 3292 .next = tcp_seq_next, 3293 .stop = tcp_seq_stop, 3294 }; 3295 3296 static struct tcp_seq_afinfo tcp4_seq_afinfo = { 3297 .family = AF_INET, 3298 }; 3299 3300 static int __net_init tcp4_proc_init_net(struct net *net) 3301 { 3302 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops, 3303 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo)) 3304 return -ENOMEM; 3305 return 0; 3306 } 3307 3308 static void __net_exit tcp4_proc_exit_net(struct net *net) 3309 { 3310 remove_proc_entry("tcp", net->proc_net); 3311 } 3312 3313 static struct pernet_operations tcp4_net_ops = { 3314 .init = tcp4_proc_init_net, 3315 .exit = tcp4_proc_exit_net, 3316 }; 3317 3318 int __init tcp4_proc_init(void) 3319 { 3320 return register_pernet_subsys(&tcp4_net_ops); 3321 } 3322 3323 void tcp4_proc_exit(void) 3324 { 3325 unregister_pernet_subsys(&tcp4_net_ops); 3326 } 3327 #endif /* CONFIG_PROC_FS */ 3328 3329 /* @wake is one when sk_stream_write_space() calls us. 3330 * This sends EPOLLOUT only if notsent_bytes is half the limit. 3331 * This mimics the strategy used in sock_def_write_space(). 
3332 */ 3333 bool tcp_stream_memory_free(const struct sock *sk, int wake) 3334 { 3335 const struct tcp_sock *tp = tcp_sk(sk); 3336 u32 notsent_bytes = READ_ONCE(tp->write_seq) - 3337 READ_ONCE(tp->snd_nxt); 3338 3339 return (notsent_bytes << wake) < tcp_notsent_lowat(tp); 3340 } 3341 EXPORT_SYMBOL(tcp_stream_memory_free); 3342 3343 struct proto tcp_prot = { 3344 .name = "TCP", 3345 .owner = THIS_MODULE, 3346 .close = tcp_close, 3347 .pre_connect = tcp_v4_pre_connect, 3348 .connect = tcp_v4_connect, 3349 .disconnect = tcp_disconnect, 3350 .accept = inet_csk_accept, 3351 .ioctl = tcp_ioctl, 3352 .init = tcp_v4_init_sock, 3353 .destroy = tcp_v4_destroy_sock, 3354 .shutdown = tcp_shutdown, 3355 .setsockopt = tcp_setsockopt, 3356 .getsockopt = tcp_getsockopt, 3357 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt, 3358 .keepalive = tcp_set_keepalive, 3359 .recvmsg = tcp_recvmsg, 3360 .sendmsg = tcp_sendmsg, 3361 .splice_eof = tcp_splice_eof, 3362 .backlog_rcv = tcp_v4_do_rcv, 3363 .release_cb = tcp_release_cb, 3364 .hash = inet_hash, 3365 .unhash = inet_unhash, 3366 .get_port = inet_csk_get_port, 3367 .put_port = inet_put_port, 3368 #ifdef CONFIG_BPF_SYSCALL 3369 .psock_update_sk_prot = tcp_bpf_update_proto, 3370 #endif 3371 .enter_memory_pressure = tcp_enter_memory_pressure, 3372 .leave_memory_pressure = tcp_leave_memory_pressure, 3373 .stream_memory_free = tcp_stream_memory_free, 3374 .sockets_allocated = &tcp_sockets_allocated, 3375 .orphan_count = &tcp_orphan_count, 3376 3377 .memory_allocated = &tcp_memory_allocated, 3378 .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, 3379 3380 .memory_pressure = &tcp_memory_pressure, 3381 .sysctl_mem = sysctl_tcp_mem, 3382 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), 3383 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), 3384 .max_header = MAX_TCP_HEADER, 3385 .obj_size = sizeof(struct tcp_sock), 3386 .slab_flags = SLAB_TYPESAFE_BY_RCU, 3387 .twsk_prot = &tcp_timewait_sock_ops, 3388 
.rsk_prot = &tcp_request_sock_ops, 3389 .h.hashinfo = NULL, 3390 .no_autobind = true, 3391 .diag_destroy = tcp_abort, 3392 }; 3393 EXPORT_SYMBOL(tcp_prot); 3394 3395 static void __net_exit tcp_sk_exit(struct net *net) 3396 { 3397 if (net->ipv4.tcp_congestion_control) 3398 bpf_module_put(net->ipv4.tcp_congestion_control, 3399 net->ipv4.tcp_congestion_control->owner); 3400 } 3401 3402 static void __net_init tcp_set_hashinfo(struct net *net) 3403 { 3404 struct inet_hashinfo *hinfo; 3405 unsigned int ehash_entries; 3406 struct net *old_net; 3407 3408 if (net_eq(net, &init_net)) 3409 goto fallback; 3410 3411 old_net = current->nsproxy->net_ns; 3412 ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries); 3413 if (!ehash_entries) 3414 goto fallback; 3415 3416 ehash_entries = roundup_pow_of_two(ehash_entries); 3417 hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries); 3418 if (!hinfo) { 3419 pr_warn("Failed to allocate TCP ehash (entries: %u) " 3420 "for a netns, fallback to the global one\n", 3421 ehash_entries); 3422 fallback: 3423 hinfo = &tcp_hashinfo; 3424 ehash_entries = tcp_hashinfo.ehash_mask + 1; 3425 } 3426 3427 net->ipv4.tcp_death_row.hashinfo = hinfo; 3428 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2; 3429 net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128); 3430 } 3431 3432 static int __net_init tcp_sk_init(struct net *net) 3433 { 3434 net->ipv4.sysctl_tcp_ecn = 2; 3435 net->ipv4.sysctl_tcp_ecn_fallback = 1; 3436 3437 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; 3438 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS; 3439 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; 3440 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL; 3441 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS; 3442 3443 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME; 3444 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES; 3445 net->ipv4.sysctl_tcp_keepalive_intvl = 
TCP_KEEPALIVE_INTVL; 3446 3447 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES; 3448 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES; 3449 net->ipv4.sysctl_tcp_syncookies = 1; 3450 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH; 3451 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1; 3452 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2; 3453 net->ipv4.sysctl_tcp_orphan_retries = 0; 3454 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; 3455 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; 3456 net->ipv4.sysctl_tcp_tw_reuse = 2; 3457 net->ipv4.sysctl_tcp_tw_reuse_delay = 1 * MSEC_PER_SEC; 3458 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1; 3459 3460 refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1); 3461 tcp_set_hashinfo(net); 3462 3463 net->ipv4.sysctl_tcp_sack = 1; 3464 net->ipv4.sysctl_tcp_window_scaling = 1; 3465 net->ipv4.sysctl_tcp_timestamps = 1; 3466 net->ipv4.sysctl_tcp_early_retrans = 3; 3467 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION; 3468 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */ 3469 net->ipv4.sysctl_tcp_retrans_collapse = 1; 3470 net->ipv4.sysctl_tcp_max_reordering = 300; 3471 net->ipv4.sysctl_tcp_dsack = 1; 3472 net->ipv4.sysctl_tcp_app_win = 31; 3473 net->ipv4.sysctl_tcp_adv_win_scale = 1; 3474 net->ipv4.sysctl_tcp_frto = 2; 3475 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1; 3476 /* This limits the percentage of the congestion window which we 3477 * will allow a single TSO frame to consume. Building TSO frames 3478 * which are too large can cause TCP streams to be bursty. 3479 */ 3480 net->ipv4.sysctl_tcp_tso_win_divisor = 3; 3481 /* Default TSQ limit of 16 TSO segments */ 3482 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536; 3483 3484 /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. 
*/ 3485 net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX; 3486 3487 net->ipv4.sysctl_tcp_min_tso_segs = 2; 3488 net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */ 3489 net->ipv4.sysctl_tcp_min_rtt_wlen = 300; 3490 net->ipv4.sysctl_tcp_autocorking = 1; 3491 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; 3492 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; 3493 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120; 3494 if (net != &init_net) { 3495 memcpy(net->ipv4.sysctl_tcp_rmem, 3496 init_net.ipv4.sysctl_tcp_rmem, 3497 sizeof(init_net.ipv4.sysctl_tcp_rmem)); 3498 memcpy(net->ipv4.sysctl_tcp_wmem, 3499 init_net.ipv4.sysctl_tcp_wmem, 3500 sizeof(init_net.ipv4.sysctl_tcp_wmem)); 3501 } 3502 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC; 3503 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC; 3504 net->ipv4.sysctl_tcp_comp_sack_nr = 44; 3505 net->ipv4.sysctl_tcp_backlog_ack_defer = 1; 3506 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; 3507 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0; 3508 atomic_set(&net->ipv4.tfo_active_disable_times, 0); 3509 3510 /* Set default values for PLB */ 3511 net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */ 3512 net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3; 3513 net->ipv4.sysctl_tcp_plb_rehash_rounds = 12; 3514 net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60; 3515 /* Default congestion threshold for PLB to mark a round is 50% */ 3516 net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2; 3517 3518 /* Reno is always built in */ 3519 if (!net_eq(net, &init_net) && 3520 bpf_try_module_get(init_net.ipv4.tcp_congestion_control, 3521 init_net.ipv4.tcp_congestion_control->owner)) 3522 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control; 3523 else 3524 net->ipv4.tcp_congestion_control = &tcp_reno; 3525 3526 net->ipv4.sysctl_tcp_syn_linear_timeouts = 4; 3527 net->ipv4.sysctl_tcp_shrink_window = 0; 3528 3529 net->ipv4.sysctl_tcp_pingpong_thresh = 1; 3530 
net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN); 3531 net->ipv4.sysctl_tcp_rto_max_ms = TCP_RTO_MAX_SEC * MSEC_PER_SEC; 3532 3533 return 0; 3534 } 3535 3536 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) 3537 { 3538 struct net *net; 3539 3540 /* make sure concurrent calls to tcp_sk_exit_batch from net_cleanup_work 3541 * and failed setup_net error unwinding path are serialized. 3542 * 3543 * tcp_twsk_purge() handles twsk in any dead netns, not just those in 3544 * net_exit_list, the thread that dismantles a particular twsk must 3545 * do so without other thread progressing to refcount_dec_and_test() of 3546 * tcp_death_row.tw_refcount. 3547 */ 3548 mutex_lock(&tcp_exit_batch_mutex); 3549 3550 tcp_twsk_purge(net_exit_list); 3551 3552 list_for_each_entry(net, net_exit_list, exit_list) { 3553 inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo); 3554 WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount)); 3555 tcp_fastopen_ctx_destroy(net); 3556 } 3557 3558 mutex_unlock(&tcp_exit_batch_mutex); 3559 } 3560 3561 static struct pernet_operations __net_initdata tcp_sk_ops = { 3562 .init = tcp_sk_init, 3563 .exit = tcp_sk_exit, 3564 .exit_batch = tcp_sk_exit_batch, 3565 }; 3566 3567 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3568 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta, 3569 struct sock_common *sk_common, uid_t uid) 3570 3571 #define INIT_BATCH_SZ 16 3572 3573 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux) 3574 { 3575 struct bpf_tcp_iter_state *iter = priv_data; 3576 int err; 3577 3578 err = bpf_iter_init_seq_net(priv_data, aux); 3579 if (err) 3580 return err; 3581 3582 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ); 3583 if (err) { 3584 bpf_iter_fini_seq_net(priv_data); 3585 return err; 3586 } 3587 3588 return 0; 3589 } 3590 3591 static void bpf_iter_fini_tcp(void *priv_data) 3592 { 3593 struct bpf_tcp_iter_state *iter = priv_data; 3594 
3595 bpf_iter_fini_seq_net(priv_data); 3596 kvfree(iter->batch); 3597 } 3598 3599 static const struct bpf_iter_seq_info tcp_seq_info = { 3600 .seq_ops = &bpf_iter_tcp_seq_ops, 3601 .init_seq_private = bpf_iter_init_tcp, 3602 .fini_seq_private = bpf_iter_fini_tcp, 3603 .seq_priv_size = sizeof(struct bpf_tcp_iter_state), 3604 }; 3605 3606 static const struct bpf_func_proto * 3607 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id, 3608 const struct bpf_prog *prog) 3609 { 3610 switch (func_id) { 3611 case BPF_FUNC_setsockopt: 3612 return &bpf_sk_setsockopt_proto; 3613 case BPF_FUNC_getsockopt: 3614 return &bpf_sk_getsockopt_proto; 3615 default: 3616 return NULL; 3617 } 3618 } 3619 3620 static struct bpf_iter_reg tcp_reg_info = { 3621 .target = "tcp", 3622 .ctx_arg_info_size = 1, 3623 .ctx_arg_info = { 3624 { offsetof(struct bpf_iter__tcp, sk_common), 3625 PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED }, 3626 }, 3627 .get_func_proto = bpf_iter_tcp_get_func_proto, 3628 .seq_info = &tcp_seq_info, 3629 }; 3630 3631 static void __init bpf_iter_register(void) 3632 { 3633 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON]; 3634 if (bpf_iter_reg_target(&tcp_reg_info)) 3635 pr_warn("Warning: could not register bpf iterator tcp\n"); 3636 } 3637 3638 #endif 3639 3640 void __init tcp_v4_init(void) 3641 { 3642 int cpu, res; 3643 3644 for_each_possible_cpu(cpu) { 3645 struct sock *sk; 3646 3647 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW, 3648 IPPROTO_TCP, &init_net); 3649 if (res) 3650 panic("Failed to create the TCP control socket.\n"); 3651 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); 3652 3653 /* Please enforce IP_DF and IPID==0 for RST and 3654 * ACK sent in SYN-RECV and TIME-WAIT state. 
3655 */ 3656 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO; 3657 3658 sk->sk_clockid = CLOCK_MONOTONIC; 3659 3660 per_cpu(ipv4_tcp_sk.sock, cpu) = sk; 3661 } 3662 if (register_pernet_subsys(&tcp_sk_ops)) 3663 panic("Failed to create the TCP control socket.\n"); 3664 3665 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3666 bpf_iter_register(); 3667 #endif 3668 } 3669