1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * INET An implementation of the TCP/IP protocol suite for the LINUX 4 * operating system. INET is implemented using the BSD Socket 5 * interface as the means of communication with the user level. 6 * 7 * Implementation of the Transmission Control Protocol(TCP). 8 * 9 * IPv4 specific functions 10 * 11 * code split from: 12 * linux/ipv4/tcp.c 13 * linux/ipv4/tcp_input.c 14 * linux/ipv4/tcp_output.c 15 * 16 * See tcp.c for author information 17 */ 18 19 /* 20 * Changes: 21 * David S. Miller : New socket lookup architecture. 22 * This code is dedicated to John Dyson. 23 * David S. Miller : Change semantics of established hash, 24 * half is devoted to TIME_WAIT sockets 25 * and the rest go in the other half. 26 * Andi Kleen : Add support for syncookies and fixed 27 * some bugs: ip options weren't passed to 28 * the TCP layer, missed a check for an 29 * ACK bit. 30 * Andi Kleen : Implemented fast path mtu discovery. 31 * Fixed many serious bugs in the 32 * request_sock handling and moved 33 * most of it into the af independent code. 34 * Added tail drop and some other bugfixes. 35 * Added new listen semantics. 36 * Mike McLagan : Routing by source 37 * Juan Jose Ciarlante: ip_dynaddr bits 38 * Andi Kleen: various fixes. 39 * Vitaly E. Lavrov : Transparent proxy revived after year 40 * coma. 41 * Andi Kleen : Fix new listen. 42 * Andi Kleen : Fix accept error reporting. 43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which 44 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind 45 * a single port at the same time. 
46 */ 47 48 #define pr_fmt(fmt) "TCP: " fmt 49 50 #include <linux/bottom_half.h> 51 #include <linux/types.h> 52 #include <linux/fcntl.h> 53 #include <linux/module.h> 54 #include <linux/random.h> 55 #include <linux/cache.h> 56 #include <linux/jhash.h> 57 #include <linux/init.h> 58 #include <linux/times.h> 59 #include <linux/slab.h> 60 #include <linux/sched.h> 61 62 #include <net/net_namespace.h> 63 #include <net/icmp.h> 64 #include <net/inet_hashtables.h> 65 #include <net/tcp.h> 66 #include <net/transp_v6.h> 67 #include <net/ipv6.h> 68 #include <net/inet_common.h> 69 #include <net/timewait_sock.h> 70 #include <net/xfrm.h> 71 #include <net/secure_seq.h> 72 #include <net/busy_poll.h> 73 #include <net/rstreason.h> 74 75 #include <linux/inet.h> 76 #include <linux/ipv6.h> 77 #include <linux/stddef.h> 78 #include <linux/proc_fs.h> 79 #include <linux/seq_file.h> 80 #include <linux/inetdevice.h> 81 #include <linux/btf_ids.h> 82 #include <linux/skbuff_ref.h> 83 84 #include <crypto/hash.h> 85 #include <linux/scatterlist.h> 86 87 #include <trace/events/tcp.h> 88 89 #ifdef CONFIG_TCP_MD5SIG 90 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, 91 __be32 daddr, __be32 saddr, const struct tcphdr *th); 92 #endif 93 94 struct inet_hashinfo tcp_hashinfo; 95 EXPORT_SYMBOL(tcp_hashinfo); 96 97 static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = { 98 .bh_lock = INIT_LOCAL_LOCK(bh_lock), 99 }; 100 101 static DEFINE_MUTEX(tcp_exit_batch_mutex); 102 103 static u32 tcp_v4_init_seq(const struct sk_buff *skb) 104 { 105 return secure_tcp_seq(ip_hdr(skb)->daddr, 106 ip_hdr(skb)->saddr, 107 tcp_hdr(skb)->dest, 108 tcp_hdr(skb)->source); 109 } 110 111 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb) 112 { 113 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr); 114 } 115 116 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) 117 { 118 int reuse = 
READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse); 119 const struct inet_timewait_sock *tw = inet_twsk(sktw); 120 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw); 121 struct tcp_sock *tp = tcp_sk(sk); 122 int ts_recent_stamp; 123 124 if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2) 125 reuse = 0; 126 127 if (reuse == 2) { 128 /* Still does not detect *everything* that goes through 129 * lo, since we require a loopback src or dst address 130 * or direct binding to 'lo' interface. 131 */ 132 bool loopback = false; 133 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX) 134 loopback = true; 135 #if IS_ENABLED(CONFIG_IPV6) 136 if (tw->tw_family == AF_INET6) { 137 if (ipv6_addr_loopback(&tw->tw_v6_daddr) || 138 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) || 139 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) || 140 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr)) 141 loopback = true; 142 } else 143 #endif 144 { 145 if (ipv4_is_loopback(tw->tw_daddr) || 146 ipv4_is_loopback(tw->tw_rcv_saddr)) 147 loopback = true; 148 } 149 if (!loopback) 150 reuse = 0; 151 } 152 153 /* With PAWS, it is safe from the viewpoint 154 of data integrity. Even without PAWS it is safe provided sequence 155 spaces do not overlap i.e. at data rates <= 80Mbit/sec. 156 157 Actually, the idea is close to VJ's one, only timestamp cache is 158 held not per host, but per port pair and TW bucket is used as state 159 holder. 160 161 If TW bucket has been already destroyed we fall back to VJ's scheme 162 and use initial timestamp retrieved from peer table. 163 */ 164 ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp); 165 if (ts_recent_stamp && 166 (!twp || (reuse && time_after32(ktime_get_seconds(), 167 ts_recent_stamp)))) { 168 /* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk 169 * and releasing the bucket lock. 
170 */ 171 if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt))) 172 return 0; 173 174 /* In case of repair and re-using TIME-WAIT sockets we still 175 * want to be sure that it is safe as above but honor the 176 * sequence numbers and time stamps set as part of the repair 177 * process. 178 * 179 * Without this check re-using a TIME-WAIT socket with TCP 180 * repair would accumulate a -1 on the repair assigned 181 * sequence number. The first time it is reused the sequence 182 * is -1, the second time -2, etc. This fixes that issue 183 * without appearing to create any others. 184 */ 185 if (likely(!tp->repair)) { 186 u32 seq = tcptw->tw_snd_nxt + 65535 + 2; 187 188 if (!seq) 189 seq = 1; 190 WRITE_ONCE(tp->write_seq, seq); 191 tp->rx_opt.ts_recent = READ_ONCE(tcptw->tw_ts_recent); 192 tp->rx_opt.ts_recent_stamp = ts_recent_stamp; 193 } 194 195 return 1; 196 } 197 198 return 0; 199 } 200 EXPORT_SYMBOL_GPL(tcp_twsk_unique); 201 202 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr, 203 int addr_len) 204 { 205 /* This check is replicated from tcp_v4_connect() and intended to 206 * prevent BPF program called below from accessing bytes that are out 207 * of the bound specified by user in addr_len. 208 */ 209 if (addr_len < sizeof(struct sockaddr_in)) 210 return -EINVAL; 211 212 sock_owned_by_me(sk); 213 214 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len); 215 } 216 217 /* This will initiate an outgoing connection. 
*/ 218 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) 219 { 220 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; 221 struct inet_timewait_death_row *tcp_death_row; 222 struct inet_sock *inet = inet_sk(sk); 223 struct tcp_sock *tp = tcp_sk(sk); 224 struct ip_options_rcu *inet_opt; 225 struct net *net = sock_net(sk); 226 __be16 orig_sport, orig_dport; 227 __be32 daddr, nexthop; 228 struct flowi4 *fl4; 229 struct rtable *rt; 230 int err; 231 232 if (addr_len < sizeof(struct sockaddr_in)) 233 return -EINVAL; 234 235 if (usin->sin_family != AF_INET) 236 return -EAFNOSUPPORT; 237 238 nexthop = daddr = usin->sin_addr.s_addr; 239 inet_opt = rcu_dereference_protected(inet->inet_opt, 240 lockdep_sock_is_held(sk)); 241 if (inet_opt && inet_opt->opt.srr) { 242 if (!daddr) 243 return -EINVAL; 244 nexthop = inet_opt->opt.faddr; 245 } 246 247 orig_sport = inet->inet_sport; 248 orig_dport = usin->sin_port; 249 fl4 = &inet->cork.fl.u.ip4; 250 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr, 251 sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport, 252 orig_dport, sk); 253 if (IS_ERR(rt)) { 254 err = PTR_ERR(rt); 255 if (err == -ENETUNREACH) 256 IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); 257 return err; 258 } 259 260 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { 261 ip_rt_put(rt); 262 return -ENETUNREACH; 263 } 264 265 if (!inet_opt || !inet_opt->opt.srr) 266 daddr = fl4->daddr; 267 268 tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; 269 270 if (!inet->inet_saddr) { 271 err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET); 272 if (err) { 273 ip_rt_put(rt); 274 return err; 275 } 276 } else { 277 sk_rcv_saddr_set(sk, inet->inet_saddr); 278 } 279 280 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) { 281 /* Reset inherited state */ 282 tp->rx_opt.ts_recent = 0; 283 tp->rx_opt.ts_recent_stamp = 0; 284 if (likely(!tp->repair)) 285 WRITE_ONCE(tp->write_seq, 0); 286 } 287 288 inet->inet_dport = usin->sin_port; 289 
sk_daddr_set(sk, daddr); 290 291 inet_csk(sk)->icsk_ext_hdr_len = 0; 292 if (inet_opt) 293 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 294 295 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT; 296 297 /* Socket identity is still unknown (sport may be zero). 298 * However we set state to SYN-SENT and not releasing socket 299 * lock select source port, enter ourselves into the hash tables and 300 * complete initialization after this. 301 */ 302 tcp_set_state(sk, TCP_SYN_SENT); 303 err = inet_hash_connect(tcp_death_row, sk); 304 if (err) 305 goto failure; 306 307 sk_set_txhash(sk); 308 309 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport, 310 inet->inet_sport, inet->inet_dport, sk); 311 if (IS_ERR(rt)) { 312 err = PTR_ERR(rt); 313 rt = NULL; 314 goto failure; 315 } 316 tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst); 317 /* OK, now commit destination to socket. */ 318 sk->sk_gso_type = SKB_GSO_TCPV4; 319 sk_setup_caps(sk, &rt->dst); 320 rt = NULL; 321 322 if (likely(!tp->repair)) { 323 if (!tp->write_seq) 324 WRITE_ONCE(tp->write_seq, 325 secure_tcp_seq(inet->inet_saddr, 326 inet->inet_daddr, 327 inet->inet_sport, 328 usin->sin_port)); 329 WRITE_ONCE(tp->tsoffset, 330 secure_tcp_ts_off(net, inet->inet_saddr, 331 inet->inet_daddr)); 332 } 333 334 atomic_set(&inet->inet_id, get_random_u16()); 335 336 if (tcp_fastopen_defer_connect(sk, &err)) 337 return err; 338 if (err) 339 goto failure; 340 341 err = tcp_connect(sk); 342 343 if (err) 344 goto failure; 345 346 return 0; 347 348 failure: 349 /* 350 * This unhashes the socket and releases the local port, 351 * if necessary. 352 */ 353 tcp_set_state(sk, TCP_CLOSE); 354 inet_bhash2_reset_saddr(sk); 355 ip_rt_put(rt); 356 sk->sk_route_caps = 0; 357 inet->inet_dport = 0; 358 return err; 359 } 360 EXPORT_SYMBOL(tcp_v4_connect); 361 362 /* 363 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191. 
364 * It can be called through tcp_release_cb() if socket was owned by user 365 * at the time tcp_v4_err() was called to handle ICMP message. 366 */ 367 void tcp_v4_mtu_reduced(struct sock *sk) 368 { 369 struct inet_sock *inet = inet_sk(sk); 370 struct dst_entry *dst; 371 u32 mtu; 372 373 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) 374 return; 375 mtu = READ_ONCE(tcp_sk(sk)->mtu_info); 376 dst = inet_csk_update_pmtu(sk, mtu); 377 if (!dst) 378 return; 379 380 /* Something is about to be wrong... Remember soft error 381 * for the case, if this connection will not able to recover. 382 */ 383 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst)) 384 WRITE_ONCE(sk->sk_err_soft, EMSGSIZE); 385 386 mtu = dst_mtu(dst); 387 388 if (inet->pmtudisc != IP_PMTUDISC_DONT && 389 ip_sk_accept_pmtu(sk) && 390 inet_csk(sk)->icsk_pmtu_cookie > mtu) { 391 tcp_sync_mss(sk, mtu); 392 393 /* Resend the TCP packet because it's 394 * clear that the old packet has been 395 * dropped. This is the new "fast" path mtu 396 * discovery. 397 */ 398 tcp_simple_retransmit(sk); 399 } /* else let the usual retransmit timer handle it */ 400 } 401 EXPORT_SYMBOL(tcp_v4_mtu_reduced); 402 403 static void do_redirect(struct sk_buff *skb, struct sock *sk) 404 { 405 struct dst_entry *dst = __sk_dst_check(sk, 0); 406 407 if (dst) 408 dst->ops->redirect(dst, sk, skb); 409 } 410 411 412 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */ 413 void tcp_req_err(struct sock *sk, u32 seq, bool abort) 414 { 415 struct request_sock *req = inet_reqsk(sk); 416 struct net *net = sock_net(sk); 417 418 /* ICMPs are not backlogged, hence we cannot get 419 * an established socket here. 420 */ 421 if (seq != tcp_rsk(req)->snt_isn) { 422 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); 423 } else if (abort) { 424 /* 425 * Still in SYN_RECV, just remove it silently. 
426 * There is no good way to pass the error to the newly 427 * created socket, and POSIX does not want network 428 * errors returned from accept(). 429 */ 430 inet_csk_reqsk_queue_drop(req->rsk_listener, req); 431 tcp_listendrop(req->rsk_listener); 432 } 433 reqsk_put(req); 434 } 435 EXPORT_SYMBOL(tcp_req_err); 436 437 /* TCP-LD (RFC 6069) logic */ 438 void tcp_ld_RTO_revert(struct sock *sk, u32 seq) 439 { 440 struct inet_connection_sock *icsk = inet_csk(sk); 441 struct tcp_sock *tp = tcp_sk(sk); 442 struct sk_buff *skb; 443 s32 remaining; 444 u32 delta_us; 445 446 if (sock_owned_by_user(sk)) 447 return; 448 449 if (seq != tp->snd_una || !icsk->icsk_retransmits || 450 !icsk->icsk_backoff) 451 return; 452 453 skb = tcp_rtx_queue_head(sk); 454 if (WARN_ON_ONCE(!skb)) 455 return; 456 457 icsk->icsk_backoff--; 458 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT; 459 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); 460 461 tcp_mstamp_refresh(tp); 462 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb)); 463 remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us); 464 465 if (remaining > 0) { 466 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, 467 remaining, TCP_RTO_MAX); 468 } else { 469 /* RTO revert clocked out retransmission. 470 * Will retransmit now. 471 */ 472 tcp_retransmit_timer(sk); 473 } 474 } 475 EXPORT_SYMBOL(tcp_ld_RTO_revert); 476 477 /* 478 * This routine is called by the ICMP module when it gets some 479 * sort of error condition. If err < 0 then the socket should 480 * be closed and the error returned to the user. If err > 0 481 * it's just the icmp type << 8 | icmp code. After adjustment 482 * header points to the first 8 bytes of the tcp header. We need 483 * to find the appropriate port. 484 * 485 * The locking strategy used here is very "optimistic". When 486 * someone else accesses the socket the ICMP is just dropped 487 * and for some paths there is no check at all. 
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 * @skb:  the ICMP error; skb->data is read as the embedded IPv4 header and
 *        the TCP header is taken at iph->ihl * 4 bytes past it.
 * @info: passed on to ip_icmp_error(); for FRAG_NEEDED it is stored in
 *        tp->mtu_info.
 *
 * Returns -ENOENT when no matching socket is found, 0 in every other case.
 */

int tcp_v4_err(struct sk_buff *skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct tcp_sock *tp;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct sock *sk;
	struct request_sock *fastopen;
	u32 seq, snd_una;
	int err;
	struct net *net = dev_net(skb->dev);

	/* The embedded header is a packet we sent, so its destination is the
	 * remote end and its source is the local end of the lookup.
	 */
	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
				       iph->daddr, th->dest, iph->saddr,
				       ntohs(th->source), inet_iif(skb), 0);
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return -ENOENT;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		/* To increase the counter of ignored icmps for TCP-AO */
		tcp_ao_ignore_icmp(sk, AF_INET, type, code);
		inet_twsk_put(inet_twsk(sk));
		return 0;
	}
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		/* Third argument is the "abort" flag: true for the ICMP
		 * types/codes listed here.  tcp_req_err() drops the
		 * request sock reference itself.
		 */
		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
				     type == ICMP_TIME_EXCEEDED ||
				     (type == ICMP_DEST_UNREACH &&
				      (code == ICMP_NET_UNREACH ||
				       code == ICMP_HOST_UNREACH)));
		return 0;
	}

	if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) {
		sock_put(sk);
		return 0;
	}

	/* From here on every exit goes through "out", which unlocks the
	 * socket and drops the lookup reference.
	 */
	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (static_branch_unlikely(&ip4_min_ttl)) {
		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
			goto out;
		}
	}

	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = rcu_dereference(tp->fastopen_rsk);
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	/* Except for listeners, ignore errors whose quoted sequence number is
	 * outside [snd_una, snd_nxt].
	 */
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	/* Map the ICMP type/code to an errno in "err", or finish early. */
	switch (type) {
	case ICMP_REDIRECT:
		if (!sock_owned_by_user(sk))
			do_redirect(skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs send out by Linux are always <576bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			WRITE_ONCE(tp->mtu_info, info);
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				/* Socket is owned by the user: flag the work
				 * as deferred and take a reference the first
				 * time the flag gets set.
				 */
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if this ICMP message allows revert of backoff.
		 * (see RFC 6069)
		 */
		if (!fastopen &&
		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
			tcp_ld_RTO_revert(sk, seq);
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket is
		 * already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);

		if (!sock_owned_by_user(sk))
			tcp_done_with_error(sk, err);
		else
			WRITE_ONCE(sk->sk_err_soft, err);
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even this two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 *							--ANK (980905)
	 */

	if (!sock_owned_by_user(sk) &&
	    inet_test_bit(RECVERR, sk)) {
		WRITE_ONCE(sk->sk_err, err);
		sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		WRITE_ONCE(sk->sk_err_soft, err);
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
	return 0;
}

/* Prepare @skb's TCP header for checksum completion: store the complemented
 * pseudo-header sum (over skb->len, @saddr, @daddr) in th->check and record
 * where the checksum starts and at which offset the result belongs.
 */
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
	skb->csum_start = skb_transport_header(skb) - skb->head;
	skb->csum_offset = offsetof(struct tcphdr, check);
}

/* This routine computes an IPv4 TCP checksum. */
/* It uses the socket's own source and destination addresses. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);

/* Number of 32-bit words in the option area of a reply header. */
#define REPLY_OPTIONS_LEN      (MAX_TCP_OPTION_SPACE / sizeof(__be32))

/* Add a TCP-AO option to a reset that is being built in @reply.
 *
 * On success the option header is written to reply_options[0], the MAC is
 * written from reply_options[1] on, and both arg->iov[0].iov_len and
 * reply->doff are grown to cover the aligned option.
 *
 * Returns true when the reset must be dropped (key preparation or hashing
 * failed, or CONFIG_TCP_AO is disabled) and false when it was signed.
 */
static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb,
				 const struct tcp_ao_hdr *aoh,
				 struct ip_reply_arg *arg, struct tcphdr *reply,
				 __be32 reply_options[REPLY_OPTIONS_LEN])
{
#ifdef CONFIG_TCP_AO
	int sdif = tcp_v4_sdif(skb);
	int dif = inet_iif(skb);
	int l3index = sdif ? dif : 0;
	bool allocated_traffic_key;
	struct tcp_ao_key *key;
	char *traffic_key;
	bool drop = true;
	u32 ao_sne = 0;
	u8 keyid;

	rcu_read_lock();
	/* NOTE(review): allocated_traffic_key is read at "out" even when this
	 * call fails, so tcp_ao_prepare_reset() presumably always sets it -
	 * confirm against its definition.
	 */
	if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq),
				 &key, &traffic_key, &allocated_traffic_key,
				 &keyid, &ao_sne))
		goto out;

	reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) |
				 (aoh->rnext_keyid << 8) | keyid);
	arg->iov[0].iov_len += tcp_ao_len_aligned(key);
	reply->doff = arg->iov[0].iov_len / 4;

	if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1],
			    key, traffic_key,
			    (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
			    (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
			    reply, ao_sne))
		goto out;
	drop = false;
out:
	rcu_read_unlock();
	if (allocated_traffic_key)
		kfree(traffic_key);
	return drop;
#else
	return true;
#endif
}

/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *		So that we build reply only basing on parameters
 *		arrived with segment.
 *	Exception: precedence violation. We do not implement it in any case.
 *
 *	@sk may be NULL, a full socket, or a non-full socket such as a
 *	TIME-WAIT one (its state is checked before socket fields are read).
 *	@skb is the segment being answered.  @reason is only passed to the
 *	trace point.
 */

static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
			      enum sk_rst_reason reason)
{
	const struct tcphdr *th = tcp_hdr(skb);
	/* Reply header followed by room for options; sent as one iovec. */
	struct {
		struct tcphdr th;
		__be32 opt[REPLY_OPTIONS_LEN];
	} rep;
	const __u8 *md5_hash_location = NULL;
	const struct tcp_ao_hdr *aoh;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	unsigned char newhash[16];
	struct sock *sk1 = NULL;
	int genhash;
#endif
	u64 transmit_time = 0;
	struct sock *ctl_sk;
	struct net *net;
	u32 txhash = 0;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	/* If the segment carried an ACK, reuse its ack_seq as our sequence
	 * number; otherwise acknowledge everything the segment occupied
	 * (SYN and FIN each count for one, plus the payload length).
	 */
	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);

	/* Invalid TCP option size or twice included auth */
	if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh))
		return;

	/* A true return from the signing helper means "drop the reset". */
	if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt))
		return;

#ifdef CONFIG_TCP_MD5SIG
	/* Held until the "out" label at the end of the function; every exit
	 * below this point must therefore go through "out".
	 */
	rcu_read_lock();
	if (sk && sk_fullsock(sk)) {
		const union tcp_md5_addr *addr;
		int l3index;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and inet_iif is set to it.
		 */
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	} else if (md5_hash_location) {
		const union tcp_md5_addr *addr;
		int sdif = tcp_v4_sdif(skb);
		int dif = inet_iif(skb);
		int l3index;

		/*
		 * active side is lost. Try to find listening socket through
		 * source port, and then find md5 key through listening socket.
		 * we are not loose security here:
		 * Incoming packet is checked with md5 hash with finding key,
		 * no RST generated if md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
					     NULL, 0, ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), dif, sdif);
		/* don't send rst if it can't find key */
		if (!sk1)
			goto out;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and dif is set to it.
		 */
		l3index = sdif ? dif : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
		if (!key)
			goto out;


		/* Verify the incoming segment's MD5 digest before replying. */
		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (genhash || memcmp(md5_hash_location, newhash, 16) != 0)
			goto out;

	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				     key, ip_hdr(skb)->saddr,
				     ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
	if (rep.opt[0] == 0) {
		__be32 mrst = mptcp_reset_option(skb);

		if (mrst) {
			rep.opt[0] = mrst;
			arg.iov[0].iov_len += sizeof(mrst);
			rep.th.doff = arg.iov[0].iov_len / 4;
		}
	}

	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	/* Offset of the checksum field, in 16-bit units. */
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
	if (sk)
		arg.bound_dev_if = sk->sk_bound_dev_if;

	trace_tcp_send_reset(sk, skb, reason);

	/* sk->sk_bound_dev_if is read above without checking the socket
	 * state, so the field must sit at the same offset in a timewait sock.
	 */
	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

	arg.tos = ip_hdr(skb)->tos;
	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
	/* Borrow this CPU's control socket: BHs off and its local lock held
	 * while its fields are overwritten for this one reply.
	 */
	local_bh_disable();
	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);

	sock_net_set(ctl_sk, net);
	if (sk) {
		/* TIME-WAIT sockets keep these values in tw_* fields. */
		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_mark : sk->sk_mark;
		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
		transmit_time = tcp_transmit_time(sk);
		xfrm_sk_clone_policy(ctl_sk, sk);
		txhash = (sk->sk_state == TCP_TIME_WAIT) ?
			 inet_twsk(sk)->tw_txhash : sk->sk_txhash;
	} else {
		ctl_sk->sk_mark = 0;
		ctl_sk->sk_priority = 0;
	}
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time, txhash);

	/* Undo the per-reply changes made to the control socket. */
	xfrm_sk_free_policy(ctl_sk);
	sock_net_set(ctl_sk, &init_net);
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
	local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
	rcu_read_unlock();
#endif
}

/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
931 */ 932 933 static void tcp_v4_send_ack(const struct sock *sk, 934 struct sk_buff *skb, u32 seq, u32 ack, 935 u32 win, u32 tsval, u32 tsecr, int oif, 936 struct tcp_key *key, 937 int reply_flags, u8 tos, u32 txhash) 938 { 939 const struct tcphdr *th = tcp_hdr(skb); 940 struct { 941 struct tcphdr th; 942 __be32 opt[(MAX_TCP_OPTION_SPACE >> 2)]; 943 } rep; 944 struct net *net = sock_net(sk); 945 struct ip_reply_arg arg; 946 struct sock *ctl_sk; 947 u64 transmit_time; 948 949 memset(&rep.th, 0, sizeof(struct tcphdr)); 950 memset(&arg, 0, sizeof(arg)); 951 952 arg.iov[0].iov_base = (unsigned char *)&rep; 953 arg.iov[0].iov_len = sizeof(rep.th); 954 if (tsecr) { 955 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | 956 (TCPOPT_TIMESTAMP << 8) | 957 TCPOLEN_TIMESTAMP); 958 rep.opt[1] = htonl(tsval); 959 rep.opt[2] = htonl(tsecr); 960 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED; 961 } 962 963 /* Swap the send and the receive. */ 964 rep.th.dest = th->source; 965 rep.th.source = th->dest; 966 rep.th.doff = arg.iov[0].iov_len / 4; 967 rep.th.seq = htonl(seq); 968 rep.th.ack_seq = htonl(ack); 969 rep.th.ack = 1; 970 rep.th.window = htons(win); 971 972 #ifdef CONFIG_TCP_MD5SIG 973 if (tcp_key_is_md5(key)) { 974 int offset = (tsecr) ? 3 : 0; 975 976 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) | 977 (TCPOPT_NOP << 16) | 978 (TCPOPT_MD5SIG << 8) | 979 TCPOLEN_MD5SIG); 980 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; 981 rep.th.doff = arg.iov[0].iov_len/4; 982 983 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset], 984 key->md5_key, ip_hdr(skb)->saddr, 985 ip_hdr(skb)->daddr, &rep.th); 986 } 987 #endif 988 #ifdef CONFIG_TCP_AO 989 if (tcp_key_is_ao(key)) { 990 int offset = (tsecr) ? 
3 : 0; 991 992 rep.opt[offset++] = htonl((TCPOPT_AO << 24) | 993 (tcp_ao_len(key->ao_key) << 16) | 994 (key->ao_key->sndid << 8) | 995 key->rcv_next); 996 arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key); 997 rep.th.doff = arg.iov[0].iov_len / 4; 998 999 tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset], 1000 key->ao_key, key->traffic_key, 1001 (union tcp_ao_addr *)&ip_hdr(skb)->saddr, 1002 (union tcp_ao_addr *)&ip_hdr(skb)->daddr, 1003 &rep.th, key->sne); 1004 } 1005 #endif 1006 arg.flags = reply_flags; 1007 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 1008 ip_hdr(skb)->saddr, /* XXX */ 1009 arg.iov[0].iov_len, IPPROTO_TCP, 0); 1010 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 1011 if (oif) 1012 arg.bound_dev_if = oif; 1013 arg.tos = tos; 1014 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL); 1015 local_bh_disable(); 1016 local_lock_nested_bh(&ipv4_tcp_sk.bh_lock); 1017 ctl_sk = this_cpu_read(ipv4_tcp_sk.sock); 1018 sock_net_set(ctl_sk, net); 1019 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 1020 inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark); 1021 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? 
inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
	transmit_time = tcp_transmit_time(sk);
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time, txhash);

	sock_net_set(ctl_sk, &init_net);
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
	local_bh_enable();
}

/*
 * Send an ACK on behalf of a TIME_WAIT socket in reply to @skb.
 *
 * @sk:  the timewait socket (accessed through inet_twsk()/tcp_twsk()).
 * @skb: the segment being answered.
 *
 * A signing key is selected first: a TCP-AO key when the timewait socket
 * carries ao_info and the segment has an AO option whose rnext_keyid
 * matches an established key, otherwise a TCP-MD5 key when MD5 is enabled
 * and the timewait socket has one.  The ACK itself is built by
 * tcp_v4_send_ack() from the sequence, window and timestamp state saved
 * in the timewait socket.
 *
 * The timewait reference is dropped with inet_twsk_put() on every path,
 * including the early return taken when tcp_parse_auth_options() fails.
 */
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
	struct tcp_key key = {};
#ifdef CONFIG_TCP_AO
	struct tcp_ao_info *ao_info;

	if (static_branch_unlikely(&tcp_ao_needed.key)) {
		/* FIXME: the segment to-be-acked is not verified yet */
		ao_info = rcu_dereference(tcptw->ao_info);
		if (ao_info) {
			const struct tcp_ao_hdr *aoh;

			/* No ACK is sent if the auth options cannot be parsed */
			if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) {
				inet_twsk_put(tw);
				return;
			}

			if (aoh)
				key.ao_key = tcp_ao_established_key(ao_info, aoh->rnext_keyid, -1);
		}
	}
	/* key.ao_key can only be non-NULL if ao_info was set above */
	if (key.ao_key) {
		struct tcp_ao_key *rnext_key;

		key.traffic_key = snd_other_key(key.ao_key);
		key.sne = READ_ONCE(ao_info->snd_sne);
		rnext_key = READ_ONCE(ao_info->rnext_key);
		key.rcv_next = rnext_key->rcvid;
		key.type = TCP_KEY_AO;
#else
	/* Without TCP-AO, a never-taken branch keeps the "} else if" below
	 * syntactically valid.
	 */
	if (0) {
#endif
	} else if (static_branch_tcp_md5()) {
		key.md5_key = tcp_twsk_md5_key(tcptw);
		if (key.md5_key)
			key.type = TCP_KEY_MD5;
	}

	tcp_v4_send_ack(sk, skb,
			tcptw->tw_snd_nxt, READ_ONCE(tcptw->tw_rcv_nxt),
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_tw_tsval(tcptw),
			READ_ONCE(tcptw->tw_ts_recent),
			tw->tw_bound_dev_if, &key,
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos,
			tw->tw_txhash);

	inet_twsk_put(tw);
}

/*
 * Send an ACK for a connection that is still represented by a request
 * sock.
 *
 * @sk:  the socket the request hangs off; its state selects the sequence
 *       number to use (see the comment on @seq below).
 * @skb: the segment being answered.
 * @req: the request sock supplying rcv_nxt, window, timestamps and txhash.
 *
 * When the request used TCP-AO, the AO key is looked up by the peer's
 * rnext_keyid, falling back to any key matching the peer.  A temporary
 * traffic key is allocated with GFP_ATOMIC and freed after the ACK has
 * been handed to tcp_v4_send_ack().  In that branch the function returns
 * without sending anything if the options cannot be parsed, no AO option
 * is present, no key is found, or the allocation fails.
 * Otherwise a TCP-MD5 key is looked up when MD5 is enabled.
 */
static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	struct tcp_key key = {};

	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
					     tcp_sk(sk)->snd_nxt;

#ifdef CONFIG_TCP_AO
	if (static_branch_unlikely(&tcp_ao_needed.key) &&
	    tcp_rsk_used_ao(req)) {
		const union tcp_md5_addr *addr;
		const struct tcp_ao_hdr *aoh;
		int l3index;

		/* Invalid TCP option size or twice included auth */
		if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
			return;
		if (!aoh)
			return;

		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET,
					      aoh->rnext_keyid, -1);
		if (unlikely(!key.ao_key)) {
			/* Send ACK with any matching MKT for the peer */
			key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1);
			/* Matching key disappeared (user removed the key?)
			 * let the handshake timeout.
			 */
			if (!key.ao_key) {
				net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n",
						     addr,
						     ntohs(tcp_hdr(skb)->source),
						     &ip_hdr(skb)->daddr,
						     ntohs(tcp_hdr(skb)->dest));
				return;
			}
		}
		/* Scratch buffer for the per-request traffic key; released
		 * at the end of this function.
		 */
		key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC);
		if (!key.traffic_key)
			return;

		key.type = TCP_KEY_AO;
		key.rcv_next = aoh->keyid;
		tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req);
#else
	/* Without TCP-AO, a never-taken branch keeps the "} else if" below
	 * syntactically valid.
	 */
	if (0) {
#endif
	} else if (static_branch_tcp_md5()) {
		const union tcp_md5_addr *addr;
		int l3index;

		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
		if (key.md5_key)
			key.type = TCP_KEY_MD5;
	}

	tcp_v4_send_ack(sk, skb, seq,
			tcp_rsk(req)->rcv_nxt,
			tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale,
			tcp_rsk_tsval(tcp_rsk(req)),
			READ_ONCE(req->ts_recent),
			0, &key,
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos,
			READ_ONCE(tcp_rsk(req)->txhash));
	/* Only the AO path allocated traffic_key */
	if (tcp_key_is_ao(&key))
		kfree(key.traffic_key);
}

/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 *
 *	@dst may be NULL, in which case a route is obtained with
 *	inet_csk_route_req().
 *
 *	Returns -1 when no route is available or tcp_make_synack() yields
 *	no skb; otherwise the result of ip_build_and_send_pkt() filtered
 *	through net_xmit_eval().
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
			      struct flowi *fl,
			      struct request_sock *req,
			      struct tcp_fastopen_cookie *foc,
			      enum tcp_synack_type synack_type,
			      struct sk_buff *syn_skb)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;
	u8 tos;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

		tos = READ_ONCE(inet_sk(sk)->tos);

		/* With tcp_reflect_tos, take the non-ECN bits from the
		 * received SYN and keep the ECN bits of the socket's tos.
		 */
		if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
			tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
			      (tos & INET_ECN_MASK);

		if (!INET_ECN_is_capable(tos) &&
		    tcp_bpf_ca_needs_ecn((struct sock *)req))
			tos |= INET_ECN_ECT_0;

		/* ireq_opt is read under RCU for the duration of the send */
		rcu_read_lock();
		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
					    rcu_dereference(ireq->ireq_opt),
					    tos);
		rcu_read_unlock();
		err = net_xmit_eval(err);
	}

	return err;
}

/*
 *	IPv4 request_sock destructor.
 *
 *	Frees the IP options saved in the request.  The constant 1 passed
 *	to rcu_dereference_protected() means no lockdep condition is
 *	checked here.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

/* Static key tested through static_branch_tcp_md5(); declared as a
 * deferred key with a timeout of HZ.
 */
DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
EXPORT_SYMBOL(tcp_md5_needed);

/*
 * Decide whether @new is a better match than the current best @old.
 *
 * Returns true when there is no current match, when @new has an l3index
 * and @old does not, or, with equal l3index presence, when @new has a
 * strictly longer prefix.  Returns false otherwise.
 */
static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
{
	if (!old)
		return true;

	/* l3index always overrides non-l3index */
	if (old->l3index && new->l3index == 0)
		return false;
	if (old->l3index == 0 && new->l3index)
		return true;

	return old->prefixlen < new->prefixlen;
}

/* Find the Key structure for an address.
*/
/*
 * Walks the socket's MD5 key list and returns the best key whose family
 * matches @family and whose prefix covers @addr, or NULL when the socket
 * has no MD5 info or nothing matches.
 *
 * Unless @any_l3index is true, keys flagged TCP_MD5SIG_FLAG_IFINDEX are
 * skipped when their l3index differs from @l3index.  Among matching keys
 * the winner is chosen by better_md5_match().
 */
struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
					   const union tcp_md5_addr *addr,
					   int family, bool any_l3index)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	const struct tcp_md5sig_info *md5sig;
	__be32 mask;
	struct tcp_md5sig_key *best_match = NULL;
	bool match;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;

	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX &&
		    key->l3index != l3index)
			continue;
		if (family == AF_INET) {
			/* Compare only the bits covered by the key's prefix */
			mask = inet_make_mask(key->prefixlen);
			match = (key->addr.a4.s_addr & mask) ==
				(addr->a4.s_addr & mask);
#if IS_ENABLED(CONFIG_IPV6)
		} else if (family == AF_INET6) {
			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
						  key->prefixlen);
#endif
		} else {
			match = false;
		}

		if (match && better_md5_match(best_match, key))
			best_match = key;
	}
	return best_match;
}
EXPORT_SYMBOL(__tcp_md5_do_lookup);

/*
 * Find the key that was configured with exactly these parameters.
 *
 * Unlike __tcp_md5_do_lookup() this does no prefix matching: family, the
 * TCP_MD5SIG_FLAG_IFINDEX bit of @flags, @l3index, the full address and
 * @prefixlen must all be equal.  Returns the key or NULL.
 */
static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
						      const union tcp_md5_addr *addr,
						      int family, u8 prefixlen,
						      int l3index, u8 flags)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	const struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	/* size is the number of address bytes compared below */
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
			continue;
		if (key->l3index != l3index)
			continue;
		if (!memcmp(&key->addr, addr, size) &&
		    key->prefixlen == prefixlen)
			return key;
	}
	return NULL;
}

/*
 * Look up the MD5 key on @sk for the peer of @addr_sk.
 *
 * The peer address is @addr_sk->sk_daddr and the l3index is derived from
 * @addr_sk->sk_bound_dev_if via l3mdev_master_ifindex_by_index().
 */
struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
					 const struct sock *addr_sk)
{
	const union tcp_md5_addr *addr;
	int l3index;

	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
						 addr_sk->sk_bound_dev_if);
	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

/*
 * Allocate an empty MD5 key list and publish it in tp->md5sig_info.
 *
 * GSO is disabled on the socket before the list is published.
 * Returns 0 on success or -ENOMEM if the allocation fails.
 */
static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	md5sig = kmalloc(sizeof(*md5sig), gfp);
	if (!md5sig)
		return -ENOMEM;

	sk_gso_disable(sk);
	INIT_HLIST_HEAD(&md5sig->head);
	rcu_assign_pointer(tp->md5sig_info, md5sig);
	return 0;
}

/* This can be called on a newly created socket, from other files */
/*
 * Add a key, or overwrite the secret of an existing key that has exactly
 * the same (addr, family, prefixlen, l3index, IFINDEX flag) tuple.
 *
 * tp->md5sig_info is dereferenced without a NULL check when a new key is
 * linked in; both callers in this file create it beforehand.
 *
 * Returns 0 on success or -ENOMEM if a new key cannot be allocated.
 */
static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
			    int family, u8 prefixlen, int l3index, u8 flags,
			    const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
	if (key) {
		/* Pre-existing entry - just update that one.
		 * Note that the key might be used concurrently.
		 * data_race() is telling kcsan that we do not care of
		 * key mismatches, since changing MD5 key on live flows
		 * can lead to packet drops.
		 */
		data_race(memcpy(key->key, newkey, newkeylen));

		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
		 * Also note that a reader could catch new key->keylen value
		 * but old key->key[], this is the reason we use __GFP_ZERO
		 * at sock_kmalloc() time below these lines.
		 */
		WRITE_ONCE(key->keylen, newkeylen);

		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   lockdep_sock_is_held(sk));

	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
	if (!key)
		return -ENOMEM;

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	key->prefixlen = prefixlen;
	key->l3index = l3index;
	key->flags = flags;
	/* Copy only as many address bytes as the family uses */
	memcpy(&key->addr, addr,
	       (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
								 sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}

/*
 * Add or update an MD5 key on @sk, using GFP_KERNEL allocations.
 *
 * If the socket has no MD5 info yet, this first allocates the sigpool,
 * creates the key list and increments the tcp_md5_needed static key.
 * Each step is undone if a later one fails.
 *
 * Returns 0 on success, -ENOMEM on allocation failure, or -EUSERS when
 * static_branch_inc() fails.
 */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, u8 prefixlen, int l3index, u8 flags,
		   const u8 *newkey, u8 newkeylen)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
		if (tcp_md5_alloc_sigpool())
			return -ENOMEM;

		if (tcp_md5sig_info_add(sk, GFP_KERNEL)) {
			tcp_md5_release_sigpool();
			return -ENOMEM;
		}

		if (!static_branch_inc(&tcp_md5_needed.key)) {
			struct tcp_md5sig_info *md5sig;

			/* Unpublish the list just created, then free it
			 * after a grace period.
			 */
			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
			rcu_assign_pointer(tp->md5sig_info, NULL);
			kfree_rcu(md5sig, rcu);
			tcp_md5_release_sigpool();
			return -EUSERS;
		}
	}

	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
				newkey, newkeylen, GFP_KERNEL);
}
EXPORT_SYMBOL(tcp_md5_do_add);

/*
 * Copy an existing key @key onto @sk for peer @addr.
 *
 * Same sequence as tcp_md5_do_add(), but allocations use
 * sk_gfp_mask(sk, GFP_ATOMIC), the sigpool is referenced with
 * tcp_md5_add_sigpool(), and the static key is incremented with
 * static_key_fast_inc_not_disabled().
 *
 * Returns 0 on success, -ENOMEM on allocation failure, or -EUSERS when
 * the static key cannot be incremented (a rate-limited warning is logged).
 */
int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
		     int family, u8 prefixlen, int l3index,
		     struct tcp_md5sig_key *key)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
		tcp_md5_add_sigpool();

		if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC))) {
			tcp_md5_release_sigpool();
			return -ENOMEM;
		}

		if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
			struct tcp_md5sig_info *md5sig;

			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
			net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
			rcu_assign_pointer(tp->md5sig_info, NULL);
			kfree_rcu(md5sig, rcu);
			tcp_md5_release_sigpool();
			return -EUSERS;
		}
	}

	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
				key->flags, key->key, key->keylen,
				sk_gfp_mask(sk, GFP_ATOMIC));
}
EXPORT_SYMBOL(tcp_md5_key_copy);

/*
 * Remove the key that exactly matches the given parameters.
 *
 * The key is unlinked from the list, its size is subtracted from
 * sk->sk_omem_alloc, and it is freed after an RCU grace period.
 *
 * Returns 0 on success or -ENOENT if no such key exists.
 */
int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
		   u8 prefixlen, int l3index, u8 flags)
{
	struct tcp_md5sig_key *key;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);

/*
 * Remove every MD5 key from @sk.
 *
 * NOTE(review): md5sig is dereferenced without a NULL check, so callers
 * presumably only invoke this when tp->md5sig_info is set - confirm.
 * The md5sig_info structure itself is not freed here.
 */
void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
	}
}

/*
 * Handle the TCP_MD5SIG / TCP_MD5SIG_EXT socket options for IPv4.
 *
 * Copies a struct tcp_md5sig from @optval, validates it, and then either
 * deletes the key (tcpm_keylen == 0) or adds/updates it.
 *
 * Returns 0 on success, or:
 *   -EINVAL       @optlen too small, family not AF_INET, prefix length
 *                 above 32, key too long, or the given ifindex is not an
 *                 existing L3 master device;
 *   -EFAULT       the option could not be copied;
 *   -EKEYREJECTED tcp_ao_required() reports a matching TCP-AO key;
 *   otherwise whatever tcp_md5_do_del() / tcp_md5_do_add() return.
 */
static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
				 sockptr_t optval, int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
	const union tcp_md5_addr *addr;
	u8 prefixlen = 32;
	int l3index = 0;
	bool l3flag;
	u8 flags;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	/* Only the IFINDEX bit is stored with the key; l3flag is the same
	 * bit as a boolean.
	 */
	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
	l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;

	/* Prefix and ifindex fields are honoured only for TCP_MD5SIG_EXT */
	if (optname == TCP_MD5SIG_EXT &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
		prefixlen = cmd.tcpm_prefixlen;
		if (prefixlen > 32)
			return -EINVAL;
	}

	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
		if (dev && netif_is_l3_master(dev))
			l3index = dev->ifindex;

		rcu_read_unlock();

		/* ok to reference set/not set outside of rcu;
		 * right now device MUST be an L3 master
		 */
		if (!dev || !l3index)
			return -EINVAL;
	}

	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;

	/* A zero-length key is a request to delete */
	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	/* Don't allow keys for peers that have a matching TCP-AO key.
	 * See the comment in tcp_ao_add_cmd()
	 */
	if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false))
		return -EKEYREJECTED;

	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
			      cmd.tcpm_key, cmd.tcpm_keylen);
}

/*
 * Feed the IPv4 pseudo-header and the fixed TCP header into the hash.
 *
 * The pseudo-header is built in hp->scratch and is followed by a copy of
 * @th (sizeof(*th) bytes only, no options) with the checksum field
 * zeroed.  @nbytes is stored as the pseudo-header length field.
 *
 * Returns the result of crypto_ahash_update().
 */
static int tcp_v4_md5_hash_headers(struct tcp_sigpool *hp,
				   __be32 daddr, __be32 saddr,
				   const struct tcphdr *th, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;
	struct tcphdr *_th;

	bp = hp->scratch;
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	/* The TCP header copy sits right after the pseudo-header */
	_th = (struct tcphdr *)(bp + 1);
	memcpy(_th, th, sizeof(*th));
	_th->check = 0;

	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
	ahash_request_set_crypt(hp->req, &sg, NULL,
				sizeof(*bp) + sizeof(*th));
	return crypto_ahash_update(hp->req);
}

/*
 * Compute the MD5 signature over pseudo-header, TCP header and key.
 *
 * No payload is hashed; the pseudo-header length is the TCP header
 * length (th->doff << 2).
 *
 * Returns 0 on success.  On any failure returns 1 and zeroes the 16
 * bytes at @md5_hash.
 */
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_sigpool hp;

	if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
		goto clear_hash_nostart;

	if (crypto_ahash_init(hp.req))
		goto clear_hash;
	if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(&hp, key))
		goto clear_hash;
	ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
	if (crypto_ahash_final(hp.req))
		goto clear_hash;

	tcp_sigpool_end(&hp);
	return 0;

clear_hash:
	tcp_sigpool_end(&hp);
clear_hash_nostart:
	memset(md5_hash, 0, 16);
	return 1;
}

/*
 * Compute the MD5 signature of a whole segment.
 *
 * Addresses come from @sk when it is non-NULL, otherwise from the IP
 * header of @skb.  The hash covers the pseudo-header (length skb->len),
 * the fixed TCP header, the skb data starting after the TCP header
 * (th->doff << 2), and the key.
 *
 * Returns 0 on success.  On any failure returns 1 and zeroes the 16
 * bytes at @md5_hash.
 */
int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
			const struct sock *sk,
			const struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct tcp_sigpool hp;
	__be32 saddr, daddr;

	if (sk) { /* valid for establish/request sockets */
		saddr = sk->sk_rcv_saddr;
		daddr = sk->sk_daddr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
		goto clear_hash_nostart;

	if (crypto_ahash_init(hp.req))
		goto clear_hash;

	if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, skb->len))
		goto clear_hash;
	if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(&hp, key))
		goto clear_hash;
	ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
	if (crypto_ahash_final(hp.req))
		goto clear_hash;

	tcp_sigpool_end(&hp);
	return 0;

clear_hash:
	tcp_sigpool_end(&hp);
clear_hash_nostart:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

#endif

/*
 * Initialise the IPv4 parts of a new request sock from the received SYN:
 * the local address is the packet's destination, the remote address is
 * its source, and the IP options are saved with tcp_v4_save_options().
 */
static void tcp_v4_init_req(struct request_sock *req,
			    const struct sock *sk_listener,
			    struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	struct net *net = sock_net(sk_listener);

	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
}

/*
 * route_req callback: initialise @req, run the LSM connection-request
 * hook, then route the reply.
 *
 * Returns the route, or NULL if security_inet_conn_request() rejects the
 * request or inet_csk_route_req() finds no route.  @tw_isn is not used
 * in this implementation.
 */
static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
					  struct sk_buff *skb,
					  struct flowi *fl,
					  struct request_sock *req,
					  u32 tw_isn)
{
	tcp_v4_init_req(req, sk, skb);

	if (security_inet_conn_request(sk, skb, req))
		return NULL;

	return inet_csk_route_req(sk, &fl->u.ip4, req);
}

/* Address-family independent request_sock operations, IPv4 flavour */
struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};

/* TCP-specific request_sock operations for IPv4; the MD5, AO and
 * syncookie hooks are present only when the matching option is
 * configured.
 */
const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.req_md5_lookup	=	tcp_v4_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#endif
#ifdef CONFIG_TCP_AO
	.ao_lookup	=	tcp_v4_ao_lookup_rsk,
	.ao_calc_key	=	tcp_v4_ao_calc_key_rsk,
	.ao_synack_hash	=	tcp_v4_ao_synack_hash,
#endif
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
#endif
	.route_req	=	tcp_v4_route_req,
	.init_seq	=	tcp_v4_init_seq,
	.init_ts_off	=	tcp_v4_init_ts_off,
	.send_synack	=	tcp_v4_send_synack,
};

/*
 * Handle an incoming connection request on a listening socket.
 *
 * SYNs whose route is flagged broadcast or multicast are counted as
 * listen drops and 0 is returned; everything else is passed to
 * tcp_conn_request() with the IPv4 ops tables.
 */
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&tcp_request_sock_ops,
				&tcp_request_sock_ipv4_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);


/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
*/
/*
 * @sk:         the listening socket.
 * @skb:        the segment that completed the handshake.
 * @req:        the request sock being promoted.
 * @dst:        route to use, or NULL to have one looked up here.
 * @req_unhash: request to replace in the established hash; NULL in the
 *              syncookie case (see the comment near the end).
 * @own_req:    set to the result of inet_ehash_nolisten().
 *
 * Returns the new child socket, or NULL on failure.  Every failure path
 * counts a listen drop; a full accept queue additionally increments
 * LINUX_MIB_LISTENOVERFLOWS.
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req)
{
	struct inet_request_sock *ireq;
	bool found_dup_sk = false;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	const union tcp_md5_addr *addr;
	struct tcp_md5sig_key *key;
	int l3index;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	/* Fill the child's addressing and IP-level state from the request
	 * and from the segment that completed the handshake.
	 */
	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	sk_daddr_set(newsk, ireq->ir_rmt_addr);
	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
	newsk->sk_bound_dev_if = ireq->ir_iif;
	newinet->inet_saddr   = ireq->ir_loc_addr;
	/* The child shares the request's options pointer for now;
	 * ownership is settled after the hash insertion below.
	 */
	inet_opt	      = rcu_dereference(ireq->ireq_opt);
	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	newinet->rcv_tos      = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	atomic_set(&newinet->inet_id, get_random_u16());

	/* Set ToS of the new socket based upon the value of incoming SYN.
	 * ECT bits are set later in tcp_init_transfer().
	 */
	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
	/* Copy over the MD5 key from the original socket */
	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	/* The MD5 key is not copied when the request used TCP-AO.  The
	 * copy is installed for the exact peer address (prefix length 32).
	 */
	if (key && !tcp_rsk_used_ao(req)) {
		if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
			goto put_and_exit;
		sk_gso_disable(newsk);
	}
#endif
#ifdef CONFIG_TCP_AO
	if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET))
		goto put_and_exit; /* OOM, release back memory */
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
				       &found_dup_sk);
	if (likely(*own_req)) {
		tcp_move_syn(newtp, req);
		/* The options now belong to the child only */
		ireq->ireq_opt = NULL;
	} else {
		/* The options stay with the request */
		newinet->inet_opt = NULL;

		if (!req_unhash && found_dup_sk) {
			/* This code path should be executed in the
			 * syncookie case only
			 */
			bh_unlock_sock(newsk);
			sock_put(newsk);
			newsk = NULL;
		}
	}
	return newsk;

exit_overflow:
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	tcp_listendrop(sk);
	return NULL;
put_and_exit:
	/* Detach the shared options pointer before tearing the child down;
	 * ireq->ireq_opt is left untouched.  This path jumps to "exit", so
	 * dst_release() above is not executed for it.
	 */
	newinet->inet_opt = NULL;
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

/*
 * With CONFIG_SYN_COOKIES, a non-SYN segment arriving on a listener is
 * handed to cookie_v4_check() and its result is returned.  SYN segments,
 * and all segments when syncookies are not configured, return @sk
 * unchanged.
 */
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	const struct tcphdr *th = tcp_hdr(skb);

	if (!th->syn)
		sk = cookie_v4_check(sk, skb);
#endif
	return sk;
}

/*
 * Generate a syncookie for the SYN described by @iph / @th.
 *
 * Returns the MSS obtained from tcp_get_syncookie_mss(), or 0 when that
 * yields no MSS or CONFIG_SYN_COOKIES is not set.  When the MSS is
 * non-zero, *@cookie receives the cookie sequence number and
 * tcp_synq_overflow() is called on the listener.
 */
u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
			 struct tcphdr *th, u32 *cookie)
{
	u16 mss = 0;
#ifdef CONFIG_SYN_COOKIES
	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
				    &tcp_request_sock_ipv4_ops, sk, th);
	if (mss) {
		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
		tcp_synq_overflow(sk);
	}
#endif
	return mss;
}

INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
							   u32));
/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
*/
/*
 * Process one segment for @sk.  Always returns 0.
 *
 * Established sockets take the fast path through tcp_rcv_established().
 * For other states the checksum is verified first; listeners may hand
 * the segment to a child found via the syncookie check; everything else
 * goes through tcp_rcv_state_process().  When processing reports a drop
 * reason, a reset is sent and the skb is dropped with that reason.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	enum skb_drop_reason reason;
	struct sock *rsk;

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst;

		dst = rcu_dereference_protected(sk->sk_rx_dst,
						lockdep_sock_is_held(sk));

		sock_rps_save_rxhash(sk, skb);
		sk_mark_napi_id(sk, skb);
		if (dst) {
			/* Drop the cached input route if the packet came
			 * in on a different interface or the route no
			 * longer passes its check.
			 */
			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
					     dst, 0)) {
				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
				dst_release(dst);
			}
		}
		tcp_rcv_established(sk, skb);
		return 0;
	}

	if (tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_cookie_check(sk, skb);

		if (!nsk)
			return 0;
		/* A different socket means the cookie check produced a
		 * child: process the segment there.
		 */
		if (nsk != sk) {
			reason = tcp_child_process(sk, nsk, skb);
			if (reason) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	reason = tcp_rcv_state_process(sk, skb);
	if (reason) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason));
discard:
	sk_skb_reason_drop(sk, skb, reason);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	reason = SKB_DROP_REASON_TCP_CSUM;
	trace_tcp_bad_csum(skb);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);

/*
 * Early demultiplexing: try to attach an established socket, and its
 * cached input route, to @skb.  Always returns 0.
 *
 * Nothing is done for packets not addressed to this host, packets too
 * short to hold a TCP header, or headers with doff below the minimum.
 */
int tcp_v4_early_demux(struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;

	if (skb->pkt_type != PACKET_HOST)
		return 0;

	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
		return 0;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	if (th->doff < sizeof(struct tcphdr) / 4)
		return 0;

	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
				       iph->saddr, th->source,
				       iph->daddr, ntohs(th->dest),
				       skb->skb_iif, inet_sdif(skb));
	if (sk) {
		skb->sk = sk;
		skb->destructor = sock_edemux;
		if (sk_fullsock(sk)) {
			struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);

			if (dst)
				dst = dst_check(dst, 0);
			/* Reuse the cached route only if it is still valid
			 * and was recorded for the same input interface.
			 */
			if (dst &&
			    sk->sk_rx_dst_ifindex == skb->skb_iif)
				skb_dst_set_noref(skb, dst);
		}
	}
	return 0;
}

/*
 * Queue @skb on the backlog of @sk, coalescing it with the backlog tail
 * when possible.
 *
 * Returns false when the skb was queued or merged into the tail.
 * Returns true when it was not queued (bad checksum or backlog limit
 * exceeded); in that case *@reason is set and the socket has already
 * been unlocked with bh_unlock_sock().
 */
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
		     enum skb_drop_reason *reason)
{
	u32 tail_gso_size, tail_gso_segs;
	struct skb_shared_info *shinfo;
	const struct tcphdr *th;
	struct tcphdr *thtail;
	struct sk_buff *tail;
	unsigned int hdrlen;
	bool fragstolen;
	u32 gso_segs;
	u32 gso_size;
	u64 limit;
	int delta;

	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
	 * we can fix skb->truesize to its real value to avoid future drops.
	 * This is valid because skb is not yet charged to the socket.
	 * It has been noticed pure SACK packets were sometimes dropped
	 * (if cooked by drivers without copybreak feature).
	 */
	skb_condense(skb);

	skb_dst_drop(skb);

	if (unlikely(tcp_checksum_complete(skb))) {
		bh_unlock_sock(sk);
		trace_tcp_bad_csum(skb);
		*reason = SKB_DROP_REASON_TCP_CSUM;
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
		return true;
	}

	/* Attempt coalescing to last skb in backlog, even if we are
	 * above the limits.
	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
	 */
	th = (const struct tcphdr *)skb->data;
	hdrlen = th->doff * 4;

	tail = sk->sk_backlog.tail;
	if (!tail)
		goto no_coalesce;
	thtail = (struct tcphdr *)tail->data;

	/* Coalesce only if the new skb directly follows the tail in
	 * sequence space, both have the same DS field, neither carries
	 * SYN/RST/URG, both carry ACK, their ECE/CWR bits agree,
	 * tcp_skb_can_collapse_rx() allows it, and the TCP options are
	 * byte-for-byte identical.
	 */
	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
	    ((TCP_SKB_CB(tail)->tcp_flags |
	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
	    !((TCP_SKB_CB(tail)->tcp_flags &
	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
	    ((TCP_SKB_CB(tail)->tcp_flags ^
	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
	    !tcp_skb_can_collapse_rx(tail, skb) ||
	    thtail->doff != th->doff ||
	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
		goto no_coalesce;

	/* Strip the TCP header so only payload is appended to the tail;
	 * it is pushed back below if coalescing fails.
	 */
	__skb_pull(skb, hdrlen);

	shinfo = skb_shinfo(skb);
	gso_size = shinfo->gso_size ?: skb->len;
	gso_segs = shinfo->gso_segs ?: 1;

	/* From here on shinfo refers to the tail skb.  The tail still has
	 * its header, hence the hdrlen subtraction.
	 */
	shinfo = skb_shinfo(tail);
	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
	tail_gso_segs = shinfo->gso_segs ?: 1;

	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;

		/* Take ack_seq and window from the new skb unless its ACK
		 * is older than the tail's.
		 */
		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
			thtail->window = th->window;
		}

		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
		 * thtail->fin, so that the fast path in tcp_rcv_established()
		 * is not entered if we append a packet with a FIN.
		 * SYN, RST, URG are not present.
		 * ACK is set on both packets.
		 * PSH : we do not really care in TCP stack,
		 *       at least for 'GRO' packets.
		 */
		thtail->fin |= th->fin;
		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;

		if (TCP_SKB_CB(skb)->has_rxtstamp) {
			TCP_SKB_CB(tail)->has_rxtstamp = true;
			tail->tstamp = skb->tstamp;
			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
		}

		/* Not as strict as GRO. We only need to carry mss max value */
		shinfo->gso_size = max(gso_size, tail_gso_size);
		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);

		sk->sk_backlog.len += delta;
		__NET_INC_STATS(sock_net(sk),
				LINUX_MIB_TCPBACKLOGCOALESCE);
		kfree_skb_partial(skb, fragstolen);
		return false;
	}
	__skb_push(skb, hdrlen);

no_coalesce:
	/* sk->sk_backlog.len is reset only at the end of __release_sock().
	 * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach
	 * sk_rcvbuf in normal conditions.
	 */
	limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1;

	limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1;

	/* Only socket owner can try to collapse/prune rx queues
	 * to reduce memory overhead, so add a little headroom here.
	 * Few sockets backlog are possibly concurrently non empty.
	 */
	limit += 64 * 1024;

	/* The sum is computed in 64 bits and then clamped to UINT_MAX */
	limit = min_t(u64, limit, UINT_MAX);

	if (unlikely(sk_add_backlog(sk, skb, limit))) {
		bh_unlock_sock(sk);
		*reason = SKB_DROP_REASON_SOCKET_BACKLOG;
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
		return true;
	}
	return false;
}
EXPORT_SYMBOL(tcp_add_backlog);

/*
 * Run the socket filter on @skb, passing the TCP header length
 * (th->doff * 4) as the cap argument of sk_filter_trim_cap().
 * Returns that function's result.
 */
int tcp_filter(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = (struct tcphdr *)skb->data;

	return sk_filter_trim_cap(sk, skb, th->doff * 4);
}
EXPORT_SYMBOL(tcp_filter);

/* Undo tcp_v4_fill_cb(): move the saved inet_skb_parm back to IPCB() */
static void tcp_v4_restore_cb(struct sk_buff *skb)
{
	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
		sizeof(struct inet_skb_parm));
}

/*
 * Populate TCP_SKB_CB(@skb) from the IP and TCP headers.
 *
 * The IP control block is first moved into header.h4, then seq, end_seq,
 * ack_seq, flags, DS field, sacked and has_rxtstamp are filled in.
 * end_seq counts SYN and FIN as one sequence number each, plus the
 * payload length (skb->len minus the TCP header length).
 */
static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
			   const struct tcphdr *th)
{
	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
	 * barrier() makes sure compiler won't play fool^Waliasing games.
	 */
	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
		sizeof(struct inet_skb_parm));
	barrier();

	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
	TCP_SKB_CB(skb)->sacked	 = 0;
	TCP_SKB_CB(skb)->has_rxtstamp =
			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
}

/*
 *	From tcp_input.c
 */

int tcp_v4_rcv(struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);
	enum skb_drop_reason drop_reason;
	int sdif = inet_sdif(skb);
	int dif = inet_iif(skb);
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk = NULL;
	bool refcounted;
	int ret;
	u32 isn;

	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	__TCP_INC_STATS(net, TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = (const struct tcphdr *)skb->data;

	if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
		drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
		goto bad_packet;
	}
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks.
*/ 2212 2213 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo)) 2214 goto csum_error; 2215 2216 th = (const struct tcphdr *)skb->data; 2217 iph = ip_hdr(skb); 2218 lookup: 2219 sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo, 2220 skb, __tcp_hdrlen(th), th->source, 2221 th->dest, sdif, &refcounted); 2222 if (!sk) 2223 goto no_tcp_socket; 2224 2225 if (sk->sk_state == TCP_TIME_WAIT) 2226 goto do_time_wait; 2227 2228 if (sk->sk_state == TCP_NEW_SYN_RECV) { 2229 struct request_sock *req = inet_reqsk(sk); 2230 bool req_stolen = false; 2231 struct sock *nsk; 2232 2233 sk = req->rsk_listener; 2234 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) 2235 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2236 else 2237 drop_reason = tcp_inbound_hash(sk, req, skb, 2238 &iph->saddr, &iph->daddr, 2239 AF_INET, dif, sdif); 2240 if (unlikely(drop_reason)) { 2241 sk_drops_add(sk, skb); 2242 reqsk_put(req); 2243 goto discard_it; 2244 } 2245 if (tcp_checksum_complete(skb)) { 2246 reqsk_put(req); 2247 goto csum_error; 2248 } 2249 if (unlikely(sk->sk_state != TCP_LISTEN)) { 2250 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb); 2251 if (!nsk) { 2252 inet_csk_reqsk_queue_drop_and_put(sk, req); 2253 goto lookup; 2254 } 2255 sk = nsk; 2256 /* reuseport_migrate_sock() has already held one sk_refcnt 2257 * before returning. 2258 */ 2259 } else { 2260 /* We own a reference on the listener, increase it again 2261 * as we might lose it too soon. 2262 */ 2263 sock_hold(sk); 2264 } 2265 refcounted = true; 2266 nsk = NULL; 2267 if (!tcp_filter(sk, skb)) { 2268 th = (const struct tcphdr *)skb->data; 2269 iph = ip_hdr(skb); 2270 tcp_v4_fill_cb(skb, iph, th); 2271 nsk = tcp_check_req(sk, skb, req, false, &req_stolen); 2272 } else { 2273 drop_reason = SKB_DROP_REASON_SOCKET_FILTER; 2274 } 2275 if (!nsk) { 2276 reqsk_put(req); 2277 if (req_stolen) { 2278 /* Another cpu got exclusive access to req 2279 * and created a full blown socket. 
2280 * Try to feed this packet to this socket 2281 * instead of discarding it. 2282 */ 2283 tcp_v4_restore_cb(skb); 2284 sock_put(sk); 2285 goto lookup; 2286 } 2287 goto discard_and_relse; 2288 } 2289 nf_reset_ct(skb); 2290 if (nsk == sk) { 2291 reqsk_put(req); 2292 tcp_v4_restore_cb(skb); 2293 } else { 2294 drop_reason = tcp_child_process(sk, nsk, skb); 2295 if (drop_reason) { 2296 enum sk_rst_reason rst_reason; 2297 2298 rst_reason = sk_rst_convert_drop_reason(drop_reason); 2299 tcp_v4_send_reset(nsk, skb, rst_reason); 2300 goto discard_and_relse; 2301 } 2302 sock_put(sk); 2303 return 0; 2304 } 2305 } 2306 2307 process: 2308 if (static_branch_unlikely(&ip4_min_ttl)) { 2309 /* min_ttl can be changed concurrently from do_ip_setsockopt() */ 2310 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) { 2311 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 2312 drop_reason = SKB_DROP_REASON_TCP_MINTTL; 2313 goto discard_and_relse; 2314 } 2315 } 2316 2317 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { 2318 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2319 goto discard_and_relse; 2320 } 2321 2322 drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr, 2323 AF_INET, dif, sdif); 2324 if (drop_reason) 2325 goto discard_and_relse; 2326 2327 nf_reset_ct(skb); 2328 2329 if (tcp_filter(sk, skb)) { 2330 drop_reason = SKB_DROP_REASON_SOCKET_FILTER; 2331 goto discard_and_relse; 2332 } 2333 th = (const struct tcphdr *)skb->data; 2334 iph = ip_hdr(skb); 2335 tcp_v4_fill_cb(skb, iph, th); 2336 2337 skb->dev = NULL; 2338 2339 if (sk->sk_state == TCP_LISTEN) { 2340 ret = tcp_v4_do_rcv(sk, skb); 2341 goto put_and_return; 2342 } 2343 2344 sk_incoming_cpu_update(sk); 2345 2346 bh_lock_sock_nested(sk); 2347 tcp_segs_in(tcp_sk(sk), skb); 2348 ret = 0; 2349 if (!sock_owned_by_user(sk)) { 2350 ret = tcp_v4_do_rcv(sk, skb); 2351 } else { 2352 if (tcp_add_backlog(sk, skb, &drop_reason)) 2353 goto discard_and_relse; 2354 } 2355 bh_unlock_sock(sk); 2356 2357 put_and_return: 
2358 if (refcounted) 2359 sock_put(sk); 2360 2361 return ret; 2362 2363 no_tcp_socket: 2364 drop_reason = SKB_DROP_REASON_NO_SOCKET; 2365 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) 2366 goto discard_it; 2367 2368 tcp_v4_fill_cb(skb, iph, th); 2369 2370 if (tcp_checksum_complete(skb)) { 2371 csum_error: 2372 drop_reason = SKB_DROP_REASON_TCP_CSUM; 2373 trace_tcp_bad_csum(skb); 2374 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); 2375 bad_packet: 2376 __TCP_INC_STATS(net, TCP_MIB_INERRS); 2377 } else { 2378 tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason)); 2379 } 2380 2381 discard_it: 2382 SKB_DR_OR(drop_reason, NOT_SPECIFIED); 2383 /* Discard frame. */ 2384 sk_skb_reason_drop(sk, skb, drop_reason); 2385 return 0; 2386 2387 discard_and_relse: 2388 sk_drops_add(sk, skb); 2389 if (refcounted) 2390 sock_put(sk); 2391 goto discard_it; 2392 2393 do_time_wait: 2394 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { 2395 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2396 inet_twsk_put(inet_twsk(sk)); 2397 goto discard_it; 2398 } 2399 2400 tcp_v4_fill_cb(skb, iph, th); 2401 2402 if (tcp_checksum_complete(skb)) { 2403 inet_twsk_put(inet_twsk(sk)); 2404 goto csum_error; 2405 } 2406 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn)) { 2407 case TCP_TW_SYN: { 2408 struct sock *sk2 = inet_lookup_listener(net, 2409 net->ipv4.tcp_death_row.hashinfo, 2410 skb, __tcp_hdrlen(th), 2411 iph->saddr, th->source, 2412 iph->daddr, th->dest, 2413 inet_iif(skb), 2414 sdif); 2415 if (sk2) { 2416 inet_twsk_deschedule_put(inet_twsk(sk)); 2417 sk = sk2; 2418 tcp_v4_restore_cb(skb); 2419 refcounted = false; 2420 __this_cpu_write(tcp_tw_isn, isn); 2421 goto process; 2422 } 2423 } 2424 /* to ACK */ 2425 fallthrough; 2426 case TCP_TW_ACK: 2427 tcp_v4_timewait_ack(sk, skb); 2428 break; 2429 case TCP_TW_RST: 2430 tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET); 2431 inet_twsk_deschedule_put(inet_twsk(sk)); 2432 goto discard_it; 2433 case 
TCP_TW_SUCCESS:; 2434 } 2435 goto discard_it; 2436 } 2437 2438 static struct timewait_sock_ops tcp_timewait_sock_ops = { 2439 .twsk_obj_size = sizeof(struct tcp_timewait_sock), 2440 .twsk_destructor= tcp_twsk_destructor, 2441 }; 2442 2443 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) 2444 { 2445 struct dst_entry *dst = skb_dst(skb); 2446 2447 if (dst && dst_hold_safe(dst)) { 2448 rcu_assign_pointer(sk->sk_rx_dst, dst); 2449 sk->sk_rx_dst_ifindex = skb->skb_iif; 2450 } 2451 } 2452 EXPORT_SYMBOL(inet_sk_rx_dst_set); 2453 2454 const struct inet_connection_sock_af_ops ipv4_specific = { 2455 .queue_xmit = ip_queue_xmit, 2456 .send_check = tcp_v4_send_check, 2457 .rebuild_header = inet_sk_rebuild_header, 2458 .sk_rx_dst_set = inet_sk_rx_dst_set, 2459 .conn_request = tcp_v4_conn_request, 2460 .syn_recv_sock = tcp_v4_syn_recv_sock, 2461 .net_header_len = sizeof(struct iphdr), 2462 .setsockopt = ip_setsockopt, 2463 .getsockopt = ip_getsockopt, 2464 .addr2sockaddr = inet_csk_addr2sockaddr, 2465 .sockaddr_len = sizeof(struct sockaddr_in), 2466 .mtu_reduced = tcp_v4_mtu_reduced, 2467 }; 2468 EXPORT_SYMBOL(ipv4_specific); 2469 2470 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) 2471 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { 2472 #ifdef CONFIG_TCP_MD5SIG 2473 .md5_lookup = tcp_v4_md5_lookup, 2474 .calc_md5_hash = tcp_v4_md5_hash_skb, 2475 .md5_parse = tcp_v4_parse_md5_keys, 2476 #endif 2477 #ifdef CONFIG_TCP_AO 2478 .ao_lookup = tcp_v4_ao_lookup, 2479 .calc_ao_hash = tcp_v4_ao_hash_skb, 2480 .ao_parse = tcp_v4_parse_ao, 2481 .ao_calc_key_sk = tcp_v4_ao_calc_key_sk, 2482 #endif 2483 }; 2484 #endif 2485 2486 /* NOTE: A lot of things set to zero explicitly by call to 2487 * sk_alloc() so need not be done here. 
2488 */ 2489 static int tcp_v4_init_sock(struct sock *sk) 2490 { 2491 struct inet_connection_sock *icsk = inet_csk(sk); 2492 2493 tcp_init_sock(sk); 2494 2495 icsk->icsk_af_ops = &ipv4_specific; 2496 2497 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) 2498 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific; 2499 #endif 2500 2501 return 0; 2502 } 2503 2504 #ifdef CONFIG_TCP_MD5SIG 2505 static void tcp_md5sig_info_free_rcu(struct rcu_head *head) 2506 { 2507 struct tcp_md5sig_info *md5sig; 2508 2509 md5sig = container_of(head, struct tcp_md5sig_info, rcu); 2510 kfree(md5sig); 2511 static_branch_slow_dec_deferred(&tcp_md5_needed); 2512 tcp_md5_release_sigpool(); 2513 } 2514 #endif 2515 2516 static void tcp_release_user_frags(struct sock *sk) 2517 { 2518 #ifdef CONFIG_PAGE_POOL 2519 unsigned long index; 2520 void *netmem; 2521 2522 xa_for_each(&sk->sk_user_frags, index, netmem) 2523 WARN_ON_ONCE(!napi_pp_put_page((__force netmem_ref)netmem)); 2524 #endif 2525 } 2526 2527 void tcp_v4_destroy_sock(struct sock *sk) 2528 { 2529 struct tcp_sock *tp = tcp_sk(sk); 2530 2531 tcp_release_user_frags(sk); 2532 2533 xa_destroy(&sk->sk_user_frags); 2534 2535 trace_tcp_destroy_sock(sk); 2536 2537 tcp_clear_xmit_timers(sk); 2538 2539 tcp_cleanup_congestion_control(sk); 2540 2541 tcp_cleanup_ulp(sk); 2542 2543 /* Cleanup up the write buffer. */ 2544 tcp_write_queue_purge(sk); 2545 2546 /* Check if we want to disable active TFO */ 2547 tcp_fastopen_active_disable_ofo_check(sk); 2548 2549 /* Cleans up our, hopefully empty, out_of_order_queue. 
*/ 2550 skb_rbtree_purge(&tp->out_of_order_queue); 2551 2552 #ifdef CONFIG_TCP_MD5SIG 2553 /* Clean up the MD5 key list, if any */ 2554 if (tp->md5sig_info) { 2555 struct tcp_md5sig_info *md5sig; 2556 2557 md5sig = rcu_dereference_protected(tp->md5sig_info, 1); 2558 tcp_clear_md5_list(sk); 2559 call_rcu(&md5sig->rcu, tcp_md5sig_info_free_rcu); 2560 rcu_assign_pointer(tp->md5sig_info, NULL); 2561 } 2562 #endif 2563 tcp_ao_destroy_sock(sk, false); 2564 2565 /* Clean up a referenced TCP bind bucket. */ 2566 if (inet_csk(sk)->icsk_bind_hash) 2567 inet_put_port(sk); 2568 2569 BUG_ON(rcu_access_pointer(tp->fastopen_rsk)); 2570 2571 /* If socket is aborted during connect operation */ 2572 tcp_free_fastopen_req(tp); 2573 tcp_fastopen_destroy_cipher(sk); 2574 tcp_saved_syn_free(tp); 2575 2576 sk_sockets_allocated_dec(sk); 2577 } 2578 EXPORT_SYMBOL(tcp_v4_destroy_sock); 2579 2580 #ifdef CONFIG_PROC_FS 2581 /* Proc filesystem TCP sock list dumping. */ 2582 2583 static unsigned short seq_file_family(const struct seq_file *seq); 2584 2585 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk) 2586 { 2587 unsigned short family = seq_file_family(seq); 2588 2589 /* AF_UNSPEC is used as a match all */ 2590 return ((family == AF_UNSPEC || family == sk->sk_family) && 2591 net_eq(sock_net(sk), seq_file_net(seq))); 2592 } 2593 2594 /* Find a non empty bucket (starting from st->bucket) 2595 * and return the first sk from it. 
2596 */ 2597 static void *listening_get_first(struct seq_file *seq) 2598 { 2599 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2600 struct tcp_iter_state *st = seq->private; 2601 2602 st->offset = 0; 2603 for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) { 2604 struct inet_listen_hashbucket *ilb2; 2605 struct hlist_nulls_node *node; 2606 struct sock *sk; 2607 2608 ilb2 = &hinfo->lhash2[st->bucket]; 2609 if (hlist_nulls_empty(&ilb2->nulls_head)) 2610 continue; 2611 2612 spin_lock(&ilb2->lock); 2613 sk_nulls_for_each(sk, node, &ilb2->nulls_head) { 2614 if (seq_sk_match(seq, sk)) 2615 return sk; 2616 } 2617 spin_unlock(&ilb2->lock); 2618 } 2619 2620 return NULL; 2621 } 2622 2623 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket). 2624 * If "cur" is the last one in the st->bucket, 2625 * call listening_get_first() to return the first sk of the next 2626 * non empty bucket. 2627 */ 2628 static void *listening_get_next(struct seq_file *seq, void *cur) 2629 { 2630 struct tcp_iter_state *st = seq->private; 2631 struct inet_listen_hashbucket *ilb2; 2632 struct hlist_nulls_node *node; 2633 struct inet_hashinfo *hinfo; 2634 struct sock *sk = cur; 2635 2636 ++st->num; 2637 ++st->offset; 2638 2639 sk = sk_nulls_next(sk); 2640 sk_nulls_for_each_from(sk, node) { 2641 if (seq_sk_match(seq, sk)) 2642 return sk; 2643 } 2644 2645 hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2646 ilb2 = &hinfo->lhash2[st->bucket]; 2647 spin_unlock(&ilb2->lock); 2648 ++st->bucket; 2649 return listening_get_first(seq); 2650 } 2651 2652 static void *listening_get_idx(struct seq_file *seq, loff_t *pos) 2653 { 2654 struct tcp_iter_state *st = seq->private; 2655 void *rc; 2656 2657 st->bucket = 0; 2658 st->offset = 0; 2659 rc = listening_get_first(seq); 2660 2661 while (rc && *pos) { 2662 rc = listening_get_next(seq, rc); 2663 --*pos; 2664 } 2665 return rc; 2666 } 2667 2668 static inline bool empty_bucket(struct inet_hashinfo *hinfo, 2669 
const struct tcp_iter_state *st) 2670 { 2671 return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain); 2672 } 2673 2674 /* 2675 * Get first established socket starting from bucket given in st->bucket. 2676 * If st->bucket is zero, the very first socket in the hash is returned. 2677 */ 2678 static void *established_get_first(struct seq_file *seq) 2679 { 2680 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2681 struct tcp_iter_state *st = seq->private; 2682 2683 st->offset = 0; 2684 for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) { 2685 struct sock *sk; 2686 struct hlist_nulls_node *node; 2687 spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket); 2688 2689 cond_resched(); 2690 2691 /* Lockless fast path for the common case of empty buckets */ 2692 if (empty_bucket(hinfo, st)) 2693 continue; 2694 2695 spin_lock_bh(lock); 2696 sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) { 2697 if (seq_sk_match(seq, sk)) 2698 return sk; 2699 } 2700 spin_unlock_bh(lock); 2701 } 2702 2703 return NULL; 2704 } 2705 2706 static void *established_get_next(struct seq_file *seq, void *cur) 2707 { 2708 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2709 struct tcp_iter_state *st = seq->private; 2710 struct hlist_nulls_node *node; 2711 struct sock *sk = cur; 2712 2713 ++st->num; 2714 ++st->offset; 2715 2716 sk = sk_nulls_next(sk); 2717 2718 sk_nulls_for_each_from(sk, node) { 2719 if (seq_sk_match(seq, sk)) 2720 return sk; 2721 } 2722 2723 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 2724 ++st->bucket; 2725 return established_get_first(seq); 2726 } 2727 2728 static void *established_get_idx(struct seq_file *seq, loff_t pos) 2729 { 2730 struct tcp_iter_state *st = seq->private; 2731 void *rc; 2732 2733 st->bucket = 0; 2734 rc = established_get_first(seq); 2735 2736 while (rc && pos) { 2737 rc = established_get_next(seq, rc); 2738 --pos; 2739 } 2740 return rc; 2741 } 2742 2743 static void 
*tcp_get_idx(struct seq_file *seq, loff_t pos) 2744 { 2745 void *rc; 2746 struct tcp_iter_state *st = seq->private; 2747 2748 st->state = TCP_SEQ_STATE_LISTENING; 2749 rc = listening_get_idx(seq, &pos); 2750 2751 if (!rc) { 2752 st->state = TCP_SEQ_STATE_ESTABLISHED; 2753 rc = established_get_idx(seq, pos); 2754 } 2755 2756 return rc; 2757 } 2758 2759 static void *tcp_seek_last_pos(struct seq_file *seq) 2760 { 2761 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2762 struct tcp_iter_state *st = seq->private; 2763 int bucket = st->bucket; 2764 int offset = st->offset; 2765 int orig_num = st->num; 2766 void *rc = NULL; 2767 2768 switch (st->state) { 2769 case TCP_SEQ_STATE_LISTENING: 2770 if (st->bucket > hinfo->lhash2_mask) 2771 break; 2772 rc = listening_get_first(seq); 2773 while (offset-- && rc && bucket == st->bucket) 2774 rc = listening_get_next(seq, rc); 2775 if (rc) 2776 break; 2777 st->bucket = 0; 2778 st->state = TCP_SEQ_STATE_ESTABLISHED; 2779 fallthrough; 2780 case TCP_SEQ_STATE_ESTABLISHED: 2781 if (st->bucket > hinfo->ehash_mask) 2782 break; 2783 rc = established_get_first(seq); 2784 while (offset-- && rc && bucket == st->bucket) 2785 rc = established_get_next(seq, rc); 2786 } 2787 2788 st->num = orig_num; 2789 2790 return rc; 2791 } 2792 2793 void *tcp_seq_start(struct seq_file *seq, loff_t *pos) 2794 { 2795 struct tcp_iter_state *st = seq->private; 2796 void *rc; 2797 2798 if (*pos && *pos == st->last_pos) { 2799 rc = tcp_seek_last_pos(seq); 2800 if (rc) 2801 goto out; 2802 } 2803 2804 st->state = TCP_SEQ_STATE_LISTENING; 2805 st->num = 0; 2806 st->bucket = 0; 2807 st->offset = 0; 2808 rc = *pos ? 
tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; 2809 2810 out: 2811 st->last_pos = *pos; 2812 return rc; 2813 } 2814 EXPORT_SYMBOL(tcp_seq_start); 2815 2816 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2817 { 2818 struct tcp_iter_state *st = seq->private; 2819 void *rc = NULL; 2820 2821 if (v == SEQ_START_TOKEN) { 2822 rc = tcp_get_idx(seq, 0); 2823 goto out; 2824 } 2825 2826 switch (st->state) { 2827 case TCP_SEQ_STATE_LISTENING: 2828 rc = listening_get_next(seq, v); 2829 if (!rc) { 2830 st->state = TCP_SEQ_STATE_ESTABLISHED; 2831 st->bucket = 0; 2832 st->offset = 0; 2833 rc = established_get_first(seq); 2834 } 2835 break; 2836 case TCP_SEQ_STATE_ESTABLISHED: 2837 rc = established_get_next(seq, v); 2838 break; 2839 } 2840 out: 2841 ++*pos; 2842 st->last_pos = *pos; 2843 return rc; 2844 } 2845 EXPORT_SYMBOL(tcp_seq_next); 2846 2847 void tcp_seq_stop(struct seq_file *seq, void *v) 2848 { 2849 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2850 struct tcp_iter_state *st = seq->private; 2851 2852 switch (st->state) { 2853 case TCP_SEQ_STATE_LISTENING: 2854 if (v != SEQ_START_TOKEN) 2855 spin_unlock(&hinfo->lhash2[st->bucket].lock); 2856 break; 2857 case TCP_SEQ_STATE_ESTABLISHED: 2858 if (v) 2859 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 2860 break; 2861 } 2862 } 2863 EXPORT_SYMBOL(tcp_seq_stop); 2864 2865 static void get_openreq4(const struct request_sock *req, 2866 struct seq_file *f, int i) 2867 { 2868 const struct inet_request_sock *ireq = inet_rsk(req); 2869 long delta = req->rsk_timer.expires - jiffies; 2870 2871 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2872 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK", 2873 i, 2874 ireq->ir_loc_addr, 2875 ireq->ir_num, 2876 ireq->ir_rmt_addr, 2877 ntohs(ireq->ir_rmt_port), 2878 TCP_SYN_RECV, 2879 0, 0, /* could print option size, but that is af dependent. 
*/ 2880 1, /* timers active (only the expire timer) */ 2881 jiffies_delta_to_clock_t(delta), 2882 req->num_timeout, 2883 from_kuid_munged(seq_user_ns(f), 2884 sock_i_uid(req->rsk_listener)), 2885 0, /* non standard timer */ 2886 0, /* open_requests have no inode */ 2887 0, 2888 req); 2889 } 2890 2891 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) 2892 { 2893 int timer_active; 2894 unsigned long timer_expires; 2895 const struct tcp_sock *tp = tcp_sk(sk); 2896 const struct inet_connection_sock *icsk = inet_csk(sk); 2897 const struct inet_sock *inet = inet_sk(sk); 2898 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; 2899 __be32 dest = inet->inet_daddr; 2900 __be32 src = inet->inet_rcv_saddr; 2901 __u16 destp = ntohs(inet->inet_dport); 2902 __u16 srcp = ntohs(inet->inet_sport); 2903 int rx_queue; 2904 int state; 2905 2906 if (icsk->icsk_pending == ICSK_TIME_RETRANS || 2907 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || 2908 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { 2909 timer_active = 1; 2910 timer_expires = icsk->icsk_timeout; 2911 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { 2912 timer_active = 4; 2913 timer_expires = icsk->icsk_timeout; 2914 } else if (timer_pending(&sk->sk_timer)) { 2915 timer_active = 2; 2916 timer_expires = sk->sk_timer.expires; 2917 } else { 2918 timer_active = 0; 2919 timer_expires = jiffies; 2920 } 2921 2922 state = inet_sk_state_load(sk); 2923 if (state == TCP_LISTEN) 2924 rx_queue = READ_ONCE(sk->sk_ack_backlog); 2925 else 2926 /* Because we don't lock the socket, 2927 * we might find a transient negative value. 
2928 */ 2929 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) - 2930 READ_ONCE(tp->copied_seq), 0); 2931 2932 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " 2933 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d", 2934 i, src, srcp, dest, destp, state, 2935 READ_ONCE(tp->write_seq) - tp->snd_una, 2936 rx_queue, 2937 timer_active, 2938 jiffies_delta_to_clock_t(timer_expires - jiffies), 2939 icsk->icsk_retransmits, 2940 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)), 2941 icsk->icsk_probes_out, 2942 sock_i_ino(sk), 2943 refcount_read(&sk->sk_refcnt), sk, 2944 jiffies_to_clock_t(icsk->icsk_rto), 2945 jiffies_to_clock_t(icsk->icsk_ack.ato), 2946 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk), 2947 tcp_snd_cwnd(tp), 2948 state == TCP_LISTEN ? 2949 fastopenq->max_qlen : 2950 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh)); 2951 } 2952 2953 static void get_timewait4_sock(const struct inet_timewait_sock *tw, 2954 struct seq_file *f, int i) 2955 { 2956 long delta = tw->tw_timer.expires - jiffies; 2957 __be32 dest, src; 2958 __u16 destp, srcp; 2959 2960 dest = tw->tw_daddr; 2961 src = tw->tw_rcv_saddr; 2962 destp = ntohs(tw->tw_dport); 2963 srcp = ntohs(tw->tw_sport); 2964 2965 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2966 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK", 2967 i, src, srcp, dest, destp, READ_ONCE(tw->tw_substate), 0, 0, 2968 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, 2969 refcount_read(&tw->tw_refcnt), tw); 2970 } 2971 2972 #define TMPSZ 150 2973 2974 static int tcp4_seq_show(struct seq_file *seq, void *v) 2975 { 2976 struct tcp_iter_state *st; 2977 struct sock *sk = v; 2978 2979 seq_setwidth(seq, TMPSZ - 1); 2980 if (v == SEQ_START_TOKEN) { 2981 seq_puts(seq, " sl local_address rem_address st tx_queue " 2982 "rx_queue tr tm->when retrnsmt uid timeout " 2983 "inode"); 2984 goto out; 2985 } 2986 st = seq->private; 2987 2988 if (sk->sk_state == TCP_TIME_WAIT) 2989 get_timewait4_sock(v, seq, st->num); 2990 else if 
(sk->sk_state == TCP_NEW_SYN_RECV) 2991 get_openreq4(v, seq, st->num); 2992 else 2993 get_tcp4_sock(v, seq, st->num); 2994 out: 2995 seq_pad(seq, '\n'); 2996 return 0; 2997 } 2998 2999 #ifdef CONFIG_BPF_SYSCALL 3000 struct bpf_tcp_iter_state { 3001 struct tcp_iter_state state; 3002 unsigned int cur_sk; 3003 unsigned int end_sk; 3004 unsigned int max_sk; 3005 struct sock **batch; 3006 bool st_bucket_done; 3007 }; 3008 3009 struct bpf_iter__tcp { 3010 __bpf_md_ptr(struct bpf_iter_meta *, meta); 3011 __bpf_md_ptr(struct sock_common *, sk_common); 3012 uid_t uid __aligned(8); 3013 }; 3014 3015 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, 3016 struct sock_common *sk_common, uid_t uid) 3017 { 3018 struct bpf_iter__tcp ctx; 3019 3020 meta->seq_num--; /* skip SEQ_START_TOKEN */ 3021 ctx.meta = meta; 3022 ctx.sk_common = sk_common; 3023 ctx.uid = uid; 3024 return bpf_iter_run_prog(prog, &ctx); 3025 } 3026 3027 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter) 3028 { 3029 while (iter->cur_sk < iter->end_sk) 3030 sock_gen_put(iter->batch[iter->cur_sk++]); 3031 } 3032 3033 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter, 3034 unsigned int new_batch_sz) 3035 { 3036 struct sock **new_batch; 3037 3038 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, 3039 GFP_USER | __GFP_NOWARN); 3040 if (!new_batch) 3041 return -ENOMEM; 3042 3043 bpf_iter_tcp_put_batch(iter); 3044 kvfree(iter->batch); 3045 iter->batch = new_batch; 3046 iter->max_sk = new_batch_sz; 3047 3048 return 0; 3049 } 3050 3051 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq, 3052 struct sock *start_sk) 3053 { 3054 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 3055 struct bpf_tcp_iter_state *iter = seq->private; 3056 struct tcp_iter_state *st = &iter->state; 3057 struct hlist_nulls_node *node; 3058 unsigned int expected = 1; 3059 struct sock *sk; 3060 3061 sock_hold(start_sk); 3062 
iter->batch[iter->end_sk++] = start_sk; 3063 3064 sk = sk_nulls_next(start_sk); 3065 sk_nulls_for_each_from(sk, node) { 3066 if (seq_sk_match(seq, sk)) { 3067 if (iter->end_sk < iter->max_sk) { 3068 sock_hold(sk); 3069 iter->batch[iter->end_sk++] = sk; 3070 } 3071 expected++; 3072 } 3073 } 3074 spin_unlock(&hinfo->lhash2[st->bucket].lock); 3075 3076 return expected; 3077 } 3078 3079 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq, 3080 struct sock *start_sk) 3081 { 3082 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 3083 struct bpf_tcp_iter_state *iter = seq->private; 3084 struct tcp_iter_state *st = &iter->state; 3085 struct hlist_nulls_node *node; 3086 unsigned int expected = 1; 3087 struct sock *sk; 3088 3089 sock_hold(start_sk); 3090 iter->batch[iter->end_sk++] = start_sk; 3091 3092 sk = sk_nulls_next(start_sk); 3093 sk_nulls_for_each_from(sk, node) { 3094 if (seq_sk_match(seq, sk)) { 3095 if (iter->end_sk < iter->max_sk) { 3096 sock_hold(sk); 3097 iter->batch[iter->end_sk++] = sk; 3098 } 3099 expected++; 3100 } 3101 } 3102 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 3103 3104 return expected; 3105 } 3106 3107 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq) 3108 { 3109 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 3110 struct bpf_tcp_iter_state *iter = seq->private; 3111 struct tcp_iter_state *st = &iter->state; 3112 unsigned int expected; 3113 bool resized = false; 3114 struct sock *sk; 3115 3116 /* The st->bucket is done. Directly advance to the next 3117 * bucket instead of having the tcp_seek_last_pos() to skip 3118 * one by one in the current bucket and eventually find out 3119 * it has to advance to the next bucket. 
3120 */ 3121 if (iter->st_bucket_done) { 3122 st->offset = 0; 3123 st->bucket++; 3124 if (st->state == TCP_SEQ_STATE_LISTENING && 3125 st->bucket > hinfo->lhash2_mask) { 3126 st->state = TCP_SEQ_STATE_ESTABLISHED; 3127 st->bucket = 0; 3128 } 3129 } 3130 3131 again: 3132 /* Get a new batch */ 3133 iter->cur_sk = 0; 3134 iter->end_sk = 0; 3135 iter->st_bucket_done = false; 3136 3137 sk = tcp_seek_last_pos(seq); 3138 if (!sk) 3139 return NULL; /* Done */ 3140 3141 if (st->state == TCP_SEQ_STATE_LISTENING) 3142 expected = bpf_iter_tcp_listening_batch(seq, sk); 3143 else 3144 expected = bpf_iter_tcp_established_batch(seq, sk); 3145 3146 if (iter->end_sk == expected) { 3147 iter->st_bucket_done = true; 3148 return sk; 3149 } 3150 3151 if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) { 3152 resized = true; 3153 goto again; 3154 } 3155 3156 return sk; 3157 } 3158 3159 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos) 3160 { 3161 /* bpf iter does not support lseek, so it always 3162 * continue from where it was stop()-ped. 3163 */ 3164 if (*pos) 3165 return bpf_iter_tcp_batch(seq); 3166 3167 return SEQ_START_TOKEN; 3168 } 3169 3170 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3171 { 3172 struct bpf_tcp_iter_state *iter = seq->private; 3173 struct tcp_iter_state *st = &iter->state; 3174 struct sock *sk; 3175 3176 /* Whenever seq_next() is called, the iter->cur_sk is 3177 * done with seq_show(), so advance to the next sk in 3178 * the batch. 3179 */ 3180 if (iter->cur_sk < iter->end_sk) { 3181 /* Keeping st->num consistent in tcp_iter_state. 3182 * bpf_iter_tcp does not use st->num. 3183 * meta.seq_num is used instead. 3184 */ 3185 st->num++; 3186 /* Move st->offset to the next sk in the bucket such that 3187 * the future start() will resume at st->offset in 3188 * st->bucket. See tcp_seek_last_pos(). 
3189 */ 3190 st->offset++; 3191 sock_gen_put(iter->batch[iter->cur_sk++]); 3192 } 3193 3194 if (iter->cur_sk < iter->end_sk) 3195 sk = iter->batch[iter->cur_sk]; 3196 else 3197 sk = bpf_iter_tcp_batch(seq); 3198 3199 ++*pos; 3200 /* Keeping st->last_pos consistent in tcp_iter_state. 3201 * bpf iter does not do lseek, so st->last_pos always equals to *pos. 3202 */ 3203 st->last_pos = *pos; 3204 return sk; 3205 } 3206 3207 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v) 3208 { 3209 struct bpf_iter_meta meta; 3210 struct bpf_prog *prog; 3211 struct sock *sk = v; 3212 uid_t uid; 3213 int ret; 3214 3215 if (v == SEQ_START_TOKEN) 3216 return 0; 3217 3218 if (sk_fullsock(sk)) 3219 lock_sock(sk); 3220 3221 if (unlikely(sk_unhashed(sk))) { 3222 ret = SEQ_SKIP; 3223 goto unlock; 3224 } 3225 3226 if (sk->sk_state == TCP_TIME_WAIT) { 3227 uid = 0; 3228 } else if (sk->sk_state == TCP_NEW_SYN_RECV) { 3229 const struct request_sock *req = v; 3230 3231 uid = from_kuid_munged(seq_user_ns(seq), 3232 sock_i_uid(req->rsk_listener)); 3233 } else { 3234 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); 3235 } 3236 3237 meta.seq = seq; 3238 prog = bpf_iter_get_info(&meta, false); 3239 ret = tcp_prog_seq_show(prog, &meta, v, uid); 3240 3241 unlock: 3242 if (sk_fullsock(sk)) 3243 release_sock(sk); 3244 return ret; 3245 3246 } 3247 3248 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v) 3249 { 3250 struct bpf_tcp_iter_state *iter = seq->private; 3251 struct bpf_iter_meta meta; 3252 struct bpf_prog *prog; 3253 3254 if (!v) { 3255 meta.seq = seq; 3256 prog = bpf_iter_get_info(&meta, true); 3257 if (prog) 3258 (void)tcp_prog_seq_show(prog, &meta, v, 0); 3259 } 3260 3261 if (iter->cur_sk < iter->end_sk) { 3262 bpf_iter_tcp_put_batch(iter); 3263 iter->st_bucket_done = false; 3264 } 3265 } 3266 3267 static const struct seq_operations bpf_iter_tcp_seq_ops = { 3268 .show = bpf_iter_tcp_seq_show, 3269 .start = bpf_iter_tcp_seq_start, 3270 .next = 
bpf_iter_tcp_seq_next, 3271 .stop = bpf_iter_tcp_seq_stop, 3272 }; 3273 #endif 3274 static unsigned short seq_file_family(const struct seq_file *seq) 3275 { 3276 const struct tcp_seq_afinfo *afinfo; 3277 3278 #ifdef CONFIG_BPF_SYSCALL 3279 /* Iterated from bpf_iter. Let the bpf prog to filter instead. */ 3280 if (seq->op == &bpf_iter_tcp_seq_ops) 3281 return AF_UNSPEC; 3282 #endif 3283 3284 /* Iterated from proc fs */ 3285 afinfo = pde_data(file_inode(seq->file)); 3286 return afinfo->family; 3287 } 3288 3289 static const struct seq_operations tcp4_seq_ops = { 3290 .show = tcp4_seq_show, 3291 .start = tcp_seq_start, 3292 .next = tcp_seq_next, 3293 .stop = tcp_seq_stop, 3294 }; 3295 3296 static struct tcp_seq_afinfo tcp4_seq_afinfo = { 3297 .family = AF_INET, 3298 }; 3299 3300 static int __net_init tcp4_proc_init_net(struct net *net) 3301 { 3302 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops, 3303 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo)) 3304 return -ENOMEM; 3305 return 0; 3306 } 3307 3308 static void __net_exit tcp4_proc_exit_net(struct net *net) 3309 { 3310 remove_proc_entry("tcp", net->proc_net); 3311 } 3312 3313 static struct pernet_operations tcp4_net_ops = { 3314 .init = tcp4_proc_init_net, 3315 .exit = tcp4_proc_exit_net, 3316 }; 3317 3318 int __init tcp4_proc_init(void) 3319 { 3320 return register_pernet_subsys(&tcp4_net_ops); 3321 } 3322 3323 void tcp4_proc_exit(void) 3324 { 3325 unregister_pernet_subsys(&tcp4_net_ops); 3326 } 3327 #endif /* CONFIG_PROC_FS */ 3328 3329 /* @wake is one when sk_stream_write_space() calls us. 3330 * This sends EPOLLOUT only if notsent_bytes is half the limit. 3331 * This mimics the strategy used in sock_def_write_space(). 
3332 */ 3333 bool tcp_stream_memory_free(const struct sock *sk, int wake) 3334 { 3335 const struct tcp_sock *tp = tcp_sk(sk); 3336 u32 notsent_bytes = READ_ONCE(tp->write_seq) - 3337 READ_ONCE(tp->snd_nxt); 3338 3339 return (notsent_bytes << wake) < tcp_notsent_lowat(tp); 3340 } 3341 EXPORT_SYMBOL(tcp_stream_memory_free); 3342 3343 struct proto tcp_prot = { 3344 .name = "TCP", 3345 .owner = THIS_MODULE, 3346 .close = tcp_close, 3347 .pre_connect = tcp_v4_pre_connect, 3348 .connect = tcp_v4_connect, 3349 .disconnect = tcp_disconnect, 3350 .accept = inet_csk_accept, 3351 .ioctl = tcp_ioctl, 3352 .init = tcp_v4_init_sock, 3353 .destroy = tcp_v4_destroy_sock, 3354 .shutdown = tcp_shutdown, 3355 .setsockopt = tcp_setsockopt, 3356 .getsockopt = tcp_getsockopt, 3357 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt, 3358 .keepalive = tcp_set_keepalive, 3359 .recvmsg = tcp_recvmsg, 3360 .sendmsg = tcp_sendmsg, 3361 .splice_eof = tcp_splice_eof, 3362 .backlog_rcv = tcp_v4_do_rcv, 3363 .release_cb = tcp_release_cb, 3364 .hash = inet_hash, 3365 .unhash = inet_unhash, 3366 .get_port = inet_csk_get_port, 3367 .put_port = inet_put_port, 3368 #ifdef CONFIG_BPF_SYSCALL 3369 .psock_update_sk_prot = tcp_bpf_update_proto, 3370 #endif 3371 .enter_memory_pressure = tcp_enter_memory_pressure, 3372 .leave_memory_pressure = tcp_leave_memory_pressure, 3373 .stream_memory_free = tcp_stream_memory_free, 3374 .sockets_allocated = &tcp_sockets_allocated, 3375 .orphan_count = &tcp_orphan_count, 3376 3377 .memory_allocated = &tcp_memory_allocated, 3378 .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, 3379 3380 .memory_pressure = &tcp_memory_pressure, 3381 .sysctl_mem = sysctl_tcp_mem, 3382 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), 3383 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), 3384 .max_header = MAX_TCP_HEADER, 3385 .obj_size = sizeof(struct tcp_sock), 3386 .slab_flags = SLAB_TYPESAFE_BY_RCU, 3387 .twsk_prot = &tcp_timewait_sock_ops, 3388 
.rsk_prot = &tcp_request_sock_ops, 3389 .h.hashinfo = NULL, 3390 .no_autobind = true, 3391 .diag_destroy = tcp_abort, 3392 }; 3393 EXPORT_SYMBOL(tcp_prot); 3394 3395 static void __net_exit tcp_sk_exit(struct net *net) 3396 { 3397 if (net->ipv4.tcp_congestion_control) 3398 bpf_module_put(net->ipv4.tcp_congestion_control, 3399 net->ipv4.tcp_congestion_control->owner); 3400 } 3401 3402 static void __net_init tcp_set_hashinfo(struct net *net) 3403 { 3404 struct inet_hashinfo *hinfo; 3405 unsigned int ehash_entries; 3406 struct net *old_net; 3407 3408 if (net_eq(net, &init_net)) 3409 goto fallback; 3410 3411 old_net = current->nsproxy->net_ns; 3412 ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries); 3413 if (!ehash_entries) 3414 goto fallback; 3415 3416 ehash_entries = roundup_pow_of_two(ehash_entries); 3417 hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries); 3418 if (!hinfo) { 3419 pr_warn("Failed to allocate TCP ehash (entries: %u) " 3420 "for a netns, fallback to the global one\n", 3421 ehash_entries); 3422 fallback: 3423 hinfo = &tcp_hashinfo; 3424 ehash_entries = tcp_hashinfo.ehash_mask + 1; 3425 } 3426 3427 net->ipv4.tcp_death_row.hashinfo = hinfo; 3428 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2; 3429 net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128); 3430 } 3431 3432 static int __net_init tcp_sk_init(struct net *net) 3433 { 3434 net->ipv4.sysctl_tcp_ecn = 2; 3435 net->ipv4.sysctl_tcp_ecn_fallback = 1; 3436 3437 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; 3438 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS; 3439 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; 3440 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL; 3441 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS; 3442 3443 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME; 3444 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES; 3445 net->ipv4.sysctl_tcp_keepalive_intvl = 
TCP_KEEPALIVE_INTVL; 3446 3447 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES; 3448 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES; 3449 net->ipv4.sysctl_tcp_syncookies = 1; 3450 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH; 3451 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1; 3452 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2; 3453 net->ipv4.sysctl_tcp_orphan_retries = 0; 3454 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; 3455 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; 3456 net->ipv4.sysctl_tcp_tw_reuse = 2; 3457 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1; 3458 3459 refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1); 3460 tcp_set_hashinfo(net); 3461 3462 net->ipv4.sysctl_tcp_sack = 1; 3463 net->ipv4.sysctl_tcp_window_scaling = 1; 3464 net->ipv4.sysctl_tcp_timestamps = 1; 3465 net->ipv4.sysctl_tcp_early_retrans = 3; 3466 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION; 3467 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */ 3468 net->ipv4.sysctl_tcp_retrans_collapse = 1; 3469 net->ipv4.sysctl_tcp_max_reordering = 300; 3470 net->ipv4.sysctl_tcp_dsack = 1; 3471 net->ipv4.sysctl_tcp_app_win = 31; 3472 net->ipv4.sysctl_tcp_adv_win_scale = 1; 3473 net->ipv4.sysctl_tcp_frto = 2; 3474 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1; 3475 /* This limits the percentage of the congestion window which we 3476 * will allow a single TSO frame to consume. Building TSO frames 3477 * which are too large can cause TCP streams to be bursty. 3478 */ 3479 net->ipv4.sysctl_tcp_tso_win_divisor = 3; 3480 /* Default TSQ limit of 16 TSO segments */ 3481 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536; 3482 3483 /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. 
*/ 3484 net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX; 3485 3486 net->ipv4.sysctl_tcp_min_tso_segs = 2; 3487 net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */ 3488 net->ipv4.sysctl_tcp_min_rtt_wlen = 300; 3489 net->ipv4.sysctl_tcp_autocorking = 1; 3490 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; 3491 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; 3492 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120; 3493 if (net != &init_net) { 3494 memcpy(net->ipv4.sysctl_tcp_rmem, 3495 init_net.ipv4.sysctl_tcp_rmem, 3496 sizeof(init_net.ipv4.sysctl_tcp_rmem)); 3497 memcpy(net->ipv4.sysctl_tcp_wmem, 3498 init_net.ipv4.sysctl_tcp_wmem, 3499 sizeof(init_net.ipv4.sysctl_tcp_wmem)); 3500 } 3501 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC; 3502 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC; 3503 net->ipv4.sysctl_tcp_comp_sack_nr = 44; 3504 net->ipv4.sysctl_tcp_backlog_ack_defer = 1; 3505 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; 3506 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0; 3507 atomic_set(&net->ipv4.tfo_active_disable_times, 0); 3508 3509 /* Set default values for PLB */ 3510 net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */ 3511 net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3; 3512 net->ipv4.sysctl_tcp_plb_rehash_rounds = 12; 3513 net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60; 3514 /* Default congestion threshold for PLB to mark a round is 50% */ 3515 net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2; 3516 3517 /* Reno is always built in */ 3518 if (!net_eq(net, &init_net) && 3519 bpf_try_module_get(init_net.ipv4.tcp_congestion_control, 3520 init_net.ipv4.tcp_congestion_control->owner)) 3521 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control; 3522 else 3523 net->ipv4.tcp_congestion_control = &tcp_reno; 3524 3525 net->ipv4.sysctl_tcp_syn_linear_timeouts = 4; 3526 net->ipv4.sysctl_tcp_shrink_window = 0; 3527 3528 net->ipv4.sysctl_tcp_pingpong_thresh = 1; 3529 
net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN); 3530 3531 return 0; 3532 } 3533 3534 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) 3535 { 3536 struct net *net; 3537 3538 /* make sure concurrent calls to tcp_sk_exit_batch from net_cleanup_work 3539 * and failed setup_net error unwinding path are serialized. 3540 * 3541 * tcp_twsk_purge() handles twsk in any dead netns, not just those in 3542 * net_exit_list, the thread that dismantles a particular twsk must 3543 * do so without other thread progressing to refcount_dec_and_test() of 3544 * tcp_death_row.tw_refcount. 3545 */ 3546 mutex_lock(&tcp_exit_batch_mutex); 3547 3548 tcp_twsk_purge(net_exit_list); 3549 3550 list_for_each_entry(net, net_exit_list, exit_list) { 3551 inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo); 3552 WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount)); 3553 tcp_fastopen_ctx_destroy(net); 3554 } 3555 3556 mutex_unlock(&tcp_exit_batch_mutex); 3557 } 3558 3559 static struct pernet_operations __net_initdata tcp_sk_ops = { 3560 .init = tcp_sk_init, 3561 .exit = tcp_sk_exit, 3562 .exit_batch = tcp_sk_exit_batch, 3563 }; 3564 3565 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3566 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta, 3567 struct sock_common *sk_common, uid_t uid) 3568 3569 #define INIT_BATCH_SZ 16 3570 3571 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux) 3572 { 3573 struct bpf_tcp_iter_state *iter = priv_data; 3574 int err; 3575 3576 err = bpf_iter_init_seq_net(priv_data, aux); 3577 if (err) 3578 return err; 3579 3580 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ); 3581 if (err) { 3582 bpf_iter_fini_seq_net(priv_data); 3583 return err; 3584 } 3585 3586 return 0; 3587 } 3588 3589 static void bpf_iter_fini_tcp(void *priv_data) 3590 { 3591 struct bpf_tcp_iter_state *iter = priv_data; 3592 3593 bpf_iter_fini_seq_net(priv_data); 3594 kvfree(iter->batch); 3595 } 
3596 3597 static const struct bpf_iter_seq_info tcp_seq_info = { 3598 .seq_ops = &bpf_iter_tcp_seq_ops, 3599 .init_seq_private = bpf_iter_init_tcp, 3600 .fini_seq_private = bpf_iter_fini_tcp, 3601 .seq_priv_size = sizeof(struct bpf_tcp_iter_state), 3602 }; 3603 3604 static const struct bpf_func_proto * 3605 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id, 3606 const struct bpf_prog *prog) 3607 { 3608 switch (func_id) { 3609 case BPF_FUNC_setsockopt: 3610 return &bpf_sk_setsockopt_proto; 3611 case BPF_FUNC_getsockopt: 3612 return &bpf_sk_getsockopt_proto; 3613 default: 3614 return NULL; 3615 } 3616 } 3617 3618 static struct bpf_iter_reg tcp_reg_info = { 3619 .target = "tcp", 3620 .ctx_arg_info_size = 1, 3621 .ctx_arg_info = { 3622 { offsetof(struct bpf_iter__tcp, sk_common), 3623 PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED }, 3624 }, 3625 .get_func_proto = bpf_iter_tcp_get_func_proto, 3626 .seq_info = &tcp_seq_info, 3627 }; 3628 3629 static void __init bpf_iter_register(void) 3630 { 3631 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON]; 3632 if (bpf_iter_reg_target(&tcp_reg_info)) 3633 pr_warn("Warning: could not register bpf iterator tcp\n"); 3634 } 3635 3636 #endif 3637 3638 void __init tcp_v4_init(void) 3639 { 3640 int cpu, res; 3641 3642 for_each_possible_cpu(cpu) { 3643 struct sock *sk; 3644 3645 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW, 3646 IPPROTO_TCP, &init_net); 3647 if (res) 3648 panic("Failed to create the TCP control socket.\n"); 3649 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); 3650 3651 /* Please enforce IP_DF and IPID==0 for RST and 3652 * ACK sent in SYN-RECV and TIME-WAIT state. 
3653 */ 3654 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO; 3655 3656 sk->sk_clockid = CLOCK_MONOTONIC; 3657 3658 per_cpu(ipv4_tcp_sk.sock, cpu) = sk; 3659 } 3660 if (register_pernet_subsys(&tcp_sk_ops)) 3661 panic("Failed to create the TCP control socket.\n"); 3662 3663 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3664 bpf_iter_register(); 3665 #endif 3666 } 3667