1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * INET An implementation of the TCP/IP protocol suite for the LINUX 4 * operating system. INET is implemented using the BSD Socket 5 * interface as the means of communication with the user level. 6 * 7 * Implementation of the Transmission Control Protocol(TCP). 8 * 9 * IPv4 specific functions 10 * 11 * code split from: 12 * linux/ipv4/tcp.c 13 * linux/ipv4/tcp_input.c 14 * linux/ipv4/tcp_output.c 15 * 16 * See tcp.c for author information 17 */ 18 19 /* 20 * Changes: 21 * David S. Miller : New socket lookup architecture. 22 * This code is dedicated to John Dyson. 23 * David S. Miller : Change semantics of established hash, 24 * half is devoted to TIME_WAIT sockets 25 * and the rest go in the other half. 26 * Andi Kleen : Add support for syncookies and fixed 27 * some bugs: ip options weren't passed to 28 * the TCP layer, missed a check for an 29 * ACK bit. 30 * Andi Kleen : Implemented fast path mtu discovery. 31 * Fixed many serious bugs in the 32 * request_sock handling and moved 33 * most of it into the af independent code. 34 * Added tail drop and some other bugfixes. 35 * Added new listen semantics. 36 * Mike McLagan : Routing by source 37 * Juan Jose Ciarlante: ip_dynaddr bits 38 * Andi Kleen: various fixes. 39 * Vitaly E. Lavrov : Transparent proxy revived after year 40 * coma. 41 * Andi Kleen : Fix new listen. 42 * Andi Kleen : Fix accept error reporting. 43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which 44 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind 45 * a single port at the same time. 46 */ 47 48 #define pr_fmt(fmt) "TCP: " fmt 49 50 #include <linux/bottom_half.h> 51 #include <linux/types.h> 52 #include <linux/fcntl.h> 53 #include <linux/module.h> 54 #include <linux/random.h> 55 #include <linux/cache.h> 56 #include <linux/jhash.h> 57 #include <linux/init.h> 58 #include <linux/times.h> 59 #include <linux/slab.h> 60 #include <linux/sched.h> 61 62 #include <net/net_namespace.h> 63 #include <net/icmp.h> 64 #include <net/inet_hashtables.h> 65 #include <net/tcp.h> 66 #include <net/transp_v6.h> 67 #include <net/ipv6.h> 68 #include <net/inet_common.h> 69 #include <net/timewait_sock.h> 70 #include <net/xfrm.h> 71 #include <net/secure_seq.h> 72 #include <net/busy_poll.h> 73 74 #include <linux/inet.h> 75 #include <linux/ipv6.h> 76 #include <linux/stddef.h> 77 #include <linux/proc_fs.h> 78 #include <linux/seq_file.h> 79 #include <linux/inetdevice.h> 80 #include <linux/btf_ids.h> 81 82 #include <crypto/hash.h> 83 #include <linux/scatterlist.h> 84 85 #include <trace/events/tcp.h> 86 87 #ifdef CONFIG_TCP_MD5SIG 88 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, 89 __be32 daddr, __be32 saddr, const struct tcphdr *th); 90 #endif 91 92 struct inet_hashinfo tcp_hashinfo; 93 EXPORT_SYMBOL(tcp_hashinfo); 94 95 static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk); 96 97 static u32 tcp_v4_init_seq(const struct sk_buff *skb) 98 { 99 return secure_tcp_seq(ip_hdr(skb)->daddr, 100 ip_hdr(skb)->saddr, 101 tcp_hdr(skb)->dest, 102 tcp_hdr(skb)->source); 103 } 104 105 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb) 106 { 107 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr); 108 } 109 110 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) 111 { 112 int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse); 113 const struct inet_timewait_sock *tw = inet_twsk(sktw); 114 const struct tcp_timewait_sock *tcptw = 
tcp_twsk(sktw); 115 struct tcp_sock *tp = tcp_sk(sk); 116 117 if (reuse == 2) { 118 /* Still does not detect *everything* that goes through 119 * lo, since we require a loopback src or dst address 120 * or direct binding to 'lo' interface. 121 */ 122 bool loopback = false; 123 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX) 124 loopback = true; 125 #if IS_ENABLED(CONFIG_IPV6) 126 if (tw->tw_family == AF_INET6) { 127 if (ipv6_addr_loopback(&tw->tw_v6_daddr) || 128 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) || 129 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) || 130 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr)) 131 loopback = true; 132 } else 133 #endif 134 { 135 if (ipv4_is_loopback(tw->tw_daddr) || 136 ipv4_is_loopback(tw->tw_rcv_saddr)) 137 loopback = true; 138 } 139 if (!loopback) 140 reuse = 0; 141 } 142 143 /* With PAWS, it is safe from the viewpoint 144 of data integrity. Even without PAWS it is safe provided sequence 145 spaces do not overlap i.e. at data rates <= 80Mbit/sec. 146 147 Actually, the idea is close to VJ's one, only timestamp cache is 148 held not per host, but per port pair and TW bucket is used as state 149 holder. 150 151 If TW bucket has been already destroyed we fall back to VJ's scheme 152 and use initial timestamp retrieved from peer table. 153 */ 154 if (tcptw->tw_ts_recent_stamp && 155 (!twp || (reuse && time_after32(ktime_get_seconds(), 156 tcptw->tw_ts_recent_stamp)))) { 157 /* inet_twsk_hashdance() sets sk_refcnt after putting twsk 158 * and releasing the bucket lock. 159 */ 160 if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt))) 161 return 0; 162 163 /* In case of repair and re-using TIME-WAIT sockets we still 164 * want to be sure that it is safe as above but honor the 165 * sequence numbers and time stamps set as part of the repair 166 * process. 167 * 168 * Without this check re-using a TIME-WAIT socket with TCP 169 * repair would accumulate a -1 on the repair assigned 170 * sequence number. The first time it is reused the sequence 171 * is -1, the second time -2, etc. This fixes that issue 172 * without appearing to create any others. 173 */ 174 if (likely(!tp->repair)) { 175 u32 seq = tcptw->tw_snd_nxt + 65535 + 2; 176 177 if (!seq) 178 seq = 1; 179 WRITE_ONCE(tp->write_seq, seq); 180 tp->rx_opt.ts_recent = tcptw->tw_ts_recent; 181 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; 182 } 183 184 return 1; 185 } 186 187 return 0; 188 } 189 EXPORT_SYMBOL_GPL(tcp_twsk_unique); 190 191 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr, 192 int addr_len) 193 { 194 /* This check is replicated from tcp_v4_connect() and intended to 195 * prevent BPF program called below from accessing bytes that are out 196 * of the bound specified by user in addr_len. 197 */ 198 if (addr_len < sizeof(struct sockaddr_in)) 199 return -EINVAL; 200 201 sock_owned_by_me(sk); 202 203 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len); 204 } 205 206 /* This will initiate an outgoing connection. 
*/ 207 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) 208 { 209 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; 210 struct inet_timewait_death_row *tcp_death_row; 211 struct inet_sock *inet = inet_sk(sk); 212 struct tcp_sock *tp = tcp_sk(sk); 213 struct ip_options_rcu *inet_opt; 214 struct net *net = sock_net(sk); 215 __be16 orig_sport, orig_dport; 216 __be32 daddr, nexthop; 217 struct flowi4 *fl4; 218 struct rtable *rt; 219 int err; 220 221 if (addr_len < sizeof(struct sockaddr_in)) 222 return -EINVAL; 223 224 if (usin->sin_family != AF_INET) 225 return -EAFNOSUPPORT; 226 227 nexthop = daddr = usin->sin_addr.s_addr; 228 inet_opt = rcu_dereference_protected(inet->inet_opt, 229 lockdep_sock_is_held(sk)); 230 if (inet_opt && inet_opt->opt.srr) { 231 if (!daddr) 232 return -EINVAL; 233 nexthop = inet_opt->opt.faddr; 234 } 235 236 orig_sport = inet->inet_sport; 237 orig_dport = usin->sin_port; 238 fl4 = &inet->cork.fl.u.ip4; 239 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr, 240 sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport, 241 orig_dport, sk); 242 if (IS_ERR(rt)) { 243 err = PTR_ERR(rt); 244 if (err == -ENETUNREACH) 245 IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); 246 return err; 247 } 248 249 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { 250 ip_rt_put(rt); 251 return -ENETUNREACH; 252 } 253 254 if (!inet_opt || !inet_opt->opt.srr) 255 daddr = fl4->daddr; 256 257 tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; 258 259 if (!inet->inet_saddr) { 260 err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET); 261 if (err) { 262 ip_rt_put(rt); 263 return err; 264 } 265 } else { 266 sk_rcv_saddr_set(sk, inet->inet_saddr); 267 } 268 269 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) { 270 /* Reset inherited state */ 271 tp->rx_opt.ts_recent = 0; 272 tp->rx_opt.ts_recent_stamp = 0; 273 if (likely(!tp->repair)) 274 WRITE_ONCE(tp->write_seq, 0); 275 } 276 277 inet->inet_dport = usin->sin_port; 278 sk_daddr_set(sk, daddr); 279 280 inet_csk(sk)->icsk_ext_hdr_len = 0; 281 if (inet_opt) 282 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 283 284 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT; 285 286 /* Socket identity is still unknown (sport may be zero). 287 * However we set state to SYN-SENT and not releasing socket 288 * lock select source port, enter ourselves into the hash tables and 289 * complete initialization after this. 290 */ 291 tcp_set_state(sk, TCP_SYN_SENT); 292 err = inet_hash_connect(tcp_death_row, sk); 293 if (err) 294 goto failure; 295 296 sk_set_txhash(sk); 297 298 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport, 299 inet->inet_sport, inet->inet_dport, sk); 300 if (IS_ERR(rt)) { 301 err = PTR_ERR(rt); 302 rt = NULL; 303 goto failure; 304 } 305 tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst); 306 /* OK, now commit destination to socket. 
*/ 307 sk->sk_gso_type = SKB_GSO_TCPV4; 308 sk_setup_caps(sk, &rt->dst); 309 rt = NULL; 310 311 if (likely(!tp->repair)) { 312 if (!tp->write_seq) 313 WRITE_ONCE(tp->write_seq, 314 secure_tcp_seq(inet->inet_saddr, 315 inet->inet_daddr, 316 inet->inet_sport, 317 usin->sin_port)); 318 WRITE_ONCE(tp->tsoffset, 319 secure_tcp_ts_off(net, inet->inet_saddr, 320 inet->inet_daddr)); 321 } 322 323 atomic_set(&inet->inet_id, get_random_u16()); 324 325 if (tcp_fastopen_defer_connect(sk, &err)) 326 return err; 327 if (err) 328 goto failure; 329 330 err = tcp_connect(sk); 331 332 if (err) 333 goto failure; 334 335 return 0; 336 337 failure: 338 /* 339 * This unhashes the socket and releases the local port, 340 * if necessary. 341 */ 342 tcp_set_state(sk, TCP_CLOSE); 343 inet_bhash2_reset_saddr(sk); 344 ip_rt_put(rt); 345 sk->sk_route_caps = 0; 346 inet->inet_dport = 0; 347 return err; 348 } 349 EXPORT_SYMBOL(tcp_v4_connect); 350 351 /* 352 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191. 353 * It can be called through tcp_release_cb() if socket was owned by user 354 * at the time tcp_v4_err() was called to handle ICMP message. 355 */ 356 void tcp_v4_mtu_reduced(struct sock *sk) 357 { 358 struct inet_sock *inet = inet_sk(sk); 359 struct dst_entry *dst; 360 u32 mtu; 361 362 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) 363 return; 364 mtu = READ_ONCE(tcp_sk(sk)->mtu_info); 365 dst = inet_csk_update_pmtu(sk, mtu); 366 if (!dst) 367 return; 368 369 /* Something is about to be wrong... Remember soft error 370 * for the case, if this connection will not able to recover. 371 */ 372 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst)) 373 WRITE_ONCE(sk->sk_err_soft, EMSGSIZE); 374 375 mtu = dst_mtu(dst); 376 377 if (inet->pmtudisc != IP_PMTUDISC_DONT && 378 ip_sk_accept_pmtu(sk) && 379 inet_csk(sk)->icsk_pmtu_cookie > mtu) { 380 tcp_sync_mss(sk, mtu); 381 382 /* Resend the TCP packet because it's 383 * clear that the old packet has been 384 * dropped. This is the new "fast" path mtu 385 * discovery. 386 */ 387 tcp_simple_retransmit(sk); 388 } /* else let the usual retransmit timer handle it */ 389 } 390 EXPORT_SYMBOL(tcp_v4_mtu_reduced); 391 392 static void do_redirect(struct sk_buff *skb, struct sock *sk) 393 { 394 struct dst_entry *dst = __sk_dst_check(sk, 0); 395 396 if (dst) 397 dst->ops->redirect(dst, sk, skb); 398 } 399 400 401 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */ 402 void tcp_req_err(struct sock *sk, u32 seq, bool abort) 403 { 404 struct request_sock *req = inet_reqsk(sk); 405 struct net *net = sock_net(sk); 406 407 /* ICMPs are not backlogged, hence we cannot get 408 * an established socket here. 409 */ 410 if (seq != tcp_rsk(req)->snt_isn) { 411 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); 412 } else if (abort) { 413 /* 414 * Still in SYN_RECV, just remove it silently. 415 * There is no good way to pass the error to the newly 416 * created socket, and POSIX does not want network 417 * errors returned from accept(). 
418 */ 419 inet_csk_reqsk_queue_drop(req->rsk_listener, req); 420 tcp_listendrop(req->rsk_listener); 421 } 422 reqsk_put(req); 423 } 424 EXPORT_SYMBOL(tcp_req_err); 425 426 /* TCP-LD (RFC 6069) logic */ 427 void tcp_ld_RTO_revert(struct sock *sk, u32 seq) 428 { 429 struct inet_connection_sock *icsk = inet_csk(sk); 430 struct tcp_sock *tp = tcp_sk(sk); 431 struct sk_buff *skb; 432 s32 remaining; 433 u32 delta_us; 434 435 if (sock_owned_by_user(sk)) 436 return; 437 438 if (seq != tp->snd_una || !icsk->icsk_retransmits || 439 !icsk->icsk_backoff) 440 return; 441 442 skb = tcp_rtx_queue_head(sk); 443 if (WARN_ON_ONCE(!skb)) 444 return; 445 446 icsk->icsk_backoff--; 447 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT; 448 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); 449 450 tcp_mstamp_refresh(tp); 451 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb)); 452 remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us); 453 454 if (remaining > 0) { 455 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, 456 remaining, TCP_RTO_MAX); 457 } else { 458 /* RTO revert clocked out retransmission. 459 * Will retransmit now. 460 */ 461 tcp_retransmit_timer(sk); 462 } 463 } 464 EXPORT_SYMBOL(tcp_ld_RTO_revert); 465 466 /* 467 * This routine is called by the ICMP module when it gets some 468 * sort of error condition. If err < 0 then the socket should 469 * be closed and the error returned to the user. If err > 0 470 * it's just the icmp type << 8 | icmp code. After adjustment 471 * header points to the first 8 bytes of the tcp header. We need 472 * to find the appropriate port. 473 * 474 * The locking strategy used here is very "optimistic". When 475 * someone else accesses the socket the ICMP is just dropped 476 * and for some paths there is no check at all. 477 * A more general error queue to queue errors for later handling 478 * is probably better. 479 * 480 */ 481 482 int tcp_v4_err(struct sk_buff *skb, u32 info) 483 { 484 const struct iphdr *iph = (const struct iphdr *)skb->data; 485 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2)); 486 struct tcp_sock *tp; 487 const int type = icmp_hdr(skb)->type; 488 const int code = icmp_hdr(skb)->code; 489 struct sock *sk; 490 struct request_sock *fastopen; 491 u32 seq, snd_una; 492 int err; 493 struct net *net = dev_net(skb->dev); 494 495 sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, 496 iph->daddr, th->dest, iph->saddr, 497 ntohs(th->source), inet_iif(skb), 0); 498 if (!sk) { 499 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); 500 return -ENOENT; 501 } 502 if (sk->sk_state == TCP_TIME_WAIT) { 503 /* To increase the counter of ignored icmps for TCP-AO */ 504 tcp_ao_ignore_icmp(sk, AF_INET, type, code); 505 inet_twsk_put(inet_twsk(sk)); 506 return 0; 507 } 508 seq = ntohl(th->seq); 509 if (sk->sk_state == TCP_NEW_SYN_RECV) { 510 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB || 511 type == ICMP_TIME_EXCEEDED || 512 (type == ICMP_DEST_UNREACH && 513 (code == ICMP_NET_UNREACH || 514 code == ICMP_HOST_UNREACH))); 515 return 0; 516 } 517 518 if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) { 519 sock_put(sk); 520 return 0; 521 } 522 523 bh_lock_sock(sk); 524 /* If too many ICMPs get dropped on busy 525 * servers this needs to be solved differently. 526 * We do take care of PMTU discovery (RFC1191) special case : 527 * we can receive locally generated ICMP messages while socket is held. 
528 */ 529 if (sock_owned_by_user(sk)) { 530 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)) 531 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS); 532 } 533 if (sk->sk_state == TCP_CLOSE) 534 goto out; 535 536 if (static_branch_unlikely(&ip4_min_ttl)) { 537 /* min_ttl can be changed concurrently from do_ip_setsockopt() */ 538 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) { 539 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 540 goto out; 541 } 542 } 543 544 tp = tcp_sk(sk); 545 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ 546 fastopen = rcu_dereference(tp->fastopen_rsk); 547 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una; 548 if (sk->sk_state != TCP_LISTEN && 549 !between(seq, snd_una, tp->snd_nxt)) { 550 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); 551 goto out; 552 } 553 554 switch (type) { 555 case ICMP_REDIRECT: 556 if (!sock_owned_by_user(sk)) 557 do_redirect(skb, sk); 558 goto out; 559 case ICMP_SOURCE_QUENCH: 560 /* Just silently ignore these. */ 561 goto out; 562 case ICMP_PARAMETERPROB: 563 err = EPROTO; 564 break; 565 case ICMP_DEST_UNREACH: 566 if (code > NR_ICMP_UNREACH) 567 goto out; 568 569 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ 570 /* We are not interested in TCP_LISTEN and open_requests 571 * (SYN-ACKs send out by Linux are always <576bytes so 572 * they should go through unfragmented). 573 */ 574 if (sk->sk_state == TCP_LISTEN) 575 goto out; 576 577 WRITE_ONCE(tp->mtu_info, info); 578 if (!sock_owned_by_user(sk)) { 579 tcp_v4_mtu_reduced(sk); 580 } else { 581 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags)) 582 sock_hold(sk); 583 } 584 goto out; 585 } 586 587 err = icmp_err_convert[code].errno; 588 /* check if this ICMP message allows revert of backoff. 589 * (see RFC 6069) 590 */ 591 if (!fastopen && 592 (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH)) 593 tcp_ld_RTO_revert(sk, seq); 594 break; 595 case ICMP_TIME_EXCEEDED: 596 err = EHOSTUNREACH; 597 break; 598 default: 599 goto out; 600 } 601 602 switch (sk->sk_state) { 603 case TCP_SYN_SENT: 604 case TCP_SYN_RECV: 605 /* Only in fast or simultaneous open. If a fast open socket is 606 * already accepted it is treated as a connected one below. 607 */ 608 if (fastopen && !fastopen->sk) 609 break; 610 611 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th); 612 613 if (!sock_owned_by_user(sk)) { 614 WRITE_ONCE(sk->sk_err, err); 615 616 sk_error_report(sk); 617 618 tcp_done(sk); 619 } else { 620 WRITE_ONCE(sk->sk_err_soft, err); 621 } 622 goto out; 623 } 624 625 /* If we've already connected we will keep trying 626 * until we time out, or the user gives up. 627 * 628 * rfc1122 4.2.3.9 allows to consider as hard errors 629 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too, 630 * but it is obsoleted by pmtu discovery). 631 * 632 * Note, that in modern internet, where routing is unreliable 633 * and in each dark corner broken firewalls sit, sending random 634 * errors ordered by their masters even this two messages finally lose 635 * their original sense (even Linux sends invalid PORT_UNREACHs) 636 * 637 * Now we are in compliance with RFCs. 
638 * --ANK (980905) 639 */ 640 641 if (!sock_owned_by_user(sk) && 642 inet_test_bit(RECVERR, sk)) { 643 WRITE_ONCE(sk->sk_err, err); 644 sk_error_report(sk); 645 } else { /* Only an error on timeout */ 646 WRITE_ONCE(sk->sk_err_soft, err); 647 } 648 649 out: 650 bh_unlock_sock(sk); 651 sock_put(sk); 652 return 0; 653 } 654 655 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr) 656 { 657 struct tcphdr *th = tcp_hdr(skb); 658 659 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0); 660 skb->csum_start = skb_transport_header(skb) - skb->head; 661 skb->csum_offset = offsetof(struct tcphdr, check); 662 } 663 664 /* This routine computes an IPv4 TCP checksum. */ 665 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb) 666 { 667 const struct inet_sock *inet = inet_sk(sk); 668 669 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr); 670 } 671 EXPORT_SYMBOL(tcp_v4_send_check); 672 673 #define REPLY_OPTIONS_LEN (MAX_TCP_OPTION_SPACE / sizeof(__be32)) 674 675 static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb, 676 const struct tcp_ao_hdr *aoh, 677 struct ip_reply_arg *arg, struct tcphdr *reply, 678 __be32 reply_options[REPLY_OPTIONS_LEN]) 679 { 680 #ifdef CONFIG_TCP_AO 681 int sdif = tcp_v4_sdif(skb); 682 int dif = inet_iif(skb); 683 int l3index = sdif ? dif : 0; 684 bool allocated_traffic_key; 685 struct tcp_ao_key *key; 686 char *traffic_key; 687 bool drop = true; 688 u32 ao_sne = 0; 689 u8 keyid; 690 691 rcu_read_lock(); 692 if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq), 693 &key, &traffic_key, &allocated_traffic_key, 694 &keyid, &ao_sne)) 695 goto out; 696 697 reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) | 698 (aoh->rnext_keyid << 8) | keyid); 699 arg->iov[0].iov_len += tcp_ao_len_aligned(key); 700 reply->doff = arg->iov[0].iov_len / 4; 701 702 if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1], 703 key, traffic_key, 704 (union tcp_ao_addr *)&ip_hdr(skb)->saddr, 705 (union tcp_ao_addr *)&ip_hdr(skb)->daddr, 706 reply, ao_sne)) 707 goto out; 708 drop = false; 709 out: 710 rcu_read_unlock(); 711 if (allocated_traffic_key) 712 kfree(traffic_key); 713 return drop; 714 #else 715 return true; 716 #endif 717 } 718 719 /* 720 * This routine will send an RST to the other tcp. 721 * 722 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.) 723 * for reset. 724 * Answer: if a packet caused RST, it is not for a socket 725 * existing in our system, if it is matched to a socket, 726 * it is just duplicate segment or bug in other side's TCP. 727 * So that we build reply only basing on parameters 728 * arrived with segment. 729 * Exception: precedence violation. We do not implement it in any case. 730 */ 731 732 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) 733 { 734 const struct tcphdr *th = tcp_hdr(skb); 735 struct { 736 struct tcphdr th; 737 __be32 opt[REPLY_OPTIONS_LEN]; 738 } rep; 739 const __u8 *md5_hash_location = NULL; 740 const struct tcp_ao_hdr *aoh; 741 struct ip_reply_arg arg; 742 #ifdef CONFIG_TCP_MD5SIG 743 struct tcp_md5sig_key *key = NULL; 744 unsigned char newhash[16]; 745 struct sock *sk1 = NULL; 746 int genhash; 747 #endif 748 u64 transmit_time = 0; 749 struct sock *ctl_sk; 750 struct net *net; 751 u32 txhash = 0; 752 753 /* Never send a reset in response to a reset. */ 754 if (th->rst) 755 return; 756 757 /* If sk not NULL, it means we did a successful lookup and incoming 758 * route had to be correct. prequeue might have dropped our dst. 
759 */ 760 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL) 761 return; 762 763 /* Swap the send and the receive. */ 764 memset(&rep, 0, sizeof(rep)); 765 rep.th.dest = th->source; 766 rep.th.source = th->dest; 767 rep.th.doff = sizeof(struct tcphdr) / 4; 768 rep.th.rst = 1; 769 770 if (th->ack) { 771 rep.th.seq = th->ack_seq; 772 } else { 773 rep.th.ack = 1; 774 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin + 775 skb->len - (th->doff << 2)); 776 } 777 778 memset(&arg, 0, sizeof(arg)); 779 arg.iov[0].iov_base = (unsigned char *)&rep; 780 arg.iov[0].iov_len = sizeof(rep.th); 781 782 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev); 783 784 /* Invalid TCP option size or twice included auth */ 785 if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh)) 786 return; 787 788 if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt)) 789 return; 790 791 #ifdef CONFIG_TCP_MD5SIG 792 rcu_read_lock(); 793 if (sk && sk_fullsock(sk)) { 794 const union tcp_md5_addr *addr; 795 int l3index; 796 797 /* sdif set, means packet ingressed via a device 798 * in an L3 domain and inet_iif is set to it. 799 */ 800 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; 801 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 802 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 803 } else if (md5_hash_location) { 804 const union tcp_md5_addr *addr; 805 int sdif = tcp_v4_sdif(skb); 806 int dif = inet_iif(skb); 807 int l3index; 808 809 /* 810 * active side is lost. Try to find listening socket through 811 * source port, and then find md5 key through listening socket. 812 * we are not loose security here: 813 * Incoming packet is checked with md5 hash with finding key, 814 * no RST generated if md5 hash doesn't match. 815 */ 816 sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo, 817 NULL, 0, ip_hdr(skb)->saddr, 818 th->source, ip_hdr(skb)->daddr, 819 ntohs(th->source), dif, sdif); 820 /* don't send rst if it can't find key */ 821 if (!sk1) 822 goto out; 823 824 /* sdif set, means packet ingressed via a device 825 * in an L3 domain and dif is set to it. 826 */ 827 l3index = sdif ? dif : 0; 828 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 829 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET); 830 if (!key) 831 goto out; 832 833 834 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb); 835 if (genhash || memcmp(md5_hash_location, newhash, 16) != 0) 836 goto out; 837 838 } 839 840 if (key) { 841 rep.opt[0] = htonl((TCPOPT_NOP << 24) | 842 (TCPOPT_NOP << 16) | 843 (TCPOPT_MD5SIG << 8) | 844 TCPOLEN_MD5SIG); 845 /* Update length and the length the header thinks exists */ 846 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; 847 rep.th.doff = arg.iov[0].iov_len / 4; 848 849 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1], 850 key, ip_hdr(skb)->saddr, 851 ip_hdr(skb)->daddr, &rep.th); 852 } 853 #endif 854 /* Can't co-exist with TCPMD5, hence check rep.opt[0] */ 855 if (rep.opt[0] == 0) { 856 __be32 mrst = mptcp_reset_option(skb); 857 858 if (mrst) { 859 rep.opt[0] = mrst; 860 arg.iov[0].iov_len += sizeof(mrst); 861 rep.th.doff = arg.iov[0].iov_len / 4; 862 } 863 } 864 865 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 866 ip_hdr(skb)->saddr, /* XXX */ 867 arg.iov[0].iov_len, IPPROTO_TCP, 0); 868 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 869 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0; 870 871 /* When socket is gone, all binding information is lost. 872 * routing might fail in this case. 
No choice here, if we choose to force 873 * input interface, we will misroute in case of asymmetric route. 874 */ 875 if (sk) { 876 arg.bound_dev_if = sk->sk_bound_dev_if; 877 if (sk_fullsock(sk)) 878 trace_tcp_send_reset(sk, skb); 879 } 880 881 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) != 882 offsetof(struct inet_timewait_sock, tw_bound_dev_if)); 883 884 arg.tos = ip_hdr(skb)->tos; 885 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); 886 local_bh_disable(); 887 ctl_sk = this_cpu_read(ipv4_tcp_sk); 888 sock_net_set(ctl_sk, net); 889 if (sk) { 890 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 891 inet_twsk(sk)->tw_mark : sk->sk_mark; 892 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? 893 inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority); 894 transmit_time = tcp_transmit_time(sk); 895 xfrm_sk_clone_policy(ctl_sk, sk); 896 txhash = (sk->sk_state == TCP_TIME_WAIT) ? 897 inet_twsk(sk)->tw_txhash : sk->sk_txhash; 898 } else { 899 ctl_sk->sk_mark = 0; 900 ctl_sk->sk_priority = 0; 901 } 902 ip_send_unicast_reply(ctl_sk, 903 skb, &TCP_SKB_CB(skb)->header.h4.opt, 904 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 905 &arg, arg.iov[0].iov_len, 906 transmit_time, txhash); 907 908 xfrm_sk_free_policy(ctl_sk); 909 sock_net_set(ctl_sk, &init_net); 910 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); 911 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS); 912 local_bh_enable(); 913 914 #ifdef CONFIG_TCP_MD5SIG 915 out: 916 rcu_read_unlock(); 917 #endif 918 } 919 920 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states 921 outside socket context is ugly, certainly. What can I do? 922 */ 923 924 static void tcp_v4_send_ack(const struct sock *sk, 925 struct sk_buff *skb, u32 seq, u32 ack, 926 u32 win, u32 tsval, u32 tsecr, int oif, 927 struct tcp_key *key, 928 int reply_flags, u8 tos, u32 txhash) 929 { 930 const struct tcphdr *th = tcp_hdr(skb); 931 struct { 932 struct tcphdr th; 933 __be32 opt[(MAX_TCP_OPTION_SPACE >> 2)]; 934 } rep; 935 struct net *net = sock_net(sk); 936 struct ip_reply_arg arg; 937 struct sock *ctl_sk; 938 u64 transmit_time; 939 940 memset(&rep.th, 0, sizeof(struct tcphdr)); 941 memset(&arg, 0, sizeof(arg)); 942 943 arg.iov[0].iov_base = (unsigned char *)&rep; 944 arg.iov[0].iov_len = sizeof(rep.th); 945 if (tsecr) { 946 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | 947 (TCPOPT_TIMESTAMP << 8) | 948 TCPOLEN_TIMESTAMP); 949 rep.opt[1] = htonl(tsval); 950 rep.opt[2] = htonl(tsecr); 951 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED; 952 } 953 954 /* Swap the send and the receive. */ 955 rep.th.dest = th->source; 956 rep.th.source = th->dest; 957 rep.th.doff = arg.iov[0].iov_len / 4; 958 rep.th.seq = htonl(seq); 959 rep.th.ack_seq = htonl(ack); 960 rep.th.ack = 1; 961 rep.th.window = htons(win); 962 963 #ifdef CONFIG_TCP_MD5SIG 964 if (tcp_key_is_md5(key)) { 965 int offset = (tsecr) ? 3 : 0; 966 967 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) | 968 (TCPOPT_NOP << 16) | 969 (TCPOPT_MD5SIG << 8) | 970 TCPOLEN_MD5SIG); 971 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; 972 rep.th.doff = arg.iov[0].iov_len/4; 973 974 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset], 975 key->md5_key, ip_hdr(skb)->saddr, 976 ip_hdr(skb)->daddr, &rep.th); 977 } 978 #endif 979 #ifdef CONFIG_TCP_AO 980 if (tcp_key_is_ao(key)) { 981 int offset = (tsecr) ? 
3 : 0; 982 983 rep.opt[offset++] = htonl((TCPOPT_AO << 24) | 984 (tcp_ao_len(key->ao_key) << 16) | 985 (key->ao_key->sndid << 8) | 986 key->rcv_next); 987 arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key); 988 rep.th.doff = arg.iov[0].iov_len / 4; 989 990 tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset], 991 key->ao_key, key->traffic_key, 992 (union tcp_ao_addr *)&ip_hdr(skb)->saddr, 993 (union tcp_ao_addr *)&ip_hdr(skb)->daddr, 994 &rep.th, key->sne); 995 } 996 #endif 997 arg.flags = reply_flags; 998 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 999 ip_hdr(skb)->saddr, /* XXX */ 1000 arg.iov[0].iov_len, IPPROTO_TCP, 0); 1001 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 1002 if (oif) 1003 arg.bound_dev_if = oif; 1004 arg.tos = tos; 1005 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL); 1006 local_bh_disable(); 1007 ctl_sk = this_cpu_read(ipv4_tcp_sk); 1008 sock_net_set(ctl_sk, net); 1009 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 1010 inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark); 1011 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? 1012 inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority); 1013 transmit_time = tcp_transmit_time(sk); 1014 ip_send_unicast_reply(ctl_sk, 1015 skb, &TCP_SKB_CB(skb)->header.h4.opt, 1016 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 1017 &arg, arg.iov[0].iov_len, 1018 transmit_time, txhash); 1019 1020 sock_net_set(ctl_sk, &init_net); 1021 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); 1022 local_bh_enable(); 1023 } 1024 1025 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) 1026 { 1027 struct inet_timewait_sock *tw = inet_twsk(sk); 1028 struct tcp_timewait_sock *tcptw = tcp_twsk(sk); 1029 struct tcp_key key = {}; 1030 #ifdef CONFIG_TCP_AO 1031 struct tcp_ao_info *ao_info; 1032 1033 if (static_branch_unlikely(&tcp_ao_needed.key)) { 1034 /* FIXME: the segment to-be-acked is not verified yet */ 1035 ao_info = rcu_dereference(tcptw->ao_info); 1036 if (ao_info) { 1037 const struct tcp_ao_hdr *aoh; 1038 1039 if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) { 1040 inet_twsk_put(tw); 1041 return; 1042 } 1043 1044 if (aoh) 1045 key.ao_key = tcp_ao_established_key(ao_info, aoh->rnext_keyid, -1); 1046 } 1047 } 1048 if (key.ao_key) { 1049 struct tcp_ao_key *rnext_key; 1050 1051 key.traffic_key = snd_other_key(key.ao_key); 1052 key.sne = READ_ONCE(ao_info->snd_sne); 1053 rnext_key = READ_ONCE(ao_info->rnext_key); 1054 key.rcv_next = rnext_key->rcvid; 1055 key.type = TCP_KEY_AO; 1056 #else 1057 if (0) { 1058 #endif 1059 #ifdef CONFIG_TCP_MD5SIG 1060 } else if (static_branch_unlikely(&tcp_md5_needed.key)) { 1061 key.md5_key = tcp_twsk_md5_key(tcptw); 1062 if (key.md5_key) 1063 key.type = TCP_KEY_MD5; 1064 #endif 1065 } 1066 1067 tcp_v4_send_ack(sk, skb, 1068 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, 1069 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, 1070 tcp_tw_tsval(tcptw), 1071 tcptw->tw_ts_recent, 1072 tw->tw_bound_dev_if, &key, 1073 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0, 1074 tw->tw_tos, 1075 tw->tw_txhash); 1076 1077 inet_twsk_put(tw); 1078 } 1079 1080 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, 1081 struct request_sock *req) 1082 { 1083 struct tcp_key key = {}; 1084 1085 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV 1086 * sk->sk_state == TCP_SYN_RECV -> for Fast Open. 1087 */ 1088 u32 seq = (sk->sk_state == TCP_LISTEN) ? 
tcp_rsk(req)->snt_isn + 1 : 1089 tcp_sk(sk)->snd_nxt; 1090 1091 #ifdef CONFIG_TCP_AO 1092 if (static_branch_unlikely(&tcp_ao_needed.key) && 1093 tcp_rsk_used_ao(req)) { 1094 const union tcp_md5_addr *addr; 1095 const struct tcp_ao_hdr *aoh; 1096 int l3index; 1097 1098 /* Invalid TCP option size or twice included auth */ 1099 if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) 1100 return; 1101 if (!aoh) 1102 return; 1103 1104 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 1105 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; 1106 key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, 1107 aoh->rnext_keyid, -1); 1108 if (unlikely(!key.ao_key)) { 1109 /* Send ACK with any matching MKT for the peer */ 1110 key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1); 1111 /* Matching key disappeared (user removed the key?) 1112 * let the handshake timeout. 1113 */ 1114 if (!key.ao_key) { 1115 net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n", 1116 addr, 1117 ntohs(tcp_hdr(skb)->source), 1118 &ip_hdr(skb)->daddr, 1119 ntohs(tcp_hdr(skb)->dest)); 1120 return; 1121 } 1122 } 1123 key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC); 1124 if (!key.traffic_key) 1125 return; 1126 1127 key.type = TCP_KEY_AO; 1128 key.rcv_next = aoh->keyid; 1129 tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req); 1130 #else 1131 if (0) { 1132 #endif 1133 #ifdef CONFIG_TCP_MD5SIG 1134 } else if (static_branch_unlikely(&tcp_md5_needed.key)) { 1135 const union tcp_md5_addr *addr; 1136 int l3index; 1137 1138 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 1139 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; 1140 key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1141 if (key.md5_key) 1142 key.type = TCP_KEY_MD5; 1143 #endif 1144 } 1145 1146 /* RFC 7323 2.3 1147 * The window field (SEG.WND) of every outgoing segment, with the 1148 * exception of <SYN> segments, MUST be right-shifted by 1149 * Rcv.Wind.Shift bits: 1150 */ 1151 tcp_v4_send_ack(sk, skb, seq, 1152 tcp_rsk(req)->rcv_nxt, 1153 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, 1154 tcp_rsk_tsval(tcp_rsk(req)), 1155 READ_ONCE(req->ts_recent), 1156 0, &key, 1157 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0, 1158 ip_hdr(skb)->tos, 1159 READ_ONCE(tcp_rsk(req)->txhash)); 1160 if (tcp_key_is_ao(&key)) 1161 kfree(key.traffic_key); 1162 } 1163 1164 /* 1165 * Send a SYN-ACK after having received a SYN. 1166 * This still operates on a request_sock only, not on a big 1167 * socket. 1168 */ 1169 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, 1170 struct flowi *fl, 1171 struct request_sock *req, 1172 struct tcp_fastopen_cookie *foc, 1173 enum tcp_synack_type synack_type, 1174 struct sk_buff *syn_skb) 1175 { 1176 const struct inet_request_sock *ireq = inet_rsk(req); 1177 struct flowi4 fl4; 1178 int err = -1; 1179 struct sk_buff *skb; 1180 u8 tos; 1181 1182 /* First, grab a route. 
*/ 1183 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) 1184 return -1; 1185 1186 skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb); 1187 1188 if (skb) { 1189 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); 1190 1191 tos = READ_ONCE(inet_sk(sk)->tos); 1192 1193 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)) 1194 tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) | 1195 (tos & INET_ECN_MASK); 1196 1197 if (!INET_ECN_is_capable(tos) && 1198 tcp_bpf_ca_needs_ecn((struct sock *)req)) 1199 tos |= INET_ECN_ECT_0; 1200 1201 rcu_read_lock(); 1202 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, 1203 ireq->ir_rmt_addr, 1204 rcu_dereference(ireq->ireq_opt), 1205 tos); 1206 rcu_read_unlock(); 1207 err = net_xmit_eval(err); 1208 } 1209 1210 return err; 1211 } 1212 1213 /* 1214 * IPv4 request_sock destructor. 1215 */ 1216 static void tcp_v4_reqsk_destructor(struct request_sock *req) 1217 { 1218 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1)); 1219 } 1220 1221 #ifdef CONFIG_TCP_MD5SIG 1222 /* 1223 * RFC2385 MD5 checksumming requires a mapping of 1224 * IP address->MD5 Key. 1225 * We need to maintain these in the sk structure. 1226 */ 1227 1228 DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ); 1229 EXPORT_SYMBOL(tcp_md5_needed); 1230 1231 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new) 1232 { 1233 if (!old) 1234 return true; 1235 1236 /* l3index always overrides non-l3index */ 1237 if (old->l3index && new->l3index == 0) 1238 return false; 1239 if (old->l3index == 0 && new->l3index) 1240 return true; 1241 1242 return old->prefixlen < new->prefixlen; 1243 } 1244 1245 /* Find the Key structure for an address. */ 1246 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index, 1247 const union tcp_md5_addr *addr, 1248 int family, bool any_l3index) 1249 { 1250 const struct tcp_sock *tp = tcp_sk(sk); 1251 struct tcp_md5sig_key *key; 1252 const struct tcp_md5sig_info *md5sig; 1253 __be32 mask; 1254 struct tcp_md5sig_key *best_match = NULL; 1255 bool match; 1256 1257 /* caller either holds rcu_read_lock() or socket lock */ 1258 md5sig = rcu_dereference_check(tp->md5sig_info, 1259 lockdep_sock_is_held(sk)); 1260 if (!md5sig) 1261 return NULL; 1262 1263 hlist_for_each_entry_rcu(key, &md5sig->head, node, 1264 lockdep_sock_is_held(sk)) { 1265 if (key->family != family) 1266 continue; 1267 if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX && 1268 key->l3index != l3index) 1269 continue; 1270 if (family == AF_INET) { 1271 mask = inet_make_mask(key->prefixlen); 1272 match = (key->addr.a4.s_addr & mask) == 1273 (addr->a4.s_addr & mask); 1274 #if IS_ENABLED(CONFIG_IPV6) 1275 } else if (family == AF_INET6) { 1276 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6, 1277 key->prefixlen); 1278 #endif 1279 } else { 1280 match = false; 1281 } 1282 1283 if (match && better_md5_match(best_match, key)) 1284 best_match = key; 1285 } 1286 return best_match; 1287 } 1288 EXPORT_SYMBOL(__tcp_md5_do_lookup); 1289 1290 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk, 1291 const union tcp_md5_addr *addr, 1292 int family, u8 prefixlen, 1293 int l3index, u8 flags) 1294 { 1295 const struct tcp_sock *tp = tcp_sk(sk); 1296 struct tcp_md5sig_key *key; 1297 unsigned int size = sizeof(struct in_addr); 1298 const struct tcp_md5sig_info *md5sig; 1299 1300 /* caller either holds rcu_read_lock() or socket lock */ 1301 md5sig = rcu_dereference_check(tp->md5sig_info, 1302 
lockdep_sock_is_held(sk)); 1303 if (!md5sig) 1304 return NULL; 1305 #if IS_ENABLED(CONFIG_IPV6) 1306 if (family == AF_INET6) 1307 size = sizeof(struct in6_addr); 1308 #endif 1309 hlist_for_each_entry_rcu(key, &md5sig->head, node, 1310 lockdep_sock_is_held(sk)) { 1311 if (key->family != family) 1312 continue; 1313 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX)) 1314 continue; 1315 if (key->l3index != l3index) 1316 continue; 1317 if (!memcmp(&key->addr, addr, size) && 1318 key->prefixlen == prefixlen) 1319 return key; 1320 } 1321 return NULL; 1322 } 1323 1324 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, 1325 const struct sock *addr_sk) 1326 { 1327 const union tcp_md5_addr *addr; 1328 int l3index; 1329 1330 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), 1331 addr_sk->sk_bound_dev_if); 1332 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr; 1333 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1334 } 1335 EXPORT_SYMBOL(tcp_v4_md5_lookup); 1336 1337 static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp) 1338 { 1339 struct tcp_sock *tp = tcp_sk(sk); 1340 struct tcp_md5sig_info *md5sig; 1341 1342 md5sig = kmalloc(sizeof(*md5sig), gfp); 1343 if (!md5sig) 1344 return -ENOMEM; 1345 1346 sk_gso_disable(sk); 1347 INIT_HLIST_HEAD(&md5sig->head); 1348 rcu_assign_pointer(tp->md5sig_info, md5sig); 1349 return 0; 1350 } 1351 1352 /* This can be called on a newly created socket, from other files */ 1353 static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, 1354 int family, u8 prefixlen, int l3index, u8 flags, 1355 const u8 *newkey, u8 newkeylen, gfp_t gfp) 1356 { 1357 /* Add Key to the list */ 1358 struct tcp_md5sig_key *key; 1359 struct tcp_sock *tp = tcp_sk(sk); 1360 struct tcp_md5sig_info *md5sig; 1361 1362 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags); 1363 if (key) { 1364 /* Pre-existing entry - just update that one. 1365 * Note that the key might be used concurrently. 1366 * data_race() is telling kcsan that we do not care of 1367 * key mismatches, since changing MD5 key on live flows 1368 * can lead to packet drops. 1369 */ 1370 data_race(memcpy(key->key, newkey, newkeylen)); 1371 1372 /* Pairs with READ_ONCE() in tcp_md5_hash_key(). 1373 * Also note that a reader could catch new key->keylen value 1374 * but old key->key[], this is the reason we use __GFP_ZERO 1375 * at sock_kmalloc() time below these lines. 1376 */ 1377 WRITE_ONCE(key->keylen, newkeylen); 1378 1379 return 0; 1380 } 1381 1382 md5sig = rcu_dereference_protected(tp->md5sig_info, 1383 lockdep_sock_is_held(sk)); 1384 1385 key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO); 1386 if (!key) 1387 return -ENOMEM; 1388 1389 memcpy(key->key, newkey, newkeylen); 1390 key->keylen = newkeylen; 1391 key->family = family; 1392 key->prefixlen = prefixlen; 1393 key->l3index = l3index; 1394 key->flags = flags; 1395 memcpy(&key->addr, addr, 1396 (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? 
sizeof(struct in6_addr) : 1397 sizeof(struct in_addr)); 1398 hlist_add_head_rcu(&key->node, &md5sig->head); 1399 return 0; 1400 } 1401 1402 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, 1403 int family, u8 prefixlen, int l3index, u8 flags, 1404 const u8 *newkey, u8 newkeylen) 1405 { 1406 struct tcp_sock *tp = tcp_sk(sk); 1407 1408 if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) { 1409 if (tcp_md5_alloc_sigpool()) 1410 return -ENOMEM; 1411 1412 if (tcp_md5sig_info_add(sk, GFP_KERNEL)) { 1413 tcp_md5_release_sigpool(); 1414 return -ENOMEM; 1415 } 1416 1417 if (!static_branch_inc(&tcp_md5_needed.key)) { 1418 struct tcp_md5sig_info *md5sig; 1419 1420 md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk)); 1421 rcu_assign_pointer(tp->md5sig_info, NULL); 1422 kfree_rcu(md5sig, rcu); 1423 tcp_md5_release_sigpool(); 1424 return -EUSERS; 1425 } 1426 } 1427 1428 return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags, 1429 newkey, newkeylen, GFP_KERNEL); 1430 } 1431 EXPORT_SYMBOL(tcp_md5_do_add); 1432 1433 int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr, 1434 int family, u8 prefixlen, int l3index, 1435 struct tcp_md5sig_key *key) 1436 { 1437 struct tcp_sock *tp = tcp_sk(sk); 1438 1439 if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) { 1440 tcp_md5_add_sigpool(); 1441 1442 if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC))) { 1443 tcp_md5_release_sigpool(); 1444 return -ENOMEM; 1445 } 1446 1447 if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) { 1448 struct tcp_md5sig_info *md5sig; 1449 1450 md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk)); 1451 net_warn_ratelimited("Too many TCP-MD5 keys in the system\n"); 1452 rcu_assign_pointer(tp->md5sig_info, NULL); 1453 kfree_rcu(md5sig, rcu); 1454 tcp_md5_release_sigpool(); 1455 return -EUSERS; 1456 } 1457 } 1458 1459 return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, 1460 key->flags, key->key, key->keylen, 1461 sk_gfp_mask(sk, GFP_ATOMIC)); 1462 } 1463 EXPORT_SYMBOL(tcp_md5_key_copy); 1464 1465 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family, 1466 u8 prefixlen, int l3index, u8 flags) 1467 { 1468 struct tcp_md5sig_key *key; 1469 1470 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags); 1471 if (!key) 1472 return -ENOENT; 1473 hlist_del_rcu(&key->node); 1474 atomic_sub(sizeof(*key), &sk->sk_omem_alloc); 1475 kfree_rcu(key, rcu); 1476 return 0; 1477 } 1478 EXPORT_SYMBOL(tcp_md5_do_del); 1479 1480 void tcp_clear_md5_list(struct sock *sk) 1481 { 1482 struct tcp_sock *tp = tcp_sk(sk); 1483 struct tcp_md5sig_key *key; 1484 struct hlist_node *n; 1485 struct tcp_md5sig_info *md5sig; 1486 1487 md5sig = rcu_dereference_protected(tp->md5sig_info, 1); 1488 1489 hlist_for_each_entry_safe(key, n, &md5sig->head, node) { 1490 hlist_del_rcu(&key->node); 1491 atomic_sub(sizeof(*key), &sk->sk_omem_alloc); 1492 kfree_rcu(key, rcu); 1493 } 1494 } 1495 1496 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, 1497 sockptr_t optval, int optlen) 1498 { 1499 struct tcp_md5sig cmd; 1500 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr; 1501 const union tcp_md5_addr *addr; 1502 u8 prefixlen = 32; 1503 int l3index = 0; 1504 bool l3flag; 1505 u8 flags; 1506 1507 if (optlen < sizeof(cmd)) 1508 return -EINVAL; 1509 1510 if (copy_from_sockptr(&cmd, optval, sizeof(cmd))) 1511 return -EFAULT; 1512 1513 if (sin->sin_family != 
AF_INET) 1514 return -EINVAL; 1515 1516 flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX; 1517 l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX; 1518 1519 if (optname == TCP_MD5SIG_EXT && 1520 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) { 1521 prefixlen = cmd.tcpm_prefixlen; 1522 if (prefixlen > 32) 1523 return -EINVAL; 1524 } 1525 1526 if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex && 1527 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) { 1528 struct net_device *dev; 1529 1530 rcu_read_lock(); 1531 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex); 1532 if (dev && netif_is_l3_master(dev)) 1533 l3index = dev->ifindex; 1534 1535 rcu_read_unlock(); 1536 1537 /* ok to reference set/not set outside of rcu; 1538 * right now device MUST be an L3 master 1539 */ 1540 if (!dev || !l3index) 1541 return -EINVAL; 1542 } 1543 1544 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr; 1545 1546 if (!cmd.tcpm_keylen) 1547 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags); 1548 1549 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) 1550 return -EINVAL; 1551 1552 /* Don't allow keys for peers that have a matching TCP-AO key. 1553 * See the comment in tcp_ao_add_cmd() 1554 */ 1555 if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false)) 1556 return -EKEYREJECTED; 1557 1558 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags, 1559 cmd.tcpm_key, cmd.tcpm_keylen); 1560 } 1561 1562 static int tcp_v4_md5_hash_headers(struct tcp_sigpool *hp, 1563 __be32 daddr, __be32 saddr, 1564 const struct tcphdr *th, int nbytes) 1565 { 1566 struct tcp4_pseudohdr *bp; 1567 struct scatterlist sg; 1568 struct tcphdr *_th; 1569 1570 bp = hp->scratch; 1571 bp->saddr = saddr; 1572 bp->daddr = daddr; 1573 bp->pad = 0; 1574 bp->protocol = IPPROTO_TCP; 1575 bp->len = cpu_to_be16(nbytes); 1576 1577 _th = (struct tcphdr *)(bp + 1); 1578 memcpy(_th, th, sizeof(*th)); 1579 _th->check = 0; 1580 1581 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th)); 1582 ahash_request_set_crypt(hp->req, &sg, NULL, 1583 sizeof(*bp) + sizeof(*th)); 1584 return crypto_ahash_update(hp->req); 1585 } 1586 1587 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, 1588 __be32 daddr, __be32 saddr, const struct tcphdr *th) 1589 { 1590 struct tcp_sigpool hp; 1591 1592 if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp)) 1593 goto clear_hash_nostart; 1594 1595 if (crypto_ahash_init(hp.req)) 1596 goto clear_hash; 1597 if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, th->doff << 2)) 1598 goto clear_hash; 1599 if (tcp_md5_hash_key(&hp, key)) 1600 goto clear_hash; 1601 ahash_request_set_crypt(hp.req, NULL, md5_hash, 0); 1602 if (crypto_ahash_final(hp.req)) 1603 goto clear_hash; 1604 1605 tcp_sigpool_end(&hp); 1606 return 0; 1607 1608 clear_hash: 1609 tcp_sigpool_end(&hp); 1610 clear_hash_nostart: 1611 memset(md5_hash, 0, 16); 1612 return 1; 1613 } 1614 1615 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, 1616 const struct sock *sk, 1617 const struct sk_buff *skb) 1618 { 1619 const struct tcphdr *th = tcp_hdr(skb); 1620 struct tcp_sigpool hp; 1621 __be32 saddr, daddr; 1622 1623 if (sk) { /* valid for establish/request sockets */ 1624 saddr = sk->sk_rcv_saddr; 1625 daddr = sk->sk_daddr; 1626 } else { 1627 const struct iphdr *iph = ip_hdr(skb); 1628 saddr = iph->saddr; 1629 daddr = iph->daddr; 1630 } 1631 1632 if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp)) 1633 goto clear_hash_nostart; 1634 1635 if (crypto_ahash_init(hp.req)) 1636 goto clear_hash; 1637 1638 if 
(tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, skb->len)) 1639 goto clear_hash; 1640 if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2)) 1641 goto clear_hash; 1642 if (tcp_md5_hash_key(&hp, key)) 1643 goto clear_hash; 1644 ahash_request_set_crypt(hp.req, NULL, md5_hash, 0); 1645 if (crypto_ahash_final(hp.req)) 1646 goto clear_hash; 1647 1648 tcp_sigpool_end(&hp); 1649 return 0; 1650 1651 clear_hash: 1652 tcp_sigpool_end(&hp); 1653 clear_hash_nostart: 1654 memset(md5_hash, 0, 16); 1655 return 1; 1656 } 1657 EXPORT_SYMBOL(tcp_v4_md5_hash_skb); 1658 1659 #endif 1660 1661 static void tcp_v4_init_req(struct request_sock *req, 1662 const struct sock *sk_listener, 1663 struct sk_buff *skb) 1664 { 1665 struct inet_request_sock *ireq = inet_rsk(req); 1666 struct net *net = sock_net(sk_listener); 1667 1668 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); 1669 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); 1670 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb)); 1671 } 1672 1673 static struct dst_entry *tcp_v4_route_req(const struct sock *sk, 1674 struct sk_buff *skb, 1675 struct flowi *fl, 1676 struct request_sock *req) 1677 { 1678 tcp_v4_init_req(req, sk, skb); 1679 1680 if (security_inet_conn_request(sk, skb, req)) 1681 return NULL; 1682 1683 return inet_csk_route_req(sk, &fl->u.ip4, req); 1684 } 1685 1686 struct request_sock_ops tcp_request_sock_ops __read_mostly = { 1687 .family = PF_INET, 1688 .obj_size = sizeof(struct tcp_request_sock), 1689 .rtx_syn_ack = tcp_rtx_synack, 1690 .send_ack = tcp_v4_reqsk_send_ack, 1691 .destructor = tcp_v4_reqsk_destructor, 1692 .send_reset = tcp_v4_send_reset, 1693 .syn_ack_timeout = tcp_syn_ack_timeout, 1694 }; 1695 1696 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { 1697 .mss_clamp = TCP_MSS_DEFAULT, 1698 #ifdef CONFIG_TCP_MD5SIG 1699 .req_md5_lookup = tcp_v4_md5_lookup, 1700 .calc_md5_hash = tcp_v4_md5_hash_skb, 1701 #endif 1702 #ifdef CONFIG_TCP_AO 1703 .ao_lookup = tcp_v4_ao_lookup_rsk, 1704 .ao_calc_key = tcp_v4_ao_calc_key_rsk, 1705 .ao_synack_hash = tcp_v4_ao_synack_hash, 1706 #endif 1707 #ifdef CONFIG_SYN_COOKIES 1708 .cookie_init_seq = cookie_v4_init_sequence, 1709 #endif 1710 .route_req = tcp_v4_route_req, 1711 .init_seq = tcp_v4_init_seq, 1712 .init_ts_off = tcp_v4_init_ts_off, 1713 .send_synack = tcp_v4_send_synack, 1714 }; 1715 1716 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) 1717 { 1718 /* Never answer to SYNs send to broadcast or multicast */ 1719 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) 1720 goto drop; 1721 1722 return tcp_conn_request(&tcp_request_sock_ops, 1723 &tcp_request_sock_ipv4_ops, sk, skb); 1724 1725 drop: 1726 tcp_listendrop(sk); 1727 return 0; 1728 } 1729 EXPORT_SYMBOL(tcp_v4_conn_request); 1730 1731 1732 /* 1733 * The three way handshake has completed - we got a valid synack - 1734 * now create the new socket. 
1735 */ 1736 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, 1737 struct request_sock *req, 1738 struct dst_entry *dst, 1739 struct request_sock *req_unhash, 1740 bool *own_req) 1741 { 1742 struct inet_request_sock *ireq; 1743 bool found_dup_sk = false; 1744 struct inet_sock *newinet; 1745 struct tcp_sock *newtp; 1746 struct sock *newsk; 1747 #ifdef CONFIG_TCP_MD5SIG 1748 const union tcp_md5_addr *addr; 1749 struct tcp_md5sig_key *key; 1750 int l3index; 1751 #endif 1752 struct ip_options_rcu *inet_opt; 1753 1754 if (sk_acceptq_is_full(sk)) 1755 goto exit_overflow; 1756 1757 newsk = tcp_create_openreq_child(sk, req, skb); 1758 if (!newsk) 1759 goto exit_nonewsk; 1760 1761 newsk->sk_gso_type = SKB_GSO_TCPV4; 1762 inet_sk_rx_dst_set(newsk, skb); 1763 1764 newtp = tcp_sk(newsk); 1765 newinet = inet_sk(newsk); 1766 ireq = inet_rsk(req); 1767 sk_daddr_set(newsk, ireq->ir_rmt_addr); 1768 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr); 1769 newsk->sk_bound_dev_if = ireq->ir_iif; 1770 newinet->inet_saddr = ireq->ir_loc_addr; 1771 inet_opt = rcu_dereference(ireq->ireq_opt); 1772 RCU_INIT_POINTER(newinet->inet_opt, inet_opt); 1773 newinet->mc_index = inet_iif(skb); 1774 newinet->mc_ttl = ip_hdr(skb)->ttl; 1775 newinet->rcv_tos = ip_hdr(skb)->tos; 1776 inet_csk(newsk)->icsk_ext_hdr_len = 0; 1777 if (inet_opt) 1778 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 1779 atomic_set(&newinet->inet_id, get_random_u16()); 1780 1781 /* Set ToS of the new socket based upon the value of incoming SYN. 1782 * ECT bits are set later in tcp_init_transfer(). 1783 */ 1784 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)) 1785 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK; 1786 1787 if (!dst) { 1788 dst = inet_csk_route_child_sock(sk, newsk, req); 1789 if (!dst) 1790 goto put_and_exit; 1791 } else { 1792 /* syncookie case : see end of cookie_v4_check() */ 1793 } 1794 sk_setup_caps(newsk, dst); 1795 1796 tcp_ca_openreq_child(newsk, dst); 1797 1798 tcp_sync_mss(newsk, dst_mtu(dst)); 1799 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst)); 1800 1801 tcp_initialize_rcv_mss(newsk); 1802 1803 #ifdef CONFIG_TCP_MD5SIG 1804 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif); 1805 /* Copy over the MD5 key from the original socket */ 1806 addr = (union tcp_md5_addr *)&newinet->inet_daddr; 1807 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1808 if (key && !tcp_rsk_used_ao(req)) { 1809 if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key)) 1810 goto put_and_exit; 1811 sk_gso_disable(newsk); 1812 } 1813 #endif 1814 #ifdef CONFIG_TCP_AO 1815 if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET)) 1816 goto put_and_exit; /* OOM, release back memory */ 1817 #endif 1818 1819 if (__inet_inherit_port(sk, newsk) < 0) 1820 goto put_and_exit; 1821 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), 1822 &found_dup_sk); 1823 if (likely(*own_req)) { 1824 tcp_move_syn(newtp, req); 1825 ireq->ireq_opt = NULL; 1826 } else { 1827 newinet->inet_opt = NULL; 1828 1829 if (!req_unhash && found_dup_sk) { 1830 /* This code path should only be executed in the 1831 * syncookie case only 1832 */ 1833 bh_unlock_sock(newsk); 1834 sock_put(newsk); 1835 newsk = NULL; 1836 } 1837 } 1838 return newsk; 1839 1840 exit_overflow: 1841 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); 1842 exit_nonewsk: 1843 dst_release(dst); 1844 exit: 1845 tcp_listendrop(sk); 1846 return NULL; 1847 put_and_exit: 1848 newinet->inet_opt = NULL; 1849 
inet_csk_prepare_forced_close(newsk); 1850 tcp_done(newsk); 1851 goto exit; 1852 } 1853 EXPORT_SYMBOL(tcp_v4_syn_recv_sock); 1854 1855 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb) 1856 { 1857 #ifdef CONFIG_SYN_COOKIES 1858 const struct tcphdr *th = tcp_hdr(skb); 1859 1860 if (!th->syn) 1861 sk = cookie_v4_check(sk, skb); 1862 #endif 1863 return sk; 1864 } 1865 1866 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph, 1867 struct tcphdr *th, u32 *cookie) 1868 { 1869 u16 mss = 0; 1870 #ifdef CONFIG_SYN_COOKIES 1871 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops, 1872 &tcp_request_sock_ipv4_ops, sk, th); 1873 if (mss) { 1874 *cookie = __cookie_v4_init_sequence(iph, th, &mss); 1875 tcp_synq_overflow(sk); 1876 } 1877 #endif 1878 return mss; 1879 } 1880 1881 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, 1882 u32)); 1883 /* The socket must have it's spinlock held when we get 1884 * here, unless it is a TCP_LISTEN socket. 1885 * 1886 * We have a potential double-lock case here, so even when 1887 * doing backlog processing we use the BH locking scheme. 1888 * This is because we cannot sleep with the original spinlock 1889 * held. 1890 */ 1891 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) 1892 { 1893 enum skb_drop_reason reason; 1894 struct sock *rsk; 1895 1896 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ 1897 struct dst_entry *dst; 1898 1899 dst = rcu_dereference_protected(sk->sk_rx_dst, 1900 lockdep_sock_is_held(sk)); 1901 1902 sock_rps_save_rxhash(sk, skb); 1903 sk_mark_napi_id(sk, skb); 1904 if (dst) { 1905 if (sk->sk_rx_dst_ifindex != skb->skb_iif || 1906 !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check, 1907 dst, 0)) { 1908 RCU_INIT_POINTER(sk->sk_rx_dst, NULL); 1909 dst_release(dst); 1910 } 1911 } 1912 tcp_rcv_established(sk, skb); 1913 return 0; 1914 } 1915 1916 if (tcp_checksum_complete(skb)) 1917 goto csum_err; 1918 1919 if (sk->sk_state == TCP_LISTEN) { 1920 struct sock *nsk = tcp_v4_cookie_check(sk, skb); 1921 1922 if (!nsk) 1923 return 0; 1924 if (nsk != sk) { 1925 reason = tcp_child_process(sk, nsk, skb); 1926 if (reason) { 1927 rsk = nsk; 1928 goto reset; 1929 } 1930 return 0; 1931 } 1932 } else 1933 sock_rps_save_rxhash(sk, skb); 1934 1935 reason = tcp_rcv_state_process(sk, skb); 1936 if (reason) { 1937 rsk = sk; 1938 goto reset; 1939 } 1940 return 0; 1941 1942 reset: 1943 tcp_v4_send_reset(rsk, skb); 1944 discard: 1945 kfree_skb_reason(skb, reason); 1946 /* Be careful here. If this function gets more complicated and 1947 * gcc suffers from register pressure on the x86, sk (in %ebx) 1948 * might be destroyed here. This current version compiles correctly, 1949 * but you have been warned. 
1950 */ 1951 return 0; 1952 1953 csum_err: 1954 reason = SKB_DROP_REASON_TCP_CSUM; 1955 trace_tcp_bad_csum(skb); 1956 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 1957 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 1958 goto discard; 1959 } 1960 EXPORT_SYMBOL(tcp_v4_do_rcv); 1961 1962 int tcp_v4_early_demux(struct sk_buff *skb) 1963 { 1964 struct net *net = dev_net(skb->dev); 1965 const struct iphdr *iph; 1966 const struct tcphdr *th; 1967 struct sock *sk; 1968 1969 if (skb->pkt_type != PACKET_HOST) 1970 return 0; 1971 1972 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) 1973 return 0; 1974 1975 iph = ip_hdr(skb); 1976 th = tcp_hdr(skb); 1977 1978 if (th->doff < sizeof(struct tcphdr) / 4) 1979 return 0; 1980 1981 sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, 1982 iph->saddr, th->source, 1983 iph->daddr, ntohs(th->dest), 1984 skb->skb_iif, inet_sdif(skb)); 1985 if (sk) { 1986 skb->sk = sk; 1987 skb->destructor = sock_edemux; 1988 if (sk_fullsock(sk)) { 1989 struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst); 1990 1991 if (dst) 1992 dst = dst_check(dst, 0); 1993 if (dst && 1994 sk->sk_rx_dst_ifindex == skb->skb_iif) 1995 skb_dst_set_noref(skb, dst); 1996 } 1997 } 1998 return 0; 1999 } 2000 2001 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, 2002 enum skb_drop_reason *reason) 2003 { 2004 u32 limit, tail_gso_size, tail_gso_segs; 2005 struct skb_shared_info *shinfo; 2006 const struct tcphdr *th; 2007 struct tcphdr *thtail; 2008 struct sk_buff *tail; 2009 unsigned int hdrlen; 2010 bool fragstolen; 2011 u32 gso_segs; 2012 u32 gso_size; 2013 int delta; 2014 2015 /* In case all data was pulled from skb frags (in __pskb_pull_tail()), 2016 * we can fix skb->truesize to its real value to avoid future drops. 2017 * This is valid because skb is not yet charged to the socket. 2018 * It has been noticed pure SACK packets were sometimes dropped 2019 * (if cooked by drivers without copybreak feature). 2020 */ 2021 skb_condense(skb); 2022 2023 skb_dst_drop(skb); 2024 2025 if (unlikely(tcp_checksum_complete(skb))) { 2026 bh_unlock_sock(sk); 2027 trace_tcp_bad_csum(skb); 2028 *reason = SKB_DROP_REASON_TCP_CSUM; 2029 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 2030 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 2031 return true; 2032 } 2033 2034 /* Attempt coalescing to last skb in backlog, even if we are 2035 * above the limits. 2036 * This is okay because skb capacity is limited to MAX_SKB_FRAGS. 
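 * A segment is only merged when it immediately follows the current tail
 * (tail end_seq == skb seq), carries the same DSCP/ECN byte and the very
 * same TCP options, both skbs have ACK set, neither has SYN, RST or URG,
 * and their ECE/CWR bits agree. The merged skb then keeps the larger
 * gso_size and the summed gso_segs, e.g. two back-to-back 1460 byte
 * segments end up as one skb with gso_segs == 2 and gso_size == 1460.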
2037 */ 2038 th = (const struct tcphdr *)skb->data; 2039 hdrlen = th->doff * 4; 2040 2041 tail = sk->sk_backlog.tail; 2042 if (!tail) 2043 goto no_coalesce; 2044 thtail = (struct tcphdr *)tail->data; 2045 2046 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq || 2047 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield || 2048 ((TCP_SKB_CB(tail)->tcp_flags | 2049 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) || 2050 !((TCP_SKB_CB(tail)->tcp_flags & 2051 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) || 2052 ((TCP_SKB_CB(tail)->tcp_flags ^ 2053 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) || 2054 #ifdef CONFIG_TLS_DEVICE 2055 tail->decrypted != skb->decrypted || 2056 #endif 2057 !mptcp_skb_can_collapse(tail, skb) || 2058 thtail->doff != th->doff || 2059 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th))) 2060 goto no_coalesce; 2061 2062 __skb_pull(skb, hdrlen); 2063 2064 shinfo = skb_shinfo(skb); 2065 gso_size = shinfo->gso_size ?: skb->len; 2066 gso_segs = shinfo->gso_segs ?: 1; 2067 2068 shinfo = skb_shinfo(tail); 2069 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen); 2070 tail_gso_segs = shinfo->gso_segs ?: 1; 2071 2072 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) { 2073 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq; 2074 2075 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) { 2076 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq; 2077 thtail->window = th->window; 2078 } 2079 2080 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and 2081 * thtail->fin, so that the fast path in tcp_rcv_established() 2082 * is not entered if we append a packet with a FIN. 2083 * SYN, RST, URG are not present. 2084 * ACK is set on both packets. 2085 * PSH : we do not really care in TCP stack, 2086 * at least for 'GRO' packets. 2087 */ 2088 thtail->fin |= th->fin; 2089 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; 2090 2091 if (TCP_SKB_CB(skb)->has_rxtstamp) { 2092 TCP_SKB_CB(tail)->has_rxtstamp = true; 2093 tail->tstamp = skb->tstamp; 2094 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp; 2095 } 2096 2097 /* Not as strict as GRO. We only need to carry mss max value */ 2098 shinfo->gso_size = max(gso_size, tail_gso_size); 2099 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF); 2100 2101 sk->sk_backlog.len += delta; 2102 __NET_INC_STATS(sock_net(sk), 2103 LINUX_MIB_TCPBACKLOGCOALESCE); 2104 kfree_skb_partial(skb, fragstolen); 2105 return false; 2106 } 2107 __skb_push(skb, hdrlen); 2108 2109 no_coalesce: 2110 limit = (u32)READ_ONCE(sk->sk_rcvbuf) + (u32)(READ_ONCE(sk->sk_sndbuf) >> 1); 2111 2112 /* Only socket owner can try to collapse/prune rx queues 2113 * to reduce memory overhead, so add a little headroom here. 2114 * Few sockets backlog are possibly concurrently non empty. 
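 * With the default tcp_rmem[1] (128 KB) and tcp_wmem[1] (16 KB) the
 * resulting limit is roughly 128K + 8K + 64K, i.e. about 200 KB per
 * socket, before receive buffer autotuning grows sk_rcvbuf.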
2115 */ 2116 limit += 64 * 1024; 2117 2118 if (unlikely(sk_add_backlog(sk, skb, limit))) { 2119 bh_unlock_sock(sk); 2120 *reason = SKB_DROP_REASON_SOCKET_BACKLOG; 2121 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP); 2122 return true; 2123 } 2124 return false; 2125 } 2126 EXPORT_SYMBOL(tcp_add_backlog); 2127 2128 int tcp_filter(struct sock *sk, struct sk_buff *skb) 2129 { 2130 struct tcphdr *th = (struct tcphdr *)skb->data; 2131 2132 return sk_filter_trim_cap(sk, skb, th->doff * 4); 2133 } 2134 EXPORT_SYMBOL(tcp_filter); 2135 2136 static void tcp_v4_restore_cb(struct sk_buff *skb) 2137 { 2138 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4, 2139 sizeof(struct inet_skb_parm)); 2140 } 2141 2142 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, 2143 const struct tcphdr *th) 2144 { 2145 /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB() 2146 * barrier() makes sure compiler wont play fool^Waliasing games. 2147 */ 2148 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb), 2149 sizeof(struct inet_skb_parm)); 2150 barrier(); 2151 2152 TCP_SKB_CB(skb)->seq = ntohl(th->seq); 2153 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + 2154 skb->len - th->doff * 4); 2155 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); 2156 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); 2157 TCP_SKB_CB(skb)->tcp_tw_isn = 0; 2158 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); 2159 TCP_SKB_CB(skb)->sacked = 0; 2160 TCP_SKB_CB(skb)->has_rxtstamp = 2161 skb->tstamp || skb_hwtstamps(skb)->hwtstamp; 2162 } 2163 2164 /* 2165 * From tcp_input.c 2166 */ 2167 2168 int tcp_v4_rcv(struct sk_buff *skb) 2169 { 2170 struct net *net = dev_net(skb->dev); 2171 enum skb_drop_reason drop_reason; 2172 int sdif = inet_sdif(skb); 2173 int dif = inet_iif(skb); 2174 const struct iphdr *iph; 2175 const struct tcphdr *th; 2176 bool refcounted; 2177 struct sock *sk; 2178 int ret; 2179 2180 drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; 2181 if (skb->pkt_type != PACKET_HOST) 2182 goto discard_it; 2183 2184 /* Count it even if it's bad */ 2185 __TCP_INC_STATS(net, TCP_MIB_INSEGS); 2186 2187 if (!pskb_may_pull(skb, sizeof(struct tcphdr))) 2188 goto discard_it; 2189 2190 th = (const struct tcphdr *)skb->data; 2191 2192 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) { 2193 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL; 2194 goto bad_packet; 2195 } 2196 if (!pskb_may_pull(skb, th->doff * 4)) 2197 goto discard_it; 2198 2199 /* An explanation is required here, I think. 2200 * Packet length and doff are validated by header prediction, 2201 * provided case of th->doff==0 is eliminated. 2202 * So, we defer the checks. 
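 * (A header advertising th->doff < 5 words was already rejected above,
 * so __tcp_hdrlen() is trustworthy by the time the socket lookup runs.)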
*/ 2203 2204 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo)) 2205 goto csum_error; 2206 2207 th = (const struct tcphdr *)skb->data; 2208 iph = ip_hdr(skb); 2209 lookup: 2210 sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo, 2211 skb, __tcp_hdrlen(th), th->source, 2212 th->dest, sdif, &refcounted); 2213 if (!sk) 2214 goto no_tcp_socket; 2215 2216 process: 2217 if (sk->sk_state == TCP_TIME_WAIT) 2218 goto do_time_wait; 2219 2220 if (sk->sk_state == TCP_NEW_SYN_RECV) { 2221 struct request_sock *req = inet_reqsk(sk); 2222 bool req_stolen = false; 2223 struct sock *nsk; 2224 2225 sk = req->rsk_listener; 2226 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) 2227 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2228 else 2229 drop_reason = tcp_inbound_hash(sk, req, skb, 2230 &iph->saddr, &iph->daddr, 2231 AF_INET, dif, sdif); 2232 if (unlikely(drop_reason)) { 2233 sk_drops_add(sk, skb); 2234 reqsk_put(req); 2235 goto discard_it; 2236 } 2237 if (tcp_checksum_complete(skb)) { 2238 reqsk_put(req); 2239 goto csum_error; 2240 } 2241 if (unlikely(sk->sk_state != TCP_LISTEN)) { 2242 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb); 2243 if (!nsk) { 2244 inet_csk_reqsk_queue_drop_and_put(sk, req); 2245 goto lookup; 2246 } 2247 sk = nsk; 2248 /* reuseport_migrate_sock() has already held one sk_refcnt 2249 * before returning. 2250 */ 2251 } else { 2252 /* We own a reference on the listener, increase it again 2253 * as we might lose it too soon. 2254 */ 2255 sock_hold(sk); 2256 } 2257 refcounted = true; 2258 nsk = NULL; 2259 if (!tcp_filter(sk, skb)) { 2260 th = (const struct tcphdr *)skb->data; 2261 iph = ip_hdr(skb); 2262 tcp_v4_fill_cb(skb, iph, th); 2263 nsk = tcp_check_req(sk, skb, req, false, &req_stolen); 2264 } else { 2265 drop_reason = SKB_DROP_REASON_SOCKET_FILTER; 2266 } 2267 if (!nsk) { 2268 reqsk_put(req); 2269 if (req_stolen) { 2270 /* Another cpu got exclusive access to req 2271 * and created a full blown socket. 2272 * Try to feed this packet to this socket 2273 * instead of discarding it. 
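 * tcp_v4_restore_cb() undoes tcp_v4_fill_cb() so that IPCB(skb) is
 * intact again before the lookup is retried from scratch.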
2274 */ 2275 tcp_v4_restore_cb(skb); 2276 sock_put(sk); 2277 goto lookup; 2278 } 2279 goto discard_and_relse; 2280 } 2281 nf_reset_ct(skb); 2282 if (nsk == sk) { 2283 reqsk_put(req); 2284 tcp_v4_restore_cb(skb); 2285 } else { 2286 drop_reason = tcp_child_process(sk, nsk, skb); 2287 if (drop_reason) { 2288 tcp_v4_send_reset(nsk, skb); 2289 goto discard_and_relse; 2290 } 2291 sock_put(sk); 2292 return 0; 2293 } 2294 } 2295 2296 if (static_branch_unlikely(&ip4_min_ttl)) { 2297 /* min_ttl can be changed concurrently from do_ip_setsockopt() */ 2298 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) { 2299 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 2300 drop_reason = SKB_DROP_REASON_TCP_MINTTL; 2301 goto discard_and_relse; 2302 } 2303 } 2304 2305 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { 2306 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2307 goto discard_and_relse; 2308 } 2309 2310 drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr, 2311 AF_INET, dif, sdif); 2312 if (drop_reason) 2313 goto discard_and_relse; 2314 2315 nf_reset_ct(skb); 2316 2317 if (tcp_filter(sk, skb)) { 2318 drop_reason = SKB_DROP_REASON_SOCKET_FILTER; 2319 goto discard_and_relse; 2320 } 2321 th = (const struct tcphdr *)skb->data; 2322 iph = ip_hdr(skb); 2323 tcp_v4_fill_cb(skb, iph, th); 2324 2325 skb->dev = NULL; 2326 2327 if (sk->sk_state == TCP_LISTEN) { 2328 ret = tcp_v4_do_rcv(sk, skb); 2329 goto put_and_return; 2330 } 2331 2332 sk_incoming_cpu_update(sk); 2333 2334 bh_lock_sock_nested(sk); 2335 tcp_segs_in(tcp_sk(sk), skb); 2336 ret = 0; 2337 if (!sock_owned_by_user(sk)) { 2338 ret = tcp_v4_do_rcv(sk, skb); 2339 } else { 2340 if (tcp_add_backlog(sk, skb, &drop_reason)) 2341 goto discard_and_relse; 2342 } 2343 bh_unlock_sock(sk); 2344 2345 put_and_return: 2346 if (refcounted) 2347 sock_put(sk); 2348 2349 return ret; 2350 2351 no_tcp_socket: 2352 drop_reason = SKB_DROP_REASON_NO_SOCKET; 2353 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) 2354 goto discard_it; 2355 2356 tcp_v4_fill_cb(skb, iph, th); 2357 2358 if (tcp_checksum_complete(skb)) { 2359 csum_error: 2360 drop_reason = SKB_DROP_REASON_TCP_CSUM; 2361 trace_tcp_bad_csum(skb); 2362 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); 2363 bad_packet: 2364 __TCP_INC_STATS(net, TCP_MIB_INERRS); 2365 } else { 2366 tcp_v4_send_reset(NULL, skb); 2367 } 2368 2369 discard_it: 2370 SKB_DR_OR(drop_reason, NOT_SPECIFIED); 2371 /* Discard frame. 
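 * Most paths set a precise drop_reason before jumping here; the
 * SKB_DR_OR() above only fills in NOT_SPECIFIED as a fallback.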
*/ 2372 kfree_skb_reason(skb, drop_reason); 2373 return 0; 2374 2375 discard_and_relse: 2376 sk_drops_add(sk, skb); 2377 if (refcounted) 2378 sock_put(sk); 2379 goto discard_it; 2380 2381 do_time_wait: 2382 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { 2383 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2384 inet_twsk_put(inet_twsk(sk)); 2385 goto discard_it; 2386 } 2387 2388 tcp_v4_fill_cb(skb, iph, th); 2389 2390 if (tcp_checksum_complete(skb)) { 2391 inet_twsk_put(inet_twsk(sk)); 2392 goto csum_error; 2393 } 2394 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) { 2395 case TCP_TW_SYN: { 2396 struct sock *sk2 = inet_lookup_listener(net, 2397 net->ipv4.tcp_death_row.hashinfo, 2398 skb, __tcp_hdrlen(th), 2399 iph->saddr, th->source, 2400 iph->daddr, th->dest, 2401 inet_iif(skb), 2402 sdif); 2403 if (sk2) { 2404 inet_twsk_deschedule_put(inet_twsk(sk)); 2405 sk = sk2; 2406 tcp_v4_restore_cb(skb); 2407 refcounted = false; 2408 goto process; 2409 } 2410 } 2411 /* to ACK */ 2412 fallthrough; 2413 case TCP_TW_ACK: 2414 tcp_v4_timewait_ack(sk, skb); 2415 break; 2416 case TCP_TW_RST: 2417 tcp_v4_send_reset(sk, skb); 2418 inet_twsk_deschedule_put(inet_twsk(sk)); 2419 goto discard_it; 2420 case TCP_TW_SUCCESS:; 2421 } 2422 goto discard_it; 2423 } 2424 2425 static struct timewait_sock_ops tcp_timewait_sock_ops = { 2426 .twsk_obj_size = sizeof(struct tcp_timewait_sock), 2427 .twsk_unique = tcp_twsk_unique, 2428 .twsk_destructor= tcp_twsk_destructor, 2429 }; 2430 2431 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) 2432 { 2433 struct dst_entry *dst = skb_dst(skb); 2434 2435 if (dst && dst_hold_safe(dst)) { 2436 rcu_assign_pointer(sk->sk_rx_dst, dst); 2437 sk->sk_rx_dst_ifindex = skb->skb_iif; 2438 } 2439 } 2440 EXPORT_SYMBOL(inet_sk_rx_dst_set); 2441 2442 const struct inet_connection_sock_af_ops ipv4_specific = { 2443 .queue_xmit = ip_queue_xmit, 2444 .send_check = tcp_v4_send_check, 2445 .rebuild_header = inet_sk_rebuild_header, 2446 .sk_rx_dst_set = inet_sk_rx_dst_set, 2447 .conn_request = tcp_v4_conn_request, 2448 .syn_recv_sock = tcp_v4_syn_recv_sock, 2449 .net_header_len = sizeof(struct iphdr), 2450 .setsockopt = ip_setsockopt, 2451 .getsockopt = ip_getsockopt, 2452 .addr2sockaddr = inet_csk_addr2sockaddr, 2453 .sockaddr_len = sizeof(struct sockaddr_in), 2454 .mtu_reduced = tcp_v4_mtu_reduced, 2455 }; 2456 EXPORT_SYMBOL(ipv4_specific); 2457 2458 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) 2459 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { 2460 #ifdef CONFIG_TCP_MD5SIG 2461 .md5_lookup = tcp_v4_md5_lookup, 2462 .calc_md5_hash = tcp_v4_md5_hash_skb, 2463 .md5_parse = tcp_v4_parse_md5_keys, 2464 #endif 2465 #ifdef CONFIG_TCP_AO 2466 .ao_lookup = tcp_v4_ao_lookup, 2467 .calc_ao_hash = tcp_v4_ao_hash_skb, 2468 .ao_parse = tcp_v4_parse_ao, 2469 .ao_calc_key_sk = tcp_v4_ao_calc_key_sk, 2470 #endif 2471 }; 2472 #endif 2473 2474 /* NOTE: A lot of things set to zero explicitly by call to 2475 * sk_alloc() so need not be done here. 
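 * tcp_v4_init_sock() below therefore only has to hook up the IPv4
 * af_ops (and the MD5/AO ops); everything protocol-generic is done in
 * tcp_init_sock().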
2476 */ 2477 static int tcp_v4_init_sock(struct sock *sk) 2478 { 2479 struct inet_connection_sock *icsk = inet_csk(sk); 2480 2481 tcp_init_sock(sk); 2482 2483 icsk->icsk_af_ops = &ipv4_specific; 2484 2485 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) 2486 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific; 2487 #endif 2488 2489 return 0; 2490 } 2491 2492 #ifdef CONFIG_TCP_MD5SIG 2493 static void tcp_md5sig_info_free_rcu(struct rcu_head *head) 2494 { 2495 struct tcp_md5sig_info *md5sig; 2496 2497 md5sig = container_of(head, struct tcp_md5sig_info, rcu); 2498 kfree(md5sig); 2499 static_branch_slow_dec_deferred(&tcp_md5_needed); 2500 tcp_md5_release_sigpool(); 2501 } 2502 #endif 2503 2504 void tcp_v4_destroy_sock(struct sock *sk) 2505 { 2506 struct tcp_sock *tp = tcp_sk(sk); 2507 2508 trace_tcp_destroy_sock(sk); 2509 2510 tcp_clear_xmit_timers(sk); 2511 2512 tcp_cleanup_congestion_control(sk); 2513 2514 tcp_cleanup_ulp(sk); 2515 2516 /* Cleanup up the write buffer. */ 2517 tcp_write_queue_purge(sk); 2518 2519 /* Check if we want to disable active TFO */ 2520 tcp_fastopen_active_disable_ofo_check(sk); 2521 2522 /* Cleans up our, hopefully empty, out_of_order_queue. */ 2523 skb_rbtree_purge(&tp->out_of_order_queue); 2524 2525 #ifdef CONFIG_TCP_MD5SIG 2526 /* Clean up the MD5 key list, if any */ 2527 if (tp->md5sig_info) { 2528 struct tcp_md5sig_info *md5sig; 2529 2530 md5sig = rcu_dereference_protected(tp->md5sig_info, 1); 2531 tcp_clear_md5_list(sk); 2532 call_rcu(&md5sig->rcu, tcp_md5sig_info_free_rcu); 2533 rcu_assign_pointer(tp->md5sig_info, NULL); 2534 } 2535 #endif 2536 tcp_ao_destroy_sock(sk, false); 2537 2538 /* Clean up a referenced TCP bind bucket. */ 2539 if (inet_csk(sk)->icsk_bind_hash) 2540 inet_put_port(sk); 2541 2542 BUG_ON(rcu_access_pointer(tp->fastopen_rsk)); 2543 2544 /* If socket is aborted during connect operation */ 2545 tcp_free_fastopen_req(tp); 2546 tcp_fastopen_destroy_cipher(sk); 2547 tcp_saved_syn_free(tp); 2548 2549 sk_sockets_allocated_dec(sk); 2550 } 2551 EXPORT_SYMBOL(tcp_v4_destroy_sock); 2552 2553 #ifdef CONFIG_PROC_FS 2554 /* Proc filesystem TCP sock list dumping. */ 2555 2556 static unsigned short seq_file_family(const struct seq_file *seq); 2557 2558 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk) 2559 { 2560 unsigned short family = seq_file_family(seq); 2561 2562 /* AF_UNSPEC is used as a match all */ 2563 return ((family == AF_UNSPEC || family == sk->sk_family) && 2564 net_eq(sock_net(sk), seq_file_net(seq))); 2565 } 2566 2567 /* Find a non empty bucket (starting from st->bucket) 2568 * and return the first sk from it. 2569 */ 2570 static void *listening_get_first(struct seq_file *seq) 2571 { 2572 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2573 struct tcp_iter_state *st = seq->private; 2574 2575 st->offset = 0; 2576 for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) { 2577 struct inet_listen_hashbucket *ilb2; 2578 struct hlist_nulls_node *node; 2579 struct sock *sk; 2580 2581 ilb2 = &hinfo->lhash2[st->bucket]; 2582 if (hlist_nulls_empty(&ilb2->nulls_head)) 2583 continue; 2584 2585 spin_lock(&ilb2->lock); 2586 sk_nulls_for_each(sk, node, &ilb2->nulls_head) { 2587 if (seq_sk_match(seq, sk)) 2588 return sk; 2589 } 2590 spin_unlock(&ilb2->lock); 2591 } 2592 2593 return NULL; 2594 } 2595 2596 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket). 
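 * The ilb2 bucket lock taken in listening_get_first() is still held when
 * we get here; it is only dropped once we move on to the next bucket.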
2597 * If "cur" is the last one in the st->bucket, 2598 * call listening_get_first() to return the first sk of the next 2599 * non empty bucket. 2600 */ 2601 static void *listening_get_next(struct seq_file *seq, void *cur) 2602 { 2603 struct tcp_iter_state *st = seq->private; 2604 struct inet_listen_hashbucket *ilb2; 2605 struct hlist_nulls_node *node; 2606 struct inet_hashinfo *hinfo; 2607 struct sock *sk = cur; 2608 2609 ++st->num; 2610 ++st->offset; 2611 2612 sk = sk_nulls_next(sk); 2613 sk_nulls_for_each_from(sk, node) { 2614 if (seq_sk_match(seq, sk)) 2615 return sk; 2616 } 2617 2618 hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2619 ilb2 = &hinfo->lhash2[st->bucket]; 2620 spin_unlock(&ilb2->lock); 2621 ++st->bucket; 2622 return listening_get_first(seq); 2623 } 2624 2625 static void *listening_get_idx(struct seq_file *seq, loff_t *pos) 2626 { 2627 struct tcp_iter_state *st = seq->private; 2628 void *rc; 2629 2630 st->bucket = 0; 2631 st->offset = 0; 2632 rc = listening_get_first(seq); 2633 2634 while (rc && *pos) { 2635 rc = listening_get_next(seq, rc); 2636 --*pos; 2637 } 2638 return rc; 2639 } 2640 2641 static inline bool empty_bucket(struct inet_hashinfo *hinfo, 2642 const struct tcp_iter_state *st) 2643 { 2644 return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain); 2645 } 2646 2647 /* 2648 * Get first established socket starting from bucket given in st->bucket. 2649 * If st->bucket is zero, the very first socket in the hash is returned. 2650 */ 2651 static void *established_get_first(struct seq_file *seq) 2652 { 2653 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2654 struct tcp_iter_state *st = seq->private; 2655 2656 st->offset = 0; 2657 for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) { 2658 struct sock *sk; 2659 struct hlist_nulls_node *node; 2660 spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket); 2661 2662 cond_resched(); 2663 2664 /* Lockless fast path for the common case of empty buckets */ 2665 if (empty_bucket(hinfo, st)) 2666 continue; 2667 2668 spin_lock_bh(lock); 2669 sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) { 2670 if (seq_sk_match(seq, sk)) 2671 return sk; 2672 } 2673 spin_unlock_bh(lock); 2674 } 2675 2676 return NULL; 2677 } 2678 2679 static void *established_get_next(struct seq_file *seq, void *cur) 2680 { 2681 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2682 struct tcp_iter_state *st = seq->private; 2683 struct hlist_nulls_node *node; 2684 struct sock *sk = cur; 2685 2686 ++st->num; 2687 ++st->offset; 2688 2689 sk = sk_nulls_next(sk); 2690 2691 sk_nulls_for_each_from(sk, node) { 2692 if (seq_sk_match(seq, sk)) 2693 return sk; 2694 } 2695 2696 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 2697 ++st->bucket; 2698 return established_get_first(seq); 2699 } 2700 2701 static void *established_get_idx(struct seq_file *seq, loff_t pos) 2702 { 2703 struct tcp_iter_state *st = seq->private; 2704 void *rc; 2705 2706 st->bucket = 0; 2707 rc = established_get_first(seq); 2708 2709 while (rc && pos) { 2710 rc = established_get_next(seq, rc); 2711 --pos; 2712 } 2713 return rc; 2714 } 2715 2716 static void *tcp_get_idx(struct seq_file *seq, loff_t pos) 2717 { 2718 void *rc; 2719 struct tcp_iter_state *st = seq->private; 2720 2721 st->state = TCP_SEQ_STATE_LISTENING; 2722 rc = listening_get_idx(seq, &pos); 2723 2724 if (!rc) { 2725 st->state = TCP_SEQ_STATE_ESTABLISHED; 2726 rc = established_get_idx(seq, pos); 2727 } 2728 2729 return rc; 2730 } 2731 2732 static void 
*tcp_seek_last_pos(struct seq_file *seq) 2733 { 2734 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2735 struct tcp_iter_state *st = seq->private; 2736 int bucket = st->bucket; 2737 int offset = st->offset; 2738 int orig_num = st->num; 2739 void *rc = NULL; 2740 2741 switch (st->state) { 2742 case TCP_SEQ_STATE_LISTENING: 2743 if (st->bucket > hinfo->lhash2_mask) 2744 break; 2745 rc = listening_get_first(seq); 2746 while (offset-- && rc && bucket == st->bucket) 2747 rc = listening_get_next(seq, rc); 2748 if (rc) 2749 break; 2750 st->bucket = 0; 2751 st->state = TCP_SEQ_STATE_ESTABLISHED; 2752 fallthrough; 2753 case TCP_SEQ_STATE_ESTABLISHED: 2754 if (st->bucket > hinfo->ehash_mask) 2755 break; 2756 rc = established_get_first(seq); 2757 while (offset-- && rc && bucket == st->bucket) 2758 rc = established_get_next(seq, rc); 2759 } 2760 2761 st->num = orig_num; 2762 2763 return rc; 2764 } 2765 2766 void *tcp_seq_start(struct seq_file *seq, loff_t *pos) 2767 { 2768 struct tcp_iter_state *st = seq->private; 2769 void *rc; 2770 2771 if (*pos && *pos == st->last_pos) { 2772 rc = tcp_seek_last_pos(seq); 2773 if (rc) 2774 goto out; 2775 } 2776 2777 st->state = TCP_SEQ_STATE_LISTENING; 2778 st->num = 0; 2779 st->bucket = 0; 2780 st->offset = 0; 2781 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; 2782 2783 out: 2784 st->last_pos = *pos; 2785 return rc; 2786 } 2787 EXPORT_SYMBOL(tcp_seq_start); 2788 2789 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2790 { 2791 struct tcp_iter_state *st = seq->private; 2792 void *rc = NULL; 2793 2794 if (v == SEQ_START_TOKEN) { 2795 rc = tcp_get_idx(seq, 0); 2796 goto out; 2797 } 2798 2799 switch (st->state) { 2800 case TCP_SEQ_STATE_LISTENING: 2801 rc = listening_get_next(seq, v); 2802 if (!rc) { 2803 st->state = TCP_SEQ_STATE_ESTABLISHED; 2804 st->bucket = 0; 2805 st->offset = 0; 2806 rc = established_get_first(seq); 2807 } 2808 break; 2809 case TCP_SEQ_STATE_ESTABLISHED: 2810 rc = established_get_next(seq, v); 2811 break; 2812 } 2813 out: 2814 ++*pos; 2815 st->last_pos = *pos; 2816 return rc; 2817 } 2818 EXPORT_SYMBOL(tcp_seq_next); 2819 2820 void tcp_seq_stop(struct seq_file *seq, void *v) 2821 { 2822 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2823 struct tcp_iter_state *st = seq->private; 2824 2825 switch (st->state) { 2826 case TCP_SEQ_STATE_LISTENING: 2827 if (v != SEQ_START_TOKEN) 2828 spin_unlock(&hinfo->lhash2[st->bucket].lock); 2829 break; 2830 case TCP_SEQ_STATE_ESTABLISHED: 2831 if (v) 2832 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 2833 break; 2834 } 2835 } 2836 EXPORT_SYMBOL(tcp_seq_stop); 2837 2838 static void get_openreq4(const struct request_sock *req, 2839 struct seq_file *f, int i) 2840 { 2841 const struct inet_request_sock *ireq = inet_rsk(req); 2842 long delta = req->rsk_timer.expires - jiffies; 2843 2844 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2845 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK", 2846 i, 2847 ireq->ir_loc_addr, 2848 ireq->ir_num, 2849 ireq->ir_rmt_addr, 2850 ntohs(ireq->ir_rmt_port), 2851 TCP_SYN_RECV, 2852 0, 0, /* could print option size, but that is af dependent. 
*/ 2853 1, /* timers active (only the expire timer) */ 2854 jiffies_delta_to_clock_t(delta), 2855 req->num_timeout, 2856 from_kuid_munged(seq_user_ns(f), 2857 sock_i_uid(req->rsk_listener)), 2858 0, /* non standard timer */ 2859 0, /* open_requests have no inode */ 2860 0, 2861 req); 2862 } 2863 2864 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) 2865 { 2866 int timer_active; 2867 unsigned long timer_expires; 2868 const struct tcp_sock *tp = tcp_sk(sk); 2869 const struct inet_connection_sock *icsk = inet_csk(sk); 2870 const struct inet_sock *inet = inet_sk(sk); 2871 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; 2872 __be32 dest = inet->inet_daddr; 2873 __be32 src = inet->inet_rcv_saddr; 2874 __u16 destp = ntohs(inet->inet_dport); 2875 __u16 srcp = ntohs(inet->inet_sport); 2876 int rx_queue; 2877 int state; 2878 2879 if (icsk->icsk_pending == ICSK_TIME_RETRANS || 2880 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || 2881 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { 2882 timer_active = 1; 2883 timer_expires = icsk->icsk_timeout; 2884 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { 2885 timer_active = 4; 2886 timer_expires = icsk->icsk_timeout; 2887 } else if (timer_pending(&sk->sk_timer)) { 2888 timer_active = 2; 2889 timer_expires = sk->sk_timer.expires; 2890 } else { 2891 timer_active = 0; 2892 timer_expires = jiffies; 2893 } 2894 2895 state = inet_sk_state_load(sk); 2896 if (state == TCP_LISTEN) 2897 rx_queue = READ_ONCE(sk->sk_ack_backlog); 2898 else 2899 /* Because we don't lock the socket, 2900 * we might find a transient negative value. 2901 */ 2902 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) - 2903 READ_ONCE(tp->copied_seq), 0); 2904 2905 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " 2906 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d", 2907 i, src, srcp, dest, destp, state, 2908 READ_ONCE(tp->write_seq) - tp->snd_una, 2909 rx_queue, 2910 timer_active, 2911 jiffies_delta_to_clock_t(timer_expires - jiffies), 2912 icsk->icsk_retransmits, 2913 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)), 2914 icsk->icsk_probes_out, 2915 sock_i_ino(sk), 2916 refcount_read(&sk->sk_refcnt), sk, 2917 jiffies_to_clock_t(icsk->icsk_rto), 2918 jiffies_to_clock_t(icsk->icsk_ack.ato), 2919 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk), 2920 tcp_snd_cwnd(tp), 2921 state == TCP_LISTEN ? 2922 fastopenq->max_qlen : 2923 (tcp_in_initial_slowstart(tp) ? 
-1 : tp->snd_ssthresh)); 2924 } 2925 2926 static void get_timewait4_sock(const struct inet_timewait_sock *tw, 2927 struct seq_file *f, int i) 2928 { 2929 long delta = tw->tw_timer.expires - jiffies; 2930 __be32 dest, src; 2931 __u16 destp, srcp; 2932 2933 dest = tw->tw_daddr; 2934 src = tw->tw_rcv_saddr; 2935 destp = ntohs(tw->tw_dport); 2936 srcp = ntohs(tw->tw_sport); 2937 2938 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2939 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK", 2940 i, src, srcp, dest, destp, tw->tw_substate, 0, 0, 2941 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, 2942 refcount_read(&tw->tw_refcnt), tw); 2943 } 2944 2945 #define TMPSZ 150 2946 2947 static int tcp4_seq_show(struct seq_file *seq, void *v) 2948 { 2949 struct tcp_iter_state *st; 2950 struct sock *sk = v; 2951 2952 seq_setwidth(seq, TMPSZ - 1); 2953 if (v == SEQ_START_TOKEN) { 2954 seq_puts(seq, " sl local_address rem_address st tx_queue " 2955 "rx_queue tr tm->when retrnsmt uid timeout " 2956 "inode"); 2957 goto out; 2958 } 2959 st = seq->private; 2960 2961 if (sk->sk_state == TCP_TIME_WAIT) 2962 get_timewait4_sock(v, seq, st->num); 2963 else if (sk->sk_state == TCP_NEW_SYN_RECV) 2964 get_openreq4(v, seq, st->num); 2965 else 2966 get_tcp4_sock(v, seq, st->num); 2967 out: 2968 seq_pad(seq, '\n'); 2969 return 0; 2970 } 2971 2972 #ifdef CONFIG_BPF_SYSCALL 2973 struct bpf_tcp_iter_state { 2974 struct tcp_iter_state state; 2975 unsigned int cur_sk; 2976 unsigned int end_sk; 2977 unsigned int max_sk; 2978 struct sock **batch; 2979 bool st_bucket_done; 2980 }; 2981 2982 struct bpf_iter__tcp { 2983 __bpf_md_ptr(struct bpf_iter_meta *, meta); 2984 __bpf_md_ptr(struct sock_common *, sk_common); 2985 uid_t uid __aligned(8); 2986 }; 2987 2988 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, 2989 struct sock_common *sk_common, uid_t uid) 2990 { 2991 struct bpf_iter__tcp ctx; 2992 2993 meta->seq_num--; /* skip SEQ_START_TOKEN */ 2994 ctx.meta = meta; 2995 ctx.sk_common = sk_common; 2996 ctx.uid = uid; 2997 return bpf_iter_run_prog(prog, &ctx); 2998 } 2999 3000 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter) 3001 { 3002 while (iter->cur_sk < iter->end_sk) 3003 sock_gen_put(iter->batch[iter->cur_sk++]); 3004 } 3005 3006 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter, 3007 unsigned int new_batch_sz) 3008 { 3009 struct sock **new_batch; 3010 3011 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, 3012 GFP_USER | __GFP_NOWARN); 3013 if (!new_batch) 3014 return -ENOMEM; 3015 3016 bpf_iter_tcp_put_batch(iter); 3017 kvfree(iter->batch); 3018 iter->batch = new_batch; 3019 iter->max_sk = new_batch_sz; 3020 3021 return 0; 3022 } 3023 3024 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq, 3025 struct sock *start_sk) 3026 { 3027 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 3028 struct bpf_tcp_iter_state *iter = seq->private; 3029 struct tcp_iter_state *st = &iter->state; 3030 struct hlist_nulls_node *node; 3031 unsigned int expected = 1; 3032 struct sock *sk; 3033 3034 sock_hold(start_sk); 3035 iter->batch[iter->end_sk++] = start_sk; 3036 3037 sk = sk_nulls_next(start_sk); 3038 sk_nulls_for_each_from(sk, node) { 3039 if (seq_sk_match(seq, sk)) { 3040 if (iter->end_sk < iter->max_sk) { 3041 sock_hold(sk); 3042 iter->batch[iter->end_sk++] = sk; 3043 } 3044 expected++; 3045 } 3046 } 3047 spin_unlock(&hinfo->lhash2[st->bucket].lock); 3048 3049 return expected; 3050 } 3051 3052 static unsigned int 
bpf_iter_tcp_established_batch(struct seq_file *seq, 3053 struct sock *start_sk) 3054 { 3055 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 3056 struct bpf_tcp_iter_state *iter = seq->private; 3057 struct tcp_iter_state *st = &iter->state; 3058 struct hlist_nulls_node *node; 3059 unsigned int expected = 1; 3060 struct sock *sk; 3061 3062 sock_hold(start_sk); 3063 iter->batch[iter->end_sk++] = start_sk; 3064 3065 sk = sk_nulls_next(start_sk); 3066 sk_nulls_for_each_from(sk, node) { 3067 if (seq_sk_match(seq, sk)) { 3068 if (iter->end_sk < iter->max_sk) { 3069 sock_hold(sk); 3070 iter->batch[iter->end_sk++] = sk; 3071 } 3072 expected++; 3073 } 3074 } 3075 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 3076 3077 return expected; 3078 } 3079 3080 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq) 3081 { 3082 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 3083 struct bpf_tcp_iter_state *iter = seq->private; 3084 struct tcp_iter_state *st = &iter->state; 3085 unsigned int expected; 3086 bool resized = false; 3087 struct sock *sk; 3088 3089 /* The st->bucket is done. Directly advance to the next 3090 * bucket instead of having the tcp_seek_last_pos() to skip 3091 * one by one in the current bucket and eventually find out 3092 * it has to advance to the next bucket. 3093 */ 3094 if (iter->st_bucket_done) { 3095 st->offset = 0; 3096 st->bucket++; 3097 if (st->state == TCP_SEQ_STATE_LISTENING && 3098 st->bucket > hinfo->lhash2_mask) { 3099 st->state = TCP_SEQ_STATE_ESTABLISHED; 3100 st->bucket = 0; 3101 } 3102 } 3103 3104 again: 3105 /* Get a new batch */ 3106 iter->cur_sk = 0; 3107 iter->end_sk = 0; 3108 iter->st_bucket_done = false; 3109 3110 sk = tcp_seek_last_pos(seq); 3111 if (!sk) 3112 return NULL; /* Done */ 3113 3114 if (st->state == TCP_SEQ_STATE_LISTENING) 3115 expected = bpf_iter_tcp_listening_batch(seq, sk); 3116 else 3117 expected = bpf_iter_tcp_established_batch(seq, sk); 3118 3119 if (iter->end_sk == expected) { 3120 iter->st_bucket_done = true; 3121 return sk; 3122 } 3123 3124 if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) { 3125 resized = true; 3126 goto again; 3127 } 3128 3129 return sk; 3130 } 3131 3132 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos) 3133 { 3134 /* bpf iter does not support lseek, so it always 3135 * continue from where it was stop()-ped. 3136 */ 3137 if (*pos) 3138 return bpf_iter_tcp_batch(seq); 3139 3140 return SEQ_START_TOKEN; 3141 } 3142 3143 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3144 { 3145 struct bpf_tcp_iter_state *iter = seq->private; 3146 struct tcp_iter_state *st = &iter->state; 3147 struct sock *sk; 3148 3149 /* Whenever seq_next() is called, the iter->cur_sk is 3150 * done with seq_show(), so advance to the next sk in 3151 * the batch. 3152 */ 3153 if (iter->cur_sk < iter->end_sk) { 3154 /* Keeping st->num consistent in tcp_iter_state. 3155 * bpf_iter_tcp does not use st->num. 3156 * meta.seq_num is used instead. 3157 */ 3158 st->num++; 3159 /* Move st->offset to the next sk in the bucket such that 3160 * the future start() will resume at st->offset in 3161 * st->bucket. See tcp_seek_last_pos(). 3162 */ 3163 st->offset++; 3164 sock_gen_put(iter->batch[iter->cur_sk++]); 3165 } 3166 3167 if (iter->cur_sk < iter->end_sk) 3168 sk = iter->batch[iter->cur_sk]; 3169 else 3170 sk = bpf_iter_tcp_batch(seq); 3171 3172 ++*pos; 3173 /* Keeping st->last_pos consistent in tcp_iter_state. 
3174 * bpf iter does not do lseek, so st->last_pos always equals to *pos. 3175 */ 3176 st->last_pos = *pos; 3177 return sk; 3178 } 3179 3180 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v) 3181 { 3182 struct bpf_iter_meta meta; 3183 struct bpf_prog *prog; 3184 struct sock *sk = v; 3185 uid_t uid; 3186 int ret; 3187 3188 if (v == SEQ_START_TOKEN) 3189 return 0; 3190 3191 if (sk_fullsock(sk)) 3192 lock_sock(sk); 3193 3194 if (unlikely(sk_unhashed(sk))) { 3195 ret = SEQ_SKIP; 3196 goto unlock; 3197 } 3198 3199 if (sk->sk_state == TCP_TIME_WAIT) { 3200 uid = 0; 3201 } else if (sk->sk_state == TCP_NEW_SYN_RECV) { 3202 const struct request_sock *req = v; 3203 3204 uid = from_kuid_munged(seq_user_ns(seq), 3205 sock_i_uid(req->rsk_listener)); 3206 } else { 3207 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); 3208 } 3209 3210 meta.seq = seq; 3211 prog = bpf_iter_get_info(&meta, false); 3212 ret = tcp_prog_seq_show(prog, &meta, v, uid); 3213 3214 unlock: 3215 if (sk_fullsock(sk)) 3216 release_sock(sk); 3217 return ret; 3218 3219 } 3220 3221 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v) 3222 { 3223 struct bpf_tcp_iter_state *iter = seq->private; 3224 struct bpf_iter_meta meta; 3225 struct bpf_prog *prog; 3226 3227 if (!v) { 3228 meta.seq = seq; 3229 prog = bpf_iter_get_info(&meta, true); 3230 if (prog) 3231 (void)tcp_prog_seq_show(prog, &meta, v, 0); 3232 } 3233 3234 if (iter->cur_sk < iter->end_sk) { 3235 bpf_iter_tcp_put_batch(iter); 3236 iter->st_bucket_done = false; 3237 } 3238 } 3239 3240 static const struct seq_operations bpf_iter_tcp_seq_ops = { 3241 .show = bpf_iter_tcp_seq_show, 3242 .start = bpf_iter_tcp_seq_start, 3243 .next = bpf_iter_tcp_seq_next, 3244 .stop = bpf_iter_tcp_seq_stop, 3245 }; 3246 #endif 3247 static unsigned short seq_file_family(const struct seq_file *seq) 3248 { 3249 const struct tcp_seq_afinfo *afinfo; 3250 3251 #ifdef CONFIG_BPF_SYSCALL 3252 /* Iterated from bpf_iter. Let the bpf prog to filter instead. */ 3253 if (seq->op == &bpf_iter_tcp_seq_ops) 3254 return AF_UNSPEC; 3255 #endif 3256 3257 /* Iterated from proc fs */ 3258 afinfo = pde_data(file_inode(seq->file)); 3259 return afinfo->family; 3260 } 3261 3262 static const struct seq_operations tcp4_seq_ops = { 3263 .show = tcp4_seq_show, 3264 .start = tcp_seq_start, 3265 .next = tcp_seq_next, 3266 .stop = tcp_seq_stop, 3267 }; 3268 3269 static struct tcp_seq_afinfo tcp4_seq_afinfo = { 3270 .family = AF_INET, 3271 }; 3272 3273 static int __net_init tcp4_proc_init_net(struct net *net) 3274 { 3275 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops, 3276 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo)) 3277 return -ENOMEM; 3278 return 0; 3279 } 3280 3281 static void __net_exit tcp4_proc_exit_net(struct net *net) 3282 { 3283 remove_proc_entry("tcp", net->proc_net); 3284 } 3285 3286 static struct pernet_operations tcp4_net_ops = { 3287 .init = tcp4_proc_init_net, 3288 .exit = tcp4_proc_exit_net, 3289 }; 3290 3291 int __init tcp4_proc_init(void) 3292 { 3293 return register_pernet_subsys(&tcp4_net_ops); 3294 } 3295 3296 void tcp4_proc_exit(void) 3297 { 3298 unregister_pernet_subsys(&tcp4_net_ops); 3299 } 3300 #endif /* CONFIG_PROC_FS */ 3301 3302 /* @wake is one when sk_stream_write_space() calls us. 3303 * This sends EPOLLOUT only if notsent_bytes is half the limit. 3304 * This mimics the strategy used in sock_def_write_space(). 
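 * For example, with TCP_NOTSENT_LOWAT set to 131072, plain polling
 * reports EPOLLOUT while less than 128 KB is unsent, whereas the
 * wake == 1 path only signals once the unsent backlog drops below 64 KB.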
3305 */ 3306 bool tcp_stream_memory_free(const struct sock *sk, int wake) 3307 { 3308 const struct tcp_sock *tp = tcp_sk(sk); 3309 u32 notsent_bytes = READ_ONCE(tp->write_seq) - 3310 READ_ONCE(tp->snd_nxt); 3311 3312 return (notsent_bytes << wake) < tcp_notsent_lowat(tp); 3313 } 3314 EXPORT_SYMBOL(tcp_stream_memory_free); 3315 3316 struct proto tcp_prot = { 3317 .name = "TCP", 3318 .owner = THIS_MODULE, 3319 .close = tcp_close, 3320 .pre_connect = tcp_v4_pre_connect, 3321 .connect = tcp_v4_connect, 3322 .disconnect = tcp_disconnect, 3323 .accept = inet_csk_accept, 3324 .ioctl = tcp_ioctl, 3325 .init = tcp_v4_init_sock, 3326 .destroy = tcp_v4_destroy_sock, 3327 .shutdown = tcp_shutdown, 3328 .setsockopt = tcp_setsockopt, 3329 .getsockopt = tcp_getsockopt, 3330 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt, 3331 .keepalive = tcp_set_keepalive, 3332 .recvmsg = tcp_recvmsg, 3333 .sendmsg = tcp_sendmsg, 3334 .splice_eof = tcp_splice_eof, 3335 .backlog_rcv = tcp_v4_do_rcv, 3336 .release_cb = tcp_release_cb, 3337 .hash = inet_hash, 3338 .unhash = inet_unhash, 3339 .get_port = inet_csk_get_port, 3340 .put_port = inet_put_port, 3341 #ifdef CONFIG_BPF_SYSCALL 3342 .psock_update_sk_prot = tcp_bpf_update_proto, 3343 #endif 3344 .enter_memory_pressure = tcp_enter_memory_pressure, 3345 .leave_memory_pressure = tcp_leave_memory_pressure, 3346 .stream_memory_free = tcp_stream_memory_free, 3347 .sockets_allocated = &tcp_sockets_allocated, 3348 .orphan_count = &tcp_orphan_count, 3349 3350 .memory_allocated = &tcp_memory_allocated, 3351 .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, 3352 3353 .memory_pressure = &tcp_memory_pressure, 3354 .sysctl_mem = sysctl_tcp_mem, 3355 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), 3356 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), 3357 .max_header = MAX_TCP_HEADER, 3358 .obj_size = sizeof(struct tcp_sock), 3359 .slab_flags = SLAB_TYPESAFE_BY_RCU, 3360 .twsk_prot = &tcp_timewait_sock_ops, 3361 .rsk_prot = &tcp_request_sock_ops, 3362 .h.hashinfo = NULL, 3363 .no_autobind = true, 3364 .diag_destroy = tcp_abort, 3365 }; 3366 EXPORT_SYMBOL(tcp_prot); 3367 3368 static void __net_exit tcp_sk_exit(struct net *net) 3369 { 3370 if (net->ipv4.tcp_congestion_control) 3371 bpf_module_put(net->ipv4.tcp_congestion_control, 3372 net->ipv4.tcp_congestion_control->owner); 3373 } 3374 3375 static void __net_init tcp_set_hashinfo(struct net *net) 3376 { 3377 struct inet_hashinfo *hinfo; 3378 unsigned int ehash_entries; 3379 struct net *old_net; 3380 3381 if (net_eq(net, &init_net)) 3382 goto fallback; 3383 3384 old_net = current->nsproxy->net_ns; 3385 ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries); 3386 if (!ehash_entries) 3387 goto fallback; 3388 3389 ehash_entries = roundup_pow_of_two(ehash_entries); 3390 hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries); 3391 if (!hinfo) { 3392 pr_warn("Failed to allocate TCP ehash (entries: %u) " 3393 "for a netns, fallback to the global one\n", 3394 ehash_entries); 3395 fallback: 3396 hinfo = &tcp_hashinfo; 3397 ehash_entries = tcp_hashinfo.ehash_mask + 1; 3398 } 3399 3400 net->ipv4.tcp_death_row.hashinfo = hinfo; 3401 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2; 3402 net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128); 3403 } 3404 3405 static int __net_init tcp_sk_init(struct net *net) 3406 { 3407 net->ipv4.sysctl_tcp_ecn = 2; 3408 net->ipv4.sysctl_tcp_ecn_fallback = 1; 3409 3410 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; 3411 
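/* Path MTU discovery defaults: TCP_BASE_MSS is the MSS that MTU-probing
 * blackhole detection restarts from, while sysctl_tcp_min_snd_mss bounds
 * how small an MSS we will accept via ICMP, so that forged PMTU messages
 * cannot force pathological segment sizes.
 */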
net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS; 3412 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; 3413 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL; 3414 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS; 3415 3416 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME; 3417 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES; 3418 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL; 3419 3420 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES; 3421 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES; 3422 net->ipv4.sysctl_tcp_syncookies = 1; 3423 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH; 3424 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1; 3425 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2; 3426 net->ipv4.sysctl_tcp_orphan_retries = 0; 3427 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; 3428 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; 3429 net->ipv4.sysctl_tcp_tw_reuse = 2; 3430 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1; 3431 3432 refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1); 3433 tcp_set_hashinfo(net); 3434 3435 net->ipv4.sysctl_tcp_sack = 1; 3436 net->ipv4.sysctl_tcp_window_scaling = 1; 3437 net->ipv4.sysctl_tcp_timestamps = 1; 3438 net->ipv4.sysctl_tcp_early_retrans = 3; 3439 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION; 3440 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */ 3441 net->ipv4.sysctl_tcp_retrans_collapse = 1; 3442 net->ipv4.sysctl_tcp_max_reordering = 300; 3443 net->ipv4.sysctl_tcp_dsack = 1; 3444 net->ipv4.sysctl_tcp_app_win = 31; 3445 net->ipv4.sysctl_tcp_adv_win_scale = 1; 3446 net->ipv4.sysctl_tcp_frto = 2; 3447 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1; 3448 /* This limits the percentage of the congestion window which we 3449 * will allow a single TSO frame to consume. Building TSO frames 3450 * which are too large can cause TCP streams to be bursty. 3451 */ 3452 net->ipv4.sysctl_tcp_tso_win_divisor = 3; 3453 /* Default TSQ limit of 16 TSO segments */ 3454 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536; 3455 3456 /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. 
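 * (INT_MAX effectively removes the global cap; lowering the sysctl
 * restores the old rate-limited behaviour.)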
*/ 3457 net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX; 3458 3459 net->ipv4.sysctl_tcp_min_tso_segs = 2; 3460 net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */ 3461 net->ipv4.sysctl_tcp_min_rtt_wlen = 300; 3462 net->ipv4.sysctl_tcp_autocorking = 1; 3463 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; 3464 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; 3465 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120; 3466 if (net != &init_net) { 3467 memcpy(net->ipv4.sysctl_tcp_rmem, 3468 init_net.ipv4.sysctl_tcp_rmem, 3469 sizeof(init_net.ipv4.sysctl_tcp_rmem)); 3470 memcpy(net->ipv4.sysctl_tcp_wmem, 3471 init_net.ipv4.sysctl_tcp_wmem, 3472 sizeof(init_net.ipv4.sysctl_tcp_wmem)); 3473 } 3474 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC; 3475 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC; 3476 net->ipv4.sysctl_tcp_comp_sack_nr = 44; 3477 net->ipv4.sysctl_tcp_backlog_ack_defer = 1; 3478 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; 3479 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0; 3480 atomic_set(&net->ipv4.tfo_active_disable_times, 0); 3481 3482 /* Set default values for PLB */ 3483 net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */ 3484 net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3; 3485 net->ipv4.sysctl_tcp_plb_rehash_rounds = 12; 3486 net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60; 3487 /* Default congestion threshold for PLB to mark a round is 50% */ 3488 net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2; 3489 3490 /* Reno is always built in */ 3491 if (!net_eq(net, &init_net) && 3492 bpf_try_module_get(init_net.ipv4.tcp_congestion_control, 3493 init_net.ipv4.tcp_congestion_control->owner)) 3494 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control; 3495 else 3496 net->ipv4.tcp_congestion_control = &tcp_reno; 3497 3498 net->ipv4.sysctl_tcp_syn_linear_timeouts = 4; 3499 net->ipv4.sysctl_tcp_shrink_window = 0; 3500 3501 net->ipv4.sysctl_tcp_pingpong_thresh = 1; 3502 3503 return 0; 3504 } 3505 3506 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) 3507 { 3508 struct net *net; 3509 3510 tcp_twsk_purge(net_exit_list, AF_INET); 3511 3512 list_for_each_entry(net, net_exit_list, exit_list) { 3513 inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo); 3514 WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount)); 3515 tcp_fastopen_ctx_destroy(net); 3516 } 3517 } 3518 3519 static struct pernet_operations __net_initdata tcp_sk_ops = { 3520 .init = tcp_sk_init, 3521 .exit = tcp_sk_exit, 3522 .exit_batch = tcp_sk_exit_batch, 3523 }; 3524 3525 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3526 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta, 3527 struct sock_common *sk_common, uid_t uid) 3528 3529 #define INIT_BATCH_SZ 16 3530 3531 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux) 3532 { 3533 struct bpf_tcp_iter_state *iter = priv_data; 3534 int err; 3535 3536 err = bpf_iter_init_seq_net(priv_data, aux); 3537 if (err) 3538 return err; 3539 3540 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ); 3541 if (err) { 3542 bpf_iter_fini_seq_net(priv_data); 3543 return err; 3544 } 3545 3546 return 0; 3547 } 3548 3549 static void bpf_iter_fini_tcp(void *priv_data) 3550 { 3551 struct bpf_tcp_iter_state *iter = priv_data; 3552 3553 bpf_iter_fini_seq_net(priv_data); 3554 kvfree(iter->batch); 3555 } 3556 3557 static const struct bpf_iter_seq_info tcp_seq_info = { 3558 .seq_ops = &bpf_iter_tcp_seq_ops, 3559 .init_seq_private = 
bpf_iter_init_tcp, 3560 .fini_seq_private = bpf_iter_fini_tcp, 3561 .seq_priv_size = sizeof(struct bpf_tcp_iter_state), 3562 }; 3563 3564 static const struct bpf_func_proto * 3565 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id, 3566 const struct bpf_prog *prog) 3567 { 3568 switch (func_id) { 3569 case BPF_FUNC_setsockopt: 3570 return &bpf_sk_setsockopt_proto; 3571 case BPF_FUNC_getsockopt: 3572 return &bpf_sk_getsockopt_proto; 3573 default: 3574 return NULL; 3575 } 3576 } 3577 3578 static struct bpf_iter_reg tcp_reg_info = { 3579 .target = "tcp", 3580 .ctx_arg_info_size = 1, 3581 .ctx_arg_info = { 3582 { offsetof(struct bpf_iter__tcp, sk_common), 3583 PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED }, 3584 }, 3585 .get_func_proto = bpf_iter_tcp_get_func_proto, 3586 .seq_info = &tcp_seq_info, 3587 }; 3588 3589 static void __init bpf_iter_register(void) 3590 { 3591 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON]; 3592 if (bpf_iter_reg_target(&tcp_reg_info)) 3593 pr_warn("Warning: could not register bpf iterator tcp\n"); 3594 } 3595 3596 #endif 3597 3598 void __init tcp_v4_init(void) 3599 { 3600 int cpu, res; 3601 3602 for_each_possible_cpu(cpu) { 3603 struct sock *sk; 3604 3605 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW, 3606 IPPROTO_TCP, &init_net); 3607 if (res) 3608 panic("Failed to create the TCP control socket.\n"); 3609 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); 3610 3611 /* Please enforce IP_DF and IPID==0 for RST and 3612 * ACK sent in SYN-RECV and TIME-WAIT state. 3613 */ 3614 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO; 3615 3616 per_cpu(ipv4_tcp_sk, cpu) = sk; 3617 } 3618 if (register_pernet_subsys(&tcp_sk_ops)) 3619 panic("Failed to create the TCP control socket.\n"); 3620 3621 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3622 bpf_iter_register(); 3623 #endif 3624 } 3625
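/*
 * For reference, one line of /proc/net/tcp as produced by tcp4_seq_show()
 * and get_tcp4_sock() above looks roughly like this for a socket listening
 * on port 8080:
 *
 *    0: 00000000:1F90 00000000:0000 0A 00000000:00000000 00:00000000 00000000  1000        0 18296 1 0000000000000000 100 0 0 10 0
 *
 * i.e. slot, hex local and remote address:port, state, tx/rx queue sizes,
 * timer state, retransmit count, uid, probe count, inode, refcount,
 * socket pointer, rto, ato, quick-ack/pingpong flags, cwnd and finally
 * ssthresh (or the TFO max_qlen for listeners).
 */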