1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * INET An implementation of the TCP/IP protocol suite for the LINUX 4 * operating system. INET is implemented using the BSD Socket 5 * interface as the means of communication with the user level. 6 * 7 * Implementation of the Transmission Control Protocol(TCP). 8 * 9 * IPv4 specific functions 10 * 11 * code split from: 12 * linux/ipv4/tcp.c 13 * linux/ipv4/tcp_input.c 14 * linux/ipv4/tcp_output.c 15 * 16 * See tcp.c for author information 17 */ 18 19 /* 20 * Changes: 21 * David S. Miller : New socket lookup architecture. 22 * This code is dedicated to John Dyson. 23 * David S. Miller : Change semantics of established hash, 24 * half is devoted to TIME_WAIT sockets 25 * and the rest go in the other half. 26 * Andi Kleen : Add support for syncookies and fixed 27 * some bugs: ip options weren't passed to 28 * the TCP layer, missed a check for an 29 * ACK bit. 30 * Andi Kleen : Implemented fast path mtu discovery. 31 * Fixed many serious bugs in the 32 * request_sock handling and moved 33 * most of it into the af independent code. 34 * Added tail drop and some other bugfixes. 35 * Added new listen semantics. 36 * Mike McLagan : Routing by source 37 * Juan Jose Ciarlante: ip_dynaddr bits 38 * Andi Kleen: various fixes. 39 * Vitaly E. Lavrov : Transparent proxy revived after year 40 * coma. 41 * Andi Kleen : Fix new listen. 42 * Andi Kleen : Fix accept error reporting. 43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which 44 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind 45 * a single port at the same time. 46 */ 47 48 #define pr_fmt(fmt) "TCP: " fmt 49 50 #include <linux/bottom_half.h> 51 #include <linux/types.h> 52 #include <linux/fcntl.h> 53 #include <linux/module.h> 54 #include <linux/random.h> 55 #include <linux/cache.h> 56 #include <linux/jhash.h> 57 #include <linux/init.h> 58 #include <linux/times.h> 59 #include <linux/slab.h> 60 #include <linux/sched.h> 61 62 #include <net/net_namespace.h> 63 #include <net/icmp.h> 64 #include <net/inet_hashtables.h> 65 #include <net/tcp.h> 66 #include <net/transp_v6.h> 67 #include <net/ipv6.h> 68 #include <net/inet_common.h> 69 #include <net/timewait_sock.h> 70 #include <net/xfrm.h> 71 #include <net/secure_seq.h> 72 #include <net/busy_poll.h> 73 #include <net/rstreason.h> 74 75 #include <linux/inet.h> 76 #include <linux/ipv6.h> 77 #include <linux/stddef.h> 78 #include <linux/proc_fs.h> 79 #include <linux/seq_file.h> 80 #include <linux/inetdevice.h> 81 #include <linux/btf_ids.h> 82 83 #include <crypto/hash.h> 84 #include <linux/scatterlist.h> 85 86 #include <trace/events/tcp.h> 87 88 #ifdef CONFIG_TCP_MD5SIG 89 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, 90 __be32 daddr, __be32 saddr, const struct tcphdr *th); 91 #endif 92 93 struct inet_hashinfo tcp_hashinfo; 94 EXPORT_SYMBOL(tcp_hashinfo); 95 96 static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk); 97 98 static u32 tcp_v4_init_seq(const struct sk_buff *skb) 99 { 100 return secure_tcp_seq(ip_hdr(skb)->daddr, 101 ip_hdr(skb)->saddr, 102 tcp_hdr(skb)->dest, 103 tcp_hdr(skb)->source); 104 } 105 106 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb) 107 { 108 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr); 109 } 110 111 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) 112 { 113 int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse); 114 const struct inet_timewait_sock *tw = inet_twsk(sktw); 115 const struct 
tcp_timewait_sock *tcptw = tcp_twsk(sktw); 116 struct tcp_sock *tp = tcp_sk(sk); 117 118 if (reuse == 2) { 119 /* Still does not detect *everything* that goes through 120 * lo, since we require a loopback src or dst address 121 * or direct binding to 'lo' interface. 122 */ 123 bool loopback = false; 124 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX) 125 loopback = true; 126 #if IS_ENABLED(CONFIG_IPV6) 127 if (tw->tw_family == AF_INET6) { 128 if (ipv6_addr_loopback(&tw->tw_v6_daddr) || 129 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) || 130 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) || 131 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr)) 132 loopback = true; 133 } else 134 #endif 135 { 136 if (ipv4_is_loopback(tw->tw_daddr) || 137 ipv4_is_loopback(tw->tw_rcv_saddr)) 138 loopback = true; 139 } 140 if (!loopback) 141 reuse = 0; 142 } 143 144 /* With PAWS, it is safe from the viewpoint 145 of data integrity. Even without PAWS it is safe provided sequence 146 spaces do not overlap i.e. at data rates <= 80Mbit/sec. 147 148 Actually, the idea is close to VJ's one, only timestamp cache is 149 held not per host, but per port pair and TW bucket is used as state 150 holder. 151 152 If TW bucket has been already destroyed we fall back to VJ's scheme 153 and use initial timestamp retrieved from peer table. 154 */ 155 if (tcptw->tw_ts_recent_stamp && 156 (!twp || (reuse && time_after32(ktime_get_seconds(), 157 tcptw->tw_ts_recent_stamp)))) { 158 /* inet_twsk_hashdance() sets sk_refcnt after putting twsk 159 * and releasing the bucket lock. 160 */ 161 if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt))) 162 return 0; 163 164 /* In case of repair and re-using TIME-WAIT sockets we still 165 * want to be sure that it is safe as above but honor the 166 * sequence numbers and time stamps set as part of the repair 167 * process. 168 * 169 * Without this check re-using a TIME-WAIT socket with TCP 170 * repair would accumulate a -1 on the repair assigned 171 * sequence number. The first time it is reused the sequence 172 * is -1, the second time -2, etc. This fixes that issue 173 * without appearing to create any others. 174 */ 175 if (likely(!tp->repair)) { 176 u32 seq = tcptw->tw_snd_nxt + 65535 + 2; 177 178 if (!seq) 179 seq = 1; 180 WRITE_ONCE(tp->write_seq, seq); 181 tp->rx_opt.ts_recent = tcptw->tw_ts_recent; 182 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; 183 } 184 185 return 1; 186 } 187 188 return 0; 189 } 190 EXPORT_SYMBOL_GPL(tcp_twsk_unique); 191 192 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr, 193 int addr_len) 194 { 195 /* This check is replicated from tcp_v4_connect() and intended to 196 * prevent BPF program called below from accessing bytes that are out 197 * of the bound specified by user in addr_len. 198 */ 199 if (addr_len < sizeof(struct sockaddr_in)) 200 return -EINVAL; 201 202 sock_owned_by_me(sk); 203 204 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len); 205 } 206 207 /* This will initiate an outgoing connection. 
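 *
 * In outline: the route is looked up with ip_route_connect(), a source
 * address/port is picked (inet_bhash2_update_saddr()/inet_hash_connect()),
 * the initial sequence number and timestamp offset are derived from
 * secure_tcp_seq()/secure_tcp_ts_off(), and tcp_connect() finally builds
 * and sends the SYN.
 *
 * A minimal userspace sketch of the path that ends up here (plain sockets
 * API; the destination below is only an illustration):
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_port = htons(80),
 *		.sin_addr = { .s_addr = htonl(INADDR_LOOPBACK) },
 *	};
 *
 *	if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0)
 *		perror("connect");	// reaches tcp_v4_connect() via inet_stream_connect()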
*/ 208 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) 209 { 210 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; 211 struct inet_timewait_death_row *tcp_death_row; 212 struct inet_sock *inet = inet_sk(sk); 213 struct tcp_sock *tp = tcp_sk(sk); 214 struct ip_options_rcu *inet_opt; 215 struct net *net = sock_net(sk); 216 __be16 orig_sport, orig_dport; 217 __be32 daddr, nexthop; 218 struct flowi4 *fl4; 219 struct rtable *rt; 220 int err; 221 222 if (addr_len < sizeof(struct sockaddr_in)) 223 return -EINVAL; 224 225 if (usin->sin_family != AF_INET) 226 return -EAFNOSUPPORT; 227 228 nexthop = daddr = usin->sin_addr.s_addr; 229 inet_opt = rcu_dereference_protected(inet->inet_opt, 230 lockdep_sock_is_held(sk)); 231 if (inet_opt && inet_opt->opt.srr) { 232 if (!daddr) 233 return -EINVAL; 234 nexthop = inet_opt->opt.faddr; 235 } 236 237 orig_sport = inet->inet_sport; 238 orig_dport = usin->sin_port; 239 fl4 = &inet->cork.fl.u.ip4; 240 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr, 241 sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport, 242 orig_dport, sk); 243 if (IS_ERR(rt)) { 244 err = PTR_ERR(rt); 245 if (err == -ENETUNREACH) 246 IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); 247 return err; 248 } 249 250 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { 251 ip_rt_put(rt); 252 return -ENETUNREACH; 253 } 254 255 if (!inet_opt || !inet_opt->opt.srr) 256 daddr = fl4->daddr; 257 258 tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; 259 260 if (!inet->inet_saddr) { 261 err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET); 262 if (err) { 263 ip_rt_put(rt); 264 return err; 265 } 266 } else { 267 sk_rcv_saddr_set(sk, inet->inet_saddr); 268 } 269 270 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) { 271 /* Reset inherited state */ 272 tp->rx_opt.ts_recent = 0; 273 tp->rx_opt.ts_recent_stamp = 0; 274 if (likely(!tp->repair)) 275 WRITE_ONCE(tp->write_seq, 0); 276 } 277 278 inet->inet_dport = usin->sin_port; 279 sk_daddr_set(sk, daddr); 280 281 inet_csk(sk)->icsk_ext_hdr_len = 0; 282 if (inet_opt) 283 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 284 285 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT; 286 287 /* Socket identity is still unknown (sport may be zero). 288 * However we set state to SYN-SENT and not releasing socket 289 * lock select source port, enter ourselves into the hash tables and 290 * complete initialization after this. 291 */ 292 tcp_set_state(sk, TCP_SYN_SENT); 293 err = inet_hash_connect(tcp_death_row, sk); 294 if (err) 295 goto failure; 296 297 sk_set_txhash(sk); 298 299 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport, 300 inet->inet_sport, inet->inet_dport, sk); 301 if (IS_ERR(rt)) { 302 err = PTR_ERR(rt); 303 rt = NULL; 304 goto failure; 305 } 306 tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst); 307 /* OK, now commit destination to socket. 
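 * "Commit" here means: sk_setup_caps() attaches the route and its device
 * capabilities to the socket (with sk->sk_gso_type set to SKB_GSO_TCPV4),
 * and, unless the socket is being repaired, write_seq and tsoffset are
 * seeded from secure_tcp_seq() and secure_tcp_ts_off().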
*/ 308 sk->sk_gso_type = SKB_GSO_TCPV4; 309 sk_setup_caps(sk, &rt->dst); 310 rt = NULL; 311 312 if (likely(!tp->repair)) { 313 if (!tp->write_seq) 314 WRITE_ONCE(tp->write_seq, 315 secure_tcp_seq(inet->inet_saddr, 316 inet->inet_daddr, 317 inet->inet_sport, 318 usin->sin_port)); 319 WRITE_ONCE(tp->tsoffset, 320 secure_tcp_ts_off(net, inet->inet_saddr, 321 inet->inet_daddr)); 322 } 323 324 atomic_set(&inet->inet_id, get_random_u16()); 325 326 if (tcp_fastopen_defer_connect(sk, &err)) 327 return err; 328 if (err) 329 goto failure; 330 331 err = tcp_connect(sk); 332 333 if (err) 334 goto failure; 335 336 return 0; 337 338 failure: 339 /* 340 * This unhashes the socket and releases the local port, 341 * if necessary. 342 */ 343 tcp_set_state(sk, TCP_CLOSE); 344 inet_bhash2_reset_saddr(sk); 345 ip_rt_put(rt); 346 sk->sk_route_caps = 0; 347 inet->inet_dport = 0; 348 return err; 349 } 350 EXPORT_SYMBOL(tcp_v4_connect); 351 352 /* 353 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191. 354 * It can be called through tcp_release_cb() if socket was owned by user 355 * at the time tcp_v4_err() was called to handle ICMP message. 356 */ 357 void tcp_v4_mtu_reduced(struct sock *sk) 358 { 359 struct inet_sock *inet = inet_sk(sk); 360 struct dst_entry *dst; 361 u32 mtu; 362 363 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) 364 return; 365 mtu = READ_ONCE(tcp_sk(sk)->mtu_info); 366 dst = inet_csk_update_pmtu(sk, mtu); 367 if (!dst) 368 return; 369 370 /* Something is about to be wrong... Remember soft error 371 * for the case, if this connection will not able to recover. 372 */ 373 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst)) 374 WRITE_ONCE(sk->sk_err_soft, EMSGSIZE); 375 376 mtu = dst_mtu(dst); 377 378 if (inet->pmtudisc != IP_PMTUDISC_DONT && 379 ip_sk_accept_pmtu(sk) && 380 inet_csk(sk)->icsk_pmtu_cookie > mtu) { 381 tcp_sync_mss(sk, mtu); 382 383 /* Resend the TCP packet because it's 384 * clear that the old packet has been 385 * dropped. This is the new "fast" path mtu 386 * discovery. 387 */ 388 tcp_simple_retransmit(sk); 389 } /* else let the usual retransmit timer handle it */ 390 } 391 EXPORT_SYMBOL(tcp_v4_mtu_reduced); 392 393 static void do_redirect(struct sk_buff *skb, struct sock *sk) 394 { 395 struct dst_entry *dst = __sk_dst_check(sk, 0); 396 397 if (dst) 398 dst->ops->redirect(dst, sk, skb); 399 } 400 401 402 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */ 403 void tcp_req_err(struct sock *sk, u32 seq, bool abort) 404 { 405 struct request_sock *req = inet_reqsk(sk); 406 struct net *net = sock_net(sk); 407 408 /* ICMPs are not backlogged, hence we cannot get 409 * an established socket here. 410 */ 411 if (seq != tcp_rsk(req)->snt_isn) { 412 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); 413 } else if (abort) { 414 /* 415 * Still in SYN_RECV, just remove it silently. 416 * There is no good way to pass the error to the newly 417 * created socket, and POSIX does not want network 418 * errors returned from accept(). 
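 * So the request is simply dropped from the listener's queue
 * (inet_csk_reqsk_queue_drop()) and the listener's drop counter is
 * bumped via tcp_listendrop(); the peer will have to retry.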
419 */ 420 inet_csk_reqsk_queue_drop(req->rsk_listener, req); 421 tcp_listendrop(req->rsk_listener); 422 } 423 reqsk_put(req); 424 } 425 EXPORT_SYMBOL(tcp_req_err); 426 427 /* TCP-LD (RFC 6069) logic */ 428 void tcp_ld_RTO_revert(struct sock *sk, u32 seq) 429 { 430 struct inet_connection_sock *icsk = inet_csk(sk); 431 struct tcp_sock *tp = tcp_sk(sk); 432 struct sk_buff *skb; 433 s32 remaining; 434 u32 delta_us; 435 436 if (sock_owned_by_user(sk)) 437 return; 438 439 if (seq != tp->snd_una || !icsk->icsk_retransmits || 440 !icsk->icsk_backoff) 441 return; 442 443 skb = tcp_rtx_queue_head(sk); 444 if (WARN_ON_ONCE(!skb)) 445 return; 446 447 icsk->icsk_backoff--; 448 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT; 449 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); 450 451 tcp_mstamp_refresh(tp); 452 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb)); 453 remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us); 454 455 if (remaining > 0) { 456 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, 457 remaining, TCP_RTO_MAX); 458 } else { 459 /* RTO revert clocked out retransmission. 460 * Will retransmit now. 461 */ 462 tcp_retransmit_timer(sk); 463 } 464 } 465 EXPORT_SYMBOL(tcp_ld_RTO_revert); 466 467 /* 468 * This routine is called by the ICMP module when it gets some 469 * sort of error condition. If err < 0 then the socket should 470 * be closed and the error returned to the user. If err > 0 471 * it's just the icmp type << 8 | icmp code. After adjustment 472 * header points to the first 8 bytes of the tcp header. We need 473 * to find the appropriate port. 474 * 475 * The locking strategy used here is very "optimistic". When 476 * someone else accesses the socket the ICMP is just dropped 477 * and for some paths there is no check at all. 478 * A more general error queue to queue errors for later handling 479 * is probably better. 480 * 481 */ 482 483 int tcp_v4_err(struct sk_buff *skb, u32 info) 484 { 485 const struct iphdr *iph = (const struct iphdr *)skb->data; 486 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2)); 487 struct tcp_sock *tp; 488 const int type = icmp_hdr(skb)->type; 489 const int code = icmp_hdr(skb)->code; 490 struct sock *sk; 491 struct request_sock *fastopen; 492 u32 seq, snd_una; 493 int err; 494 struct net *net = dev_net(skb->dev); 495 496 sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, 497 iph->daddr, th->dest, iph->saddr, 498 ntohs(th->source), inet_iif(skb), 0); 499 if (!sk) { 500 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); 501 return -ENOENT; 502 } 503 if (sk->sk_state == TCP_TIME_WAIT) { 504 /* To increase the counter of ignored icmps for TCP-AO */ 505 tcp_ao_ignore_icmp(sk, AF_INET, type, code); 506 inet_twsk_put(inet_twsk(sk)); 507 return 0; 508 } 509 seq = ntohl(th->seq); 510 if (sk->sk_state == TCP_NEW_SYN_RECV) { 511 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB || 512 type == ICMP_TIME_EXCEEDED || 513 (type == ICMP_DEST_UNREACH && 514 (code == ICMP_NET_UNREACH || 515 code == ICMP_HOST_UNREACH))); 516 return 0; 517 } 518 519 if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) { 520 sock_put(sk); 521 return 0; 522 } 523 524 bh_lock_sock(sk); 525 /* If too many ICMPs get dropped on busy 526 * servers this needs to be solved differently. 527 * We do take care of PMTU discovery (RFC1191) special case : 528 * we can receive locally generated ICMP messages while socket is held. 
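 * For that case the new MTU is stashed in tp->mtu_info below and, if the
 * socket is owned by the user, handling is deferred by setting
 * TCP_MTU_REDUCED_DEFERRED in sk->sk_tsq_flags so that tcp_release_cb()
 * calls tcp_v4_mtu_reduced() once the lock is released.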
 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (static_branch_unlikely(&ip4_min_ttl)) {
		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
			goto out;
		}
	}

	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
	fastopen = rcu_dereference(tp->fastopen_rsk);
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		if (!sock_owned_by_user(sk))
			do_redirect(skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			WRITE_ONCE(tp->mtu_info, info);
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* Check if this ICMP message allows reverting the backoff.
		 * (see RFC 6069)
		 */
		if (!fastopen &&
		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
			tcp_ld_RTO_revert(sk, seq);
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket is
		 * already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);

		if (!sock_owned_by_user(sk)) {
			WRITE_ONCE(sk->sk_err, err);

			sk_error_report(sk);

			tcp_done(sk);
		} else {
			WRITE_ONCE(sk->sk_err_soft, err);
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * RFC 1122 4.2.3.9 allows us to consider only PROTO_UNREACH and
	 * PORT_UNREACH as hard errors (well, FRAG_FAILED too,
	 * but it is obsoleted by PMTU discovery).
	 *
	 * Note that in the modern internet, where routing is unreliable
	 * and broken firewalls sit in every dark corner sending random
	 * errors ordered by their masters, even these two messages finally
	 * lose their original sense (even Linux sends invalid PORT_UNREACHs).
	 *
	 * Now we are in compliance with RFCs.
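	 *
	 * Whether the error below is reported immediately (rather than only
	 * recorded in sk_err_soft) also depends on the IP_RECVERR socket
	 * option, tested via inet_test_bit(RECVERR, sk). A rough userspace
	 * sketch of opting in and draining such errors (standard sockets
	 * API, error handling and msghdr setup elided):
	 *
	 *	int on = 1;
	 *
	 *	setsockopt(fd, IPPROTO_IP, IP_RECVERR, &on, sizeof(on));
	 *	...
	 *	recvmsg(fd, &msg, MSG_ERRQUEUE);	// yields a sock_extended_err cmsg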
639 * --ANK (980905) 640 */ 641 642 if (!sock_owned_by_user(sk) && 643 inet_test_bit(RECVERR, sk)) { 644 WRITE_ONCE(sk->sk_err, err); 645 sk_error_report(sk); 646 } else { /* Only an error on timeout */ 647 WRITE_ONCE(sk->sk_err_soft, err); 648 } 649 650 out: 651 bh_unlock_sock(sk); 652 sock_put(sk); 653 return 0; 654 } 655 656 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr) 657 { 658 struct tcphdr *th = tcp_hdr(skb); 659 660 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0); 661 skb->csum_start = skb_transport_header(skb) - skb->head; 662 skb->csum_offset = offsetof(struct tcphdr, check); 663 } 664 665 /* This routine computes an IPv4 TCP checksum. */ 666 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb) 667 { 668 const struct inet_sock *inet = inet_sk(sk); 669 670 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr); 671 } 672 EXPORT_SYMBOL(tcp_v4_send_check); 673 674 #define REPLY_OPTIONS_LEN (MAX_TCP_OPTION_SPACE / sizeof(__be32)) 675 676 static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb, 677 const struct tcp_ao_hdr *aoh, 678 struct ip_reply_arg *arg, struct tcphdr *reply, 679 __be32 reply_options[REPLY_OPTIONS_LEN]) 680 { 681 #ifdef CONFIG_TCP_AO 682 int sdif = tcp_v4_sdif(skb); 683 int dif = inet_iif(skb); 684 int l3index = sdif ? dif : 0; 685 bool allocated_traffic_key; 686 struct tcp_ao_key *key; 687 char *traffic_key; 688 bool drop = true; 689 u32 ao_sne = 0; 690 u8 keyid; 691 692 rcu_read_lock(); 693 if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq), 694 &key, &traffic_key, &allocated_traffic_key, 695 &keyid, &ao_sne)) 696 goto out; 697 698 reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) | 699 (aoh->rnext_keyid << 8) | keyid); 700 arg->iov[0].iov_len += tcp_ao_len_aligned(key); 701 reply->doff = arg->iov[0].iov_len / 4; 702 703 if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1], 704 key, traffic_key, 705 (union tcp_ao_addr *)&ip_hdr(skb)->saddr, 706 (union tcp_ao_addr *)&ip_hdr(skb)->daddr, 707 reply, ao_sne)) 708 goto out; 709 drop = false; 710 out: 711 rcu_read_unlock(); 712 if (allocated_traffic_key) 713 kfree(traffic_key); 714 return drop; 715 #else 716 return true; 717 #endif 718 } 719 720 /* 721 * This routine will send an RST to the other tcp. 722 * 723 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.) 724 * for reset. 725 * Answer: if a packet caused RST, it is not for a socket 726 * existing in our system, if it is matched to a socket, 727 * it is just duplicate segment or bug in other side's TCP. 728 * So that we build reply only basing on parameters 729 * arrived with segment. 730 * Exception: precedence violation. We do not implement it in any case. 731 */ 732 733 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb, 734 enum sk_rst_reason reason) 735 { 736 const struct tcphdr *th = tcp_hdr(skb); 737 struct { 738 struct tcphdr th; 739 __be32 opt[REPLY_OPTIONS_LEN]; 740 } rep; 741 const __u8 *md5_hash_location = NULL; 742 const struct tcp_ao_hdr *aoh; 743 struct ip_reply_arg arg; 744 #ifdef CONFIG_TCP_MD5SIG 745 struct tcp_md5sig_key *key = NULL; 746 unsigned char newhash[16]; 747 struct sock *sk1 = NULL; 748 int genhash; 749 #endif 750 u64 transmit_time = 0; 751 struct sock *ctl_sk; 752 struct net *net; 753 u32 txhash = 0; 754 755 /* Never send a reset in response to a reset. */ 756 if (th->rst) 757 return; 758 759 /* If sk not NULL, it means we did a successful lookup and incoming 760 * route had to be correct. 
prequeue might have dropped our dst.
 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest = th->source;
	rep.th.source = th->dest;
	rep.th.doff = sizeof(struct tcphdr) / 4;
	rep.th.rst = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);

	/* Invalid TCP option size or twice included auth */
	if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh))
		return;

	if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt))
		return;

#ifdef CONFIG_TCP_MD5SIG
	rcu_read_lock();
	if (sk && sk_fullsock(sk)) {
		const union tcp_md5_addr *addr;
		int l3index;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and inet_iif is set to it.
		 */
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	} else if (md5_hash_location) {
		const union tcp_md5_addr *addr;
		int sdif = tcp_v4_sdif(skb);
		int dif = inet_iif(skb);
		int l3index;

		/*
		 * The active side is lost. Try to find the listening socket
		 * through the source port, and then find the MD5 key through
		 * the listening socket. We do not lose security here:
		 * the incoming packet is verified against the MD5 hash of the
		 * key we find, and no RST is generated if the hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
					     NULL, 0, ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), dif, sdif);
		/* don't send an RST if we can't find a key */
		if (!sk1)
			goto out;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and dif is set to it.
		 */
		l3index = sdif ? dif : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
		if (!key)
			goto out;


		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (genhash || memcmp(md5_hash_location, newhash, 16) != 0)
			goto out;

	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
	if (rep.opt[0] == 0) {
		__be32 mrst = mptcp_reset_option(skb);

		if (mrst) {
			rep.opt[0] = mrst;
			arg.iov[0].iov_len += sizeof(mrst);
			rep.th.doff = arg.iov[0].iov_len / 4;
		}
	}

	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case.
No choice here, if we choose to force 875 * input interface, we will misroute in case of asymmetric route. 876 */ 877 if (sk) 878 arg.bound_dev_if = sk->sk_bound_dev_if; 879 880 trace_tcp_send_reset(sk, skb, reason); 881 882 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) != 883 offsetof(struct inet_timewait_sock, tw_bound_dev_if)); 884 885 arg.tos = ip_hdr(skb)->tos; 886 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); 887 local_bh_disable(); 888 ctl_sk = this_cpu_read(ipv4_tcp_sk); 889 sock_net_set(ctl_sk, net); 890 if (sk) { 891 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 892 inet_twsk(sk)->tw_mark : sk->sk_mark; 893 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? 894 inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority); 895 transmit_time = tcp_transmit_time(sk); 896 xfrm_sk_clone_policy(ctl_sk, sk); 897 txhash = (sk->sk_state == TCP_TIME_WAIT) ? 898 inet_twsk(sk)->tw_txhash : sk->sk_txhash; 899 } else { 900 ctl_sk->sk_mark = 0; 901 ctl_sk->sk_priority = 0; 902 } 903 ip_send_unicast_reply(ctl_sk, 904 skb, &TCP_SKB_CB(skb)->header.h4.opt, 905 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 906 &arg, arg.iov[0].iov_len, 907 transmit_time, txhash); 908 909 xfrm_sk_free_policy(ctl_sk); 910 sock_net_set(ctl_sk, &init_net); 911 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); 912 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS); 913 local_bh_enable(); 914 915 #ifdef CONFIG_TCP_MD5SIG 916 out: 917 rcu_read_unlock(); 918 #endif 919 } 920 921 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states 922 outside socket context is ugly, certainly. What can I do? 923 */ 924 925 static void tcp_v4_send_ack(const struct sock *sk, 926 struct sk_buff *skb, u32 seq, u32 ack, 927 u32 win, u32 tsval, u32 tsecr, int oif, 928 struct tcp_key *key, 929 int reply_flags, u8 tos, u32 txhash) 930 { 931 const struct tcphdr *th = tcp_hdr(skb); 932 struct { 933 struct tcphdr th; 934 __be32 opt[(MAX_TCP_OPTION_SPACE >> 2)]; 935 } rep; 936 struct net *net = sock_net(sk); 937 struct ip_reply_arg arg; 938 struct sock *ctl_sk; 939 u64 transmit_time; 940 941 memset(&rep.th, 0, sizeof(struct tcphdr)); 942 memset(&arg, 0, sizeof(arg)); 943 944 arg.iov[0].iov_base = (unsigned char *)&rep; 945 arg.iov[0].iov_len = sizeof(rep.th); 946 if (tsecr) { 947 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | 948 (TCPOPT_TIMESTAMP << 8) | 949 TCPOLEN_TIMESTAMP); 950 rep.opt[1] = htonl(tsval); 951 rep.opt[2] = htonl(tsecr); 952 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED; 953 } 954 955 /* Swap the send and the receive. */ 956 rep.th.dest = th->source; 957 rep.th.source = th->dest; 958 rep.th.doff = arg.iov[0].iov_len / 4; 959 rep.th.seq = htonl(seq); 960 rep.th.ack_seq = htonl(ack); 961 rep.th.ack = 1; 962 rep.th.window = htons(win); 963 964 #ifdef CONFIG_TCP_MD5SIG 965 if (tcp_key_is_md5(key)) { 966 int offset = (tsecr) ? 3 : 0; 967 968 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) | 969 (TCPOPT_NOP << 16) | 970 (TCPOPT_MD5SIG << 8) | 971 TCPOLEN_MD5SIG); 972 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; 973 rep.th.doff = arg.iov[0].iov_len/4; 974 975 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset], 976 key->md5_key, ip_hdr(skb)->saddr, 977 ip_hdr(skb)->daddr, &rep.th); 978 } 979 #endif 980 #ifdef CONFIG_TCP_AO 981 if (tcp_key_is_ao(key)) { 982 int offset = (tsecr) ? 
3 : 0; 983 984 rep.opt[offset++] = htonl((TCPOPT_AO << 24) | 985 (tcp_ao_len(key->ao_key) << 16) | 986 (key->ao_key->sndid << 8) | 987 key->rcv_next); 988 arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key); 989 rep.th.doff = arg.iov[0].iov_len / 4; 990 991 tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset], 992 key->ao_key, key->traffic_key, 993 (union tcp_ao_addr *)&ip_hdr(skb)->saddr, 994 (union tcp_ao_addr *)&ip_hdr(skb)->daddr, 995 &rep.th, key->sne); 996 } 997 #endif 998 arg.flags = reply_flags; 999 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 1000 ip_hdr(skb)->saddr, /* XXX */ 1001 arg.iov[0].iov_len, IPPROTO_TCP, 0); 1002 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 1003 if (oif) 1004 arg.bound_dev_if = oif; 1005 arg.tos = tos; 1006 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL); 1007 local_bh_disable(); 1008 ctl_sk = this_cpu_read(ipv4_tcp_sk); 1009 sock_net_set(ctl_sk, net); 1010 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 1011 inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark); 1012 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? 1013 inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority); 1014 transmit_time = tcp_transmit_time(sk); 1015 ip_send_unicast_reply(ctl_sk, 1016 skb, &TCP_SKB_CB(skb)->header.h4.opt, 1017 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 1018 &arg, arg.iov[0].iov_len, 1019 transmit_time, txhash); 1020 1021 sock_net_set(ctl_sk, &init_net); 1022 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); 1023 local_bh_enable(); 1024 } 1025 1026 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) 1027 { 1028 struct inet_timewait_sock *tw = inet_twsk(sk); 1029 struct tcp_timewait_sock *tcptw = tcp_twsk(sk); 1030 struct tcp_key key = {}; 1031 #ifdef CONFIG_TCP_AO 1032 struct tcp_ao_info *ao_info; 1033 1034 if (static_branch_unlikely(&tcp_ao_needed.key)) { 1035 /* FIXME: the segment to-be-acked is not verified yet */ 1036 ao_info = rcu_dereference(tcptw->ao_info); 1037 if (ao_info) { 1038 const struct tcp_ao_hdr *aoh; 1039 1040 if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) { 1041 inet_twsk_put(tw); 1042 return; 1043 } 1044 1045 if (aoh) 1046 key.ao_key = tcp_ao_established_key(ao_info, aoh->rnext_keyid, -1); 1047 } 1048 } 1049 if (key.ao_key) { 1050 struct tcp_ao_key *rnext_key; 1051 1052 key.traffic_key = snd_other_key(key.ao_key); 1053 key.sne = READ_ONCE(ao_info->snd_sne); 1054 rnext_key = READ_ONCE(ao_info->rnext_key); 1055 key.rcv_next = rnext_key->rcvid; 1056 key.type = TCP_KEY_AO; 1057 #else 1058 if (0) { 1059 #endif 1060 #ifdef CONFIG_TCP_MD5SIG 1061 } else if (static_branch_unlikely(&tcp_md5_needed.key)) { 1062 key.md5_key = tcp_twsk_md5_key(tcptw); 1063 if (key.md5_key) 1064 key.type = TCP_KEY_MD5; 1065 #endif 1066 } 1067 1068 tcp_v4_send_ack(sk, skb, 1069 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, 1070 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, 1071 tcp_tw_tsval(tcptw), 1072 tcptw->tw_ts_recent, 1073 tw->tw_bound_dev_if, &key, 1074 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0, 1075 tw->tw_tos, 1076 tw->tw_txhash); 1077 1078 inet_twsk_put(tw); 1079 } 1080 1081 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, 1082 struct request_sock *req) 1083 { 1084 struct tcp_key key = {}; 1085 1086 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV 1087 * sk->sk_state == TCP_SYN_RECV -> for Fast Open. 1088 */ 1089 u32 seq = (sk->sk_state == TCP_LISTEN) ? 
tcp_rsk(req)->snt_isn + 1 : 1090 tcp_sk(sk)->snd_nxt; 1091 1092 #ifdef CONFIG_TCP_AO 1093 if (static_branch_unlikely(&tcp_ao_needed.key) && 1094 tcp_rsk_used_ao(req)) { 1095 const union tcp_md5_addr *addr; 1096 const struct tcp_ao_hdr *aoh; 1097 int l3index; 1098 1099 /* Invalid TCP option size or twice included auth */ 1100 if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) 1101 return; 1102 if (!aoh) 1103 return; 1104 1105 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 1106 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; 1107 key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, 1108 aoh->rnext_keyid, -1); 1109 if (unlikely(!key.ao_key)) { 1110 /* Send ACK with any matching MKT for the peer */ 1111 key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1); 1112 /* Matching key disappeared (user removed the key?) 1113 * let the handshake timeout. 1114 */ 1115 if (!key.ao_key) { 1116 net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n", 1117 addr, 1118 ntohs(tcp_hdr(skb)->source), 1119 &ip_hdr(skb)->daddr, 1120 ntohs(tcp_hdr(skb)->dest)); 1121 return; 1122 } 1123 } 1124 key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC); 1125 if (!key.traffic_key) 1126 return; 1127 1128 key.type = TCP_KEY_AO; 1129 key.rcv_next = aoh->keyid; 1130 tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req); 1131 #else 1132 if (0) { 1133 #endif 1134 #ifdef CONFIG_TCP_MD5SIG 1135 } else if (static_branch_unlikely(&tcp_md5_needed.key)) { 1136 const union tcp_md5_addr *addr; 1137 int l3index; 1138 1139 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 1140 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; 1141 key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1142 if (key.md5_key) 1143 key.type = TCP_KEY_MD5; 1144 #endif 1145 } 1146 1147 /* RFC 7323 2.3 1148 * The window field (SEG.WND) of every outgoing segment, with the 1149 * exception of <SYN> segments, MUST be right-shifted by 1150 * Rcv.Wind.Shift bits: 1151 */ 1152 tcp_v4_send_ack(sk, skb, seq, 1153 tcp_rsk(req)->rcv_nxt, 1154 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, 1155 tcp_rsk_tsval(tcp_rsk(req)), 1156 READ_ONCE(req->ts_recent), 1157 0, &key, 1158 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0, 1159 ip_hdr(skb)->tos, 1160 READ_ONCE(tcp_rsk(req)->txhash)); 1161 if (tcp_key_is_ao(&key)) 1162 kfree(key.traffic_key); 1163 } 1164 1165 /* 1166 * Send a SYN-ACK after having received a SYN. 1167 * This still operates on a request_sock only, not on a big 1168 * socket. 1169 */ 1170 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, 1171 struct flowi *fl, 1172 struct request_sock *req, 1173 struct tcp_fastopen_cookie *foc, 1174 enum tcp_synack_type synack_type, 1175 struct sk_buff *syn_skb) 1176 { 1177 const struct inet_request_sock *ireq = inet_rsk(req); 1178 struct flowi4 fl4; 1179 int err = -1; 1180 struct sk_buff *skb; 1181 u8 tos; 1182 1183 /* First, grab a route. 
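 * If the caller did not pass one in, it comes from inet_csk_route_req().
 * The SYN-ACK itself is built by tcp_make_synack(), checksummed for the
 * request's addresses, and transmitted with ip_build_and_send_pkt();
 * when sysctl_tcp_reflect_tos is set, the TOS is reflected from the
 * original SYN (ECN bits excluded).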
*/ 1184 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) 1185 return -1; 1186 1187 skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb); 1188 1189 if (skb) { 1190 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); 1191 1192 tos = READ_ONCE(inet_sk(sk)->tos); 1193 1194 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)) 1195 tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) | 1196 (tos & INET_ECN_MASK); 1197 1198 if (!INET_ECN_is_capable(tos) && 1199 tcp_bpf_ca_needs_ecn((struct sock *)req)) 1200 tos |= INET_ECN_ECT_0; 1201 1202 rcu_read_lock(); 1203 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, 1204 ireq->ir_rmt_addr, 1205 rcu_dereference(ireq->ireq_opt), 1206 tos); 1207 rcu_read_unlock(); 1208 err = net_xmit_eval(err); 1209 } 1210 1211 return err; 1212 } 1213 1214 /* 1215 * IPv4 request_sock destructor. 1216 */ 1217 static void tcp_v4_reqsk_destructor(struct request_sock *req) 1218 { 1219 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1)); 1220 } 1221 1222 #ifdef CONFIG_TCP_MD5SIG 1223 /* 1224 * RFC2385 MD5 checksumming requires a mapping of 1225 * IP address->MD5 Key. 1226 * We need to maintain these in the sk structure. 1227 */ 1228 1229 DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ); 1230 EXPORT_SYMBOL(tcp_md5_needed); 1231 1232 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new) 1233 { 1234 if (!old) 1235 return true; 1236 1237 /* l3index always overrides non-l3index */ 1238 if (old->l3index && new->l3index == 0) 1239 return false; 1240 if (old->l3index == 0 && new->l3index) 1241 return true; 1242 1243 return old->prefixlen < new->prefixlen; 1244 } 1245 1246 /* Find the Key structure for an address. */ 1247 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index, 1248 const union tcp_md5_addr *addr, 1249 int family, bool any_l3index) 1250 { 1251 const struct tcp_sock *tp = tcp_sk(sk); 1252 struct tcp_md5sig_key *key; 1253 const struct tcp_md5sig_info *md5sig; 1254 __be32 mask; 1255 struct tcp_md5sig_key *best_match = NULL; 1256 bool match; 1257 1258 /* caller either holds rcu_read_lock() or socket lock */ 1259 md5sig = rcu_dereference_check(tp->md5sig_info, 1260 lockdep_sock_is_held(sk)); 1261 if (!md5sig) 1262 return NULL; 1263 1264 hlist_for_each_entry_rcu(key, &md5sig->head, node, 1265 lockdep_sock_is_held(sk)) { 1266 if (key->family != family) 1267 continue; 1268 if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX && 1269 key->l3index != l3index) 1270 continue; 1271 if (family == AF_INET) { 1272 mask = inet_make_mask(key->prefixlen); 1273 match = (key->addr.a4.s_addr & mask) == 1274 (addr->a4.s_addr & mask); 1275 #if IS_ENABLED(CONFIG_IPV6) 1276 } else if (family == AF_INET6) { 1277 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6, 1278 key->prefixlen); 1279 #endif 1280 } else { 1281 match = false; 1282 } 1283 1284 if (match && better_md5_match(best_match, key)) 1285 best_match = key; 1286 } 1287 return best_match; 1288 } 1289 EXPORT_SYMBOL(__tcp_md5_do_lookup); 1290 1291 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk, 1292 const union tcp_md5_addr *addr, 1293 int family, u8 prefixlen, 1294 int l3index, u8 flags) 1295 { 1296 const struct tcp_sock *tp = tcp_sk(sk); 1297 struct tcp_md5sig_key *key; 1298 unsigned int size = sizeof(struct in_addr); 1299 const struct tcp_md5sig_info *md5sig; 1300 1301 /* caller either holds rcu_read_lock() or socket lock */ 1302 md5sig = rcu_dereference_check(tp->md5sig_info, 1303 
lockdep_sock_is_held(sk)); 1304 if (!md5sig) 1305 return NULL; 1306 #if IS_ENABLED(CONFIG_IPV6) 1307 if (family == AF_INET6) 1308 size = sizeof(struct in6_addr); 1309 #endif 1310 hlist_for_each_entry_rcu(key, &md5sig->head, node, 1311 lockdep_sock_is_held(sk)) { 1312 if (key->family != family) 1313 continue; 1314 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX)) 1315 continue; 1316 if (key->l3index != l3index) 1317 continue; 1318 if (!memcmp(&key->addr, addr, size) && 1319 key->prefixlen == prefixlen) 1320 return key; 1321 } 1322 return NULL; 1323 } 1324 1325 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, 1326 const struct sock *addr_sk) 1327 { 1328 const union tcp_md5_addr *addr; 1329 int l3index; 1330 1331 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), 1332 addr_sk->sk_bound_dev_if); 1333 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr; 1334 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1335 } 1336 EXPORT_SYMBOL(tcp_v4_md5_lookup); 1337 1338 static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp) 1339 { 1340 struct tcp_sock *tp = tcp_sk(sk); 1341 struct tcp_md5sig_info *md5sig; 1342 1343 md5sig = kmalloc(sizeof(*md5sig), gfp); 1344 if (!md5sig) 1345 return -ENOMEM; 1346 1347 sk_gso_disable(sk); 1348 INIT_HLIST_HEAD(&md5sig->head); 1349 rcu_assign_pointer(tp->md5sig_info, md5sig); 1350 return 0; 1351 } 1352 1353 /* This can be called on a newly created socket, from other files */ 1354 static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, 1355 int family, u8 prefixlen, int l3index, u8 flags, 1356 const u8 *newkey, u8 newkeylen, gfp_t gfp) 1357 { 1358 /* Add Key to the list */ 1359 struct tcp_md5sig_key *key; 1360 struct tcp_sock *tp = tcp_sk(sk); 1361 struct tcp_md5sig_info *md5sig; 1362 1363 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags); 1364 if (key) { 1365 /* Pre-existing entry - just update that one. 1366 * Note that the key might be used concurrently. 1367 * data_race() is telling kcsan that we do not care of 1368 * key mismatches, since changing MD5 key on live flows 1369 * can lead to packet drops. 1370 */ 1371 data_race(memcpy(key->key, newkey, newkeylen)); 1372 1373 /* Pairs with READ_ONCE() in tcp_md5_hash_key(). 1374 * Also note that a reader could catch new key->keylen value 1375 * but old key->key[], this is the reason we use __GFP_ZERO 1376 * at sock_kmalloc() time below these lines. 1377 */ 1378 WRITE_ONCE(key->keylen, newkeylen); 1379 1380 return 0; 1381 } 1382 1383 md5sig = rcu_dereference_protected(tp->md5sig_info, 1384 lockdep_sock_is_held(sk)); 1385 1386 key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO); 1387 if (!key) 1388 return -ENOMEM; 1389 1390 memcpy(key->key, newkey, newkeylen); 1391 key->keylen = newkeylen; 1392 key->family = family; 1393 key->prefixlen = prefixlen; 1394 key->l3index = l3index; 1395 key->flags = flags; 1396 memcpy(&key->addr, addr, 1397 (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? 
sizeof(struct in6_addr) : 1398 sizeof(struct in_addr)); 1399 hlist_add_head_rcu(&key->node, &md5sig->head); 1400 return 0; 1401 } 1402 1403 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, 1404 int family, u8 prefixlen, int l3index, u8 flags, 1405 const u8 *newkey, u8 newkeylen) 1406 { 1407 struct tcp_sock *tp = tcp_sk(sk); 1408 1409 if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) { 1410 if (tcp_md5_alloc_sigpool()) 1411 return -ENOMEM; 1412 1413 if (tcp_md5sig_info_add(sk, GFP_KERNEL)) { 1414 tcp_md5_release_sigpool(); 1415 return -ENOMEM; 1416 } 1417 1418 if (!static_branch_inc(&tcp_md5_needed.key)) { 1419 struct tcp_md5sig_info *md5sig; 1420 1421 md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk)); 1422 rcu_assign_pointer(tp->md5sig_info, NULL); 1423 kfree_rcu(md5sig, rcu); 1424 tcp_md5_release_sigpool(); 1425 return -EUSERS; 1426 } 1427 } 1428 1429 return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags, 1430 newkey, newkeylen, GFP_KERNEL); 1431 } 1432 EXPORT_SYMBOL(tcp_md5_do_add); 1433 1434 int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr, 1435 int family, u8 prefixlen, int l3index, 1436 struct tcp_md5sig_key *key) 1437 { 1438 struct tcp_sock *tp = tcp_sk(sk); 1439 1440 if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) { 1441 tcp_md5_add_sigpool(); 1442 1443 if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC))) { 1444 tcp_md5_release_sigpool(); 1445 return -ENOMEM; 1446 } 1447 1448 if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) { 1449 struct tcp_md5sig_info *md5sig; 1450 1451 md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk)); 1452 net_warn_ratelimited("Too many TCP-MD5 keys in the system\n"); 1453 rcu_assign_pointer(tp->md5sig_info, NULL); 1454 kfree_rcu(md5sig, rcu); 1455 tcp_md5_release_sigpool(); 1456 return -EUSERS; 1457 } 1458 } 1459 1460 return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, 1461 key->flags, key->key, key->keylen, 1462 sk_gfp_mask(sk, GFP_ATOMIC)); 1463 } 1464 EXPORT_SYMBOL(tcp_md5_key_copy); 1465 1466 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family, 1467 u8 prefixlen, int l3index, u8 flags) 1468 { 1469 struct tcp_md5sig_key *key; 1470 1471 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags); 1472 if (!key) 1473 return -ENOENT; 1474 hlist_del_rcu(&key->node); 1475 atomic_sub(sizeof(*key), &sk->sk_omem_alloc); 1476 kfree_rcu(key, rcu); 1477 return 0; 1478 } 1479 EXPORT_SYMBOL(tcp_md5_do_del); 1480 1481 void tcp_clear_md5_list(struct sock *sk) 1482 { 1483 struct tcp_sock *tp = tcp_sk(sk); 1484 struct tcp_md5sig_key *key; 1485 struct hlist_node *n; 1486 struct tcp_md5sig_info *md5sig; 1487 1488 md5sig = rcu_dereference_protected(tp->md5sig_info, 1); 1489 1490 hlist_for_each_entry_safe(key, n, &md5sig->head, node) { 1491 hlist_del_rcu(&key->node); 1492 atomic_sub(sizeof(*key), &sk->sk_omem_alloc); 1493 kfree_rcu(key, rcu); 1494 } 1495 } 1496 1497 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, 1498 sockptr_t optval, int optlen) 1499 { 1500 struct tcp_md5sig cmd; 1501 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr; 1502 const union tcp_md5_addr *addr; 1503 u8 prefixlen = 32; 1504 int l3index = 0; 1505 bool l3flag; 1506 u8 flags; 1507 1508 if (optlen < sizeof(cmd)) 1509 return -EINVAL; 1510 1511 if (copy_from_sockptr(&cmd, optval, sizeof(cmd))) 1512 return -EFAULT; 1513 1514 if (sin->sin_family != 
AF_INET) 1515 return -EINVAL; 1516 1517 flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX; 1518 l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX; 1519 1520 if (optname == TCP_MD5SIG_EXT && 1521 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) { 1522 prefixlen = cmd.tcpm_prefixlen; 1523 if (prefixlen > 32) 1524 return -EINVAL; 1525 } 1526 1527 if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex && 1528 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) { 1529 struct net_device *dev; 1530 1531 rcu_read_lock(); 1532 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex); 1533 if (dev && netif_is_l3_master(dev)) 1534 l3index = dev->ifindex; 1535 1536 rcu_read_unlock(); 1537 1538 /* ok to reference set/not set outside of rcu; 1539 * right now device MUST be an L3 master 1540 */ 1541 if (!dev || !l3index) 1542 return -EINVAL; 1543 } 1544 1545 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr; 1546 1547 if (!cmd.tcpm_keylen) 1548 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags); 1549 1550 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) 1551 return -EINVAL; 1552 1553 /* Don't allow keys for peers that have a matching TCP-AO key. 1554 * See the comment in tcp_ao_add_cmd() 1555 */ 1556 if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false)) 1557 return -EKEYREJECTED; 1558 1559 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags, 1560 cmd.tcpm_key, cmd.tcpm_keylen); 1561 } 1562 1563 static int tcp_v4_md5_hash_headers(struct tcp_sigpool *hp, 1564 __be32 daddr, __be32 saddr, 1565 const struct tcphdr *th, int nbytes) 1566 { 1567 struct tcp4_pseudohdr *bp; 1568 struct scatterlist sg; 1569 struct tcphdr *_th; 1570 1571 bp = hp->scratch; 1572 bp->saddr = saddr; 1573 bp->daddr = daddr; 1574 bp->pad = 0; 1575 bp->protocol = IPPROTO_TCP; 1576 bp->len = cpu_to_be16(nbytes); 1577 1578 _th = (struct tcphdr *)(bp + 1); 1579 memcpy(_th, th, sizeof(*th)); 1580 _th->check = 0; 1581 1582 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th)); 1583 ahash_request_set_crypt(hp->req, &sg, NULL, 1584 sizeof(*bp) + sizeof(*th)); 1585 return crypto_ahash_update(hp->req); 1586 } 1587 1588 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, 1589 __be32 daddr, __be32 saddr, const struct tcphdr *th) 1590 { 1591 struct tcp_sigpool hp; 1592 1593 if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp)) 1594 goto clear_hash_nostart; 1595 1596 if (crypto_ahash_init(hp.req)) 1597 goto clear_hash; 1598 if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, th->doff << 2)) 1599 goto clear_hash; 1600 if (tcp_md5_hash_key(&hp, key)) 1601 goto clear_hash; 1602 ahash_request_set_crypt(hp.req, NULL, md5_hash, 0); 1603 if (crypto_ahash_final(hp.req)) 1604 goto clear_hash; 1605 1606 tcp_sigpool_end(&hp); 1607 return 0; 1608 1609 clear_hash: 1610 tcp_sigpool_end(&hp); 1611 clear_hash_nostart: 1612 memset(md5_hash, 0, 16); 1613 return 1; 1614 } 1615 1616 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, 1617 const struct sock *sk, 1618 const struct sk_buff *skb) 1619 { 1620 const struct tcphdr *th = tcp_hdr(skb); 1621 struct tcp_sigpool hp; 1622 __be32 saddr, daddr; 1623 1624 if (sk) { /* valid for establish/request sockets */ 1625 saddr = sk->sk_rcv_saddr; 1626 daddr = sk->sk_daddr; 1627 } else { 1628 const struct iphdr *iph = ip_hdr(skb); 1629 saddr = iph->saddr; 1630 daddr = iph->daddr; 1631 } 1632 1633 if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp)) 1634 goto clear_hash_nostart; 1635 1636 if (crypto_ahash_init(hp.req)) 1637 goto clear_hash; 1638 1639 if 
(tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, skb->len)) 1640 goto clear_hash; 1641 if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2)) 1642 goto clear_hash; 1643 if (tcp_md5_hash_key(&hp, key)) 1644 goto clear_hash; 1645 ahash_request_set_crypt(hp.req, NULL, md5_hash, 0); 1646 if (crypto_ahash_final(hp.req)) 1647 goto clear_hash; 1648 1649 tcp_sigpool_end(&hp); 1650 return 0; 1651 1652 clear_hash: 1653 tcp_sigpool_end(&hp); 1654 clear_hash_nostart: 1655 memset(md5_hash, 0, 16); 1656 return 1; 1657 } 1658 EXPORT_SYMBOL(tcp_v4_md5_hash_skb); 1659 1660 #endif 1661 1662 static void tcp_v4_init_req(struct request_sock *req, 1663 const struct sock *sk_listener, 1664 struct sk_buff *skb) 1665 { 1666 struct inet_request_sock *ireq = inet_rsk(req); 1667 struct net *net = sock_net(sk_listener); 1668 1669 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); 1670 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); 1671 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb)); 1672 } 1673 1674 static struct dst_entry *tcp_v4_route_req(const struct sock *sk, 1675 struct sk_buff *skb, 1676 struct flowi *fl, 1677 struct request_sock *req, 1678 u32 tw_isn) 1679 { 1680 tcp_v4_init_req(req, sk, skb); 1681 1682 if (security_inet_conn_request(sk, skb, req)) 1683 return NULL; 1684 1685 return inet_csk_route_req(sk, &fl->u.ip4, req); 1686 } 1687 1688 struct request_sock_ops tcp_request_sock_ops __read_mostly = { 1689 .family = PF_INET, 1690 .obj_size = sizeof(struct tcp_request_sock), 1691 .rtx_syn_ack = tcp_rtx_synack, 1692 .send_ack = tcp_v4_reqsk_send_ack, 1693 .destructor = tcp_v4_reqsk_destructor, 1694 .send_reset = tcp_v4_send_reset, 1695 .syn_ack_timeout = tcp_syn_ack_timeout, 1696 }; 1697 1698 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { 1699 .mss_clamp = TCP_MSS_DEFAULT, 1700 #ifdef CONFIG_TCP_MD5SIG 1701 .req_md5_lookup = tcp_v4_md5_lookup, 1702 .calc_md5_hash = tcp_v4_md5_hash_skb, 1703 #endif 1704 #ifdef CONFIG_TCP_AO 1705 .ao_lookup = tcp_v4_ao_lookup_rsk, 1706 .ao_calc_key = tcp_v4_ao_calc_key_rsk, 1707 .ao_synack_hash = tcp_v4_ao_synack_hash, 1708 #endif 1709 #ifdef CONFIG_SYN_COOKIES 1710 .cookie_init_seq = cookie_v4_init_sequence, 1711 #endif 1712 .route_req = tcp_v4_route_req, 1713 .init_seq = tcp_v4_init_seq, 1714 .init_ts_off = tcp_v4_init_ts_off, 1715 .send_synack = tcp_v4_send_synack, 1716 }; 1717 1718 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) 1719 { 1720 /* Never answer to SYNs send to broadcast or multicast */ 1721 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) 1722 goto drop; 1723 1724 return tcp_conn_request(&tcp_request_sock_ops, 1725 &tcp_request_sock_ipv4_ops, sk, skb); 1726 1727 drop: 1728 tcp_listendrop(sk); 1729 return 0; 1730 } 1731 EXPORT_SYMBOL(tcp_v4_conn_request); 1732 1733 1734 /* 1735 * The three way handshake has completed - we got a valid synack - 1736 * now create the new socket. 
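 *
 * Roughly: tcp_create_openreq_child() clones the listener, the child
 * inherits its addresses and IP options from the request sock, gets a
 * route via inet_csk_route_child_sock() unless the caller passed a dst
 * (syncookie case), copies any matching MD5 key/TCP-AO state, inherits
 * the bound port with __inet_inherit_port(), and is finally inserted
 * into the established hash by inet_ehash_nolisten().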
1737 */ 1738 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, 1739 struct request_sock *req, 1740 struct dst_entry *dst, 1741 struct request_sock *req_unhash, 1742 bool *own_req) 1743 { 1744 struct inet_request_sock *ireq; 1745 bool found_dup_sk = false; 1746 struct inet_sock *newinet; 1747 struct tcp_sock *newtp; 1748 struct sock *newsk; 1749 #ifdef CONFIG_TCP_MD5SIG 1750 const union tcp_md5_addr *addr; 1751 struct tcp_md5sig_key *key; 1752 int l3index; 1753 #endif 1754 struct ip_options_rcu *inet_opt; 1755 1756 if (sk_acceptq_is_full(sk)) 1757 goto exit_overflow; 1758 1759 newsk = tcp_create_openreq_child(sk, req, skb); 1760 if (!newsk) 1761 goto exit_nonewsk; 1762 1763 newsk->sk_gso_type = SKB_GSO_TCPV4; 1764 inet_sk_rx_dst_set(newsk, skb); 1765 1766 newtp = tcp_sk(newsk); 1767 newinet = inet_sk(newsk); 1768 ireq = inet_rsk(req); 1769 sk_daddr_set(newsk, ireq->ir_rmt_addr); 1770 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr); 1771 newsk->sk_bound_dev_if = ireq->ir_iif; 1772 newinet->inet_saddr = ireq->ir_loc_addr; 1773 inet_opt = rcu_dereference(ireq->ireq_opt); 1774 RCU_INIT_POINTER(newinet->inet_opt, inet_opt); 1775 newinet->mc_index = inet_iif(skb); 1776 newinet->mc_ttl = ip_hdr(skb)->ttl; 1777 newinet->rcv_tos = ip_hdr(skb)->tos; 1778 inet_csk(newsk)->icsk_ext_hdr_len = 0; 1779 if (inet_opt) 1780 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 1781 atomic_set(&newinet->inet_id, get_random_u16()); 1782 1783 /* Set ToS of the new socket based upon the value of incoming SYN. 1784 * ECT bits are set later in tcp_init_transfer(). 1785 */ 1786 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)) 1787 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK; 1788 1789 if (!dst) { 1790 dst = inet_csk_route_child_sock(sk, newsk, req); 1791 if (!dst) 1792 goto put_and_exit; 1793 } else { 1794 /* syncookie case : see end of cookie_v4_check() */ 1795 } 1796 sk_setup_caps(newsk, dst); 1797 1798 tcp_ca_openreq_child(newsk, dst); 1799 1800 tcp_sync_mss(newsk, dst_mtu(dst)); 1801 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst)); 1802 1803 tcp_initialize_rcv_mss(newsk); 1804 1805 #ifdef CONFIG_TCP_MD5SIG 1806 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif); 1807 /* Copy over the MD5 key from the original socket */ 1808 addr = (union tcp_md5_addr *)&newinet->inet_daddr; 1809 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1810 if (key && !tcp_rsk_used_ao(req)) { 1811 if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key)) 1812 goto put_and_exit; 1813 sk_gso_disable(newsk); 1814 } 1815 #endif 1816 #ifdef CONFIG_TCP_AO 1817 if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET)) 1818 goto put_and_exit; /* OOM, release back memory */ 1819 #endif 1820 1821 if (__inet_inherit_port(sk, newsk) < 0) 1822 goto put_and_exit; 1823 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), 1824 &found_dup_sk); 1825 if (likely(*own_req)) { 1826 tcp_move_syn(newtp, req); 1827 ireq->ireq_opt = NULL; 1828 } else { 1829 newinet->inet_opt = NULL; 1830 1831 if (!req_unhash && found_dup_sk) { 1832 /* This code path should only be executed in the 1833 * syncookie case only 1834 */ 1835 bh_unlock_sock(newsk); 1836 sock_put(newsk); 1837 newsk = NULL; 1838 } 1839 } 1840 return newsk; 1841 1842 exit_overflow: 1843 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); 1844 exit_nonewsk: 1845 dst_release(dst); 1846 exit: 1847 tcp_listendrop(sk); 1848 return NULL; 1849 put_and_exit: 1850 newinet->inet_opt = NULL; 1851 
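	/* Tear down the partially initialized child: it was never visible to
	 * userspace, so inet_csk_prepare_forced_close() plus tcp_done() are
	 * enough to release it.
	 */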
inet_csk_prepare_forced_close(newsk); 1852 tcp_done(newsk); 1853 goto exit; 1854 } 1855 EXPORT_SYMBOL(tcp_v4_syn_recv_sock); 1856 1857 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb) 1858 { 1859 #ifdef CONFIG_SYN_COOKIES 1860 const struct tcphdr *th = tcp_hdr(skb); 1861 1862 if (!th->syn) 1863 sk = cookie_v4_check(sk, skb); 1864 #endif 1865 return sk; 1866 } 1867 1868 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph, 1869 struct tcphdr *th, u32 *cookie) 1870 { 1871 u16 mss = 0; 1872 #ifdef CONFIG_SYN_COOKIES 1873 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops, 1874 &tcp_request_sock_ipv4_ops, sk, th); 1875 if (mss) { 1876 *cookie = __cookie_v4_init_sequence(iph, th, &mss); 1877 tcp_synq_overflow(sk); 1878 } 1879 #endif 1880 return mss; 1881 } 1882 1883 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, 1884 u32)); 1885 /* The socket must have it's spinlock held when we get 1886 * here, unless it is a TCP_LISTEN socket. 1887 * 1888 * We have a potential double-lock case here, so even when 1889 * doing backlog processing we use the BH locking scheme. 1890 * This is because we cannot sleep with the original spinlock 1891 * held. 1892 */ 1893 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) 1894 { 1895 enum skb_drop_reason reason; 1896 struct sock *rsk; 1897 1898 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ 1899 struct dst_entry *dst; 1900 1901 dst = rcu_dereference_protected(sk->sk_rx_dst, 1902 lockdep_sock_is_held(sk)); 1903 1904 sock_rps_save_rxhash(sk, skb); 1905 sk_mark_napi_id(sk, skb); 1906 if (dst) { 1907 if (sk->sk_rx_dst_ifindex != skb->skb_iif || 1908 !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check, 1909 dst, 0)) { 1910 RCU_INIT_POINTER(sk->sk_rx_dst, NULL); 1911 dst_release(dst); 1912 } 1913 } 1914 tcp_rcv_established(sk, skb); 1915 return 0; 1916 } 1917 1918 if (tcp_checksum_complete(skb)) 1919 goto csum_err; 1920 1921 if (sk->sk_state == TCP_LISTEN) { 1922 struct sock *nsk = tcp_v4_cookie_check(sk, skb); 1923 1924 if (!nsk) 1925 return 0; 1926 if (nsk != sk) { 1927 reason = tcp_child_process(sk, nsk, skb); 1928 if (reason) { 1929 rsk = nsk; 1930 goto reset; 1931 } 1932 return 0; 1933 } 1934 } else 1935 sock_rps_save_rxhash(sk, skb); 1936 1937 reason = tcp_rcv_state_process(sk, skb); 1938 if (reason) { 1939 rsk = sk; 1940 goto reset; 1941 } 1942 return 0; 1943 1944 reset: 1945 tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason)); 1946 discard: 1947 kfree_skb_reason(skb, reason); 1948 /* Be careful here. If this function gets more complicated and 1949 * gcc suffers from register pressure on the x86, sk (in %ebx) 1950 * might be destroyed here. This current version compiles correctly, 1951 * but you have been warned. 
1952 */ 1953 return 0; 1954 1955 csum_err: 1956 reason = SKB_DROP_REASON_TCP_CSUM; 1957 trace_tcp_bad_csum(skb); 1958 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 1959 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 1960 goto discard; 1961 } 1962 EXPORT_SYMBOL(tcp_v4_do_rcv); 1963 1964 int tcp_v4_early_demux(struct sk_buff *skb) 1965 { 1966 struct net *net = dev_net(skb->dev); 1967 const struct iphdr *iph; 1968 const struct tcphdr *th; 1969 struct sock *sk; 1970 1971 if (skb->pkt_type != PACKET_HOST) 1972 return 0; 1973 1974 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) 1975 return 0; 1976 1977 iph = ip_hdr(skb); 1978 th = tcp_hdr(skb); 1979 1980 if (th->doff < sizeof(struct tcphdr) / 4) 1981 return 0; 1982 1983 sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, 1984 iph->saddr, th->source, 1985 iph->daddr, ntohs(th->dest), 1986 skb->skb_iif, inet_sdif(skb)); 1987 if (sk) { 1988 skb->sk = sk; 1989 skb->destructor = sock_edemux; 1990 if (sk_fullsock(sk)) { 1991 struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst); 1992 1993 if (dst) 1994 dst = dst_check(dst, 0); 1995 if (dst && 1996 sk->sk_rx_dst_ifindex == skb->skb_iif) 1997 skb_dst_set_noref(skb, dst); 1998 } 1999 } 2000 return 0; 2001 } 2002 2003 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, 2004 enum skb_drop_reason *reason) 2005 { 2006 u32 tail_gso_size, tail_gso_segs; 2007 struct skb_shared_info *shinfo; 2008 const struct tcphdr *th; 2009 struct tcphdr *thtail; 2010 struct sk_buff *tail; 2011 unsigned int hdrlen; 2012 bool fragstolen; 2013 u32 gso_segs; 2014 u32 gso_size; 2015 u64 limit; 2016 int delta; 2017 2018 /* In case all data was pulled from skb frags (in __pskb_pull_tail()), 2019 * we can fix skb->truesize to its real value to avoid future drops. 2020 * This is valid because skb is not yet charged to the socket. 2021 * It has been noticed pure SACK packets were sometimes dropped 2022 * (if cooked by drivers without copybreak feature). 2023 */ 2024 skb_condense(skb); 2025 2026 skb_dst_drop(skb); 2027 2028 if (unlikely(tcp_checksum_complete(skb))) { 2029 bh_unlock_sock(sk); 2030 trace_tcp_bad_csum(skb); 2031 *reason = SKB_DROP_REASON_TCP_CSUM; 2032 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 2033 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 2034 return true; 2035 } 2036 2037 /* Attempt coalescing to last skb in backlog, even if we are 2038 * above the limits. 2039 * This is okay because skb capacity is limited to MAX_SKB_FRAGS. 
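	 * Coalescing only happens when the new segment directly follows the
	 * backlog tail (end_seq == seq), carries the same IP DSCP/ECN byte,
	 * the same TCP header length and option bytes, has ACK set on both
	 * segments, no SYN/RST/URG on either, and matching ECE/CWR flags
	 * (see the full list of checks below).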
2040 */ 2041 th = (const struct tcphdr *)skb->data; 2042 hdrlen = th->doff * 4; 2043 2044 tail = sk->sk_backlog.tail; 2045 if (!tail) 2046 goto no_coalesce; 2047 thtail = (struct tcphdr *)tail->data; 2048 2049 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq || 2050 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield || 2051 ((TCP_SKB_CB(tail)->tcp_flags | 2052 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) || 2053 !((TCP_SKB_CB(tail)->tcp_flags & 2054 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) || 2055 ((TCP_SKB_CB(tail)->tcp_flags ^ 2056 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) || 2057 !mptcp_skb_can_collapse(tail, skb) || 2058 skb_cmp_decrypted(tail, skb) || 2059 thtail->doff != th->doff || 2060 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th))) 2061 goto no_coalesce; 2062 2063 __skb_pull(skb, hdrlen); 2064 2065 shinfo = skb_shinfo(skb); 2066 gso_size = shinfo->gso_size ?: skb->len; 2067 gso_segs = shinfo->gso_segs ?: 1; 2068 2069 shinfo = skb_shinfo(tail); 2070 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen); 2071 tail_gso_segs = shinfo->gso_segs ?: 1; 2072 2073 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) { 2074 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq; 2075 2076 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) { 2077 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq; 2078 thtail->window = th->window; 2079 } 2080 2081 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and 2082 * thtail->fin, so that the fast path in tcp_rcv_established() 2083 * is not entered if we append a packet with a FIN. 2084 * SYN, RST, URG are not present. 2085 * ACK is set on both packets. 2086 * PSH : we do not really care in TCP stack, 2087 * at least for 'GRO' packets. 2088 */ 2089 thtail->fin |= th->fin; 2090 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; 2091 2092 if (TCP_SKB_CB(skb)->has_rxtstamp) { 2093 TCP_SKB_CB(tail)->has_rxtstamp = true; 2094 tail->tstamp = skb->tstamp; 2095 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp; 2096 } 2097 2098 /* Not as strict as GRO. We only need to carry mss max value */ 2099 shinfo->gso_size = max(gso_size, tail_gso_size); 2100 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF); 2101 2102 sk->sk_backlog.len += delta; 2103 __NET_INC_STATS(sock_net(sk), 2104 LINUX_MIB_TCPBACKLOGCOALESCE); 2105 kfree_skb_partial(skb, fragstolen); 2106 return false; 2107 } 2108 __skb_push(skb, hdrlen); 2109 2110 no_coalesce: 2111 /* sk->sk_backlog.len is reset only at the end of __release_sock(). 2112 * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach 2113 * sk_rcvbuf in normal conditions. 2114 */ 2115 limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1; 2116 2117 limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1; 2118 2119 /* Only socket owner can try to collapse/prune rx queues 2120 * to reduce memory overhead, so add a little headroom here. 2121 * Few sockets backlog are possibly concurrently non empty. 
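	 * Purely as an illustration (values are hypothetical): with
	 * sk_rcvbuf = 1 MB and sk_sndbuf = 256 KB, the backlog cap works
	 * out to 2 * 1 MB + 128 KB + 64 KB = 2240 KB, clamped to UINT_MAX.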
2122 */ 2123 limit += 64 * 1024; 2124 2125 limit = min_t(u64, limit, UINT_MAX); 2126 2127 if (unlikely(sk_add_backlog(sk, skb, limit))) { 2128 bh_unlock_sock(sk); 2129 *reason = SKB_DROP_REASON_SOCKET_BACKLOG; 2130 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP); 2131 return true; 2132 } 2133 return false; 2134 } 2135 EXPORT_SYMBOL(tcp_add_backlog); 2136 2137 int tcp_filter(struct sock *sk, struct sk_buff *skb) 2138 { 2139 struct tcphdr *th = (struct tcphdr *)skb->data; 2140 2141 return sk_filter_trim_cap(sk, skb, th->doff * 4); 2142 } 2143 EXPORT_SYMBOL(tcp_filter); 2144 2145 static void tcp_v4_restore_cb(struct sk_buff *skb) 2146 { 2147 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4, 2148 sizeof(struct inet_skb_parm)); 2149 } 2150 2151 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, 2152 const struct tcphdr *th) 2153 { 2154 /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB() 2155 * barrier() makes sure compiler wont play fool^Waliasing games. 2156 */ 2157 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb), 2158 sizeof(struct inet_skb_parm)); 2159 barrier(); 2160 2161 TCP_SKB_CB(skb)->seq = ntohl(th->seq); 2162 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + 2163 skb->len - th->doff * 4); 2164 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); 2165 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); 2166 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); 2167 TCP_SKB_CB(skb)->sacked = 0; 2168 TCP_SKB_CB(skb)->has_rxtstamp = 2169 skb->tstamp || skb_hwtstamps(skb)->hwtstamp; 2170 } 2171 2172 /* 2173 * From tcp_input.c 2174 */ 2175 2176 int tcp_v4_rcv(struct sk_buff *skb) 2177 { 2178 struct net *net = dev_net(skb->dev); 2179 enum skb_drop_reason drop_reason; 2180 int sdif = inet_sdif(skb); 2181 int dif = inet_iif(skb); 2182 const struct iphdr *iph; 2183 const struct tcphdr *th; 2184 bool refcounted; 2185 struct sock *sk; 2186 int ret; 2187 u32 isn; 2188 2189 drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; 2190 if (skb->pkt_type != PACKET_HOST) 2191 goto discard_it; 2192 2193 /* Count it even if it's bad */ 2194 __TCP_INC_STATS(net, TCP_MIB_INSEGS); 2195 2196 if (!pskb_may_pull(skb, sizeof(struct tcphdr))) 2197 goto discard_it; 2198 2199 th = (const struct tcphdr *)skb->data; 2200 2201 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) { 2202 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL; 2203 goto bad_packet; 2204 } 2205 if (!pskb_may_pull(skb, th->doff * 4)) 2206 goto discard_it; 2207 2208 /* An explanation is required here, I think. 2209 * Packet length and doff are validated by header prediction, 2210 * provided case of th->doff==0 is eliminated. 2211 * So, we defer the checks. 
*/ 2212 2213 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo)) 2214 goto csum_error; 2215 2216 th = (const struct tcphdr *)skb->data; 2217 iph = ip_hdr(skb); 2218 lookup: 2219 sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo, 2220 skb, __tcp_hdrlen(th), th->source, 2221 th->dest, sdif, &refcounted); 2222 if (!sk) 2223 goto no_tcp_socket; 2224 2225 if (sk->sk_state == TCP_TIME_WAIT) 2226 goto do_time_wait; 2227 2228 if (sk->sk_state == TCP_NEW_SYN_RECV) { 2229 struct request_sock *req = inet_reqsk(sk); 2230 bool req_stolen = false; 2231 struct sock *nsk; 2232 2233 sk = req->rsk_listener; 2234 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) 2235 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2236 else 2237 drop_reason = tcp_inbound_hash(sk, req, skb, 2238 &iph->saddr, &iph->daddr, 2239 AF_INET, dif, sdif); 2240 if (unlikely(drop_reason)) { 2241 sk_drops_add(sk, skb); 2242 reqsk_put(req); 2243 goto discard_it; 2244 } 2245 if (tcp_checksum_complete(skb)) { 2246 reqsk_put(req); 2247 goto csum_error; 2248 } 2249 if (unlikely(sk->sk_state != TCP_LISTEN)) { 2250 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb); 2251 if (!nsk) { 2252 inet_csk_reqsk_queue_drop_and_put(sk, req); 2253 goto lookup; 2254 } 2255 sk = nsk; 2256 /* reuseport_migrate_sock() has already held one sk_refcnt 2257 * before returning. 2258 */ 2259 } else { 2260 /* We own a reference on the listener, increase it again 2261 * as we might lose it too soon. 2262 */ 2263 sock_hold(sk); 2264 } 2265 refcounted = true; 2266 nsk = NULL; 2267 if (!tcp_filter(sk, skb)) { 2268 th = (const struct tcphdr *)skb->data; 2269 iph = ip_hdr(skb); 2270 tcp_v4_fill_cb(skb, iph, th); 2271 nsk = tcp_check_req(sk, skb, req, false, &req_stolen); 2272 } else { 2273 drop_reason = SKB_DROP_REASON_SOCKET_FILTER; 2274 } 2275 if (!nsk) { 2276 reqsk_put(req); 2277 if (req_stolen) { 2278 /* Another cpu got exclusive access to req 2279 * and created a full blown socket. 2280 * Try to feed this packet to this socket 2281 * instead of discarding it. 
2282 */ 2283 tcp_v4_restore_cb(skb); 2284 sock_put(sk); 2285 goto lookup; 2286 } 2287 goto discard_and_relse; 2288 } 2289 nf_reset_ct(skb); 2290 if (nsk == sk) { 2291 reqsk_put(req); 2292 tcp_v4_restore_cb(skb); 2293 } else { 2294 drop_reason = tcp_child_process(sk, nsk, skb); 2295 if (drop_reason) { 2296 enum sk_rst_reason rst_reason; 2297 2298 rst_reason = sk_rst_convert_drop_reason(drop_reason); 2299 tcp_v4_send_reset(nsk, skb, rst_reason); 2300 goto discard_and_relse; 2301 } 2302 sock_put(sk); 2303 return 0; 2304 } 2305 } 2306 2307 process: 2308 if (static_branch_unlikely(&ip4_min_ttl)) { 2309 /* min_ttl can be changed concurrently from do_ip_setsockopt() */ 2310 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) { 2311 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 2312 drop_reason = SKB_DROP_REASON_TCP_MINTTL; 2313 goto discard_and_relse; 2314 } 2315 } 2316 2317 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { 2318 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2319 goto discard_and_relse; 2320 } 2321 2322 drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr, 2323 AF_INET, dif, sdif); 2324 if (drop_reason) 2325 goto discard_and_relse; 2326 2327 nf_reset_ct(skb); 2328 2329 if (tcp_filter(sk, skb)) { 2330 drop_reason = SKB_DROP_REASON_SOCKET_FILTER; 2331 goto discard_and_relse; 2332 } 2333 th = (const struct tcphdr *)skb->data; 2334 iph = ip_hdr(skb); 2335 tcp_v4_fill_cb(skb, iph, th); 2336 2337 skb->dev = NULL; 2338 2339 if (sk->sk_state == TCP_LISTEN) { 2340 ret = tcp_v4_do_rcv(sk, skb); 2341 goto put_and_return; 2342 } 2343 2344 sk_incoming_cpu_update(sk); 2345 2346 bh_lock_sock_nested(sk); 2347 tcp_segs_in(tcp_sk(sk), skb); 2348 ret = 0; 2349 if (!sock_owned_by_user(sk)) { 2350 ret = tcp_v4_do_rcv(sk, skb); 2351 } else { 2352 if (tcp_add_backlog(sk, skb, &drop_reason)) 2353 goto discard_and_relse; 2354 } 2355 bh_unlock_sock(sk); 2356 2357 put_and_return: 2358 if (refcounted) 2359 sock_put(sk); 2360 2361 return ret; 2362 2363 no_tcp_socket: 2364 drop_reason = SKB_DROP_REASON_NO_SOCKET; 2365 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) 2366 goto discard_it; 2367 2368 tcp_v4_fill_cb(skb, iph, th); 2369 2370 if (tcp_checksum_complete(skb)) { 2371 csum_error: 2372 drop_reason = SKB_DROP_REASON_TCP_CSUM; 2373 trace_tcp_bad_csum(skb); 2374 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); 2375 bad_packet: 2376 __TCP_INC_STATS(net, TCP_MIB_INERRS); 2377 } else { 2378 tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason)); 2379 } 2380 2381 discard_it: 2382 SKB_DR_OR(drop_reason, NOT_SPECIFIED); 2383 /* Discard frame. 
*/ 2384 kfree_skb_reason(skb, drop_reason); 2385 return 0; 2386 2387 discard_and_relse: 2388 sk_drops_add(sk, skb); 2389 if (refcounted) 2390 sock_put(sk); 2391 goto discard_it; 2392 2393 do_time_wait: 2394 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { 2395 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2396 inet_twsk_put(inet_twsk(sk)); 2397 goto discard_it; 2398 } 2399 2400 tcp_v4_fill_cb(skb, iph, th); 2401 2402 if (tcp_checksum_complete(skb)) { 2403 inet_twsk_put(inet_twsk(sk)); 2404 goto csum_error; 2405 } 2406 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn)) { 2407 case TCP_TW_SYN: { 2408 struct sock *sk2 = inet_lookup_listener(net, 2409 net->ipv4.tcp_death_row.hashinfo, 2410 skb, __tcp_hdrlen(th), 2411 iph->saddr, th->source, 2412 iph->daddr, th->dest, 2413 inet_iif(skb), 2414 sdif); 2415 if (sk2) { 2416 inet_twsk_deschedule_put(inet_twsk(sk)); 2417 sk = sk2; 2418 tcp_v4_restore_cb(skb); 2419 refcounted = false; 2420 __this_cpu_write(tcp_tw_isn, isn); 2421 goto process; 2422 } 2423 } 2424 /* to ACK */ 2425 fallthrough; 2426 case TCP_TW_ACK: 2427 tcp_v4_timewait_ack(sk, skb); 2428 break; 2429 case TCP_TW_RST: 2430 tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET); 2431 inet_twsk_deschedule_put(inet_twsk(sk)); 2432 goto discard_it; 2433 case TCP_TW_SUCCESS:; 2434 } 2435 goto discard_it; 2436 } 2437 2438 static struct timewait_sock_ops tcp_timewait_sock_ops = { 2439 .twsk_obj_size = sizeof(struct tcp_timewait_sock), 2440 .twsk_destructor= tcp_twsk_destructor, 2441 }; 2442 2443 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) 2444 { 2445 struct dst_entry *dst = skb_dst(skb); 2446 2447 if (dst && dst_hold_safe(dst)) { 2448 rcu_assign_pointer(sk->sk_rx_dst, dst); 2449 sk->sk_rx_dst_ifindex = skb->skb_iif; 2450 } 2451 } 2452 EXPORT_SYMBOL(inet_sk_rx_dst_set); 2453 2454 const struct inet_connection_sock_af_ops ipv4_specific = { 2455 .queue_xmit = ip_queue_xmit, 2456 .send_check = tcp_v4_send_check, 2457 .rebuild_header = inet_sk_rebuild_header, 2458 .sk_rx_dst_set = inet_sk_rx_dst_set, 2459 .conn_request = tcp_v4_conn_request, 2460 .syn_recv_sock = tcp_v4_syn_recv_sock, 2461 .net_header_len = sizeof(struct iphdr), 2462 .setsockopt = ip_setsockopt, 2463 .getsockopt = ip_getsockopt, 2464 .addr2sockaddr = inet_csk_addr2sockaddr, 2465 .sockaddr_len = sizeof(struct sockaddr_in), 2466 .mtu_reduced = tcp_v4_mtu_reduced, 2467 }; 2468 EXPORT_SYMBOL(ipv4_specific); 2469 2470 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) 2471 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { 2472 #ifdef CONFIG_TCP_MD5SIG 2473 .md5_lookup = tcp_v4_md5_lookup, 2474 .calc_md5_hash = tcp_v4_md5_hash_skb, 2475 .md5_parse = tcp_v4_parse_md5_keys, 2476 #endif 2477 #ifdef CONFIG_TCP_AO 2478 .ao_lookup = tcp_v4_ao_lookup, 2479 .calc_ao_hash = tcp_v4_ao_hash_skb, 2480 .ao_parse = tcp_v4_parse_ao, 2481 .ao_calc_key_sk = tcp_v4_ao_calc_key_sk, 2482 #endif 2483 }; 2484 #endif 2485 2486 /* NOTE: A lot of things set to zero explicitly by call to 2487 * sk_alloc() so need not be done here. 
2488 */ 2489 static int tcp_v4_init_sock(struct sock *sk) 2490 { 2491 struct inet_connection_sock *icsk = inet_csk(sk); 2492 2493 tcp_init_sock(sk); 2494 2495 icsk->icsk_af_ops = &ipv4_specific; 2496 2497 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) 2498 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific; 2499 #endif 2500 2501 return 0; 2502 } 2503 2504 #ifdef CONFIG_TCP_MD5SIG 2505 static void tcp_md5sig_info_free_rcu(struct rcu_head *head) 2506 { 2507 struct tcp_md5sig_info *md5sig; 2508 2509 md5sig = container_of(head, struct tcp_md5sig_info, rcu); 2510 kfree(md5sig); 2511 static_branch_slow_dec_deferred(&tcp_md5_needed); 2512 tcp_md5_release_sigpool(); 2513 } 2514 #endif 2515 2516 void tcp_v4_destroy_sock(struct sock *sk) 2517 { 2518 struct tcp_sock *tp = tcp_sk(sk); 2519 2520 trace_tcp_destroy_sock(sk); 2521 2522 tcp_clear_xmit_timers(sk); 2523 2524 tcp_cleanup_congestion_control(sk); 2525 2526 tcp_cleanup_ulp(sk); 2527 2528 /* Cleanup up the write buffer. */ 2529 tcp_write_queue_purge(sk); 2530 2531 /* Check if we want to disable active TFO */ 2532 tcp_fastopen_active_disable_ofo_check(sk); 2533 2534 /* Cleans up our, hopefully empty, out_of_order_queue. */ 2535 skb_rbtree_purge(&tp->out_of_order_queue); 2536 2537 #ifdef CONFIG_TCP_MD5SIG 2538 /* Clean up the MD5 key list, if any */ 2539 if (tp->md5sig_info) { 2540 struct tcp_md5sig_info *md5sig; 2541 2542 md5sig = rcu_dereference_protected(tp->md5sig_info, 1); 2543 tcp_clear_md5_list(sk); 2544 call_rcu(&md5sig->rcu, tcp_md5sig_info_free_rcu); 2545 rcu_assign_pointer(tp->md5sig_info, NULL); 2546 } 2547 #endif 2548 tcp_ao_destroy_sock(sk, false); 2549 2550 /* Clean up a referenced TCP bind bucket. */ 2551 if (inet_csk(sk)->icsk_bind_hash) 2552 inet_put_port(sk); 2553 2554 BUG_ON(rcu_access_pointer(tp->fastopen_rsk)); 2555 2556 /* If socket is aborted during connect operation */ 2557 tcp_free_fastopen_req(tp); 2558 tcp_fastopen_destroy_cipher(sk); 2559 tcp_saved_syn_free(tp); 2560 2561 sk_sockets_allocated_dec(sk); 2562 } 2563 EXPORT_SYMBOL(tcp_v4_destroy_sock); 2564 2565 #ifdef CONFIG_PROC_FS 2566 /* Proc filesystem TCP sock list dumping. */ 2567 2568 static unsigned short seq_file_family(const struct seq_file *seq); 2569 2570 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk) 2571 { 2572 unsigned short family = seq_file_family(seq); 2573 2574 /* AF_UNSPEC is used as a match all */ 2575 return ((family == AF_UNSPEC || family == sk->sk_family) && 2576 net_eq(sock_net(sk), seq_file_net(seq))); 2577 } 2578 2579 /* Find a non empty bucket (starting from st->bucket) 2580 * and return the first sk from it. 2581 */ 2582 static void *listening_get_first(struct seq_file *seq) 2583 { 2584 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2585 struct tcp_iter_state *st = seq->private; 2586 2587 st->offset = 0; 2588 for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) { 2589 struct inet_listen_hashbucket *ilb2; 2590 struct hlist_nulls_node *node; 2591 struct sock *sk; 2592 2593 ilb2 = &hinfo->lhash2[st->bucket]; 2594 if (hlist_nulls_empty(&ilb2->nulls_head)) 2595 continue; 2596 2597 spin_lock(&ilb2->lock); 2598 sk_nulls_for_each(sk, node, &ilb2->nulls_head) { 2599 if (seq_sk_match(seq, sk)) 2600 return sk; 2601 } 2602 spin_unlock(&ilb2->lock); 2603 } 2604 2605 return NULL; 2606 } 2607 2608 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket). 
2609 * If "cur" is the last one in the st->bucket, 2610 * call listening_get_first() to return the first sk of the next 2611 * non empty bucket. 2612 */ 2613 static void *listening_get_next(struct seq_file *seq, void *cur) 2614 { 2615 struct tcp_iter_state *st = seq->private; 2616 struct inet_listen_hashbucket *ilb2; 2617 struct hlist_nulls_node *node; 2618 struct inet_hashinfo *hinfo; 2619 struct sock *sk = cur; 2620 2621 ++st->num; 2622 ++st->offset; 2623 2624 sk = sk_nulls_next(sk); 2625 sk_nulls_for_each_from(sk, node) { 2626 if (seq_sk_match(seq, sk)) 2627 return sk; 2628 } 2629 2630 hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2631 ilb2 = &hinfo->lhash2[st->bucket]; 2632 spin_unlock(&ilb2->lock); 2633 ++st->bucket; 2634 return listening_get_first(seq); 2635 } 2636 2637 static void *listening_get_idx(struct seq_file *seq, loff_t *pos) 2638 { 2639 struct tcp_iter_state *st = seq->private; 2640 void *rc; 2641 2642 st->bucket = 0; 2643 st->offset = 0; 2644 rc = listening_get_first(seq); 2645 2646 while (rc && *pos) { 2647 rc = listening_get_next(seq, rc); 2648 --*pos; 2649 } 2650 return rc; 2651 } 2652 2653 static inline bool empty_bucket(struct inet_hashinfo *hinfo, 2654 const struct tcp_iter_state *st) 2655 { 2656 return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain); 2657 } 2658 2659 /* 2660 * Get first established socket starting from bucket given in st->bucket. 2661 * If st->bucket is zero, the very first socket in the hash is returned. 2662 */ 2663 static void *established_get_first(struct seq_file *seq) 2664 { 2665 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2666 struct tcp_iter_state *st = seq->private; 2667 2668 st->offset = 0; 2669 for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) { 2670 struct sock *sk; 2671 struct hlist_nulls_node *node; 2672 spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket); 2673 2674 cond_resched(); 2675 2676 /* Lockless fast path for the common case of empty buckets */ 2677 if (empty_bucket(hinfo, st)) 2678 continue; 2679 2680 spin_lock_bh(lock); 2681 sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) { 2682 if (seq_sk_match(seq, sk)) 2683 return sk; 2684 } 2685 spin_unlock_bh(lock); 2686 } 2687 2688 return NULL; 2689 } 2690 2691 static void *established_get_next(struct seq_file *seq, void *cur) 2692 { 2693 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2694 struct tcp_iter_state *st = seq->private; 2695 struct hlist_nulls_node *node; 2696 struct sock *sk = cur; 2697 2698 ++st->num; 2699 ++st->offset; 2700 2701 sk = sk_nulls_next(sk); 2702 2703 sk_nulls_for_each_from(sk, node) { 2704 if (seq_sk_match(seq, sk)) 2705 return sk; 2706 } 2707 2708 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 2709 ++st->bucket; 2710 return established_get_first(seq); 2711 } 2712 2713 static void *established_get_idx(struct seq_file *seq, loff_t pos) 2714 { 2715 struct tcp_iter_state *st = seq->private; 2716 void *rc; 2717 2718 st->bucket = 0; 2719 rc = established_get_first(seq); 2720 2721 while (rc && pos) { 2722 rc = established_get_next(seq, rc); 2723 --pos; 2724 } 2725 return rc; 2726 } 2727 2728 static void *tcp_get_idx(struct seq_file *seq, loff_t pos) 2729 { 2730 void *rc; 2731 struct tcp_iter_state *st = seq->private; 2732 2733 st->state = TCP_SEQ_STATE_LISTENING; 2734 rc = listening_get_idx(seq, &pos); 2735 2736 if (!rc) { 2737 st->state = TCP_SEQ_STATE_ESTABLISHED; 2738 rc = established_get_idx(seq, pos); 2739 } 2740 2741 return rc; 2742 } 2743 2744 static void 
*tcp_seek_last_pos(struct seq_file *seq) 2745 { 2746 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2747 struct tcp_iter_state *st = seq->private; 2748 int bucket = st->bucket; 2749 int offset = st->offset; 2750 int orig_num = st->num; 2751 void *rc = NULL; 2752 2753 switch (st->state) { 2754 case TCP_SEQ_STATE_LISTENING: 2755 if (st->bucket > hinfo->lhash2_mask) 2756 break; 2757 rc = listening_get_first(seq); 2758 while (offset-- && rc && bucket == st->bucket) 2759 rc = listening_get_next(seq, rc); 2760 if (rc) 2761 break; 2762 st->bucket = 0; 2763 st->state = TCP_SEQ_STATE_ESTABLISHED; 2764 fallthrough; 2765 case TCP_SEQ_STATE_ESTABLISHED: 2766 if (st->bucket > hinfo->ehash_mask) 2767 break; 2768 rc = established_get_first(seq); 2769 while (offset-- && rc && bucket == st->bucket) 2770 rc = established_get_next(seq, rc); 2771 } 2772 2773 st->num = orig_num; 2774 2775 return rc; 2776 } 2777 2778 void *tcp_seq_start(struct seq_file *seq, loff_t *pos) 2779 { 2780 struct tcp_iter_state *st = seq->private; 2781 void *rc; 2782 2783 if (*pos && *pos == st->last_pos) { 2784 rc = tcp_seek_last_pos(seq); 2785 if (rc) 2786 goto out; 2787 } 2788 2789 st->state = TCP_SEQ_STATE_LISTENING; 2790 st->num = 0; 2791 st->bucket = 0; 2792 st->offset = 0; 2793 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; 2794 2795 out: 2796 st->last_pos = *pos; 2797 return rc; 2798 } 2799 EXPORT_SYMBOL(tcp_seq_start); 2800 2801 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2802 { 2803 struct tcp_iter_state *st = seq->private; 2804 void *rc = NULL; 2805 2806 if (v == SEQ_START_TOKEN) { 2807 rc = tcp_get_idx(seq, 0); 2808 goto out; 2809 } 2810 2811 switch (st->state) { 2812 case TCP_SEQ_STATE_LISTENING: 2813 rc = listening_get_next(seq, v); 2814 if (!rc) { 2815 st->state = TCP_SEQ_STATE_ESTABLISHED; 2816 st->bucket = 0; 2817 st->offset = 0; 2818 rc = established_get_first(seq); 2819 } 2820 break; 2821 case TCP_SEQ_STATE_ESTABLISHED: 2822 rc = established_get_next(seq, v); 2823 break; 2824 } 2825 out: 2826 ++*pos; 2827 st->last_pos = *pos; 2828 return rc; 2829 } 2830 EXPORT_SYMBOL(tcp_seq_next); 2831 2832 void tcp_seq_stop(struct seq_file *seq, void *v) 2833 { 2834 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2835 struct tcp_iter_state *st = seq->private; 2836 2837 switch (st->state) { 2838 case TCP_SEQ_STATE_LISTENING: 2839 if (v != SEQ_START_TOKEN) 2840 spin_unlock(&hinfo->lhash2[st->bucket].lock); 2841 break; 2842 case TCP_SEQ_STATE_ESTABLISHED: 2843 if (v) 2844 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 2845 break; 2846 } 2847 } 2848 EXPORT_SYMBOL(tcp_seq_stop); 2849 2850 static void get_openreq4(const struct request_sock *req, 2851 struct seq_file *f, int i) 2852 { 2853 const struct inet_request_sock *ireq = inet_rsk(req); 2854 long delta = req->rsk_timer.expires - jiffies; 2855 2856 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2857 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK", 2858 i, 2859 ireq->ir_loc_addr, 2860 ireq->ir_num, 2861 ireq->ir_rmt_addr, 2862 ntohs(ireq->ir_rmt_port), 2863 TCP_SYN_RECV, 2864 0, 0, /* could print option size, but that is af dependent. 
*/ 2865 1, /* timers active (only the expire timer) */ 2866 jiffies_delta_to_clock_t(delta), 2867 req->num_timeout, 2868 from_kuid_munged(seq_user_ns(f), 2869 sock_i_uid(req->rsk_listener)), 2870 0, /* non standard timer */ 2871 0, /* open_requests have no inode */ 2872 0, 2873 req); 2874 } 2875 2876 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) 2877 { 2878 int timer_active; 2879 unsigned long timer_expires; 2880 const struct tcp_sock *tp = tcp_sk(sk); 2881 const struct inet_connection_sock *icsk = inet_csk(sk); 2882 const struct inet_sock *inet = inet_sk(sk); 2883 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; 2884 __be32 dest = inet->inet_daddr; 2885 __be32 src = inet->inet_rcv_saddr; 2886 __u16 destp = ntohs(inet->inet_dport); 2887 __u16 srcp = ntohs(inet->inet_sport); 2888 int rx_queue; 2889 int state; 2890 2891 if (icsk->icsk_pending == ICSK_TIME_RETRANS || 2892 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || 2893 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { 2894 timer_active = 1; 2895 timer_expires = icsk->icsk_timeout; 2896 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { 2897 timer_active = 4; 2898 timer_expires = icsk->icsk_timeout; 2899 } else if (timer_pending(&sk->sk_timer)) { 2900 timer_active = 2; 2901 timer_expires = sk->sk_timer.expires; 2902 } else { 2903 timer_active = 0; 2904 timer_expires = jiffies; 2905 } 2906 2907 state = inet_sk_state_load(sk); 2908 if (state == TCP_LISTEN) 2909 rx_queue = READ_ONCE(sk->sk_ack_backlog); 2910 else 2911 /* Because we don't lock the socket, 2912 * we might find a transient negative value. 2913 */ 2914 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) - 2915 READ_ONCE(tp->copied_seq), 0); 2916 2917 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " 2918 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d", 2919 i, src, srcp, dest, destp, state, 2920 READ_ONCE(tp->write_seq) - tp->snd_una, 2921 rx_queue, 2922 timer_active, 2923 jiffies_delta_to_clock_t(timer_expires - jiffies), 2924 icsk->icsk_retransmits, 2925 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)), 2926 icsk->icsk_probes_out, 2927 sock_i_ino(sk), 2928 refcount_read(&sk->sk_refcnt), sk, 2929 jiffies_to_clock_t(icsk->icsk_rto), 2930 jiffies_to_clock_t(icsk->icsk_ack.ato), 2931 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk), 2932 tcp_snd_cwnd(tp), 2933 state == TCP_LISTEN ? 2934 fastopenq->max_qlen : 2935 (tcp_in_initial_slowstart(tp) ? 
-1 : tp->snd_ssthresh)); 2936 } 2937 2938 static void get_timewait4_sock(const struct inet_timewait_sock *tw, 2939 struct seq_file *f, int i) 2940 { 2941 long delta = tw->tw_timer.expires - jiffies; 2942 __be32 dest, src; 2943 __u16 destp, srcp; 2944 2945 dest = tw->tw_daddr; 2946 src = tw->tw_rcv_saddr; 2947 destp = ntohs(tw->tw_dport); 2948 srcp = ntohs(tw->tw_sport); 2949 2950 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2951 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK", 2952 i, src, srcp, dest, destp, tw->tw_substate, 0, 0, 2953 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, 2954 refcount_read(&tw->tw_refcnt), tw); 2955 } 2956 2957 #define TMPSZ 150 2958 2959 static int tcp4_seq_show(struct seq_file *seq, void *v) 2960 { 2961 struct tcp_iter_state *st; 2962 struct sock *sk = v; 2963 2964 seq_setwidth(seq, TMPSZ - 1); 2965 if (v == SEQ_START_TOKEN) { 2966 seq_puts(seq, " sl local_address rem_address st tx_queue " 2967 "rx_queue tr tm->when retrnsmt uid timeout " 2968 "inode"); 2969 goto out; 2970 } 2971 st = seq->private; 2972 2973 if (sk->sk_state == TCP_TIME_WAIT) 2974 get_timewait4_sock(v, seq, st->num); 2975 else if (sk->sk_state == TCP_NEW_SYN_RECV) 2976 get_openreq4(v, seq, st->num); 2977 else 2978 get_tcp4_sock(v, seq, st->num); 2979 out: 2980 seq_pad(seq, '\n'); 2981 return 0; 2982 } 2983 2984 #ifdef CONFIG_BPF_SYSCALL 2985 struct bpf_tcp_iter_state { 2986 struct tcp_iter_state state; 2987 unsigned int cur_sk; 2988 unsigned int end_sk; 2989 unsigned int max_sk; 2990 struct sock **batch; 2991 bool st_bucket_done; 2992 }; 2993 2994 struct bpf_iter__tcp { 2995 __bpf_md_ptr(struct bpf_iter_meta *, meta); 2996 __bpf_md_ptr(struct sock_common *, sk_common); 2997 uid_t uid __aligned(8); 2998 }; 2999 3000 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, 3001 struct sock_common *sk_common, uid_t uid) 3002 { 3003 struct bpf_iter__tcp ctx; 3004 3005 meta->seq_num--; /* skip SEQ_START_TOKEN */ 3006 ctx.meta = meta; 3007 ctx.sk_common = sk_common; 3008 ctx.uid = uid; 3009 return bpf_iter_run_prog(prog, &ctx); 3010 } 3011 3012 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter) 3013 { 3014 while (iter->cur_sk < iter->end_sk) 3015 sock_gen_put(iter->batch[iter->cur_sk++]); 3016 } 3017 3018 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter, 3019 unsigned int new_batch_sz) 3020 { 3021 struct sock **new_batch; 3022 3023 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, 3024 GFP_USER | __GFP_NOWARN); 3025 if (!new_batch) 3026 return -ENOMEM; 3027 3028 bpf_iter_tcp_put_batch(iter); 3029 kvfree(iter->batch); 3030 iter->batch = new_batch; 3031 iter->max_sk = new_batch_sz; 3032 3033 return 0; 3034 } 3035 3036 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq, 3037 struct sock *start_sk) 3038 { 3039 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 3040 struct bpf_tcp_iter_state *iter = seq->private; 3041 struct tcp_iter_state *st = &iter->state; 3042 struct hlist_nulls_node *node; 3043 unsigned int expected = 1; 3044 struct sock *sk; 3045 3046 sock_hold(start_sk); 3047 iter->batch[iter->end_sk++] = start_sk; 3048 3049 sk = sk_nulls_next(start_sk); 3050 sk_nulls_for_each_from(sk, node) { 3051 if (seq_sk_match(seq, sk)) { 3052 if (iter->end_sk < iter->max_sk) { 3053 sock_hold(sk); 3054 iter->batch[iter->end_sk++] = sk; 3055 } 3056 expected++; 3057 } 3058 } 3059 spin_unlock(&hinfo->lhash2[st->bucket].lock); 3060 3061 return expected; 3062 } 3063 3064 static unsigned int 
bpf_iter_tcp_established_batch(struct seq_file *seq, 3065 struct sock *start_sk) 3066 { 3067 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 3068 struct bpf_tcp_iter_state *iter = seq->private; 3069 struct tcp_iter_state *st = &iter->state; 3070 struct hlist_nulls_node *node; 3071 unsigned int expected = 1; 3072 struct sock *sk; 3073 3074 sock_hold(start_sk); 3075 iter->batch[iter->end_sk++] = start_sk; 3076 3077 sk = sk_nulls_next(start_sk); 3078 sk_nulls_for_each_from(sk, node) { 3079 if (seq_sk_match(seq, sk)) { 3080 if (iter->end_sk < iter->max_sk) { 3081 sock_hold(sk); 3082 iter->batch[iter->end_sk++] = sk; 3083 } 3084 expected++; 3085 } 3086 } 3087 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 3088 3089 return expected; 3090 } 3091 3092 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq) 3093 { 3094 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 3095 struct bpf_tcp_iter_state *iter = seq->private; 3096 struct tcp_iter_state *st = &iter->state; 3097 unsigned int expected; 3098 bool resized = false; 3099 struct sock *sk; 3100 3101 /* The st->bucket is done. Directly advance to the next 3102 * bucket instead of having the tcp_seek_last_pos() to skip 3103 * one by one in the current bucket and eventually find out 3104 * it has to advance to the next bucket. 3105 */ 3106 if (iter->st_bucket_done) { 3107 st->offset = 0; 3108 st->bucket++; 3109 if (st->state == TCP_SEQ_STATE_LISTENING && 3110 st->bucket > hinfo->lhash2_mask) { 3111 st->state = TCP_SEQ_STATE_ESTABLISHED; 3112 st->bucket = 0; 3113 } 3114 } 3115 3116 again: 3117 /* Get a new batch */ 3118 iter->cur_sk = 0; 3119 iter->end_sk = 0; 3120 iter->st_bucket_done = false; 3121 3122 sk = tcp_seek_last_pos(seq); 3123 if (!sk) 3124 return NULL; /* Done */ 3125 3126 if (st->state == TCP_SEQ_STATE_LISTENING) 3127 expected = bpf_iter_tcp_listening_batch(seq, sk); 3128 else 3129 expected = bpf_iter_tcp_established_batch(seq, sk); 3130 3131 if (iter->end_sk == expected) { 3132 iter->st_bucket_done = true; 3133 return sk; 3134 } 3135 3136 if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) { 3137 resized = true; 3138 goto again; 3139 } 3140 3141 return sk; 3142 } 3143 3144 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos) 3145 { 3146 /* bpf iter does not support lseek, so it always 3147 * continue from where it was stop()-ped. 3148 */ 3149 if (*pos) 3150 return bpf_iter_tcp_batch(seq); 3151 3152 return SEQ_START_TOKEN; 3153 } 3154 3155 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3156 { 3157 struct bpf_tcp_iter_state *iter = seq->private; 3158 struct tcp_iter_state *st = &iter->state; 3159 struct sock *sk; 3160 3161 /* Whenever seq_next() is called, the iter->cur_sk is 3162 * done with seq_show(), so advance to the next sk in 3163 * the batch. 3164 */ 3165 if (iter->cur_sk < iter->end_sk) { 3166 /* Keeping st->num consistent in tcp_iter_state. 3167 * bpf_iter_tcp does not use st->num. 3168 * meta.seq_num is used instead. 3169 */ 3170 st->num++; 3171 /* Move st->offset to the next sk in the bucket such that 3172 * the future start() will resume at st->offset in 3173 * st->bucket. See tcp_seek_last_pos(). 3174 */ 3175 st->offset++; 3176 sock_gen_put(iter->batch[iter->cur_sk++]); 3177 } 3178 3179 if (iter->cur_sk < iter->end_sk) 3180 sk = iter->batch[iter->cur_sk]; 3181 else 3182 sk = bpf_iter_tcp_batch(seq); 3183 3184 ++*pos; 3185 /* Keeping st->last_pos consistent in tcp_iter_state. 
3186 * bpf iter does not do lseek, so st->last_pos always equals to *pos. 3187 */ 3188 st->last_pos = *pos; 3189 return sk; 3190 } 3191 3192 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v) 3193 { 3194 struct bpf_iter_meta meta; 3195 struct bpf_prog *prog; 3196 struct sock *sk = v; 3197 uid_t uid; 3198 int ret; 3199 3200 if (v == SEQ_START_TOKEN) 3201 return 0; 3202 3203 if (sk_fullsock(sk)) 3204 lock_sock(sk); 3205 3206 if (unlikely(sk_unhashed(sk))) { 3207 ret = SEQ_SKIP; 3208 goto unlock; 3209 } 3210 3211 if (sk->sk_state == TCP_TIME_WAIT) { 3212 uid = 0; 3213 } else if (sk->sk_state == TCP_NEW_SYN_RECV) { 3214 const struct request_sock *req = v; 3215 3216 uid = from_kuid_munged(seq_user_ns(seq), 3217 sock_i_uid(req->rsk_listener)); 3218 } else { 3219 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); 3220 } 3221 3222 meta.seq = seq; 3223 prog = bpf_iter_get_info(&meta, false); 3224 ret = tcp_prog_seq_show(prog, &meta, v, uid); 3225 3226 unlock: 3227 if (sk_fullsock(sk)) 3228 release_sock(sk); 3229 return ret; 3230 3231 } 3232 3233 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v) 3234 { 3235 struct bpf_tcp_iter_state *iter = seq->private; 3236 struct bpf_iter_meta meta; 3237 struct bpf_prog *prog; 3238 3239 if (!v) { 3240 meta.seq = seq; 3241 prog = bpf_iter_get_info(&meta, true); 3242 if (prog) 3243 (void)tcp_prog_seq_show(prog, &meta, v, 0); 3244 } 3245 3246 if (iter->cur_sk < iter->end_sk) { 3247 bpf_iter_tcp_put_batch(iter); 3248 iter->st_bucket_done = false; 3249 } 3250 } 3251 3252 static const struct seq_operations bpf_iter_tcp_seq_ops = { 3253 .show = bpf_iter_tcp_seq_show, 3254 .start = bpf_iter_tcp_seq_start, 3255 .next = bpf_iter_tcp_seq_next, 3256 .stop = bpf_iter_tcp_seq_stop, 3257 }; 3258 #endif 3259 static unsigned short seq_file_family(const struct seq_file *seq) 3260 { 3261 const struct tcp_seq_afinfo *afinfo; 3262 3263 #ifdef CONFIG_BPF_SYSCALL 3264 /* Iterated from bpf_iter. Let the bpf prog to filter instead. */ 3265 if (seq->op == &bpf_iter_tcp_seq_ops) 3266 return AF_UNSPEC; 3267 #endif 3268 3269 /* Iterated from proc fs */ 3270 afinfo = pde_data(file_inode(seq->file)); 3271 return afinfo->family; 3272 } 3273 3274 static const struct seq_operations tcp4_seq_ops = { 3275 .show = tcp4_seq_show, 3276 .start = tcp_seq_start, 3277 .next = tcp_seq_next, 3278 .stop = tcp_seq_stop, 3279 }; 3280 3281 static struct tcp_seq_afinfo tcp4_seq_afinfo = { 3282 .family = AF_INET, 3283 }; 3284 3285 static int __net_init tcp4_proc_init_net(struct net *net) 3286 { 3287 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops, 3288 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo)) 3289 return -ENOMEM; 3290 return 0; 3291 } 3292 3293 static void __net_exit tcp4_proc_exit_net(struct net *net) 3294 { 3295 remove_proc_entry("tcp", net->proc_net); 3296 } 3297 3298 static struct pernet_operations tcp4_net_ops = { 3299 .init = tcp4_proc_init_net, 3300 .exit = tcp4_proc_exit_net, 3301 }; 3302 3303 int __init tcp4_proc_init(void) 3304 { 3305 return register_pernet_subsys(&tcp4_net_ops); 3306 } 3307 3308 void tcp4_proc_exit(void) 3309 { 3310 unregister_pernet_subsys(&tcp4_net_ops); 3311 } 3312 #endif /* CONFIG_PROC_FS */ 3313 3314 /* @wake is one when sk_stream_write_space() calls us. 3315 * This sends EPOLLOUT only if notsent_bytes is half the limit. 3316 * This mimics the strategy used in sock_def_write_space(). 
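 * As a worked example with hypothetical numbers: if tcp_notsent_lowat(tp)
 * is 128 KB and @wake == 1, (notsent_bytes << 1) < 131072 only holds once
 * fewer than 64 KB remain unsent, so EPOLLOUT is reported at half the limit.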
3317 */ 3318 bool tcp_stream_memory_free(const struct sock *sk, int wake) 3319 { 3320 const struct tcp_sock *tp = tcp_sk(sk); 3321 u32 notsent_bytes = READ_ONCE(tp->write_seq) - 3322 READ_ONCE(tp->snd_nxt); 3323 3324 return (notsent_bytes << wake) < tcp_notsent_lowat(tp); 3325 } 3326 EXPORT_SYMBOL(tcp_stream_memory_free); 3327 3328 struct proto tcp_prot = { 3329 .name = "TCP", 3330 .owner = THIS_MODULE, 3331 .close = tcp_close, 3332 .pre_connect = tcp_v4_pre_connect, 3333 .connect = tcp_v4_connect, 3334 .disconnect = tcp_disconnect, 3335 .accept = inet_csk_accept, 3336 .ioctl = tcp_ioctl, 3337 .init = tcp_v4_init_sock, 3338 .destroy = tcp_v4_destroy_sock, 3339 .shutdown = tcp_shutdown, 3340 .setsockopt = tcp_setsockopt, 3341 .getsockopt = tcp_getsockopt, 3342 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt, 3343 .keepalive = tcp_set_keepalive, 3344 .recvmsg = tcp_recvmsg, 3345 .sendmsg = tcp_sendmsg, 3346 .splice_eof = tcp_splice_eof, 3347 .backlog_rcv = tcp_v4_do_rcv, 3348 .release_cb = tcp_release_cb, 3349 .hash = inet_hash, 3350 .unhash = inet_unhash, 3351 .get_port = inet_csk_get_port, 3352 .put_port = inet_put_port, 3353 #ifdef CONFIG_BPF_SYSCALL 3354 .psock_update_sk_prot = tcp_bpf_update_proto, 3355 #endif 3356 .enter_memory_pressure = tcp_enter_memory_pressure, 3357 .leave_memory_pressure = tcp_leave_memory_pressure, 3358 .stream_memory_free = tcp_stream_memory_free, 3359 .sockets_allocated = &tcp_sockets_allocated, 3360 .orphan_count = &tcp_orphan_count, 3361 3362 .memory_allocated = &tcp_memory_allocated, 3363 .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, 3364 3365 .memory_pressure = &tcp_memory_pressure, 3366 .sysctl_mem = sysctl_tcp_mem, 3367 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), 3368 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), 3369 .max_header = MAX_TCP_HEADER, 3370 .obj_size = sizeof(struct tcp_sock), 3371 .slab_flags = SLAB_TYPESAFE_BY_RCU, 3372 .twsk_prot = &tcp_timewait_sock_ops, 3373 .rsk_prot = &tcp_request_sock_ops, 3374 .h.hashinfo = NULL, 3375 .no_autobind = true, 3376 .diag_destroy = tcp_abort, 3377 }; 3378 EXPORT_SYMBOL(tcp_prot); 3379 3380 static void __net_exit tcp_sk_exit(struct net *net) 3381 { 3382 if (net->ipv4.tcp_congestion_control) 3383 bpf_module_put(net->ipv4.tcp_congestion_control, 3384 net->ipv4.tcp_congestion_control->owner); 3385 } 3386 3387 static void __net_init tcp_set_hashinfo(struct net *net) 3388 { 3389 struct inet_hashinfo *hinfo; 3390 unsigned int ehash_entries; 3391 struct net *old_net; 3392 3393 if (net_eq(net, &init_net)) 3394 goto fallback; 3395 3396 old_net = current->nsproxy->net_ns; 3397 ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries); 3398 if (!ehash_entries) 3399 goto fallback; 3400 3401 ehash_entries = roundup_pow_of_two(ehash_entries); 3402 hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries); 3403 if (!hinfo) { 3404 pr_warn("Failed to allocate TCP ehash (entries: %u) " 3405 "for a netns, fallback to the global one\n", 3406 ehash_entries); 3407 fallback: 3408 hinfo = &tcp_hashinfo; 3409 ehash_entries = tcp_hashinfo.ehash_mask + 1; 3410 } 3411 3412 net->ipv4.tcp_death_row.hashinfo = hinfo; 3413 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2; 3414 net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128); 3415 } 3416 3417 static int __net_init tcp_sk_init(struct net *net) 3418 { 3419 net->ipv4.sysctl_tcp_ecn = 2; 3420 net->ipv4.sysctl_tcp_ecn_fallback = 1; 3421 3422 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; 3423 
net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS; 3424 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; 3425 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL; 3426 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS; 3427 3428 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME; 3429 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES; 3430 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL; 3431 3432 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES; 3433 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES; 3434 net->ipv4.sysctl_tcp_syncookies = 1; 3435 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH; 3436 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1; 3437 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2; 3438 net->ipv4.sysctl_tcp_orphan_retries = 0; 3439 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; 3440 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; 3441 net->ipv4.sysctl_tcp_tw_reuse = 2; 3442 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1; 3443 3444 refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1); 3445 tcp_set_hashinfo(net); 3446 3447 net->ipv4.sysctl_tcp_sack = 1; 3448 net->ipv4.sysctl_tcp_window_scaling = 1; 3449 net->ipv4.sysctl_tcp_timestamps = 1; 3450 net->ipv4.sysctl_tcp_early_retrans = 3; 3451 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION; 3452 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */ 3453 net->ipv4.sysctl_tcp_retrans_collapse = 1; 3454 net->ipv4.sysctl_tcp_max_reordering = 300; 3455 net->ipv4.sysctl_tcp_dsack = 1; 3456 net->ipv4.sysctl_tcp_app_win = 31; 3457 net->ipv4.sysctl_tcp_adv_win_scale = 1; 3458 net->ipv4.sysctl_tcp_frto = 2; 3459 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1; 3460 /* This limits the percentage of the congestion window which we 3461 * will allow a single TSO frame to consume. Building TSO frames 3462 * which are too large can cause TCP streams to be bursty. 3463 */ 3464 net->ipv4.sysctl_tcp_tso_win_divisor = 3; 3465 /* Default TSQ limit of 16 TSO segments */ 3466 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536; 3467 3468 /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. 
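	 * (INT_MAX means the limit is effectively never hit; writing a
	 * smaller value to net.ipv4.tcp_challenge_ack_limit re-enables
	 * the per-netns rate limit.)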
*/ 3469 net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX; 3470 3471 net->ipv4.sysctl_tcp_min_tso_segs = 2; 3472 net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */ 3473 net->ipv4.sysctl_tcp_min_rtt_wlen = 300; 3474 net->ipv4.sysctl_tcp_autocorking = 1; 3475 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; 3476 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; 3477 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120; 3478 if (net != &init_net) { 3479 memcpy(net->ipv4.sysctl_tcp_rmem, 3480 init_net.ipv4.sysctl_tcp_rmem, 3481 sizeof(init_net.ipv4.sysctl_tcp_rmem)); 3482 memcpy(net->ipv4.sysctl_tcp_wmem, 3483 init_net.ipv4.sysctl_tcp_wmem, 3484 sizeof(init_net.ipv4.sysctl_tcp_wmem)); 3485 } 3486 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC; 3487 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC; 3488 net->ipv4.sysctl_tcp_comp_sack_nr = 44; 3489 net->ipv4.sysctl_tcp_backlog_ack_defer = 1; 3490 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; 3491 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0; 3492 atomic_set(&net->ipv4.tfo_active_disable_times, 0); 3493 3494 /* Set default values for PLB */ 3495 net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */ 3496 net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3; 3497 net->ipv4.sysctl_tcp_plb_rehash_rounds = 12; 3498 net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60; 3499 /* Default congestion threshold for PLB to mark a round is 50% */ 3500 net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2; 3501 3502 /* Reno is always built in */ 3503 if (!net_eq(net, &init_net) && 3504 bpf_try_module_get(init_net.ipv4.tcp_congestion_control, 3505 init_net.ipv4.tcp_congestion_control->owner)) 3506 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control; 3507 else 3508 net->ipv4.tcp_congestion_control = &tcp_reno; 3509 3510 net->ipv4.sysctl_tcp_syn_linear_timeouts = 4; 3511 net->ipv4.sysctl_tcp_shrink_window = 0; 3512 3513 net->ipv4.sysctl_tcp_pingpong_thresh = 1; 3514 3515 return 0; 3516 } 3517 3518 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) 3519 { 3520 struct net *net; 3521 3522 tcp_twsk_purge(net_exit_list); 3523 3524 list_for_each_entry(net, net_exit_list, exit_list) { 3525 inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo); 3526 WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount)); 3527 tcp_fastopen_ctx_destroy(net); 3528 } 3529 } 3530 3531 static struct pernet_operations __net_initdata tcp_sk_ops = { 3532 .init = tcp_sk_init, 3533 .exit = tcp_sk_exit, 3534 .exit_batch = tcp_sk_exit_batch, 3535 }; 3536 3537 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3538 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta, 3539 struct sock_common *sk_common, uid_t uid) 3540 3541 #define INIT_BATCH_SZ 16 3542 3543 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux) 3544 { 3545 struct bpf_tcp_iter_state *iter = priv_data; 3546 int err; 3547 3548 err = bpf_iter_init_seq_net(priv_data, aux); 3549 if (err) 3550 return err; 3551 3552 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ); 3553 if (err) { 3554 bpf_iter_fini_seq_net(priv_data); 3555 return err; 3556 } 3557 3558 return 0; 3559 } 3560 3561 static void bpf_iter_fini_tcp(void *priv_data) 3562 { 3563 struct bpf_tcp_iter_state *iter = priv_data; 3564 3565 bpf_iter_fini_seq_net(priv_data); 3566 kvfree(iter->batch); 3567 } 3568 3569 static const struct bpf_iter_seq_info tcp_seq_info = { 3570 .seq_ops = &bpf_iter_tcp_seq_ops, 3571 .init_seq_private = bpf_iter_init_tcp, 3572 
.fini_seq_private = bpf_iter_fini_tcp, 3573 .seq_priv_size = sizeof(struct bpf_tcp_iter_state), 3574 }; 3575 3576 static const struct bpf_func_proto * 3577 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id, 3578 const struct bpf_prog *prog) 3579 { 3580 switch (func_id) { 3581 case BPF_FUNC_setsockopt: 3582 return &bpf_sk_setsockopt_proto; 3583 case BPF_FUNC_getsockopt: 3584 return &bpf_sk_getsockopt_proto; 3585 default: 3586 return NULL; 3587 } 3588 } 3589 3590 static struct bpf_iter_reg tcp_reg_info = { 3591 .target = "tcp", 3592 .ctx_arg_info_size = 1, 3593 .ctx_arg_info = { 3594 { offsetof(struct bpf_iter__tcp, sk_common), 3595 PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED }, 3596 }, 3597 .get_func_proto = bpf_iter_tcp_get_func_proto, 3598 .seq_info = &tcp_seq_info, 3599 }; 3600 3601 static void __init bpf_iter_register(void) 3602 { 3603 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON]; 3604 if (bpf_iter_reg_target(&tcp_reg_info)) 3605 pr_warn("Warning: could not register bpf iterator tcp\n"); 3606 } 3607 3608 #endif 3609 3610 void __init tcp_v4_init(void) 3611 { 3612 int cpu, res; 3613 3614 for_each_possible_cpu(cpu) { 3615 struct sock *sk; 3616 3617 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW, 3618 IPPROTO_TCP, &init_net); 3619 if (res) 3620 panic("Failed to create the TCP control socket.\n"); 3621 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); 3622 3623 /* Please enforce IP_DF and IPID==0 for RST and 3624 * ACK sent in SYN-RECV and TIME-WAIT state. 3625 */ 3626 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO; 3627 3628 per_cpu(ipv4_tcp_sk, cpu) = sk; 3629 } 3630 if (register_pernet_subsys(&tcp_sk_ops)) 3631 panic("Failed to create the TCP control socket.\n"); 3632 3633 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3634 bpf_iter_register(); 3635 #endif 3636 } 3637
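
/*
 * Purely illustrative userspace sketch (kept under "#if 0" so it is not part
 * of the kernel build): it parses the /proc/net/tcp rows produced by
 * tcp4_seq_show()/get_tcp4_sock() above.  The field layout is assumed from
 * the seq_printf() format strings in this file; addresses and ports are
 * printed in hexadecimal in the byte order stored in the socket, which is
 * why the parsed value can be fed straight back into struct in_addr on the
 * same host.
 */
#if 0
#include <arpa/inet.h>
#include <stdio.h>

int main(void)
{
	unsigned int laddr, lport, raddr, rport, state;
	struct in_addr local;
	char line[512];
	FILE *f;

	f = fopen("/proc/net/tcp", "r");
	if (!f)
		return 1;

	/* Skip the " sl local_address rem_address st ..." header line. */
	if (!fgets(line, sizeof(line), f)) {
		fclose(f);
		return 1;
	}

	while (fgets(line, sizeof(line), f)) {
		if (sscanf(line, "%*d: %8X:%4X %8X:%4X %2X",
			   &laddr, &lport, &raddr, &rport, &state) != 5)
			continue;

		local.s_addr = laddr;
		printf("%s:%u state %#x\n", inet_ntoa(local), lport, state);
	}
	fclose(f);
	return 0;
}
#endif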