// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/sched.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/inetdevice.h>
#include <linux/btf_ids.h>

#include <crypto/hash.h>
#include <linux/scatterlist.h>

#include <trace/events/tcp.h>

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);

static u32 tcp_v4_init_seq(const struct sk_buff *skb)
{
	return secure_tcp_seq(ip_hdr(skb)->daddr,
			      ip_hdr(skb)->saddr,
			      tcp_hdr(skb)->dest,
			      tcp_hdr(skb)->source);
}

static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
{
	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
}

int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
	const struct inet_timewait_sock *tw = inet_twsk(sktw);
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);

	if (reuse == 2) {
		/* Still does not detect *everything* that goes through
		 * lo, since we require a loopback src or dst address
		 * or direct binding to 'lo' interface.
		 */
		bool loopback = false;
		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
			loopback = true;
#if IS_ENABLED(CONFIG_IPV6)
		if (tw->tw_family == AF_INET6) {
			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
				loopback = true;
		} else
#endif
		{
			if (ipv4_is_loopback(tw->tw_daddr) ||
			    ipv4_is_loopback(tw->tw_rcv_saddr))
				loopback = true;
		}
		if (!loopback)
			reuse = 0;
	}

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (!twp || (reuse && time_after32(ktime_get_seconds(),
					    tcptw->tw_ts_recent_stamp)))) {
		/* In case of repair and re-using TIME-WAIT sockets we still
		 * want to be sure that it is safe as above but honor the
		 * sequence numbers and time stamps set as part of the repair
		 * process.
		 *
		 * Without this check re-using a TIME-WAIT socket with TCP
		 * repair would accumulate a -1 on the repair assigned
		 * sequence number. The first time it is reused the sequence
		 * is -1, the second time -2, etc. This fixes that issue
		 * without appearing to create any others.
		 */
		if (likely(!tp->repair)) {
			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;

			if (!seq)
				seq = 1;
			WRITE_ONCE(tp->write_seq, seq);
			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		}
		sock_hold(sktw);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);

static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
			      int addr_len)
{
	/* This check is replicated from tcp_v4_connect() and intended to
	 * prevent BPF program called below from accessing bytes that are out
	 * of the bound specified by user in addr_len.
	 */
	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	sock_owned_by_me(sk);

	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
}
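
/* Editorial aside: a minimal sketch of the userspace side of this path,
 * using only the standard POSIX sockets API (the snippet is illustrative
 * and not part of this file; the peer 192.0.2.10:8080 is a made-up
 * example address).  tcp_v4_pre_connect() and tcp_v4_connect() below
 * validate exactly what such a caller passes to connect(2): addr_len
 * must cover a struct sockaddr_in and sin_family must be AF_INET.
 *
 *	#include <arpa/inet.h>
 *	#include <sys/socket.h>
 *	#include <unistd.h>
 *
 *	int connect_example(void)
 *	{
 *		struct sockaddr_in dst = { .sin_family = AF_INET,
 *					   .sin_port   = htons(8080) };
 *		int fd = socket(AF_INET, SOCK_STREAM, 0);
 *
 *		if (fd < 0)
 *			return -1;
 *		inet_pton(AF_INET, "192.0.2.10", &dst.sin_addr);
 *		// Kernel side: tcp_v4_pre_connect() runs the cgroup BPF hook,
 *		// then tcp_v4_connect() routes, picks a source port and sends
 *		// the SYN via tcp_connect().
 *		if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0) {
 *			close(fd);
 *			return -1;
 *		}
 *		return fd;
 *	}
 */
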
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_timewait_death_row *tcp_death_row;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct ip_options_rcu *inet_opt;
	struct net *net = sock_net(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
			      orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

	if (!inet->inet_saddr) {
		err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET);
		if (err) {
			ip_rt_put(rt);
			return err;
		}
	} else {
		sk_rcv_saddr_set(sk, inet->inet_saddr);
	}

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			WRITE_ONCE(tp->write_seq, 0);
	}

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the socket
	 * lock, select a source port, enter ourselves into the hash tables
	 * and complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst);
	/* OK, now commit destination to socket. */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	rt = NULL;

	if (likely(!tp->repair)) {
		if (!tp->write_seq)
			WRITE_ONCE(tp->write_seq,
				   secure_tcp_seq(inet->inet_saddr,
						  inet->inet_daddr,
						  inet->inet_sport,
						  usin->sin_port));
		WRITE_ONCE(tp->tsoffset,
			   secure_tcp_ts_off(net, inet->inet_saddr,
					     inet->inet_daddr));
	}

	atomic_set(&inet->inet_id, get_random_u16());

	if (tcp_fastopen_defer_connect(sk, &err))
		return err;
	if (err)
		goto failure;

	err = tcp_connect(sk);

	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	inet_bhash2_reset_saddr(sk);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);

/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct dst_entry *dst;
	u32 mtu;

	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
		return;
	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to go wrong... Remember the soft error
	 * in case this connection is not able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);

static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}
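
/* Editorial aside: the per-socket policy that tcp_v4_mtu_reduced() honours
 * (inet->pmtudisc, ip_sk_accept_pmtu()) is what userspace configures with
 * the IP_MTU_DISCOVER socket option, and the path MTU learned here can be
 * read back with getsockopt(IP_MTU) on a connected socket.  A minimal
 * sketch, illustrative only and assuming the Linux-specific UAPI exposed
 * via <netinet/in.h>:
 *
 *	#include <netinet/in.h>
 *	#include <sys/socket.h>
 *
 *	static int read_path_mtu(int fd)
 *	{
 *		int val = IP_PMTUDISC_DO;	// set DF, honor ICMP frag-needed
 *		int mtu = 0;
 *		socklen_t len = sizeof(mtu);
 *
 *		if (setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER,
 *			       &val, sizeof(val)) < 0)
 *			return -1;
 *		// Only meaningful once the socket is connected.
 *		if (getsockopt(fd, IPPROTO_IP, IP_MTU, &mtu, &len) < 0)
 *			return -1;
 *		return mtu;
 *	}
 */
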
/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);

	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	 */
	if (seq != tcp_rsk(req)->snt_isn) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
	} else if (abort) {
		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		tcp_listendrop(req->rsk_listener);
	}
	reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);

/* TCP-LD (RFC 6069) logic */
void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	s32 remaining;
	u32 delta_us;

	if (sock_owned_by_user(sk))
		return;

	if (seq != tp->snd_una || !icsk->icsk_retransmits ||
	    !icsk->icsk_backoff)
		return;

	skb = tcp_rtx_queue_head(sk);
	if (WARN_ON_ONCE(!skb))
		return;

	icsk->icsk_backoff--;
	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

	tcp_mstamp_refresh(tp);
	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);

	if (remaining > 0) {
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
					  remaining, TCP_RTO_MAX);
	} else {
		/* RTO revert clocked out retransmission.
		 * Will retransmit now.
		 */
		tcp_retransmit_timer(sk);
	}
}
EXPORT_SYMBOL(tcp_ld_RTO_revert);

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

int tcp_v4_err(struct sk_buff *skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct tcp_sock *tp;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct sock *sk;
	struct request_sock *fastopen;
	bool harderr = false;
	u32 seq, snd_una;
	int err;
	struct net *net = dev_net(skb->dev);

	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
				       iph->daddr, th->dest, iph->saddr,
				       ntohs(th->source), inet_iif(skb), 0);
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return -ENOENT;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		/* To increase the counter of ignored icmps for TCP-AO */
		tcp_ao_ignore_icmp(sk, AF_INET, type, code);
		inet_twsk_put(inet_twsk(sk));
		return 0;
	}
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
				     type == ICMP_TIME_EXCEEDED ||
				     (type == ICMP_DEST_UNREACH &&
				      (code == ICMP_NET_UNREACH ||
				       code == ICMP_HOST_UNREACH)));
		return 0;
	}

	if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) {
		sock_put(sk);
		return 0;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
523 */ 524 if (sock_owned_by_user(sk)) { 525 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)) 526 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS); 527 } 528 if (sk->sk_state == TCP_CLOSE) 529 goto out; 530 531 if (static_branch_unlikely(&ip4_min_ttl)) { 532 /* min_ttl can be changed concurrently from do_ip_setsockopt() */ 533 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) { 534 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 535 goto out; 536 } 537 } 538 539 tp = tcp_sk(sk); 540 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ 541 fastopen = rcu_dereference(tp->fastopen_rsk); 542 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una; 543 if (sk->sk_state != TCP_LISTEN && 544 !between(seq, snd_una, tp->snd_nxt)) { 545 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); 546 goto out; 547 } 548 549 switch (type) { 550 case ICMP_REDIRECT: 551 if (!sock_owned_by_user(sk)) 552 do_redirect(skb, sk); 553 goto out; 554 case ICMP_SOURCE_QUENCH: 555 /* Just silently ignore these. */ 556 goto out; 557 case ICMP_PARAMETERPROB: 558 err = EPROTO; 559 harderr = true; 560 break; 561 case ICMP_DEST_UNREACH: 562 if (code > NR_ICMP_UNREACH) 563 goto out; 564 565 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ 566 /* We are not interested in TCP_LISTEN and open_requests 567 * (SYN-ACKs send out by Linux are always <576bytes so 568 * they should go through unfragmented). 569 */ 570 if (sk->sk_state == TCP_LISTEN) 571 goto out; 572 573 WRITE_ONCE(tp->mtu_info, info); 574 if (!sock_owned_by_user(sk)) { 575 tcp_v4_mtu_reduced(sk); 576 } else { 577 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags)) 578 sock_hold(sk); 579 } 580 goto out; 581 } 582 583 err = icmp_err_convert[code].errno; 584 harderr = icmp_err_convert[code].fatal; 585 /* check if this ICMP message allows revert of backoff. 586 * (see RFC 6069) 587 */ 588 if (!fastopen && 589 (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH)) 590 tcp_ld_RTO_revert(sk, seq); 591 break; 592 case ICMP_TIME_EXCEEDED: 593 err = EHOSTUNREACH; 594 break; 595 default: 596 goto out; 597 } 598 599 switch (sk->sk_state) { 600 case TCP_SYN_SENT: 601 case TCP_SYN_RECV: 602 /* Only in fast or simultaneous open. If a fast open socket is 603 * already accepted it is treated as a connected one below. 604 */ 605 if (fastopen && !fastopen->sk) 606 break; 607 608 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th); 609 610 if (!harderr) 611 break; 612 613 if (!sock_owned_by_user(sk)) { 614 WRITE_ONCE(sk->sk_err, err); 615 616 sk_error_report(sk); 617 618 tcp_done(sk); 619 } else { 620 WRITE_ONCE(sk->sk_err_soft, err); 621 } 622 goto out; 623 } 624 625 /* If we've already connected we will keep trying 626 * until we time out, or the user gives up. 627 * 628 * rfc1122 4.2.3.9 allows to consider as hard errors 629 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too, 630 * but it is obsoleted by pmtu discovery). 631 * 632 * Note, that in modern internet, where routing is unreliable 633 * and in each dark corner broken firewalls sit, sending random 634 * errors ordered by their masters even this two messages finally lose 635 * their original sense (even Linux sends invalid PORT_UNREACHs) 636 * 637 * Now we are in compliance with RFCs. 
638 * --ANK (980905) 639 */ 640 641 if (!sock_owned_by_user(sk) && 642 inet_test_bit(RECVERR, sk)) { 643 WRITE_ONCE(sk->sk_err, err); 644 sk_error_report(sk); 645 } else { /* Only an error on timeout */ 646 WRITE_ONCE(sk->sk_err_soft, err); 647 } 648 649 out: 650 bh_unlock_sock(sk); 651 sock_put(sk); 652 return 0; 653 } 654 655 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr) 656 { 657 struct tcphdr *th = tcp_hdr(skb); 658 659 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0); 660 skb->csum_start = skb_transport_header(skb) - skb->head; 661 skb->csum_offset = offsetof(struct tcphdr, check); 662 } 663 664 /* This routine computes an IPv4 TCP checksum. */ 665 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb) 666 { 667 const struct inet_sock *inet = inet_sk(sk); 668 669 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr); 670 } 671 EXPORT_SYMBOL(tcp_v4_send_check); 672 673 #define REPLY_OPTIONS_LEN (MAX_TCP_OPTION_SPACE / sizeof(__be32)) 674 675 static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb, 676 const struct tcp_ao_hdr *aoh, 677 struct ip_reply_arg *arg, struct tcphdr *reply, 678 __be32 reply_options[REPLY_OPTIONS_LEN]) 679 { 680 #ifdef CONFIG_TCP_AO 681 int sdif = tcp_v4_sdif(skb); 682 int dif = inet_iif(skb); 683 int l3index = sdif ? dif : 0; 684 bool allocated_traffic_key; 685 struct tcp_ao_key *key; 686 char *traffic_key; 687 bool drop = true; 688 u32 ao_sne = 0; 689 u8 keyid; 690 691 rcu_read_lock(); 692 if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq), 693 &key, &traffic_key, &allocated_traffic_key, 694 &keyid, &ao_sne)) 695 goto out; 696 697 reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) | 698 (aoh->rnext_keyid << 8) | keyid); 699 arg->iov[0].iov_len += tcp_ao_len_aligned(key); 700 reply->doff = arg->iov[0].iov_len / 4; 701 702 if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1], 703 key, traffic_key, 704 (union tcp_ao_addr *)&ip_hdr(skb)->saddr, 705 (union tcp_ao_addr *)&ip_hdr(skb)->daddr, 706 reply, ao_sne)) 707 goto out; 708 drop = false; 709 out: 710 rcu_read_unlock(); 711 if (allocated_traffic_key) 712 kfree(traffic_key); 713 return drop; 714 #else 715 return true; 716 #endif 717 } 718 719 /* 720 * This routine will send an RST to the other tcp. 721 * 722 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.) 723 * for reset. 724 * Answer: if a packet caused RST, it is not for a socket 725 * existing in our system, if it is matched to a socket, 726 * it is just duplicate segment or bug in other side's TCP. 727 * So that we build reply only basing on parameters 728 * arrived with segment. 729 * Exception: precedence violation. We do not implement it in any case. 730 */ 731 732 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) 733 { 734 const struct tcphdr *th = tcp_hdr(skb); 735 struct { 736 struct tcphdr th; 737 __be32 opt[REPLY_OPTIONS_LEN]; 738 } rep; 739 const __u8 *md5_hash_location = NULL; 740 const struct tcp_ao_hdr *aoh; 741 struct ip_reply_arg arg; 742 #ifdef CONFIG_TCP_MD5SIG 743 struct tcp_md5sig_key *key = NULL; 744 unsigned char newhash[16]; 745 struct sock *sk1 = NULL; 746 int genhash; 747 #endif 748 u64 transmit_time = 0; 749 struct sock *ctl_sk; 750 struct net *net; 751 u32 txhash = 0; 752 753 /* Never send a reset in response to a reset. */ 754 if (th->rst) 755 return; 756 757 /* If sk not NULL, it means we did a successful lookup and incoming 758 * route had to be correct. prequeue might have dropped our dst. 
759 */ 760 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL) 761 return; 762 763 /* Swap the send and the receive. */ 764 memset(&rep, 0, sizeof(rep)); 765 rep.th.dest = th->source; 766 rep.th.source = th->dest; 767 rep.th.doff = sizeof(struct tcphdr) / 4; 768 rep.th.rst = 1; 769 770 if (th->ack) { 771 rep.th.seq = th->ack_seq; 772 } else { 773 rep.th.ack = 1; 774 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin + 775 skb->len - (th->doff << 2)); 776 } 777 778 memset(&arg, 0, sizeof(arg)); 779 arg.iov[0].iov_base = (unsigned char *)&rep; 780 arg.iov[0].iov_len = sizeof(rep.th); 781 782 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev); 783 784 /* Invalid TCP option size or twice included auth */ 785 if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh)) 786 return; 787 788 if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt)) 789 return; 790 791 #ifdef CONFIG_TCP_MD5SIG 792 rcu_read_lock(); 793 if (sk && sk_fullsock(sk)) { 794 const union tcp_md5_addr *addr; 795 int l3index; 796 797 /* sdif set, means packet ingressed via a device 798 * in an L3 domain and inet_iif is set to it. 799 */ 800 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; 801 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 802 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 803 } else if (md5_hash_location) { 804 const union tcp_md5_addr *addr; 805 int sdif = tcp_v4_sdif(skb); 806 int dif = inet_iif(skb); 807 int l3index; 808 809 /* 810 * active side is lost. Try to find listening socket through 811 * source port, and then find md5 key through listening socket. 812 * we are not loose security here: 813 * Incoming packet is checked with md5 hash with finding key, 814 * no RST generated if md5 hash doesn't match. 815 */ 816 sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo, 817 NULL, 0, ip_hdr(skb)->saddr, 818 th->source, ip_hdr(skb)->daddr, 819 ntohs(th->source), dif, sdif); 820 /* don't send rst if it can't find key */ 821 if (!sk1) 822 goto out; 823 824 /* sdif set, means packet ingressed via a device 825 * in an L3 domain and dif is set to it. 826 */ 827 l3index = sdif ? dif : 0; 828 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 829 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET); 830 if (!key) 831 goto out; 832 833 834 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb); 835 if (genhash || memcmp(md5_hash_location, newhash, 16) != 0) 836 goto out; 837 838 } 839 840 if (key) { 841 rep.opt[0] = htonl((TCPOPT_NOP << 24) | 842 (TCPOPT_NOP << 16) | 843 (TCPOPT_MD5SIG << 8) | 844 TCPOLEN_MD5SIG); 845 /* Update length and the length the header thinks exists */ 846 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; 847 rep.th.doff = arg.iov[0].iov_len / 4; 848 849 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1], 850 key, ip_hdr(skb)->saddr, 851 ip_hdr(skb)->daddr, &rep.th); 852 } 853 #endif 854 /* Can't co-exist with TCPMD5, hence check rep.opt[0] */ 855 if (rep.opt[0] == 0) { 856 __be32 mrst = mptcp_reset_option(skb); 857 858 if (mrst) { 859 rep.opt[0] = mrst; 860 arg.iov[0].iov_len += sizeof(mrst); 861 rep.th.doff = arg.iov[0].iov_len / 4; 862 } 863 } 864 865 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 866 ip_hdr(skb)->saddr, /* XXX */ 867 arg.iov[0].iov_len, IPPROTO_TCP, 0); 868 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 869 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0; 870 871 /* When socket is gone, all binding information is lost. 872 * routing might fail in this case. 
No choice here, if we choose to force 873 * input interface, we will misroute in case of asymmetric route. 874 */ 875 if (sk) { 876 arg.bound_dev_if = sk->sk_bound_dev_if; 877 if (sk_fullsock(sk)) 878 trace_tcp_send_reset(sk, skb); 879 } 880 881 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) != 882 offsetof(struct inet_timewait_sock, tw_bound_dev_if)); 883 884 arg.tos = ip_hdr(skb)->tos; 885 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); 886 local_bh_disable(); 887 ctl_sk = this_cpu_read(ipv4_tcp_sk); 888 sock_net_set(ctl_sk, net); 889 if (sk) { 890 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 891 inet_twsk(sk)->tw_mark : sk->sk_mark; 892 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? 893 inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority); 894 transmit_time = tcp_transmit_time(sk); 895 xfrm_sk_clone_policy(ctl_sk, sk); 896 txhash = (sk->sk_state == TCP_TIME_WAIT) ? 897 inet_twsk(sk)->tw_txhash : sk->sk_txhash; 898 } else { 899 ctl_sk->sk_mark = 0; 900 ctl_sk->sk_priority = 0; 901 } 902 ip_send_unicast_reply(ctl_sk, 903 skb, &TCP_SKB_CB(skb)->header.h4.opt, 904 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 905 &arg, arg.iov[0].iov_len, 906 transmit_time, txhash); 907 908 xfrm_sk_free_policy(ctl_sk); 909 sock_net_set(ctl_sk, &init_net); 910 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); 911 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS); 912 local_bh_enable(); 913 914 #ifdef CONFIG_TCP_MD5SIG 915 out: 916 rcu_read_unlock(); 917 #endif 918 } 919 920 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states 921 outside socket context is ugly, certainly. What can I do? 922 */ 923 924 static void tcp_v4_send_ack(const struct sock *sk, 925 struct sk_buff *skb, u32 seq, u32 ack, 926 u32 win, u32 tsval, u32 tsecr, int oif, 927 struct tcp_key *key, 928 int reply_flags, u8 tos, u32 txhash) 929 { 930 const struct tcphdr *th = tcp_hdr(skb); 931 struct { 932 struct tcphdr th; 933 __be32 opt[(MAX_TCP_OPTION_SPACE >> 2)]; 934 } rep; 935 struct net *net = sock_net(sk); 936 struct ip_reply_arg arg; 937 struct sock *ctl_sk; 938 u64 transmit_time; 939 940 memset(&rep.th, 0, sizeof(struct tcphdr)); 941 memset(&arg, 0, sizeof(arg)); 942 943 arg.iov[0].iov_base = (unsigned char *)&rep; 944 arg.iov[0].iov_len = sizeof(rep.th); 945 if (tsecr) { 946 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | 947 (TCPOPT_TIMESTAMP << 8) | 948 TCPOLEN_TIMESTAMP); 949 rep.opt[1] = htonl(tsval); 950 rep.opt[2] = htonl(tsecr); 951 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED; 952 } 953 954 /* Swap the send and the receive. */ 955 rep.th.dest = th->source; 956 rep.th.source = th->dest; 957 rep.th.doff = arg.iov[0].iov_len / 4; 958 rep.th.seq = htonl(seq); 959 rep.th.ack_seq = htonl(ack); 960 rep.th.ack = 1; 961 rep.th.window = htons(win); 962 963 #ifdef CONFIG_TCP_MD5SIG 964 if (tcp_key_is_md5(key)) { 965 int offset = (tsecr) ? 3 : 0; 966 967 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) | 968 (TCPOPT_NOP << 16) | 969 (TCPOPT_MD5SIG << 8) | 970 TCPOLEN_MD5SIG); 971 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; 972 rep.th.doff = arg.iov[0].iov_len/4; 973 974 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset], 975 key->md5_key, ip_hdr(skb)->saddr, 976 ip_hdr(skb)->daddr, &rep.th); 977 } 978 #endif 979 #ifdef CONFIG_TCP_AO 980 if (tcp_key_is_ao(key)) { 981 int offset = (tsecr) ? 
3 : 0; 982 983 rep.opt[offset++] = htonl((TCPOPT_AO << 24) | 984 (tcp_ao_len(key->ao_key) << 16) | 985 (key->ao_key->sndid << 8) | 986 key->rcv_next); 987 arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key); 988 rep.th.doff = arg.iov[0].iov_len / 4; 989 990 tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset], 991 key->ao_key, key->traffic_key, 992 (union tcp_ao_addr *)&ip_hdr(skb)->saddr, 993 (union tcp_ao_addr *)&ip_hdr(skb)->daddr, 994 &rep.th, key->sne); 995 } 996 #endif 997 arg.flags = reply_flags; 998 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 999 ip_hdr(skb)->saddr, /* XXX */ 1000 arg.iov[0].iov_len, IPPROTO_TCP, 0); 1001 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 1002 if (oif) 1003 arg.bound_dev_if = oif; 1004 arg.tos = tos; 1005 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL); 1006 local_bh_disable(); 1007 ctl_sk = this_cpu_read(ipv4_tcp_sk); 1008 sock_net_set(ctl_sk, net); 1009 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 1010 inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark); 1011 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? 1012 inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority); 1013 transmit_time = tcp_transmit_time(sk); 1014 ip_send_unicast_reply(ctl_sk, 1015 skb, &TCP_SKB_CB(skb)->header.h4.opt, 1016 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 1017 &arg, arg.iov[0].iov_len, 1018 transmit_time, txhash); 1019 1020 sock_net_set(ctl_sk, &init_net); 1021 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); 1022 local_bh_enable(); 1023 } 1024 1025 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) 1026 { 1027 struct inet_timewait_sock *tw = inet_twsk(sk); 1028 struct tcp_timewait_sock *tcptw = tcp_twsk(sk); 1029 struct tcp_key key = {}; 1030 #ifdef CONFIG_TCP_AO 1031 struct tcp_ao_info *ao_info; 1032 1033 if (static_branch_unlikely(&tcp_ao_needed.key)) { 1034 /* FIXME: the segment to-be-acked is not verified yet */ 1035 ao_info = rcu_dereference(tcptw->ao_info); 1036 if (ao_info) { 1037 const struct tcp_ao_hdr *aoh; 1038 1039 if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) { 1040 inet_twsk_put(tw); 1041 return; 1042 } 1043 1044 if (aoh) 1045 key.ao_key = tcp_ao_established_key(ao_info, aoh->rnext_keyid, -1); 1046 } 1047 } 1048 if (key.ao_key) { 1049 struct tcp_ao_key *rnext_key; 1050 1051 key.traffic_key = snd_other_key(key.ao_key); 1052 key.sne = READ_ONCE(ao_info->snd_sne); 1053 rnext_key = READ_ONCE(ao_info->rnext_key); 1054 key.rcv_next = rnext_key->rcvid; 1055 key.type = TCP_KEY_AO; 1056 #else 1057 if (0) { 1058 #endif 1059 #ifdef CONFIG_TCP_MD5SIG 1060 } else if (static_branch_unlikely(&tcp_md5_needed.key)) { 1061 key.md5_key = tcp_twsk_md5_key(tcptw); 1062 if (key.md5_key) 1063 key.type = TCP_KEY_MD5; 1064 #endif 1065 } 1066 1067 tcp_v4_send_ack(sk, skb, 1068 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, 1069 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, 1070 tcp_tw_tsval(tcptw), 1071 tcptw->tw_ts_recent, 1072 tw->tw_bound_dev_if, &key, 1073 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0, 1074 tw->tw_tos, 1075 tw->tw_txhash); 1076 1077 inet_twsk_put(tw); 1078 } 1079 1080 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, 1081 struct request_sock *req) 1082 { 1083 struct tcp_key key = {}; 1084 1085 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV 1086 * sk->sk_state == TCP_SYN_RECV -> for Fast Open. 1087 */ 1088 u32 seq = (sk->sk_state == TCP_LISTEN) ? 
tcp_rsk(req)->snt_isn + 1 : 1089 tcp_sk(sk)->snd_nxt; 1090 1091 #ifdef CONFIG_TCP_AO 1092 if (static_branch_unlikely(&tcp_ao_needed.key) && 1093 tcp_rsk_used_ao(req)) { 1094 const union tcp_md5_addr *addr; 1095 const struct tcp_ao_hdr *aoh; 1096 int l3index; 1097 1098 /* Invalid TCP option size or twice included auth */ 1099 if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) 1100 return; 1101 if (!aoh) 1102 return; 1103 1104 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 1105 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; 1106 key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, 1107 aoh->rnext_keyid, -1); 1108 if (unlikely(!key.ao_key)) { 1109 /* Send ACK with any matching MKT for the peer */ 1110 key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1); 1111 /* Matching key disappeared (user removed the key?) 1112 * let the handshake timeout. 1113 */ 1114 if (!key.ao_key) { 1115 net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n", 1116 addr, 1117 ntohs(tcp_hdr(skb)->source), 1118 &ip_hdr(skb)->daddr, 1119 ntohs(tcp_hdr(skb)->dest)); 1120 return; 1121 } 1122 } 1123 key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC); 1124 if (!key.traffic_key) 1125 return; 1126 1127 key.type = TCP_KEY_AO; 1128 key.rcv_next = aoh->keyid; 1129 tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req); 1130 #else 1131 if (0) { 1132 #endif 1133 #ifdef CONFIG_TCP_MD5SIG 1134 } else if (static_branch_unlikely(&tcp_md5_needed.key)) { 1135 const union tcp_md5_addr *addr; 1136 int l3index; 1137 1138 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 1139 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; 1140 key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1141 if (key.md5_key) 1142 key.type = TCP_KEY_MD5; 1143 #endif 1144 } 1145 1146 /* RFC 7323 2.3 1147 * The window field (SEG.WND) of every outgoing segment, with the 1148 * exception of <SYN> segments, MUST be right-shifted by 1149 * Rcv.Wind.Shift bits: 1150 */ 1151 tcp_v4_send_ack(sk, skb, seq, 1152 tcp_rsk(req)->rcv_nxt, 1153 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, 1154 tcp_rsk_tsval(tcp_rsk(req)), 1155 READ_ONCE(req->ts_recent), 1156 0, &key, 1157 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0, 1158 ip_hdr(skb)->tos, 1159 READ_ONCE(tcp_rsk(req)->txhash)); 1160 if (tcp_key_is_ao(&key)) 1161 kfree(key.traffic_key); 1162 } 1163 1164 /* 1165 * Send a SYN-ACK after having received a SYN. 1166 * This still operates on a request_sock only, not on a big 1167 * socket. 1168 */ 1169 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, 1170 struct flowi *fl, 1171 struct request_sock *req, 1172 struct tcp_fastopen_cookie *foc, 1173 enum tcp_synack_type synack_type, 1174 struct sk_buff *syn_skb) 1175 { 1176 const struct inet_request_sock *ireq = inet_rsk(req); 1177 struct flowi4 fl4; 1178 int err = -1; 1179 struct sk_buff *skb; 1180 u8 tos; 1181 1182 /* First, grab a route. 
*/ 1183 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) 1184 return -1; 1185 1186 skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb); 1187 1188 if (skb) { 1189 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); 1190 1191 tos = READ_ONCE(inet_sk(sk)->tos); 1192 1193 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)) 1194 tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) | 1195 (tos & INET_ECN_MASK); 1196 1197 if (!INET_ECN_is_capable(tos) && 1198 tcp_bpf_ca_needs_ecn((struct sock *)req)) 1199 tos |= INET_ECN_ECT_0; 1200 1201 rcu_read_lock(); 1202 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, 1203 ireq->ir_rmt_addr, 1204 rcu_dereference(ireq->ireq_opt), 1205 tos); 1206 rcu_read_unlock(); 1207 err = net_xmit_eval(err); 1208 } 1209 1210 return err; 1211 } 1212 1213 /* 1214 * IPv4 request_sock destructor. 1215 */ 1216 static void tcp_v4_reqsk_destructor(struct request_sock *req) 1217 { 1218 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1)); 1219 } 1220 1221 #ifdef CONFIG_TCP_MD5SIG 1222 /* 1223 * RFC2385 MD5 checksumming requires a mapping of 1224 * IP address->MD5 Key. 1225 * We need to maintain these in the sk structure. 1226 */ 1227 1228 DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ); 1229 EXPORT_SYMBOL(tcp_md5_needed); 1230 1231 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new) 1232 { 1233 if (!old) 1234 return true; 1235 1236 /* l3index always overrides non-l3index */ 1237 if (old->l3index && new->l3index == 0) 1238 return false; 1239 if (old->l3index == 0 && new->l3index) 1240 return true; 1241 1242 return old->prefixlen < new->prefixlen; 1243 } 1244 1245 /* Find the Key structure for an address. */ 1246 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index, 1247 const union tcp_md5_addr *addr, 1248 int family, bool any_l3index) 1249 { 1250 const struct tcp_sock *tp = tcp_sk(sk); 1251 struct tcp_md5sig_key *key; 1252 const struct tcp_md5sig_info *md5sig; 1253 __be32 mask; 1254 struct tcp_md5sig_key *best_match = NULL; 1255 bool match; 1256 1257 /* caller either holds rcu_read_lock() or socket lock */ 1258 md5sig = rcu_dereference_check(tp->md5sig_info, 1259 lockdep_sock_is_held(sk)); 1260 if (!md5sig) 1261 return NULL; 1262 1263 hlist_for_each_entry_rcu(key, &md5sig->head, node, 1264 lockdep_sock_is_held(sk)) { 1265 if (key->family != family) 1266 continue; 1267 if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX && 1268 key->l3index != l3index) 1269 continue; 1270 if (family == AF_INET) { 1271 mask = inet_make_mask(key->prefixlen); 1272 match = (key->addr.a4.s_addr & mask) == 1273 (addr->a4.s_addr & mask); 1274 #if IS_ENABLED(CONFIG_IPV6) 1275 } else if (family == AF_INET6) { 1276 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6, 1277 key->prefixlen); 1278 #endif 1279 } else { 1280 match = false; 1281 } 1282 1283 if (match && better_md5_match(best_match, key)) 1284 best_match = key; 1285 } 1286 return best_match; 1287 } 1288 EXPORT_SYMBOL(__tcp_md5_do_lookup); 1289 1290 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk, 1291 const union tcp_md5_addr *addr, 1292 int family, u8 prefixlen, 1293 int l3index, u8 flags) 1294 { 1295 const struct tcp_sock *tp = tcp_sk(sk); 1296 struct tcp_md5sig_key *key; 1297 unsigned int size = sizeof(struct in_addr); 1298 const struct tcp_md5sig_info *md5sig; 1299 1300 /* caller either holds rcu_read_lock() or socket lock */ 1301 md5sig = rcu_dereference_check(tp->md5sig_info, 1302 
lockdep_sock_is_held(sk)); 1303 if (!md5sig) 1304 return NULL; 1305 #if IS_ENABLED(CONFIG_IPV6) 1306 if (family == AF_INET6) 1307 size = sizeof(struct in6_addr); 1308 #endif 1309 hlist_for_each_entry_rcu(key, &md5sig->head, node, 1310 lockdep_sock_is_held(sk)) { 1311 if (key->family != family) 1312 continue; 1313 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX)) 1314 continue; 1315 if (key->l3index != l3index) 1316 continue; 1317 if (!memcmp(&key->addr, addr, size) && 1318 key->prefixlen == prefixlen) 1319 return key; 1320 } 1321 return NULL; 1322 } 1323 1324 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, 1325 const struct sock *addr_sk) 1326 { 1327 const union tcp_md5_addr *addr; 1328 int l3index; 1329 1330 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), 1331 addr_sk->sk_bound_dev_if); 1332 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr; 1333 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1334 } 1335 EXPORT_SYMBOL(tcp_v4_md5_lookup); 1336 1337 static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp) 1338 { 1339 struct tcp_sock *tp = tcp_sk(sk); 1340 struct tcp_md5sig_info *md5sig; 1341 1342 md5sig = kmalloc(sizeof(*md5sig), gfp); 1343 if (!md5sig) 1344 return -ENOMEM; 1345 1346 sk_gso_disable(sk); 1347 INIT_HLIST_HEAD(&md5sig->head); 1348 rcu_assign_pointer(tp->md5sig_info, md5sig); 1349 return 0; 1350 } 1351 1352 /* This can be called on a newly created socket, from other files */ 1353 static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, 1354 int family, u8 prefixlen, int l3index, u8 flags, 1355 const u8 *newkey, u8 newkeylen, gfp_t gfp) 1356 { 1357 /* Add Key to the list */ 1358 struct tcp_md5sig_key *key; 1359 struct tcp_sock *tp = tcp_sk(sk); 1360 struct tcp_md5sig_info *md5sig; 1361 1362 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags); 1363 if (key) { 1364 /* Pre-existing entry - just update that one. 1365 * Note that the key might be used concurrently. 1366 * data_race() is telling kcsan that we do not care of 1367 * key mismatches, since changing MD5 key on live flows 1368 * can lead to packet drops. 1369 */ 1370 data_race(memcpy(key->key, newkey, newkeylen)); 1371 1372 /* Pairs with READ_ONCE() in tcp_md5_hash_key(). 1373 * Also note that a reader could catch new key->keylen value 1374 * but old key->key[], this is the reason we use __GFP_ZERO 1375 * at sock_kmalloc() time below these lines. 1376 */ 1377 WRITE_ONCE(key->keylen, newkeylen); 1378 1379 return 0; 1380 } 1381 1382 md5sig = rcu_dereference_protected(tp->md5sig_info, 1383 lockdep_sock_is_held(sk)); 1384 1385 key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO); 1386 if (!key) 1387 return -ENOMEM; 1388 1389 memcpy(key->key, newkey, newkeylen); 1390 key->keylen = newkeylen; 1391 key->family = family; 1392 key->prefixlen = prefixlen; 1393 key->l3index = l3index; 1394 key->flags = flags; 1395 memcpy(&key->addr, addr, 1396 (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? 
						      sizeof(struct in6_addr) :
						      sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}

int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, u8 prefixlen, int l3index, u8 flags,
		   const u8 *newkey, u8 newkeylen)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
		if (tcp_md5_alloc_sigpool())
			return -ENOMEM;

		if (tcp_md5sig_info_add(sk, GFP_KERNEL)) {
			tcp_md5_release_sigpool();
			return -ENOMEM;
		}

		if (!static_branch_inc(&tcp_md5_needed.key)) {
			struct tcp_md5sig_info *md5sig;

			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
			rcu_assign_pointer(tp->md5sig_info, NULL);
			kfree_rcu(md5sig, rcu);
			tcp_md5_release_sigpool();
			return -EUSERS;
		}
	}

	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
				newkey, newkeylen, GFP_KERNEL);
}
EXPORT_SYMBOL(tcp_md5_do_add);

int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
		     int family, u8 prefixlen, int l3index,
		     struct tcp_md5sig_key *key)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
		tcp_md5_add_sigpool();

		if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC))) {
			tcp_md5_release_sigpool();
			return -ENOMEM;
		}

		if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
			struct tcp_md5sig_info *md5sig;

			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
			net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
			rcu_assign_pointer(tp->md5sig_info, NULL);
			kfree_rcu(md5sig, rcu);
			tcp_md5_release_sigpool();
			return -EUSERS;
		}
	}

	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
				key->flags, key->key, key->keylen,
				sk_gfp_mask(sk, GFP_ATOMIC));
}
EXPORT_SYMBOL(tcp_md5_key_copy);

int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
		   u8 prefixlen, int l3index, u8 flags)
{
	struct tcp_md5sig_key *key;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);

void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
	}
}

static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
				 sockptr_t optval, int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
	const union tcp_md5_addr *addr;
	u8 prefixlen = 32;
	int l3index = 0;
	bool l3flag;
	u8 flags;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
	l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;

	if (optname == TCP_MD5SIG_EXT &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
		prefixlen = cmd.tcpm_prefixlen;
		if (prefixlen > 32)
			return -EINVAL;
	}

	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
		if (dev && netif_is_l3_master(dev))
			l3index = dev->ifindex;

		rcu_read_unlock();

		/* ok to reference set/not set outside of rcu;
		 * right now device MUST be an L3 master
		 */
		if (!dev || !l3index)
			return -EINVAL;
	}

	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;

	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	/* Don't allow keys for peers that have a matching TCP-AO key.
	 * See the comment in tcp_ao_add_cmd()
	 */
	if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false))
		return -EKEYREJECTED;

	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
			      cmd.tcpm_key, cmd.tcpm_keylen);
}
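
/* Editorial aside: tcp_v4_parse_md5_keys() above is the kernel side of the
 * TCP_MD5SIG / TCP_MD5SIG_EXT socket options (RFC 2385 signatures).  A
 * minimal userspace sketch that installs a key for one peer; illustrative
 * only, and the peer address and secret are made-up examples:
 *
 *	#include <arpa/inet.h>
 *	#include <netinet/in.h>
 *	#include <netinet/tcp.h>
 *	#include <string.h>
 *	#include <sys/socket.h>
 *
 *	static int add_md5_key(int fd, const char *peer, const char *secret)
 *	{
 *		struct tcp_md5sig md5 = {};
 *		struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *		sin->sin_family = AF_INET;
 *		inet_pton(AF_INET, peer, &sin->sin_addr);
 *		md5.tcpm_keylen = strlen(secret);
 *		memcpy(md5.tcpm_key, secret, md5.tcpm_keylen);
 *		// Set up before connect()/listen(); both ends must share
 *		// the same secret for segments to validate.
 *		return setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG,
 *				  &md5, sizeof(md5));
 *	}
 */
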
static int tcp_v4_md5_hash_headers(struct tcp_sigpool *hp,
				   __be32 daddr, __be32 saddr,
				   const struct tcphdr *th, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;
	struct tcphdr *_th;

	bp = hp->scratch;
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	_th = (struct tcphdr *)(bp + 1);
	memcpy(_th, th, sizeof(*th));
	_th->check = 0;

	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
	ahash_request_set_crypt(hp->req, &sg, NULL,
				sizeof(*bp) + sizeof(*th));
	return crypto_ahash_update(hp->req);
}

static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_sigpool hp;

	if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
		goto clear_hash_nostart;

	if (crypto_ahash_init(hp.req))
		goto clear_hash;
	if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(&hp, key))
		goto clear_hash;
	ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
	if (crypto_ahash_final(hp.req))
		goto clear_hash;

	tcp_sigpool_end(&hp);
	return 0;

clear_hash:
	tcp_sigpool_end(&hp);
clear_hash_nostart:
	memset(md5_hash, 0, 16);
	return 1;
}

int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
			const struct sock *sk,
			const struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct tcp_sigpool hp;
	__be32 saddr, daddr;

	if (sk) { /* valid for establish/request sockets */
		saddr = sk->sk_rcv_saddr;
		daddr = sk->sk_daddr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);

		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
		goto clear_hash_nostart;

	if (crypto_ahash_init(hp.req))
		goto clear_hash;

	if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, skb->len))
		goto clear_hash;
	if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(&hp, key))
		goto clear_hash;
	ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
	if (crypto_ahash_final(hp.req))
		goto clear_hash;

	tcp_sigpool_end(&hp);
	return 0;

clear_hash:
	tcp_sigpool_end(&hp);
clear_hash_nostart:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

#endif

static void tcp_v4_init_req(struct request_sock *req,
			    const struct sock *sk_listener,
			    struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	struct net *net = sock_net(sk_listener);

	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
}

static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
					  struct sk_buff *skb,
					  struct flowi *fl,
					  struct request_sock *req)
{
	tcp_v4_init_req(req, sk, skb);

	if (security_inet_conn_request(sk, skb, req))
		return NULL;

	return inet_csk_route_req(sk, &fl->u.ip4, req);
}

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};

const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.req_md5_lookup	=	tcp_v4_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#endif
#ifdef CONFIG_TCP_AO
	.ao_lookup	=	tcp_v4_ao_lookup_rsk,
	.ao_calc_key	=	tcp_v4_ao_calc_key_rsk,
	.ao_synack_hash	=	tcp_v4_ao_synack_hash,
#endif
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
#endif
	.route_req	=	tcp_v4_route_req,
	.init_seq	=	tcp_v4_init_seq,
	.init_ts_off	=	tcp_v4_init_ts_off,
	.send_synack	=	tcp_v4_send_synack,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&tcp_request_sock_ops,
				&tcp_request_sock_ipv4_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);


/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
1735 */ 1736 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, 1737 struct request_sock *req, 1738 struct dst_entry *dst, 1739 struct request_sock *req_unhash, 1740 bool *own_req) 1741 { 1742 struct inet_request_sock *ireq; 1743 bool found_dup_sk = false; 1744 struct inet_sock *newinet; 1745 struct tcp_sock *newtp; 1746 struct sock *newsk; 1747 #ifdef CONFIG_TCP_MD5SIG 1748 const union tcp_md5_addr *addr; 1749 struct tcp_md5sig_key *key; 1750 int l3index; 1751 #endif 1752 struct ip_options_rcu *inet_opt; 1753 1754 if (sk_acceptq_is_full(sk)) 1755 goto exit_overflow; 1756 1757 newsk = tcp_create_openreq_child(sk, req, skb); 1758 if (!newsk) 1759 goto exit_nonewsk; 1760 1761 newsk->sk_gso_type = SKB_GSO_TCPV4; 1762 inet_sk_rx_dst_set(newsk, skb); 1763 1764 newtp = tcp_sk(newsk); 1765 newinet = inet_sk(newsk); 1766 ireq = inet_rsk(req); 1767 sk_daddr_set(newsk, ireq->ir_rmt_addr); 1768 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr); 1769 newsk->sk_bound_dev_if = ireq->ir_iif; 1770 newinet->inet_saddr = ireq->ir_loc_addr; 1771 inet_opt = rcu_dereference(ireq->ireq_opt); 1772 RCU_INIT_POINTER(newinet->inet_opt, inet_opt); 1773 newinet->mc_index = inet_iif(skb); 1774 newinet->mc_ttl = ip_hdr(skb)->ttl; 1775 newinet->rcv_tos = ip_hdr(skb)->tos; 1776 inet_csk(newsk)->icsk_ext_hdr_len = 0; 1777 if (inet_opt) 1778 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 1779 atomic_set(&newinet->inet_id, get_random_u16()); 1780 1781 /* Set ToS of the new socket based upon the value of incoming SYN. 1782 * ECT bits are set later in tcp_init_transfer(). 1783 */ 1784 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)) 1785 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK; 1786 1787 if (!dst) { 1788 dst = inet_csk_route_child_sock(sk, newsk, req); 1789 if (!dst) 1790 goto put_and_exit; 1791 } else { 1792 /* syncookie case : see end of cookie_v4_check() */ 1793 } 1794 sk_setup_caps(newsk, dst); 1795 1796 tcp_ca_openreq_child(newsk, dst); 1797 1798 tcp_sync_mss(newsk, dst_mtu(dst)); 1799 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst)); 1800 1801 tcp_initialize_rcv_mss(newsk); 1802 1803 #ifdef CONFIG_TCP_MD5SIG 1804 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif); 1805 /* Copy over the MD5 key from the original socket */ 1806 addr = (union tcp_md5_addr *)&newinet->inet_daddr; 1807 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1808 if (key && !tcp_rsk_used_ao(req)) { 1809 if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key)) 1810 goto put_and_exit; 1811 sk_gso_disable(newsk); 1812 } 1813 #endif 1814 #ifdef CONFIG_TCP_AO 1815 if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET)) 1816 goto put_and_exit; /* OOM, release back memory */ 1817 #endif 1818 1819 if (__inet_inherit_port(sk, newsk) < 0) 1820 goto put_and_exit; 1821 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), 1822 &found_dup_sk); 1823 if (likely(*own_req)) { 1824 tcp_move_syn(newtp, req); 1825 ireq->ireq_opt = NULL; 1826 } else { 1827 newinet->inet_opt = NULL; 1828 1829 if (!req_unhash && found_dup_sk) { 1830 /* This code path should only be executed in the 1831 * syncookie case only 1832 */ 1833 bh_unlock_sock(newsk); 1834 sock_put(newsk); 1835 newsk = NULL; 1836 } 1837 } 1838 return newsk; 1839 1840 exit_overflow: 1841 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); 1842 exit_nonewsk: 1843 dst_release(dst); 1844 exit: 1845 tcp_listendrop(sk); 1846 return NULL; 1847 put_and_exit: 1848 newinet->inet_opt = NULL; 1849 
inet_csk_prepare_forced_close(newsk); 1850 tcp_done(newsk); 1851 goto exit; 1852 } 1853 EXPORT_SYMBOL(tcp_v4_syn_recv_sock); 1854 1855 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb) 1856 { 1857 #ifdef CONFIG_SYN_COOKIES 1858 const struct tcphdr *th = tcp_hdr(skb); 1859 1860 if (!th->syn) 1861 sk = cookie_v4_check(sk, skb); 1862 #endif 1863 return sk; 1864 } 1865 1866 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph, 1867 struct tcphdr *th, u32 *cookie) 1868 { 1869 u16 mss = 0; 1870 #ifdef CONFIG_SYN_COOKIES 1871 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops, 1872 &tcp_request_sock_ipv4_ops, sk, th); 1873 if (mss) { 1874 *cookie = __cookie_v4_init_sequence(iph, th, &mss); 1875 tcp_synq_overflow(sk); 1876 } 1877 #endif 1878 return mss; 1879 } 1880 1881 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, 1882 u32)); 1883 /* The socket must have it's spinlock held when we get 1884 * here, unless it is a TCP_LISTEN socket. 1885 * 1886 * We have a potential double-lock case here, so even when 1887 * doing backlog processing we use the BH locking scheme. 1888 * This is because we cannot sleep with the original spinlock 1889 * held. 1890 */ 1891 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) 1892 { 1893 enum skb_drop_reason reason; 1894 struct sock *rsk; 1895 1896 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ 1897 struct dst_entry *dst; 1898 1899 dst = rcu_dereference_protected(sk->sk_rx_dst, 1900 lockdep_sock_is_held(sk)); 1901 1902 sock_rps_save_rxhash(sk, skb); 1903 sk_mark_napi_id(sk, skb); 1904 if (dst) { 1905 if (sk->sk_rx_dst_ifindex != skb->skb_iif || 1906 !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check, 1907 dst, 0)) { 1908 RCU_INIT_POINTER(sk->sk_rx_dst, NULL); 1909 dst_release(dst); 1910 } 1911 } 1912 tcp_rcv_established(sk, skb); 1913 return 0; 1914 } 1915 1916 reason = SKB_DROP_REASON_NOT_SPECIFIED; 1917 if (tcp_checksum_complete(skb)) 1918 goto csum_err; 1919 1920 if (sk->sk_state == TCP_LISTEN) { 1921 struct sock *nsk = tcp_v4_cookie_check(sk, skb); 1922 1923 if (!nsk) 1924 goto discard; 1925 if (nsk != sk) { 1926 if (tcp_child_process(sk, nsk, skb)) { 1927 rsk = nsk; 1928 goto reset; 1929 } 1930 return 0; 1931 } 1932 } else 1933 sock_rps_save_rxhash(sk, skb); 1934 1935 if (tcp_rcv_state_process(sk, skb)) { 1936 rsk = sk; 1937 goto reset; 1938 } 1939 return 0; 1940 1941 reset: 1942 tcp_v4_send_reset(rsk, skb); 1943 discard: 1944 kfree_skb_reason(skb, reason); 1945 /* Be careful here. If this function gets more complicated and 1946 * gcc suffers from register pressure on the x86, sk (in %ebx) 1947 * might be destroyed here. This current version compiles correctly, 1948 * but you have been warned. 
1949 */ 1950 return 0; 1951 1952 csum_err: 1953 reason = SKB_DROP_REASON_TCP_CSUM; 1954 trace_tcp_bad_csum(skb); 1955 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 1956 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 1957 goto discard; 1958 } 1959 EXPORT_SYMBOL(tcp_v4_do_rcv); 1960 1961 int tcp_v4_early_demux(struct sk_buff *skb) 1962 { 1963 struct net *net = dev_net(skb->dev); 1964 const struct iphdr *iph; 1965 const struct tcphdr *th; 1966 struct sock *sk; 1967 1968 if (skb->pkt_type != PACKET_HOST) 1969 return 0; 1970 1971 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) 1972 return 0; 1973 1974 iph = ip_hdr(skb); 1975 th = tcp_hdr(skb); 1976 1977 if (th->doff < sizeof(struct tcphdr) / 4) 1978 return 0; 1979 1980 sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, 1981 iph->saddr, th->source, 1982 iph->daddr, ntohs(th->dest), 1983 skb->skb_iif, inet_sdif(skb)); 1984 if (sk) { 1985 skb->sk = sk; 1986 skb->destructor = sock_edemux; 1987 if (sk_fullsock(sk)) { 1988 struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst); 1989 1990 if (dst) 1991 dst = dst_check(dst, 0); 1992 if (dst && 1993 sk->sk_rx_dst_ifindex == skb->skb_iif) 1994 skb_dst_set_noref(skb, dst); 1995 } 1996 } 1997 return 0; 1998 } 1999 2000 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, 2001 enum skb_drop_reason *reason) 2002 { 2003 u32 limit, tail_gso_size, tail_gso_segs; 2004 struct skb_shared_info *shinfo; 2005 const struct tcphdr *th; 2006 struct tcphdr *thtail; 2007 struct sk_buff *tail; 2008 unsigned int hdrlen; 2009 bool fragstolen; 2010 u32 gso_segs; 2011 u32 gso_size; 2012 int delta; 2013 2014 /* In case all data was pulled from skb frags (in __pskb_pull_tail()), 2015 * we can fix skb->truesize to its real value to avoid future drops. 2016 * This is valid because skb is not yet charged to the socket. 2017 * It has been noticed pure SACK packets were sometimes dropped 2018 * (if cooked by drivers without copybreak feature). 2019 */ 2020 skb_condense(skb); 2021 2022 skb_dst_drop(skb); 2023 2024 if (unlikely(tcp_checksum_complete(skb))) { 2025 bh_unlock_sock(sk); 2026 trace_tcp_bad_csum(skb); 2027 *reason = SKB_DROP_REASON_TCP_CSUM; 2028 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 2029 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 2030 return true; 2031 } 2032 2033 /* Attempt coalescing to last skb in backlog, even if we are 2034 * above the limits. 2035 * This is okay because skb capacity is limited to MAX_SKB_FRAGS. 
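 * Coalescing only happens when the new segment is contiguous with the
 * tail (tail->end_seq == skb->seq), carries the same DSCP/ECN byte, has
 * no SYN/RST/URG on either segment, ACK set on both, matching ECE/CWR,
 * and an identical TCP header (same doff and option bytes), so the
 * merged skb still looks like a single GRO train further up the stack.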
2036 */ 2037 th = (const struct tcphdr *)skb->data; 2038 hdrlen = th->doff * 4; 2039 2040 tail = sk->sk_backlog.tail; 2041 if (!tail) 2042 goto no_coalesce; 2043 thtail = (struct tcphdr *)tail->data; 2044 2045 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq || 2046 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield || 2047 ((TCP_SKB_CB(tail)->tcp_flags | 2048 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) || 2049 !((TCP_SKB_CB(tail)->tcp_flags & 2050 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) || 2051 ((TCP_SKB_CB(tail)->tcp_flags ^ 2052 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) || 2053 #ifdef CONFIG_TLS_DEVICE 2054 tail->decrypted != skb->decrypted || 2055 #endif 2056 !mptcp_skb_can_collapse(tail, skb) || 2057 thtail->doff != th->doff || 2058 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th))) 2059 goto no_coalesce; 2060 2061 __skb_pull(skb, hdrlen); 2062 2063 shinfo = skb_shinfo(skb); 2064 gso_size = shinfo->gso_size ?: skb->len; 2065 gso_segs = shinfo->gso_segs ?: 1; 2066 2067 shinfo = skb_shinfo(tail); 2068 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen); 2069 tail_gso_segs = shinfo->gso_segs ?: 1; 2070 2071 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) { 2072 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq; 2073 2074 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) { 2075 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq; 2076 thtail->window = th->window; 2077 } 2078 2079 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and 2080 * thtail->fin, so that the fast path in tcp_rcv_established() 2081 * is not entered if we append a packet with a FIN. 2082 * SYN, RST, URG are not present. 2083 * ACK is set on both packets. 2084 * PSH : we do not really care in TCP stack, 2085 * at least for 'GRO' packets. 2086 */ 2087 thtail->fin |= th->fin; 2088 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; 2089 2090 if (TCP_SKB_CB(skb)->has_rxtstamp) { 2091 TCP_SKB_CB(tail)->has_rxtstamp = true; 2092 tail->tstamp = skb->tstamp; 2093 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp; 2094 } 2095 2096 /* Not as strict as GRO. We only need to carry mss max value */ 2097 shinfo->gso_size = max(gso_size, tail_gso_size); 2098 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF); 2099 2100 sk->sk_backlog.len += delta; 2101 __NET_INC_STATS(sock_net(sk), 2102 LINUX_MIB_TCPBACKLOGCOALESCE); 2103 kfree_skb_partial(skb, fragstolen); 2104 return false; 2105 } 2106 __skb_push(skb, hdrlen); 2107 2108 no_coalesce: 2109 limit = (u32)READ_ONCE(sk->sk_rcvbuf) + (u32)(READ_ONCE(sk->sk_sndbuf) >> 1); 2110 2111 /* Only socket owner can try to collapse/prune rx queues 2112 * to reduce memory overhead, so add a little headroom here. 2113 * Few sockets backlog are possibly concurrently non empty. 
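 * For example, with the default tcp_rmem[1] of 131072 and tcp_wmem[1]
 * of 16384, the resulting ceiling (including the 64KB headroom added
 * below) is 131072 + 8192 + 65536 = 204800 bytes of queued truesize
 * before sk_add_backlog() starts dropping.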
2114 */ 2115 limit += 64 * 1024; 2116 2117 if (unlikely(sk_add_backlog(sk, skb, limit))) { 2118 bh_unlock_sock(sk); 2119 *reason = SKB_DROP_REASON_SOCKET_BACKLOG; 2120 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP); 2121 return true; 2122 } 2123 return false; 2124 } 2125 EXPORT_SYMBOL(tcp_add_backlog); 2126 2127 int tcp_filter(struct sock *sk, struct sk_buff *skb) 2128 { 2129 struct tcphdr *th = (struct tcphdr *)skb->data; 2130 2131 return sk_filter_trim_cap(sk, skb, th->doff * 4); 2132 } 2133 EXPORT_SYMBOL(tcp_filter); 2134 2135 static void tcp_v4_restore_cb(struct sk_buff *skb) 2136 { 2137 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4, 2138 sizeof(struct inet_skb_parm)); 2139 } 2140 2141 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, 2142 const struct tcphdr *th) 2143 { 2144 /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB() 2145 * barrier() makes sure compiler wont play fool^Waliasing games. 2146 */ 2147 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb), 2148 sizeof(struct inet_skb_parm)); 2149 barrier(); 2150 2151 TCP_SKB_CB(skb)->seq = ntohl(th->seq); 2152 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + 2153 skb->len - th->doff * 4); 2154 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); 2155 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); 2156 TCP_SKB_CB(skb)->tcp_tw_isn = 0; 2157 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); 2158 TCP_SKB_CB(skb)->sacked = 0; 2159 TCP_SKB_CB(skb)->has_rxtstamp = 2160 skb->tstamp || skb_hwtstamps(skb)->hwtstamp; 2161 } 2162 2163 /* 2164 * From tcp_input.c 2165 */ 2166 2167 int tcp_v4_rcv(struct sk_buff *skb) 2168 { 2169 struct net *net = dev_net(skb->dev); 2170 enum skb_drop_reason drop_reason; 2171 int sdif = inet_sdif(skb); 2172 int dif = inet_iif(skb); 2173 const struct iphdr *iph; 2174 const struct tcphdr *th; 2175 bool refcounted; 2176 struct sock *sk; 2177 int ret; 2178 2179 drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; 2180 if (skb->pkt_type != PACKET_HOST) 2181 goto discard_it; 2182 2183 /* Count it even if it's bad */ 2184 __TCP_INC_STATS(net, TCP_MIB_INSEGS); 2185 2186 if (!pskb_may_pull(skb, sizeof(struct tcphdr))) 2187 goto discard_it; 2188 2189 th = (const struct tcphdr *)skb->data; 2190 2191 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) { 2192 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL; 2193 goto bad_packet; 2194 } 2195 if (!pskb_may_pull(skb, th->doff * 4)) 2196 goto discard_it; 2197 2198 /* An explanation is required here, I think. 2199 * Packet length and doff are validated by header prediction, 2200 * provided case of th->doff==0 is eliminated. 2201 * So, we defer the checks. 
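 * (pskb_may_pull() above has already made the full th->doff * 4 bytes
 * of header linear, so the pseudo-header checksum below and later
 * option parsing stay within the header; the stricter consistency
 * checks run in the per-state receive paths.)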
*/ 2202 2203 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo)) 2204 goto csum_error; 2205 2206 th = (const struct tcphdr *)skb->data; 2207 iph = ip_hdr(skb); 2208 lookup: 2209 sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo, 2210 skb, __tcp_hdrlen(th), th->source, 2211 th->dest, sdif, &refcounted); 2212 if (!sk) 2213 goto no_tcp_socket; 2214 2215 process: 2216 if (sk->sk_state == TCP_TIME_WAIT) 2217 goto do_time_wait; 2218 2219 if (sk->sk_state == TCP_NEW_SYN_RECV) { 2220 struct request_sock *req = inet_reqsk(sk); 2221 bool req_stolen = false; 2222 struct sock *nsk; 2223 2224 sk = req->rsk_listener; 2225 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) 2226 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2227 else 2228 drop_reason = tcp_inbound_hash(sk, req, skb, 2229 &iph->saddr, &iph->daddr, 2230 AF_INET, dif, sdif); 2231 if (unlikely(drop_reason)) { 2232 sk_drops_add(sk, skb); 2233 reqsk_put(req); 2234 goto discard_it; 2235 } 2236 if (tcp_checksum_complete(skb)) { 2237 reqsk_put(req); 2238 goto csum_error; 2239 } 2240 if (unlikely(sk->sk_state != TCP_LISTEN)) { 2241 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb); 2242 if (!nsk) { 2243 inet_csk_reqsk_queue_drop_and_put(sk, req); 2244 goto lookup; 2245 } 2246 sk = nsk; 2247 /* reuseport_migrate_sock() has already held one sk_refcnt 2248 * before returning. 2249 */ 2250 } else { 2251 /* We own a reference on the listener, increase it again 2252 * as we might lose it too soon. 2253 */ 2254 sock_hold(sk); 2255 } 2256 refcounted = true; 2257 nsk = NULL; 2258 if (!tcp_filter(sk, skb)) { 2259 th = (const struct tcphdr *)skb->data; 2260 iph = ip_hdr(skb); 2261 tcp_v4_fill_cb(skb, iph, th); 2262 nsk = tcp_check_req(sk, skb, req, false, &req_stolen); 2263 } else { 2264 drop_reason = SKB_DROP_REASON_SOCKET_FILTER; 2265 } 2266 if (!nsk) { 2267 reqsk_put(req); 2268 if (req_stolen) { 2269 /* Another cpu got exclusive access to req 2270 * and created a full blown socket. 2271 * Try to feed this packet to this socket 2272 * instead of discarding it. 
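 * req_stolen means another CPU completed the handshake on this request
 * while we were working on it; the child socket it created is now in
 * the established hash, so restoring the IP control block and looping
 * back to the lookup lets this segment reach that child.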
2273 */ 2274 tcp_v4_restore_cb(skb); 2275 sock_put(sk); 2276 goto lookup; 2277 } 2278 goto discard_and_relse; 2279 } 2280 nf_reset_ct(skb); 2281 if (nsk == sk) { 2282 reqsk_put(req); 2283 tcp_v4_restore_cb(skb); 2284 } else if (tcp_child_process(sk, nsk, skb)) { 2285 tcp_v4_send_reset(nsk, skb); 2286 goto discard_and_relse; 2287 } else { 2288 sock_put(sk); 2289 return 0; 2290 } 2291 } 2292 2293 if (static_branch_unlikely(&ip4_min_ttl)) { 2294 /* min_ttl can be changed concurrently from do_ip_setsockopt() */ 2295 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) { 2296 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 2297 drop_reason = SKB_DROP_REASON_TCP_MINTTL; 2298 goto discard_and_relse; 2299 } 2300 } 2301 2302 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { 2303 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2304 goto discard_and_relse; 2305 } 2306 2307 drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr, 2308 AF_INET, dif, sdif); 2309 if (drop_reason) 2310 goto discard_and_relse; 2311 2312 nf_reset_ct(skb); 2313 2314 if (tcp_filter(sk, skb)) { 2315 drop_reason = SKB_DROP_REASON_SOCKET_FILTER; 2316 goto discard_and_relse; 2317 } 2318 th = (const struct tcphdr *)skb->data; 2319 iph = ip_hdr(skb); 2320 tcp_v4_fill_cb(skb, iph, th); 2321 2322 skb->dev = NULL; 2323 2324 if (sk->sk_state == TCP_LISTEN) { 2325 ret = tcp_v4_do_rcv(sk, skb); 2326 goto put_and_return; 2327 } 2328 2329 sk_incoming_cpu_update(sk); 2330 2331 bh_lock_sock_nested(sk); 2332 tcp_segs_in(tcp_sk(sk), skb); 2333 ret = 0; 2334 if (!sock_owned_by_user(sk)) { 2335 ret = tcp_v4_do_rcv(sk, skb); 2336 } else { 2337 if (tcp_add_backlog(sk, skb, &drop_reason)) 2338 goto discard_and_relse; 2339 } 2340 bh_unlock_sock(sk); 2341 2342 put_and_return: 2343 if (refcounted) 2344 sock_put(sk); 2345 2346 return ret; 2347 2348 no_tcp_socket: 2349 drop_reason = SKB_DROP_REASON_NO_SOCKET; 2350 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) 2351 goto discard_it; 2352 2353 tcp_v4_fill_cb(skb, iph, th); 2354 2355 if (tcp_checksum_complete(skb)) { 2356 csum_error: 2357 drop_reason = SKB_DROP_REASON_TCP_CSUM; 2358 trace_tcp_bad_csum(skb); 2359 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); 2360 bad_packet: 2361 __TCP_INC_STATS(net, TCP_MIB_INERRS); 2362 } else { 2363 tcp_v4_send_reset(NULL, skb); 2364 } 2365 2366 discard_it: 2367 SKB_DR_OR(drop_reason, NOT_SPECIFIED); 2368 /* Discard frame. 
*/ 2369 kfree_skb_reason(skb, drop_reason); 2370 return 0; 2371 2372 discard_and_relse: 2373 sk_drops_add(sk, skb); 2374 if (refcounted) 2375 sock_put(sk); 2376 goto discard_it; 2377 2378 do_time_wait: 2379 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { 2380 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2381 inet_twsk_put(inet_twsk(sk)); 2382 goto discard_it; 2383 } 2384 2385 tcp_v4_fill_cb(skb, iph, th); 2386 2387 if (tcp_checksum_complete(skb)) { 2388 inet_twsk_put(inet_twsk(sk)); 2389 goto csum_error; 2390 } 2391 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) { 2392 case TCP_TW_SYN: { 2393 struct sock *sk2 = inet_lookup_listener(net, 2394 net->ipv4.tcp_death_row.hashinfo, 2395 skb, __tcp_hdrlen(th), 2396 iph->saddr, th->source, 2397 iph->daddr, th->dest, 2398 inet_iif(skb), 2399 sdif); 2400 if (sk2) { 2401 inet_twsk_deschedule_put(inet_twsk(sk)); 2402 sk = sk2; 2403 tcp_v4_restore_cb(skb); 2404 refcounted = false; 2405 goto process; 2406 } 2407 } 2408 /* to ACK */ 2409 fallthrough; 2410 case TCP_TW_ACK: 2411 tcp_v4_timewait_ack(sk, skb); 2412 break; 2413 case TCP_TW_RST: 2414 tcp_v4_send_reset(sk, skb); 2415 inet_twsk_deschedule_put(inet_twsk(sk)); 2416 goto discard_it; 2417 case TCP_TW_SUCCESS:; 2418 } 2419 goto discard_it; 2420 } 2421 2422 static struct timewait_sock_ops tcp_timewait_sock_ops = { 2423 .twsk_obj_size = sizeof(struct tcp_timewait_sock), 2424 .twsk_unique = tcp_twsk_unique, 2425 .twsk_destructor= tcp_twsk_destructor, 2426 }; 2427 2428 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) 2429 { 2430 struct dst_entry *dst = skb_dst(skb); 2431 2432 if (dst && dst_hold_safe(dst)) { 2433 rcu_assign_pointer(sk->sk_rx_dst, dst); 2434 sk->sk_rx_dst_ifindex = skb->skb_iif; 2435 } 2436 } 2437 EXPORT_SYMBOL(inet_sk_rx_dst_set); 2438 2439 const struct inet_connection_sock_af_ops ipv4_specific = { 2440 .queue_xmit = ip_queue_xmit, 2441 .send_check = tcp_v4_send_check, 2442 .rebuild_header = inet_sk_rebuild_header, 2443 .sk_rx_dst_set = inet_sk_rx_dst_set, 2444 .conn_request = tcp_v4_conn_request, 2445 .syn_recv_sock = tcp_v4_syn_recv_sock, 2446 .net_header_len = sizeof(struct iphdr), 2447 .setsockopt = ip_setsockopt, 2448 .getsockopt = ip_getsockopt, 2449 .addr2sockaddr = inet_csk_addr2sockaddr, 2450 .sockaddr_len = sizeof(struct sockaddr_in), 2451 .mtu_reduced = tcp_v4_mtu_reduced, 2452 }; 2453 EXPORT_SYMBOL(ipv4_specific); 2454 2455 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) 2456 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { 2457 #ifdef CONFIG_TCP_MD5SIG 2458 .md5_lookup = tcp_v4_md5_lookup, 2459 .calc_md5_hash = tcp_v4_md5_hash_skb, 2460 .md5_parse = tcp_v4_parse_md5_keys, 2461 #endif 2462 #ifdef CONFIG_TCP_AO 2463 .ao_lookup = tcp_v4_ao_lookup, 2464 .calc_ao_hash = tcp_v4_ao_hash_skb, 2465 .ao_parse = tcp_v4_parse_ao, 2466 .ao_calc_key_sk = tcp_v4_ao_calc_key_sk, 2467 #endif 2468 }; 2469 #endif 2470 2471 /* NOTE: A lot of things set to zero explicitly by call to 2472 * sk_alloc() so need not be done here. 
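 * tcp_v4_init_sock() below therefore only installs the IPv4-specific
 * operation tables (plus the MD5/AO af_specific ops when configured);
 * the protocol-independent setup is done by tcp_init_sock().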
2473 */ 2474 static int tcp_v4_init_sock(struct sock *sk) 2475 { 2476 struct inet_connection_sock *icsk = inet_csk(sk); 2477 2478 tcp_init_sock(sk); 2479 2480 icsk->icsk_af_ops = &ipv4_specific; 2481 2482 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) 2483 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific; 2484 #endif 2485 2486 return 0; 2487 } 2488 2489 #ifdef CONFIG_TCP_MD5SIG 2490 static void tcp_md5sig_info_free_rcu(struct rcu_head *head) 2491 { 2492 struct tcp_md5sig_info *md5sig; 2493 2494 md5sig = container_of(head, struct tcp_md5sig_info, rcu); 2495 kfree(md5sig); 2496 static_branch_slow_dec_deferred(&tcp_md5_needed); 2497 tcp_md5_release_sigpool(); 2498 } 2499 #endif 2500 2501 void tcp_v4_destroy_sock(struct sock *sk) 2502 { 2503 struct tcp_sock *tp = tcp_sk(sk); 2504 2505 trace_tcp_destroy_sock(sk); 2506 2507 tcp_clear_xmit_timers(sk); 2508 2509 tcp_cleanup_congestion_control(sk); 2510 2511 tcp_cleanup_ulp(sk); 2512 2513 /* Cleanup up the write buffer. */ 2514 tcp_write_queue_purge(sk); 2515 2516 /* Check if we want to disable active TFO */ 2517 tcp_fastopen_active_disable_ofo_check(sk); 2518 2519 /* Cleans up our, hopefully empty, out_of_order_queue. */ 2520 skb_rbtree_purge(&tp->out_of_order_queue); 2521 2522 #ifdef CONFIG_TCP_MD5SIG 2523 /* Clean up the MD5 key list, if any */ 2524 if (tp->md5sig_info) { 2525 struct tcp_md5sig_info *md5sig; 2526 2527 md5sig = rcu_dereference_protected(tp->md5sig_info, 1); 2528 tcp_clear_md5_list(sk); 2529 call_rcu(&md5sig->rcu, tcp_md5sig_info_free_rcu); 2530 rcu_assign_pointer(tp->md5sig_info, NULL); 2531 } 2532 #endif 2533 tcp_ao_destroy_sock(sk, false); 2534 2535 /* Clean up a referenced TCP bind bucket. */ 2536 if (inet_csk(sk)->icsk_bind_hash) 2537 inet_put_port(sk); 2538 2539 BUG_ON(rcu_access_pointer(tp->fastopen_rsk)); 2540 2541 /* If socket is aborted during connect operation */ 2542 tcp_free_fastopen_req(tp); 2543 tcp_fastopen_destroy_cipher(sk); 2544 tcp_saved_syn_free(tp); 2545 2546 sk_sockets_allocated_dec(sk); 2547 } 2548 EXPORT_SYMBOL(tcp_v4_destroy_sock); 2549 2550 #ifdef CONFIG_PROC_FS 2551 /* Proc filesystem TCP sock list dumping. */ 2552 2553 static unsigned short seq_file_family(const struct seq_file *seq); 2554 2555 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk) 2556 { 2557 unsigned short family = seq_file_family(seq); 2558 2559 /* AF_UNSPEC is used as a match all */ 2560 return ((family == AF_UNSPEC || family == sk->sk_family) && 2561 net_eq(sock_net(sk), seq_file_net(seq))); 2562 } 2563 2564 /* Find a non empty bucket (starting from st->bucket) 2565 * and return the first sk from it. 2566 */ 2567 static void *listening_get_first(struct seq_file *seq) 2568 { 2569 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2570 struct tcp_iter_state *st = seq->private; 2571 2572 st->offset = 0; 2573 for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) { 2574 struct inet_listen_hashbucket *ilb2; 2575 struct hlist_nulls_node *node; 2576 struct sock *sk; 2577 2578 ilb2 = &hinfo->lhash2[st->bucket]; 2579 if (hlist_nulls_empty(&ilb2->nulls_head)) 2580 continue; 2581 2582 spin_lock(&ilb2->lock); 2583 sk_nulls_for_each(sk, node, &ilb2->nulls_head) { 2584 if (seq_sk_match(seq, sk)) 2585 return sk; 2586 } 2587 spin_unlock(&ilb2->lock); 2588 } 2589 2590 return NULL; 2591 } 2592 2593 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket). 
2594 * If "cur" is the last one in the st->bucket, 2595 * call listening_get_first() to return the first sk of the next 2596 * non empty bucket. 2597 */ 2598 static void *listening_get_next(struct seq_file *seq, void *cur) 2599 { 2600 struct tcp_iter_state *st = seq->private; 2601 struct inet_listen_hashbucket *ilb2; 2602 struct hlist_nulls_node *node; 2603 struct inet_hashinfo *hinfo; 2604 struct sock *sk = cur; 2605 2606 ++st->num; 2607 ++st->offset; 2608 2609 sk = sk_nulls_next(sk); 2610 sk_nulls_for_each_from(sk, node) { 2611 if (seq_sk_match(seq, sk)) 2612 return sk; 2613 } 2614 2615 hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2616 ilb2 = &hinfo->lhash2[st->bucket]; 2617 spin_unlock(&ilb2->lock); 2618 ++st->bucket; 2619 return listening_get_first(seq); 2620 } 2621 2622 static void *listening_get_idx(struct seq_file *seq, loff_t *pos) 2623 { 2624 struct tcp_iter_state *st = seq->private; 2625 void *rc; 2626 2627 st->bucket = 0; 2628 st->offset = 0; 2629 rc = listening_get_first(seq); 2630 2631 while (rc && *pos) { 2632 rc = listening_get_next(seq, rc); 2633 --*pos; 2634 } 2635 return rc; 2636 } 2637 2638 static inline bool empty_bucket(struct inet_hashinfo *hinfo, 2639 const struct tcp_iter_state *st) 2640 { 2641 return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain); 2642 } 2643 2644 /* 2645 * Get first established socket starting from bucket given in st->bucket. 2646 * If st->bucket is zero, the very first socket in the hash is returned. 2647 */ 2648 static void *established_get_first(struct seq_file *seq) 2649 { 2650 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2651 struct tcp_iter_state *st = seq->private; 2652 2653 st->offset = 0; 2654 for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) { 2655 struct sock *sk; 2656 struct hlist_nulls_node *node; 2657 spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket); 2658 2659 cond_resched(); 2660 2661 /* Lockless fast path for the common case of empty buckets */ 2662 if (empty_bucket(hinfo, st)) 2663 continue; 2664 2665 spin_lock_bh(lock); 2666 sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) { 2667 if (seq_sk_match(seq, sk)) 2668 return sk; 2669 } 2670 spin_unlock_bh(lock); 2671 } 2672 2673 return NULL; 2674 } 2675 2676 static void *established_get_next(struct seq_file *seq, void *cur) 2677 { 2678 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2679 struct tcp_iter_state *st = seq->private; 2680 struct hlist_nulls_node *node; 2681 struct sock *sk = cur; 2682 2683 ++st->num; 2684 ++st->offset; 2685 2686 sk = sk_nulls_next(sk); 2687 2688 sk_nulls_for_each_from(sk, node) { 2689 if (seq_sk_match(seq, sk)) 2690 return sk; 2691 } 2692 2693 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 2694 ++st->bucket; 2695 return established_get_first(seq); 2696 } 2697 2698 static void *established_get_idx(struct seq_file *seq, loff_t pos) 2699 { 2700 struct tcp_iter_state *st = seq->private; 2701 void *rc; 2702 2703 st->bucket = 0; 2704 rc = established_get_first(seq); 2705 2706 while (rc && pos) { 2707 rc = established_get_next(seq, rc); 2708 --pos; 2709 } 2710 return rc; 2711 } 2712 2713 static void *tcp_get_idx(struct seq_file *seq, loff_t pos) 2714 { 2715 void *rc; 2716 struct tcp_iter_state *st = seq->private; 2717 2718 st->state = TCP_SEQ_STATE_LISTENING; 2719 rc = listening_get_idx(seq, &pos); 2720 2721 if (!rc) { 2722 st->state = TCP_SEQ_STATE_ESTABLISHED; 2723 rc = established_get_idx(seq, pos); 2724 } 2725 2726 return rc; 2727 } 2728 2729 static void 
*tcp_seek_last_pos(struct seq_file *seq) 2730 { 2731 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2732 struct tcp_iter_state *st = seq->private; 2733 int bucket = st->bucket; 2734 int offset = st->offset; 2735 int orig_num = st->num; 2736 void *rc = NULL; 2737 2738 switch (st->state) { 2739 case TCP_SEQ_STATE_LISTENING: 2740 if (st->bucket > hinfo->lhash2_mask) 2741 break; 2742 rc = listening_get_first(seq); 2743 while (offset-- && rc && bucket == st->bucket) 2744 rc = listening_get_next(seq, rc); 2745 if (rc) 2746 break; 2747 st->bucket = 0; 2748 st->state = TCP_SEQ_STATE_ESTABLISHED; 2749 fallthrough; 2750 case TCP_SEQ_STATE_ESTABLISHED: 2751 if (st->bucket > hinfo->ehash_mask) 2752 break; 2753 rc = established_get_first(seq); 2754 while (offset-- && rc && bucket == st->bucket) 2755 rc = established_get_next(seq, rc); 2756 } 2757 2758 st->num = orig_num; 2759 2760 return rc; 2761 } 2762 2763 void *tcp_seq_start(struct seq_file *seq, loff_t *pos) 2764 { 2765 struct tcp_iter_state *st = seq->private; 2766 void *rc; 2767 2768 if (*pos && *pos == st->last_pos) { 2769 rc = tcp_seek_last_pos(seq); 2770 if (rc) 2771 goto out; 2772 } 2773 2774 st->state = TCP_SEQ_STATE_LISTENING; 2775 st->num = 0; 2776 st->bucket = 0; 2777 st->offset = 0; 2778 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; 2779 2780 out: 2781 st->last_pos = *pos; 2782 return rc; 2783 } 2784 EXPORT_SYMBOL(tcp_seq_start); 2785 2786 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2787 { 2788 struct tcp_iter_state *st = seq->private; 2789 void *rc = NULL; 2790 2791 if (v == SEQ_START_TOKEN) { 2792 rc = tcp_get_idx(seq, 0); 2793 goto out; 2794 } 2795 2796 switch (st->state) { 2797 case TCP_SEQ_STATE_LISTENING: 2798 rc = listening_get_next(seq, v); 2799 if (!rc) { 2800 st->state = TCP_SEQ_STATE_ESTABLISHED; 2801 st->bucket = 0; 2802 st->offset = 0; 2803 rc = established_get_first(seq); 2804 } 2805 break; 2806 case TCP_SEQ_STATE_ESTABLISHED: 2807 rc = established_get_next(seq, v); 2808 break; 2809 } 2810 out: 2811 ++*pos; 2812 st->last_pos = *pos; 2813 return rc; 2814 } 2815 EXPORT_SYMBOL(tcp_seq_next); 2816 2817 void tcp_seq_stop(struct seq_file *seq, void *v) 2818 { 2819 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2820 struct tcp_iter_state *st = seq->private; 2821 2822 switch (st->state) { 2823 case TCP_SEQ_STATE_LISTENING: 2824 if (v != SEQ_START_TOKEN) 2825 spin_unlock(&hinfo->lhash2[st->bucket].lock); 2826 break; 2827 case TCP_SEQ_STATE_ESTABLISHED: 2828 if (v) 2829 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 2830 break; 2831 } 2832 } 2833 EXPORT_SYMBOL(tcp_seq_stop); 2834 2835 static void get_openreq4(const struct request_sock *req, 2836 struct seq_file *f, int i) 2837 { 2838 const struct inet_request_sock *ireq = inet_rsk(req); 2839 long delta = req->rsk_timer.expires - jiffies; 2840 2841 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2842 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK", 2843 i, 2844 ireq->ir_loc_addr, 2845 ireq->ir_num, 2846 ireq->ir_rmt_addr, 2847 ntohs(ireq->ir_rmt_port), 2848 TCP_SYN_RECV, 2849 0, 0, /* could print option size, but that is af dependent. 
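 * (the two zeros are the tx_queue:rx_queue columns, which have no
 * meaning for a not-yet-accepted request)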
*/ 2850 1, /* timers active (only the expire timer) */ 2851 jiffies_delta_to_clock_t(delta), 2852 req->num_timeout, 2853 from_kuid_munged(seq_user_ns(f), 2854 sock_i_uid(req->rsk_listener)), 2855 0, /* non standard timer */ 2856 0, /* open_requests have no inode */ 2857 0, 2858 req); 2859 } 2860 2861 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) 2862 { 2863 int timer_active; 2864 unsigned long timer_expires; 2865 const struct tcp_sock *tp = tcp_sk(sk); 2866 const struct inet_connection_sock *icsk = inet_csk(sk); 2867 const struct inet_sock *inet = inet_sk(sk); 2868 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; 2869 __be32 dest = inet->inet_daddr; 2870 __be32 src = inet->inet_rcv_saddr; 2871 __u16 destp = ntohs(inet->inet_dport); 2872 __u16 srcp = ntohs(inet->inet_sport); 2873 int rx_queue; 2874 int state; 2875 2876 if (icsk->icsk_pending == ICSK_TIME_RETRANS || 2877 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || 2878 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { 2879 timer_active = 1; 2880 timer_expires = icsk->icsk_timeout; 2881 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { 2882 timer_active = 4; 2883 timer_expires = icsk->icsk_timeout; 2884 } else if (timer_pending(&sk->sk_timer)) { 2885 timer_active = 2; 2886 timer_expires = sk->sk_timer.expires; 2887 } else { 2888 timer_active = 0; 2889 timer_expires = jiffies; 2890 } 2891 2892 state = inet_sk_state_load(sk); 2893 if (state == TCP_LISTEN) 2894 rx_queue = READ_ONCE(sk->sk_ack_backlog); 2895 else 2896 /* Because we don't lock the socket, 2897 * we might find a transient negative value. 2898 */ 2899 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) - 2900 READ_ONCE(tp->copied_seq), 0); 2901 2902 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " 2903 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d", 2904 i, src, srcp, dest, destp, state, 2905 READ_ONCE(tp->write_seq) - tp->snd_una, 2906 rx_queue, 2907 timer_active, 2908 jiffies_delta_to_clock_t(timer_expires - jiffies), 2909 icsk->icsk_retransmits, 2910 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)), 2911 icsk->icsk_probes_out, 2912 sock_i_ino(sk), 2913 refcount_read(&sk->sk_refcnt), sk, 2914 jiffies_to_clock_t(icsk->icsk_rto), 2915 jiffies_to_clock_t(icsk->icsk_ack.ato), 2916 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk), 2917 tcp_snd_cwnd(tp), 2918 state == TCP_LISTEN ? 2919 fastopenq->max_qlen : 2920 (tcp_in_initial_slowstart(tp) ? 
-1 : tp->snd_ssthresh)); 2921 } 2922 2923 static void get_timewait4_sock(const struct inet_timewait_sock *tw, 2924 struct seq_file *f, int i) 2925 { 2926 long delta = tw->tw_timer.expires - jiffies; 2927 __be32 dest, src; 2928 __u16 destp, srcp; 2929 2930 dest = tw->tw_daddr; 2931 src = tw->tw_rcv_saddr; 2932 destp = ntohs(tw->tw_dport); 2933 srcp = ntohs(tw->tw_sport); 2934 2935 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2936 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK", 2937 i, src, srcp, dest, destp, tw->tw_substate, 0, 0, 2938 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, 2939 refcount_read(&tw->tw_refcnt), tw); 2940 } 2941 2942 #define TMPSZ 150 2943 2944 static int tcp4_seq_show(struct seq_file *seq, void *v) 2945 { 2946 struct tcp_iter_state *st; 2947 struct sock *sk = v; 2948 2949 seq_setwidth(seq, TMPSZ - 1); 2950 if (v == SEQ_START_TOKEN) { 2951 seq_puts(seq, " sl local_address rem_address st tx_queue " 2952 "rx_queue tr tm->when retrnsmt uid timeout " 2953 "inode"); 2954 goto out; 2955 } 2956 st = seq->private; 2957 2958 if (sk->sk_state == TCP_TIME_WAIT) 2959 get_timewait4_sock(v, seq, st->num); 2960 else if (sk->sk_state == TCP_NEW_SYN_RECV) 2961 get_openreq4(v, seq, st->num); 2962 else 2963 get_tcp4_sock(v, seq, st->num); 2964 out: 2965 seq_pad(seq, '\n'); 2966 return 0; 2967 } 2968 2969 #ifdef CONFIG_BPF_SYSCALL 2970 struct bpf_tcp_iter_state { 2971 struct tcp_iter_state state; 2972 unsigned int cur_sk; 2973 unsigned int end_sk; 2974 unsigned int max_sk; 2975 struct sock **batch; 2976 bool st_bucket_done; 2977 }; 2978 2979 struct bpf_iter__tcp { 2980 __bpf_md_ptr(struct bpf_iter_meta *, meta); 2981 __bpf_md_ptr(struct sock_common *, sk_common); 2982 uid_t uid __aligned(8); 2983 }; 2984 2985 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, 2986 struct sock_common *sk_common, uid_t uid) 2987 { 2988 struct bpf_iter__tcp ctx; 2989 2990 meta->seq_num--; /* skip SEQ_START_TOKEN */ 2991 ctx.meta = meta; 2992 ctx.sk_common = sk_common; 2993 ctx.uid = uid; 2994 return bpf_iter_run_prog(prog, &ctx); 2995 } 2996 2997 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter) 2998 { 2999 while (iter->cur_sk < iter->end_sk) 3000 sock_gen_put(iter->batch[iter->cur_sk++]); 3001 } 3002 3003 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter, 3004 unsigned int new_batch_sz) 3005 { 3006 struct sock **new_batch; 3007 3008 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, 3009 GFP_USER | __GFP_NOWARN); 3010 if (!new_batch) 3011 return -ENOMEM; 3012 3013 bpf_iter_tcp_put_batch(iter); 3014 kvfree(iter->batch); 3015 iter->batch = new_batch; 3016 iter->max_sk = new_batch_sz; 3017 3018 return 0; 3019 } 3020 3021 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq, 3022 struct sock *start_sk) 3023 { 3024 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 3025 struct bpf_tcp_iter_state *iter = seq->private; 3026 struct tcp_iter_state *st = &iter->state; 3027 struct hlist_nulls_node *node; 3028 unsigned int expected = 1; 3029 struct sock *sk; 3030 3031 sock_hold(start_sk); 3032 iter->batch[iter->end_sk++] = start_sk; 3033 3034 sk = sk_nulls_next(start_sk); 3035 sk_nulls_for_each_from(sk, node) { 3036 if (seq_sk_match(seq, sk)) { 3037 if (iter->end_sk < iter->max_sk) { 3038 sock_hold(sk); 3039 iter->batch[iter->end_sk++] = sk; 3040 } 3041 expected++; 3042 } 3043 } 3044 spin_unlock(&hinfo->lhash2[st->bucket].lock); 3045 3046 return expected; 3047 } 3048 3049 static unsigned int 
bpf_iter_tcp_established_batch(struct seq_file *seq, 3050 struct sock *start_sk) 3051 { 3052 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 3053 struct bpf_tcp_iter_state *iter = seq->private; 3054 struct tcp_iter_state *st = &iter->state; 3055 struct hlist_nulls_node *node; 3056 unsigned int expected = 1; 3057 struct sock *sk; 3058 3059 sock_hold(start_sk); 3060 iter->batch[iter->end_sk++] = start_sk; 3061 3062 sk = sk_nulls_next(start_sk); 3063 sk_nulls_for_each_from(sk, node) { 3064 if (seq_sk_match(seq, sk)) { 3065 if (iter->end_sk < iter->max_sk) { 3066 sock_hold(sk); 3067 iter->batch[iter->end_sk++] = sk; 3068 } 3069 expected++; 3070 } 3071 } 3072 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 3073 3074 return expected; 3075 } 3076 3077 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq) 3078 { 3079 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 3080 struct bpf_tcp_iter_state *iter = seq->private; 3081 struct tcp_iter_state *st = &iter->state; 3082 unsigned int expected; 3083 bool resized = false; 3084 struct sock *sk; 3085 3086 /* The st->bucket is done. Directly advance to the next 3087 * bucket instead of having the tcp_seek_last_pos() to skip 3088 * one by one in the current bucket and eventually find out 3089 * it has to advance to the next bucket. 3090 */ 3091 if (iter->st_bucket_done) { 3092 st->offset = 0; 3093 st->bucket++; 3094 if (st->state == TCP_SEQ_STATE_LISTENING && 3095 st->bucket > hinfo->lhash2_mask) { 3096 st->state = TCP_SEQ_STATE_ESTABLISHED; 3097 st->bucket = 0; 3098 } 3099 } 3100 3101 again: 3102 /* Get a new batch */ 3103 iter->cur_sk = 0; 3104 iter->end_sk = 0; 3105 iter->st_bucket_done = false; 3106 3107 sk = tcp_seek_last_pos(seq); 3108 if (!sk) 3109 return NULL; /* Done */ 3110 3111 if (st->state == TCP_SEQ_STATE_LISTENING) 3112 expected = bpf_iter_tcp_listening_batch(seq, sk); 3113 else 3114 expected = bpf_iter_tcp_established_batch(seq, sk); 3115 3116 if (iter->end_sk == expected) { 3117 iter->st_bucket_done = true; 3118 return sk; 3119 } 3120 3121 if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) { 3122 resized = true; 3123 goto again; 3124 } 3125 3126 return sk; 3127 } 3128 3129 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos) 3130 { 3131 /* bpf iter does not support lseek, so it always 3132 * continue from where it was stop()-ped. 3133 */ 3134 if (*pos) 3135 return bpf_iter_tcp_batch(seq); 3136 3137 return SEQ_START_TOKEN; 3138 } 3139 3140 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3141 { 3142 struct bpf_tcp_iter_state *iter = seq->private; 3143 struct tcp_iter_state *st = &iter->state; 3144 struct sock *sk; 3145 3146 /* Whenever seq_next() is called, the iter->cur_sk is 3147 * done with seq_show(), so advance to the next sk in 3148 * the batch. 3149 */ 3150 if (iter->cur_sk < iter->end_sk) { 3151 /* Keeping st->num consistent in tcp_iter_state. 3152 * bpf_iter_tcp does not use st->num. 3153 * meta.seq_num is used instead. 3154 */ 3155 st->num++; 3156 /* Move st->offset to the next sk in the bucket such that 3157 * the future start() will resume at st->offset in 3158 * st->bucket. See tcp_seek_last_pos(). 3159 */ 3160 st->offset++; 3161 sock_gen_put(iter->batch[iter->cur_sk++]); 3162 } 3163 3164 if (iter->cur_sk < iter->end_sk) 3165 sk = iter->batch[iter->cur_sk]; 3166 else 3167 sk = bpf_iter_tcp_batch(seq); 3168 3169 ++*pos; 3170 /* Keeping st->last_pos consistent in tcp_iter_state. 
3171 * bpf iter does not do lseek, so st->last_pos always equals to *pos. 3172 */ 3173 st->last_pos = *pos; 3174 return sk; 3175 } 3176 3177 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v) 3178 { 3179 struct bpf_iter_meta meta; 3180 struct bpf_prog *prog; 3181 struct sock *sk = v; 3182 uid_t uid; 3183 int ret; 3184 3185 if (v == SEQ_START_TOKEN) 3186 return 0; 3187 3188 if (sk_fullsock(sk)) 3189 lock_sock(sk); 3190 3191 if (unlikely(sk_unhashed(sk))) { 3192 ret = SEQ_SKIP; 3193 goto unlock; 3194 } 3195 3196 if (sk->sk_state == TCP_TIME_WAIT) { 3197 uid = 0; 3198 } else if (sk->sk_state == TCP_NEW_SYN_RECV) { 3199 const struct request_sock *req = v; 3200 3201 uid = from_kuid_munged(seq_user_ns(seq), 3202 sock_i_uid(req->rsk_listener)); 3203 } else { 3204 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); 3205 } 3206 3207 meta.seq = seq; 3208 prog = bpf_iter_get_info(&meta, false); 3209 ret = tcp_prog_seq_show(prog, &meta, v, uid); 3210 3211 unlock: 3212 if (sk_fullsock(sk)) 3213 release_sock(sk); 3214 return ret; 3215 3216 } 3217 3218 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v) 3219 { 3220 struct bpf_tcp_iter_state *iter = seq->private; 3221 struct bpf_iter_meta meta; 3222 struct bpf_prog *prog; 3223 3224 if (!v) { 3225 meta.seq = seq; 3226 prog = bpf_iter_get_info(&meta, true); 3227 if (prog) 3228 (void)tcp_prog_seq_show(prog, &meta, v, 0); 3229 } 3230 3231 if (iter->cur_sk < iter->end_sk) { 3232 bpf_iter_tcp_put_batch(iter); 3233 iter->st_bucket_done = false; 3234 } 3235 } 3236 3237 static const struct seq_operations bpf_iter_tcp_seq_ops = { 3238 .show = bpf_iter_tcp_seq_show, 3239 .start = bpf_iter_tcp_seq_start, 3240 .next = bpf_iter_tcp_seq_next, 3241 .stop = bpf_iter_tcp_seq_stop, 3242 }; 3243 #endif 3244 static unsigned short seq_file_family(const struct seq_file *seq) 3245 { 3246 const struct tcp_seq_afinfo *afinfo; 3247 3248 #ifdef CONFIG_BPF_SYSCALL 3249 /* Iterated from bpf_iter. Let the bpf prog to filter instead. */ 3250 if (seq->op == &bpf_iter_tcp_seq_ops) 3251 return AF_UNSPEC; 3252 #endif 3253 3254 /* Iterated from proc fs */ 3255 afinfo = pde_data(file_inode(seq->file)); 3256 return afinfo->family; 3257 } 3258 3259 static const struct seq_operations tcp4_seq_ops = { 3260 .show = tcp4_seq_show, 3261 .start = tcp_seq_start, 3262 .next = tcp_seq_next, 3263 .stop = tcp_seq_stop, 3264 }; 3265 3266 static struct tcp_seq_afinfo tcp4_seq_afinfo = { 3267 .family = AF_INET, 3268 }; 3269 3270 static int __net_init tcp4_proc_init_net(struct net *net) 3271 { 3272 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops, 3273 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo)) 3274 return -ENOMEM; 3275 return 0; 3276 } 3277 3278 static void __net_exit tcp4_proc_exit_net(struct net *net) 3279 { 3280 remove_proc_entry("tcp", net->proc_net); 3281 } 3282 3283 static struct pernet_operations tcp4_net_ops = { 3284 .init = tcp4_proc_init_net, 3285 .exit = tcp4_proc_exit_net, 3286 }; 3287 3288 int __init tcp4_proc_init(void) 3289 { 3290 return register_pernet_subsys(&tcp4_net_ops); 3291 } 3292 3293 void tcp4_proc_exit(void) 3294 { 3295 unregister_pernet_subsys(&tcp4_net_ops); 3296 } 3297 #endif /* CONFIG_PROC_FS */ 3298 3299 /* @wake is one when sk_stream_write_space() calls us. 3300 * This sends EPOLLOUT only if notsent_bytes is half the limit. 3301 * This mimics the strategy used in sock_def_write_space(). 
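 * For example, if tcp_notsent_lowat is set to 131072, a wake == 1
 * caller reports the stream writable only once fewer than 65536
 * written-but-unsent bytes remain, because the byte count is doubled
 * (notsent_bytes << wake) before the comparison.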
3302 */ 3303 bool tcp_stream_memory_free(const struct sock *sk, int wake) 3304 { 3305 const struct tcp_sock *tp = tcp_sk(sk); 3306 u32 notsent_bytes = READ_ONCE(tp->write_seq) - 3307 READ_ONCE(tp->snd_nxt); 3308 3309 return (notsent_bytes << wake) < tcp_notsent_lowat(tp); 3310 } 3311 EXPORT_SYMBOL(tcp_stream_memory_free); 3312 3313 struct proto tcp_prot = { 3314 .name = "TCP", 3315 .owner = THIS_MODULE, 3316 .close = tcp_close, 3317 .pre_connect = tcp_v4_pre_connect, 3318 .connect = tcp_v4_connect, 3319 .disconnect = tcp_disconnect, 3320 .accept = inet_csk_accept, 3321 .ioctl = tcp_ioctl, 3322 .init = tcp_v4_init_sock, 3323 .destroy = tcp_v4_destroy_sock, 3324 .shutdown = tcp_shutdown, 3325 .setsockopt = tcp_setsockopt, 3326 .getsockopt = tcp_getsockopt, 3327 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt, 3328 .keepalive = tcp_set_keepalive, 3329 .recvmsg = tcp_recvmsg, 3330 .sendmsg = tcp_sendmsg, 3331 .splice_eof = tcp_splice_eof, 3332 .backlog_rcv = tcp_v4_do_rcv, 3333 .release_cb = tcp_release_cb, 3334 .hash = inet_hash, 3335 .unhash = inet_unhash, 3336 .get_port = inet_csk_get_port, 3337 .put_port = inet_put_port, 3338 #ifdef CONFIG_BPF_SYSCALL 3339 .psock_update_sk_prot = tcp_bpf_update_proto, 3340 #endif 3341 .enter_memory_pressure = tcp_enter_memory_pressure, 3342 .leave_memory_pressure = tcp_leave_memory_pressure, 3343 .stream_memory_free = tcp_stream_memory_free, 3344 .sockets_allocated = &tcp_sockets_allocated, 3345 .orphan_count = &tcp_orphan_count, 3346 3347 .memory_allocated = &tcp_memory_allocated, 3348 .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, 3349 3350 .memory_pressure = &tcp_memory_pressure, 3351 .sysctl_mem = sysctl_tcp_mem, 3352 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), 3353 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), 3354 .max_header = MAX_TCP_HEADER, 3355 .obj_size = sizeof(struct tcp_sock), 3356 .slab_flags = SLAB_TYPESAFE_BY_RCU, 3357 .twsk_prot = &tcp_timewait_sock_ops, 3358 .rsk_prot = &tcp_request_sock_ops, 3359 .h.hashinfo = NULL, 3360 .no_autobind = true, 3361 .diag_destroy = tcp_abort, 3362 }; 3363 EXPORT_SYMBOL(tcp_prot); 3364 3365 static void __net_exit tcp_sk_exit(struct net *net) 3366 { 3367 if (net->ipv4.tcp_congestion_control) 3368 bpf_module_put(net->ipv4.tcp_congestion_control, 3369 net->ipv4.tcp_congestion_control->owner); 3370 } 3371 3372 static void __net_init tcp_set_hashinfo(struct net *net) 3373 { 3374 struct inet_hashinfo *hinfo; 3375 unsigned int ehash_entries; 3376 struct net *old_net; 3377 3378 if (net_eq(net, &init_net)) 3379 goto fallback; 3380 3381 old_net = current->nsproxy->net_ns; 3382 ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries); 3383 if (!ehash_entries) 3384 goto fallback; 3385 3386 ehash_entries = roundup_pow_of_two(ehash_entries); 3387 hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries); 3388 if (!hinfo) { 3389 pr_warn("Failed to allocate TCP ehash (entries: %u) " 3390 "for a netns, fallback to the global one\n", 3391 ehash_entries); 3392 fallback: 3393 hinfo = &tcp_hashinfo; 3394 ehash_entries = tcp_hashinfo.ehash_mask + 1; 3395 } 3396 3397 net->ipv4.tcp_death_row.hashinfo = hinfo; 3398 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2; 3399 net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128); 3400 } 3401 3402 static int __net_init tcp_sk_init(struct net *net) 3403 { 3404 net->ipv4.sysctl_tcp_ecn = 2; 3405 net->ipv4.sysctl_tcp_ecn_fallback = 1; 3406 3407 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; 3408 
net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS; 3409 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; 3410 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL; 3411 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS; 3412 3413 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME; 3414 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES; 3415 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL; 3416 3417 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES; 3418 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES; 3419 net->ipv4.sysctl_tcp_syncookies = 1; 3420 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH; 3421 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1; 3422 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2; 3423 net->ipv4.sysctl_tcp_orphan_retries = 0; 3424 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; 3425 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; 3426 net->ipv4.sysctl_tcp_tw_reuse = 2; 3427 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1; 3428 3429 refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1); 3430 tcp_set_hashinfo(net); 3431 3432 net->ipv4.sysctl_tcp_sack = 1; 3433 net->ipv4.sysctl_tcp_window_scaling = 1; 3434 net->ipv4.sysctl_tcp_timestamps = 1; 3435 net->ipv4.sysctl_tcp_early_retrans = 3; 3436 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION; 3437 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */ 3438 net->ipv4.sysctl_tcp_retrans_collapse = 1; 3439 net->ipv4.sysctl_tcp_max_reordering = 300; 3440 net->ipv4.sysctl_tcp_dsack = 1; 3441 net->ipv4.sysctl_tcp_app_win = 31; 3442 net->ipv4.sysctl_tcp_adv_win_scale = 1; 3443 net->ipv4.sysctl_tcp_frto = 2; 3444 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1; 3445 /* This limits the percentage of the congestion window which we 3446 * will allow a single TSO frame to consume. Building TSO frames 3447 * which are too large can cause TCP streams to be bursty. 3448 */ 3449 net->ipv4.sysctl_tcp_tso_win_divisor = 3; 3450 /* Default TSQ limit of 16 TSO segments */ 3451 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536; 3452 3453 /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. 
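 * (INT_MAX means the per-netns budget is effectively never exhausted;
 * lowering the sysctl bounds how many challenge ACKs the stack will
 * send per second.)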
*/ 3454 net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX; 3455 3456 net->ipv4.sysctl_tcp_min_tso_segs = 2; 3457 net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */ 3458 net->ipv4.sysctl_tcp_min_rtt_wlen = 300; 3459 net->ipv4.sysctl_tcp_autocorking = 1; 3460 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; 3461 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; 3462 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120; 3463 if (net != &init_net) { 3464 memcpy(net->ipv4.sysctl_tcp_rmem, 3465 init_net.ipv4.sysctl_tcp_rmem, 3466 sizeof(init_net.ipv4.sysctl_tcp_rmem)); 3467 memcpy(net->ipv4.sysctl_tcp_wmem, 3468 init_net.ipv4.sysctl_tcp_wmem, 3469 sizeof(init_net.ipv4.sysctl_tcp_wmem)); 3470 } 3471 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC; 3472 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC; 3473 net->ipv4.sysctl_tcp_comp_sack_nr = 44; 3474 net->ipv4.sysctl_tcp_backlog_ack_defer = 1; 3475 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; 3476 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0; 3477 atomic_set(&net->ipv4.tfo_active_disable_times, 0); 3478 3479 /* Set default values for PLB */ 3480 net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */ 3481 net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3; 3482 net->ipv4.sysctl_tcp_plb_rehash_rounds = 12; 3483 net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60; 3484 /* Default congestion threshold for PLB to mark a round is 50% */ 3485 net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2; 3486 3487 /* Reno is always built in */ 3488 if (!net_eq(net, &init_net) && 3489 bpf_try_module_get(init_net.ipv4.tcp_congestion_control, 3490 init_net.ipv4.tcp_congestion_control->owner)) 3491 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control; 3492 else 3493 net->ipv4.tcp_congestion_control = &tcp_reno; 3494 3495 net->ipv4.sysctl_tcp_syn_linear_timeouts = 4; 3496 net->ipv4.sysctl_tcp_shrink_window = 0; 3497 3498 net->ipv4.sysctl_tcp_pingpong_thresh = 1; 3499 3500 return 0; 3501 } 3502 3503 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) 3504 { 3505 struct net *net; 3506 3507 tcp_twsk_purge(net_exit_list, AF_INET); 3508 3509 list_for_each_entry(net, net_exit_list, exit_list) { 3510 inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo); 3511 WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount)); 3512 tcp_fastopen_ctx_destroy(net); 3513 } 3514 } 3515 3516 static struct pernet_operations __net_initdata tcp_sk_ops = { 3517 .init = tcp_sk_init, 3518 .exit = tcp_sk_exit, 3519 .exit_batch = tcp_sk_exit_batch, 3520 }; 3521 3522 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3523 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta, 3524 struct sock_common *sk_common, uid_t uid) 3525 3526 #define INIT_BATCH_SZ 16 3527 3528 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux) 3529 { 3530 struct bpf_tcp_iter_state *iter = priv_data; 3531 int err; 3532 3533 err = bpf_iter_init_seq_net(priv_data, aux); 3534 if (err) 3535 return err; 3536 3537 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ); 3538 if (err) { 3539 bpf_iter_fini_seq_net(priv_data); 3540 return err; 3541 } 3542 3543 return 0; 3544 } 3545 3546 static void bpf_iter_fini_tcp(void *priv_data) 3547 { 3548 struct bpf_tcp_iter_state *iter = priv_data; 3549 3550 bpf_iter_fini_seq_net(priv_data); 3551 kvfree(iter->batch); 3552 } 3553 3554 static const struct bpf_iter_seq_info tcp_seq_info = { 3555 .seq_ops = &bpf_iter_tcp_seq_ops, 3556 .init_seq_private = 
bpf_iter_init_tcp, 3557 .fini_seq_private = bpf_iter_fini_tcp, 3558 .seq_priv_size = sizeof(struct bpf_tcp_iter_state), 3559 }; 3560 3561 static const struct bpf_func_proto * 3562 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id, 3563 const struct bpf_prog *prog) 3564 { 3565 switch (func_id) { 3566 case BPF_FUNC_setsockopt: 3567 return &bpf_sk_setsockopt_proto; 3568 case BPF_FUNC_getsockopt: 3569 return &bpf_sk_getsockopt_proto; 3570 default: 3571 return NULL; 3572 } 3573 } 3574 3575 static struct bpf_iter_reg tcp_reg_info = { 3576 .target = "tcp", 3577 .ctx_arg_info_size = 1, 3578 .ctx_arg_info = { 3579 { offsetof(struct bpf_iter__tcp, sk_common), 3580 PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED }, 3581 }, 3582 .get_func_proto = bpf_iter_tcp_get_func_proto, 3583 .seq_info = &tcp_seq_info, 3584 }; 3585 3586 static void __init bpf_iter_register(void) 3587 { 3588 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON]; 3589 if (bpf_iter_reg_target(&tcp_reg_info)) 3590 pr_warn("Warning: could not register bpf iterator tcp\n"); 3591 } 3592 3593 #endif 3594 3595 void __init tcp_v4_init(void) 3596 { 3597 int cpu, res; 3598 3599 for_each_possible_cpu(cpu) { 3600 struct sock *sk; 3601 3602 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW, 3603 IPPROTO_TCP, &init_net); 3604 if (res) 3605 panic("Failed to create the TCP control socket.\n"); 3606 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); 3607 3608 /* Please enforce IP_DF and IPID==0 for RST and 3609 * ACK sent in SYN-RECV and TIME-WAIT state. 3610 */ 3611 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO; 3612 3613 per_cpu(ipv4_tcp_sk, cpu) = sk; 3614 } 3615 if (register_pernet_subsys(&tcp_sk_ops)) 3616 panic("Failed to create the TCP control socket.\n"); 3617 3618 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3619 bpf_iter_register(); 3620 #endif 3621 } 3622
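/* The "tcp" bpf_iter target registered above is driven entirely from
 * user space.  As an illustrative sketch (object and pin names are
 * examples only, mirroring tools/testing/selftests/bpf):
 *
 *	bpftool iter pin ./bpf_iter_tcp4.o /sys/fs/bpf/tcp_iter
 *	cat /sys/fs/bpf/tcp_iter
 *
 * Each read walks the listening and established hashes in batches via
 * bpf_iter_tcp_batch() and invokes the pinned program once per socket.
 */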