1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * INET An implementation of the TCP/IP protocol suite for the LINUX 4 * operating system. INET is implemented using the BSD Socket 5 * interface as the means of communication with the user level. 6 * 7 * Implementation of the Transmission Control Protocol(TCP). 8 * 9 * IPv4 specific functions 10 * 11 * code split from: 12 * linux/ipv4/tcp.c 13 * linux/ipv4/tcp_input.c 14 * linux/ipv4/tcp_output.c 15 * 16 * See tcp.c for author information 17 */ 18 19 /* 20 * Changes: 21 * David S. Miller : New socket lookup architecture. 22 * This code is dedicated to John Dyson. 23 * David S. Miller : Change semantics of established hash, 24 * half is devoted to TIME_WAIT sockets 25 * and the rest go in the other half. 26 * Andi Kleen : Add support for syncookies and fixed 27 * some bugs: ip options weren't passed to 28 * the TCP layer, missed a check for an 29 * ACK bit. 30 * Andi Kleen : Implemented fast path mtu discovery. 31 * Fixed many serious bugs in the 32 * request_sock handling and moved 33 * most of it into the af independent code. 34 * Added tail drop and some other bugfixes. 35 * Added new listen semantics. 36 * Mike McLagan : Routing by source 37 * Juan Jose Ciarlante: ip_dynaddr bits 38 * Andi Kleen: various fixes. 39 * Vitaly E. Lavrov : Transparent proxy revived after year 40 * coma. 41 * Andi Kleen : Fix new listen. 42 * Andi Kleen : Fix accept error reporting. 43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which 44 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind 45 * a single port at the same time. 46 */ 47 48 #define pr_fmt(fmt) "TCP: " fmt 49 50 #include <linux/bottom_half.h> 51 #include <linux/types.h> 52 #include <linux/fcntl.h> 53 #include <linux/module.h> 54 #include <linux/random.h> 55 #include <linux/cache.h> 56 #include <linux/jhash.h> 57 #include <linux/init.h> 58 #include <linux/times.h> 59 #include <linux/slab.h> 60 #include <linux/sched.h> 61 62 #include <net/net_namespace.h> 63 #include <net/icmp.h> 64 #include <net/inet_hashtables.h> 65 #include <net/tcp.h> 66 #include <net/transp_v6.h> 67 #include <net/ipv6.h> 68 #include <net/inet_common.h> 69 #include <net/timewait_sock.h> 70 #include <net/xfrm.h> 71 #include <net/secure_seq.h> 72 #include <net/busy_poll.h> 73 #include <net/rstreason.h> 74 75 #include <linux/inet.h> 76 #include <linux/ipv6.h> 77 #include <linux/stddef.h> 78 #include <linux/proc_fs.h> 79 #include <linux/seq_file.h> 80 #include <linux/inetdevice.h> 81 #include <linux/btf_ids.h> 82 83 #include <crypto/hash.h> 84 #include <linux/scatterlist.h> 85 86 #include <trace/events/tcp.h> 87 88 #ifdef CONFIG_TCP_MD5SIG 89 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, 90 __be32 daddr, __be32 saddr, const struct tcphdr *th); 91 #endif 92 93 struct inet_hashinfo tcp_hashinfo; 94 EXPORT_SYMBOL(tcp_hashinfo); 95 96 static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk); 97 98 static u32 tcp_v4_init_seq(const struct sk_buff *skb) 99 { 100 return secure_tcp_seq(ip_hdr(skb)->daddr, 101 ip_hdr(skb)->saddr, 102 tcp_hdr(skb)->dest, 103 tcp_hdr(skb)->source); 104 } 105 106 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb) 107 { 108 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr); 109 } 110 111 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) 112 { 113 int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse); 114 const struct inet_timewait_sock *tw = inet_twsk(sktw); 115 const struct 
tcp_timewait_sock *tcptw = tcp_twsk(sktw); 116 struct tcp_sock *tp = tcp_sk(sk); 117 118 if (reuse == 2) { 119 /* Still does not detect *everything* that goes through 120 * lo, since we require a loopback src or dst address 121 * or direct binding to 'lo' interface. 122 */ 123 bool loopback = false; 124 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX) 125 loopback = true; 126 #if IS_ENABLED(CONFIG_IPV6) 127 if (tw->tw_family == AF_INET6) { 128 if (ipv6_addr_loopback(&tw->tw_v6_daddr) || 129 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) || 130 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) || 131 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr)) 132 loopback = true; 133 } else 134 #endif 135 { 136 if (ipv4_is_loopback(tw->tw_daddr) || 137 ipv4_is_loopback(tw->tw_rcv_saddr)) 138 loopback = true; 139 } 140 if (!loopback) 141 reuse = 0; 142 } 143 144 /* With PAWS, it is safe from the viewpoint 145 of data integrity. Even without PAWS it is safe provided sequence 146 spaces do not overlap i.e. at data rates <= 80Mbit/sec. 147 148 Actually, the idea is close to VJ's one, only timestamp cache is 149 held not per host, but per port pair and TW bucket is used as state 150 holder. 151 152 If TW bucket has been already destroyed we fall back to VJ's scheme 153 and use initial timestamp retrieved from peer table. 154 */ 155 if (tcptw->tw_ts_recent_stamp && 156 (!twp || (reuse && time_after32(ktime_get_seconds(), 157 tcptw->tw_ts_recent_stamp)))) { 158 /* inet_twsk_hashdance() sets sk_refcnt after putting twsk 159 * and releasing the bucket lock. 160 */ 161 if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt))) 162 return 0; 163 164 /* In case of repair and re-using TIME-WAIT sockets we still 165 * want to be sure that it is safe as above but honor the 166 * sequence numbers and time stamps set as part of the repair 167 * process. 168 * 169 * Without this check re-using a TIME-WAIT socket with TCP 170 * repair would accumulate a -1 on the repair assigned 171 * sequence number. The first time it is reused the sequence 172 * is -1, the second time -2, etc. This fixes that issue 173 * without appearing to create any others. 174 */ 175 if (likely(!tp->repair)) { 176 u32 seq = tcptw->tw_snd_nxt + 65535 + 2; 177 178 if (!seq) 179 seq = 1; 180 WRITE_ONCE(tp->write_seq, seq); 181 tp->rx_opt.ts_recent = tcptw->tw_ts_recent; 182 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; 183 } 184 185 return 1; 186 } 187 188 return 0; 189 } 190 EXPORT_SYMBOL_GPL(tcp_twsk_unique); 191 192 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr, 193 int addr_len) 194 { 195 /* This check is replicated from tcp_v4_connect() and intended to 196 * prevent BPF program called below from accessing bytes that are out 197 * of the bound specified by user in addr_len. 198 */ 199 if (addr_len < sizeof(struct sockaddr_in)) 200 return -EINVAL; 201 202 sock_owned_by_me(sk); 203 204 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len); 205 } 206 207 /* This will initiate an outgoing connection. 
*/ 208 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) 209 { 210 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; 211 struct inet_timewait_death_row *tcp_death_row; 212 struct inet_sock *inet = inet_sk(sk); 213 struct tcp_sock *tp = tcp_sk(sk); 214 struct ip_options_rcu *inet_opt; 215 struct net *net = sock_net(sk); 216 __be16 orig_sport, orig_dport; 217 __be32 daddr, nexthop; 218 struct flowi4 *fl4; 219 struct rtable *rt; 220 int err; 221 222 if (addr_len < sizeof(struct sockaddr_in)) 223 return -EINVAL; 224 225 if (usin->sin_family != AF_INET) 226 return -EAFNOSUPPORT; 227 228 nexthop = daddr = usin->sin_addr.s_addr; 229 inet_opt = rcu_dereference_protected(inet->inet_opt, 230 lockdep_sock_is_held(sk)); 231 if (inet_opt && inet_opt->opt.srr) { 232 if (!daddr) 233 return -EINVAL; 234 nexthop = inet_opt->opt.faddr; 235 } 236 237 orig_sport = inet->inet_sport; 238 orig_dport = usin->sin_port; 239 fl4 = &inet->cork.fl.u.ip4; 240 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr, 241 sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport, 242 orig_dport, sk); 243 if (IS_ERR(rt)) { 244 err = PTR_ERR(rt); 245 if (err == -ENETUNREACH) 246 IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); 247 return err; 248 } 249 250 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { 251 ip_rt_put(rt); 252 return -ENETUNREACH; 253 } 254 255 if (!inet_opt || !inet_opt->opt.srr) 256 daddr = fl4->daddr; 257 258 tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; 259 260 if (!inet->inet_saddr) { 261 err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET); 262 if (err) { 263 ip_rt_put(rt); 264 return err; 265 } 266 } else { 267 sk_rcv_saddr_set(sk, inet->inet_saddr); 268 } 269 270 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) { 271 /* Reset inherited state */ 272 tp->rx_opt.ts_recent = 0; 273 tp->rx_opt.ts_recent_stamp = 0; 274 if (likely(!tp->repair)) 275 WRITE_ONCE(tp->write_seq, 0); 276 } 277 278 inet->inet_dport = usin->sin_port; 279 sk_daddr_set(sk, daddr); 280 281 inet_csk(sk)->icsk_ext_hdr_len = 0; 282 if (inet_opt) 283 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 284 285 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT; 286 287 /* Socket identity is still unknown (sport may be zero). 288 * However we set state to SYN-SENT and not releasing socket 289 * lock select source port, enter ourselves into the hash tables and 290 * complete initialization after this. 291 */ 292 tcp_set_state(sk, TCP_SYN_SENT); 293 err = inet_hash_connect(tcp_death_row, sk); 294 if (err) 295 goto failure; 296 297 sk_set_txhash(sk); 298 299 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport, 300 inet->inet_sport, inet->inet_dport, sk); 301 if (IS_ERR(rt)) { 302 err = PTR_ERR(rt); 303 rt = NULL; 304 goto failure; 305 } 306 tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst); 307 /* OK, now commit destination to socket. 
*/ 308 sk->sk_gso_type = SKB_GSO_TCPV4; 309 sk_setup_caps(sk, &rt->dst); 310 rt = NULL; 311 312 if (likely(!tp->repair)) { 313 if (!tp->write_seq) 314 WRITE_ONCE(tp->write_seq, 315 secure_tcp_seq(inet->inet_saddr, 316 inet->inet_daddr, 317 inet->inet_sport, 318 usin->sin_port)); 319 WRITE_ONCE(tp->tsoffset, 320 secure_tcp_ts_off(net, inet->inet_saddr, 321 inet->inet_daddr)); 322 } 323 324 atomic_set(&inet->inet_id, get_random_u16()); 325 326 if (tcp_fastopen_defer_connect(sk, &err)) 327 return err; 328 if (err) 329 goto failure; 330 331 err = tcp_connect(sk); 332 333 if (err) 334 goto failure; 335 336 return 0; 337 338 failure: 339 /* 340 * This unhashes the socket and releases the local port, 341 * if necessary. 342 */ 343 tcp_set_state(sk, TCP_CLOSE); 344 inet_bhash2_reset_saddr(sk); 345 ip_rt_put(rt); 346 sk->sk_route_caps = 0; 347 inet->inet_dport = 0; 348 return err; 349 } 350 EXPORT_SYMBOL(tcp_v4_connect); 351 352 /* 353 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191. 354 * It can be called through tcp_release_cb() if socket was owned by user 355 * at the time tcp_v4_err() was called to handle ICMP message. 356 */ 357 void tcp_v4_mtu_reduced(struct sock *sk) 358 { 359 struct inet_sock *inet = inet_sk(sk); 360 struct dst_entry *dst; 361 u32 mtu; 362 363 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) 364 return; 365 mtu = READ_ONCE(tcp_sk(sk)->mtu_info); 366 dst = inet_csk_update_pmtu(sk, mtu); 367 if (!dst) 368 return; 369 370 /* Something is about to be wrong... Remember soft error 371 * for the case, if this connection will not able to recover. 372 */ 373 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst)) 374 WRITE_ONCE(sk->sk_err_soft, EMSGSIZE); 375 376 mtu = dst_mtu(dst); 377 378 if (inet->pmtudisc != IP_PMTUDISC_DONT && 379 ip_sk_accept_pmtu(sk) && 380 inet_csk(sk)->icsk_pmtu_cookie > mtu) { 381 tcp_sync_mss(sk, mtu); 382 383 /* Resend the TCP packet because it's 384 * clear that the old packet has been 385 * dropped. This is the new "fast" path mtu 386 * discovery. 387 */ 388 tcp_simple_retransmit(sk); 389 } /* else let the usual retransmit timer handle it */ 390 } 391 EXPORT_SYMBOL(tcp_v4_mtu_reduced); 392 393 static void do_redirect(struct sk_buff *skb, struct sock *sk) 394 { 395 struct dst_entry *dst = __sk_dst_check(sk, 0); 396 397 if (dst) 398 dst->ops->redirect(dst, sk, skb); 399 } 400 401 402 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */ 403 void tcp_req_err(struct sock *sk, u32 seq, bool abort) 404 { 405 struct request_sock *req = inet_reqsk(sk); 406 struct net *net = sock_net(sk); 407 408 /* ICMPs are not backlogged, hence we cannot get 409 * an established socket here. 410 */ 411 if (seq != tcp_rsk(req)->snt_isn) { 412 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); 413 } else if (abort) { 414 /* 415 * Still in SYN_RECV, just remove it silently. 416 * There is no good way to pass the error to the newly 417 * created socket, and POSIX does not want network 418 * errors returned from accept(). 
419 */ 420 inet_csk_reqsk_queue_drop(req->rsk_listener, req); 421 tcp_listendrop(req->rsk_listener); 422 } 423 reqsk_put(req); 424 } 425 EXPORT_SYMBOL(tcp_req_err); 426 427 /* TCP-LD (RFC 6069) logic */ 428 void tcp_ld_RTO_revert(struct sock *sk, u32 seq) 429 { 430 struct inet_connection_sock *icsk = inet_csk(sk); 431 struct tcp_sock *tp = tcp_sk(sk); 432 struct sk_buff *skb; 433 s32 remaining; 434 u32 delta_us; 435 436 if (sock_owned_by_user(sk)) 437 return; 438 439 if (seq != tp->snd_una || !icsk->icsk_retransmits || 440 !icsk->icsk_backoff) 441 return; 442 443 skb = tcp_rtx_queue_head(sk); 444 if (WARN_ON_ONCE(!skb)) 445 return; 446 447 icsk->icsk_backoff--; 448 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT; 449 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); 450 451 tcp_mstamp_refresh(tp); 452 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb)); 453 remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us); 454 455 if (remaining > 0) { 456 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, 457 remaining, TCP_RTO_MAX); 458 } else { 459 /* RTO revert clocked out retransmission. 460 * Will retransmit now. 461 */ 462 tcp_retransmit_timer(sk); 463 } 464 } 465 EXPORT_SYMBOL(tcp_ld_RTO_revert); 466 467 /* 468 * This routine is called by the ICMP module when it gets some 469 * sort of error condition. If err < 0 then the socket should 470 * be closed and the error returned to the user. If err > 0 471 * it's just the icmp type << 8 | icmp code. After adjustment 472 * header points to the first 8 bytes of the tcp header. We need 473 * to find the appropriate port. 474 * 475 * The locking strategy used here is very "optimistic". When 476 * someone else accesses the socket the ICMP is just dropped 477 * and for some paths there is no check at all. 478 * A more general error queue to queue errors for later handling 479 * is probably better. 480 * 481 */ 482 483 int tcp_v4_err(struct sk_buff *skb, u32 info) 484 { 485 const struct iphdr *iph = (const struct iphdr *)skb->data; 486 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2)); 487 struct tcp_sock *tp; 488 const int type = icmp_hdr(skb)->type; 489 const int code = icmp_hdr(skb)->code; 490 struct sock *sk; 491 struct request_sock *fastopen; 492 u32 seq, snd_una; 493 int err; 494 struct net *net = dev_net(skb->dev); 495 496 sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, 497 iph->daddr, th->dest, iph->saddr, 498 ntohs(th->source), inet_iif(skb), 0); 499 if (!sk) { 500 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); 501 return -ENOENT; 502 } 503 if (sk->sk_state == TCP_TIME_WAIT) { 504 /* To increase the counter of ignored icmps for TCP-AO */ 505 tcp_ao_ignore_icmp(sk, AF_INET, type, code); 506 inet_twsk_put(inet_twsk(sk)); 507 return 0; 508 } 509 seq = ntohl(th->seq); 510 if (sk->sk_state == TCP_NEW_SYN_RECV) { 511 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB || 512 type == ICMP_TIME_EXCEEDED || 513 (type == ICMP_DEST_UNREACH && 514 (code == ICMP_NET_UNREACH || 515 code == ICMP_HOST_UNREACH))); 516 return 0; 517 } 518 519 if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) { 520 sock_put(sk); 521 return 0; 522 } 523 524 bh_lock_sock(sk); 525 /* If too many ICMPs get dropped on busy 526 * servers this needs to be solved differently. 527 * We do take care of PMTU discovery (RFC1191) special case : 528 * we can receive locally generated ICMP messages while socket is held. 
529 */ 530 if (sock_owned_by_user(sk)) { 531 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)) 532 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS); 533 } 534 if (sk->sk_state == TCP_CLOSE) 535 goto out; 536 537 if (static_branch_unlikely(&ip4_min_ttl)) { 538 /* min_ttl can be changed concurrently from do_ip_setsockopt() */ 539 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) { 540 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 541 goto out; 542 } 543 } 544 545 tp = tcp_sk(sk); 546 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ 547 fastopen = rcu_dereference(tp->fastopen_rsk); 548 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una; 549 if (sk->sk_state != TCP_LISTEN && 550 !between(seq, snd_una, tp->snd_nxt)) { 551 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); 552 goto out; 553 } 554 555 switch (type) { 556 case ICMP_REDIRECT: 557 if (!sock_owned_by_user(sk)) 558 do_redirect(skb, sk); 559 goto out; 560 case ICMP_SOURCE_QUENCH: 561 /* Just silently ignore these. */ 562 goto out; 563 case ICMP_PARAMETERPROB: 564 err = EPROTO; 565 break; 566 case ICMP_DEST_UNREACH: 567 if (code > NR_ICMP_UNREACH) 568 goto out; 569 570 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ 571 /* We are not interested in TCP_LISTEN and open_requests 572 * (SYN-ACKs send out by Linux are always <576bytes so 573 * they should go through unfragmented). 574 */ 575 if (sk->sk_state == TCP_LISTEN) 576 goto out; 577 578 WRITE_ONCE(tp->mtu_info, info); 579 if (!sock_owned_by_user(sk)) { 580 tcp_v4_mtu_reduced(sk); 581 } else { 582 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags)) 583 sock_hold(sk); 584 } 585 goto out; 586 } 587 588 err = icmp_err_convert[code].errno; 589 /* check if this ICMP message allows revert of backoff. 590 * (see RFC 6069) 591 */ 592 if (!fastopen && 593 (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH)) 594 tcp_ld_RTO_revert(sk, seq); 595 break; 596 case ICMP_TIME_EXCEEDED: 597 err = EHOSTUNREACH; 598 break; 599 default: 600 goto out; 601 } 602 603 switch (sk->sk_state) { 604 case TCP_SYN_SENT: 605 case TCP_SYN_RECV: 606 /* Only in fast or simultaneous open. If a fast open socket is 607 * already accepted it is treated as a connected one below. 608 */ 609 if (fastopen && !fastopen->sk) 610 break; 611 612 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th); 613 614 if (!sock_owned_by_user(sk)) { 615 WRITE_ONCE(sk->sk_err, err); 616 617 sk_error_report(sk); 618 619 tcp_done(sk); 620 } else { 621 WRITE_ONCE(sk->sk_err_soft, err); 622 } 623 goto out; 624 } 625 626 /* If we've already connected we will keep trying 627 * until we time out, or the user gives up. 628 * 629 * rfc1122 4.2.3.9 allows to consider as hard errors 630 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too, 631 * but it is obsoleted by pmtu discovery). 632 * 633 * Note, that in modern internet, where routing is unreliable 634 * and in each dark corner broken firewalls sit, sending random 635 * errors ordered by their masters even this two messages finally lose 636 * their original sense (even Linux sends invalid PORT_UNREACHs) 637 * 638 * Now we are in compliance with RFCs. 
639 * --ANK (980905) 640 */ 641 642 if (!sock_owned_by_user(sk) && 643 inet_test_bit(RECVERR, sk)) { 644 WRITE_ONCE(sk->sk_err, err); 645 sk_error_report(sk); 646 } else { /* Only an error on timeout */ 647 WRITE_ONCE(sk->sk_err_soft, err); 648 } 649 650 out: 651 bh_unlock_sock(sk); 652 sock_put(sk); 653 return 0; 654 } 655 656 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr) 657 { 658 struct tcphdr *th = tcp_hdr(skb); 659 660 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0); 661 skb->csum_start = skb_transport_header(skb) - skb->head; 662 skb->csum_offset = offsetof(struct tcphdr, check); 663 } 664 665 /* This routine computes an IPv4 TCP checksum. */ 666 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb) 667 { 668 const struct inet_sock *inet = inet_sk(sk); 669 670 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr); 671 } 672 EXPORT_SYMBOL(tcp_v4_send_check); 673 674 #define REPLY_OPTIONS_LEN (MAX_TCP_OPTION_SPACE / sizeof(__be32)) 675 676 static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb, 677 const struct tcp_ao_hdr *aoh, 678 struct ip_reply_arg *arg, struct tcphdr *reply, 679 __be32 reply_options[REPLY_OPTIONS_LEN]) 680 { 681 #ifdef CONFIG_TCP_AO 682 int sdif = tcp_v4_sdif(skb); 683 int dif = inet_iif(skb); 684 int l3index = sdif ? dif : 0; 685 bool allocated_traffic_key; 686 struct tcp_ao_key *key; 687 char *traffic_key; 688 bool drop = true; 689 u32 ao_sne = 0; 690 u8 keyid; 691 692 rcu_read_lock(); 693 if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq), 694 &key, &traffic_key, &allocated_traffic_key, 695 &keyid, &ao_sne)) 696 goto out; 697 698 reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) | 699 (aoh->rnext_keyid << 8) | keyid); 700 arg->iov[0].iov_len += tcp_ao_len_aligned(key); 701 reply->doff = arg->iov[0].iov_len / 4; 702 703 if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1], 704 key, traffic_key, 705 (union tcp_ao_addr *)&ip_hdr(skb)->saddr, 706 (union tcp_ao_addr *)&ip_hdr(skb)->daddr, 707 reply, ao_sne)) 708 goto out; 709 drop = false; 710 out: 711 rcu_read_unlock(); 712 if (allocated_traffic_key) 713 kfree(traffic_key); 714 return drop; 715 #else 716 return true; 717 #endif 718 } 719 720 /* 721 * This routine will send an RST to the other tcp. 722 * 723 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.) 724 * for reset. 725 * Answer: if a packet caused RST, it is not for a socket 726 * existing in our system, if it is matched to a socket, 727 * it is just duplicate segment or bug in other side's TCP. 728 * So that we build reply only basing on parameters 729 * arrived with segment. 730 * Exception: precedence violation. We do not implement it in any case. 731 */ 732 733 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb, 734 enum sk_rst_reason reason) 735 { 736 const struct tcphdr *th = tcp_hdr(skb); 737 struct { 738 struct tcphdr th; 739 __be32 opt[REPLY_OPTIONS_LEN]; 740 } rep; 741 const __u8 *md5_hash_location = NULL; 742 const struct tcp_ao_hdr *aoh; 743 struct ip_reply_arg arg; 744 #ifdef CONFIG_TCP_MD5SIG 745 struct tcp_md5sig_key *key = NULL; 746 unsigned char newhash[16]; 747 struct sock *sk1 = NULL; 748 int genhash; 749 #endif 750 u64 transmit_time = 0; 751 struct sock *ctl_sk; 752 struct net *net; 753 u32 txhash = 0; 754 755 /* Never send a reset in response to a reset. */ 756 if (th->rst) 757 return; 758 759 /* If sk not NULL, it means we did a successful lookup and incoming 760 * route had to be correct. 
prequeue might have dropped our dst. 761 */ 762 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL) 763 return; 764 765 /* Swap the send and the receive. */ 766 memset(&rep, 0, sizeof(rep)); 767 rep.th.dest = th->source; 768 rep.th.source = th->dest; 769 rep.th.doff = sizeof(struct tcphdr) / 4; 770 rep.th.rst = 1; 771 772 if (th->ack) { 773 rep.th.seq = th->ack_seq; 774 } else { 775 rep.th.ack = 1; 776 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin + 777 skb->len - (th->doff << 2)); 778 } 779 780 memset(&arg, 0, sizeof(arg)); 781 arg.iov[0].iov_base = (unsigned char *)&rep; 782 arg.iov[0].iov_len = sizeof(rep.th); 783 784 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev); 785 786 /* Invalid TCP option size or twice included auth */ 787 if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh)) 788 return; 789 790 if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt)) 791 return; 792 793 #ifdef CONFIG_TCP_MD5SIG 794 rcu_read_lock(); 795 if (sk && sk_fullsock(sk)) { 796 const union tcp_md5_addr *addr; 797 int l3index; 798 799 /* sdif set, means packet ingressed via a device 800 * in an L3 domain and inet_iif is set to it. 801 */ 802 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; 803 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 804 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 805 } else if (md5_hash_location) { 806 const union tcp_md5_addr *addr; 807 int sdif = tcp_v4_sdif(skb); 808 int dif = inet_iif(skb); 809 int l3index; 810 811 /* 812 * active side is lost. Try to find listening socket through 813 * source port, and then find md5 key through listening socket. 814 * we are not loose security here: 815 * Incoming packet is checked with md5 hash with finding key, 816 * no RST generated if md5 hash doesn't match. 817 */ 818 sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo, 819 NULL, 0, ip_hdr(skb)->saddr, 820 th->source, ip_hdr(skb)->daddr, 821 ntohs(th->source), dif, sdif); 822 /* don't send rst if it can't find key */ 823 if (!sk1) 824 goto out; 825 826 /* sdif set, means packet ingressed via a device 827 * in an L3 domain and dif is set to it. 828 */ 829 l3index = sdif ? dif : 0; 830 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 831 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET); 832 if (!key) 833 goto out; 834 835 836 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb); 837 if (genhash || memcmp(md5_hash_location, newhash, 16) != 0) 838 goto out; 839 840 } 841 842 if (key) { 843 rep.opt[0] = htonl((TCPOPT_NOP << 24) | 844 (TCPOPT_NOP << 16) | 845 (TCPOPT_MD5SIG << 8) | 846 TCPOLEN_MD5SIG); 847 /* Update length and the length the header thinks exists */ 848 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; 849 rep.th.doff = arg.iov[0].iov_len / 4; 850 851 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1], 852 key, ip_hdr(skb)->saddr, 853 ip_hdr(skb)->daddr, &rep.th); 854 } 855 #endif 856 /* Can't co-exist with TCPMD5, hence check rep.opt[0] */ 857 if (rep.opt[0] == 0) { 858 __be32 mrst = mptcp_reset_option(skb); 859 860 if (mrst) { 861 rep.opt[0] = mrst; 862 arg.iov[0].iov_len += sizeof(mrst); 863 rep.th.doff = arg.iov[0].iov_len / 4; 864 } 865 } 866 867 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 868 ip_hdr(skb)->saddr, /* XXX */ 869 arg.iov[0].iov_len, IPPROTO_TCP, 0); 870 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 871 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0; 872 873 /* When socket is gone, all binding information is lost. 874 * routing might fail in this case. 
No choice here, if we choose to force 875 * input interface, we will misroute in case of asymmetric route. 876 */ 877 if (sk) 878 arg.bound_dev_if = sk->sk_bound_dev_if; 879 880 trace_tcp_send_reset(sk, skb, reason); 881 882 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) != 883 offsetof(struct inet_timewait_sock, tw_bound_dev_if)); 884 885 arg.tos = ip_hdr(skb)->tos; 886 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); 887 local_bh_disable(); 888 ctl_sk = this_cpu_read(ipv4_tcp_sk); 889 sock_net_set(ctl_sk, net); 890 if (sk) { 891 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 892 inet_twsk(sk)->tw_mark : sk->sk_mark; 893 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? 894 inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority); 895 transmit_time = tcp_transmit_time(sk); 896 xfrm_sk_clone_policy(ctl_sk, sk); 897 txhash = (sk->sk_state == TCP_TIME_WAIT) ? 898 inet_twsk(sk)->tw_txhash : sk->sk_txhash; 899 } else { 900 ctl_sk->sk_mark = 0; 901 ctl_sk->sk_priority = 0; 902 } 903 ip_send_unicast_reply(ctl_sk, 904 skb, &TCP_SKB_CB(skb)->header.h4.opt, 905 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 906 &arg, arg.iov[0].iov_len, 907 transmit_time, txhash); 908 909 xfrm_sk_free_policy(ctl_sk); 910 sock_net_set(ctl_sk, &init_net); 911 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); 912 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS); 913 local_bh_enable(); 914 915 #ifdef CONFIG_TCP_MD5SIG 916 out: 917 rcu_read_unlock(); 918 #endif 919 } 920 921 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states 922 outside socket context is ugly, certainly. What can I do? 923 */ 924 925 static void tcp_v4_send_ack(const struct sock *sk, 926 struct sk_buff *skb, u32 seq, u32 ack, 927 u32 win, u32 tsval, u32 tsecr, int oif, 928 struct tcp_key *key, 929 int reply_flags, u8 tos, u32 txhash) 930 { 931 const struct tcphdr *th = tcp_hdr(skb); 932 struct { 933 struct tcphdr th; 934 __be32 opt[(MAX_TCP_OPTION_SPACE >> 2)]; 935 } rep; 936 struct net *net = sock_net(sk); 937 struct ip_reply_arg arg; 938 struct sock *ctl_sk; 939 u64 transmit_time; 940 941 memset(&rep.th, 0, sizeof(struct tcphdr)); 942 memset(&arg, 0, sizeof(arg)); 943 944 arg.iov[0].iov_base = (unsigned char *)&rep; 945 arg.iov[0].iov_len = sizeof(rep.th); 946 if (tsecr) { 947 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | 948 (TCPOPT_TIMESTAMP << 8) | 949 TCPOLEN_TIMESTAMP); 950 rep.opt[1] = htonl(tsval); 951 rep.opt[2] = htonl(tsecr); 952 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED; 953 } 954 955 /* Swap the send and the receive. */ 956 rep.th.dest = th->source; 957 rep.th.source = th->dest; 958 rep.th.doff = arg.iov[0].iov_len / 4; 959 rep.th.seq = htonl(seq); 960 rep.th.ack_seq = htonl(ack); 961 rep.th.ack = 1; 962 rep.th.window = htons(win); 963 964 #ifdef CONFIG_TCP_MD5SIG 965 if (tcp_key_is_md5(key)) { 966 int offset = (tsecr) ? 3 : 0; 967 968 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) | 969 (TCPOPT_NOP << 16) | 970 (TCPOPT_MD5SIG << 8) | 971 TCPOLEN_MD5SIG); 972 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; 973 rep.th.doff = arg.iov[0].iov_len/4; 974 975 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset], 976 key->md5_key, ip_hdr(skb)->saddr, 977 ip_hdr(skb)->daddr, &rep.th); 978 } 979 #endif 980 #ifdef CONFIG_TCP_AO 981 if (tcp_key_is_ao(key)) { 982 int offset = (tsecr) ? 
3 : 0; 983 984 rep.opt[offset++] = htonl((TCPOPT_AO << 24) | 985 (tcp_ao_len(key->ao_key) << 16) | 986 (key->ao_key->sndid << 8) | 987 key->rcv_next); 988 arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key); 989 rep.th.doff = arg.iov[0].iov_len / 4; 990 991 tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset], 992 key->ao_key, key->traffic_key, 993 (union tcp_ao_addr *)&ip_hdr(skb)->saddr, 994 (union tcp_ao_addr *)&ip_hdr(skb)->daddr, 995 &rep.th, key->sne); 996 } 997 #endif 998 arg.flags = reply_flags; 999 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 1000 ip_hdr(skb)->saddr, /* XXX */ 1001 arg.iov[0].iov_len, IPPROTO_TCP, 0); 1002 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 1003 if (oif) 1004 arg.bound_dev_if = oif; 1005 arg.tos = tos; 1006 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL); 1007 local_bh_disable(); 1008 ctl_sk = this_cpu_read(ipv4_tcp_sk); 1009 sock_net_set(ctl_sk, net); 1010 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 1011 inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark); 1012 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? 1013 inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority); 1014 transmit_time = tcp_transmit_time(sk); 1015 ip_send_unicast_reply(ctl_sk, 1016 skb, &TCP_SKB_CB(skb)->header.h4.opt, 1017 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 1018 &arg, arg.iov[0].iov_len, 1019 transmit_time, txhash); 1020 1021 sock_net_set(ctl_sk, &init_net); 1022 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); 1023 local_bh_enable(); 1024 } 1025 1026 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) 1027 { 1028 struct inet_timewait_sock *tw = inet_twsk(sk); 1029 struct tcp_timewait_sock *tcptw = tcp_twsk(sk); 1030 struct tcp_key key = {}; 1031 #ifdef CONFIG_TCP_AO 1032 struct tcp_ao_info *ao_info; 1033 1034 if (static_branch_unlikely(&tcp_ao_needed.key)) { 1035 /* FIXME: the segment to-be-acked is not verified yet */ 1036 ao_info = rcu_dereference(tcptw->ao_info); 1037 if (ao_info) { 1038 const struct tcp_ao_hdr *aoh; 1039 1040 if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) { 1041 inet_twsk_put(tw); 1042 return; 1043 } 1044 1045 if (aoh) 1046 key.ao_key = tcp_ao_established_key(ao_info, aoh->rnext_keyid, -1); 1047 } 1048 } 1049 if (key.ao_key) { 1050 struct tcp_ao_key *rnext_key; 1051 1052 key.traffic_key = snd_other_key(key.ao_key); 1053 key.sne = READ_ONCE(ao_info->snd_sne); 1054 rnext_key = READ_ONCE(ao_info->rnext_key); 1055 key.rcv_next = rnext_key->rcvid; 1056 key.type = TCP_KEY_AO; 1057 #else 1058 if (0) { 1059 #endif 1060 #ifdef CONFIG_TCP_MD5SIG 1061 } else if (static_branch_unlikely(&tcp_md5_needed.key)) { 1062 key.md5_key = tcp_twsk_md5_key(tcptw); 1063 if (key.md5_key) 1064 key.type = TCP_KEY_MD5; 1065 #endif 1066 } 1067 1068 tcp_v4_send_ack(sk, skb, 1069 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, 1070 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, 1071 tcp_tw_tsval(tcptw), 1072 tcptw->tw_ts_recent, 1073 tw->tw_bound_dev_if, &key, 1074 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0, 1075 tw->tw_tos, 1076 tw->tw_txhash); 1077 1078 inet_twsk_put(tw); 1079 } 1080 1081 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, 1082 struct request_sock *req) 1083 { 1084 struct tcp_key key = {}; 1085 1086 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV 1087 * sk->sk_state == TCP_SYN_RECV -> for Fast Open. 1088 */ 1089 u32 seq = (sk->sk_state == TCP_LISTEN) ? 
tcp_rsk(req)->snt_isn + 1 : 1090 tcp_sk(sk)->snd_nxt; 1091 1092 #ifdef CONFIG_TCP_AO 1093 if (static_branch_unlikely(&tcp_ao_needed.key) && 1094 tcp_rsk_used_ao(req)) { 1095 const union tcp_md5_addr *addr; 1096 const struct tcp_ao_hdr *aoh; 1097 int l3index; 1098 1099 /* Invalid TCP option size or twice included auth */ 1100 if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) 1101 return; 1102 if (!aoh) 1103 return; 1104 1105 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 1106 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; 1107 key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, 1108 aoh->rnext_keyid, -1); 1109 if (unlikely(!key.ao_key)) { 1110 /* Send ACK with any matching MKT for the peer */ 1111 key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1); 1112 /* Matching key disappeared (user removed the key?) 1113 * let the handshake timeout. 1114 */ 1115 if (!key.ao_key) { 1116 net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n", 1117 addr, 1118 ntohs(tcp_hdr(skb)->source), 1119 &ip_hdr(skb)->daddr, 1120 ntohs(tcp_hdr(skb)->dest)); 1121 return; 1122 } 1123 } 1124 key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC); 1125 if (!key.traffic_key) 1126 return; 1127 1128 key.type = TCP_KEY_AO; 1129 key.rcv_next = aoh->keyid; 1130 tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req); 1131 #else 1132 if (0) { 1133 #endif 1134 #ifdef CONFIG_TCP_MD5SIG 1135 } else if (static_branch_unlikely(&tcp_md5_needed.key)) { 1136 const union tcp_md5_addr *addr; 1137 int l3index; 1138 1139 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 1140 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; 1141 key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1142 if (key.md5_key) 1143 key.type = TCP_KEY_MD5; 1144 #endif 1145 } 1146 1147 tcp_v4_send_ack(sk, skb, seq, 1148 tcp_rsk(req)->rcv_nxt, 1149 tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale, 1150 tcp_rsk_tsval(tcp_rsk(req)), 1151 READ_ONCE(req->ts_recent), 1152 0, &key, 1153 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0, 1154 ip_hdr(skb)->tos, 1155 READ_ONCE(tcp_rsk(req)->txhash)); 1156 if (tcp_key_is_ao(&key)) 1157 kfree(key.traffic_key); 1158 } 1159 1160 /* 1161 * Send a SYN-ACK after having received a SYN. 1162 * This still operates on a request_sock only, not on a big 1163 * socket. 1164 */ 1165 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, 1166 struct flowi *fl, 1167 struct request_sock *req, 1168 struct tcp_fastopen_cookie *foc, 1169 enum tcp_synack_type synack_type, 1170 struct sk_buff *syn_skb) 1171 { 1172 const struct inet_request_sock *ireq = inet_rsk(req); 1173 struct flowi4 fl4; 1174 int err = -1; 1175 struct sk_buff *skb; 1176 u8 tos; 1177 1178 /* First, grab a route. 
*/ 1179 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) 1180 return -1; 1181 1182 skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb); 1183 1184 if (skb) { 1185 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); 1186 1187 tos = READ_ONCE(inet_sk(sk)->tos); 1188 1189 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)) 1190 tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) | 1191 (tos & INET_ECN_MASK); 1192 1193 if (!INET_ECN_is_capable(tos) && 1194 tcp_bpf_ca_needs_ecn((struct sock *)req)) 1195 tos |= INET_ECN_ECT_0; 1196 1197 rcu_read_lock(); 1198 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, 1199 ireq->ir_rmt_addr, 1200 rcu_dereference(ireq->ireq_opt), 1201 tos); 1202 rcu_read_unlock(); 1203 err = net_xmit_eval(err); 1204 } 1205 1206 return err; 1207 } 1208 1209 /* 1210 * IPv4 request_sock destructor. 1211 */ 1212 static void tcp_v4_reqsk_destructor(struct request_sock *req) 1213 { 1214 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1)); 1215 } 1216 1217 #ifdef CONFIG_TCP_MD5SIG 1218 /* 1219 * RFC2385 MD5 checksumming requires a mapping of 1220 * IP address->MD5 Key. 1221 * We need to maintain these in the sk structure. 1222 */ 1223 1224 DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ); 1225 EXPORT_SYMBOL(tcp_md5_needed); 1226 1227 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new) 1228 { 1229 if (!old) 1230 return true; 1231 1232 /* l3index always overrides non-l3index */ 1233 if (old->l3index && new->l3index == 0) 1234 return false; 1235 if (old->l3index == 0 && new->l3index) 1236 return true; 1237 1238 return old->prefixlen < new->prefixlen; 1239 } 1240 1241 /* Find the Key structure for an address. */ 1242 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index, 1243 const union tcp_md5_addr *addr, 1244 int family, bool any_l3index) 1245 { 1246 const struct tcp_sock *tp = tcp_sk(sk); 1247 struct tcp_md5sig_key *key; 1248 const struct tcp_md5sig_info *md5sig; 1249 __be32 mask; 1250 struct tcp_md5sig_key *best_match = NULL; 1251 bool match; 1252 1253 /* caller either holds rcu_read_lock() or socket lock */ 1254 md5sig = rcu_dereference_check(tp->md5sig_info, 1255 lockdep_sock_is_held(sk)); 1256 if (!md5sig) 1257 return NULL; 1258 1259 hlist_for_each_entry_rcu(key, &md5sig->head, node, 1260 lockdep_sock_is_held(sk)) { 1261 if (key->family != family) 1262 continue; 1263 if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX && 1264 key->l3index != l3index) 1265 continue; 1266 if (family == AF_INET) { 1267 mask = inet_make_mask(key->prefixlen); 1268 match = (key->addr.a4.s_addr & mask) == 1269 (addr->a4.s_addr & mask); 1270 #if IS_ENABLED(CONFIG_IPV6) 1271 } else if (family == AF_INET6) { 1272 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6, 1273 key->prefixlen); 1274 #endif 1275 } else { 1276 match = false; 1277 } 1278 1279 if (match && better_md5_match(best_match, key)) 1280 best_match = key; 1281 } 1282 return best_match; 1283 } 1284 EXPORT_SYMBOL(__tcp_md5_do_lookup); 1285 1286 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk, 1287 const union tcp_md5_addr *addr, 1288 int family, u8 prefixlen, 1289 int l3index, u8 flags) 1290 { 1291 const struct tcp_sock *tp = tcp_sk(sk); 1292 struct tcp_md5sig_key *key; 1293 unsigned int size = sizeof(struct in_addr); 1294 const struct tcp_md5sig_info *md5sig; 1295 1296 /* caller either holds rcu_read_lock() or socket lock */ 1297 md5sig = rcu_dereference_check(tp->md5sig_info, 1298 
lockdep_sock_is_held(sk)); 1299 if (!md5sig) 1300 return NULL; 1301 #if IS_ENABLED(CONFIG_IPV6) 1302 if (family == AF_INET6) 1303 size = sizeof(struct in6_addr); 1304 #endif 1305 hlist_for_each_entry_rcu(key, &md5sig->head, node, 1306 lockdep_sock_is_held(sk)) { 1307 if (key->family != family) 1308 continue; 1309 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX)) 1310 continue; 1311 if (key->l3index != l3index) 1312 continue; 1313 if (!memcmp(&key->addr, addr, size) && 1314 key->prefixlen == prefixlen) 1315 return key; 1316 } 1317 return NULL; 1318 } 1319 1320 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, 1321 const struct sock *addr_sk) 1322 { 1323 const union tcp_md5_addr *addr; 1324 int l3index; 1325 1326 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), 1327 addr_sk->sk_bound_dev_if); 1328 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr; 1329 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1330 } 1331 EXPORT_SYMBOL(tcp_v4_md5_lookup); 1332 1333 static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp) 1334 { 1335 struct tcp_sock *tp = tcp_sk(sk); 1336 struct tcp_md5sig_info *md5sig; 1337 1338 md5sig = kmalloc(sizeof(*md5sig), gfp); 1339 if (!md5sig) 1340 return -ENOMEM; 1341 1342 sk_gso_disable(sk); 1343 INIT_HLIST_HEAD(&md5sig->head); 1344 rcu_assign_pointer(tp->md5sig_info, md5sig); 1345 return 0; 1346 } 1347 1348 /* This can be called on a newly created socket, from other files */ 1349 static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, 1350 int family, u8 prefixlen, int l3index, u8 flags, 1351 const u8 *newkey, u8 newkeylen, gfp_t gfp) 1352 { 1353 /* Add Key to the list */ 1354 struct tcp_md5sig_key *key; 1355 struct tcp_sock *tp = tcp_sk(sk); 1356 struct tcp_md5sig_info *md5sig; 1357 1358 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags); 1359 if (key) { 1360 /* Pre-existing entry - just update that one. 1361 * Note that the key might be used concurrently. 1362 * data_race() is telling kcsan that we do not care of 1363 * key mismatches, since changing MD5 key on live flows 1364 * can lead to packet drops. 1365 */ 1366 data_race(memcpy(key->key, newkey, newkeylen)); 1367 1368 /* Pairs with READ_ONCE() in tcp_md5_hash_key(). 1369 * Also note that a reader could catch new key->keylen value 1370 * but old key->key[], this is the reason we use __GFP_ZERO 1371 * at sock_kmalloc() time below these lines. 1372 */ 1373 WRITE_ONCE(key->keylen, newkeylen); 1374 1375 return 0; 1376 } 1377 1378 md5sig = rcu_dereference_protected(tp->md5sig_info, 1379 lockdep_sock_is_held(sk)); 1380 1381 key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO); 1382 if (!key) 1383 return -ENOMEM; 1384 1385 memcpy(key->key, newkey, newkeylen); 1386 key->keylen = newkeylen; 1387 key->family = family; 1388 key->prefixlen = prefixlen; 1389 key->l3index = l3index; 1390 key->flags = flags; 1391 memcpy(&key->addr, addr, 1392 (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? 
sizeof(struct in6_addr) : 1393 sizeof(struct in_addr)); 1394 hlist_add_head_rcu(&key->node, &md5sig->head); 1395 return 0; 1396 } 1397 1398 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, 1399 int family, u8 prefixlen, int l3index, u8 flags, 1400 const u8 *newkey, u8 newkeylen) 1401 { 1402 struct tcp_sock *tp = tcp_sk(sk); 1403 1404 if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) { 1405 if (tcp_md5_alloc_sigpool()) 1406 return -ENOMEM; 1407 1408 if (tcp_md5sig_info_add(sk, GFP_KERNEL)) { 1409 tcp_md5_release_sigpool(); 1410 return -ENOMEM; 1411 } 1412 1413 if (!static_branch_inc(&tcp_md5_needed.key)) { 1414 struct tcp_md5sig_info *md5sig; 1415 1416 md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk)); 1417 rcu_assign_pointer(tp->md5sig_info, NULL); 1418 kfree_rcu(md5sig, rcu); 1419 tcp_md5_release_sigpool(); 1420 return -EUSERS; 1421 } 1422 } 1423 1424 return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags, 1425 newkey, newkeylen, GFP_KERNEL); 1426 } 1427 EXPORT_SYMBOL(tcp_md5_do_add); 1428 1429 int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr, 1430 int family, u8 prefixlen, int l3index, 1431 struct tcp_md5sig_key *key) 1432 { 1433 struct tcp_sock *tp = tcp_sk(sk); 1434 1435 if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) { 1436 tcp_md5_add_sigpool(); 1437 1438 if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC))) { 1439 tcp_md5_release_sigpool(); 1440 return -ENOMEM; 1441 } 1442 1443 if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) { 1444 struct tcp_md5sig_info *md5sig; 1445 1446 md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk)); 1447 net_warn_ratelimited("Too many TCP-MD5 keys in the system\n"); 1448 rcu_assign_pointer(tp->md5sig_info, NULL); 1449 kfree_rcu(md5sig, rcu); 1450 tcp_md5_release_sigpool(); 1451 return -EUSERS; 1452 } 1453 } 1454 1455 return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, 1456 key->flags, key->key, key->keylen, 1457 sk_gfp_mask(sk, GFP_ATOMIC)); 1458 } 1459 EXPORT_SYMBOL(tcp_md5_key_copy); 1460 1461 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family, 1462 u8 prefixlen, int l3index, u8 flags) 1463 { 1464 struct tcp_md5sig_key *key; 1465 1466 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags); 1467 if (!key) 1468 return -ENOENT; 1469 hlist_del_rcu(&key->node); 1470 atomic_sub(sizeof(*key), &sk->sk_omem_alloc); 1471 kfree_rcu(key, rcu); 1472 return 0; 1473 } 1474 EXPORT_SYMBOL(tcp_md5_do_del); 1475 1476 void tcp_clear_md5_list(struct sock *sk) 1477 { 1478 struct tcp_sock *tp = tcp_sk(sk); 1479 struct tcp_md5sig_key *key; 1480 struct hlist_node *n; 1481 struct tcp_md5sig_info *md5sig; 1482 1483 md5sig = rcu_dereference_protected(tp->md5sig_info, 1); 1484 1485 hlist_for_each_entry_safe(key, n, &md5sig->head, node) { 1486 hlist_del_rcu(&key->node); 1487 atomic_sub(sizeof(*key), &sk->sk_omem_alloc); 1488 kfree_rcu(key, rcu); 1489 } 1490 } 1491 1492 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, 1493 sockptr_t optval, int optlen) 1494 { 1495 struct tcp_md5sig cmd; 1496 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr; 1497 const union tcp_md5_addr *addr; 1498 u8 prefixlen = 32; 1499 int l3index = 0; 1500 bool l3flag; 1501 u8 flags; 1502 1503 if (optlen < sizeof(cmd)) 1504 return -EINVAL; 1505 1506 if (copy_from_sockptr(&cmd, optval, sizeof(cmd))) 1507 return -EFAULT; 1508 1509 if (sin->sin_family != 
AF_INET) 1510 return -EINVAL; 1511 1512 flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX; 1513 l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX; 1514 1515 if (optname == TCP_MD5SIG_EXT && 1516 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) { 1517 prefixlen = cmd.tcpm_prefixlen; 1518 if (prefixlen > 32) 1519 return -EINVAL; 1520 } 1521 1522 if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex && 1523 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) { 1524 struct net_device *dev; 1525 1526 rcu_read_lock(); 1527 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex); 1528 if (dev && netif_is_l3_master(dev)) 1529 l3index = dev->ifindex; 1530 1531 rcu_read_unlock(); 1532 1533 /* ok to reference set/not set outside of rcu; 1534 * right now device MUST be an L3 master 1535 */ 1536 if (!dev || !l3index) 1537 return -EINVAL; 1538 } 1539 1540 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr; 1541 1542 if (!cmd.tcpm_keylen) 1543 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags); 1544 1545 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) 1546 return -EINVAL; 1547 1548 /* Don't allow keys for peers that have a matching TCP-AO key. 1549 * See the comment in tcp_ao_add_cmd() 1550 */ 1551 if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false)) 1552 return -EKEYREJECTED; 1553 1554 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags, 1555 cmd.tcpm_key, cmd.tcpm_keylen); 1556 } 1557 1558 static int tcp_v4_md5_hash_headers(struct tcp_sigpool *hp, 1559 __be32 daddr, __be32 saddr, 1560 const struct tcphdr *th, int nbytes) 1561 { 1562 struct tcp4_pseudohdr *bp; 1563 struct scatterlist sg; 1564 struct tcphdr *_th; 1565 1566 bp = hp->scratch; 1567 bp->saddr = saddr; 1568 bp->daddr = daddr; 1569 bp->pad = 0; 1570 bp->protocol = IPPROTO_TCP; 1571 bp->len = cpu_to_be16(nbytes); 1572 1573 _th = (struct tcphdr *)(bp + 1); 1574 memcpy(_th, th, sizeof(*th)); 1575 _th->check = 0; 1576 1577 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th)); 1578 ahash_request_set_crypt(hp->req, &sg, NULL, 1579 sizeof(*bp) + sizeof(*th)); 1580 return crypto_ahash_update(hp->req); 1581 } 1582 1583 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, 1584 __be32 daddr, __be32 saddr, const struct tcphdr *th) 1585 { 1586 struct tcp_sigpool hp; 1587 1588 if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp)) 1589 goto clear_hash_nostart; 1590 1591 if (crypto_ahash_init(hp.req)) 1592 goto clear_hash; 1593 if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, th->doff << 2)) 1594 goto clear_hash; 1595 if (tcp_md5_hash_key(&hp, key)) 1596 goto clear_hash; 1597 ahash_request_set_crypt(hp.req, NULL, md5_hash, 0); 1598 if (crypto_ahash_final(hp.req)) 1599 goto clear_hash; 1600 1601 tcp_sigpool_end(&hp); 1602 return 0; 1603 1604 clear_hash: 1605 tcp_sigpool_end(&hp); 1606 clear_hash_nostart: 1607 memset(md5_hash, 0, 16); 1608 return 1; 1609 } 1610 1611 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, 1612 const struct sock *sk, 1613 const struct sk_buff *skb) 1614 { 1615 const struct tcphdr *th = tcp_hdr(skb); 1616 struct tcp_sigpool hp; 1617 __be32 saddr, daddr; 1618 1619 if (sk) { /* valid for establish/request sockets */ 1620 saddr = sk->sk_rcv_saddr; 1621 daddr = sk->sk_daddr; 1622 } else { 1623 const struct iphdr *iph = ip_hdr(skb); 1624 saddr = iph->saddr; 1625 daddr = iph->daddr; 1626 } 1627 1628 if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp)) 1629 goto clear_hash_nostart; 1630 1631 if (crypto_ahash_init(hp.req)) 1632 goto clear_hash; 1633 1634 if 
(tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, skb->len)) 1635 goto clear_hash; 1636 if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2)) 1637 goto clear_hash; 1638 if (tcp_md5_hash_key(&hp, key)) 1639 goto clear_hash; 1640 ahash_request_set_crypt(hp.req, NULL, md5_hash, 0); 1641 if (crypto_ahash_final(hp.req)) 1642 goto clear_hash; 1643 1644 tcp_sigpool_end(&hp); 1645 return 0; 1646 1647 clear_hash: 1648 tcp_sigpool_end(&hp); 1649 clear_hash_nostart: 1650 memset(md5_hash, 0, 16); 1651 return 1; 1652 } 1653 EXPORT_SYMBOL(tcp_v4_md5_hash_skb); 1654 1655 #endif 1656 1657 static void tcp_v4_init_req(struct request_sock *req, 1658 const struct sock *sk_listener, 1659 struct sk_buff *skb) 1660 { 1661 struct inet_request_sock *ireq = inet_rsk(req); 1662 struct net *net = sock_net(sk_listener); 1663 1664 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); 1665 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); 1666 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb)); 1667 } 1668 1669 static struct dst_entry *tcp_v4_route_req(const struct sock *sk, 1670 struct sk_buff *skb, 1671 struct flowi *fl, 1672 struct request_sock *req, 1673 u32 tw_isn) 1674 { 1675 tcp_v4_init_req(req, sk, skb); 1676 1677 if (security_inet_conn_request(sk, skb, req)) 1678 return NULL; 1679 1680 return inet_csk_route_req(sk, &fl->u.ip4, req); 1681 } 1682 1683 struct request_sock_ops tcp_request_sock_ops __read_mostly = { 1684 .family = PF_INET, 1685 .obj_size = sizeof(struct tcp_request_sock), 1686 .rtx_syn_ack = tcp_rtx_synack, 1687 .send_ack = tcp_v4_reqsk_send_ack, 1688 .destructor = tcp_v4_reqsk_destructor, 1689 .send_reset = tcp_v4_send_reset, 1690 .syn_ack_timeout = tcp_syn_ack_timeout, 1691 }; 1692 1693 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { 1694 .mss_clamp = TCP_MSS_DEFAULT, 1695 #ifdef CONFIG_TCP_MD5SIG 1696 .req_md5_lookup = tcp_v4_md5_lookup, 1697 .calc_md5_hash = tcp_v4_md5_hash_skb, 1698 #endif 1699 #ifdef CONFIG_TCP_AO 1700 .ao_lookup = tcp_v4_ao_lookup_rsk, 1701 .ao_calc_key = tcp_v4_ao_calc_key_rsk, 1702 .ao_synack_hash = tcp_v4_ao_synack_hash, 1703 #endif 1704 #ifdef CONFIG_SYN_COOKIES 1705 .cookie_init_seq = cookie_v4_init_sequence, 1706 #endif 1707 .route_req = tcp_v4_route_req, 1708 .init_seq = tcp_v4_init_seq, 1709 .init_ts_off = tcp_v4_init_ts_off, 1710 .send_synack = tcp_v4_send_synack, 1711 }; 1712 1713 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) 1714 { 1715 /* Never answer to SYNs send to broadcast or multicast */ 1716 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) 1717 goto drop; 1718 1719 return tcp_conn_request(&tcp_request_sock_ops, 1720 &tcp_request_sock_ipv4_ops, sk, skb); 1721 1722 drop: 1723 tcp_listendrop(sk); 1724 return 0; 1725 } 1726 EXPORT_SYMBOL(tcp_v4_conn_request); 1727 1728 1729 /* 1730 * The three way handshake has completed - we got a valid synack - 1731 * now create the new socket. 
1732 */ 1733 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, 1734 struct request_sock *req, 1735 struct dst_entry *dst, 1736 struct request_sock *req_unhash, 1737 bool *own_req) 1738 { 1739 struct inet_request_sock *ireq; 1740 bool found_dup_sk = false; 1741 struct inet_sock *newinet; 1742 struct tcp_sock *newtp; 1743 struct sock *newsk; 1744 #ifdef CONFIG_TCP_MD5SIG 1745 const union tcp_md5_addr *addr; 1746 struct tcp_md5sig_key *key; 1747 int l3index; 1748 #endif 1749 struct ip_options_rcu *inet_opt; 1750 1751 if (sk_acceptq_is_full(sk)) 1752 goto exit_overflow; 1753 1754 newsk = tcp_create_openreq_child(sk, req, skb); 1755 if (!newsk) 1756 goto exit_nonewsk; 1757 1758 newsk->sk_gso_type = SKB_GSO_TCPV4; 1759 inet_sk_rx_dst_set(newsk, skb); 1760 1761 newtp = tcp_sk(newsk); 1762 newinet = inet_sk(newsk); 1763 ireq = inet_rsk(req); 1764 sk_daddr_set(newsk, ireq->ir_rmt_addr); 1765 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr); 1766 newsk->sk_bound_dev_if = ireq->ir_iif; 1767 newinet->inet_saddr = ireq->ir_loc_addr; 1768 inet_opt = rcu_dereference(ireq->ireq_opt); 1769 RCU_INIT_POINTER(newinet->inet_opt, inet_opt); 1770 newinet->mc_index = inet_iif(skb); 1771 newinet->mc_ttl = ip_hdr(skb)->ttl; 1772 newinet->rcv_tos = ip_hdr(skb)->tos; 1773 inet_csk(newsk)->icsk_ext_hdr_len = 0; 1774 if (inet_opt) 1775 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 1776 atomic_set(&newinet->inet_id, get_random_u16()); 1777 1778 /* Set ToS of the new socket based upon the value of incoming SYN. 1779 * ECT bits are set later in tcp_init_transfer(). 1780 */ 1781 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)) 1782 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK; 1783 1784 if (!dst) { 1785 dst = inet_csk_route_child_sock(sk, newsk, req); 1786 if (!dst) 1787 goto put_and_exit; 1788 } else { 1789 /* syncookie case : see end of cookie_v4_check() */ 1790 } 1791 sk_setup_caps(newsk, dst); 1792 1793 tcp_ca_openreq_child(newsk, dst); 1794 1795 tcp_sync_mss(newsk, dst_mtu(dst)); 1796 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst)); 1797 1798 tcp_initialize_rcv_mss(newsk); 1799 1800 #ifdef CONFIG_TCP_MD5SIG 1801 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif); 1802 /* Copy over the MD5 key from the original socket */ 1803 addr = (union tcp_md5_addr *)&newinet->inet_daddr; 1804 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1805 if (key && !tcp_rsk_used_ao(req)) { 1806 if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key)) 1807 goto put_and_exit; 1808 sk_gso_disable(newsk); 1809 } 1810 #endif 1811 #ifdef CONFIG_TCP_AO 1812 if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET)) 1813 goto put_and_exit; /* OOM, release back memory */ 1814 #endif 1815 1816 if (__inet_inherit_port(sk, newsk) < 0) 1817 goto put_and_exit; 1818 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), 1819 &found_dup_sk); 1820 if (likely(*own_req)) { 1821 tcp_move_syn(newtp, req); 1822 ireq->ireq_opt = NULL; 1823 } else { 1824 newinet->inet_opt = NULL; 1825 1826 if (!req_unhash && found_dup_sk) { 1827 /* This code path should only be executed in the 1828 * syncookie case only 1829 */ 1830 bh_unlock_sock(newsk); 1831 sock_put(newsk); 1832 newsk = NULL; 1833 } 1834 } 1835 return newsk; 1836 1837 exit_overflow: 1838 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); 1839 exit_nonewsk: 1840 dst_release(dst); 1841 exit: 1842 tcp_listendrop(sk); 1843 return NULL; 1844 put_and_exit: 1845 newinet->inet_opt = NULL; 1846 
inet_csk_prepare_forced_close(newsk); 1847 tcp_done(newsk); 1848 goto exit; 1849 } 1850 EXPORT_SYMBOL(tcp_v4_syn_recv_sock); 1851 1852 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb) 1853 { 1854 #ifdef CONFIG_SYN_COOKIES 1855 const struct tcphdr *th = tcp_hdr(skb); 1856 1857 if (!th->syn) 1858 sk = cookie_v4_check(sk, skb); 1859 #endif 1860 return sk; 1861 } 1862 1863 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph, 1864 struct tcphdr *th, u32 *cookie) 1865 { 1866 u16 mss = 0; 1867 #ifdef CONFIG_SYN_COOKIES 1868 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops, 1869 &tcp_request_sock_ipv4_ops, sk, th); 1870 if (mss) { 1871 *cookie = __cookie_v4_init_sequence(iph, th, &mss); 1872 tcp_synq_overflow(sk); 1873 } 1874 #endif 1875 return mss; 1876 } 1877 1878 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, 1879 u32)); 1880 /* The socket must have it's spinlock held when we get 1881 * here, unless it is a TCP_LISTEN socket. 1882 * 1883 * We have a potential double-lock case here, so even when 1884 * doing backlog processing we use the BH locking scheme. 1885 * This is because we cannot sleep with the original spinlock 1886 * held. 1887 */ 1888 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) 1889 { 1890 enum skb_drop_reason reason; 1891 struct sock *rsk; 1892 1893 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ 1894 struct dst_entry *dst; 1895 1896 dst = rcu_dereference_protected(sk->sk_rx_dst, 1897 lockdep_sock_is_held(sk)); 1898 1899 sock_rps_save_rxhash(sk, skb); 1900 sk_mark_napi_id(sk, skb); 1901 if (dst) { 1902 if (sk->sk_rx_dst_ifindex != skb->skb_iif || 1903 !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check, 1904 dst, 0)) { 1905 RCU_INIT_POINTER(sk->sk_rx_dst, NULL); 1906 dst_release(dst); 1907 } 1908 } 1909 tcp_rcv_established(sk, skb); 1910 return 0; 1911 } 1912 1913 if (tcp_checksum_complete(skb)) 1914 goto csum_err; 1915 1916 if (sk->sk_state == TCP_LISTEN) { 1917 struct sock *nsk = tcp_v4_cookie_check(sk, skb); 1918 1919 if (!nsk) 1920 return 0; 1921 if (nsk != sk) { 1922 reason = tcp_child_process(sk, nsk, skb); 1923 if (reason) { 1924 rsk = nsk; 1925 goto reset; 1926 } 1927 return 0; 1928 } 1929 } else 1930 sock_rps_save_rxhash(sk, skb); 1931 1932 reason = tcp_rcv_state_process(sk, skb); 1933 if (reason) { 1934 rsk = sk; 1935 goto reset; 1936 } 1937 return 0; 1938 1939 reset: 1940 tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason)); 1941 discard: 1942 kfree_skb_reason(skb, reason); 1943 /* Be careful here. If this function gets more complicated and 1944 * gcc suffers from register pressure on the x86, sk (in %ebx) 1945 * might be destroyed here. This current version compiles correctly, 1946 * but you have been warned. 
1947 */ 1948 return 0; 1949 1950 csum_err: 1951 reason = SKB_DROP_REASON_TCP_CSUM; 1952 trace_tcp_bad_csum(skb); 1953 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 1954 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 1955 goto discard; 1956 } 1957 EXPORT_SYMBOL(tcp_v4_do_rcv); 1958 1959 int tcp_v4_early_demux(struct sk_buff *skb) 1960 { 1961 struct net *net = dev_net(skb->dev); 1962 const struct iphdr *iph; 1963 const struct tcphdr *th; 1964 struct sock *sk; 1965 1966 if (skb->pkt_type != PACKET_HOST) 1967 return 0; 1968 1969 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) 1970 return 0; 1971 1972 iph = ip_hdr(skb); 1973 th = tcp_hdr(skb); 1974 1975 if (th->doff < sizeof(struct tcphdr) / 4) 1976 return 0; 1977 1978 sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, 1979 iph->saddr, th->source, 1980 iph->daddr, ntohs(th->dest), 1981 skb->skb_iif, inet_sdif(skb)); 1982 if (sk) { 1983 skb->sk = sk; 1984 skb->destructor = sock_edemux; 1985 if (sk_fullsock(sk)) { 1986 struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst); 1987 1988 if (dst) 1989 dst = dst_check(dst, 0); 1990 if (dst && 1991 sk->sk_rx_dst_ifindex == skb->skb_iif) 1992 skb_dst_set_noref(skb, dst); 1993 } 1994 } 1995 return 0; 1996 } 1997 1998 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, 1999 enum skb_drop_reason *reason) 2000 { 2001 u32 tail_gso_size, tail_gso_segs; 2002 struct skb_shared_info *shinfo; 2003 const struct tcphdr *th; 2004 struct tcphdr *thtail; 2005 struct sk_buff *tail; 2006 unsigned int hdrlen; 2007 bool fragstolen; 2008 u32 gso_segs; 2009 u32 gso_size; 2010 u64 limit; 2011 int delta; 2012 2013 /* In case all data was pulled from skb frags (in __pskb_pull_tail()), 2014 * we can fix skb->truesize to its real value to avoid future drops. 2015 * This is valid because skb is not yet charged to the socket. 2016 * It has been noticed pure SACK packets were sometimes dropped 2017 * (if cooked by drivers without copybreak feature). 2018 */ 2019 skb_condense(skb); 2020 2021 skb_dst_drop(skb); 2022 2023 if (unlikely(tcp_checksum_complete(skb))) { 2024 bh_unlock_sock(sk); 2025 trace_tcp_bad_csum(skb); 2026 *reason = SKB_DROP_REASON_TCP_CSUM; 2027 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 2028 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 2029 return true; 2030 } 2031 2032 /* Attempt coalescing to last skb in backlog, even if we are 2033 * above the limits. 2034 * This is okay because skb capacity is limited to MAX_SKB_FRAGS. 
2035 */ 2036 th = (const struct tcphdr *)skb->data; 2037 hdrlen = th->doff * 4; 2038 2039 tail = sk->sk_backlog.tail; 2040 if (!tail) 2041 goto no_coalesce; 2042 thtail = (struct tcphdr *)tail->data; 2043 2044 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq || 2045 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield || 2046 ((TCP_SKB_CB(tail)->tcp_flags | 2047 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) || 2048 !((TCP_SKB_CB(tail)->tcp_flags & 2049 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) || 2050 ((TCP_SKB_CB(tail)->tcp_flags ^ 2051 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) || 2052 !mptcp_skb_can_collapse(tail, skb) || 2053 skb_cmp_decrypted(tail, skb) || 2054 thtail->doff != th->doff || 2055 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th))) 2056 goto no_coalesce; 2057 2058 __skb_pull(skb, hdrlen); 2059 2060 shinfo = skb_shinfo(skb); 2061 gso_size = shinfo->gso_size ?: skb->len; 2062 gso_segs = shinfo->gso_segs ?: 1; 2063 2064 shinfo = skb_shinfo(tail); 2065 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen); 2066 tail_gso_segs = shinfo->gso_segs ?: 1; 2067 2068 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) { 2069 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq; 2070 2071 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) { 2072 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq; 2073 thtail->window = th->window; 2074 } 2075 2076 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and 2077 * thtail->fin, so that the fast path in tcp_rcv_established() 2078 * is not entered if we append a packet with a FIN. 2079 * SYN, RST, URG are not present. 2080 * ACK is set on both packets. 2081 * PSH : we do not really care in TCP stack, 2082 * at least for 'GRO' packets. 2083 */ 2084 thtail->fin |= th->fin; 2085 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; 2086 2087 if (TCP_SKB_CB(skb)->has_rxtstamp) { 2088 TCP_SKB_CB(tail)->has_rxtstamp = true; 2089 tail->tstamp = skb->tstamp; 2090 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp; 2091 } 2092 2093 /* Not as strict as GRO. We only need to carry mss max value */ 2094 shinfo->gso_size = max(gso_size, tail_gso_size); 2095 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF); 2096 2097 sk->sk_backlog.len += delta; 2098 __NET_INC_STATS(sock_net(sk), 2099 LINUX_MIB_TCPBACKLOGCOALESCE); 2100 kfree_skb_partial(skb, fragstolen); 2101 return false; 2102 } 2103 __skb_push(skb, hdrlen); 2104 2105 no_coalesce: 2106 /* sk->sk_backlog.len is reset only at the end of __release_sock(). 2107 * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach 2108 * sk_rcvbuf in normal conditions. 2109 */ 2110 limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1; 2111 2112 limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1; 2113 2114 /* Only socket owner can try to collapse/prune rx queues 2115 * to reduce memory overhead, so add a little headroom here. 2116 * Few sockets backlog are possibly concurrently non empty. 
2117 */ 2118 limit += 64 * 1024; 2119 2120 limit = min_t(u64, limit, UINT_MAX); 2121 2122 if (unlikely(sk_add_backlog(sk, skb, limit))) { 2123 bh_unlock_sock(sk); 2124 *reason = SKB_DROP_REASON_SOCKET_BACKLOG; 2125 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP); 2126 return true; 2127 } 2128 return false; 2129 } 2130 EXPORT_SYMBOL(tcp_add_backlog); 2131 2132 int tcp_filter(struct sock *sk, struct sk_buff *skb) 2133 { 2134 struct tcphdr *th = (struct tcphdr *)skb->data; 2135 2136 return sk_filter_trim_cap(sk, skb, th->doff * 4); 2137 } 2138 EXPORT_SYMBOL(tcp_filter); 2139 2140 static void tcp_v4_restore_cb(struct sk_buff *skb) 2141 { 2142 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4, 2143 sizeof(struct inet_skb_parm)); 2144 } 2145 2146 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, 2147 const struct tcphdr *th) 2148 { 2149 /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB() 2150 * barrier() makes sure compiler wont play fool^Waliasing games. 2151 */ 2152 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb), 2153 sizeof(struct inet_skb_parm)); 2154 barrier(); 2155 2156 TCP_SKB_CB(skb)->seq = ntohl(th->seq); 2157 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + 2158 skb->len - th->doff * 4); 2159 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); 2160 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); 2161 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); 2162 TCP_SKB_CB(skb)->sacked = 0; 2163 TCP_SKB_CB(skb)->has_rxtstamp = 2164 skb->tstamp || skb_hwtstamps(skb)->hwtstamp; 2165 } 2166 2167 /* 2168 * From tcp_input.c 2169 */ 2170 2171 int tcp_v4_rcv(struct sk_buff *skb) 2172 { 2173 struct net *net = dev_net(skb->dev); 2174 enum skb_drop_reason drop_reason; 2175 int sdif = inet_sdif(skb); 2176 int dif = inet_iif(skb); 2177 const struct iphdr *iph; 2178 const struct tcphdr *th; 2179 bool refcounted; 2180 struct sock *sk; 2181 int ret; 2182 u32 isn; 2183 2184 drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; 2185 if (skb->pkt_type != PACKET_HOST) 2186 goto discard_it; 2187 2188 /* Count it even if it's bad */ 2189 __TCP_INC_STATS(net, TCP_MIB_INSEGS); 2190 2191 if (!pskb_may_pull(skb, sizeof(struct tcphdr))) 2192 goto discard_it; 2193 2194 th = (const struct tcphdr *)skb->data; 2195 2196 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) { 2197 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL; 2198 goto bad_packet; 2199 } 2200 if (!pskb_may_pull(skb, th->doff * 4)) 2201 goto discard_it; 2202 2203 /* An explanation is required here, I think. 2204 * Packet length and doff are validated by header prediction, 2205 * provided case of th->doff==0 is eliminated. 2206 * So, we defer the checks. 
*/ 2207 2208 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo)) 2209 goto csum_error; 2210 2211 th = (const struct tcphdr *)skb->data; 2212 iph = ip_hdr(skb); 2213 lookup: 2214 sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo, 2215 skb, __tcp_hdrlen(th), th->source, 2216 th->dest, sdif, &refcounted); 2217 if (!sk) 2218 goto no_tcp_socket; 2219 2220 if (sk->sk_state == TCP_TIME_WAIT) 2221 goto do_time_wait; 2222 2223 if (sk->sk_state == TCP_NEW_SYN_RECV) { 2224 struct request_sock *req = inet_reqsk(sk); 2225 bool req_stolen = false; 2226 struct sock *nsk; 2227 2228 sk = req->rsk_listener; 2229 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) 2230 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2231 else 2232 drop_reason = tcp_inbound_hash(sk, req, skb, 2233 &iph->saddr, &iph->daddr, 2234 AF_INET, dif, sdif); 2235 if (unlikely(drop_reason)) { 2236 sk_drops_add(sk, skb); 2237 reqsk_put(req); 2238 goto discard_it; 2239 } 2240 if (tcp_checksum_complete(skb)) { 2241 reqsk_put(req); 2242 goto csum_error; 2243 } 2244 if (unlikely(sk->sk_state != TCP_LISTEN)) { 2245 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb); 2246 if (!nsk) { 2247 inet_csk_reqsk_queue_drop_and_put(sk, req); 2248 goto lookup; 2249 } 2250 sk = nsk; 2251 /* reuseport_migrate_sock() has already held one sk_refcnt 2252 * before returning. 2253 */ 2254 } else { 2255 /* We own a reference on the listener, increase it again 2256 * as we might lose it too soon. 2257 */ 2258 sock_hold(sk); 2259 } 2260 refcounted = true; 2261 nsk = NULL; 2262 if (!tcp_filter(sk, skb)) { 2263 th = (const struct tcphdr *)skb->data; 2264 iph = ip_hdr(skb); 2265 tcp_v4_fill_cb(skb, iph, th); 2266 nsk = tcp_check_req(sk, skb, req, false, &req_stolen); 2267 } else { 2268 drop_reason = SKB_DROP_REASON_SOCKET_FILTER; 2269 } 2270 if (!nsk) { 2271 reqsk_put(req); 2272 if (req_stolen) { 2273 /* Another cpu got exclusive access to req 2274 * and created a full blown socket. 2275 * Try to feed this packet to this socket 2276 * instead of discarding it. 
2277 */ 2278 tcp_v4_restore_cb(skb); 2279 sock_put(sk); 2280 goto lookup; 2281 } 2282 goto discard_and_relse; 2283 } 2284 nf_reset_ct(skb); 2285 if (nsk == sk) { 2286 reqsk_put(req); 2287 tcp_v4_restore_cb(skb); 2288 } else { 2289 drop_reason = tcp_child_process(sk, nsk, skb); 2290 if (drop_reason) { 2291 enum sk_rst_reason rst_reason; 2292 2293 rst_reason = sk_rst_convert_drop_reason(drop_reason); 2294 tcp_v4_send_reset(nsk, skb, rst_reason); 2295 goto discard_and_relse; 2296 } 2297 sock_put(sk); 2298 return 0; 2299 } 2300 } 2301 2302 process: 2303 if (static_branch_unlikely(&ip4_min_ttl)) { 2304 /* min_ttl can be changed concurrently from do_ip_setsockopt() */ 2305 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) { 2306 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 2307 drop_reason = SKB_DROP_REASON_TCP_MINTTL; 2308 goto discard_and_relse; 2309 } 2310 } 2311 2312 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { 2313 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2314 goto discard_and_relse; 2315 } 2316 2317 drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr, 2318 AF_INET, dif, sdif); 2319 if (drop_reason) 2320 goto discard_and_relse; 2321 2322 nf_reset_ct(skb); 2323 2324 if (tcp_filter(sk, skb)) { 2325 drop_reason = SKB_DROP_REASON_SOCKET_FILTER; 2326 goto discard_and_relse; 2327 } 2328 th = (const struct tcphdr *)skb->data; 2329 iph = ip_hdr(skb); 2330 tcp_v4_fill_cb(skb, iph, th); 2331 2332 skb->dev = NULL; 2333 2334 if (sk->sk_state == TCP_LISTEN) { 2335 ret = tcp_v4_do_rcv(sk, skb); 2336 goto put_and_return; 2337 } 2338 2339 sk_incoming_cpu_update(sk); 2340 2341 bh_lock_sock_nested(sk); 2342 tcp_segs_in(tcp_sk(sk), skb); 2343 ret = 0; 2344 if (!sock_owned_by_user(sk)) { 2345 ret = tcp_v4_do_rcv(sk, skb); 2346 } else { 2347 if (tcp_add_backlog(sk, skb, &drop_reason)) 2348 goto discard_and_relse; 2349 } 2350 bh_unlock_sock(sk); 2351 2352 put_and_return: 2353 if (refcounted) 2354 sock_put(sk); 2355 2356 return ret; 2357 2358 no_tcp_socket: 2359 drop_reason = SKB_DROP_REASON_NO_SOCKET; 2360 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) 2361 goto discard_it; 2362 2363 tcp_v4_fill_cb(skb, iph, th); 2364 2365 if (tcp_checksum_complete(skb)) { 2366 csum_error: 2367 drop_reason = SKB_DROP_REASON_TCP_CSUM; 2368 trace_tcp_bad_csum(skb); 2369 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); 2370 bad_packet: 2371 __TCP_INC_STATS(net, TCP_MIB_INERRS); 2372 } else { 2373 tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason)); 2374 } 2375 2376 discard_it: 2377 SKB_DR_OR(drop_reason, NOT_SPECIFIED); 2378 /* Discard frame. 
*/ 2379 kfree_skb_reason(skb, drop_reason); 2380 return 0; 2381 2382 discard_and_relse: 2383 sk_drops_add(sk, skb); 2384 if (refcounted) 2385 sock_put(sk); 2386 goto discard_it; 2387 2388 do_time_wait: 2389 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { 2390 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2391 inet_twsk_put(inet_twsk(sk)); 2392 goto discard_it; 2393 } 2394 2395 tcp_v4_fill_cb(skb, iph, th); 2396 2397 if (tcp_checksum_complete(skb)) { 2398 inet_twsk_put(inet_twsk(sk)); 2399 goto csum_error; 2400 } 2401 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn)) { 2402 case TCP_TW_SYN: { 2403 struct sock *sk2 = inet_lookup_listener(net, 2404 net->ipv4.tcp_death_row.hashinfo, 2405 skb, __tcp_hdrlen(th), 2406 iph->saddr, th->source, 2407 iph->daddr, th->dest, 2408 inet_iif(skb), 2409 sdif); 2410 if (sk2) { 2411 inet_twsk_deschedule_put(inet_twsk(sk)); 2412 sk = sk2; 2413 tcp_v4_restore_cb(skb); 2414 refcounted = false; 2415 __this_cpu_write(tcp_tw_isn, isn); 2416 goto process; 2417 } 2418 } 2419 /* to ACK */ 2420 fallthrough; 2421 case TCP_TW_ACK: 2422 tcp_v4_timewait_ack(sk, skb); 2423 break; 2424 case TCP_TW_RST: 2425 tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET); 2426 inet_twsk_deschedule_put(inet_twsk(sk)); 2427 goto discard_it; 2428 case TCP_TW_SUCCESS:; 2429 } 2430 goto discard_it; 2431 } 2432 2433 static struct timewait_sock_ops tcp_timewait_sock_ops = { 2434 .twsk_obj_size = sizeof(struct tcp_timewait_sock), 2435 .twsk_destructor= tcp_twsk_destructor, 2436 }; 2437 2438 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) 2439 { 2440 struct dst_entry *dst = skb_dst(skb); 2441 2442 if (dst && dst_hold_safe(dst)) { 2443 rcu_assign_pointer(sk->sk_rx_dst, dst); 2444 sk->sk_rx_dst_ifindex = skb->skb_iif; 2445 } 2446 } 2447 EXPORT_SYMBOL(inet_sk_rx_dst_set); 2448 2449 const struct inet_connection_sock_af_ops ipv4_specific = { 2450 .queue_xmit = ip_queue_xmit, 2451 .send_check = tcp_v4_send_check, 2452 .rebuild_header = inet_sk_rebuild_header, 2453 .sk_rx_dst_set = inet_sk_rx_dst_set, 2454 .conn_request = tcp_v4_conn_request, 2455 .syn_recv_sock = tcp_v4_syn_recv_sock, 2456 .net_header_len = sizeof(struct iphdr), 2457 .setsockopt = ip_setsockopt, 2458 .getsockopt = ip_getsockopt, 2459 .addr2sockaddr = inet_csk_addr2sockaddr, 2460 .sockaddr_len = sizeof(struct sockaddr_in), 2461 .mtu_reduced = tcp_v4_mtu_reduced, 2462 }; 2463 EXPORT_SYMBOL(ipv4_specific); 2464 2465 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) 2466 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { 2467 #ifdef CONFIG_TCP_MD5SIG 2468 .md5_lookup = tcp_v4_md5_lookup, 2469 .calc_md5_hash = tcp_v4_md5_hash_skb, 2470 .md5_parse = tcp_v4_parse_md5_keys, 2471 #endif 2472 #ifdef CONFIG_TCP_AO 2473 .ao_lookup = tcp_v4_ao_lookup, 2474 .calc_ao_hash = tcp_v4_ao_hash_skb, 2475 .ao_parse = tcp_v4_parse_ao, 2476 .ao_calc_key_sk = tcp_v4_ao_calc_key_sk, 2477 #endif 2478 }; 2479 #endif 2480 2481 /* NOTE: A lot of things set to zero explicitly by call to 2482 * sk_alloc() so need not be done here. 
2483 */ 2484 static int tcp_v4_init_sock(struct sock *sk) 2485 { 2486 struct inet_connection_sock *icsk = inet_csk(sk); 2487 2488 tcp_init_sock(sk); 2489 2490 icsk->icsk_af_ops = &ipv4_specific; 2491 2492 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) 2493 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific; 2494 #endif 2495 2496 return 0; 2497 } 2498 2499 #ifdef CONFIG_TCP_MD5SIG 2500 static void tcp_md5sig_info_free_rcu(struct rcu_head *head) 2501 { 2502 struct tcp_md5sig_info *md5sig; 2503 2504 md5sig = container_of(head, struct tcp_md5sig_info, rcu); 2505 kfree(md5sig); 2506 static_branch_slow_dec_deferred(&tcp_md5_needed); 2507 tcp_md5_release_sigpool(); 2508 } 2509 #endif 2510 2511 void tcp_v4_destroy_sock(struct sock *sk) 2512 { 2513 struct tcp_sock *tp = tcp_sk(sk); 2514 2515 trace_tcp_destroy_sock(sk); 2516 2517 tcp_clear_xmit_timers(sk); 2518 2519 tcp_cleanup_congestion_control(sk); 2520 2521 tcp_cleanup_ulp(sk); 2522 2523 /* Cleanup up the write buffer. */ 2524 tcp_write_queue_purge(sk); 2525 2526 /* Check if we want to disable active TFO */ 2527 tcp_fastopen_active_disable_ofo_check(sk); 2528 2529 /* Cleans up our, hopefully empty, out_of_order_queue. */ 2530 skb_rbtree_purge(&tp->out_of_order_queue); 2531 2532 #ifdef CONFIG_TCP_MD5SIG 2533 /* Clean up the MD5 key list, if any */ 2534 if (tp->md5sig_info) { 2535 struct tcp_md5sig_info *md5sig; 2536 2537 md5sig = rcu_dereference_protected(tp->md5sig_info, 1); 2538 tcp_clear_md5_list(sk); 2539 call_rcu(&md5sig->rcu, tcp_md5sig_info_free_rcu); 2540 rcu_assign_pointer(tp->md5sig_info, NULL); 2541 } 2542 #endif 2543 tcp_ao_destroy_sock(sk, false); 2544 2545 /* Clean up a referenced TCP bind bucket. */ 2546 if (inet_csk(sk)->icsk_bind_hash) 2547 inet_put_port(sk); 2548 2549 BUG_ON(rcu_access_pointer(tp->fastopen_rsk)); 2550 2551 /* If socket is aborted during connect operation */ 2552 tcp_free_fastopen_req(tp); 2553 tcp_fastopen_destroy_cipher(sk); 2554 tcp_saved_syn_free(tp); 2555 2556 sk_sockets_allocated_dec(sk); 2557 } 2558 EXPORT_SYMBOL(tcp_v4_destroy_sock); 2559 2560 #ifdef CONFIG_PROC_FS 2561 /* Proc filesystem TCP sock list dumping. */ 2562 2563 static unsigned short seq_file_family(const struct seq_file *seq); 2564 2565 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk) 2566 { 2567 unsigned short family = seq_file_family(seq); 2568 2569 /* AF_UNSPEC is used as a match all */ 2570 return ((family == AF_UNSPEC || family == sk->sk_family) && 2571 net_eq(sock_net(sk), seq_file_net(seq))); 2572 } 2573 2574 /* Find a non empty bucket (starting from st->bucket) 2575 * and return the first sk from it. 2576 */ 2577 static void *listening_get_first(struct seq_file *seq) 2578 { 2579 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2580 struct tcp_iter_state *st = seq->private; 2581 2582 st->offset = 0; 2583 for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) { 2584 struct inet_listen_hashbucket *ilb2; 2585 struct hlist_nulls_node *node; 2586 struct sock *sk; 2587 2588 ilb2 = &hinfo->lhash2[st->bucket]; 2589 if (hlist_nulls_empty(&ilb2->nulls_head)) 2590 continue; 2591 2592 spin_lock(&ilb2->lock); 2593 sk_nulls_for_each(sk, node, &ilb2->nulls_head) { 2594 if (seq_sk_match(seq, sk)) 2595 return sk; 2596 } 2597 spin_unlock(&ilb2->lock); 2598 } 2599 2600 return NULL; 2601 } 2602 2603 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket). 
2604 * If "cur" is the last one in the st->bucket, 2605 * call listening_get_first() to return the first sk of the next 2606 * non empty bucket. 2607 */ 2608 static void *listening_get_next(struct seq_file *seq, void *cur) 2609 { 2610 struct tcp_iter_state *st = seq->private; 2611 struct inet_listen_hashbucket *ilb2; 2612 struct hlist_nulls_node *node; 2613 struct inet_hashinfo *hinfo; 2614 struct sock *sk = cur; 2615 2616 ++st->num; 2617 ++st->offset; 2618 2619 sk = sk_nulls_next(sk); 2620 sk_nulls_for_each_from(sk, node) { 2621 if (seq_sk_match(seq, sk)) 2622 return sk; 2623 } 2624 2625 hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2626 ilb2 = &hinfo->lhash2[st->bucket]; 2627 spin_unlock(&ilb2->lock); 2628 ++st->bucket; 2629 return listening_get_first(seq); 2630 } 2631 2632 static void *listening_get_idx(struct seq_file *seq, loff_t *pos) 2633 { 2634 struct tcp_iter_state *st = seq->private; 2635 void *rc; 2636 2637 st->bucket = 0; 2638 st->offset = 0; 2639 rc = listening_get_first(seq); 2640 2641 while (rc && *pos) { 2642 rc = listening_get_next(seq, rc); 2643 --*pos; 2644 } 2645 return rc; 2646 } 2647 2648 static inline bool empty_bucket(struct inet_hashinfo *hinfo, 2649 const struct tcp_iter_state *st) 2650 { 2651 return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain); 2652 } 2653 2654 /* 2655 * Get first established socket starting from bucket given in st->bucket. 2656 * If st->bucket is zero, the very first socket in the hash is returned. 2657 */ 2658 static void *established_get_first(struct seq_file *seq) 2659 { 2660 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2661 struct tcp_iter_state *st = seq->private; 2662 2663 st->offset = 0; 2664 for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) { 2665 struct sock *sk; 2666 struct hlist_nulls_node *node; 2667 spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket); 2668 2669 cond_resched(); 2670 2671 /* Lockless fast path for the common case of empty buckets */ 2672 if (empty_bucket(hinfo, st)) 2673 continue; 2674 2675 spin_lock_bh(lock); 2676 sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) { 2677 if (seq_sk_match(seq, sk)) 2678 return sk; 2679 } 2680 spin_unlock_bh(lock); 2681 } 2682 2683 return NULL; 2684 } 2685 2686 static void *established_get_next(struct seq_file *seq, void *cur) 2687 { 2688 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2689 struct tcp_iter_state *st = seq->private; 2690 struct hlist_nulls_node *node; 2691 struct sock *sk = cur; 2692 2693 ++st->num; 2694 ++st->offset; 2695 2696 sk = sk_nulls_next(sk); 2697 2698 sk_nulls_for_each_from(sk, node) { 2699 if (seq_sk_match(seq, sk)) 2700 return sk; 2701 } 2702 2703 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 2704 ++st->bucket; 2705 return established_get_first(seq); 2706 } 2707 2708 static void *established_get_idx(struct seq_file *seq, loff_t pos) 2709 { 2710 struct tcp_iter_state *st = seq->private; 2711 void *rc; 2712 2713 st->bucket = 0; 2714 rc = established_get_first(seq); 2715 2716 while (rc && pos) { 2717 rc = established_get_next(seq, rc); 2718 --pos; 2719 } 2720 return rc; 2721 } 2722 2723 static void *tcp_get_idx(struct seq_file *seq, loff_t pos) 2724 { 2725 void *rc; 2726 struct tcp_iter_state *st = seq->private; 2727 2728 st->state = TCP_SEQ_STATE_LISTENING; 2729 rc = listening_get_idx(seq, &pos); 2730 2731 if (!rc) { 2732 st->state = TCP_SEQ_STATE_ESTABLISHED; 2733 rc = established_get_idx(seq, pos); 2734 } 2735 2736 return rc; 2737 } 2738 2739 static void 
*tcp_seek_last_pos(struct seq_file *seq) 2740 { 2741 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2742 struct tcp_iter_state *st = seq->private; 2743 int bucket = st->bucket; 2744 int offset = st->offset; 2745 int orig_num = st->num; 2746 void *rc = NULL; 2747 2748 switch (st->state) { 2749 case TCP_SEQ_STATE_LISTENING: 2750 if (st->bucket > hinfo->lhash2_mask) 2751 break; 2752 rc = listening_get_first(seq); 2753 while (offset-- && rc && bucket == st->bucket) 2754 rc = listening_get_next(seq, rc); 2755 if (rc) 2756 break; 2757 st->bucket = 0; 2758 st->state = TCP_SEQ_STATE_ESTABLISHED; 2759 fallthrough; 2760 case TCP_SEQ_STATE_ESTABLISHED: 2761 if (st->bucket > hinfo->ehash_mask) 2762 break; 2763 rc = established_get_first(seq); 2764 while (offset-- && rc && bucket == st->bucket) 2765 rc = established_get_next(seq, rc); 2766 } 2767 2768 st->num = orig_num; 2769 2770 return rc; 2771 } 2772 2773 void *tcp_seq_start(struct seq_file *seq, loff_t *pos) 2774 { 2775 struct tcp_iter_state *st = seq->private; 2776 void *rc; 2777 2778 if (*pos && *pos == st->last_pos) { 2779 rc = tcp_seek_last_pos(seq); 2780 if (rc) 2781 goto out; 2782 } 2783 2784 st->state = TCP_SEQ_STATE_LISTENING; 2785 st->num = 0; 2786 st->bucket = 0; 2787 st->offset = 0; 2788 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; 2789 2790 out: 2791 st->last_pos = *pos; 2792 return rc; 2793 } 2794 EXPORT_SYMBOL(tcp_seq_start); 2795 2796 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2797 { 2798 struct tcp_iter_state *st = seq->private; 2799 void *rc = NULL; 2800 2801 if (v == SEQ_START_TOKEN) { 2802 rc = tcp_get_idx(seq, 0); 2803 goto out; 2804 } 2805 2806 switch (st->state) { 2807 case TCP_SEQ_STATE_LISTENING: 2808 rc = listening_get_next(seq, v); 2809 if (!rc) { 2810 st->state = TCP_SEQ_STATE_ESTABLISHED; 2811 st->bucket = 0; 2812 st->offset = 0; 2813 rc = established_get_first(seq); 2814 } 2815 break; 2816 case TCP_SEQ_STATE_ESTABLISHED: 2817 rc = established_get_next(seq, v); 2818 break; 2819 } 2820 out: 2821 ++*pos; 2822 st->last_pos = *pos; 2823 return rc; 2824 } 2825 EXPORT_SYMBOL(tcp_seq_next); 2826 2827 void tcp_seq_stop(struct seq_file *seq, void *v) 2828 { 2829 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2830 struct tcp_iter_state *st = seq->private; 2831 2832 switch (st->state) { 2833 case TCP_SEQ_STATE_LISTENING: 2834 if (v != SEQ_START_TOKEN) 2835 spin_unlock(&hinfo->lhash2[st->bucket].lock); 2836 break; 2837 case TCP_SEQ_STATE_ESTABLISHED: 2838 if (v) 2839 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 2840 break; 2841 } 2842 } 2843 EXPORT_SYMBOL(tcp_seq_stop); 2844 2845 static void get_openreq4(const struct request_sock *req, 2846 struct seq_file *f, int i) 2847 { 2848 const struct inet_request_sock *ireq = inet_rsk(req); 2849 long delta = req->rsk_timer.expires - jiffies; 2850 2851 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2852 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK", 2853 i, 2854 ireq->ir_loc_addr, 2855 ireq->ir_num, 2856 ireq->ir_rmt_addr, 2857 ntohs(ireq->ir_rmt_port), 2858 TCP_SYN_RECV, 2859 0, 0, /* could print option size, but that is af dependent. 
*/ 2860 1, /* timers active (only the expire timer) */ 2861 jiffies_delta_to_clock_t(delta), 2862 req->num_timeout, 2863 from_kuid_munged(seq_user_ns(f), 2864 sock_i_uid(req->rsk_listener)), 2865 0, /* non standard timer */ 2866 0, /* open_requests have no inode */ 2867 0, 2868 req); 2869 } 2870 2871 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) 2872 { 2873 int timer_active; 2874 unsigned long timer_expires; 2875 const struct tcp_sock *tp = tcp_sk(sk); 2876 const struct inet_connection_sock *icsk = inet_csk(sk); 2877 const struct inet_sock *inet = inet_sk(sk); 2878 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; 2879 __be32 dest = inet->inet_daddr; 2880 __be32 src = inet->inet_rcv_saddr; 2881 __u16 destp = ntohs(inet->inet_dport); 2882 __u16 srcp = ntohs(inet->inet_sport); 2883 int rx_queue; 2884 int state; 2885 2886 if (icsk->icsk_pending == ICSK_TIME_RETRANS || 2887 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || 2888 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { 2889 timer_active = 1; 2890 timer_expires = icsk->icsk_timeout; 2891 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { 2892 timer_active = 4; 2893 timer_expires = icsk->icsk_timeout; 2894 } else if (timer_pending(&sk->sk_timer)) { 2895 timer_active = 2; 2896 timer_expires = sk->sk_timer.expires; 2897 } else { 2898 timer_active = 0; 2899 timer_expires = jiffies; 2900 } 2901 2902 state = inet_sk_state_load(sk); 2903 if (state == TCP_LISTEN) 2904 rx_queue = READ_ONCE(sk->sk_ack_backlog); 2905 else 2906 /* Because we don't lock the socket, 2907 * we might find a transient negative value. 2908 */ 2909 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) - 2910 READ_ONCE(tp->copied_seq), 0); 2911 2912 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " 2913 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d", 2914 i, src, srcp, dest, destp, state, 2915 READ_ONCE(tp->write_seq) - tp->snd_una, 2916 rx_queue, 2917 timer_active, 2918 jiffies_delta_to_clock_t(timer_expires - jiffies), 2919 icsk->icsk_retransmits, 2920 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)), 2921 icsk->icsk_probes_out, 2922 sock_i_ino(sk), 2923 refcount_read(&sk->sk_refcnt), sk, 2924 jiffies_to_clock_t(icsk->icsk_rto), 2925 jiffies_to_clock_t(icsk->icsk_ack.ato), 2926 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk), 2927 tcp_snd_cwnd(tp), 2928 state == TCP_LISTEN ? 2929 fastopenq->max_qlen : 2930 (tcp_in_initial_slowstart(tp) ? 
-1 : tp->snd_ssthresh)); 2931 } 2932 2933 static void get_timewait4_sock(const struct inet_timewait_sock *tw, 2934 struct seq_file *f, int i) 2935 { 2936 long delta = tw->tw_timer.expires - jiffies; 2937 __be32 dest, src; 2938 __u16 destp, srcp; 2939 2940 dest = tw->tw_daddr; 2941 src = tw->tw_rcv_saddr; 2942 destp = ntohs(tw->tw_dport); 2943 srcp = ntohs(tw->tw_sport); 2944 2945 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2946 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK", 2947 i, src, srcp, dest, destp, tw->tw_substate, 0, 0, 2948 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, 2949 refcount_read(&tw->tw_refcnt), tw); 2950 } 2951 2952 #define TMPSZ 150 2953 2954 static int tcp4_seq_show(struct seq_file *seq, void *v) 2955 { 2956 struct tcp_iter_state *st; 2957 struct sock *sk = v; 2958 2959 seq_setwidth(seq, TMPSZ - 1); 2960 if (v == SEQ_START_TOKEN) { 2961 seq_puts(seq, " sl local_address rem_address st tx_queue " 2962 "rx_queue tr tm->when retrnsmt uid timeout " 2963 "inode"); 2964 goto out; 2965 } 2966 st = seq->private; 2967 2968 if (sk->sk_state == TCP_TIME_WAIT) 2969 get_timewait4_sock(v, seq, st->num); 2970 else if (sk->sk_state == TCP_NEW_SYN_RECV) 2971 get_openreq4(v, seq, st->num); 2972 else 2973 get_tcp4_sock(v, seq, st->num); 2974 out: 2975 seq_pad(seq, '\n'); 2976 return 0; 2977 } 2978 2979 #ifdef CONFIG_BPF_SYSCALL 2980 struct bpf_tcp_iter_state { 2981 struct tcp_iter_state state; 2982 unsigned int cur_sk; 2983 unsigned int end_sk; 2984 unsigned int max_sk; 2985 struct sock **batch; 2986 bool st_bucket_done; 2987 }; 2988 2989 struct bpf_iter__tcp { 2990 __bpf_md_ptr(struct bpf_iter_meta *, meta); 2991 __bpf_md_ptr(struct sock_common *, sk_common); 2992 uid_t uid __aligned(8); 2993 }; 2994 2995 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, 2996 struct sock_common *sk_common, uid_t uid) 2997 { 2998 struct bpf_iter__tcp ctx; 2999 3000 meta->seq_num--; /* skip SEQ_START_TOKEN */ 3001 ctx.meta = meta; 3002 ctx.sk_common = sk_common; 3003 ctx.uid = uid; 3004 return bpf_iter_run_prog(prog, &ctx); 3005 } 3006 3007 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter) 3008 { 3009 while (iter->cur_sk < iter->end_sk) 3010 sock_gen_put(iter->batch[iter->cur_sk++]); 3011 } 3012 3013 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter, 3014 unsigned int new_batch_sz) 3015 { 3016 struct sock **new_batch; 3017 3018 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, 3019 GFP_USER | __GFP_NOWARN); 3020 if (!new_batch) 3021 return -ENOMEM; 3022 3023 bpf_iter_tcp_put_batch(iter); 3024 kvfree(iter->batch); 3025 iter->batch = new_batch; 3026 iter->max_sk = new_batch_sz; 3027 3028 return 0; 3029 } 3030 3031 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq, 3032 struct sock *start_sk) 3033 { 3034 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 3035 struct bpf_tcp_iter_state *iter = seq->private; 3036 struct tcp_iter_state *st = &iter->state; 3037 struct hlist_nulls_node *node; 3038 unsigned int expected = 1; 3039 struct sock *sk; 3040 3041 sock_hold(start_sk); 3042 iter->batch[iter->end_sk++] = start_sk; 3043 3044 sk = sk_nulls_next(start_sk); 3045 sk_nulls_for_each_from(sk, node) { 3046 if (seq_sk_match(seq, sk)) { 3047 if (iter->end_sk < iter->max_sk) { 3048 sock_hold(sk); 3049 iter->batch[iter->end_sk++] = sk; 3050 } 3051 expected++; 3052 } 3053 } 3054 spin_unlock(&hinfo->lhash2[st->bucket].lock); 3055 3056 return expected; 3057 } 3058 3059 static unsigned int 
bpf_iter_tcp_established_batch(struct seq_file *seq, 3060 struct sock *start_sk) 3061 { 3062 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 3063 struct bpf_tcp_iter_state *iter = seq->private; 3064 struct tcp_iter_state *st = &iter->state; 3065 struct hlist_nulls_node *node; 3066 unsigned int expected = 1; 3067 struct sock *sk; 3068 3069 sock_hold(start_sk); 3070 iter->batch[iter->end_sk++] = start_sk; 3071 3072 sk = sk_nulls_next(start_sk); 3073 sk_nulls_for_each_from(sk, node) { 3074 if (seq_sk_match(seq, sk)) { 3075 if (iter->end_sk < iter->max_sk) { 3076 sock_hold(sk); 3077 iter->batch[iter->end_sk++] = sk; 3078 } 3079 expected++; 3080 } 3081 } 3082 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 3083 3084 return expected; 3085 } 3086 3087 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq) 3088 { 3089 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 3090 struct bpf_tcp_iter_state *iter = seq->private; 3091 struct tcp_iter_state *st = &iter->state; 3092 unsigned int expected; 3093 bool resized = false; 3094 struct sock *sk; 3095 3096 /* The st->bucket is done. Directly advance to the next 3097 * bucket instead of having the tcp_seek_last_pos() to skip 3098 * one by one in the current bucket and eventually find out 3099 * it has to advance to the next bucket. 3100 */ 3101 if (iter->st_bucket_done) { 3102 st->offset = 0; 3103 st->bucket++; 3104 if (st->state == TCP_SEQ_STATE_LISTENING && 3105 st->bucket > hinfo->lhash2_mask) { 3106 st->state = TCP_SEQ_STATE_ESTABLISHED; 3107 st->bucket = 0; 3108 } 3109 } 3110 3111 again: 3112 /* Get a new batch */ 3113 iter->cur_sk = 0; 3114 iter->end_sk = 0; 3115 iter->st_bucket_done = false; 3116 3117 sk = tcp_seek_last_pos(seq); 3118 if (!sk) 3119 return NULL; /* Done */ 3120 3121 if (st->state == TCP_SEQ_STATE_LISTENING) 3122 expected = bpf_iter_tcp_listening_batch(seq, sk); 3123 else 3124 expected = bpf_iter_tcp_established_batch(seq, sk); 3125 3126 if (iter->end_sk == expected) { 3127 iter->st_bucket_done = true; 3128 return sk; 3129 } 3130 3131 if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) { 3132 resized = true; 3133 goto again; 3134 } 3135 3136 return sk; 3137 } 3138 3139 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos) 3140 { 3141 /* bpf iter does not support lseek, so it always 3142 * continue from where it was stop()-ped. 3143 */ 3144 if (*pos) 3145 return bpf_iter_tcp_batch(seq); 3146 3147 return SEQ_START_TOKEN; 3148 } 3149 3150 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3151 { 3152 struct bpf_tcp_iter_state *iter = seq->private; 3153 struct tcp_iter_state *st = &iter->state; 3154 struct sock *sk; 3155 3156 /* Whenever seq_next() is called, the iter->cur_sk is 3157 * done with seq_show(), so advance to the next sk in 3158 * the batch. 3159 */ 3160 if (iter->cur_sk < iter->end_sk) { 3161 /* Keeping st->num consistent in tcp_iter_state. 3162 * bpf_iter_tcp does not use st->num. 3163 * meta.seq_num is used instead. 3164 */ 3165 st->num++; 3166 /* Move st->offset to the next sk in the bucket such that 3167 * the future start() will resume at st->offset in 3168 * st->bucket. See tcp_seek_last_pos(). 3169 */ 3170 st->offset++; 3171 sock_gen_put(iter->batch[iter->cur_sk++]); 3172 } 3173 3174 if (iter->cur_sk < iter->end_sk) 3175 sk = iter->batch[iter->cur_sk]; 3176 else 3177 sk = bpf_iter_tcp_batch(seq); 3178 3179 ++*pos; 3180 /* Keeping st->last_pos consistent in tcp_iter_state. 
3181 * bpf iter does not do lseek, so st->last_pos always equals to *pos. 3182 */ 3183 st->last_pos = *pos; 3184 return sk; 3185 } 3186 3187 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v) 3188 { 3189 struct bpf_iter_meta meta; 3190 struct bpf_prog *prog; 3191 struct sock *sk = v; 3192 uid_t uid; 3193 int ret; 3194 3195 if (v == SEQ_START_TOKEN) 3196 return 0; 3197 3198 if (sk_fullsock(sk)) 3199 lock_sock(sk); 3200 3201 if (unlikely(sk_unhashed(sk))) { 3202 ret = SEQ_SKIP; 3203 goto unlock; 3204 } 3205 3206 if (sk->sk_state == TCP_TIME_WAIT) { 3207 uid = 0; 3208 } else if (sk->sk_state == TCP_NEW_SYN_RECV) { 3209 const struct request_sock *req = v; 3210 3211 uid = from_kuid_munged(seq_user_ns(seq), 3212 sock_i_uid(req->rsk_listener)); 3213 } else { 3214 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); 3215 } 3216 3217 meta.seq = seq; 3218 prog = bpf_iter_get_info(&meta, false); 3219 ret = tcp_prog_seq_show(prog, &meta, v, uid); 3220 3221 unlock: 3222 if (sk_fullsock(sk)) 3223 release_sock(sk); 3224 return ret; 3225 3226 } 3227 3228 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v) 3229 { 3230 struct bpf_tcp_iter_state *iter = seq->private; 3231 struct bpf_iter_meta meta; 3232 struct bpf_prog *prog; 3233 3234 if (!v) { 3235 meta.seq = seq; 3236 prog = bpf_iter_get_info(&meta, true); 3237 if (prog) 3238 (void)tcp_prog_seq_show(prog, &meta, v, 0); 3239 } 3240 3241 if (iter->cur_sk < iter->end_sk) { 3242 bpf_iter_tcp_put_batch(iter); 3243 iter->st_bucket_done = false; 3244 } 3245 } 3246 3247 static const struct seq_operations bpf_iter_tcp_seq_ops = { 3248 .show = bpf_iter_tcp_seq_show, 3249 .start = bpf_iter_tcp_seq_start, 3250 .next = bpf_iter_tcp_seq_next, 3251 .stop = bpf_iter_tcp_seq_stop, 3252 }; 3253 #endif 3254 static unsigned short seq_file_family(const struct seq_file *seq) 3255 { 3256 const struct tcp_seq_afinfo *afinfo; 3257 3258 #ifdef CONFIG_BPF_SYSCALL 3259 /* Iterated from bpf_iter. Let the bpf prog to filter instead. */ 3260 if (seq->op == &bpf_iter_tcp_seq_ops) 3261 return AF_UNSPEC; 3262 #endif 3263 3264 /* Iterated from proc fs */ 3265 afinfo = pde_data(file_inode(seq->file)); 3266 return afinfo->family; 3267 } 3268 3269 static const struct seq_operations tcp4_seq_ops = { 3270 .show = tcp4_seq_show, 3271 .start = tcp_seq_start, 3272 .next = tcp_seq_next, 3273 .stop = tcp_seq_stop, 3274 }; 3275 3276 static struct tcp_seq_afinfo tcp4_seq_afinfo = { 3277 .family = AF_INET, 3278 }; 3279 3280 static int __net_init tcp4_proc_init_net(struct net *net) 3281 { 3282 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops, 3283 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo)) 3284 return -ENOMEM; 3285 return 0; 3286 } 3287 3288 static void __net_exit tcp4_proc_exit_net(struct net *net) 3289 { 3290 remove_proc_entry("tcp", net->proc_net); 3291 } 3292 3293 static struct pernet_operations tcp4_net_ops = { 3294 .init = tcp4_proc_init_net, 3295 .exit = tcp4_proc_exit_net, 3296 }; 3297 3298 int __init tcp4_proc_init(void) 3299 { 3300 return register_pernet_subsys(&tcp4_net_ops); 3301 } 3302 3303 void tcp4_proc_exit(void) 3304 { 3305 unregister_pernet_subsys(&tcp4_net_ops); 3306 } 3307 #endif /* CONFIG_PROC_FS */ 3308 3309 /* @wake is one when sk_stream_write_space() calls us. 3310 * This sends EPOLLOUT only if notsent_bytes is half the limit. 3311 * This mimics the strategy used in sock_def_write_space(). 
3312 */ 3313 bool tcp_stream_memory_free(const struct sock *sk, int wake) 3314 { 3315 const struct tcp_sock *tp = tcp_sk(sk); 3316 u32 notsent_bytes = READ_ONCE(tp->write_seq) - 3317 READ_ONCE(tp->snd_nxt); 3318 3319 return (notsent_bytes << wake) < tcp_notsent_lowat(tp); 3320 } 3321 EXPORT_SYMBOL(tcp_stream_memory_free); 3322 3323 struct proto tcp_prot = { 3324 .name = "TCP", 3325 .owner = THIS_MODULE, 3326 .close = tcp_close, 3327 .pre_connect = tcp_v4_pre_connect, 3328 .connect = tcp_v4_connect, 3329 .disconnect = tcp_disconnect, 3330 .accept = inet_csk_accept, 3331 .ioctl = tcp_ioctl, 3332 .init = tcp_v4_init_sock, 3333 .destroy = tcp_v4_destroy_sock, 3334 .shutdown = tcp_shutdown, 3335 .setsockopt = tcp_setsockopt, 3336 .getsockopt = tcp_getsockopt, 3337 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt, 3338 .keepalive = tcp_set_keepalive, 3339 .recvmsg = tcp_recvmsg, 3340 .sendmsg = tcp_sendmsg, 3341 .splice_eof = tcp_splice_eof, 3342 .backlog_rcv = tcp_v4_do_rcv, 3343 .release_cb = tcp_release_cb, 3344 .hash = inet_hash, 3345 .unhash = inet_unhash, 3346 .get_port = inet_csk_get_port, 3347 .put_port = inet_put_port, 3348 #ifdef CONFIG_BPF_SYSCALL 3349 .psock_update_sk_prot = tcp_bpf_update_proto, 3350 #endif 3351 .enter_memory_pressure = tcp_enter_memory_pressure, 3352 .leave_memory_pressure = tcp_leave_memory_pressure, 3353 .stream_memory_free = tcp_stream_memory_free, 3354 .sockets_allocated = &tcp_sockets_allocated, 3355 .orphan_count = &tcp_orphan_count, 3356 3357 .memory_allocated = &tcp_memory_allocated, 3358 .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, 3359 3360 .memory_pressure = &tcp_memory_pressure, 3361 .sysctl_mem = sysctl_tcp_mem, 3362 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), 3363 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), 3364 .max_header = MAX_TCP_HEADER, 3365 .obj_size = sizeof(struct tcp_sock), 3366 .slab_flags = SLAB_TYPESAFE_BY_RCU, 3367 .twsk_prot = &tcp_timewait_sock_ops, 3368 .rsk_prot = &tcp_request_sock_ops, 3369 .h.hashinfo = NULL, 3370 .no_autobind = true, 3371 .diag_destroy = tcp_abort, 3372 }; 3373 EXPORT_SYMBOL(tcp_prot); 3374 3375 static void __net_exit tcp_sk_exit(struct net *net) 3376 { 3377 if (net->ipv4.tcp_congestion_control) 3378 bpf_module_put(net->ipv4.tcp_congestion_control, 3379 net->ipv4.tcp_congestion_control->owner); 3380 } 3381 3382 static void __net_init tcp_set_hashinfo(struct net *net) 3383 { 3384 struct inet_hashinfo *hinfo; 3385 unsigned int ehash_entries; 3386 struct net *old_net; 3387 3388 if (net_eq(net, &init_net)) 3389 goto fallback; 3390 3391 old_net = current->nsproxy->net_ns; 3392 ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries); 3393 if (!ehash_entries) 3394 goto fallback; 3395 3396 ehash_entries = roundup_pow_of_two(ehash_entries); 3397 hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries); 3398 if (!hinfo) { 3399 pr_warn("Failed to allocate TCP ehash (entries: %u) " 3400 "for a netns, fallback to the global one\n", 3401 ehash_entries); 3402 fallback: 3403 hinfo = &tcp_hashinfo; 3404 ehash_entries = tcp_hashinfo.ehash_mask + 1; 3405 } 3406 3407 net->ipv4.tcp_death_row.hashinfo = hinfo; 3408 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2; 3409 net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128); 3410 } 3411 3412 static int __net_init tcp_sk_init(struct net *net) 3413 { 3414 net->ipv4.sysctl_tcp_ecn = 2; 3415 net->ipv4.sysctl_tcp_ecn_fallback = 1; 3416 3417 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; 3418 
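	/* The defaults that follow are per-netns and can be overridden at
	 * runtime through the corresponding net.ipv4.tcp_* sysctls.  The
	 * MSS and probe values below bound packetization-layer path MTU
	 * discovery (RFC 4821); the keepalive and retry values mirror the
	 * long-standing TCP_* constants from net/tcp.h.
	 */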
	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;

	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;

	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
	net->ipv4.sysctl_tcp_syncookies = 1;
	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
	net->ipv4.sysctl_tcp_orphan_retries = 0;
	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
	net->ipv4.sysctl_tcp_tw_reuse = 2;
	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;

	refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
	tcp_set_hashinfo(net);

	net->ipv4.sysctl_tcp_sack = 1;
	net->ipv4.sysctl_tcp_window_scaling = 1;
	net->ipv4.sysctl_tcp_timestamps = 1;
	net->ipv4.sysctl_tcp_early_retrans = 3;
	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
	net->ipv4.sysctl_tcp_retrans_collapse = 1;
	net->ipv4.sysctl_tcp_max_reordering = 300;
	net->ipv4.sysctl_tcp_dsack = 1;
	net->ipv4.sysctl_tcp_app_win = 31;
	net->ipv4.sysctl_tcp_adv_win_scale = 1;
	net->ipv4.sysctl_tcp_frto = 2;
	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
	/* This limits the percentage of the congestion window which we
	 * will allow a single TSO frame to consume. Building TSO frames
	 * which are too large can cause TCP streams to be bursty.
	 */
	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
	/* Default TSQ limit of 16 TSO segments */
	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;

	/* rfc5961 challenge ack rate limiting, per net-ns, disabled by default.
*/ 3464 net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX; 3465 3466 net->ipv4.sysctl_tcp_min_tso_segs = 2; 3467 net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */ 3468 net->ipv4.sysctl_tcp_min_rtt_wlen = 300; 3469 net->ipv4.sysctl_tcp_autocorking = 1; 3470 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; 3471 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; 3472 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120; 3473 if (net != &init_net) { 3474 memcpy(net->ipv4.sysctl_tcp_rmem, 3475 init_net.ipv4.sysctl_tcp_rmem, 3476 sizeof(init_net.ipv4.sysctl_tcp_rmem)); 3477 memcpy(net->ipv4.sysctl_tcp_wmem, 3478 init_net.ipv4.sysctl_tcp_wmem, 3479 sizeof(init_net.ipv4.sysctl_tcp_wmem)); 3480 } 3481 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC; 3482 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC; 3483 net->ipv4.sysctl_tcp_comp_sack_nr = 44; 3484 net->ipv4.sysctl_tcp_backlog_ack_defer = 1; 3485 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; 3486 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0; 3487 atomic_set(&net->ipv4.tfo_active_disable_times, 0); 3488 3489 /* Set default values for PLB */ 3490 net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */ 3491 net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3; 3492 net->ipv4.sysctl_tcp_plb_rehash_rounds = 12; 3493 net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60; 3494 /* Default congestion threshold for PLB to mark a round is 50% */ 3495 net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2; 3496 3497 /* Reno is always built in */ 3498 if (!net_eq(net, &init_net) && 3499 bpf_try_module_get(init_net.ipv4.tcp_congestion_control, 3500 init_net.ipv4.tcp_congestion_control->owner)) 3501 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control; 3502 else 3503 net->ipv4.tcp_congestion_control = &tcp_reno; 3504 3505 net->ipv4.sysctl_tcp_syn_linear_timeouts = 4; 3506 net->ipv4.sysctl_tcp_shrink_window = 0; 3507 3508 net->ipv4.sysctl_tcp_pingpong_thresh = 1; 3509 3510 return 0; 3511 } 3512 3513 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) 3514 { 3515 struct net *net; 3516 3517 tcp_twsk_purge(net_exit_list); 3518 3519 list_for_each_entry(net, net_exit_list, exit_list) { 3520 inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo); 3521 WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount)); 3522 tcp_fastopen_ctx_destroy(net); 3523 } 3524 } 3525 3526 static struct pernet_operations __net_initdata tcp_sk_ops = { 3527 .init = tcp_sk_init, 3528 .exit = tcp_sk_exit, 3529 .exit_batch = tcp_sk_exit_batch, 3530 }; 3531 3532 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3533 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta, 3534 struct sock_common *sk_common, uid_t uid) 3535 3536 #define INIT_BATCH_SZ 16 3537 3538 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux) 3539 { 3540 struct bpf_tcp_iter_state *iter = priv_data; 3541 int err; 3542 3543 err = bpf_iter_init_seq_net(priv_data, aux); 3544 if (err) 3545 return err; 3546 3547 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ); 3548 if (err) { 3549 bpf_iter_fini_seq_net(priv_data); 3550 return err; 3551 } 3552 3553 return 0; 3554 } 3555 3556 static void bpf_iter_fini_tcp(void *priv_data) 3557 { 3558 struct bpf_tcp_iter_state *iter = priv_data; 3559 3560 bpf_iter_fini_seq_net(priv_data); 3561 kvfree(iter->batch); 3562 } 3563 3564 static const struct bpf_iter_seq_info tcp_seq_info = { 3565 .seq_ops = &bpf_iter_tcp_seq_ops, 3566 .init_seq_private = bpf_iter_init_tcp, 3567 
	.fini_seq_private = bpf_iter_fini_tcp,
	.seq_priv_size = sizeof(struct bpf_tcp_iter_state),
};

static const struct bpf_func_proto *
bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
			    const struct bpf_prog *prog)
{
	switch (func_id) {
	case BPF_FUNC_setsockopt:
		return &bpf_sk_setsockopt_proto;
	case BPF_FUNC_getsockopt:
		return &bpf_sk_getsockopt_proto;
	default:
		return NULL;
	}
}

static struct bpf_iter_reg tcp_reg_info = {
	.target = "tcp",
	.ctx_arg_info_size = 1,
	.ctx_arg_info = {
		{ offsetof(struct bpf_iter__tcp, sk_common),
		  PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
	},
	.get_func_proto = bpf_iter_tcp_get_func_proto,
	.seq_info = &tcp_seq_info,
};

static void __init bpf_iter_register(void)
{
	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
	if (bpf_iter_reg_target(&tcp_reg_info))
		pr_warn("Warning: could not register bpf iterator tcp\n");
}

#endif

void __init tcp_v4_init(void)
{
	int cpu, res;

	for_each_possible_cpu(cpu) {
		struct sock *sk;

		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
					   IPPROTO_TCP, &init_net);
		if (res)
			panic("Failed to create the TCP control socket.\n");
		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

		/* Please enforce IP_DF and IPID==0 for RST and
		 * ACK sent in SYN-RECV and TIME-WAIT state.
		 */
		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;

		per_cpu(ipv4_tcp_sk, cpu) = sk;
	}
	if (register_pernet_subsys(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");

#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
	bpf_iter_register();
#endif
}
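
/* Illustrative sketches only: the two helpers below are not used anywhere in
 * this file and their names are made up.  They restate, in isolation, the
 * arithmetic used by tcp_add_backlog() and tcp_stream_memory_free() above,
 * assuming nothing beyond the basic types and min_t() already available here.
 */

/* How tcp_add_backlog() bounds the data a socket may queue while its owner
 * holds the lock: up to twice sk_rcvbuf, plus half of sk_sndbuf, plus 64KB
 * of headroom, clamped to UINT_MAX.
 */
static inline u64 tcp_backlog_limit_sketch(u32 rcvbuf, u32 sndbuf)
{
	u64 limit = ((u64)rcvbuf) << 1;	/* up to 2 * sk_rcvbuf ... */

	limit += sndbuf >> 1;		/* ... plus half of sk_sndbuf ... */
	limit += 64 * 1024;		/* ... plus a little headroom */

	return min_t(u64, limit, UINT_MAX);
}

/* The wake-up test used by tcp_stream_memory_free(): when called from
 * sk_stream_write_space() (@wake == 1) the not-yet-sent byte count is
 * doubled, so EPOLLOUT is only reported once the unsent backlog has fallen
 * below half of the notsent_lowat threshold.
 */
static inline bool tcp_notsent_wake_sketch(u32 write_seq, u32 snd_nxt,
					   u32 notsent_lowat, int wake)
{
	u32 notsent_bytes = write_seq - snd_nxt;

	return (notsent_bytes << wake) < notsent_lowat;
}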