// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 */

/*
 * Changes:
 *	David S. Miller		:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *	David S. Miller		:	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *	Andi Kleen		:	Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *	Andi Kleen		:	Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *	Mike McLagan		:	Routing by source
 *	Juan Jose Ciarlante	:	ip_dynaddr bits
 *	Andi Kleen		:	various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/sched.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>
#include <net/rstreason.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/inetdevice.h>
#include <linux/btf_ids.h>

#include <crypto/hash.h>
#include <linux/scatterlist.h>

#include <trace/events/tcp.h>

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);

static u32 tcp_v4_init_seq(const struct sk_buff *skb)
{
	return secure_tcp_seq(ip_hdr(skb)->daddr,
			      ip_hdr(skb)->saddr,
			      tcp_hdr(skb)->dest,
			      tcp_hdr(skb)->source);
}

static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
{
	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
}

int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
	const struct inet_timewait_sock *tw = inet_twsk(sktw);
	const struct
tcp_timewait_sock *tcptw = tcp_twsk(sktw); 116 struct tcp_sock *tp = tcp_sk(sk); 117 118 if (reuse == 2) { 119 /* Still does not detect *everything* that goes through 120 * lo, since we require a loopback src or dst address 121 * or direct binding to 'lo' interface. 122 */ 123 bool loopback = false; 124 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX) 125 loopback = true; 126 #if IS_ENABLED(CONFIG_IPV6) 127 if (tw->tw_family == AF_INET6) { 128 if (ipv6_addr_loopback(&tw->tw_v6_daddr) || 129 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) || 130 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) || 131 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr)) 132 loopback = true; 133 } else 134 #endif 135 { 136 if (ipv4_is_loopback(tw->tw_daddr) || 137 ipv4_is_loopback(tw->tw_rcv_saddr)) 138 loopback = true; 139 } 140 if (!loopback) 141 reuse = 0; 142 } 143 144 /* With PAWS, it is safe from the viewpoint 145 of data integrity. Even without PAWS it is safe provided sequence 146 spaces do not overlap i.e. at data rates <= 80Mbit/sec. 147 148 Actually, the idea is close to VJ's one, only timestamp cache is 149 held not per host, but per port pair and TW bucket is used as state 150 holder. 151 152 If TW bucket has been already destroyed we fall back to VJ's scheme 153 and use initial timestamp retrieved from peer table. 154 */ 155 if (tcptw->tw_ts_recent_stamp && 156 (!twp || (reuse && time_after32(ktime_get_seconds(), 157 tcptw->tw_ts_recent_stamp)))) { 158 /* In case of repair and re-using TIME-WAIT sockets we still 159 * want to be sure that it is safe as above but honor the 160 * sequence numbers and time stamps set as part of the repair 161 * process. 162 * 163 * Without this check re-using a TIME-WAIT socket with TCP 164 * repair would accumulate a -1 on the repair assigned 165 * sequence number. The first time it is reused the sequence 166 * is -1, the second time -2, etc. This fixes that issue 167 * without appearing to create any others. 168 */ 169 if (likely(!tp->repair)) { 170 u32 seq = tcptw->tw_snd_nxt + 65535 + 2; 171 172 if (!seq) 173 seq = 1; 174 WRITE_ONCE(tp->write_seq, seq); 175 tp->rx_opt.ts_recent = tcptw->tw_ts_recent; 176 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; 177 } 178 sock_hold(sktw); 179 return 1; 180 } 181 182 return 0; 183 } 184 EXPORT_SYMBOL_GPL(tcp_twsk_unique); 185 186 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr, 187 int addr_len) 188 { 189 /* This check is replicated from tcp_v4_connect() and intended to 190 * prevent BPF program called below from accessing bytes that are out 191 * of the bound specified by user in addr_len. 192 */ 193 if (addr_len < sizeof(struct sockaddr_in)) 194 return -EINVAL; 195 196 sock_owned_by_me(sk); 197 198 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len); 199 } 200 201 /* This will initiate an outgoing connection. 
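 *
 * This is reached via connect(2) on an AF_INET stream socket, dispatched
 * through inet_stream_connect() to sk->sk_prot->connect().  A minimal
 * userspace sketch for reference (address and port are placeholders,
 * error handling omitted):
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *	};
 *	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *	connect(fd, (struct sockaddr *)&dst, sizeof(dst));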
*/ 202 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) 203 { 204 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; 205 struct inet_timewait_death_row *tcp_death_row; 206 struct inet_sock *inet = inet_sk(sk); 207 struct tcp_sock *tp = tcp_sk(sk); 208 struct ip_options_rcu *inet_opt; 209 struct net *net = sock_net(sk); 210 __be16 orig_sport, orig_dport; 211 __be32 daddr, nexthop; 212 struct flowi4 *fl4; 213 struct rtable *rt; 214 int err; 215 216 if (addr_len < sizeof(struct sockaddr_in)) 217 return -EINVAL; 218 219 if (usin->sin_family != AF_INET) 220 return -EAFNOSUPPORT; 221 222 nexthop = daddr = usin->sin_addr.s_addr; 223 inet_opt = rcu_dereference_protected(inet->inet_opt, 224 lockdep_sock_is_held(sk)); 225 if (inet_opt && inet_opt->opt.srr) { 226 if (!daddr) 227 return -EINVAL; 228 nexthop = inet_opt->opt.faddr; 229 } 230 231 orig_sport = inet->inet_sport; 232 orig_dport = usin->sin_port; 233 fl4 = &inet->cork.fl.u.ip4; 234 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr, 235 sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport, 236 orig_dport, sk); 237 if (IS_ERR(rt)) { 238 err = PTR_ERR(rt); 239 if (err == -ENETUNREACH) 240 IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); 241 return err; 242 } 243 244 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { 245 ip_rt_put(rt); 246 return -ENETUNREACH; 247 } 248 249 if (!inet_opt || !inet_opt->opt.srr) 250 daddr = fl4->daddr; 251 252 tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; 253 254 if (!inet->inet_saddr) { 255 err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET); 256 if (err) { 257 ip_rt_put(rt); 258 return err; 259 } 260 } else { 261 sk_rcv_saddr_set(sk, inet->inet_saddr); 262 } 263 264 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) { 265 /* Reset inherited state */ 266 tp->rx_opt.ts_recent = 0; 267 tp->rx_opt.ts_recent_stamp = 0; 268 if (likely(!tp->repair)) 269 WRITE_ONCE(tp->write_seq, 0); 270 } 271 272 inet->inet_dport = usin->sin_port; 273 sk_daddr_set(sk, daddr); 274 275 inet_csk(sk)->icsk_ext_hdr_len = 0; 276 if (inet_opt) 277 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 278 279 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT; 280 281 /* Socket identity is still unknown (sport may be zero). 282 * However we set state to SYN-SENT and not releasing socket 283 * lock select source port, enter ourselves into the hash tables and 284 * complete initialization after this. 285 */ 286 tcp_set_state(sk, TCP_SYN_SENT); 287 err = inet_hash_connect(tcp_death_row, sk); 288 if (err) 289 goto failure; 290 291 sk_set_txhash(sk); 292 293 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport, 294 inet->inet_sport, inet->inet_dport, sk); 295 if (IS_ERR(rt)) { 296 err = PTR_ERR(rt); 297 rt = NULL; 298 goto failure; 299 } 300 tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst); 301 /* OK, now commit destination to socket. 
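 *
 * From here on the route is attached to the socket via sk_setup_caps(),
 * the initial sequence number and timestamp offset are chosen via
 * secure_tcp_seq()/secure_tcp_ts_off(), and tcp_connect() emits the SYN,
 * unless TCP Fast Open defers it until the first write.  A hedged
 * userspace sketch of the deferred case (fd, dst, buf and len are
 * placeholders; TCP_FASTOPEN_CONNECT is the real socket option):
 *
 *	int one = 1;
 *	setsockopt(fd, IPPROTO_TCP, TCP_FASTOPEN_CONNECT, &one, sizeof(one));
 *	connect(fd, (struct sockaddr *)&dst, sizeof(dst)); // returns 0, SYN deferred
 *	send(fd, buf, len, 0);                             // SYN (+ data) goes out here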
*/ 302 sk->sk_gso_type = SKB_GSO_TCPV4; 303 sk_setup_caps(sk, &rt->dst); 304 rt = NULL; 305 306 if (likely(!tp->repair)) { 307 if (!tp->write_seq) 308 WRITE_ONCE(tp->write_seq, 309 secure_tcp_seq(inet->inet_saddr, 310 inet->inet_daddr, 311 inet->inet_sport, 312 usin->sin_port)); 313 WRITE_ONCE(tp->tsoffset, 314 secure_tcp_ts_off(net, inet->inet_saddr, 315 inet->inet_daddr)); 316 } 317 318 atomic_set(&inet->inet_id, get_random_u16()); 319 320 if (tcp_fastopen_defer_connect(sk, &err)) 321 return err; 322 if (err) 323 goto failure; 324 325 err = tcp_connect(sk); 326 327 if (err) 328 goto failure; 329 330 return 0; 331 332 failure: 333 /* 334 * This unhashes the socket and releases the local port, 335 * if necessary. 336 */ 337 tcp_set_state(sk, TCP_CLOSE); 338 inet_bhash2_reset_saddr(sk); 339 ip_rt_put(rt); 340 sk->sk_route_caps = 0; 341 inet->inet_dport = 0; 342 return err; 343 } 344 EXPORT_SYMBOL(tcp_v4_connect); 345 346 /* 347 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191. 348 * It can be called through tcp_release_cb() if socket was owned by user 349 * at the time tcp_v4_err() was called to handle ICMP message. 350 */ 351 void tcp_v4_mtu_reduced(struct sock *sk) 352 { 353 struct inet_sock *inet = inet_sk(sk); 354 struct dst_entry *dst; 355 u32 mtu; 356 357 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) 358 return; 359 mtu = READ_ONCE(tcp_sk(sk)->mtu_info); 360 dst = inet_csk_update_pmtu(sk, mtu); 361 if (!dst) 362 return; 363 364 /* Something is about to be wrong... Remember soft error 365 * for the case, if this connection will not able to recover. 366 */ 367 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst)) 368 WRITE_ONCE(sk->sk_err_soft, EMSGSIZE); 369 370 mtu = dst_mtu(dst); 371 372 if (inet->pmtudisc != IP_PMTUDISC_DONT && 373 ip_sk_accept_pmtu(sk) && 374 inet_csk(sk)->icsk_pmtu_cookie > mtu) { 375 tcp_sync_mss(sk, mtu); 376 377 /* Resend the TCP packet because it's 378 * clear that the old packet has been 379 * dropped. This is the new "fast" path mtu 380 * discovery. 381 */ 382 tcp_simple_retransmit(sk); 383 } /* else let the usual retransmit timer handle it */ 384 } 385 EXPORT_SYMBOL(tcp_v4_mtu_reduced); 386 387 static void do_redirect(struct sk_buff *skb, struct sock *sk) 388 { 389 struct dst_entry *dst = __sk_dst_check(sk, 0); 390 391 if (dst) 392 dst->ops->redirect(dst, sk, skb); 393 } 394 395 396 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */ 397 void tcp_req_err(struct sock *sk, u32 seq, bool abort) 398 { 399 struct request_sock *req = inet_reqsk(sk); 400 struct net *net = sock_net(sk); 401 402 /* ICMPs are not backlogged, hence we cannot get 403 * an established socket here. 404 */ 405 if (seq != tcp_rsk(req)->snt_isn) { 406 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); 407 } else if (abort) { 408 /* 409 * Still in SYN_RECV, just remove it silently. 410 * There is no good way to pass the error to the newly 411 * created socket, and POSIX does not want network 412 * errors returned from accept(). 
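 *
 * Dropping the request here means the handshake simply dies: the
 * listener never sees the failed attempt (only its drop counters via
 * tcp_listendrop() below), and the peer's connection attempt times out
 * or is retried on its side.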
413 */ 414 inet_csk_reqsk_queue_drop(req->rsk_listener, req); 415 tcp_listendrop(req->rsk_listener); 416 } 417 reqsk_put(req); 418 } 419 EXPORT_SYMBOL(tcp_req_err); 420 421 /* TCP-LD (RFC 6069) logic */ 422 void tcp_ld_RTO_revert(struct sock *sk, u32 seq) 423 { 424 struct inet_connection_sock *icsk = inet_csk(sk); 425 struct tcp_sock *tp = tcp_sk(sk); 426 struct sk_buff *skb; 427 s32 remaining; 428 u32 delta_us; 429 430 if (sock_owned_by_user(sk)) 431 return; 432 433 if (seq != tp->snd_una || !icsk->icsk_retransmits || 434 !icsk->icsk_backoff) 435 return; 436 437 skb = tcp_rtx_queue_head(sk); 438 if (WARN_ON_ONCE(!skb)) 439 return; 440 441 icsk->icsk_backoff--; 442 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT; 443 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); 444 445 tcp_mstamp_refresh(tp); 446 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb)); 447 remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us); 448 449 if (remaining > 0) { 450 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, 451 remaining, TCP_RTO_MAX); 452 } else { 453 /* RTO revert clocked out retransmission. 454 * Will retransmit now. 455 */ 456 tcp_retransmit_timer(sk); 457 } 458 } 459 EXPORT_SYMBOL(tcp_ld_RTO_revert); 460 461 /* 462 * This routine is called by the ICMP module when it gets some 463 * sort of error condition. If err < 0 then the socket should 464 * be closed and the error returned to the user. If err > 0 465 * it's just the icmp type << 8 | icmp code. After adjustment 466 * header points to the first 8 bytes of the tcp header. We need 467 * to find the appropriate port. 468 * 469 * The locking strategy used here is very "optimistic". When 470 * someone else accesses the socket the ICMP is just dropped 471 * and for some paths there is no check at all. 472 * A more general error queue to queue errors for later handling 473 * is probably better. 474 * 475 */ 476 477 int tcp_v4_err(struct sk_buff *skb, u32 info) 478 { 479 const struct iphdr *iph = (const struct iphdr *)skb->data; 480 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2)); 481 struct tcp_sock *tp; 482 const int type = icmp_hdr(skb)->type; 483 const int code = icmp_hdr(skb)->code; 484 struct sock *sk; 485 struct request_sock *fastopen; 486 u32 seq, snd_una; 487 int err; 488 struct net *net = dev_net(skb->dev); 489 490 sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, 491 iph->daddr, th->dest, iph->saddr, 492 ntohs(th->source), inet_iif(skb), 0); 493 if (!sk) { 494 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); 495 return -ENOENT; 496 } 497 if (sk->sk_state == TCP_TIME_WAIT) { 498 /* To increase the counter of ignored icmps for TCP-AO */ 499 tcp_ao_ignore_icmp(sk, AF_INET, type, code); 500 inet_twsk_put(inet_twsk(sk)); 501 return 0; 502 } 503 seq = ntohl(th->seq); 504 if (sk->sk_state == TCP_NEW_SYN_RECV) { 505 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB || 506 type == ICMP_TIME_EXCEEDED || 507 (type == ICMP_DEST_UNREACH && 508 (code == ICMP_NET_UNREACH || 509 code == ICMP_HOST_UNREACH))); 510 return 0; 511 } 512 513 if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) { 514 sock_put(sk); 515 return 0; 516 } 517 518 bh_lock_sock(sk); 519 /* If too many ICMPs get dropped on busy 520 * servers this needs to be solved differently. 521 * We do take care of PMTU discovery (RFC1191) special case : 522 * we can receive locally generated ICMP messages while socket is held. 
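 *
 * If the socket is owned by user context, a PMTU update is only recorded
 * in tp->mtu_info and TCP_MTU_REDUCED_DEFERRED is set; tcp_release_cb()
 * then calls tcp_v4_mtu_reduced() once the lock is released.
 *
 * On an established connection, how other errors reach the application
 * depends on IP_RECVERR: with it set, sk_err is filled in and reported
 * right away (when the socket is not owned by the user); without it only
 * sk_err_soft is updated and the error surfaces at timeout.  A hedged
 * userspace sketch (fd is a placeholder):
 *
 *	int on = 1;
 *	setsockopt(fd, IPPROTO_IP, IP_RECVERR, &on, sizeof(on));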
523 */ 524 if (sock_owned_by_user(sk)) { 525 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)) 526 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS); 527 } 528 if (sk->sk_state == TCP_CLOSE) 529 goto out; 530 531 if (static_branch_unlikely(&ip4_min_ttl)) { 532 /* min_ttl can be changed concurrently from do_ip_setsockopt() */ 533 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) { 534 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 535 goto out; 536 } 537 } 538 539 tp = tcp_sk(sk); 540 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ 541 fastopen = rcu_dereference(tp->fastopen_rsk); 542 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una; 543 if (sk->sk_state != TCP_LISTEN && 544 !between(seq, snd_una, tp->snd_nxt)) { 545 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); 546 goto out; 547 } 548 549 switch (type) { 550 case ICMP_REDIRECT: 551 if (!sock_owned_by_user(sk)) 552 do_redirect(skb, sk); 553 goto out; 554 case ICMP_SOURCE_QUENCH: 555 /* Just silently ignore these. */ 556 goto out; 557 case ICMP_PARAMETERPROB: 558 err = EPROTO; 559 break; 560 case ICMP_DEST_UNREACH: 561 if (code > NR_ICMP_UNREACH) 562 goto out; 563 564 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ 565 /* We are not interested in TCP_LISTEN and open_requests 566 * (SYN-ACKs send out by Linux are always <576bytes so 567 * they should go through unfragmented). 568 */ 569 if (sk->sk_state == TCP_LISTEN) 570 goto out; 571 572 WRITE_ONCE(tp->mtu_info, info); 573 if (!sock_owned_by_user(sk)) { 574 tcp_v4_mtu_reduced(sk); 575 } else { 576 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags)) 577 sock_hold(sk); 578 } 579 goto out; 580 } 581 582 err = icmp_err_convert[code].errno; 583 /* check if this ICMP message allows revert of backoff. 584 * (see RFC 6069) 585 */ 586 if (!fastopen && 587 (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH)) 588 tcp_ld_RTO_revert(sk, seq); 589 break; 590 case ICMP_TIME_EXCEEDED: 591 err = EHOSTUNREACH; 592 break; 593 default: 594 goto out; 595 } 596 597 switch (sk->sk_state) { 598 case TCP_SYN_SENT: 599 case TCP_SYN_RECV: 600 /* Only in fast or simultaneous open. If a fast open socket is 601 * already accepted it is treated as a connected one below. 602 */ 603 if (fastopen && !fastopen->sk) 604 break; 605 606 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th); 607 608 if (!sock_owned_by_user(sk)) { 609 WRITE_ONCE(sk->sk_err, err); 610 611 sk_error_report(sk); 612 613 tcp_done(sk); 614 } else { 615 WRITE_ONCE(sk->sk_err_soft, err); 616 } 617 goto out; 618 } 619 620 /* If we've already connected we will keep trying 621 * until we time out, or the user gives up. 622 * 623 * rfc1122 4.2.3.9 allows to consider as hard errors 624 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too, 625 * but it is obsoleted by pmtu discovery). 626 * 627 * Note, that in modern internet, where routing is unreliable 628 * and in each dark corner broken firewalls sit, sending random 629 * errors ordered by their masters even this two messages finally lose 630 * their original sense (even Linux sends invalid PORT_UNREACHs) 631 * 632 * Now we are in compliance with RFCs. 
633 * --ANK (980905) 634 */ 635 636 if (!sock_owned_by_user(sk) && 637 inet_test_bit(RECVERR, sk)) { 638 WRITE_ONCE(sk->sk_err, err); 639 sk_error_report(sk); 640 } else { /* Only an error on timeout */ 641 WRITE_ONCE(sk->sk_err_soft, err); 642 } 643 644 out: 645 bh_unlock_sock(sk); 646 sock_put(sk); 647 return 0; 648 } 649 650 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr) 651 { 652 struct tcphdr *th = tcp_hdr(skb); 653 654 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0); 655 skb->csum_start = skb_transport_header(skb) - skb->head; 656 skb->csum_offset = offsetof(struct tcphdr, check); 657 } 658 659 /* This routine computes an IPv4 TCP checksum. */ 660 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb) 661 { 662 const struct inet_sock *inet = inet_sk(sk); 663 664 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr); 665 } 666 EXPORT_SYMBOL(tcp_v4_send_check); 667 668 #define REPLY_OPTIONS_LEN (MAX_TCP_OPTION_SPACE / sizeof(__be32)) 669 670 static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb, 671 const struct tcp_ao_hdr *aoh, 672 struct ip_reply_arg *arg, struct tcphdr *reply, 673 __be32 reply_options[REPLY_OPTIONS_LEN]) 674 { 675 #ifdef CONFIG_TCP_AO 676 int sdif = tcp_v4_sdif(skb); 677 int dif = inet_iif(skb); 678 int l3index = sdif ? dif : 0; 679 bool allocated_traffic_key; 680 struct tcp_ao_key *key; 681 char *traffic_key; 682 bool drop = true; 683 u32 ao_sne = 0; 684 u8 keyid; 685 686 rcu_read_lock(); 687 if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq), 688 &key, &traffic_key, &allocated_traffic_key, 689 &keyid, &ao_sne)) 690 goto out; 691 692 reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) | 693 (aoh->rnext_keyid << 8) | keyid); 694 arg->iov[0].iov_len += tcp_ao_len_aligned(key); 695 reply->doff = arg->iov[0].iov_len / 4; 696 697 if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1], 698 key, traffic_key, 699 (union tcp_ao_addr *)&ip_hdr(skb)->saddr, 700 (union tcp_ao_addr *)&ip_hdr(skb)->daddr, 701 reply, ao_sne)) 702 goto out; 703 drop = false; 704 out: 705 rcu_read_unlock(); 706 if (allocated_traffic_key) 707 kfree(traffic_key); 708 return drop; 709 #else 710 return true; 711 #endif 712 } 713 714 /* 715 * This routine will send an RST to the other tcp. 716 * 717 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.) 718 * for reset. 719 * Answer: if a packet caused RST, it is not for a socket 720 * existing in our system, if it is matched to a socket, 721 * it is just duplicate segment or bug in other side's TCP. 722 * So that we build reply only basing on parameters 723 * arrived with segment. 724 * Exception: precedence violation. We do not implement it in any case. 725 */ 726 727 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb, 728 enum sk_rst_reason reason) 729 { 730 const struct tcphdr *th = tcp_hdr(skb); 731 struct { 732 struct tcphdr th; 733 __be32 opt[REPLY_OPTIONS_LEN]; 734 } rep; 735 const __u8 *md5_hash_location = NULL; 736 const struct tcp_ao_hdr *aoh; 737 struct ip_reply_arg arg; 738 #ifdef CONFIG_TCP_MD5SIG 739 struct tcp_md5sig_key *key = NULL; 740 unsigned char newhash[16]; 741 struct sock *sk1 = NULL; 742 int genhash; 743 #endif 744 u64 transmit_time = 0; 745 struct sock *ctl_sk; 746 struct net *net; 747 u32 txhash = 0; 748 749 /* Never send a reset in response to a reset. */ 750 if (th->rst) 751 return; 752 753 /* If sk not NULL, it means we did a successful lookup and incoming 754 * route had to be correct. 
prequeue might have dropped our dst. 755 */ 756 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL) 757 return; 758 759 /* Swap the send and the receive. */ 760 memset(&rep, 0, sizeof(rep)); 761 rep.th.dest = th->source; 762 rep.th.source = th->dest; 763 rep.th.doff = sizeof(struct tcphdr) / 4; 764 rep.th.rst = 1; 765 766 if (th->ack) { 767 rep.th.seq = th->ack_seq; 768 } else { 769 rep.th.ack = 1; 770 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin + 771 skb->len - (th->doff << 2)); 772 } 773 774 memset(&arg, 0, sizeof(arg)); 775 arg.iov[0].iov_base = (unsigned char *)&rep; 776 arg.iov[0].iov_len = sizeof(rep.th); 777 778 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev); 779 780 /* Invalid TCP option size or twice included auth */ 781 if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh)) 782 return; 783 784 if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt)) 785 return; 786 787 #ifdef CONFIG_TCP_MD5SIG 788 rcu_read_lock(); 789 if (sk && sk_fullsock(sk)) { 790 const union tcp_md5_addr *addr; 791 int l3index; 792 793 /* sdif set, means packet ingressed via a device 794 * in an L3 domain and inet_iif is set to it. 795 */ 796 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; 797 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 798 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 799 } else if (md5_hash_location) { 800 const union tcp_md5_addr *addr; 801 int sdif = tcp_v4_sdif(skb); 802 int dif = inet_iif(skb); 803 int l3index; 804 805 /* 806 * active side is lost. Try to find listening socket through 807 * source port, and then find md5 key through listening socket. 808 * we are not loose security here: 809 * Incoming packet is checked with md5 hash with finding key, 810 * no RST generated if md5 hash doesn't match. 811 */ 812 sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo, 813 NULL, 0, ip_hdr(skb)->saddr, 814 th->source, ip_hdr(skb)->daddr, 815 ntohs(th->source), dif, sdif); 816 /* don't send rst if it can't find key */ 817 if (!sk1) 818 goto out; 819 820 /* sdif set, means packet ingressed via a device 821 * in an L3 domain and dif is set to it. 822 */ 823 l3index = sdif ? dif : 0; 824 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 825 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET); 826 if (!key) 827 goto out; 828 829 830 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb); 831 if (genhash || memcmp(md5_hash_location, newhash, 16) != 0) 832 goto out; 833 834 } 835 836 if (key) { 837 rep.opt[0] = htonl((TCPOPT_NOP << 24) | 838 (TCPOPT_NOP << 16) | 839 (TCPOPT_MD5SIG << 8) | 840 TCPOLEN_MD5SIG); 841 /* Update length and the length the header thinks exists */ 842 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; 843 rep.th.doff = arg.iov[0].iov_len / 4; 844 845 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1], 846 key, ip_hdr(skb)->saddr, 847 ip_hdr(skb)->daddr, &rep.th); 848 } 849 #endif 850 /* Can't co-exist with TCPMD5, hence check rep.opt[0] */ 851 if (rep.opt[0] == 0) { 852 __be32 mrst = mptcp_reset_option(skb); 853 854 if (mrst) { 855 rep.opt[0] = mrst; 856 arg.iov[0].iov_len += sizeof(mrst); 857 rep.th.doff = arg.iov[0].iov_len / 4; 858 } 859 } 860 861 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 862 ip_hdr(skb)->saddr, /* XXX */ 863 arg.iov[0].iov_len, IPPROTO_TCP, 0); 864 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 865 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0; 866 867 /* When socket is gone, all binding information is lost. 868 * routing might fail in this case. 
No choice here, if we choose to force 869 * input interface, we will misroute in case of asymmetric route. 870 */ 871 if (sk) 872 arg.bound_dev_if = sk->sk_bound_dev_if; 873 874 trace_tcp_send_reset(sk, skb, reason); 875 876 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) != 877 offsetof(struct inet_timewait_sock, tw_bound_dev_if)); 878 879 arg.tos = ip_hdr(skb)->tos; 880 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); 881 local_bh_disable(); 882 ctl_sk = this_cpu_read(ipv4_tcp_sk); 883 sock_net_set(ctl_sk, net); 884 if (sk) { 885 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 886 inet_twsk(sk)->tw_mark : sk->sk_mark; 887 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? 888 inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority); 889 transmit_time = tcp_transmit_time(sk); 890 xfrm_sk_clone_policy(ctl_sk, sk); 891 txhash = (sk->sk_state == TCP_TIME_WAIT) ? 892 inet_twsk(sk)->tw_txhash : sk->sk_txhash; 893 } else { 894 ctl_sk->sk_mark = 0; 895 ctl_sk->sk_priority = 0; 896 } 897 ip_send_unicast_reply(ctl_sk, 898 skb, &TCP_SKB_CB(skb)->header.h4.opt, 899 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 900 &arg, arg.iov[0].iov_len, 901 transmit_time, txhash); 902 903 xfrm_sk_free_policy(ctl_sk); 904 sock_net_set(ctl_sk, &init_net); 905 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); 906 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS); 907 local_bh_enable(); 908 909 #ifdef CONFIG_TCP_MD5SIG 910 out: 911 rcu_read_unlock(); 912 #endif 913 } 914 915 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states 916 outside socket context is ugly, certainly. What can I do? 917 */ 918 919 static void tcp_v4_send_ack(const struct sock *sk, 920 struct sk_buff *skb, u32 seq, u32 ack, 921 u32 win, u32 tsval, u32 tsecr, int oif, 922 struct tcp_key *key, 923 int reply_flags, u8 tos, u32 txhash) 924 { 925 const struct tcphdr *th = tcp_hdr(skb); 926 struct { 927 struct tcphdr th; 928 __be32 opt[(MAX_TCP_OPTION_SPACE >> 2)]; 929 } rep; 930 struct net *net = sock_net(sk); 931 struct ip_reply_arg arg; 932 struct sock *ctl_sk; 933 u64 transmit_time; 934 935 memset(&rep.th, 0, sizeof(struct tcphdr)); 936 memset(&arg, 0, sizeof(arg)); 937 938 arg.iov[0].iov_base = (unsigned char *)&rep; 939 arg.iov[0].iov_len = sizeof(rep.th); 940 if (tsecr) { 941 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | 942 (TCPOPT_TIMESTAMP << 8) | 943 TCPOLEN_TIMESTAMP); 944 rep.opt[1] = htonl(tsval); 945 rep.opt[2] = htonl(tsecr); 946 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED; 947 } 948 949 /* Swap the send and the receive. */ 950 rep.th.dest = th->source; 951 rep.th.source = th->dest; 952 rep.th.doff = arg.iov[0].iov_len / 4; 953 rep.th.seq = htonl(seq); 954 rep.th.ack_seq = htonl(ack); 955 rep.th.ack = 1; 956 rep.th.window = htons(win); 957 958 #ifdef CONFIG_TCP_MD5SIG 959 if (tcp_key_is_md5(key)) { 960 int offset = (tsecr) ? 3 : 0; 961 962 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) | 963 (TCPOPT_NOP << 16) | 964 (TCPOPT_MD5SIG << 8) | 965 TCPOLEN_MD5SIG); 966 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; 967 rep.th.doff = arg.iov[0].iov_len/4; 968 969 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset], 970 key->md5_key, ip_hdr(skb)->saddr, 971 ip_hdr(skb)->daddr, &rep.th); 972 } 973 #endif 974 #ifdef CONFIG_TCP_AO 975 if (tcp_key_is_ao(key)) { 976 int offset = (tsecr) ? 
3 : 0; 977 978 rep.opt[offset++] = htonl((TCPOPT_AO << 24) | 979 (tcp_ao_len(key->ao_key) << 16) | 980 (key->ao_key->sndid << 8) | 981 key->rcv_next); 982 arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key); 983 rep.th.doff = arg.iov[0].iov_len / 4; 984 985 tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset], 986 key->ao_key, key->traffic_key, 987 (union tcp_ao_addr *)&ip_hdr(skb)->saddr, 988 (union tcp_ao_addr *)&ip_hdr(skb)->daddr, 989 &rep.th, key->sne); 990 } 991 #endif 992 arg.flags = reply_flags; 993 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 994 ip_hdr(skb)->saddr, /* XXX */ 995 arg.iov[0].iov_len, IPPROTO_TCP, 0); 996 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 997 if (oif) 998 arg.bound_dev_if = oif; 999 arg.tos = tos; 1000 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL); 1001 local_bh_disable(); 1002 ctl_sk = this_cpu_read(ipv4_tcp_sk); 1003 sock_net_set(ctl_sk, net); 1004 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 1005 inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark); 1006 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? 1007 inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority); 1008 transmit_time = tcp_transmit_time(sk); 1009 ip_send_unicast_reply(ctl_sk, 1010 skb, &TCP_SKB_CB(skb)->header.h4.opt, 1011 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 1012 &arg, arg.iov[0].iov_len, 1013 transmit_time, txhash); 1014 1015 sock_net_set(ctl_sk, &init_net); 1016 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); 1017 local_bh_enable(); 1018 } 1019 1020 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) 1021 { 1022 struct inet_timewait_sock *tw = inet_twsk(sk); 1023 struct tcp_timewait_sock *tcptw = tcp_twsk(sk); 1024 struct tcp_key key = {}; 1025 #ifdef CONFIG_TCP_AO 1026 struct tcp_ao_info *ao_info; 1027 1028 if (static_branch_unlikely(&tcp_ao_needed.key)) { 1029 /* FIXME: the segment to-be-acked is not verified yet */ 1030 ao_info = rcu_dereference(tcptw->ao_info); 1031 if (ao_info) { 1032 const struct tcp_ao_hdr *aoh; 1033 1034 if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) { 1035 inet_twsk_put(tw); 1036 return; 1037 } 1038 1039 if (aoh) 1040 key.ao_key = tcp_ao_established_key(ao_info, aoh->rnext_keyid, -1); 1041 } 1042 } 1043 if (key.ao_key) { 1044 struct tcp_ao_key *rnext_key; 1045 1046 key.traffic_key = snd_other_key(key.ao_key); 1047 key.sne = READ_ONCE(ao_info->snd_sne); 1048 rnext_key = READ_ONCE(ao_info->rnext_key); 1049 key.rcv_next = rnext_key->rcvid; 1050 key.type = TCP_KEY_AO; 1051 #else 1052 if (0) { 1053 #endif 1054 #ifdef CONFIG_TCP_MD5SIG 1055 } else if (static_branch_unlikely(&tcp_md5_needed.key)) { 1056 key.md5_key = tcp_twsk_md5_key(tcptw); 1057 if (key.md5_key) 1058 key.type = TCP_KEY_MD5; 1059 #endif 1060 } 1061 1062 tcp_v4_send_ack(sk, skb, 1063 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, 1064 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, 1065 tcp_tw_tsval(tcptw), 1066 tcptw->tw_ts_recent, 1067 tw->tw_bound_dev_if, &key, 1068 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0, 1069 tw->tw_tos, 1070 tw->tw_txhash); 1071 1072 inet_twsk_put(tw); 1073 } 1074 1075 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, 1076 struct request_sock *req) 1077 { 1078 struct tcp_key key = {}; 1079 1080 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV 1081 * sk->sk_state == TCP_SYN_RECV -> for Fast Open. 1082 */ 1083 u32 seq = (sk->sk_state == TCP_LISTEN) ? 
tcp_rsk(req)->snt_isn + 1 : 1084 tcp_sk(sk)->snd_nxt; 1085 1086 #ifdef CONFIG_TCP_AO 1087 if (static_branch_unlikely(&tcp_ao_needed.key) && 1088 tcp_rsk_used_ao(req)) { 1089 const union tcp_md5_addr *addr; 1090 const struct tcp_ao_hdr *aoh; 1091 int l3index; 1092 1093 /* Invalid TCP option size or twice included auth */ 1094 if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) 1095 return; 1096 if (!aoh) 1097 return; 1098 1099 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 1100 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; 1101 key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, 1102 aoh->rnext_keyid, -1); 1103 if (unlikely(!key.ao_key)) { 1104 /* Send ACK with any matching MKT for the peer */ 1105 key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1); 1106 /* Matching key disappeared (user removed the key?) 1107 * let the handshake timeout. 1108 */ 1109 if (!key.ao_key) { 1110 net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n", 1111 addr, 1112 ntohs(tcp_hdr(skb)->source), 1113 &ip_hdr(skb)->daddr, 1114 ntohs(tcp_hdr(skb)->dest)); 1115 return; 1116 } 1117 } 1118 key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC); 1119 if (!key.traffic_key) 1120 return; 1121 1122 key.type = TCP_KEY_AO; 1123 key.rcv_next = aoh->keyid; 1124 tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req); 1125 #else 1126 if (0) { 1127 #endif 1128 #ifdef CONFIG_TCP_MD5SIG 1129 } else if (static_branch_unlikely(&tcp_md5_needed.key)) { 1130 const union tcp_md5_addr *addr; 1131 int l3index; 1132 1133 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 1134 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; 1135 key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1136 if (key.md5_key) 1137 key.type = TCP_KEY_MD5; 1138 #endif 1139 } 1140 1141 /* RFC 7323 2.3 1142 * The window field (SEG.WND) of every outgoing segment, with the 1143 * exception of <SYN> segments, MUST be right-shifted by 1144 * Rcv.Wind.Shift bits: 1145 */ 1146 tcp_v4_send_ack(sk, skb, seq, 1147 tcp_rsk(req)->rcv_nxt, 1148 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, 1149 tcp_rsk_tsval(tcp_rsk(req)), 1150 READ_ONCE(req->ts_recent), 1151 0, &key, 1152 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0, 1153 ip_hdr(skb)->tos, 1154 READ_ONCE(tcp_rsk(req)->txhash)); 1155 if (tcp_key_is_ao(&key)) 1156 kfree(key.traffic_key); 1157 } 1158 1159 /* 1160 * Send a SYN-ACK after having received a SYN. 1161 * This still operates on a request_sock only, not on a big 1162 * socket. 1163 */ 1164 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, 1165 struct flowi *fl, 1166 struct request_sock *req, 1167 struct tcp_fastopen_cookie *foc, 1168 enum tcp_synack_type synack_type, 1169 struct sk_buff *syn_skb) 1170 { 1171 const struct inet_request_sock *ireq = inet_rsk(req); 1172 struct flowi4 fl4; 1173 int err = -1; 1174 struct sk_buff *skb; 1175 u8 tos; 1176 1177 /* First, grab a route. 
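 *
 * The SYN-ACK is built against this route (the advertised MSS and window
 * come from its metrics) and then transmitted along it by
 * ip_build_and_send_pkt().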
*/ 1178 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) 1179 return -1; 1180 1181 skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb); 1182 1183 if (skb) { 1184 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); 1185 1186 tos = READ_ONCE(inet_sk(sk)->tos); 1187 1188 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)) 1189 tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) | 1190 (tos & INET_ECN_MASK); 1191 1192 if (!INET_ECN_is_capable(tos) && 1193 tcp_bpf_ca_needs_ecn((struct sock *)req)) 1194 tos |= INET_ECN_ECT_0; 1195 1196 rcu_read_lock(); 1197 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, 1198 ireq->ir_rmt_addr, 1199 rcu_dereference(ireq->ireq_opt), 1200 tos); 1201 rcu_read_unlock(); 1202 err = net_xmit_eval(err); 1203 } 1204 1205 return err; 1206 } 1207 1208 /* 1209 * IPv4 request_sock destructor. 1210 */ 1211 static void tcp_v4_reqsk_destructor(struct request_sock *req) 1212 { 1213 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1)); 1214 } 1215 1216 #ifdef CONFIG_TCP_MD5SIG 1217 /* 1218 * RFC2385 MD5 checksumming requires a mapping of 1219 * IP address->MD5 Key. 1220 * We need to maintain these in the sk structure. 1221 */ 1222 1223 DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ); 1224 EXPORT_SYMBOL(tcp_md5_needed); 1225 1226 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new) 1227 { 1228 if (!old) 1229 return true; 1230 1231 /* l3index always overrides non-l3index */ 1232 if (old->l3index && new->l3index == 0) 1233 return false; 1234 if (old->l3index == 0 && new->l3index) 1235 return true; 1236 1237 return old->prefixlen < new->prefixlen; 1238 } 1239 1240 /* Find the Key structure for an address. */ 1241 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index, 1242 const union tcp_md5_addr *addr, 1243 int family, bool any_l3index) 1244 { 1245 const struct tcp_sock *tp = tcp_sk(sk); 1246 struct tcp_md5sig_key *key; 1247 const struct tcp_md5sig_info *md5sig; 1248 __be32 mask; 1249 struct tcp_md5sig_key *best_match = NULL; 1250 bool match; 1251 1252 /* caller either holds rcu_read_lock() or socket lock */ 1253 md5sig = rcu_dereference_check(tp->md5sig_info, 1254 lockdep_sock_is_held(sk)); 1255 if (!md5sig) 1256 return NULL; 1257 1258 hlist_for_each_entry_rcu(key, &md5sig->head, node, 1259 lockdep_sock_is_held(sk)) { 1260 if (key->family != family) 1261 continue; 1262 if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX && 1263 key->l3index != l3index) 1264 continue; 1265 if (family == AF_INET) { 1266 mask = inet_make_mask(key->prefixlen); 1267 match = (key->addr.a4.s_addr & mask) == 1268 (addr->a4.s_addr & mask); 1269 #if IS_ENABLED(CONFIG_IPV6) 1270 } else if (family == AF_INET6) { 1271 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6, 1272 key->prefixlen); 1273 #endif 1274 } else { 1275 match = false; 1276 } 1277 1278 if (match && better_md5_match(best_match, key)) 1279 best_match = key; 1280 } 1281 return best_match; 1282 } 1283 EXPORT_SYMBOL(__tcp_md5_do_lookup); 1284 1285 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk, 1286 const union tcp_md5_addr *addr, 1287 int family, u8 prefixlen, 1288 int l3index, u8 flags) 1289 { 1290 const struct tcp_sock *tp = tcp_sk(sk); 1291 struct tcp_md5sig_key *key; 1292 unsigned int size = sizeof(struct in_addr); 1293 const struct tcp_md5sig_info *md5sig; 1294 1295 /* caller either holds rcu_read_lock() or socket lock */ 1296 md5sig = rcu_dereference_check(tp->md5sig_info, 1297 
lockdep_sock_is_held(sk)); 1298 if (!md5sig) 1299 return NULL; 1300 #if IS_ENABLED(CONFIG_IPV6) 1301 if (family == AF_INET6) 1302 size = sizeof(struct in6_addr); 1303 #endif 1304 hlist_for_each_entry_rcu(key, &md5sig->head, node, 1305 lockdep_sock_is_held(sk)) { 1306 if (key->family != family) 1307 continue; 1308 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX)) 1309 continue; 1310 if (key->l3index != l3index) 1311 continue; 1312 if (!memcmp(&key->addr, addr, size) && 1313 key->prefixlen == prefixlen) 1314 return key; 1315 } 1316 return NULL; 1317 } 1318 1319 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, 1320 const struct sock *addr_sk) 1321 { 1322 const union tcp_md5_addr *addr; 1323 int l3index; 1324 1325 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), 1326 addr_sk->sk_bound_dev_if); 1327 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr; 1328 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1329 } 1330 EXPORT_SYMBOL(tcp_v4_md5_lookup); 1331 1332 static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp) 1333 { 1334 struct tcp_sock *tp = tcp_sk(sk); 1335 struct tcp_md5sig_info *md5sig; 1336 1337 md5sig = kmalloc(sizeof(*md5sig), gfp); 1338 if (!md5sig) 1339 return -ENOMEM; 1340 1341 sk_gso_disable(sk); 1342 INIT_HLIST_HEAD(&md5sig->head); 1343 rcu_assign_pointer(tp->md5sig_info, md5sig); 1344 return 0; 1345 } 1346 1347 /* This can be called on a newly created socket, from other files */ 1348 static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, 1349 int family, u8 prefixlen, int l3index, u8 flags, 1350 const u8 *newkey, u8 newkeylen, gfp_t gfp) 1351 { 1352 /* Add Key to the list */ 1353 struct tcp_md5sig_key *key; 1354 struct tcp_sock *tp = tcp_sk(sk); 1355 struct tcp_md5sig_info *md5sig; 1356 1357 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags); 1358 if (key) { 1359 /* Pre-existing entry - just update that one. 1360 * Note that the key might be used concurrently. 1361 * data_race() is telling kcsan that we do not care of 1362 * key mismatches, since changing MD5 key on live flows 1363 * can lead to packet drops. 1364 */ 1365 data_race(memcpy(key->key, newkey, newkeylen)); 1366 1367 /* Pairs with READ_ONCE() in tcp_md5_hash_key(). 1368 * Also note that a reader could catch new key->keylen value 1369 * but old key->key[], this is the reason we use __GFP_ZERO 1370 * at sock_kmalloc() time below these lines. 1371 */ 1372 WRITE_ONCE(key->keylen, newkeylen); 1373 1374 return 0; 1375 } 1376 1377 md5sig = rcu_dereference_protected(tp->md5sig_info, 1378 lockdep_sock_is_held(sk)); 1379 1380 key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO); 1381 if (!key) 1382 return -ENOMEM; 1383 1384 memcpy(key->key, newkey, newkeylen); 1385 key->keylen = newkeylen; 1386 key->family = family; 1387 key->prefixlen = prefixlen; 1388 key->l3index = l3index; 1389 key->flags = flags; 1390 memcpy(&key->addr, addr, 1391 (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? 
sizeof(struct in6_addr) : 1392 sizeof(struct in_addr)); 1393 hlist_add_head_rcu(&key->node, &md5sig->head); 1394 return 0; 1395 } 1396 1397 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, 1398 int family, u8 prefixlen, int l3index, u8 flags, 1399 const u8 *newkey, u8 newkeylen) 1400 { 1401 struct tcp_sock *tp = tcp_sk(sk); 1402 1403 if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) { 1404 if (tcp_md5_alloc_sigpool()) 1405 return -ENOMEM; 1406 1407 if (tcp_md5sig_info_add(sk, GFP_KERNEL)) { 1408 tcp_md5_release_sigpool(); 1409 return -ENOMEM; 1410 } 1411 1412 if (!static_branch_inc(&tcp_md5_needed.key)) { 1413 struct tcp_md5sig_info *md5sig; 1414 1415 md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk)); 1416 rcu_assign_pointer(tp->md5sig_info, NULL); 1417 kfree_rcu(md5sig, rcu); 1418 tcp_md5_release_sigpool(); 1419 return -EUSERS; 1420 } 1421 } 1422 1423 return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags, 1424 newkey, newkeylen, GFP_KERNEL); 1425 } 1426 EXPORT_SYMBOL(tcp_md5_do_add); 1427 1428 int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr, 1429 int family, u8 prefixlen, int l3index, 1430 struct tcp_md5sig_key *key) 1431 { 1432 struct tcp_sock *tp = tcp_sk(sk); 1433 1434 if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) { 1435 tcp_md5_add_sigpool(); 1436 1437 if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC))) { 1438 tcp_md5_release_sigpool(); 1439 return -ENOMEM; 1440 } 1441 1442 if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) { 1443 struct tcp_md5sig_info *md5sig; 1444 1445 md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk)); 1446 net_warn_ratelimited("Too many TCP-MD5 keys in the system\n"); 1447 rcu_assign_pointer(tp->md5sig_info, NULL); 1448 kfree_rcu(md5sig, rcu); 1449 tcp_md5_release_sigpool(); 1450 return -EUSERS; 1451 } 1452 } 1453 1454 return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, 1455 key->flags, key->key, key->keylen, 1456 sk_gfp_mask(sk, GFP_ATOMIC)); 1457 } 1458 EXPORT_SYMBOL(tcp_md5_key_copy); 1459 1460 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family, 1461 u8 prefixlen, int l3index, u8 flags) 1462 { 1463 struct tcp_md5sig_key *key; 1464 1465 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags); 1466 if (!key) 1467 return -ENOENT; 1468 hlist_del_rcu(&key->node); 1469 atomic_sub(sizeof(*key), &sk->sk_omem_alloc); 1470 kfree_rcu(key, rcu); 1471 return 0; 1472 } 1473 EXPORT_SYMBOL(tcp_md5_do_del); 1474 1475 void tcp_clear_md5_list(struct sock *sk) 1476 { 1477 struct tcp_sock *tp = tcp_sk(sk); 1478 struct tcp_md5sig_key *key; 1479 struct hlist_node *n; 1480 struct tcp_md5sig_info *md5sig; 1481 1482 md5sig = rcu_dereference_protected(tp->md5sig_info, 1); 1483 1484 hlist_for_each_entry_safe(key, n, &md5sig->head, node) { 1485 hlist_del_rcu(&key->node); 1486 atomic_sub(sizeof(*key), &sk->sk_omem_alloc); 1487 kfree_rcu(key, rcu); 1488 } 1489 } 1490 1491 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, 1492 sockptr_t optval, int optlen) 1493 { 1494 struct tcp_md5sig cmd; 1495 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr; 1496 const union tcp_md5_addr *addr; 1497 u8 prefixlen = 32; 1498 int l3index = 0; 1499 bool l3flag; 1500 u8 flags; 1501 1502 if (optlen < sizeof(cmd)) 1503 return -EINVAL; 1504 1505 if (copy_from_sockptr(&cmd, optval, sizeof(cmd))) 1506 return -EFAULT; 1507 1508 if (sin->sin_family != 
AF_INET) 1509 return -EINVAL; 1510 1511 flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX; 1512 l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX; 1513 1514 if (optname == TCP_MD5SIG_EXT && 1515 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) { 1516 prefixlen = cmd.tcpm_prefixlen; 1517 if (prefixlen > 32) 1518 return -EINVAL; 1519 } 1520 1521 if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex && 1522 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) { 1523 struct net_device *dev; 1524 1525 rcu_read_lock(); 1526 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex); 1527 if (dev && netif_is_l3_master(dev)) 1528 l3index = dev->ifindex; 1529 1530 rcu_read_unlock(); 1531 1532 /* ok to reference set/not set outside of rcu; 1533 * right now device MUST be an L3 master 1534 */ 1535 if (!dev || !l3index) 1536 return -EINVAL; 1537 } 1538 1539 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr; 1540 1541 if (!cmd.tcpm_keylen) 1542 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags); 1543 1544 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) 1545 return -EINVAL; 1546 1547 /* Don't allow keys for peers that have a matching TCP-AO key. 1548 * See the comment in tcp_ao_add_cmd() 1549 */ 1550 if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false)) 1551 return -EKEYREJECTED; 1552 1553 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags, 1554 cmd.tcpm_key, cmd.tcpm_keylen); 1555 } 1556 1557 static int tcp_v4_md5_hash_headers(struct tcp_sigpool *hp, 1558 __be32 daddr, __be32 saddr, 1559 const struct tcphdr *th, int nbytes) 1560 { 1561 struct tcp4_pseudohdr *bp; 1562 struct scatterlist sg; 1563 struct tcphdr *_th; 1564 1565 bp = hp->scratch; 1566 bp->saddr = saddr; 1567 bp->daddr = daddr; 1568 bp->pad = 0; 1569 bp->protocol = IPPROTO_TCP; 1570 bp->len = cpu_to_be16(nbytes); 1571 1572 _th = (struct tcphdr *)(bp + 1); 1573 memcpy(_th, th, sizeof(*th)); 1574 _th->check = 0; 1575 1576 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th)); 1577 ahash_request_set_crypt(hp->req, &sg, NULL, 1578 sizeof(*bp) + sizeof(*th)); 1579 return crypto_ahash_update(hp->req); 1580 } 1581 1582 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, 1583 __be32 daddr, __be32 saddr, const struct tcphdr *th) 1584 { 1585 struct tcp_sigpool hp; 1586 1587 if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp)) 1588 goto clear_hash_nostart; 1589 1590 if (crypto_ahash_init(hp.req)) 1591 goto clear_hash; 1592 if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, th->doff << 2)) 1593 goto clear_hash; 1594 if (tcp_md5_hash_key(&hp, key)) 1595 goto clear_hash; 1596 ahash_request_set_crypt(hp.req, NULL, md5_hash, 0); 1597 if (crypto_ahash_final(hp.req)) 1598 goto clear_hash; 1599 1600 tcp_sigpool_end(&hp); 1601 return 0; 1602 1603 clear_hash: 1604 tcp_sigpool_end(&hp); 1605 clear_hash_nostart: 1606 memset(md5_hash, 0, 16); 1607 return 1; 1608 } 1609 1610 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, 1611 const struct sock *sk, 1612 const struct sk_buff *skb) 1613 { 1614 const struct tcphdr *th = tcp_hdr(skb); 1615 struct tcp_sigpool hp; 1616 __be32 saddr, daddr; 1617 1618 if (sk) { /* valid for establish/request sockets */ 1619 saddr = sk->sk_rcv_saddr; 1620 daddr = sk->sk_daddr; 1621 } else { 1622 const struct iphdr *iph = ip_hdr(skb); 1623 saddr = iph->saddr; 1624 daddr = iph->daddr; 1625 } 1626 1627 if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp)) 1628 goto clear_hash_nostart; 1629 1630 if (crypto_ahash_init(hp.req)) 1631 goto clear_hash; 1632 1633 if 
(tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, skb->len)) 1634 goto clear_hash; 1635 if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2)) 1636 goto clear_hash; 1637 if (tcp_md5_hash_key(&hp, key)) 1638 goto clear_hash; 1639 ahash_request_set_crypt(hp.req, NULL, md5_hash, 0); 1640 if (crypto_ahash_final(hp.req)) 1641 goto clear_hash; 1642 1643 tcp_sigpool_end(&hp); 1644 return 0; 1645 1646 clear_hash: 1647 tcp_sigpool_end(&hp); 1648 clear_hash_nostart: 1649 memset(md5_hash, 0, 16); 1650 return 1; 1651 } 1652 EXPORT_SYMBOL(tcp_v4_md5_hash_skb); 1653 1654 #endif 1655 1656 static void tcp_v4_init_req(struct request_sock *req, 1657 const struct sock *sk_listener, 1658 struct sk_buff *skb) 1659 { 1660 struct inet_request_sock *ireq = inet_rsk(req); 1661 struct net *net = sock_net(sk_listener); 1662 1663 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); 1664 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); 1665 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb)); 1666 } 1667 1668 static struct dst_entry *tcp_v4_route_req(const struct sock *sk, 1669 struct sk_buff *skb, 1670 struct flowi *fl, 1671 struct request_sock *req, 1672 u32 tw_isn) 1673 { 1674 tcp_v4_init_req(req, sk, skb); 1675 1676 if (security_inet_conn_request(sk, skb, req)) 1677 return NULL; 1678 1679 return inet_csk_route_req(sk, &fl->u.ip4, req); 1680 } 1681 1682 struct request_sock_ops tcp_request_sock_ops __read_mostly = { 1683 .family = PF_INET, 1684 .obj_size = sizeof(struct tcp_request_sock), 1685 .rtx_syn_ack = tcp_rtx_synack, 1686 .send_ack = tcp_v4_reqsk_send_ack, 1687 .destructor = tcp_v4_reqsk_destructor, 1688 .send_reset = tcp_v4_send_reset, 1689 .syn_ack_timeout = tcp_syn_ack_timeout, 1690 }; 1691 1692 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { 1693 .mss_clamp = TCP_MSS_DEFAULT, 1694 #ifdef CONFIG_TCP_MD5SIG 1695 .req_md5_lookup = tcp_v4_md5_lookup, 1696 .calc_md5_hash = tcp_v4_md5_hash_skb, 1697 #endif 1698 #ifdef CONFIG_TCP_AO 1699 .ao_lookup = tcp_v4_ao_lookup_rsk, 1700 .ao_calc_key = tcp_v4_ao_calc_key_rsk, 1701 .ao_synack_hash = tcp_v4_ao_synack_hash, 1702 #endif 1703 #ifdef CONFIG_SYN_COOKIES 1704 .cookie_init_seq = cookie_v4_init_sequence, 1705 #endif 1706 .route_req = tcp_v4_route_req, 1707 .init_seq = tcp_v4_init_seq, 1708 .init_ts_off = tcp_v4_init_ts_off, 1709 .send_synack = tcp_v4_send_synack, 1710 }; 1711 1712 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) 1713 { 1714 /* Never answer to SYNs send to broadcast or multicast */ 1715 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) 1716 goto drop; 1717 1718 return tcp_conn_request(&tcp_request_sock_ops, 1719 &tcp_request_sock_ipv4_ops, sk, skb); 1720 1721 drop: 1722 tcp_listendrop(sk); 1723 return 0; 1724 } 1725 EXPORT_SYMBOL(tcp_v4_conn_request); 1726 1727 1728 /* 1729 * The three way handshake has completed - we got a valid synack - 1730 * now create the new socket. 
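 * (For a passive open this runs on receipt of the final ACK of the
 * handshake, via tcp_check_req(); with Fast Open the child is created
 * already on the SYN that carries a valid cookie.)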
1731 */ 1732 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, 1733 struct request_sock *req, 1734 struct dst_entry *dst, 1735 struct request_sock *req_unhash, 1736 bool *own_req) 1737 { 1738 struct inet_request_sock *ireq; 1739 bool found_dup_sk = false; 1740 struct inet_sock *newinet; 1741 struct tcp_sock *newtp; 1742 struct sock *newsk; 1743 #ifdef CONFIG_TCP_MD5SIG 1744 const union tcp_md5_addr *addr; 1745 struct tcp_md5sig_key *key; 1746 int l3index; 1747 #endif 1748 struct ip_options_rcu *inet_opt; 1749 1750 if (sk_acceptq_is_full(sk)) 1751 goto exit_overflow; 1752 1753 newsk = tcp_create_openreq_child(sk, req, skb); 1754 if (!newsk) 1755 goto exit_nonewsk; 1756 1757 newsk->sk_gso_type = SKB_GSO_TCPV4; 1758 inet_sk_rx_dst_set(newsk, skb); 1759 1760 newtp = tcp_sk(newsk); 1761 newinet = inet_sk(newsk); 1762 ireq = inet_rsk(req); 1763 sk_daddr_set(newsk, ireq->ir_rmt_addr); 1764 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr); 1765 newsk->sk_bound_dev_if = ireq->ir_iif; 1766 newinet->inet_saddr = ireq->ir_loc_addr; 1767 inet_opt = rcu_dereference(ireq->ireq_opt); 1768 RCU_INIT_POINTER(newinet->inet_opt, inet_opt); 1769 newinet->mc_index = inet_iif(skb); 1770 newinet->mc_ttl = ip_hdr(skb)->ttl; 1771 newinet->rcv_tos = ip_hdr(skb)->tos; 1772 inet_csk(newsk)->icsk_ext_hdr_len = 0; 1773 if (inet_opt) 1774 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 1775 atomic_set(&newinet->inet_id, get_random_u16()); 1776 1777 /* Set ToS of the new socket based upon the value of incoming SYN. 1778 * ECT bits are set later in tcp_init_transfer(). 1779 */ 1780 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)) 1781 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK; 1782 1783 if (!dst) { 1784 dst = inet_csk_route_child_sock(sk, newsk, req); 1785 if (!dst) 1786 goto put_and_exit; 1787 } else { 1788 /* syncookie case : see end of cookie_v4_check() */ 1789 } 1790 sk_setup_caps(newsk, dst); 1791 1792 tcp_ca_openreq_child(newsk, dst); 1793 1794 tcp_sync_mss(newsk, dst_mtu(dst)); 1795 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst)); 1796 1797 tcp_initialize_rcv_mss(newsk); 1798 1799 #ifdef CONFIG_TCP_MD5SIG 1800 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif); 1801 /* Copy over the MD5 key from the original socket */ 1802 addr = (union tcp_md5_addr *)&newinet->inet_daddr; 1803 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1804 if (key && !tcp_rsk_used_ao(req)) { 1805 if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key)) 1806 goto put_and_exit; 1807 sk_gso_disable(newsk); 1808 } 1809 #endif 1810 #ifdef CONFIG_TCP_AO 1811 if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET)) 1812 goto put_and_exit; /* OOM, release back memory */ 1813 #endif 1814 1815 if (__inet_inherit_port(sk, newsk) < 0) 1816 goto put_and_exit; 1817 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), 1818 &found_dup_sk); 1819 if (likely(*own_req)) { 1820 tcp_move_syn(newtp, req); 1821 ireq->ireq_opt = NULL; 1822 } else { 1823 newinet->inet_opt = NULL; 1824 1825 if (!req_unhash && found_dup_sk) { 1826 /* This code path should only be executed in the 1827 * syncookie case only 1828 */ 1829 bh_unlock_sock(newsk); 1830 sock_put(newsk); 1831 newsk = NULL; 1832 } 1833 } 1834 return newsk; 1835 1836 exit_overflow: 1837 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); 1838 exit_nonewsk: 1839 dst_release(dst); 1840 exit: 1841 tcp_listendrop(sk); 1842 return NULL; 1843 put_and_exit: 1844 newinet->inet_opt = NULL; 1845 
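	/* Tear down the half-initialized child: it was never hashed or
	 * exposed, so it is disposed of directly with
	 * inet_csk_prepare_forced_close() + tcp_done() rather than a
	 * normal close.
	 */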
inet_csk_prepare_forced_close(newsk); 1846 tcp_done(newsk); 1847 goto exit; 1848 } 1849 EXPORT_SYMBOL(tcp_v4_syn_recv_sock); 1850 1851 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb) 1852 { 1853 #ifdef CONFIG_SYN_COOKIES 1854 const struct tcphdr *th = tcp_hdr(skb); 1855 1856 if (!th->syn) 1857 sk = cookie_v4_check(sk, skb); 1858 #endif 1859 return sk; 1860 } 1861 1862 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph, 1863 struct tcphdr *th, u32 *cookie) 1864 { 1865 u16 mss = 0; 1866 #ifdef CONFIG_SYN_COOKIES 1867 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops, 1868 &tcp_request_sock_ipv4_ops, sk, th); 1869 if (mss) { 1870 *cookie = __cookie_v4_init_sequence(iph, th, &mss); 1871 tcp_synq_overflow(sk); 1872 } 1873 #endif 1874 return mss; 1875 } 1876 1877 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, 1878 u32)); 1879 /* The socket must have it's spinlock held when we get 1880 * here, unless it is a TCP_LISTEN socket. 1881 * 1882 * We have a potential double-lock case here, so even when 1883 * doing backlog processing we use the BH locking scheme. 1884 * This is because we cannot sleep with the original spinlock 1885 * held. 1886 */ 1887 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) 1888 { 1889 enum skb_drop_reason reason; 1890 struct sock *rsk; 1891 1892 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ 1893 struct dst_entry *dst; 1894 1895 dst = rcu_dereference_protected(sk->sk_rx_dst, 1896 lockdep_sock_is_held(sk)); 1897 1898 sock_rps_save_rxhash(sk, skb); 1899 sk_mark_napi_id(sk, skb); 1900 if (dst) { 1901 if (sk->sk_rx_dst_ifindex != skb->skb_iif || 1902 !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check, 1903 dst, 0)) { 1904 RCU_INIT_POINTER(sk->sk_rx_dst, NULL); 1905 dst_release(dst); 1906 } 1907 } 1908 tcp_rcv_established(sk, skb); 1909 return 0; 1910 } 1911 1912 if (tcp_checksum_complete(skb)) 1913 goto csum_err; 1914 1915 if (sk->sk_state == TCP_LISTEN) { 1916 struct sock *nsk = tcp_v4_cookie_check(sk, skb); 1917 1918 if (!nsk) 1919 return 0; 1920 if (nsk != sk) { 1921 reason = tcp_child_process(sk, nsk, skb); 1922 if (reason) { 1923 rsk = nsk; 1924 goto reset; 1925 } 1926 return 0; 1927 } 1928 } else 1929 sock_rps_save_rxhash(sk, skb); 1930 1931 reason = tcp_rcv_state_process(sk, skb); 1932 if (reason) { 1933 rsk = sk; 1934 goto reset; 1935 } 1936 return 0; 1937 1938 reset: 1939 tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason)); 1940 discard: 1941 kfree_skb_reason(skb, reason); 1942 /* Be careful here. If this function gets more complicated and 1943 * gcc suffers from register pressure on the x86, sk (in %ebx) 1944 * might be destroyed here. This current version compiles correctly, 1945 * but you have been warned. 
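 * (The drop reason chosen above is handed to kfree_skb_reason(), so
 * these drops stay visible to tracing and the drop monitor.)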
1946 */ 1947 return 0; 1948 1949 csum_err: 1950 reason = SKB_DROP_REASON_TCP_CSUM; 1951 trace_tcp_bad_csum(skb); 1952 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 1953 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 1954 goto discard; 1955 } 1956 EXPORT_SYMBOL(tcp_v4_do_rcv); 1957 1958 int tcp_v4_early_demux(struct sk_buff *skb) 1959 { 1960 struct net *net = dev_net(skb->dev); 1961 const struct iphdr *iph; 1962 const struct tcphdr *th; 1963 struct sock *sk; 1964 1965 if (skb->pkt_type != PACKET_HOST) 1966 return 0; 1967 1968 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) 1969 return 0; 1970 1971 iph = ip_hdr(skb); 1972 th = tcp_hdr(skb); 1973 1974 if (th->doff < sizeof(struct tcphdr) / 4) 1975 return 0; 1976 1977 sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, 1978 iph->saddr, th->source, 1979 iph->daddr, ntohs(th->dest), 1980 skb->skb_iif, inet_sdif(skb)); 1981 if (sk) { 1982 skb->sk = sk; 1983 skb->destructor = sock_edemux; 1984 if (sk_fullsock(sk)) { 1985 struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst); 1986 1987 if (dst) 1988 dst = dst_check(dst, 0); 1989 if (dst && 1990 sk->sk_rx_dst_ifindex == skb->skb_iif) 1991 skb_dst_set_noref(skb, dst); 1992 } 1993 } 1994 return 0; 1995 } 1996 1997 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, 1998 enum skb_drop_reason *reason) 1999 { 2000 u32 tail_gso_size, tail_gso_segs; 2001 struct skb_shared_info *shinfo; 2002 const struct tcphdr *th; 2003 struct tcphdr *thtail; 2004 struct sk_buff *tail; 2005 unsigned int hdrlen; 2006 bool fragstolen; 2007 u32 gso_segs; 2008 u32 gso_size; 2009 u64 limit; 2010 int delta; 2011 2012 /* In case all data was pulled from skb frags (in __pskb_pull_tail()), 2013 * we can fix skb->truesize to its real value to avoid future drops. 2014 * This is valid because skb is not yet charged to the socket. 2015 * It has been noticed pure SACK packets were sometimes dropped 2016 * (if cooked by drivers without copybreak feature). 2017 */ 2018 skb_condense(skb); 2019 2020 skb_dst_drop(skb); 2021 2022 if (unlikely(tcp_checksum_complete(skb))) { 2023 bh_unlock_sock(sk); 2024 trace_tcp_bad_csum(skb); 2025 *reason = SKB_DROP_REASON_TCP_CSUM; 2026 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 2027 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 2028 return true; 2029 } 2030 2031 /* Attempt coalescing to last skb in backlog, even if we are 2032 * above the limits. 2033 * This is okay because skb capacity is limited to MAX_SKB_FRAGS. 
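 * (If skb_try_coalesce() cannot merge the new segment into the tail
 * skb, we fall back to the normal per-socket backlog limit check at
 * the no_coalesce label.)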
2034 */ 2035 th = (const struct tcphdr *)skb->data; 2036 hdrlen = th->doff * 4; 2037 2038 tail = sk->sk_backlog.tail; 2039 if (!tail) 2040 goto no_coalesce; 2041 thtail = (struct tcphdr *)tail->data; 2042 2043 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq || 2044 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield || 2045 ((TCP_SKB_CB(tail)->tcp_flags | 2046 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) || 2047 !((TCP_SKB_CB(tail)->tcp_flags & 2048 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) || 2049 ((TCP_SKB_CB(tail)->tcp_flags ^ 2050 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) || 2051 !mptcp_skb_can_collapse(tail, skb) || 2052 skb_cmp_decrypted(tail, skb) || 2053 thtail->doff != th->doff || 2054 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th))) 2055 goto no_coalesce; 2056 2057 __skb_pull(skb, hdrlen); 2058 2059 shinfo = skb_shinfo(skb); 2060 gso_size = shinfo->gso_size ?: skb->len; 2061 gso_segs = shinfo->gso_segs ?: 1; 2062 2063 shinfo = skb_shinfo(tail); 2064 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen); 2065 tail_gso_segs = shinfo->gso_segs ?: 1; 2066 2067 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) { 2068 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq; 2069 2070 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) { 2071 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq; 2072 thtail->window = th->window; 2073 } 2074 2075 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and 2076 * thtail->fin, so that the fast path in tcp_rcv_established() 2077 * is not entered if we append a packet with a FIN. 2078 * SYN, RST, URG are not present. 2079 * ACK is set on both packets. 2080 * PSH : we do not really care in TCP stack, 2081 * at least for 'GRO' packets. 2082 */ 2083 thtail->fin |= th->fin; 2084 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; 2085 2086 if (TCP_SKB_CB(skb)->has_rxtstamp) { 2087 TCP_SKB_CB(tail)->has_rxtstamp = true; 2088 tail->tstamp = skb->tstamp; 2089 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp; 2090 } 2091 2092 /* Not as strict as GRO. We only need to carry mss max value */ 2093 shinfo->gso_size = max(gso_size, tail_gso_size); 2094 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF); 2095 2096 sk->sk_backlog.len += delta; 2097 __NET_INC_STATS(sock_net(sk), 2098 LINUX_MIB_TCPBACKLOGCOALESCE); 2099 kfree_skb_partial(skb, fragstolen); 2100 return false; 2101 } 2102 __skb_push(skb, hdrlen); 2103 2104 no_coalesce: 2105 /* sk->sk_backlog.len is reset only at the end of __release_sock(). 2106 * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach 2107 * sk_rcvbuf in normal conditions. 2108 */ 2109 limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1; 2110 2111 limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1; 2112 2113 /* Only socket owner can try to collapse/prune rx queues 2114 * to reduce memory overhead, so add a little headroom here. 2115 * Few sockets backlog are possibly concurrently non empty. 
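 * (Rough example, assuming sk_rcvbuf and sk_sndbuf are both 1 MB: the
 * limit works out to 2 MB + 512 KB plus the 64 KB of headroom added
 * below, then clamped to UINT_MAX.)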
2116 */ 2117 limit += 64 * 1024; 2118 2119 limit = min_t(u64, limit, UINT_MAX); 2120 2121 if (unlikely(sk_add_backlog(sk, skb, limit))) { 2122 bh_unlock_sock(sk); 2123 *reason = SKB_DROP_REASON_SOCKET_BACKLOG; 2124 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP); 2125 return true; 2126 } 2127 return false; 2128 } 2129 EXPORT_SYMBOL(tcp_add_backlog); 2130 2131 int tcp_filter(struct sock *sk, struct sk_buff *skb) 2132 { 2133 struct tcphdr *th = (struct tcphdr *)skb->data; 2134 2135 return sk_filter_trim_cap(sk, skb, th->doff * 4); 2136 } 2137 EXPORT_SYMBOL(tcp_filter); 2138 2139 static void tcp_v4_restore_cb(struct sk_buff *skb) 2140 { 2141 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4, 2142 sizeof(struct inet_skb_parm)); 2143 } 2144 2145 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, 2146 const struct tcphdr *th) 2147 { 2148 /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB() 2149 * barrier() makes sure compiler wont play fool^Waliasing games. 2150 */ 2151 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb), 2152 sizeof(struct inet_skb_parm)); 2153 barrier(); 2154 2155 TCP_SKB_CB(skb)->seq = ntohl(th->seq); 2156 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + 2157 skb->len - th->doff * 4); 2158 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); 2159 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); 2160 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); 2161 TCP_SKB_CB(skb)->sacked = 0; 2162 TCP_SKB_CB(skb)->has_rxtstamp = 2163 skb->tstamp || skb_hwtstamps(skb)->hwtstamp; 2164 } 2165 2166 /* 2167 * From tcp_input.c 2168 */ 2169 2170 int tcp_v4_rcv(struct sk_buff *skb) 2171 { 2172 struct net *net = dev_net(skb->dev); 2173 enum skb_drop_reason drop_reason; 2174 int sdif = inet_sdif(skb); 2175 int dif = inet_iif(skb); 2176 const struct iphdr *iph; 2177 const struct tcphdr *th; 2178 bool refcounted; 2179 struct sock *sk; 2180 int ret; 2181 u32 isn; 2182 2183 drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; 2184 if (skb->pkt_type != PACKET_HOST) 2185 goto discard_it; 2186 2187 /* Count it even if it's bad */ 2188 __TCP_INC_STATS(net, TCP_MIB_INSEGS); 2189 2190 if (!pskb_may_pull(skb, sizeof(struct tcphdr))) 2191 goto discard_it; 2192 2193 th = (const struct tcphdr *)skb->data; 2194 2195 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) { 2196 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL; 2197 goto bad_packet; 2198 } 2199 if (!pskb_may_pull(skb, th->doff * 4)) 2200 goto discard_it; 2201 2202 /* An explanation is required here, I think. 2203 * Packet length and doff are validated by header prediction, 2204 * provided case of th->doff==0 is eliminated. 2205 * So, we defer the checks. 
*/ 2206 2207 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo)) 2208 goto csum_error; 2209 2210 th = (const struct tcphdr *)skb->data; 2211 iph = ip_hdr(skb); 2212 lookup: 2213 sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo, 2214 skb, __tcp_hdrlen(th), th->source, 2215 th->dest, sdif, &refcounted); 2216 if (!sk) 2217 goto no_tcp_socket; 2218 2219 if (sk->sk_state == TCP_TIME_WAIT) 2220 goto do_time_wait; 2221 2222 if (sk->sk_state == TCP_NEW_SYN_RECV) { 2223 struct request_sock *req = inet_reqsk(sk); 2224 bool req_stolen = false; 2225 struct sock *nsk; 2226 2227 sk = req->rsk_listener; 2228 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) 2229 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2230 else 2231 drop_reason = tcp_inbound_hash(sk, req, skb, 2232 &iph->saddr, &iph->daddr, 2233 AF_INET, dif, sdif); 2234 if (unlikely(drop_reason)) { 2235 sk_drops_add(sk, skb); 2236 reqsk_put(req); 2237 goto discard_it; 2238 } 2239 if (tcp_checksum_complete(skb)) { 2240 reqsk_put(req); 2241 goto csum_error; 2242 } 2243 if (unlikely(sk->sk_state != TCP_LISTEN)) { 2244 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb); 2245 if (!nsk) { 2246 inet_csk_reqsk_queue_drop_and_put(sk, req); 2247 goto lookup; 2248 } 2249 sk = nsk; 2250 /* reuseport_migrate_sock() has already held one sk_refcnt 2251 * before returning. 2252 */ 2253 } else { 2254 /* We own a reference on the listener, increase it again 2255 * as we might lose it too soon. 2256 */ 2257 sock_hold(sk); 2258 } 2259 refcounted = true; 2260 nsk = NULL; 2261 if (!tcp_filter(sk, skb)) { 2262 th = (const struct tcphdr *)skb->data; 2263 iph = ip_hdr(skb); 2264 tcp_v4_fill_cb(skb, iph, th); 2265 nsk = tcp_check_req(sk, skb, req, false, &req_stolen); 2266 } else { 2267 drop_reason = SKB_DROP_REASON_SOCKET_FILTER; 2268 } 2269 if (!nsk) { 2270 reqsk_put(req); 2271 if (req_stolen) { 2272 /* Another cpu got exclusive access to req 2273 * and created a full blown socket. 2274 * Try to feed this packet to this socket 2275 * instead of discarding it. 
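 * (We restore the original IP control block and redo the socket lookup
 * from scratch instead of guessing which socket won the race.)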
2276 */ 2277 tcp_v4_restore_cb(skb); 2278 sock_put(sk); 2279 goto lookup; 2280 } 2281 goto discard_and_relse; 2282 } 2283 nf_reset_ct(skb); 2284 if (nsk == sk) { 2285 reqsk_put(req); 2286 tcp_v4_restore_cb(skb); 2287 } else { 2288 drop_reason = tcp_child_process(sk, nsk, skb); 2289 if (drop_reason) { 2290 enum sk_rst_reason rst_reason; 2291 2292 rst_reason = sk_rst_convert_drop_reason(drop_reason); 2293 tcp_v4_send_reset(nsk, skb, rst_reason); 2294 goto discard_and_relse; 2295 } 2296 sock_put(sk); 2297 return 0; 2298 } 2299 } 2300 2301 process: 2302 if (static_branch_unlikely(&ip4_min_ttl)) { 2303 /* min_ttl can be changed concurrently from do_ip_setsockopt() */ 2304 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) { 2305 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 2306 drop_reason = SKB_DROP_REASON_TCP_MINTTL; 2307 goto discard_and_relse; 2308 } 2309 } 2310 2311 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { 2312 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2313 goto discard_and_relse; 2314 } 2315 2316 drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr, 2317 AF_INET, dif, sdif); 2318 if (drop_reason) 2319 goto discard_and_relse; 2320 2321 nf_reset_ct(skb); 2322 2323 if (tcp_filter(sk, skb)) { 2324 drop_reason = SKB_DROP_REASON_SOCKET_FILTER; 2325 goto discard_and_relse; 2326 } 2327 th = (const struct tcphdr *)skb->data; 2328 iph = ip_hdr(skb); 2329 tcp_v4_fill_cb(skb, iph, th); 2330 2331 skb->dev = NULL; 2332 2333 if (sk->sk_state == TCP_LISTEN) { 2334 ret = tcp_v4_do_rcv(sk, skb); 2335 goto put_and_return; 2336 } 2337 2338 sk_incoming_cpu_update(sk); 2339 2340 bh_lock_sock_nested(sk); 2341 tcp_segs_in(tcp_sk(sk), skb); 2342 ret = 0; 2343 if (!sock_owned_by_user(sk)) { 2344 ret = tcp_v4_do_rcv(sk, skb); 2345 } else { 2346 if (tcp_add_backlog(sk, skb, &drop_reason)) 2347 goto discard_and_relse; 2348 } 2349 bh_unlock_sock(sk); 2350 2351 put_and_return: 2352 if (refcounted) 2353 sock_put(sk); 2354 2355 return ret; 2356 2357 no_tcp_socket: 2358 drop_reason = SKB_DROP_REASON_NO_SOCKET; 2359 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) 2360 goto discard_it; 2361 2362 tcp_v4_fill_cb(skb, iph, th); 2363 2364 if (tcp_checksum_complete(skb)) { 2365 csum_error: 2366 drop_reason = SKB_DROP_REASON_TCP_CSUM; 2367 trace_tcp_bad_csum(skb); 2368 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); 2369 bad_packet: 2370 __TCP_INC_STATS(net, TCP_MIB_INERRS); 2371 } else { 2372 tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason)); 2373 } 2374 2375 discard_it: 2376 SKB_DR_OR(drop_reason, NOT_SPECIFIED); 2377 /* Discard frame. 
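 * kfree_skb_reason() below hands the chosen drop_reason to the skb drop
 * tracepoint, so drop-monitoring tools can see why the frame was freed.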
*/ 2378 kfree_skb_reason(skb, drop_reason); 2379 return 0; 2380 2381 discard_and_relse: 2382 sk_drops_add(sk, skb); 2383 if (refcounted) 2384 sock_put(sk); 2385 goto discard_it; 2386 2387 do_time_wait: 2388 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { 2389 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2390 inet_twsk_put(inet_twsk(sk)); 2391 goto discard_it; 2392 } 2393 2394 tcp_v4_fill_cb(skb, iph, th); 2395 2396 if (tcp_checksum_complete(skb)) { 2397 inet_twsk_put(inet_twsk(sk)); 2398 goto csum_error; 2399 } 2400 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn)) { 2401 case TCP_TW_SYN: { 2402 struct sock *sk2 = inet_lookup_listener(net, 2403 net->ipv4.tcp_death_row.hashinfo, 2404 skb, __tcp_hdrlen(th), 2405 iph->saddr, th->source, 2406 iph->daddr, th->dest, 2407 inet_iif(skb), 2408 sdif); 2409 if (sk2) { 2410 inet_twsk_deschedule_put(inet_twsk(sk)); 2411 sk = sk2; 2412 tcp_v4_restore_cb(skb); 2413 refcounted = false; 2414 __this_cpu_write(tcp_tw_isn, isn); 2415 goto process; 2416 } 2417 } 2418 /* to ACK */ 2419 fallthrough; 2420 case TCP_TW_ACK: 2421 tcp_v4_timewait_ack(sk, skb); 2422 break; 2423 case TCP_TW_RST: 2424 tcp_v4_send_reset(sk, skb, sk_rst_convert_drop_reason(drop_reason)); 2425 inet_twsk_deschedule_put(inet_twsk(sk)); 2426 goto discard_it; 2427 case TCP_TW_SUCCESS:; 2428 } 2429 goto discard_it; 2430 } 2431 2432 static struct timewait_sock_ops tcp_timewait_sock_ops = { 2433 .twsk_obj_size = sizeof(struct tcp_timewait_sock), 2434 .twsk_unique = tcp_twsk_unique, 2435 .twsk_destructor= tcp_twsk_destructor, 2436 }; 2437 2438 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) 2439 { 2440 struct dst_entry *dst = skb_dst(skb); 2441 2442 if (dst && dst_hold_safe(dst)) { 2443 rcu_assign_pointer(sk->sk_rx_dst, dst); 2444 sk->sk_rx_dst_ifindex = skb->skb_iif; 2445 } 2446 } 2447 EXPORT_SYMBOL(inet_sk_rx_dst_set); 2448 2449 const struct inet_connection_sock_af_ops ipv4_specific = { 2450 .queue_xmit = ip_queue_xmit, 2451 .send_check = tcp_v4_send_check, 2452 .rebuild_header = inet_sk_rebuild_header, 2453 .sk_rx_dst_set = inet_sk_rx_dst_set, 2454 .conn_request = tcp_v4_conn_request, 2455 .syn_recv_sock = tcp_v4_syn_recv_sock, 2456 .net_header_len = sizeof(struct iphdr), 2457 .setsockopt = ip_setsockopt, 2458 .getsockopt = ip_getsockopt, 2459 .addr2sockaddr = inet_csk_addr2sockaddr, 2460 .sockaddr_len = sizeof(struct sockaddr_in), 2461 .mtu_reduced = tcp_v4_mtu_reduced, 2462 }; 2463 EXPORT_SYMBOL(ipv4_specific); 2464 2465 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) 2466 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { 2467 #ifdef CONFIG_TCP_MD5SIG 2468 .md5_lookup = tcp_v4_md5_lookup, 2469 .calc_md5_hash = tcp_v4_md5_hash_skb, 2470 .md5_parse = tcp_v4_parse_md5_keys, 2471 #endif 2472 #ifdef CONFIG_TCP_AO 2473 .ao_lookup = tcp_v4_ao_lookup, 2474 .calc_ao_hash = tcp_v4_ao_hash_skb, 2475 .ao_parse = tcp_v4_parse_ao, 2476 .ao_calc_key_sk = tcp_v4_ao_calc_key_sk, 2477 #endif 2478 }; 2479 #endif 2480 2481 /* NOTE: A lot of things set to zero explicitly by call to 2482 * sk_alloc() so need not be done here. 
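 * (tcp_v4_init_sock() below therefore only wires up the IPv4 af_ops,
 * plus the MD5/AO af_specific ops when those are configured, on top of
 * the protocol-independent tcp_init_sock().)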
2483 */ 2484 static int tcp_v4_init_sock(struct sock *sk) 2485 { 2486 struct inet_connection_sock *icsk = inet_csk(sk); 2487 2488 tcp_init_sock(sk); 2489 2490 icsk->icsk_af_ops = &ipv4_specific; 2491 2492 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) 2493 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific; 2494 #endif 2495 2496 return 0; 2497 } 2498 2499 #ifdef CONFIG_TCP_MD5SIG 2500 static void tcp_md5sig_info_free_rcu(struct rcu_head *head) 2501 { 2502 struct tcp_md5sig_info *md5sig; 2503 2504 md5sig = container_of(head, struct tcp_md5sig_info, rcu); 2505 kfree(md5sig); 2506 static_branch_slow_dec_deferred(&tcp_md5_needed); 2507 tcp_md5_release_sigpool(); 2508 } 2509 #endif 2510 2511 void tcp_v4_destroy_sock(struct sock *sk) 2512 { 2513 struct tcp_sock *tp = tcp_sk(sk); 2514 2515 trace_tcp_destroy_sock(sk); 2516 2517 tcp_clear_xmit_timers(sk); 2518 2519 tcp_cleanup_congestion_control(sk); 2520 2521 tcp_cleanup_ulp(sk); 2522 2523 /* Cleanup up the write buffer. */ 2524 tcp_write_queue_purge(sk); 2525 2526 /* Check if we want to disable active TFO */ 2527 tcp_fastopen_active_disable_ofo_check(sk); 2528 2529 /* Cleans up our, hopefully empty, out_of_order_queue. */ 2530 skb_rbtree_purge(&tp->out_of_order_queue); 2531 2532 #ifdef CONFIG_TCP_MD5SIG 2533 /* Clean up the MD5 key list, if any */ 2534 if (tp->md5sig_info) { 2535 struct tcp_md5sig_info *md5sig; 2536 2537 md5sig = rcu_dereference_protected(tp->md5sig_info, 1); 2538 tcp_clear_md5_list(sk); 2539 call_rcu(&md5sig->rcu, tcp_md5sig_info_free_rcu); 2540 rcu_assign_pointer(tp->md5sig_info, NULL); 2541 } 2542 #endif 2543 tcp_ao_destroy_sock(sk, false); 2544 2545 /* Clean up a referenced TCP bind bucket. */ 2546 if (inet_csk(sk)->icsk_bind_hash) 2547 inet_put_port(sk); 2548 2549 BUG_ON(rcu_access_pointer(tp->fastopen_rsk)); 2550 2551 /* If socket is aborted during connect operation */ 2552 tcp_free_fastopen_req(tp); 2553 tcp_fastopen_destroy_cipher(sk); 2554 tcp_saved_syn_free(tp); 2555 2556 sk_sockets_allocated_dec(sk); 2557 } 2558 EXPORT_SYMBOL(tcp_v4_destroy_sock); 2559 2560 #ifdef CONFIG_PROC_FS 2561 /* Proc filesystem TCP sock list dumping. */ 2562 2563 static unsigned short seq_file_family(const struct seq_file *seq); 2564 2565 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk) 2566 { 2567 unsigned short family = seq_file_family(seq); 2568 2569 /* AF_UNSPEC is used as a match all */ 2570 return ((family == AF_UNSPEC || family == sk->sk_family) && 2571 net_eq(sock_net(sk), seq_file_net(seq))); 2572 } 2573 2574 /* Find a non empty bucket (starting from st->bucket) 2575 * and return the first sk from it. 2576 */ 2577 static void *listening_get_first(struct seq_file *seq) 2578 { 2579 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2580 struct tcp_iter_state *st = seq->private; 2581 2582 st->offset = 0; 2583 for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) { 2584 struct inet_listen_hashbucket *ilb2; 2585 struct hlist_nulls_node *node; 2586 struct sock *sk; 2587 2588 ilb2 = &hinfo->lhash2[st->bucket]; 2589 if (hlist_nulls_empty(&ilb2->nulls_head)) 2590 continue; 2591 2592 spin_lock(&ilb2->lock); 2593 sk_nulls_for_each(sk, node, &ilb2->nulls_head) { 2594 if (seq_sk_match(seq, sk)) 2595 return sk; 2596 } 2597 spin_unlock(&ilb2->lock); 2598 } 2599 2600 return NULL; 2601 } 2602 2603 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket). 
2604 * If "cur" is the last one in the st->bucket, 2605 * call listening_get_first() to return the first sk of the next 2606 * non empty bucket. 2607 */ 2608 static void *listening_get_next(struct seq_file *seq, void *cur) 2609 { 2610 struct tcp_iter_state *st = seq->private; 2611 struct inet_listen_hashbucket *ilb2; 2612 struct hlist_nulls_node *node; 2613 struct inet_hashinfo *hinfo; 2614 struct sock *sk = cur; 2615 2616 ++st->num; 2617 ++st->offset; 2618 2619 sk = sk_nulls_next(sk); 2620 sk_nulls_for_each_from(sk, node) { 2621 if (seq_sk_match(seq, sk)) 2622 return sk; 2623 } 2624 2625 hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2626 ilb2 = &hinfo->lhash2[st->bucket]; 2627 spin_unlock(&ilb2->lock); 2628 ++st->bucket; 2629 return listening_get_first(seq); 2630 } 2631 2632 static void *listening_get_idx(struct seq_file *seq, loff_t *pos) 2633 { 2634 struct tcp_iter_state *st = seq->private; 2635 void *rc; 2636 2637 st->bucket = 0; 2638 st->offset = 0; 2639 rc = listening_get_first(seq); 2640 2641 while (rc && *pos) { 2642 rc = listening_get_next(seq, rc); 2643 --*pos; 2644 } 2645 return rc; 2646 } 2647 2648 static inline bool empty_bucket(struct inet_hashinfo *hinfo, 2649 const struct tcp_iter_state *st) 2650 { 2651 return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain); 2652 } 2653 2654 /* 2655 * Get first established socket starting from bucket given in st->bucket. 2656 * If st->bucket is zero, the very first socket in the hash is returned. 2657 */ 2658 static void *established_get_first(struct seq_file *seq) 2659 { 2660 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2661 struct tcp_iter_state *st = seq->private; 2662 2663 st->offset = 0; 2664 for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) { 2665 struct sock *sk; 2666 struct hlist_nulls_node *node; 2667 spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket); 2668 2669 cond_resched(); 2670 2671 /* Lockless fast path for the common case of empty buckets */ 2672 if (empty_bucket(hinfo, st)) 2673 continue; 2674 2675 spin_lock_bh(lock); 2676 sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) { 2677 if (seq_sk_match(seq, sk)) 2678 return sk; 2679 } 2680 spin_unlock_bh(lock); 2681 } 2682 2683 return NULL; 2684 } 2685 2686 static void *established_get_next(struct seq_file *seq, void *cur) 2687 { 2688 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2689 struct tcp_iter_state *st = seq->private; 2690 struct hlist_nulls_node *node; 2691 struct sock *sk = cur; 2692 2693 ++st->num; 2694 ++st->offset; 2695 2696 sk = sk_nulls_next(sk); 2697 2698 sk_nulls_for_each_from(sk, node) { 2699 if (seq_sk_match(seq, sk)) 2700 return sk; 2701 } 2702 2703 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 2704 ++st->bucket; 2705 return established_get_first(seq); 2706 } 2707 2708 static void *established_get_idx(struct seq_file *seq, loff_t pos) 2709 { 2710 struct tcp_iter_state *st = seq->private; 2711 void *rc; 2712 2713 st->bucket = 0; 2714 rc = established_get_first(seq); 2715 2716 while (rc && pos) { 2717 rc = established_get_next(seq, rc); 2718 --pos; 2719 } 2720 return rc; 2721 } 2722 2723 static void *tcp_get_idx(struct seq_file *seq, loff_t pos) 2724 { 2725 void *rc; 2726 struct tcp_iter_state *st = seq->private; 2727 2728 st->state = TCP_SEQ_STATE_LISTENING; 2729 rc = listening_get_idx(seq, &pos); 2730 2731 if (!rc) { 2732 st->state = TCP_SEQ_STATE_ESTABLISHED; 2733 rc = established_get_idx(seq, pos); 2734 } 2735 2736 return rc; 2737 } 2738 2739 static void 
*tcp_seek_last_pos(struct seq_file *seq) 2740 { 2741 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2742 struct tcp_iter_state *st = seq->private; 2743 int bucket = st->bucket; 2744 int offset = st->offset; 2745 int orig_num = st->num; 2746 void *rc = NULL; 2747 2748 switch (st->state) { 2749 case TCP_SEQ_STATE_LISTENING: 2750 if (st->bucket > hinfo->lhash2_mask) 2751 break; 2752 rc = listening_get_first(seq); 2753 while (offset-- && rc && bucket == st->bucket) 2754 rc = listening_get_next(seq, rc); 2755 if (rc) 2756 break; 2757 st->bucket = 0; 2758 st->state = TCP_SEQ_STATE_ESTABLISHED; 2759 fallthrough; 2760 case TCP_SEQ_STATE_ESTABLISHED: 2761 if (st->bucket > hinfo->ehash_mask) 2762 break; 2763 rc = established_get_first(seq); 2764 while (offset-- && rc && bucket == st->bucket) 2765 rc = established_get_next(seq, rc); 2766 } 2767 2768 st->num = orig_num; 2769 2770 return rc; 2771 } 2772 2773 void *tcp_seq_start(struct seq_file *seq, loff_t *pos) 2774 { 2775 struct tcp_iter_state *st = seq->private; 2776 void *rc; 2777 2778 if (*pos && *pos == st->last_pos) { 2779 rc = tcp_seek_last_pos(seq); 2780 if (rc) 2781 goto out; 2782 } 2783 2784 st->state = TCP_SEQ_STATE_LISTENING; 2785 st->num = 0; 2786 st->bucket = 0; 2787 st->offset = 0; 2788 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; 2789 2790 out: 2791 st->last_pos = *pos; 2792 return rc; 2793 } 2794 EXPORT_SYMBOL(tcp_seq_start); 2795 2796 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2797 { 2798 struct tcp_iter_state *st = seq->private; 2799 void *rc = NULL; 2800 2801 if (v == SEQ_START_TOKEN) { 2802 rc = tcp_get_idx(seq, 0); 2803 goto out; 2804 } 2805 2806 switch (st->state) { 2807 case TCP_SEQ_STATE_LISTENING: 2808 rc = listening_get_next(seq, v); 2809 if (!rc) { 2810 st->state = TCP_SEQ_STATE_ESTABLISHED; 2811 st->bucket = 0; 2812 st->offset = 0; 2813 rc = established_get_first(seq); 2814 } 2815 break; 2816 case TCP_SEQ_STATE_ESTABLISHED: 2817 rc = established_get_next(seq, v); 2818 break; 2819 } 2820 out: 2821 ++*pos; 2822 st->last_pos = *pos; 2823 return rc; 2824 } 2825 EXPORT_SYMBOL(tcp_seq_next); 2826 2827 void tcp_seq_stop(struct seq_file *seq, void *v) 2828 { 2829 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2830 struct tcp_iter_state *st = seq->private; 2831 2832 switch (st->state) { 2833 case TCP_SEQ_STATE_LISTENING: 2834 if (v != SEQ_START_TOKEN) 2835 spin_unlock(&hinfo->lhash2[st->bucket].lock); 2836 break; 2837 case TCP_SEQ_STATE_ESTABLISHED: 2838 if (v) 2839 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 2840 break; 2841 } 2842 } 2843 EXPORT_SYMBOL(tcp_seq_stop); 2844 2845 static void get_openreq4(const struct request_sock *req, 2846 struct seq_file *f, int i) 2847 { 2848 const struct inet_request_sock *ireq = inet_rsk(req); 2849 long delta = req->rsk_timer.expires - jiffies; 2850 2851 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2852 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK", 2853 i, 2854 ireq->ir_loc_addr, 2855 ireq->ir_num, 2856 ireq->ir_rmt_addr, 2857 ntohs(ireq->ir_rmt_port), 2858 TCP_SYN_RECV, 2859 0, 0, /* could print option size, but that is af dependent. 
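 * (these two zeroes fill the tx_queue:rx_queue columns of /proc/net/tcp)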
*/ 2860 1, /* timers active (only the expire timer) */ 2861 jiffies_delta_to_clock_t(delta), 2862 req->num_timeout, 2863 from_kuid_munged(seq_user_ns(f), 2864 sock_i_uid(req->rsk_listener)), 2865 0, /* non standard timer */ 2866 0, /* open_requests have no inode */ 2867 0, 2868 req); 2869 } 2870 2871 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) 2872 { 2873 int timer_active; 2874 unsigned long timer_expires; 2875 const struct tcp_sock *tp = tcp_sk(sk); 2876 const struct inet_connection_sock *icsk = inet_csk(sk); 2877 const struct inet_sock *inet = inet_sk(sk); 2878 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; 2879 __be32 dest = inet->inet_daddr; 2880 __be32 src = inet->inet_rcv_saddr; 2881 __u16 destp = ntohs(inet->inet_dport); 2882 __u16 srcp = ntohs(inet->inet_sport); 2883 int rx_queue; 2884 int state; 2885 2886 if (icsk->icsk_pending == ICSK_TIME_RETRANS || 2887 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || 2888 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { 2889 timer_active = 1; 2890 timer_expires = icsk->icsk_timeout; 2891 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { 2892 timer_active = 4; 2893 timer_expires = icsk->icsk_timeout; 2894 } else if (timer_pending(&sk->sk_timer)) { 2895 timer_active = 2; 2896 timer_expires = sk->sk_timer.expires; 2897 } else { 2898 timer_active = 0; 2899 timer_expires = jiffies; 2900 } 2901 2902 state = inet_sk_state_load(sk); 2903 if (state == TCP_LISTEN) 2904 rx_queue = READ_ONCE(sk->sk_ack_backlog); 2905 else 2906 /* Because we don't lock the socket, 2907 * we might find a transient negative value. 2908 */ 2909 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) - 2910 READ_ONCE(tp->copied_seq), 0); 2911 2912 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " 2913 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d", 2914 i, src, srcp, dest, destp, state, 2915 READ_ONCE(tp->write_seq) - tp->snd_una, 2916 rx_queue, 2917 timer_active, 2918 jiffies_delta_to_clock_t(timer_expires - jiffies), 2919 icsk->icsk_retransmits, 2920 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)), 2921 icsk->icsk_probes_out, 2922 sock_i_ino(sk), 2923 refcount_read(&sk->sk_refcnt), sk, 2924 jiffies_to_clock_t(icsk->icsk_rto), 2925 jiffies_to_clock_t(icsk->icsk_ack.ato), 2926 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk), 2927 tcp_snd_cwnd(tp), 2928 state == TCP_LISTEN ? 2929 fastopenq->max_qlen : 2930 (tcp_in_initial_slowstart(tp) ? 
-1 : tp->snd_ssthresh)); 2931 } 2932 2933 static void get_timewait4_sock(const struct inet_timewait_sock *tw, 2934 struct seq_file *f, int i) 2935 { 2936 long delta = tw->tw_timer.expires - jiffies; 2937 __be32 dest, src; 2938 __u16 destp, srcp; 2939 2940 dest = tw->tw_daddr; 2941 src = tw->tw_rcv_saddr; 2942 destp = ntohs(tw->tw_dport); 2943 srcp = ntohs(tw->tw_sport); 2944 2945 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2946 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK", 2947 i, src, srcp, dest, destp, tw->tw_substate, 0, 0, 2948 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, 2949 refcount_read(&tw->tw_refcnt), tw); 2950 } 2951 2952 #define TMPSZ 150 2953 2954 static int tcp4_seq_show(struct seq_file *seq, void *v) 2955 { 2956 struct tcp_iter_state *st; 2957 struct sock *sk = v; 2958 2959 seq_setwidth(seq, TMPSZ - 1); 2960 if (v == SEQ_START_TOKEN) { 2961 seq_puts(seq, " sl local_address rem_address st tx_queue " 2962 "rx_queue tr tm->when retrnsmt uid timeout " 2963 "inode"); 2964 goto out; 2965 } 2966 st = seq->private; 2967 2968 if (sk->sk_state == TCP_TIME_WAIT) 2969 get_timewait4_sock(v, seq, st->num); 2970 else if (sk->sk_state == TCP_NEW_SYN_RECV) 2971 get_openreq4(v, seq, st->num); 2972 else 2973 get_tcp4_sock(v, seq, st->num); 2974 out: 2975 seq_pad(seq, '\n'); 2976 return 0; 2977 } 2978 2979 #ifdef CONFIG_BPF_SYSCALL 2980 struct bpf_tcp_iter_state { 2981 struct tcp_iter_state state; 2982 unsigned int cur_sk; 2983 unsigned int end_sk; 2984 unsigned int max_sk; 2985 struct sock **batch; 2986 bool st_bucket_done; 2987 }; 2988 2989 struct bpf_iter__tcp { 2990 __bpf_md_ptr(struct bpf_iter_meta *, meta); 2991 __bpf_md_ptr(struct sock_common *, sk_common); 2992 uid_t uid __aligned(8); 2993 }; 2994 2995 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, 2996 struct sock_common *sk_common, uid_t uid) 2997 { 2998 struct bpf_iter__tcp ctx; 2999 3000 meta->seq_num--; /* skip SEQ_START_TOKEN */ 3001 ctx.meta = meta; 3002 ctx.sk_common = sk_common; 3003 ctx.uid = uid; 3004 return bpf_iter_run_prog(prog, &ctx); 3005 } 3006 3007 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter) 3008 { 3009 while (iter->cur_sk < iter->end_sk) 3010 sock_gen_put(iter->batch[iter->cur_sk++]); 3011 } 3012 3013 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter, 3014 unsigned int new_batch_sz) 3015 { 3016 struct sock **new_batch; 3017 3018 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, 3019 GFP_USER | __GFP_NOWARN); 3020 if (!new_batch) 3021 return -ENOMEM; 3022 3023 bpf_iter_tcp_put_batch(iter); 3024 kvfree(iter->batch); 3025 iter->batch = new_batch; 3026 iter->max_sk = new_batch_sz; 3027 3028 return 0; 3029 } 3030 3031 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq, 3032 struct sock *start_sk) 3033 { 3034 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 3035 struct bpf_tcp_iter_state *iter = seq->private; 3036 struct tcp_iter_state *st = &iter->state; 3037 struct hlist_nulls_node *node; 3038 unsigned int expected = 1; 3039 struct sock *sk; 3040 3041 sock_hold(start_sk); 3042 iter->batch[iter->end_sk++] = start_sk; 3043 3044 sk = sk_nulls_next(start_sk); 3045 sk_nulls_for_each_from(sk, node) { 3046 if (seq_sk_match(seq, sk)) { 3047 if (iter->end_sk < iter->max_sk) { 3048 sock_hold(sk); 3049 iter->batch[iter->end_sk++] = sk; 3050 } 3051 expected++; 3052 } 3053 } 3054 spin_unlock(&hinfo->lhash2[st->bucket].lock); 3055 3056 return expected; 3057 } 3058 3059 static unsigned int 
bpf_iter_tcp_established_batch(struct seq_file *seq, 3060 struct sock *start_sk) 3061 { 3062 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 3063 struct bpf_tcp_iter_state *iter = seq->private; 3064 struct tcp_iter_state *st = &iter->state; 3065 struct hlist_nulls_node *node; 3066 unsigned int expected = 1; 3067 struct sock *sk; 3068 3069 sock_hold(start_sk); 3070 iter->batch[iter->end_sk++] = start_sk; 3071 3072 sk = sk_nulls_next(start_sk); 3073 sk_nulls_for_each_from(sk, node) { 3074 if (seq_sk_match(seq, sk)) { 3075 if (iter->end_sk < iter->max_sk) { 3076 sock_hold(sk); 3077 iter->batch[iter->end_sk++] = sk; 3078 } 3079 expected++; 3080 } 3081 } 3082 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 3083 3084 return expected; 3085 } 3086 3087 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq) 3088 { 3089 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 3090 struct bpf_tcp_iter_state *iter = seq->private; 3091 struct tcp_iter_state *st = &iter->state; 3092 unsigned int expected; 3093 bool resized = false; 3094 struct sock *sk; 3095 3096 /* The st->bucket is done. Directly advance to the next 3097 * bucket instead of having the tcp_seek_last_pos() to skip 3098 * one by one in the current bucket and eventually find out 3099 * it has to advance to the next bucket. 3100 */ 3101 if (iter->st_bucket_done) { 3102 st->offset = 0; 3103 st->bucket++; 3104 if (st->state == TCP_SEQ_STATE_LISTENING && 3105 st->bucket > hinfo->lhash2_mask) { 3106 st->state = TCP_SEQ_STATE_ESTABLISHED; 3107 st->bucket = 0; 3108 } 3109 } 3110 3111 again: 3112 /* Get a new batch */ 3113 iter->cur_sk = 0; 3114 iter->end_sk = 0; 3115 iter->st_bucket_done = false; 3116 3117 sk = tcp_seek_last_pos(seq); 3118 if (!sk) 3119 return NULL; /* Done */ 3120 3121 if (st->state == TCP_SEQ_STATE_LISTENING) 3122 expected = bpf_iter_tcp_listening_batch(seq, sk); 3123 else 3124 expected = bpf_iter_tcp_established_batch(seq, sk); 3125 3126 if (iter->end_sk == expected) { 3127 iter->st_bucket_done = true; 3128 return sk; 3129 } 3130 3131 if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) { 3132 resized = true; 3133 goto again; 3134 } 3135 3136 return sk; 3137 } 3138 3139 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos) 3140 { 3141 /* bpf iter does not support lseek, so it always 3142 * continue from where it was stop()-ped. 3143 */ 3144 if (*pos) 3145 return bpf_iter_tcp_batch(seq); 3146 3147 return SEQ_START_TOKEN; 3148 } 3149 3150 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3151 { 3152 struct bpf_tcp_iter_state *iter = seq->private; 3153 struct tcp_iter_state *st = &iter->state; 3154 struct sock *sk; 3155 3156 /* Whenever seq_next() is called, the iter->cur_sk is 3157 * done with seq_show(), so advance to the next sk in 3158 * the batch. 3159 */ 3160 if (iter->cur_sk < iter->end_sk) { 3161 /* Keeping st->num consistent in tcp_iter_state. 3162 * bpf_iter_tcp does not use st->num. 3163 * meta.seq_num is used instead. 3164 */ 3165 st->num++; 3166 /* Move st->offset to the next sk in the bucket such that 3167 * the future start() will resume at st->offset in 3168 * st->bucket. See tcp_seek_last_pos(). 3169 */ 3170 st->offset++; 3171 sock_gen_put(iter->batch[iter->cur_sk++]); 3172 } 3173 3174 if (iter->cur_sk < iter->end_sk) 3175 sk = iter->batch[iter->cur_sk]; 3176 else 3177 sk = bpf_iter_tcp_batch(seq); 3178 3179 ++*pos; 3180 /* Keeping st->last_pos consistent in tcp_iter_state. 
3181 * bpf iter does not do lseek, so st->last_pos always equals to *pos. 3182 */ 3183 st->last_pos = *pos; 3184 return sk; 3185 } 3186 3187 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v) 3188 { 3189 struct bpf_iter_meta meta; 3190 struct bpf_prog *prog; 3191 struct sock *sk = v; 3192 uid_t uid; 3193 int ret; 3194 3195 if (v == SEQ_START_TOKEN) 3196 return 0; 3197 3198 if (sk_fullsock(sk)) 3199 lock_sock(sk); 3200 3201 if (unlikely(sk_unhashed(sk))) { 3202 ret = SEQ_SKIP; 3203 goto unlock; 3204 } 3205 3206 if (sk->sk_state == TCP_TIME_WAIT) { 3207 uid = 0; 3208 } else if (sk->sk_state == TCP_NEW_SYN_RECV) { 3209 const struct request_sock *req = v; 3210 3211 uid = from_kuid_munged(seq_user_ns(seq), 3212 sock_i_uid(req->rsk_listener)); 3213 } else { 3214 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); 3215 } 3216 3217 meta.seq = seq; 3218 prog = bpf_iter_get_info(&meta, false); 3219 ret = tcp_prog_seq_show(prog, &meta, v, uid); 3220 3221 unlock: 3222 if (sk_fullsock(sk)) 3223 release_sock(sk); 3224 return ret; 3225 3226 } 3227 3228 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v) 3229 { 3230 struct bpf_tcp_iter_state *iter = seq->private; 3231 struct bpf_iter_meta meta; 3232 struct bpf_prog *prog; 3233 3234 if (!v) { 3235 meta.seq = seq; 3236 prog = bpf_iter_get_info(&meta, true); 3237 if (prog) 3238 (void)tcp_prog_seq_show(prog, &meta, v, 0); 3239 } 3240 3241 if (iter->cur_sk < iter->end_sk) { 3242 bpf_iter_tcp_put_batch(iter); 3243 iter->st_bucket_done = false; 3244 } 3245 } 3246 3247 static const struct seq_operations bpf_iter_tcp_seq_ops = { 3248 .show = bpf_iter_tcp_seq_show, 3249 .start = bpf_iter_tcp_seq_start, 3250 .next = bpf_iter_tcp_seq_next, 3251 .stop = bpf_iter_tcp_seq_stop, 3252 }; 3253 #endif 3254 static unsigned short seq_file_family(const struct seq_file *seq) 3255 { 3256 const struct tcp_seq_afinfo *afinfo; 3257 3258 #ifdef CONFIG_BPF_SYSCALL 3259 /* Iterated from bpf_iter. Let the bpf prog to filter instead. */ 3260 if (seq->op == &bpf_iter_tcp_seq_ops) 3261 return AF_UNSPEC; 3262 #endif 3263 3264 /* Iterated from proc fs */ 3265 afinfo = pde_data(file_inode(seq->file)); 3266 return afinfo->family; 3267 } 3268 3269 static const struct seq_operations tcp4_seq_ops = { 3270 .show = tcp4_seq_show, 3271 .start = tcp_seq_start, 3272 .next = tcp_seq_next, 3273 .stop = tcp_seq_stop, 3274 }; 3275 3276 static struct tcp_seq_afinfo tcp4_seq_afinfo = { 3277 .family = AF_INET, 3278 }; 3279 3280 static int __net_init tcp4_proc_init_net(struct net *net) 3281 { 3282 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops, 3283 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo)) 3284 return -ENOMEM; 3285 return 0; 3286 } 3287 3288 static void __net_exit tcp4_proc_exit_net(struct net *net) 3289 { 3290 remove_proc_entry("tcp", net->proc_net); 3291 } 3292 3293 static struct pernet_operations tcp4_net_ops = { 3294 .init = tcp4_proc_init_net, 3295 .exit = tcp4_proc_exit_net, 3296 }; 3297 3298 int __init tcp4_proc_init(void) 3299 { 3300 return register_pernet_subsys(&tcp4_net_ops); 3301 } 3302 3303 void tcp4_proc_exit(void) 3304 { 3305 unregister_pernet_subsys(&tcp4_net_ops); 3306 } 3307 #endif /* CONFIG_PROC_FS */ 3308 3309 /* @wake is one when sk_stream_write_space() calls us. 3310 * This sends EPOLLOUT only if notsent_bytes is half the limit. 3311 * This mimics the strategy used in sock_def_write_space(). 
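 * (Worked example, assuming tcp_notsent_lowat(tp) is 128 KB: with
 * wake == 1 the test becomes (notsent_bytes << 1) < 128 KB, so EPOLLOUT
 * is only signalled once fewer than 64 KB remain unsent.)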
3312 */ 3313 bool tcp_stream_memory_free(const struct sock *sk, int wake) 3314 { 3315 const struct tcp_sock *tp = tcp_sk(sk); 3316 u32 notsent_bytes = READ_ONCE(tp->write_seq) - 3317 READ_ONCE(tp->snd_nxt); 3318 3319 return (notsent_bytes << wake) < tcp_notsent_lowat(tp); 3320 } 3321 EXPORT_SYMBOL(tcp_stream_memory_free); 3322 3323 struct proto tcp_prot = { 3324 .name = "TCP", 3325 .owner = THIS_MODULE, 3326 .close = tcp_close, 3327 .pre_connect = tcp_v4_pre_connect, 3328 .connect = tcp_v4_connect, 3329 .disconnect = tcp_disconnect, 3330 .accept = inet_csk_accept, 3331 .ioctl = tcp_ioctl, 3332 .init = tcp_v4_init_sock, 3333 .destroy = tcp_v4_destroy_sock, 3334 .shutdown = tcp_shutdown, 3335 .setsockopt = tcp_setsockopt, 3336 .getsockopt = tcp_getsockopt, 3337 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt, 3338 .keepalive = tcp_set_keepalive, 3339 .recvmsg = tcp_recvmsg, 3340 .sendmsg = tcp_sendmsg, 3341 .splice_eof = tcp_splice_eof, 3342 .backlog_rcv = tcp_v4_do_rcv, 3343 .release_cb = tcp_release_cb, 3344 .hash = inet_hash, 3345 .unhash = inet_unhash, 3346 .get_port = inet_csk_get_port, 3347 .put_port = inet_put_port, 3348 #ifdef CONFIG_BPF_SYSCALL 3349 .psock_update_sk_prot = tcp_bpf_update_proto, 3350 #endif 3351 .enter_memory_pressure = tcp_enter_memory_pressure, 3352 .leave_memory_pressure = tcp_leave_memory_pressure, 3353 .stream_memory_free = tcp_stream_memory_free, 3354 .sockets_allocated = &tcp_sockets_allocated, 3355 .orphan_count = &tcp_orphan_count, 3356 3357 .memory_allocated = &tcp_memory_allocated, 3358 .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, 3359 3360 .memory_pressure = &tcp_memory_pressure, 3361 .sysctl_mem = sysctl_tcp_mem, 3362 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), 3363 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), 3364 .max_header = MAX_TCP_HEADER, 3365 .obj_size = sizeof(struct tcp_sock), 3366 .slab_flags = SLAB_TYPESAFE_BY_RCU, 3367 .twsk_prot = &tcp_timewait_sock_ops, 3368 .rsk_prot = &tcp_request_sock_ops, 3369 .h.hashinfo = NULL, 3370 .no_autobind = true, 3371 .diag_destroy = tcp_abort, 3372 }; 3373 EXPORT_SYMBOL(tcp_prot); 3374 3375 static void __net_exit tcp_sk_exit(struct net *net) 3376 { 3377 if (net->ipv4.tcp_congestion_control) 3378 bpf_module_put(net->ipv4.tcp_congestion_control, 3379 net->ipv4.tcp_congestion_control->owner); 3380 } 3381 3382 static void __net_init tcp_set_hashinfo(struct net *net) 3383 { 3384 struct inet_hashinfo *hinfo; 3385 unsigned int ehash_entries; 3386 struct net *old_net; 3387 3388 if (net_eq(net, &init_net)) 3389 goto fallback; 3390 3391 old_net = current->nsproxy->net_ns; 3392 ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries); 3393 if (!ehash_entries) 3394 goto fallback; 3395 3396 ehash_entries = roundup_pow_of_two(ehash_entries); 3397 hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries); 3398 if (!hinfo) { 3399 pr_warn("Failed to allocate TCP ehash (entries: %u) " 3400 "for a netns, fallback to the global one\n", 3401 ehash_entries); 3402 fallback: 3403 hinfo = &tcp_hashinfo; 3404 ehash_entries = tcp_hashinfo.ehash_mask + 1; 3405 } 3406 3407 net->ipv4.tcp_death_row.hashinfo = hinfo; 3408 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2; 3409 net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128); 3410 } 3411 3412 static int __net_init tcp_sk_init(struct net *net) 3413 { 3414 net->ipv4.sysctl_tcp_ecn = 2; 3415 net->ipv4.sysctl_tcp_ecn_fallback = 1; 3416 3417 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; 3418 
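/* Minimum sender MSS and path MTU probing parameters. */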
net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS; 3419 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; 3420 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL; 3421 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS; 3422 3423 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME; 3424 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES; 3425 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL; 3426 3427 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES; 3428 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES; 3429 net->ipv4.sysctl_tcp_syncookies = 1; 3430 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH; 3431 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1; 3432 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2; 3433 net->ipv4.sysctl_tcp_orphan_retries = 0; 3434 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; 3435 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; 3436 net->ipv4.sysctl_tcp_tw_reuse = 2; 3437 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1; 3438 3439 refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1); 3440 tcp_set_hashinfo(net); 3441 3442 net->ipv4.sysctl_tcp_sack = 1; 3443 net->ipv4.sysctl_tcp_window_scaling = 1; 3444 net->ipv4.sysctl_tcp_timestamps = 1; 3445 net->ipv4.sysctl_tcp_early_retrans = 3; 3446 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION; 3447 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */ 3448 net->ipv4.sysctl_tcp_retrans_collapse = 1; 3449 net->ipv4.sysctl_tcp_max_reordering = 300; 3450 net->ipv4.sysctl_tcp_dsack = 1; 3451 net->ipv4.sysctl_tcp_app_win = 31; 3452 net->ipv4.sysctl_tcp_adv_win_scale = 1; 3453 net->ipv4.sysctl_tcp_frto = 2; 3454 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1; 3455 /* This limits the percentage of the congestion window which we 3456 * will allow a single TSO frame to consume. Building TSO frames 3457 * which are too large can cause TCP streams to be bursty. 3458 */ 3459 net->ipv4.sysctl_tcp_tso_win_divisor = 3; 3460 /* Default TSQ limit of 16 TSO segments */ 3461 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536; 3462 3463 /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. 
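 * (INT_MAX below effectively removes the per-netns cap; out-of-window
 * packets are still rate limited per socket via tcp_invalid_ratelimit.)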
*/ 3464 net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX; 3465 3466 net->ipv4.sysctl_tcp_min_tso_segs = 2; 3467 net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */ 3468 net->ipv4.sysctl_tcp_min_rtt_wlen = 300; 3469 net->ipv4.sysctl_tcp_autocorking = 1; 3470 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; 3471 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; 3472 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120; 3473 if (net != &init_net) { 3474 memcpy(net->ipv4.sysctl_tcp_rmem, 3475 init_net.ipv4.sysctl_tcp_rmem, 3476 sizeof(init_net.ipv4.sysctl_tcp_rmem)); 3477 memcpy(net->ipv4.sysctl_tcp_wmem, 3478 init_net.ipv4.sysctl_tcp_wmem, 3479 sizeof(init_net.ipv4.sysctl_tcp_wmem)); 3480 } 3481 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC; 3482 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC; 3483 net->ipv4.sysctl_tcp_comp_sack_nr = 44; 3484 net->ipv4.sysctl_tcp_backlog_ack_defer = 1; 3485 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; 3486 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0; 3487 atomic_set(&net->ipv4.tfo_active_disable_times, 0); 3488 3489 /* Set default values for PLB */ 3490 net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */ 3491 net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3; 3492 net->ipv4.sysctl_tcp_plb_rehash_rounds = 12; 3493 net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60; 3494 /* Default congestion threshold for PLB to mark a round is 50% */ 3495 net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2; 3496 3497 /* Reno is always built in */ 3498 if (!net_eq(net, &init_net) && 3499 bpf_try_module_get(init_net.ipv4.tcp_congestion_control, 3500 init_net.ipv4.tcp_congestion_control->owner)) 3501 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control; 3502 else 3503 net->ipv4.tcp_congestion_control = &tcp_reno; 3504 3505 net->ipv4.sysctl_tcp_syn_linear_timeouts = 4; 3506 net->ipv4.sysctl_tcp_shrink_window = 0; 3507 3508 net->ipv4.sysctl_tcp_pingpong_thresh = 1; 3509 3510 return 0; 3511 } 3512 3513 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) 3514 { 3515 struct net *net; 3516 3517 tcp_twsk_purge(net_exit_list); 3518 3519 list_for_each_entry(net, net_exit_list, exit_list) { 3520 inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo); 3521 WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount)); 3522 tcp_fastopen_ctx_destroy(net); 3523 } 3524 } 3525 3526 static struct pernet_operations __net_initdata tcp_sk_ops = { 3527 .init = tcp_sk_init, 3528 .exit = tcp_sk_exit, 3529 .exit_batch = tcp_sk_exit_batch, 3530 }; 3531 3532 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3533 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta, 3534 struct sock_common *sk_common, uid_t uid) 3535 3536 #define INIT_BATCH_SZ 16 3537 3538 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux) 3539 { 3540 struct bpf_tcp_iter_state *iter = priv_data; 3541 int err; 3542 3543 err = bpf_iter_init_seq_net(priv_data, aux); 3544 if (err) 3545 return err; 3546 3547 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ); 3548 if (err) { 3549 bpf_iter_fini_seq_net(priv_data); 3550 return err; 3551 } 3552 3553 return 0; 3554 } 3555 3556 static void bpf_iter_fini_tcp(void *priv_data) 3557 { 3558 struct bpf_tcp_iter_state *iter = priv_data; 3559 3560 bpf_iter_fini_seq_net(priv_data); 3561 kvfree(iter->batch); 3562 } 3563 3564 static const struct bpf_iter_seq_info tcp_seq_info = { 3565 .seq_ops = &bpf_iter_tcp_seq_ops, 3566 .init_seq_private = bpf_iter_init_tcp, 3567 
.fini_seq_private = bpf_iter_fini_tcp, 3568 .seq_priv_size = sizeof(struct bpf_tcp_iter_state), 3569 }; 3570 3571 static const struct bpf_func_proto * 3572 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id, 3573 const struct bpf_prog *prog) 3574 { 3575 switch (func_id) { 3576 case BPF_FUNC_setsockopt: 3577 return &bpf_sk_setsockopt_proto; 3578 case BPF_FUNC_getsockopt: 3579 return &bpf_sk_getsockopt_proto; 3580 default: 3581 return NULL; 3582 } 3583 } 3584 3585 static struct bpf_iter_reg tcp_reg_info = { 3586 .target = "tcp", 3587 .ctx_arg_info_size = 1, 3588 .ctx_arg_info = { 3589 { offsetof(struct bpf_iter__tcp, sk_common), 3590 PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED }, 3591 }, 3592 .get_func_proto = bpf_iter_tcp_get_func_proto, 3593 .seq_info = &tcp_seq_info, 3594 }; 3595 3596 static void __init bpf_iter_register(void) 3597 { 3598 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON]; 3599 if (bpf_iter_reg_target(&tcp_reg_info)) 3600 pr_warn("Warning: could not register bpf iterator tcp\n"); 3601 } 3602 3603 #endif 3604 3605 void __init tcp_v4_init(void) 3606 { 3607 int cpu, res; 3608 3609 for_each_possible_cpu(cpu) { 3610 struct sock *sk; 3611 3612 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW, 3613 IPPROTO_TCP, &init_net); 3614 if (res) 3615 panic("Failed to create the TCP control socket.\n"); 3616 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); 3617 3618 /* Please enforce IP_DF and IPID==0 for RST and 3619 * ACK sent in SYN-RECV and TIME-WAIT state. 3620 */ 3621 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO; 3622 3623 per_cpu(ipv4_tcp_sk, cpu) = sk; 3624 } 3625 if (register_pernet_subsys(&tcp_sk_ops)) 3626 panic("Failed to create the TCP control socket.\n"); 3627 3628 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3629 bpf_iter_register(); 3630 #endif 3631 } 3632