// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/sched.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>
#include <net/rstreason.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/inetdevice.h>
#include <linux/btf_ids.h>

#include <crypto/hash.h>
#include <linux/scatterlist.h>

#include <trace/events/tcp.h>

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);

static u32 tcp_v4_init_seq(const struct sk_buff *skb)
{
	return secure_tcp_seq(ip_hdr(skb)->daddr,
			      ip_hdr(skb)->saddr,
			      tcp_hdr(skb)->dest,
			      tcp_hdr(skb)->source);
}

static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
{
	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
}

int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
	const struct inet_timewait_sock *tw = inet_twsk(sktw);
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);
	int ts_recent_stamp;

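	/* Note (from the code below): sysctl_tcp_tw_reuse of 0 disables
	 * reuse of TIME-WAIT sockets for new outgoing connections, 1 allows
	 * it, and 2 allows it only when one end is a loopback address or
	 * the socket is bound to the loopback interface.
	 */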
	if (reuse == 2) {
		/* Still does not detect *everything* that goes through
		 * lo, since we require a loopback src or dst address
		 * or direct binding to 'lo' interface.
		 */
		bool loopback = false;
		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
			loopback = true;
#if IS_ENABLED(CONFIG_IPV6)
		if (tw->tw_family == AF_INET6) {
			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
				loopback = true;
		} else
#endif
		{
			if (ipv4_is_loopback(tw->tw_daddr) ||
			    ipv4_is_loopback(tw->tw_rcv_saddr))
				loopback = true;
		}
		if (!loopback)
			reuse = 0;
	}

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp);
	if (ts_recent_stamp &&
	    (!twp || (reuse && time_after32(ktime_get_seconds(),
					    ts_recent_stamp)))) {
		/* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk
		 * and releasing the bucket lock.
		 */
		if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
			return 0;

		/* In case of repair and re-using TIME-WAIT sockets we still
		 * want to be sure that it is safe as above but honor the
		 * sequence numbers and time stamps set as part of the repair
		 * process.
		 *
		 * Without this check re-using a TIME-WAIT socket with TCP
		 * repair would accumulate a -1 on the repair assigned
		 * sequence number. The first time it is reused the sequence
		 * is -1, the second time -2, etc. This fixes that issue
		 * without appearing to create any others.
		 */
		if (likely(!tp->repair)) {
			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;

			if (!seq)
				seq = 1;
			WRITE_ONCE(tp->write_seq, seq);
			tp->rx_opt.ts_recent	   = READ_ONCE(tcptw->tw_ts_recent);
			tp->rx_opt.ts_recent_stamp = ts_recent_stamp;
		}

		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);

static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
			      int addr_len)
{
	/* This check is replicated from tcp_v4_connect() and intended to
	 * prevent BPF program called below from accessing bytes that are out
	 * of the bound specified by user in addr_len.
	 */
	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	sock_owned_by_me(sk);

	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
}

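/* Overview of the function below: tcp_v4_connect() resolves the route,
 * binds a source address/port if needed, moves the socket to SYN-SENT and
 * inserts it into the hash tables, and only then picks the initial sequence
 * number and timestamp offset before tcp_connect() sends the SYN.
 */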
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_timewait_death_row *tcp_death_row;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct ip_options_rcu *inet_opt;
	struct net *net = sock_net(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
			      orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

	if (!inet->inet_saddr) {
		err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET);
		if (err) {
			ip_rt_put(rt);
			return err;
		}
	} else {
		sk_rcv_saddr_set(sk, inet->inet_saddr);
	}

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			WRITE_ONCE(tp->write_seq, 0);
	}

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the socket
	 * lock, select a source port, enter ourselves into the hash tables
	 * and complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst);
	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	rt = NULL;

	if (likely(!tp->repair)) {
		if (!tp->write_seq)
			WRITE_ONCE(tp->write_seq,
				   secure_tcp_seq(inet->inet_saddr,
						  inet->inet_daddr,
						  inet->inet_sport,
						  usin->sin_port));
		WRITE_ONCE(tp->tsoffset,
			   secure_tcp_ts_off(net, inet->inet_saddr,
					     inet->inet_daddr));
	}

	atomic_set(&inet->inet_id, get_random_u16());

	if (tcp_fastopen_defer_connect(sk, &err))
		return err;
	if (err)
		goto failure;

	err = tcp_connect(sk);

	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	inet_bhash2_reset_saddr(sk);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);

/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct dst_entry *dst;
	u32 mtu;

	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
		return;
	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to be wrong... Remember soft error
	 * for the case this connection will not be able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);

static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}


/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);

	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	 */
	if (seq != tcp_rsk(req)->snt_isn) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
	} else if (abort) {
		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		tcp_listendrop(req->rsk_listener);
	}
	reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);

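/* RFC 6069 ("TCP-LD"): when an ICMP unreachable arrives for the segment at
 * snd_una while the retransmission timer is backing off, the loss was likely
 * caused by a routing transient rather than congestion, so one level of RTO
 * backoff is undone below and the timer re-armed (or fired immediately).
 */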
/* TCP-LD (RFC 6069) logic */
void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	s32 remaining;
	u32 delta_us;

	if (sock_owned_by_user(sk))
		return;

	if (seq != tp->snd_una || !icsk->icsk_retransmits ||
	    !icsk->icsk_backoff)
		return;

	skb = tcp_rtx_queue_head(sk);
	if (WARN_ON_ONCE(!skb))
		return;

	icsk->icsk_backoff--;
	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

	tcp_mstamp_refresh(tp);
	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);

	if (remaining > 0) {
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
					  remaining, TCP_RTO_MAX);
	} else {
		/* RTO revert clocked out retransmission.
		 * Will retransmit now.
		 */
		tcp_retransmit_timer(sk);
	}
}
EXPORT_SYMBOL(tcp_ld_RTO_revert);

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

int tcp_v4_err(struct sk_buff *skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct tcp_sock *tp;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct sock *sk;
	struct request_sock *fastopen;
	u32 seq, snd_una;
	int err;
	struct net *net = dev_net(skb->dev);

	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
				       iph->daddr, th->dest, iph->saddr,
				       ntohs(th->source), inet_iif(skb), 0);
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return -ENOENT;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		/* To increase the counter of ignored icmps for TCP-AO */
		tcp_ao_ignore_icmp(sk, AF_INET, type, code);
		inet_twsk_put(inet_twsk(sk));
		return 0;
	}
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
				     type == ICMP_TIME_EXCEEDED ||
				     (type == ICMP_DEST_UNREACH &&
				      (code == ICMP_NET_UNREACH ||
				       code == ICMP_HOST_UNREACH)));
		return 0;
	}

	if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) {
		sock_put(sk);
		return 0;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (static_branch_unlikely(&ip4_min_ttl)) {
		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
			goto out;
		}
	}

	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = rcu_dereference(tp->fastopen_rsk);
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		if (!sock_owned_by_user(sk))
			do_redirect(skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs sent out by Linux are always <576bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			WRITE_ONCE(tp->mtu_info, info);
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if this ICMP message allows revert of backoff.
		 * (see RFC 6069)
		 */
		if (!fastopen &&
		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
			tcp_ld_RTO_revert(sk, seq);
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket is
		 * already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);

		if (!sock_owned_by_user(sk))
			tcp_done_with_error(sk, err);
		else
			WRITE_ONCE(sk->sk_err_soft, err);
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows us to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even these two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 *							--ANK (980905)
	 */

	if (!sock_owned_by_user(sk) &&
	    inet_test_bit(RECVERR, sk)) {
		WRITE_ONCE(sk->sk_err, err);
		sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		WRITE_ONCE(sk->sk_err_soft, err);
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
	return 0;
}

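/* The helper below prepares the skb for CHECKSUM_PARTIAL offload: only the
 * pseudo-header sum is stored in th->check here, while csum_start and
 * csum_offset tell the device (or the software fallback) where to fold in
 * the one's complement sum of the TCP header and payload.
 */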
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
	skb->csum_start = skb_transport_header(skb) - skb->head;
	skb->csum_offset = offsetof(struct tcphdr, check);
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);

#define REPLY_OPTIONS_LEN	(MAX_TCP_OPTION_SPACE / sizeof(__be32))

static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb,
				 const struct tcp_ao_hdr *aoh,
				 struct ip_reply_arg *arg, struct tcphdr *reply,
				 __be32 reply_options[REPLY_OPTIONS_LEN])
{
#ifdef CONFIG_TCP_AO
	int sdif = tcp_v4_sdif(skb);
	int dif = inet_iif(skb);
	int l3index = sdif ? dif : 0;
	bool allocated_traffic_key;
	struct tcp_ao_key *key;
	char *traffic_key;
	bool drop = true;
	u32 ao_sne = 0;
	u8 keyid;

	rcu_read_lock();
	if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq),
				 &key, &traffic_key, &allocated_traffic_key,
				 &keyid, &ao_sne))
		goto out;

	reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) |
				 (aoh->rnext_keyid << 8) | keyid);
	arg->iov[0].iov_len += tcp_ao_len_aligned(key);
	reply->doff = arg->iov[0].iov_len / 4;

	if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1],
			    key, traffic_key,
			    (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
			    (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
			    reply, ao_sne))
		goto out;
	drop = false;
out:
	rcu_read_unlock();
	if (allocated_traffic_key)
		kfree(traffic_key);
	return drop;
#else
	return true;
#endif
}

/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *	So we build the reply based only on parameters
 *	that arrived with the segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
			      enum sk_rst_reason reason)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[REPLY_OPTIONS_LEN];
	} rep;
	const __u8 *md5_hash_location = NULL;
	const struct tcp_ao_hdr *aoh;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	unsigned char newhash[16];
	struct sock *sk1 = NULL;
	int genhash;
#endif
	u64 transmit_time = 0;
	struct sock *ctl_sk;
	struct net *net;
	u32 txhash = 0;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

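	/* Per RFC 793: if the incoming segment carried an ACK, the RST takes
	 * its sequence number from that ACK and needs no ACK of its own;
	 * otherwise the RST keeps seq 0 (rep is zeroed below) and ACKs
	 * everything the offending segment occupied in sequence space.
	 */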
	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);

	/* Invalid TCP option size or twice included auth */
	if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh))
		return;

	if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt))
		return;

#ifdef CONFIG_TCP_MD5SIG
	rcu_read_lock();
	if (sk && sk_fullsock(sk)) {
		const union tcp_md5_addr *addr;
		int l3index;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and inet_iif is set to it.
		 */
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	} else if (md5_hash_location) {
		const union tcp_md5_addr *addr;
		int sdif = tcp_v4_sdif(skb);
		int dif = inet_iif(skb);
		int l3index;

		/*
		 * active side is lost. Try to find listening socket through
		 * source port, and then find md5 key through listening socket.
		 * We do not loosen security here:
		 * Incoming packet is checked with md5 hash with finding key,
		 * no RST generated if md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
					     NULL, 0, ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), dif, sdif);
		/* don't send rst if it can't find key */
		if (!sk1)
			goto out;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and dif is set to it.
		 */
		l3index = sdif ? dif : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
		if (!key)
			goto out;


		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (genhash || memcmp(md5_hash_location, newhash, 16) != 0)
			goto out;

	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
	if (rep.opt[0] == 0) {
		__be32 mrst = mptcp_reset_option(skb);

		if (mrst) {
			rep.opt[0] = mrst;
			arg.iov[0].iov_len += sizeof(mrst);
			rep.th.doff = arg.iov[0].iov_len / 4;
		}
	}

	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
	if (sk)
		arg.bound_dev_if = sk->sk_bound_dev_if;

	trace_tcp_send_reset(sk, skb, reason);

	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

	arg.tos = ip_hdr(skb)->tos;
	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ctl_sk = this_cpu_read(ipv4_tcp_sk);
	sock_net_set(ctl_sk, net);
	if (sk) {
		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_mark : sk->sk_mark;
		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
		transmit_time = tcp_transmit_time(sk);
		xfrm_sk_clone_policy(ctl_sk, sk);
		txhash = (sk->sk_state == TCP_TIME_WAIT) ?
			 inet_twsk(sk)->tw_txhash : sk->sk_txhash;
	} else {
		ctl_sk->sk_mark = 0;
		ctl_sk->sk_priority = 0;
	}
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time, txhash);

	xfrm_sk_free_policy(ctl_sk);
	sock_net_set(ctl_sk, &init_net);
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
	local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
	rcu_read_unlock();
#endif
}

/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
 */

static void tcp_v4_send_ack(const struct sock *sk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_key *key,
			    int reply_flags, u8 tos, u32 txhash)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(MAX_TCP_OPTION_SPACE >> 2)];
	} rep;
	struct net *net = sock_net(sk);
	struct ip_reply_arg arg;
	struct sock *ctl_sk;
	u64 transmit_time;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (tcp_key_is_md5(key)) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key->md5_key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
#ifdef CONFIG_TCP_AO
	if (tcp_key_is_ao(key)) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_AO << 24) |
					  (tcp_ao_len(key->ao_key) << 16) |
					  (key->ao_key->sndid << 8) |
					  key->rcv_next);
		arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key);
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset],
				key->ao_key, key->traffic_key,
				(union tcp_ao_addr *)&ip_hdr(skb)->saddr,
				(union tcp_ao_addr *)&ip_hdr(skb)->daddr,
				&rep.th, key->sne);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ctl_sk = this_cpu_read(ipv4_tcp_sk);
	sock_net_set(ctl_sk, net);
	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
	transmit_time = tcp_transmit_time(sk);
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time, txhash);

	sock_net_set(ctl_sk, &init_net);
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	local_bh_enable();
}

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
	struct tcp_key key = {};
#ifdef CONFIG_TCP_AO
	struct tcp_ao_info *ao_info;

	if (static_branch_unlikely(&tcp_ao_needed.key)) {
		/* FIXME: the segment to-be-acked is not verified yet */
		ao_info = rcu_dereference(tcptw->ao_info);
		if (ao_info) {
			const struct tcp_ao_hdr *aoh;

			if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) {
				inet_twsk_put(tw);
				return;
			}

			if (aoh)
				key.ao_key = tcp_ao_established_key(ao_info, aoh->rnext_keyid, -1);
		}
	}
	if (key.ao_key) {
		struct tcp_ao_key *rnext_key;

		key.traffic_key = snd_other_key(key.ao_key);
		key.sne = READ_ONCE(ao_info->snd_sne);
		rnext_key = READ_ONCE(ao_info->rnext_key);
		key.rcv_next = rnext_key->rcvid;
		key.type = TCP_KEY_AO;
#else
	if (0) {
#endif
	} else if (static_branch_tcp_md5()) {
		key.md5_key = tcp_twsk_md5_key(tcptw);
		if (key.md5_key)
			key.type = TCP_KEY_MD5;
	}

	tcp_v4_send_ack(sk, skb,
			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_tw_tsval(tcptw),
			READ_ONCE(tcptw->tw_ts_recent),
			tw->tw_bound_dev_if, &key,
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos,
			tw->tw_txhash);

	inet_twsk_put(tw);
}

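/* The function below sends an ACK on behalf of a request socket
 * (TCP_NEW_SYN_RECV), e.g. in answer to a retransmitted SYN or an
 * out-of-window segment, without ever instantiating a full socket.
 */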
static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	struct tcp_key key = {};

	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
					     tcp_sk(sk)->snd_nxt;

#ifdef CONFIG_TCP_AO
	if (static_branch_unlikely(&tcp_ao_needed.key) &&
	    tcp_rsk_used_ao(req)) {
		const union tcp_md5_addr *addr;
		const struct tcp_ao_hdr *aoh;
		int l3index;

		/* Invalid TCP option size or twice included auth */
		if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
			return;
		if (!aoh)
			return;

		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET,
					      aoh->rnext_keyid, -1);
		if (unlikely(!key.ao_key)) {
			/* Send ACK with any matching MKT for the peer */
			key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1);
			/* Matching key disappeared (user removed the key?)
			 * let the handshake timeout.
			 */
			if (!key.ao_key) {
				net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n",
						     addr,
						     ntohs(tcp_hdr(skb)->source),
						     &ip_hdr(skb)->daddr,
						     ntohs(tcp_hdr(skb)->dest));
				return;
			}
		}
		key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC);
		if (!key.traffic_key)
			return;

		key.type = TCP_KEY_AO;
		key.rcv_next = aoh->keyid;
		tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req);
#else
	if (0) {
#endif
	} else if (static_branch_tcp_md5()) {
		const union tcp_md5_addr *addr;
		int l3index;

		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
		if (key.md5_key)
			key.type = TCP_KEY_MD5;
	}

	tcp_v4_send_ack(sk, skb, seq,
			tcp_rsk(req)->rcv_nxt,
			tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale,
			tcp_rsk_tsval(tcp_rsk(req)),
			READ_ONCE(req->ts_recent),
			0, &key,
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos,
			READ_ONCE(tcp_rsk(req)->txhash));
	if (tcp_key_is_ao(&key))
		kfree(key.traffic_key);
}

/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
			      struct flowi *fl,
			      struct request_sock *req,
			      struct tcp_fastopen_cookie *foc,
			      enum tcp_synack_type synack_type,
			      struct sk_buff *syn_skb)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;
	u8 tos;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

		tos = READ_ONCE(inet_sk(sk)->tos);

		if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
			tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
			      (tos & INET_ECN_MASK);

		if (!INET_ECN_is_capable(tos) &&
		    tcp_bpf_ca_needs_ecn((struct sock *)req))
			tos |= INET_ECN_ECT_0;

		rcu_read_lock();
		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
					    rcu_dereference(ireq->ireq_opt),
					    tos);
		rcu_read_unlock();
		err = net_xmit_eval(err);
	}

	return err;
}

/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
EXPORT_SYMBOL(tcp_md5_needed);

static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
{
	if (!old)
		return true;

	/* l3index always overrides non-l3index */
	if (old->l3index && new->l3index == 0)
		return false;
	if (old->l3index == 0 && new->l3index)
		return true;

	return old->prefixlen < new->prefixlen;
}

/* Find the Key structure for an address.  */
struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
					   const union tcp_md5_addr *addr,
					   int family, bool any_l3index)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	const struct tcp_md5sig_info *md5sig;
	__be32 mask;
	struct tcp_md5sig_key *best_match = NULL;
	bool match;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;

	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX &&
		    key->l3index != l3index)
			continue;
		if (family == AF_INET) {
			mask = inet_make_mask(key->prefixlen);
			match = (key->addr.a4.s_addr & mask) ==
				(addr->a4.s_addr & mask);
#if IS_ENABLED(CONFIG_IPV6)
		} else if (family == AF_INET6) {
			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
						  key->prefixlen);
#endif
		} else {
			match = false;
		}

		if (match && better_md5_match(best_match, key))
			best_match = key;
	}
	return best_match;
}
EXPORT_SYMBOL(__tcp_md5_do_lookup);

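/* Unlike __tcp_md5_do_lookup() above, which returns the best (longest
 * prefix, most specific l3index) match for a peer address, the helper below
 * requires prefixlen, l3index and the IFINDEX flag to match exactly; it is
 * used by the add/delete paths to find the precise key being configured.
 */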
static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
						      const union tcp_md5_addr *addr,
						      int family, u8 prefixlen,
						      int l3index, u8 flags)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	const struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
			continue;
		if (key->l3index != l3index)
			continue;
		if (!memcmp(&key->addr, addr, size) &&
		    key->prefixlen == prefixlen)
			return key;
	}
	return NULL;
}

struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
					 const struct sock *addr_sk)
{
	const union tcp_md5_addr *addr;
	int l3index;

	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
						 addr_sk->sk_bound_dev_if);
	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	md5sig = kmalloc(sizeof(*md5sig), gfp);
	if (!md5sig)
		return -ENOMEM;

	sk_gso_disable(sk);
	INIT_HLIST_HEAD(&md5sig->head);
	rcu_assign_pointer(tp->md5sig_info, md5sig);
	return 0;
}

/* This can be called on a newly created socket, from other files */
static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
			    int family, u8 prefixlen, int l3index, u8 flags,
			    const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
	if (key) {
		/* Pre-existing entry - just update that one.
		 * Note that the key might be used concurrently.
		 * data_race() is telling kcsan that we do not care of
		 * key mismatches, since changing MD5 key on live flows
		 * can lead to packet drops.
		 */
		data_race(memcpy(key->key, newkey, newkeylen));

		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
		 * Also note that a reader could catch new key->keylen value
		 * but old key->key[], this is the reason we use __GFP_ZERO
		 * at sock_kmalloc() time below these lines.
		 */
		WRITE_ONCE(key->keylen, newkeylen);

		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   lockdep_sock_is_held(sk));

	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
	if (!key)
		return -ENOMEM;

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	key->prefixlen = prefixlen;
	key->l3index = l3index;
	key->flags = flags;
	memcpy(&key->addr, addr,
	       (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
								 sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}

int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, u8 prefixlen, int l3index, u8 flags,
		   const u8 *newkey, u8 newkeylen)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
		if (tcp_md5_alloc_sigpool())
			return -ENOMEM;

		if (tcp_md5sig_info_add(sk, GFP_KERNEL)) {
			tcp_md5_release_sigpool();
			return -ENOMEM;
		}

		if (!static_branch_inc(&tcp_md5_needed.key)) {
			struct tcp_md5sig_info *md5sig;

			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
			rcu_assign_pointer(tp->md5sig_info, NULL);
			kfree_rcu(md5sig, rcu);
			tcp_md5_release_sigpool();
			return -EUSERS;
		}
	}

	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
				newkey, newkeylen, GFP_KERNEL);
}
EXPORT_SYMBOL(tcp_md5_do_add);

int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
		     int family, u8 prefixlen, int l3index,
		     struct tcp_md5sig_key *key)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
		tcp_md5_add_sigpool();

		if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC))) {
			tcp_md5_release_sigpool();
			return -ENOMEM;
		}

		if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
			struct tcp_md5sig_info *md5sig;

			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
			net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
			rcu_assign_pointer(tp->md5sig_info, NULL);
			kfree_rcu(md5sig, rcu);
			tcp_md5_release_sigpool();
			return -EUSERS;
		}
	}

	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
				key->flags, key->key, key->keylen,
				sk_gfp_mask(sk, GFP_ATOMIC));
}
EXPORT_SYMBOL(tcp_md5_key_copy);

int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
		   u8 prefixlen, int l3index, u8 flags)
{
	struct tcp_md5sig_key *key;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);

void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
	}
}

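/* setsockopt(TCP_MD5SIG/TCP_MD5SIG_EXT) handler below: a zero key length
 * deletes the key for the given peer/prefix/ifindex tuple, otherwise the key
 * is added or updated.  TCP_MD5SIG_EXT additionally honours tcpm_prefixlen
 * and tcpm_ifindex (the latter must name an L3 master device).
 */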
static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
				 sockptr_t optval, int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
	const union tcp_md5_addr *addr;
	u8 prefixlen = 32;
	int l3index = 0;
	bool l3flag;
	u8 flags;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
	l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;

	if (optname == TCP_MD5SIG_EXT &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
		prefixlen = cmd.tcpm_prefixlen;
		if (prefixlen > 32)
			return -EINVAL;
	}

	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
		if (dev && netif_is_l3_master(dev))
			l3index = dev->ifindex;

		rcu_read_unlock();

		/* ok to reference set/not set outside of rcu;
		 * right now device MUST be an L3 master
		 */
		if (!dev || !l3index)
			return -EINVAL;
	}

	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;

	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	/* Don't allow keys for peers that have a matching TCP-AO key.
	 * See the comment in tcp_ao_add_cmd()
	 */
	if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false))
		return -EKEYREJECTED;

	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
			      cmd.tcpm_key, cmd.tcpm_keylen);
}

static int tcp_v4_md5_hash_headers(struct tcp_sigpool *hp,
				   __be32 daddr, __be32 saddr,
				   const struct tcphdr *th, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;
	struct tcphdr *_th;

	bp = hp->scratch;
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	_th = (struct tcphdr *)(bp + 1);
	memcpy(_th, th, sizeof(*th));
	_th->check = 0;

	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
	ahash_request_set_crypt(hp->req, &sg, NULL,
				sizeof(*bp) + sizeof(*th));
	return crypto_ahash_update(hp->req);
}

static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_sigpool hp;

	if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
		goto clear_hash_nostart;

	if (crypto_ahash_init(hp.req))
		goto clear_hash;
	if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(&hp, key))
		goto clear_hash;
	ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
	if (crypto_ahash_final(hp.req))
		goto clear_hash;

	tcp_sigpool_end(&hp);
	return 0;

clear_hash:
	tcp_sigpool_end(&hp);
clear_hash_nostart:
	memset(md5_hash, 0, 16);
	return 1;
}

int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
			const struct sock *sk,
			const struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct tcp_sigpool hp;
	__be32 saddr, daddr;

	if (sk) { /* valid for establish/request sockets */
		saddr = sk->sk_rcv_saddr;
		daddr = sk->sk_daddr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
		goto clear_hash_nostart;

	if (crypto_ahash_init(hp.req))
		goto clear_hash;

	if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, skb->len))
		goto clear_hash;
	if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(&hp, key))
		goto clear_hash;
	ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
	if (crypto_ahash_final(hp.req))
		goto clear_hash;

	tcp_sigpool_end(&hp);
	return 0;

clear_hash:
	tcp_sigpool_end(&hp);
clear_hash_nostart:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

#endif

static void tcp_v4_init_req(struct request_sock *req,
			    const struct sock *sk_listener,
			    struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	struct net *net = sock_net(sk_listener);

	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
}

static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
					  struct sk_buff *skb,
					  struct flowi *fl,
					  struct request_sock *req,
					  u32 tw_isn)
{
	tcp_v4_init_req(req, sk, skb);

	if (security_inet_conn_request(sk, skb, req))
		return NULL;

	return inet_csk_route_req(sk, &fl->u.ip4, req);
}

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};

const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.req_md5_lookup	=	tcp_v4_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#endif
#ifdef CONFIG_TCP_AO
	.ao_lookup	=	tcp_v4_ao_lookup_rsk,
	.ao_calc_key	=	tcp_v4_ao_calc_key_rsk,
	.ao_synack_hash	=	tcp_v4_ao_synack_hash,
#endif
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
#endif
	.route_req	=	tcp_v4_route_req,
	.init_seq	=	tcp_v4_init_seq,
	.init_ts_off	=	tcp_v4_init_ts_off,
	.send_synack	=	tcp_v4_send_synack,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&tcp_request_sock_ops,
				&tcp_request_sock_ipv4_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);


/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req)
{
	struct inet_request_sock *ireq;
	bool found_dup_sk = false;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	const union tcp_md5_addr *addr;
	struct tcp_md5sig_key *key;
	int l3index;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	sk_daddr_set(newsk, ireq->ir_rmt_addr);
	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
	newsk->sk_bound_dev_if = ireq->ir_iif;
	newinet->inet_saddr   = ireq->ir_loc_addr;
	inet_opt	      = rcu_dereference(ireq->ireq_opt);
	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	newinet->rcv_tos      = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	atomic_set(&newinet->inet_id, get_random_u16());

	/* Set ToS of the new socket based upon the value of incoming SYN.
	 * ECT bits are set later in tcp_init_transfer().
	 */
	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
	/* Copy over the MD5 key from the original socket */
	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	if (key && !tcp_rsk_used_ao(req)) {
		if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
			goto put_and_exit;
		sk_gso_disable(newsk);
	}
#endif
#ifdef CONFIG_TCP_AO
	if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET))
		goto put_and_exit; /* OOM, release back memory */
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
				       &found_dup_sk);
	if (likely(*own_req)) {
		tcp_move_syn(newtp, req);
		ireq->ireq_opt = NULL;
	} else {
		newinet->inet_opt = NULL;

		if (!req_unhash && found_dup_sk) {
			/* This code path should only be executed in the
			 * syncookie case
			 */
			bh_unlock_sock(newsk);
			sock_put(newsk);
			newsk = NULL;
		}
	}
	return newsk;

exit_overflow:
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	tcp_listendrop(sk);
	return NULL;
put_and_exit:
	newinet->inet_opt = NULL;
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	const struct tcphdr *th = tcp_hdr(skb);

	if (!th->syn)
		sk = cookie_v4_check(sk, skb);
#endif
	return sk;
}

u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
			 struct tcphdr *th, u32 *cookie)
{
	u16 mss = 0;
#ifdef CONFIG_SYN_COOKIES
	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
				    &tcp_request_sock_ipv4_ops, sk, th);
	if (mss) {
		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
		tcp_synq_overflow(sk);
	}
#endif
	return mss;
}

INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
							   u32));
/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	enum skb_drop_reason reason;
	struct sock *rsk;

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst;

		dst = rcu_dereference_protected(sk->sk_rx_dst,
						lockdep_sock_is_held(sk));

		sock_rps_save_rxhash(sk, skb);
		sk_mark_napi_id(sk, skb);
		if (dst) {
			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
					     dst, 0)) {
				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
				dst_release(dst);
			}
		}
		tcp_rcv_established(sk, skb);
		return 0;
	}

	if (tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_cookie_check(sk, skb);

		if (!nsk)
			return 0;
		if (nsk != sk) {
			reason = tcp_child_process(sk, nsk, skb);
			if (reason) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	reason = tcp_rcv_state_process(sk, skb);
	if (reason) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason));
discard:
	sk_skb_reason_drop(sk, skb, reason);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	reason = SKB_DROP_REASON_TCP_CSUM;
	trace_tcp_bad_csum(skb);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);

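/* Early demux (below): look up an established socket from the IP layer,
 * before the packet is handed to TCP proper, so that skb->sk and a cached
 * rx dst can be attached while the headers are still hot in cache.
 */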
1940 */ 1941 return 0; 1942 1943 csum_err: 1944 reason = SKB_DROP_REASON_TCP_CSUM; 1945 trace_tcp_bad_csum(skb); 1946 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 1947 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 1948 goto discard; 1949 } 1950 EXPORT_SYMBOL(tcp_v4_do_rcv); 1951 1952 int tcp_v4_early_demux(struct sk_buff *skb) 1953 { 1954 struct net *net = dev_net(skb->dev); 1955 const struct iphdr *iph; 1956 const struct tcphdr *th; 1957 struct sock *sk; 1958 1959 if (skb->pkt_type != PACKET_HOST) 1960 return 0; 1961 1962 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) 1963 return 0; 1964 1965 iph = ip_hdr(skb); 1966 th = tcp_hdr(skb); 1967 1968 if (th->doff < sizeof(struct tcphdr) / 4) 1969 return 0; 1970 1971 sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, 1972 iph->saddr, th->source, 1973 iph->daddr, ntohs(th->dest), 1974 skb->skb_iif, inet_sdif(skb)); 1975 if (sk) { 1976 skb->sk = sk; 1977 skb->destructor = sock_edemux; 1978 if (sk_fullsock(sk)) { 1979 struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst); 1980 1981 if (dst) 1982 dst = dst_check(dst, 0); 1983 if (dst && 1984 sk->sk_rx_dst_ifindex == skb->skb_iif) 1985 skb_dst_set_noref(skb, dst); 1986 } 1987 } 1988 return 0; 1989 } 1990 1991 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, 1992 enum skb_drop_reason *reason) 1993 { 1994 u32 tail_gso_size, tail_gso_segs; 1995 struct skb_shared_info *shinfo; 1996 const struct tcphdr *th; 1997 struct tcphdr *thtail; 1998 struct sk_buff *tail; 1999 unsigned int hdrlen; 2000 bool fragstolen; 2001 u32 gso_segs; 2002 u32 gso_size; 2003 u64 limit; 2004 int delta; 2005 2006 /* In case all data was pulled from skb frags (in __pskb_pull_tail()), 2007 * we can fix skb->truesize to its real value to avoid future drops. 2008 * This is valid because skb is not yet charged to the socket. 2009 * It has been noticed pure SACK packets were sometimes dropped 2010 * (if cooked by drivers without copybreak feature). 2011 */ 2012 skb_condense(skb); 2013 2014 skb_dst_drop(skb); 2015 2016 if (unlikely(tcp_checksum_complete(skb))) { 2017 bh_unlock_sock(sk); 2018 trace_tcp_bad_csum(skb); 2019 *reason = SKB_DROP_REASON_TCP_CSUM; 2020 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 2021 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 2022 return true; 2023 } 2024 2025 /* Attempt coalescing to last skb in backlog, even if we are 2026 * above the limits. 2027 * This is okay because skb capacity is limited to MAX_SKB_FRAGS. 
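 * Coalescing contiguous segments here leaves the socket owner fewer
 * skbs to process when the backlog is drained in __release_sock()
 * (accounted below via LINUX_MIB_TCPBACKLOGCOALESCE).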
2028 */ 2029 th = (const struct tcphdr *)skb->data; 2030 hdrlen = th->doff * 4; 2031 2032 tail = sk->sk_backlog.tail; 2033 if (!tail) 2034 goto no_coalesce; 2035 thtail = (struct tcphdr *)tail->data; 2036 2037 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq || 2038 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield || 2039 ((TCP_SKB_CB(tail)->tcp_flags | 2040 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) || 2041 !((TCP_SKB_CB(tail)->tcp_flags & 2042 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) || 2043 ((TCP_SKB_CB(tail)->tcp_flags ^ 2044 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) || 2045 !tcp_skb_can_collapse_rx(tail, skb) || 2046 thtail->doff != th->doff || 2047 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th))) 2048 goto no_coalesce; 2049 2050 __skb_pull(skb, hdrlen); 2051 2052 shinfo = skb_shinfo(skb); 2053 gso_size = shinfo->gso_size ?: skb->len; 2054 gso_segs = shinfo->gso_segs ?: 1; 2055 2056 shinfo = skb_shinfo(tail); 2057 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen); 2058 tail_gso_segs = shinfo->gso_segs ?: 1; 2059 2060 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) { 2061 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq; 2062 2063 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) { 2064 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq; 2065 thtail->window = th->window; 2066 } 2067 2068 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and 2069 * thtail->fin, so that the fast path in tcp_rcv_established() 2070 * is not entered if we append a packet with a FIN. 2071 * SYN, RST, URG are not present. 2072 * ACK is set on both packets. 2073 * PSH : we do not really care in TCP stack, 2074 * at least for 'GRO' packets. 2075 */ 2076 thtail->fin |= th->fin; 2077 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; 2078 2079 if (TCP_SKB_CB(skb)->has_rxtstamp) { 2080 TCP_SKB_CB(tail)->has_rxtstamp = true; 2081 tail->tstamp = skb->tstamp; 2082 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp; 2083 } 2084 2085 /* Not as strict as GRO. We only need to carry mss max value */ 2086 shinfo->gso_size = max(gso_size, tail_gso_size); 2087 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF); 2088 2089 sk->sk_backlog.len += delta; 2090 __NET_INC_STATS(sock_net(sk), 2091 LINUX_MIB_TCPBACKLOGCOALESCE); 2092 kfree_skb_partial(skb, fragstolen); 2093 return false; 2094 } 2095 __skb_push(skb, hdrlen); 2096 2097 no_coalesce: 2098 /* sk->sk_backlog.len is reset only at the end of __release_sock(). 2099 * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach 2100 * sk_rcvbuf in normal conditions. 2101 */ 2102 limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1; 2103 2104 limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1; 2105 2106 /* Only socket owner can try to collapse/prune rx queues 2107 * to reduce memory overhead, so add a little headroom here. 2108 * Only a few socket backlogs are likely to be non-empty at the same time.
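 * Together with the 64KB headroom added below, the limit is roughly
 * 2 * sk_rcvbuf + sk_sndbuf / 2 + 64KB, clamped to UINT_MAX.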
2109 */ 2110 limit += 64 * 1024; 2111 2112 limit = min_t(u64, limit, UINT_MAX); 2113 2114 if (unlikely(sk_add_backlog(sk, skb, limit))) { 2115 bh_unlock_sock(sk); 2116 *reason = SKB_DROP_REASON_SOCKET_BACKLOG; 2117 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP); 2118 return true; 2119 } 2120 return false; 2121 } 2122 EXPORT_SYMBOL(tcp_add_backlog); 2123 2124 int tcp_filter(struct sock *sk, struct sk_buff *skb) 2125 { 2126 struct tcphdr *th = (struct tcphdr *)skb->data; 2127 2128 return sk_filter_trim_cap(sk, skb, th->doff * 4); 2129 } 2130 EXPORT_SYMBOL(tcp_filter); 2131 2132 static void tcp_v4_restore_cb(struct sk_buff *skb) 2133 { 2134 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4, 2135 sizeof(struct inet_skb_parm)); 2136 } 2137 2138 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, 2139 const struct tcphdr *th) 2140 { 2141 /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB() 2142 * barrier() makes sure compiler wont play fool^Waliasing games. 2143 */ 2144 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb), 2145 sizeof(struct inet_skb_parm)); 2146 barrier(); 2147 2148 TCP_SKB_CB(skb)->seq = ntohl(th->seq); 2149 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + 2150 skb->len - th->doff * 4); 2151 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); 2152 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); 2153 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); 2154 TCP_SKB_CB(skb)->sacked = 0; 2155 TCP_SKB_CB(skb)->has_rxtstamp = 2156 skb->tstamp || skb_hwtstamps(skb)->hwtstamp; 2157 } 2158 2159 /* 2160 * From tcp_input.c 2161 */ 2162 2163 int tcp_v4_rcv(struct sk_buff *skb) 2164 { 2165 struct net *net = dev_net(skb->dev); 2166 enum skb_drop_reason drop_reason; 2167 int sdif = inet_sdif(skb); 2168 int dif = inet_iif(skb); 2169 const struct iphdr *iph; 2170 const struct tcphdr *th; 2171 struct sock *sk = NULL; 2172 bool refcounted; 2173 int ret; 2174 u32 isn; 2175 2176 drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; 2177 if (skb->pkt_type != PACKET_HOST) 2178 goto discard_it; 2179 2180 /* Count it even if it's bad */ 2181 __TCP_INC_STATS(net, TCP_MIB_INSEGS); 2182 2183 if (!pskb_may_pull(skb, sizeof(struct tcphdr))) 2184 goto discard_it; 2185 2186 th = (const struct tcphdr *)skb->data; 2187 2188 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) { 2189 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL; 2190 goto bad_packet; 2191 } 2192 if (!pskb_may_pull(skb, th->doff * 4)) 2193 goto discard_it; 2194 2195 /* An explanation is required here, I think. 2196 * Packet length and doff are validated by header prediction, 2197 * provided case of th->doff==0 is eliminated. 2198 * So, we defer the checks. 
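 * The checksum is initialized against the pseudo-header right below;
 * any remaining software verification is done later through
 * tcp_checksum_complete() once a socket has been looked up.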
*/ 2199 2200 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo)) 2201 goto csum_error; 2202 2203 th = (const struct tcphdr *)skb->data; 2204 iph = ip_hdr(skb); 2205 lookup: 2206 sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo, 2207 skb, __tcp_hdrlen(th), th->source, 2208 th->dest, sdif, &refcounted); 2209 if (!sk) 2210 goto no_tcp_socket; 2211 2212 if (sk->sk_state == TCP_TIME_WAIT) 2213 goto do_time_wait; 2214 2215 if (sk->sk_state == TCP_NEW_SYN_RECV) { 2216 struct request_sock *req = inet_reqsk(sk); 2217 bool req_stolen = false; 2218 struct sock *nsk; 2219 2220 sk = req->rsk_listener; 2221 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) 2222 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2223 else 2224 drop_reason = tcp_inbound_hash(sk, req, skb, 2225 &iph->saddr, &iph->daddr, 2226 AF_INET, dif, sdif); 2227 if (unlikely(drop_reason)) { 2228 sk_drops_add(sk, skb); 2229 reqsk_put(req); 2230 goto discard_it; 2231 } 2232 if (tcp_checksum_complete(skb)) { 2233 reqsk_put(req); 2234 goto csum_error; 2235 } 2236 if (unlikely(sk->sk_state != TCP_LISTEN)) { 2237 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb); 2238 if (!nsk) { 2239 inet_csk_reqsk_queue_drop_and_put(sk, req); 2240 goto lookup; 2241 } 2242 sk = nsk; 2243 /* reuseport_migrate_sock() has already held one sk_refcnt 2244 * before returning. 2245 */ 2246 } else { 2247 /* We own a reference on the listener, increase it again 2248 * as we might lose it too soon. 2249 */ 2250 sock_hold(sk); 2251 } 2252 refcounted = true; 2253 nsk = NULL; 2254 if (!tcp_filter(sk, skb)) { 2255 th = (const struct tcphdr *)skb->data; 2256 iph = ip_hdr(skb); 2257 tcp_v4_fill_cb(skb, iph, th); 2258 nsk = tcp_check_req(sk, skb, req, false, &req_stolen); 2259 } else { 2260 drop_reason = SKB_DROP_REASON_SOCKET_FILTER; 2261 } 2262 if (!nsk) { 2263 reqsk_put(req); 2264 if (req_stolen) { 2265 /* Another cpu got exclusive access to req 2266 * and created a full blown socket. 2267 * Try to feed this packet to this socket 2268 * instead of discarding it. 
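 * tcp_v4_restore_cb() undoes tcp_v4_fill_cb() so the lookup path
 * can start again from an unmodified control block.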
2269 */ 2270 tcp_v4_restore_cb(skb); 2271 sock_put(sk); 2272 goto lookup; 2273 } 2274 goto discard_and_relse; 2275 } 2276 nf_reset_ct(skb); 2277 if (nsk == sk) { 2278 reqsk_put(req); 2279 tcp_v4_restore_cb(skb); 2280 } else { 2281 drop_reason = tcp_child_process(sk, nsk, skb); 2282 if (drop_reason) { 2283 enum sk_rst_reason rst_reason; 2284 2285 rst_reason = sk_rst_convert_drop_reason(drop_reason); 2286 tcp_v4_send_reset(nsk, skb, rst_reason); 2287 goto discard_and_relse; 2288 } 2289 sock_put(sk); 2290 return 0; 2291 } 2292 } 2293 2294 process: 2295 if (static_branch_unlikely(&ip4_min_ttl)) { 2296 /* min_ttl can be changed concurrently from do_ip_setsockopt() */ 2297 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) { 2298 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 2299 drop_reason = SKB_DROP_REASON_TCP_MINTTL; 2300 goto discard_and_relse; 2301 } 2302 } 2303 2304 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { 2305 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2306 goto discard_and_relse; 2307 } 2308 2309 drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr, 2310 AF_INET, dif, sdif); 2311 if (drop_reason) 2312 goto discard_and_relse; 2313 2314 nf_reset_ct(skb); 2315 2316 if (tcp_filter(sk, skb)) { 2317 drop_reason = SKB_DROP_REASON_SOCKET_FILTER; 2318 goto discard_and_relse; 2319 } 2320 th = (const struct tcphdr *)skb->data; 2321 iph = ip_hdr(skb); 2322 tcp_v4_fill_cb(skb, iph, th); 2323 2324 skb->dev = NULL; 2325 2326 if (sk->sk_state == TCP_LISTEN) { 2327 ret = tcp_v4_do_rcv(sk, skb); 2328 goto put_and_return; 2329 } 2330 2331 sk_incoming_cpu_update(sk); 2332 2333 bh_lock_sock_nested(sk); 2334 tcp_segs_in(tcp_sk(sk), skb); 2335 ret = 0; 2336 if (!sock_owned_by_user(sk)) { 2337 ret = tcp_v4_do_rcv(sk, skb); 2338 } else { 2339 if (tcp_add_backlog(sk, skb, &drop_reason)) 2340 goto discard_and_relse; 2341 } 2342 bh_unlock_sock(sk); 2343 2344 put_and_return: 2345 if (refcounted) 2346 sock_put(sk); 2347 2348 return ret; 2349 2350 no_tcp_socket: 2351 drop_reason = SKB_DROP_REASON_NO_SOCKET; 2352 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) 2353 goto discard_it; 2354 2355 tcp_v4_fill_cb(skb, iph, th); 2356 2357 if (tcp_checksum_complete(skb)) { 2358 csum_error: 2359 drop_reason = SKB_DROP_REASON_TCP_CSUM; 2360 trace_tcp_bad_csum(skb); 2361 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); 2362 bad_packet: 2363 __TCP_INC_STATS(net, TCP_MIB_INERRS); 2364 } else { 2365 tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason)); 2366 } 2367 2368 discard_it: 2369 SKB_DR_OR(drop_reason, NOT_SPECIFIED); 2370 /* Discard frame. 
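 * SKB_DR_OR() above only fills in NOT_SPECIFIED when no more precise
 * drop reason was recorded on the way here.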
*/ 2371 sk_skb_reason_drop(sk, skb, drop_reason); 2372 return 0; 2373 2374 discard_and_relse: 2375 sk_drops_add(sk, skb); 2376 if (refcounted) 2377 sock_put(sk); 2378 goto discard_it; 2379 2380 do_time_wait: 2381 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { 2382 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2383 inet_twsk_put(inet_twsk(sk)); 2384 goto discard_it; 2385 } 2386 2387 tcp_v4_fill_cb(skb, iph, th); 2388 2389 if (tcp_checksum_complete(skb)) { 2390 inet_twsk_put(inet_twsk(sk)); 2391 goto csum_error; 2392 } 2393 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn)) { 2394 case TCP_TW_SYN: { 2395 struct sock *sk2 = inet_lookup_listener(net, 2396 net->ipv4.tcp_death_row.hashinfo, 2397 skb, __tcp_hdrlen(th), 2398 iph->saddr, th->source, 2399 iph->daddr, th->dest, 2400 inet_iif(skb), 2401 sdif); 2402 if (sk2) { 2403 inet_twsk_deschedule_put(inet_twsk(sk)); 2404 sk = sk2; 2405 tcp_v4_restore_cb(skb); 2406 refcounted = false; 2407 __this_cpu_write(tcp_tw_isn, isn); 2408 goto process; 2409 } 2410 } 2411 /* to ACK */ 2412 fallthrough; 2413 case TCP_TW_ACK: 2414 tcp_v4_timewait_ack(sk, skb); 2415 break; 2416 case TCP_TW_RST: 2417 tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET); 2418 inet_twsk_deschedule_put(inet_twsk(sk)); 2419 goto discard_it; 2420 case TCP_TW_SUCCESS:; 2421 } 2422 goto discard_it; 2423 } 2424 2425 static struct timewait_sock_ops tcp_timewait_sock_ops = { 2426 .twsk_obj_size = sizeof(struct tcp_timewait_sock), 2427 .twsk_destructor= tcp_twsk_destructor, 2428 }; 2429 2430 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) 2431 { 2432 struct dst_entry *dst = skb_dst(skb); 2433 2434 if (dst && dst_hold_safe(dst)) { 2435 rcu_assign_pointer(sk->sk_rx_dst, dst); 2436 sk->sk_rx_dst_ifindex = skb->skb_iif; 2437 } 2438 } 2439 EXPORT_SYMBOL(inet_sk_rx_dst_set); 2440 2441 const struct inet_connection_sock_af_ops ipv4_specific = { 2442 .queue_xmit = ip_queue_xmit, 2443 .send_check = tcp_v4_send_check, 2444 .rebuild_header = inet_sk_rebuild_header, 2445 .sk_rx_dst_set = inet_sk_rx_dst_set, 2446 .conn_request = tcp_v4_conn_request, 2447 .syn_recv_sock = tcp_v4_syn_recv_sock, 2448 .net_header_len = sizeof(struct iphdr), 2449 .setsockopt = ip_setsockopt, 2450 .getsockopt = ip_getsockopt, 2451 .addr2sockaddr = inet_csk_addr2sockaddr, 2452 .sockaddr_len = sizeof(struct sockaddr_in), 2453 .mtu_reduced = tcp_v4_mtu_reduced, 2454 }; 2455 EXPORT_SYMBOL(ipv4_specific); 2456 2457 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) 2458 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { 2459 #ifdef CONFIG_TCP_MD5SIG 2460 .md5_lookup = tcp_v4_md5_lookup, 2461 .calc_md5_hash = tcp_v4_md5_hash_skb, 2462 .md5_parse = tcp_v4_parse_md5_keys, 2463 #endif 2464 #ifdef CONFIG_TCP_AO 2465 .ao_lookup = tcp_v4_ao_lookup, 2466 .calc_ao_hash = tcp_v4_ao_hash_skb, 2467 .ao_parse = tcp_v4_parse_ao, 2468 .ao_calc_key_sk = tcp_v4_ao_calc_key_sk, 2469 #endif 2470 }; 2471 #endif 2472 2473 /* NOTE: A lot of things set to zero explicitly by call to 2474 * sk_alloc() so need not be done here. 
2475 */ 2476 static int tcp_v4_init_sock(struct sock *sk) 2477 { 2478 struct inet_connection_sock *icsk = inet_csk(sk); 2479 2480 tcp_init_sock(sk); 2481 2482 icsk->icsk_af_ops = &ipv4_specific; 2483 2484 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) 2485 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific; 2486 #endif 2487 2488 return 0; 2489 } 2490 2491 #ifdef CONFIG_TCP_MD5SIG 2492 static void tcp_md5sig_info_free_rcu(struct rcu_head *head) 2493 { 2494 struct tcp_md5sig_info *md5sig; 2495 2496 md5sig = container_of(head, struct tcp_md5sig_info, rcu); 2497 kfree(md5sig); 2498 static_branch_slow_dec_deferred(&tcp_md5_needed); 2499 tcp_md5_release_sigpool(); 2500 } 2501 #endif 2502 2503 void tcp_v4_destroy_sock(struct sock *sk) 2504 { 2505 struct tcp_sock *tp = tcp_sk(sk); 2506 2507 trace_tcp_destroy_sock(sk); 2508 2509 tcp_clear_xmit_timers(sk); 2510 2511 tcp_cleanup_congestion_control(sk); 2512 2513 tcp_cleanup_ulp(sk); 2514 2515 /* Cleanup up the write buffer. */ 2516 tcp_write_queue_purge(sk); 2517 2518 /* Check if we want to disable active TFO */ 2519 tcp_fastopen_active_disable_ofo_check(sk); 2520 2521 /* Cleans up our, hopefully empty, out_of_order_queue. */ 2522 skb_rbtree_purge(&tp->out_of_order_queue); 2523 2524 #ifdef CONFIG_TCP_MD5SIG 2525 /* Clean up the MD5 key list, if any */ 2526 if (tp->md5sig_info) { 2527 struct tcp_md5sig_info *md5sig; 2528 2529 md5sig = rcu_dereference_protected(tp->md5sig_info, 1); 2530 tcp_clear_md5_list(sk); 2531 call_rcu(&md5sig->rcu, tcp_md5sig_info_free_rcu); 2532 rcu_assign_pointer(tp->md5sig_info, NULL); 2533 } 2534 #endif 2535 tcp_ao_destroy_sock(sk, false); 2536 2537 /* Clean up a referenced TCP bind bucket. */ 2538 if (inet_csk(sk)->icsk_bind_hash) 2539 inet_put_port(sk); 2540 2541 BUG_ON(rcu_access_pointer(tp->fastopen_rsk)); 2542 2543 /* If socket is aborted during connect operation */ 2544 tcp_free_fastopen_req(tp); 2545 tcp_fastopen_destroy_cipher(sk); 2546 tcp_saved_syn_free(tp); 2547 2548 sk_sockets_allocated_dec(sk); 2549 } 2550 EXPORT_SYMBOL(tcp_v4_destroy_sock); 2551 2552 #ifdef CONFIG_PROC_FS 2553 /* Proc filesystem TCP sock list dumping. */ 2554 2555 static unsigned short seq_file_family(const struct seq_file *seq); 2556 2557 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk) 2558 { 2559 unsigned short family = seq_file_family(seq); 2560 2561 /* AF_UNSPEC is used as a match all */ 2562 return ((family == AF_UNSPEC || family == sk->sk_family) && 2563 net_eq(sock_net(sk), seq_file_net(seq))); 2564 } 2565 2566 /* Find a non empty bucket (starting from st->bucket) 2567 * and return the first sk from it. 2568 */ 2569 static void *listening_get_first(struct seq_file *seq) 2570 { 2571 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2572 struct tcp_iter_state *st = seq->private; 2573 2574 st->offset = 0; 2575 for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) { 2576 struct inet_listen_hashbucket *ilb2; 2577 struct hlist_nulls_node *node; 2578 struct sock *sk; 2579 2580 ilb2 = &hinfo->lhash2[st->bucket]; 2581 if (hlist_nulls_empty(&ilb2->nulls_head)) 2582 continue; 2583 2584 spin_lock(&ilb2->lock); 2585 sk_nulls_for_each(sk, node, &ilb2->nulls_head) { 2586 if (seq_sk_match(seq, sk)) 2587 return sk; 2588 } 2589 spin_unlock(&ilb2->lock); 2590 } 2591 2592 return NULL; 2593 } 2594 2595 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket). 
2596 * If "cur" is the last one in the st->bucket, 2597 * call listening_get_first() to return the first sk of the next 2598 * non empty bucket. 2599 */ 2600 static void *listening_get_next(struct seq_file *seq, void *cur) 2601 { 2602 struct tcp_iter_state *st = seq->private; 2603 struct inet_listen_hashbucket *ilb2; 2604 struct hlist_nulls_node *node; 2605 struct inet_hashinfo *hinfo; 2606 struct sock *sk = cur; 2607 2608 ++st->num; 2609 ++st->offset; 2610 2611 sk = sk_nulls_next(sk); 2612 sk_nulls_for_each_from(sk, node) { 2613 if (seq_sk_match(seq, sk)) 2614 return sk; 2615 } 2616 2617 hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2618 ilb2 = &hinfo->lhash2[st->bucket]; 2619 spin_unlock(&ilb2->lock); 2620 ++st->bucket; 2621 return listening_get_first(seq); 2622 } 2623 2624 static void *listening_get_idx(struct seq_file *seq, loff_t *pos) 2625 { 2626 struct tcp_iter_state *st = seq->private; 2627 void *rc; 2628 2629 st->bucket = 0; 2630 st->offset = 0; 2631 rc = listening_get_first(seq); 2632 2633 while (rc && *pos) { 2634 rc = listening_get_next(seq, rc); 2635 --*pos; 2636 } 2637 return rc; 2638 } 2639 2640 static inline bool empty_bucket(struct inet_hashinfo *hinfo, 2641 const struct tcp_iter_state *st) 2642 { 2643 return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain); 2644 } 2645 2646 /* 2647 * Get first established socket starting from bucket given in st->bucket. 2648 * If st->bucket is zero, the very first socket in the hash is returned. 2649 */ 2650 static void *established_get_first(struct seq_file *seq) 2651 { 2652 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2653 struct tcp_iter_state *st = seq->private; 2654 2655 st->offset = 0; 2656 for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) { 2657 struct sock *sk; 2658 struct hlist_nulls_node *node; 2659 spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket); 2660 2661 cond_resched(); 2662 2663 /* Lockless fast path for the common case of empty buckets */ 2664 if (empty_bucket(hinfo, st)) 2665 continue; 2666 2667 spin_lock_bh(lock); 2668 sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) { 2669 if (seq_sk_match(seq, sk)) 2670 return sk; 2671 } 2672 spin_unlock_bh(lock); 2673 } 2674 2675 return NULL; 2676 } 2677 2678 static void *established_get_next(struct seq_file *seq, void *cur) 2679 { 2680 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2681 struct tcp_iter_state *st = seq->private; 2682 struct hlist_nulls_node *node; 2683 struct sock *sk = cur; 2684 2685 ++st->num; 2686 ++st->offset; 2687 2688 sk = sk_nulls_next(sk); 2689 2690 sk_nulls_for_each_from(sk, node) { 2691 if (seq_sk_match(seq, sk)) 2692 return sk; 2693 } 2694 2695 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 2696 ++st->bucket; 2697 return established_get_first(seq); 2698 } 2699 2700 static void *established_get_idx(struct seq_file *seq, loff_t pos) 2701 { 2702 struct tcp_iter_state *st = seq->private; 2703 void *rc; 2704 2705 st->bucket = 0; 2706 rc = established_get_first(seq); 2707 2708 while (rc && pos) { 2709 rc = established_get_next(seq, rc); 2710 --pos; 2711 } 2712 return rc; 2713 } 2714 2715 static void *tcp_get_idx(struct seq_file *seq, loff_t pos) 2716 { 2717 void *rc; 2718 struct tcp_iter_state *st = seq->private; 2719 2720 st->state = TCP_SEQ_STATE_LISTENING; 2721 rc = listening_get_idx(seq, &pos); 2722 2723 if (!rc) { 2724 st->state = TCP_SEQ_STATE_ESTABLISHED; 2725 rc = established_get_idx(seq, pos); 2726 } 2727 2728 return rc; 2729 } 2730 2731 static void 
*tcp_seek_last_pos(struct seq_file *seq) 2732 { 2733 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2734 struct tcp_iter_state *st = seq->private; 2735 int bucket = st->bucket; 2736 int offset = st->offset; 2737 int orig_num = st->num; 2738 void *rc = NULL; 2739 2740 switch (st->state) { 2741 case TCP_SEQ_STATE_LISTENING: 2742 if (st->bucket > hinfo->lhash2_mask) 2743 break; 2744 rc = listening_get_first(seq); 2745 while (offset-- && rc && bucket == st->bucket) 2746 rc = listening_get_next(seq, rc); 2747 if (rc) 2748 break; 2749 st->bucket = 0; 2750 st->state = TCP_SEQ_STATE_ESTABLISHED; 2751 fallthrough; 2752 case TCP_SEQ_STATE_ESTABLISHED: 2753 if (st->bucket > hinfo->ehash_mask) 2754 break; 2755 rc = established_get_first(seq); 2756 while (offset-- && rc && bucket == st->bucket) 2757 rc = established_get_next(seq, rc); 2758 } 2759 2760 st->num = orig_num; 2761 2762 return rc; 2763 } 2764 2765 void *tcp_seq_start(struct seq_file *seq, loff_t *pos) 2766 { 2767 struct tcp_iter_state *st = seq->private; 2768 void *rc; 2769 2770 if (*pos && *pos == st->last_pos) { 2771 rc = tcp_seek_last_pos(seq); 2772 if (rc) 2773 goto out; 2774 } 2775 2776 st->state = TCP_SEQ_STATE_LISTENING; 2777 st->num = 0; 2778 st->bucket = 0; 2779 st->offset = 0; 2780 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; 2781 2782 out: 2783 st->last_pos = *pos; 2784 return rc; 2785 } 2786 EXPORT_SYMBOL(tcp_seq_start); 2787 2788 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2789 { 2790 struct tcp_iter_state *st = seq->private; 2791 void *rc = NULL; 2792 2793 if (v == SEQ_START_TOKEN) { 2794 rc = tcp_get_idx(seq, 0); 2795 goto out; 2796 } 2797 2798 switch (st->state) { 2799 case TCP_SEQ_STATE_LISTENING: 2800 rc = listening_get_next(seq, v); 2801 if (!rc) { 2802 st->state = TCP_SEQ_STATE_ESTABLISHED; 2803 st->bucket = 0; 2804 st->offset = 0; 2805 rc = established_get_first(seq); 2806 } 2807 break; 2808 case TCP_SEQ_STATE_ESTABLISHED: 2809 rc = established_get_next(seq, v); 2810 break; 2811 } 2812 out: 2813 ++*pos; 2814 st->last_pos = *pos; 2815 return rc; 2816 } 2817 EXPORT_SYMBOL(tcp_seq_next); 2818 2819 void tcp_seq_stop(struct seq_file *seq, void *v) 2820 { 2821 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2822 struct tcp_iter_state *st = seq->private; 2823 2824 switch (st->state) { 2825 case TCP_SEQ_STATE_LISTENING: 2826 if (v != SEQ_START_TOKEN) 2827 spin_unlock(&hinfo->lhash2[st->bucket].lock); 2828 break; 2829 case TCP_SEQ_STATE_ESTABLISHED: 2830 if (v) 2831 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 2832 break; 2833 } 2834 } 2835 EXPORT_SYMBOL(tcp_seq_stop); 2836 2837 static void get_openreq4(const struct request_sock *req, 2838 struct seq_file *f, int i) 2839 { 2840 const struct inet_request_sock *ireq = inet_rsk(req); 2841 long delta = req->rsk_timer.expires - jiffies; 2842 2843 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2844 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK", 2845 i, 2846 ireq->ir_loc_addr, 2847 ireq->ir_num, 2848 ireq->ir_rmt_addr, 2849 ntohs(ireq->ir_rmt_port), 2850 TCP_SYN_RECV, 2851 0, 0, /* could print option size, but that is af dependent. 
*/ 2852 1, /* timers active (only the expire timer) */ 2853 jiffies_delta_to_clock_t(delta), 2854 req->num_timeout, 2855 from_kuid_munged(seq_user_ns(f), 2856 sock_i_uid(req->rsk_listener)), 2857 0, /* non standard timer */ 2858 0, /* open_requests have no inode */ 2859 0, 2860 req); 2861 } 2862 2863 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) 2864 { 2865 int timer_active; 2866 unsigned long timer_expires; 2867 const struct tcp_sock *tp = tcp_sk(sk); 2868 const struct inet_connection_sock *icsk = inet_csk(sk); 2869 const struct inet_sock *inet = inet_sk(sk); 2870 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; 2871 __be32 dest = inet->inet_daddr; 2872 __be32 src = inet->inet_rcv_saddr; 2873 __u16 destp = ntohs(inet->inet_dport); 2874 __u16 srcp = ntohs(inet->inet_sport); 2875 int rx_queue; 2876 int state; 2877 2878 if (icsk->icsk_pending == ICSK_TIME_RETRANS || 2879 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || 2880 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { 2881 timer_active = 1; 2882 timer_expires = icsk->icsk_timeout; 2883 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { 2884 timer_active = 4; 2885 timer_expires = icsk->icsk_timeout; 2886 } else if (timer_pending(&sk->sk_timer)) { 2887 timer_active = 2; 2888 timer_expires = sk->sk_timer.expires; 2889 } else { 2890 timer_active = 0; 2891 timer_expires = jiffies; 2892 } 2893 2894 state = inet_sk_state_load(sk); 2895 if (state == TCP_LISTEN) 2896 rx_queue = READ_ONCE(sk->sk_ack_backlog); 2897 else 2898 /* Because we don't lock the socket, 2899 * we might find a transient negative value. 2900 */ 2901 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) - 2902 READ_ONCE(tp->copied_seq), 0); 2903 2904 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " 2905 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d", 2906 i, src, srcp, dest, destp, state, 2907 READ_ONCE(tp->write_seq) - tp->snd_una, 2908 rx_queue, 2909 timer_active, 2910 jiffies_delta_to_clock_t(timer_expires - jiffies), 2911 icsk->icsk_retransmits, 2912 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)), 2913 icsk->icsk_probes_out, 2914 sock_i_ino(sk), 2915 refcount_read(&sk->sk_refcnt), sk, 2916 jiffies_to_clock_t(icsk->icsk_rto), 2917 jiffies_to_clock_t(icsk->icsk_ack.ato), 2918 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk), 2919 tcp_snd_cwnd(tp), 2920 state == TCP_LISTEN ? 2921 fastopenq->max_qlen : 2922 (tcp_in_initial_slowstart(tp) ? 
-1 : tp->snd_ssthresh)); 2923 } 2924 2925 static void get_timewait4_sock(const struct inet_timewait_sock *tw, 2926 struct seq_file *f, int i) 2927 { 2928 long delta = tw->tw_timer.expires - jiffies; 2929 __be32 dest, src; 2930 __u16 destp, srcp; 2931 2932 dest = tw->tw_daddr; 2933 src = tw->tw_rcv_saddr; 2934 destp = ntohs(tw->tw_dport); 2935 srcp = ntohs(tw->tw_sport); 2936 2937 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2938 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK", 2939 i, src, srcp, dest, destp, tw->tw_substate, 0, 0, 2940 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, 2941 refcount_read(&tw->tw_refcnt), tw); 2942 } 2943 2944 #define TMPSZ 150 2945 2946 static int tcp4_seq_show(struct seq_file *seq, void *v) 2947 { 2948 struct tcp_iter_state *st; 2949 struct sock *sk = v; 2950 2951 seq_setwidth(seq, TMPSZ - 1); 2952 if (v == SEQ_START_TOKEN) { 2953 seq_puts(seq, " sl local_address rem_address st tx_queue " 2954 "rx_queue tr tm->when retrnsmt uid timeout " 2955 "inode"); 2956 goto out; 2957 } 2958 st = seq->private; 2959 2960 if (sk->sk_state == TCP_TIME_WAIT) 2961 get_timewait4_sock(v, seq, st->num); 2962 else if (sk->sk_state == TCP_NEW_SYN_RECV) 2963 get_openreq4(v, seq, st->num); 2964 else 2965 get_tcp4_sock(v, seq, st->num); 2966 out: 2967 seq_pad(seq, '\n'); 2968 return 0; 2969 } 2970 2971 #ifdef CONFIG_BPF_SYSCALL 2972 struct bpf_tcp_iter_state { 2973 struct tcp_iter_state state; 2974 unsigned int cur_sk; 2975 unsigned int end_sk; 2976 unsigned int max_sk; 2977 struct sock **batch; 2978 bool st_bucket_done; 2979 }; 2980 2981 struct bpf_iter__tcp { 2982 __bpf_md_ptr(struct bpf_iter_meta *, meta); 2983 __bpf_md_ptr(struct sock_common *, sk_common); 2984 uid_t uid __aligned(8); 2985 }; 2986 2987 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, 2988 struct sock_common *sk_common, uid_t uid) 2989 { 2990 struct bpf_iter__tcp ctx; 2991 2992 meta->seq_num--; /* skip SEQ_START_TOKEN */ 2993 ctx.meta = meta; 2994 ctx.sk_common = sk_common; 2995 ctx.uid = uid; 2996 return bpf_iter_run_prog(prog, &ctx); 2997 } 2998 2999 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter) 3000 { 3001 while (iter->cur_sk < iter->end_sk) 3002 sock_gen_put(iter->batch[iter->cur_sk++]); 3003 } 3004 3005 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter, 3006 unsigned int new_batch_sz) 3007 { 3008 struct sock **new_batch; 3009 3010 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, 3011 GFP_USER | __GFP_NOWARN); 3012 if (!new_batch) 3013 return -ENOMEM; 3014 3015 bpf_iter_tcp_put_batch(iter); 3016 kvfree(iter->batch); 3017 iter->batch = new_batch; 3018 iter->max_sk = new_batch_sz; 3019 3020 return 0; 3021 } 3022 3023 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq, 3024 struct sock *start_sk) 3025 { 3026 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 3027 struct bpf_tcp_iter_state *iter = seq->private; 3028 struct tcp_iter_state *st = &iter->state; 3029 struct hlist_nulls_node *node; 3030 unsigned int expected = 1; 3031 struct sock *sk; 3032 3033 sock_hold(start_sk); 3034 iter->batch[iter->end_sk++] = start_sk; 3035 3036 sk = sk_nulls_next(start_sk); 3037 sk_nulls_for_each_from(sk, node) { 3038 if (seq_sk_match(seq, sk)) { 3039 if (iter->end_sk < iter->max_sk) { 3040 sock_hold(sk); 3041 iter->batch[iter->end_sk++] = sk; 3042 } 3043 expected++; 3044 } 3045 } 3046 spin_unlock(&hinfo->lhash2[st->bucket].lock); 3047 3048 return expected; 3049 } 3050 3051 static unsigned int 
bpf_iter_tcp_established_batch(struct seq_file *seq, 3052 struct sock *start_sk) 3053 { 3054 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 3055 struct bpf_tcp_iter_state *iter = seq->private; 3056 struct tcp_iter_state *st = &iter->state; 3057 struct hlist_nulls_node *node; 3058 unsigned int expected = 1; 3059 struct sock *sk; 3060 3061 sock_hold(start_sk); 3062 iter->batch[iter->end_sk++] = start_sk; 3063 3064 sk = sk_nulls_next(start_sk); 3065 sk_nulls_for_each_from(sk, node) { 3066 if (seq_sk_match(seq, sk)) { 3067 if (iter->end_sk < iter->max_sk) { 3068 sock_hold(sk); 3069 iter->batch[iter->end_sk++] = sk; 3070 } 3071 expected++; 3072 } 3073 } 3074 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 3075 3076 return expected; 3077 } 3078 3079 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq) 3080 { 3081 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 3082 struct bpf_tcp_iter_state *iter = seq->private; 3083 struct tcp_iter_state *st = &iter->state; 3084 unsigned int expected; 3085 bool resized = false; 3086 struct sock *sk; 3087 3088 /* The st->bucket is done. Directly advance to the next 3089 * bucket instead of letting tcp_seek_last_pos() skip sockets 3090 * one by one in the current bucket, only to eventually find out 3091 * it has to advance to the next bucket. 3092 */ 3093 if (iter->st_bucket_done) { 3094 st->offset = 0; 3095 st->bucket++; 3096 if (st->state == TCP_SEQ_STATE_LISTENING && 3097 st->bucket > hinfo->lhash2_mask) { 3098 st->state = TCP_SEQ_STATE_ESTABLISHED; 3099 st->bucket = 0; 3100 } 3101 } 3102 3103 again: 3104 /* Get a new batch */ 3105 iter->cur_sk = 0; 3106 iter->end_sk = 0; 3107 iter->st_bucket_done = false; 3108 3109 sk = tcp_seek_last_pos(seq); 3110 if (!sk) 3111 return NULL; /* Done */ 3112 3113 if (st->state == TCP_SEQ_STATE_LISTENING) 3114 expected = bpf_iter_tcp_listening_batch(seq, sk); 3115 else 3116 expected = bpf_iter_tcp_established_batch(seq, sk); 3117 3118 if (iter->end_sk == expected) { 3119 iter->st_bucket_done = true; 3120 return sk; 3121 } 3122 3123 if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) { 3124 resized = true; 3125 goto again; 3126 } 3127 3128 return sk; 3129 } 3130 3131 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos) 3132 { 3133 /* bpf iter does not support lseek, so it always 3134 * continues from where it was stop()-ped. 3135 */ 3136 if (*pos) 3137 return bpf_iter_tcp_batch(seq); 3138 3139 return SEQ_START_TOKEN; 3140 } 3141 3142 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3143 { 3144 struct bpf_tcp_iter_state *iter = seq->private; 3145 struct tcp_iter_state *st = &iter->state; 3146 struct sock *sk; 3147 3148 /* Whenever seq_next() is called, the iter->cur_sk is 3149 * done with seq_show(), so advance to the next sk in 3150 * the batch. 3151 */ 3152 if (iter->cur_sk < iter->end_sk) { 3153 /* Keeping st->num consistent in tcp_iter_state. 3154 * bpf_iter_tcp does not use st->num. 3155 * meta.seq_num is used instead. 3156 */ 3157 st->num++; 3158 /* Move st->offset to the next sk in the bucket such that 3159 * the future start() will resume at st->offset in 3160 * st->bucket. See tcp_seek_last_pos(). 3161 */ 3162 st->offset++; 3163 sock_gen_put(iter->batch[iter->cur_sk++]); 3164 } 3165 3166 if (iter->cur_sk < iter->end_sk) 3167 sk = iter->batch[iter->cur_sk]; 3168 else 3169 sk = bpf_iter_tcp_batch(seq); 3170 3171 ++*pos; 3172 /* Keeping st->last_pos consistent in tcp_iter_state.
3173 * bpf iter does not do lseek, so st->last_pos always equals to *pos. 3174 */ 3175 st->last_pos = *pos; 3176 return sk; 3177 } 3178 3179 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v) 3180 { 3181 struct bpf_iter_meta meta; 3182 struct bpf_prog *prog; 3183 struct sock *sk = v; 3184 uid_t uid; 3185 int ret; 3186 3187 if (v == SEQ_START_TOKEN) 3188 return 0; 3189 3190 if (sk_fullsock(sk)) 3191 lock_sock(sk); 3192 3193 if (unlikely(sk_unhashed(sk))) { 3194 ret = SEQ_SKIP; 3195 goto unlock; 3196 } 3197 3198 if (sk->sk_state == TCP_TIME_WAIT) { 3199 uid = 0; 3200 } else if (sk->sk_state == TCP_NEW_SYN_RECV) { 3201 const struct request_sock *req = v; 3202 3203 uid = from_kuid_munged(seq_user_ns(seq), 3204 sock_i_uid(req->rsk_listener)); 3205 } else { 3206 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); 3207 } 3208 3209 meta.seq = seq; 3210 prog = bpf_iter_get_info(&meta, false); 3211 ret = tcp_prog_seq_show(prog, &meta, v, uid); 3212 3213 unlock: 3214 if (sk_fullsock(sk)) 3215 release_sock(sk); 3216 return ret; 3217 3218 } 3219 3220 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v) 3221 { 3222 struct bpf_tcp_iter_state *iter = seq->private; 3223 struct bpf_iter_meta meta; 3224 struct bpf_prog *prog; 3225 3226 if (!v) { 3227 meta.seq = seq; 3228 prog = bpf_iter_get_info(&meta, true); 3229 if (prog) 3230 (void)tcp_prog_seq_show(prog, &meta, v, 0); 3231 } 3232 3233 if (iter->cur_sk < iter->end_sk) { 3234 bpf_iter_tcp_put_batch(iter); 3235 iter->st_bucket_done = false; 3236 } 3237 } 3238 3239 static const struct seq_operations bpf_iter_tcp_seq_ops = { 3240 .show = bpf_iter_tcp_seq_show, 3241 .start = bpf_iter_tcp_seq_start, 3242 .next = bpf_iter_tcp_seq_next, 3243 .stop = bpf_iter_tcp_seq_stop, 3244 }; 3245 #endif 3246 static unsigned short seq_file_family(const struct seq_file *seq) 3247 { 3248 const struct tcp_seq_afinfo *afinfo; 3249 3250 #ifdef CONFIG_BPF_SYSCALL 3251 /* Iterated from bpf_iter. Let the bpf prog to filter instead. */ 3252 if (seq->op == &bpf_iter_tcp_seq_ops) 3253 return AF_UNSPEC; 3254 #endif 3255 3256 /* Iterated from proc fs */ 3257 afinfo = pde_data(file_inode(seq->file)); 3258 return afinfo->family; 3259 } 3260 3261 static const struct seq_operations tcp4_seq_ops = { 3262 .show = tcp4_seq_show, 3263 .start = tcp_seq_start, 3264 .next = tcp_seq_next, 3265 .stop = tcp_seq_stop, 3266 }; 3267 3268 static struct tcp_seq_afinfo tcp4_seq_afinfo = { 3269 .family = AF_INET, 3270 }; 3271 3272 static int __net_init tcp4_proc_init_net(struct net *net) 3273 { 3274 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops, 3275 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo)) 3276 return -ENOMEM; 3277 return 0; 3278 } 3279 3280 static void __net_exit tcp4_proc_exit_net(struct net *net) 3281 { 3282 remove_proc_entry("tcp", net->proc_net); 3283 } 3284 3285 static struct pernet_operations tcp4_net_ops = { 3286 .init = tcp4_proc_init_net, 3287 .exit = tcp4_proc_exit_net, 3288 }; 3289 3290 int __init tcp4_proc_init(void) 3291 { 3292 return register_pernet_subsys(&tcp4_net_ops); 3293 } 3294 3295 void tcp4_proc_exit(void) 3296 { 3297 unregister_pernet_subsys(&tcp4_net_ops); 3298 } 3299 #endif /* CONFIG_PROC_FS */ 3300 3301 /* @wake is one when sk_stream_write_space() calls us. 3302 * This sends EPOLLOUT only if notsent_bytes is half the limit. 3303 * This mimics the strategy used in sock_def_write_space(). 
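 * With @wake == 1 the test below becomes (2 * notsent_bytes) < lowat,
 * i.e. writers are woken only once the unsent backlog drops below
 * half of tcp_notsent_lowat().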
3304 */ 3305 bool tcp_stream_memory_free(const struct sock *sk, int wake) 3306 { 3307 const struct tcp_sock *tp = tcp_sk(sk); 3308 u32 notsent_bytes = READ_ONCE(tp->write_seq) - 3309 READ_ONCE(tp->snd_nxt); 3310 3311 return (notsent_bytes << wake) < tcp_notsent_lowat(tp); 3312 } 3313 EXPORT_SYMBOL(tcp_stream_memory_free); 3314 3315 struct proto tcp_prot = { 3316 .name = "TCP", 3317 .owner = THIS_MODULE, 3318 .close = tcp_close, 3319 .pre_connect = tcp_v4_pre_connect, 3320 .connect = tcp_v4_connect, 3321 .disconnect = tcp_disconnect, 3322 .accept = inet_csk_accept, 3323 .ioctl = tcp_ioctl, 3324 .init = tcp_v4_init_sock, 3325 .destroy = tcp_v4_destroy_sock, 3326 .shutdown = tcp_shutdown, 3327 .setsockopt = tcp_setsockopt, 3328 .getsockopt = tcp_getsockopt, 3329 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt, 3330 .keepalive = tcp_set_keepalive, 3331 .recvmsg = tcp_recvmsg, 3332 .sendmsg = tcp_sendmsg, 3333 .splice_eof = tcp_splice_eof, 3334 .backlog_rcv = tcp_v4_do_rcv, 3335 .release_cb = tcp_release_cb, 3336 .hash = inet_hash, 3337 .unhash = inet_unhash, 3338 .get_port = inet_csk_get_port, 3339 .put_port = inet_put_port, 3340 #ifdef CONFIG_BPF_SYSCALL 3341 .psock_update_sk_prot = tcp_bpf_update_proto, 3342 #endif 3343 .enter_memory_pressure = tcp_enter_memory_pressure, 3344 .leave_memory_pressure = tcp_leave_memory_pressure, 3345 .stream_memory_free = tcp_stream_memory_free, 3346 .sockets_allocated = &tcp_sockets_allocated, 3347 .orphan_count = &tcp_orphan_count, 3348 3349 .memory_allocated = &tcp_memory_allocated, 3350 .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, 3351 3352 .memory_pressure = &tcp_memory_pressure, 3353 .sysctl_mem = sysctl_tcp_mem, 3354 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), 3355 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), 3356 .max_header = MAX_TCP_HEADER, 3357 .obj_size = sizeof(struct tcp_sock), 3358 .slab_flags = SLAB_TYPESAFE_BY_RCU, 3359 .twsk_prot = &tcp_timewait_sock_ops, 3360 .rsk_prot = &tcp_request_sock_ops, 3361 .h.hashinfo = NULL, 3362 .no_autobind = true, 3363 .diag_destroy = tcp_abort, 3364 }; 3365 EXPORT_SYMBOL(tcp_prot); 3366 3367 static void __net_exit tcp_sk_exit(struct net *net) 3368 { 3369 if (net->ipv4.tcp_congestion_control) 3370 bpf_module_put(net->ipv4.tcp_congestion_control, 3371 net->ipv4.tcp_congestion_control->owner); 3372 } 3373 3374 static void __net_init tcp_set_hashinfo(struct net *net) 3375 { 3376 struct inet_hashinfo *hinfo; 3377 unsigned int ehash_entries; 3378 struct net *old_net; 3379 3380 if (net_eq(net, &init_net)) 3381 goto fallback; 3382 3383 old_net = current->nsproxy->net_ns; 3384 ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries); 3385 if (!ehash_entries) 3386 goto fallback; 3387 3388 ehash_entries = roundup_pow_of_two(ehash_entries); 3389 hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries); 3390 if (!hinfo) { 3391 pr_warn("Failed to allocate TCP ehash (entries: %u) " 3392 "for a netns, fallback to the global one\n", 3393 ehash_entries); 3394 fallback: 3395 hinfo = &tcp_hashinfo; 3396 ehash_entries = tcp_hashinfo.ehash_mask + 1; 3397 } 3398 3399 net->ipv4.tcp_death_row.hashinfo = hinfo; 3400 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2; 3401 net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128); 3402 } 3403 3404 static int __net_init tcp_sk_init(struct net *net) 3405 { 3406 net->ipv4.sysctl_tcp_ecn = 2; 3407 net->ipv4.sysctl_tcp_ecn_fallback = 1; 3408 3409 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; 3410 
net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS; 3411 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; 3412 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL; 3413 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS; 3414 3415 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME; 3416 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES; 3417 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL; 3418 3419 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES; 3420 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES; 3421 net->ipv4.sysctl_tcp_syncookies = 1; 3422 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH; 3423 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1; 3424 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2; 3425 net->ipv4.sysctl_tcp_orphan_retries = 0; 3426 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; 3427 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; 3428 net->ipv4.sysctl_tcp_tw_reuse = 2; 3429 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1; 3430 3431 refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1); 3432 tcp_set_hashinfo(net); 3433 3434 net->ipv4.sysctl_tcp_sack = 1; 3435 net->ipv4.sysctl_tcp_window_scaling = 1; 3436 net->ipv4.sysctl_tcp_timestamps = 1; 3437 net->ipv4.sysctl_tcp_early_retrans = 3; 3438 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION; 3439 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */ 3440 net->ipv4.sysctl_tcp_retrans_collapse = 1; 3441 net->ipv4.sysctl_tcp_max_reordering = 300; 3442 net->ipv4.sysctl_tcp_dsack = 1; 3443 net->ipv4.sysctl_tcp_app_win = 31; 3444 net->ipv4.sysctl_tcp_adv_win_scale = 1; 3445 net->ipv4.sysctl_tcp_frto = 2; 3446 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1; 3447 /* This limits the percentage of the congestion window which we 3448 * will allow a single TSO frame to consume. Building TSO frames 3449 * which are too large can cause TCP streams to be bursty. 3450 */ 3451 net->ipv4.sysctl_tcp_tso_win_divisor = 3; 3452 /* Default TSQ limit of 16 TSO segments */ 3453 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536; 3454 3455 /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. 
*/ 3456 net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX; 3457 3458 net->ipv4.sysctl_tcp_min_tso_segs = 2; 3459 net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */ 3460 net->ipv4.sysctl_tcp_min_rtt_wlen = 300; 3461 net->ipv4.sysctl_tcp_autocorking = 1; 3462 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; 3463 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; 3464 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120; 3465 if (net != &init_net) { 3466 memcpy(net->ipv4.sysctl_tcp_rmem, 3467 init_net.ipv4.sysctl_tcp_rmem, 3468 sizeof(init_net.ipv4.sysctl_tcp_rmem)); 3469 memcpy(net->ipv4.sysctl_tcp_wmem, 3470 init_net.ipv4.sysctl_tcp_wmem, 3471 sizeof(init_net.ipv4.sysctl_tcp_wmem)); 3472 } 3473 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC; 3474 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC; 3475 net->ipv4.sysctl_tcp_comp_sack_nr = 44; 3476 net->ipv4.sysctl_tcp_backlog_ack_defer = 1; 3477 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; 3478 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0; 3479 atomic_set(&net->ipv4.tfo_active_disable_times, 0); 3480 3481 /* Set default values for PLB */ 3482 net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */ 3483 net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3; 3484 net->ipv4.sysctl_tcp_plb_rehash_rounds = 12; 3485 net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60; 3486 /* Default congestion threshold for PLB to mark a round is 50% */ 3487 net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2; 3488 3489 /* Reno is always built in */ 3490 if (!net_eq(net, &init_net) && 3491 bpf_try_module_get(init_net.ipv4.tcp_congestion_control, 3492 init_net.ipv4.tcp_congestion_control->owner)) 3493 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control; 3494 else 3495 net->ipv4.tcp_congestion_control = &tcp_reno; 3496 3497 net->ipv4.sysctl_tcp_syn_linear_timeouts = 4; 3498 net->ipv4.sysctl_tcp_shrink_window = 0; 3499 3500 net->ipv4.sysctl_tcp_pingpong_thresh = 1; 3501 net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN); 3502 3503 return 0; 3504 } 3505 3506 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) 3507 { 3508 struct net *net; 3509 3510 tcp_twsk_purge(net_exit_list); 3511 3512 list_for_each_entry(net, net_exit_list, exit_list) { 3513 inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo); 3514 WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount)); 3515 tcp_fastopen_ctx_destroy(net); 3516 } 3517 } 3518 3519 static struct pernet_operations __net_initdata tcp_sk_ops = { 3520 .init = tcp_sk_init, 3521 .exit = tcp_sk_exit, 3522 .exit_batch = tcp_sk_exit_batch, 3523 }; 3524 3525 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3526 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta, 3527 struct sock_common *sk_common, uid_t uid) 3528 3529 #define INIT_BATCH_SZ 16 3530 3531 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux) 3532 { 3533 struct bpf_tcp_iter_state *iter = priv_data; 3534 int err; 3535 3536 err = bpf_iter_init_seq_net(priv_data, aux); 3537 if (err) 3538 return err; 3539 3540 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ); 3541 if (err) { 3542 bpf_iter_fini_seq_net(priv_data); 3543 return err; 3544 } 3545 3546 return 0; 3547 } 3548 3549 static void bpf_iter_fini_tcp(void *priv_data) 3550 { 3551 struct bpf_tcp_iter_state *iter = priv_data; 3552 3553 bpf_iter_fini_seq_net(priv_data); 3554 kvfree(iter->batch); 3555 } 3556 3557 static const struct bpf_iter_seq_info tcp_seq_info = { 3558 .seq_ops = 
&bpf_iter_tcp_seq_ops, 3559 .init_seq_private = bpf_iter_init_tcp, 3560 .fini_seq_private = bpf_iter_fini_tcp, 3561 .seq_priv_size = sizeof(struct bpf_tcp_iter_state), 3562 }; 3563 3564 static const struct bpf_func_proto * 3565 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id, 3566 const struct bpf_prog *prog) 3567 { 3568 switch (func_id) { 3569 case BPF_FUNC_setsockopt: 3570 return &bpf_sk_setsockopt_proto; 3571 case BPF_FUNC_getsockopt: 3572 return &bpf_sk_getsockopt_proto; 3573 default: 3574 return NULL; 3575 } 3576 } 3577 3578 static struct bpf_iter_reg tcp_reg_info = { 3579 .target = "tcp", 3580 .ctx_arg_info_size = 1, 3581 .ctx_arg_info = { 3582 { offsetof(struct bpf_iter__tcp, sk_common), 3583 PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED }, 3584 }, 3585 .get_func_proto = bpf_iter_tcp_get_func_proto, 3586 .seq_info = &tcp_seq_info, 3587 }; 3588 3589 static void __init bpf_iter_register(void) 3590 { 3591 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON]; 3592 if (bpf_iter_reg_target(&tcp_reg_info)) 3593 pr_warn("Warning: could not register bpf iterator tcp\n"); 3594 } 3595 3596 #endif 3597 3598 void __init tcp_v4_init(void) 3599 { 3600 int cpu, res; 3601 3602 for_each_possible_cpu(cpu) { 3603 struct sock *sk; 3604 3605 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW, 3606 IPPROTO_TCP, &init_net); 3607 if (res) 3608 panic("Failed to create the TCP control socket.\n"); 3609 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); 3610 3611 /* Please enforce IP_DF and IPID==0 for RST and 3612 * ACK sent in SYN-RECV and TIME-WAIT state. 3613 */ 3614 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO; 3615 3616 sk->sk_clockid = CLOCK_MONOTONIC; 3617 3618 per_cpu(ipv4_tcp_sk, cpu) = sk; 3619 } 3620 if (register_pernet_subsys(&tcp_sk_ops)) 3621 panic("Failed to create the TCP control socket.\n"); 3622 3623 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3624 bpf_iter_register(); 3625 #endif 3626 } 3627