// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/sched.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>
#include <net/rstreason.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/inetdevice.h>
#include <linux/btf_ids.h>

#include <crypto/hash.h>
#include <linux/scatterlist.h>

#include <trace/events/tcp.h>

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = {
	.bh_lock = INIT_LOCAL_LOCK(bh_lock),
};

static u32 tcp_v4_init_seq(const struct sk_buff *skb)
{
	return secure_tcp_seq(ip_hdr(skb)->daddr,
			      ip_hdr(skb)->saddr,
			      tcp_hdr(skb)->dest,
			      tcp_hdr(skb)->source);
}

static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
{
	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
}

int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
	const struct inet_timewait_sock *tw = inet_twsk(sktw);
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);
	int ts_recent_stamp;

	if (reuse == 2) {
		/* Still does not detect *everything* that goes through
		 * lo, since we require a loopback src or dst address
		 * or direct binding to 'lo' interface.
		 */
		bool loopback = false;
		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
			loopback = true;
#if IS_ENABLED(CONFIG_IPV6)
		if (tw->tw_family == AF_INET6) {
			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
				loopback = true;
		} else
#endif
		{
			if (ipv4_is_loopback(tw->tw_daddr) ||
			    ipv4_is_loopback(tw->tw_rcv_saddr))
				loopback = true;
		}
		if (!loopback)
			reuse = 0;
	}

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp);
	if (ts_recent_stamp &&
	    (!twp || (reuse && time_after32(ktime_get_seconds(),
					    ts_recent_stamp)))) {
		/* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk
		 * and releasing the bucket lock.
		 */
		if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
			return 0;

		/* In case of repair and re-using TIME-WAIT sockets we still
		 * want to be sure that it is safe as above but honor the
		 * sequence numbers and time stamps set as part of the repair
		 * process.
		 *
		 * Without this check re-using a TIME-WAIT socket with TCP
		 * repair would accumulate a -1 on the repair assigned
		 * sequence number. The first time it is reused the sequence
		 * is -1, the second time -2, etc. This fixes that issue
		 * without appearing to create any others.
		 */
		if (likely(!tp->repair)) {
			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;

			if (!seq)
				seq = 1;
			WRITE_ONCE(tp->write_seq, seq);
			tp->rx_opt.ts_recent	   = READ_ONCE(tcptw->tw_ts_recent);
			tp->rx_opt.ts_recent_stamp = ts_recent_stamp;
		}

		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);

static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
			      int addr_len)
{
	/* This check is replicated from tcp_v4_connect() and intended to
	 * prevent BPF program called below from accessing bytes that are out
	 * of the bound specified by user in addr_len.
	 */
	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	sock_owned_by_me(sk);

	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
}

/* This will initiate an outgoing connection. */
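/* Roughly: resolve the route (honouring any SRR option), bind the source
 * address/port if needed via inet_hash_connect(), move to SYN-SENT, pick
 * the initial sequence number with secure_tcp_seq(), and either defer for
 * TCP Fast Open or transmit the SYN via tcp_connect().
 */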
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_timewait_death_row *tcp_death_row;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct ip_options_rcu *inet_opt;
	struct net *net = sock_net(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
			      orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

	if (!inet->inet_saddr) {
		err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET);
		if (err) {
			ip_rt_put(rt);
			return err;
		}
	} else {
		sk_rcv_saddr_set(sk, inet->inet_saddr);
	}

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			WRITE_ONCE(tp->write_seq, 0);
	}

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the socket
	 * lock, select a source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst);
	/* OK, now commit destination to socket. */
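	/* sk_setup_caps() below caches the route on the socket and derives
	 * sk->sk_route_caps (checksum/GSO offload capabilities) from the
	 * output device.
	 */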
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	rt = NULL;

	if (likely(!tp->repair)) {
		if (!tp->write_seq)
			WRITE_ONCE(tp->write_seq,
				   secure_tcp_seq(inet->inet_saddr,
						  inet->inet_daddr,
						  inet->inet_sport,
						  usin->sin_port));
		WRITE_ONCE(tp->tsoffset,
			   secure_tcp_ts_off(net, inet->inet_saddr,
					     inet->inet_daddr));
	}

	atomic_set(&inet->inet_id, get_random_u16());

	if (tcp_fastopen_defer_connect(sk, &err))
		return err;
	if (err)
		goto failure;

	err = tcp_connect(sk);

	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	inet_bhash2_reset_saddr(sk);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);

/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct dst_entry *dst;
	u32 mtu;

	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
		return;
	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to go wrong. Remember the soft error
	 * for the case where this connection is not able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);

static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}


/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);

	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	 */
	if (seq != tcp_rsk(req)->snt_isn) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
	} else if (abort) {
		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		tcp_listendrop(req->rsk_listener);
	}
	reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);

/* TCP-LD (RFC 6069) logic */
void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	s32 remaining;
	u32 delta_us;

	if (sock_owned_by_user(sk))
		return;

	if (seq != tp->snd_una || !icsk->icsk_retransmits ||
	    !icsk->icsk_backoff)
		return;

	skb = tcp_rtx_queue_head(sk);
	if (WARN_ON_ONCE(!skb))
		return;

	icsk->icsk_backoff--;
	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

	tcp_mstamp_refresh(tp);
	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);

	if (remaining > 0) {
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
					  remaining, TCP_RTO_MAX);
	} else {
		/* RTO revert clocked out retransmission.
		 * Will retransmit now.
		 */
		tcp_retransmit_timer(sk);
	}
}
EXPORT_SYMBOL(tcp_ld_RTO_revert);

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

int tcp_v4_err(struct sk_buff *skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct tcp_sock *tp;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct sock *sk;
	struct request_sock *fastopen;
	u32 seq, snd_una;
	int err;
	struct net *net = dev_net(skb->dev);

	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
				       iph->daddr, th->dest, iph->saddr,
				       ntohs(th->source), inet_iif(skb), 0);
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return -ENOENT;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		/* To increase the counter of ignored icmps for TCP-AO */
		tcp_ao_ignore_icmp(sk, AF_INET, type, code);
		inet_twsk_put(inet_twsk(sk));
		return 0;
	}
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
				     type == ICMP_TIME_EXCEEDED ||
				     (type == ICMP_DEST_UNREACH &&
				      (code == ICMP_NET_UNREACH ||
				       code == ICMP_HOST_UNREACH)));
		return 0;
	}

	if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) {
		sock_put(sk);
		return 0;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (static_branch_unlikely(&ip4_min_ttl)) {
		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
			goto out;
		}
	}

	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = rcu_dereference(tp->fastopen_rsk);
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		if (!sock_owned_by_user(sk))
			do_redirect(skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs sent out by Linux are always <576bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			WRITE_ONCE(tp->mtu_info, info);
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if this ICMP message allows revert of backoff.
		 * (see RFC 6069)
		 */
		if (!fastopen &&
		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
			tcp_ld_RTO_revert(sk, seq);
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket is
		 * already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);

		if (!sock_owned_by_user(sk))
			tcp_done_with_error(sk, err);
		else
			WRITE_ONCE(sk->sk_err_soft, err);
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note that in the modern internet, where routing is unreliable
	 * and broken firewalls sit in every dark corner sending random
	 * errors ordered by their masters, even these two messages have
	 * finally lost their original sense (even Linux sends invalid
	 * PORT_UNREACHs).
	 *
	 * Now we are in compliance with RFCs.
	 *				--ANK (980905)
	 */

	if (!sock_owned_by_user(sk) &&
	    inet_test_bit(RECVERR, sk)) {
		WRITE_ONCE(sk->sk_err, err);
		sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		WRITE_ONCE(sk->sk_err_soft, err);
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
	return 0;
}

void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
	skb->csum_start = skb_transport_header(skb) - skb->head;
	skb->csum_offset = offsetof(struct tcphdr, check);
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);

#define REPLY_OPTIONS_LEN	(MAX_TCP_OPTION_SPACE / sizeof(__be32))

static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb,
				 const struct tcp_ao_hdr *aoh,
				 struct ip_reply_arg *arg, struct tcphdr *reply,
				 __be32 reply_options[REPLY_OPTIONS_LEN])
{
#ifdef CONFIG_TCP_AO
	int sdif = tcp_v4_sdif(skb);
	int dif = inet_iif(skb);
	int l3index = sdif ? dif : 0;
	bool allocated_traffic_key;
	struct tcp_ao_key *key;
	char *traffic_key;
	bool drop = true;
	u32 ao_sne = 0;
	u8 keyid;

	rcu_read_lock();
	if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq),
				 &key, &traffic_key, &allocated_traffic_key,
				 &keyid, &ao_sne))
		goto out;

	reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) |
				 (aoh->rnext_keyid << 8) | keyid);
	arg->iov[0].iov_len += tcp_ao_len_aligned(key);
	reply->doff = arg->iov[0].iov_len / 4;

	if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1],
			    key, traffic_key,
			    (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
			    (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
			    reply, ao_sne))
		goto out;
	drop = false;
out:
	rcu_read_unlock();
	if (allocated_traffic_key)
		kfree(traffic_key);
	return drop;
#else
	return true;
#endif
}

/*
 * This routine will send an RST to the other tcp.
 *
 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		 for reset.
 * Answer: if a packet caused RST, it is not for a socket
 *	   existing in our system, if it is matched to a socket,
 *	   it is just duplicate segment or bug in other side's TCP.
 *	   So we build the reply based only on parameters
 *	   that arrived with the segment.
 * Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
			      enum sk_rst_reason reason)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[REPLY_OPTIONS_LEN];
	} rep;
	const __u8 *md5_hash_location = NULL;
	const struct tcp_ao_hdr *aoh;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	unsigned char newhash[16];
	struct sock *sk1 = NULL;
	int genhash;
#endif
	u64 transmit_time = 0;
	struct sock *ctl_sk;
	struct net *net;
	u32 txhash = 0;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
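	/* Without a socket, only answer segments that were actually addressed
	 * to this host (RTN_LOCAL); never reflect RSTs for forwarded traffic.
	 */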
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);

	/* Invalid TCP option size or twice included auth */
	if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh))
		return;

	if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt))
		return;

#ifdef CONFIG_TCP_MD5SIG
	rcu_read_lock();
	if (sk && sk_fullsock(sk)) {
		const union tcp_md5_addr *addr;
		int l3index;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and inet_iif is set to it.
		 */
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	} else if (md5_hash_location) {
		const union tcp_md5_addr *addr;
		int sdif = tcp_v4_sdif(skb);
		int dif = inet_iif(skb);
		int l3index;

		/*
		 * The active side is lost. Try to find the listening socket
		 * through the source port, and then find the md5 key through
		 * the listening socket. We do not loosen security here:
		 * the incoming packet is checked against the md5 hash with the
		 * key we find; no RST is generated if the md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
					     NULL, 0, ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), dif, sdif);
		/* don't send rst if it can't find key */
		if (!sk1)
			goto out;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and dif is set to it.
		 */
		l3index = sdif ? dif : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
		if (!key)
			goto out;


		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (genhash || memcmp(md5_hash_location, newhash, 16) != 0)
			goto out;

	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
	if (rep.opt[0] == 0) {
		__be32 mrst = mptcp_reset_option(skb);

		if (mrst) {
			rep.opt[0] = mrst;
			arg.iov[0].iov_len += sizeof(mrst);
			rep.th.doff = arg.iov[0].iov_len / 4;
		}
	}

	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
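	/* The RST is sent from the per-CPU control socket (ipv4_tcp_sk), so no
	 * reference on the (possibly absent) receiving socket is needed; mark,
	 * priority and txhash are borrowed from it below when it exists.
	 */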
	if (sk)
		arg.bound_dev_if = sk->sk_bound_dev_if;

	trace_tcp_send_reset(sk, skb, reason);

	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

	arg.tos = ip_hdr(skb)->tos;
	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);

	sock_net_set(ctl_sk, net);
	if (sk) {
		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_mark : sk->sk_mark;
		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
		transmit_time = tcp_transmit_time(sk);
		xfrm_sk_clone_policy(ctl_sk, sk);
		txhash = (sk->sk_state == TCP_TIME_WAIT) ?
			 inet_twsk(sk)->tw_txhash : sk->sk_txhash;
	} else {
		ctl_sk->sk_mark = 0;
		ctl_sk->sk_priority = 0;
	}
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time, txhash);

	xfrm_sk_free_policy(ctl_sk);
	sock_net_set(ctl_sk, &init_net);
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
	local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
	rcu_read_unlock();
#endif
}

/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
 */

static void tcp_v4_send_ack(const struct sock *sk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_key *key,
			    int reply_flags, u8 tos, u32 txhash)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(MAX_TCP_OPTION_SPACE >> 2)];
	} rep;
	struct net *net = sock_net(sk);
	struct ip_reply_arg arg;
	struct sock *ctl_sk;
	u64 transmit_time;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (tcp_key_is_md5(key)) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key->md5_key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
#ifdef CONFIG_TCP_AO
	if (tcp_key_is_ao(key)) {
		int offset = (tsecr) ? 3 : 0;
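		/* Build the TCP-AO option in place (RFC 5925): kind TCPOPT_AO,
		 * option length, our send key id (sndid), then the peer key id
		 * we expect next (rcv_next); the MAC itself is filled in by
		 * tcp_ao_hash_hdr() below.
		 */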

		rep.opt[offset++] = htonl((TCPOPT_AO << 24) |
					  (tcp_ao_len(key->ao_key) << 16) |
					  (key->ao_key->sndid << 8) |
					  key->rcv_next);
		arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key);
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset],
				key->ao_key, key->traffic_key,
				(union tcp_ao_addr *)&ip_hdr(skb)->saddr,
				(union tcp_ao_addr *)&ip_hdr(skb)->daddr,
				&rep.th, key->sne);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
	sock_net_set(ctl_sk, net);
	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
	transmit_time = tcp_transmit_time(sk);
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time, txhash);

	sock_net_set(ctl_sk, &init_net);
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
	local_bh_enable();
}

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
	struct tcp_key key = {};
#ifdef CONFIG_TCP_AO
	struct tcp_ao_info *ao_info;

	if (static_branch_unlikely(&tcp_ao_needed.key)) {
		/* FIXME: the segment to-be-acked is not verified yet */
		ao_info = rcu_dereference(tcptw->ao_info);
		if (ao_info) {
			const struct tcp_ao_hdr *aoh;

			if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) {
				inet_twsk_put(tw);
				return;
			}

			if (aoh)
				key.ao_key = tcp_ao_established_key(ao_info, aoh->rnext_keyid, -1);
		}
	}
	if (key.ao_key) {
		struct tcp_ao_key *rnext_key;

		key.traffic_key = snd_other_key(key.ao_key);
		key.sne = READ_ONCE(ao_info->snd_sne);
		rnext_key = READ_ONCE(ao_info->rnext_key);
		key.rcv_next = rnext_key->rcvid;
		key.type = TCP_KEY_AO;
#else
	if (0) {
#endif
	} else if (static_branch_tcp_md5()) {
		key.md5_key = tcp_twsk_md5_key(tcptw);
		if (key.md5_key)
			key.type = TCP_KEY_MD5;
	}

	tcp_v4_send_ack(sk, skb,
			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_tw_tsval(tcptw),
			READ_ONCE(tcptw->tw_ts_recent),
			tw->tw_bound_dev_if, &key,
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos,
			tw->tw_txhash);

	inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	struct tcp_key key = {};

	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
						 tcp_sk(sk)->snd_nxt;

#ifdef CONFIG_TCP_AO
	if (static_branch_unlikely(&tcp_ao_needed.key) &&
	    tcp_rsk_used_ao(req)) {
		const union tcp_md5_addr *addr;
		const struct tcp_ao_hdr *aoh;
		int l3index;

		/* Invalid TCP option size or twice included auth */
		if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
			return;
		if (!aoh)
			return;

		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET,
					      aoh->rnext_keyid, -1);
		if (unlikely(!key.ao_key)) {
			/* Send ACK with any matching MKT for the peer */
			key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1);
			/* Matching key disappeared (user removed the key?)
			 * let the handshake timeout.
			 */
			if (!key.ao_key) {
				net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n",
						     addr,
						     ntohs(tcp_hdr(skb)->source),
						     &ip_hdr(skb)->daddr,
						     ntohs(tcp_hdr(skb)->dest));
				return;
			}
		}
		key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC);
		if (!key.traffic_key)
			return;

		key.type = TCP_KEY_AO;
		key.rcv_next = aoh->keyid;
		tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req);
#else
	if (0) {
#endif
	} else if (static_branch_tcp_md5()) {
		const union tcp_md5_addr *addr;
		int l3index;

		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
		if (key.md5_key)
			key.type = TCP_KEY_MD5;
	}

	tcp_v4_send_ack(sk, skb, seq,
			tcp_rsk(req)->rcv_nxt,
			tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale,
			tcp_rsk_tsval(tcp_rsk(req)),
			READ_ONCE(req->ts_recent),
			0, &key,
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos,
			READ_ONCE(tcp_rsk(req)->txhash));
	if (tcp_key_is_ao(&key))
		kfree(key.traffic_key);
}

/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
			      struct flowi *fl,
			      struct request_sock *req,
			      struct tcp_fastopen_cookie *foc,
			      enum tcp_synack_type synack_type,
			      struct sk_buff *syn_skb)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;
	u8 tos;

	/* First, grab a route. */
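	/* If the caller did not pass a cached route, look one up now;
	 * tcp_make_synack() then builds the SYN-ACK, the checksum is primed
	 * over the request's addresses, the TOS may be reflected from the
	 * incoming SYN (sysctl_tcp_reflect_tos), and the packet goes out via
	 * ip_build_and_send_pkt().
	 */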
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

		tos = READ_ONCE(inet_sk(sk)->tos);

		if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
			tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
			      (tos & INET_ECN_MASK);

		if (!INET_ECN_is_capable(tos) &&
		    tcp_bpf_ca_needs_ecn((struct sock *)req))
			tos |= INET_ECN_ECT_0;

		rcu_read_lock();
		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
					    rcu_dereference(ireq->ireq_opt),
					    tos);
		rcu_read_unlock();
		err = net_xmit_eval(err);
	}

	return err;
}

/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
EXPORT_SYMBOL(tcp_md5_needed);

static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
{
	if (!old)
		return true;

	/* l3index always overrides non-l3index */
	if (old->l3index && new->l3index == 0)
		return false;
	if (old->l3index == 0 && new->l3index)
		return true;

	return old->prefixlen < new->prefixlen;
}

/* Find the Key structure for an address.  */
struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
					   const union tcp_md5_addr *addr,
					   int family, bool any_l3index)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	const struct tcp_md5sig_info *md5sig;
	__be32 mask;
	struct tcp_md5sig_key *best_match = NULL;
	bool match;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;

	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX &&
		    key->l3index != l3index)
			continue;
		if (family == AF_INET) {
			mask = inet_make_mask(key->prefixlen);
			match = (key->addr.a4.s_addr & mask) ==
				(addr->a4.s_addr & mask);
#if IS_ENABLED(CONFIG_IPV6)
		} else if (family == AF_INET6) {
			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
						  key->prefixlen);
#endif
		} else {
			match = false;
		}

		if (match && better_md5_match(best_match, key))
			best_match = key;
	}
	return best_match;
}
EXPORT_SYMBOL(__tcp_md5_do_lookup);

static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
						      const union tcp_md5_addr *addr,
						      int family, u8 prefixlen,
						      int l3index, u8 flags)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	const struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
			continue;
		if (key->l3index != l3index)
			continue;
		if (!memcmp(&key->addr, addr, size) &&
		    key->prefixlen == prefixlen)
			return key;
	}
	return NULL;
}

struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
					 const struct sock *addr_sk)
{
	const union tcp_md5_addr *addr;
	int l3index;

	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
						 addr_sk->sk_bound_dev_if);
	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	md5sig = kmalloc(sizeof(*md5sig), gfp);
	if (!md5sig)
		return -ENOMEM;

	sk_gso_disable(sk);
	INIT_HLIST_HEAD(&md5sig->head);
	rcu_assign_pointer(tp->md5sig_info, md5sig);
	return 0;
}

/* This can be called on a newly created socket, from other files */
static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
			    int family, u8 prefixlen, int l3index, u8 flags,
			    const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
	if (key) {
		/* Pre-existing entry - just update that one.
		 * Note that the key might be used concurrently.
		 * data_race() is telling kcsan that we do not care about
		 * key mismatches, since changing MD5 key on live flows
		 * can lead to packet drops.
		 */
		data_race(memcpy(key->key, newkey, newkeylen));

		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
		 * Also note that a reader could catch new key->keylen value
		 * but old key->key[], this is the reason we use __GFP_ZERO
		 * at sock_kmalloc() time below these lines.
		 */
		WRITE_ONCE(key->keylen, newkeylen);

		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   lockdep_sock_is_held(sk));

	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
	if (!key)
		return -ENOMEM;

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	key->prefixlen = prefixlen;
	key->l3index = l3index;
	key->flags = flags;
	memcpy(&key->addr, addr,
	       (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
								 sizeof(struct in_addr));
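	/* Publish the fully initialized key on the RCU list; lockless readers
	 * in __tcp_md5_do_lookup() may see it as soon as hlist_add_head_rcu()
	 * completes, so every field is set up first.
	 */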
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}

int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, u8 prefixlen, int l3index, u8 flags,
		   const u8 *newkey, u8 newkeylen)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
		if (tcp_md5_alloc_sigpool())
			return -ENOMEM;

		if (tcp_md5sig_info_add(sk, GFP_KERNEL)) {
			tcp_md5_release_sigpool();
			return -ENOMEM;
		}

		if (!static_branch_inc(&tcp_md5_needed.key)) {
			struct tcp_md5sig_info *md5sig;

			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
			rcu_assign_pointer(tp->md5sig_info, NULL);
			kfree_rcu(md5sig, rcu);
			tcp_md5_release_sigpool();
			return -EUSERS;
		}
	}

	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
				newkey, newkeylen, GFP_KERNEL);
}
EXPORT_SYMBOL(tcp_md5_do_add);

int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
		     int family, u8 prefixlen, int l3index,
		     struct tcp_md5sig_key *key)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
		tcp_md5_add_sigpool();

		if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC))) {
			tcp_md5_release_sigpool();
			return -ENOMEM;
		}

		if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
			struct tcp_md5sig_info *md5sig;

			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
			net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
			rcu_assign_pointer(tp->md5sig_info, NULL);
			kfree_rcu(md5sig, rcu);
			tcp_md5_release_sigpool();
			return -EUSERS;
		}
	}

	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
				key->flags, key->key, key->keylen,
				sk_gfp_mask(sk, GFP_ATOMIC));
}
EXPORT_SYMBOL(tcp_md5_key_copy);

int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
		   u8 prefixlen, int l3index, u8 flags)
{
	struct tcp_md5sig_key *key;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);

void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
	}
}

static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
				 sockptr_t optval, int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
	const union tcp_md5_addr *addr;
	u8 prefixlen = 32;
	int l3index = 0;
	bool l3flag;
	u8 flags;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;
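	/* TCP_MD5SIG_EXT carries extra tcpm_flags: the prefix length is only
	 * honoured together with TCP_MD5SIG_FLAG_PREFIX, and tcpm_ifindex only
	 * with TCP_MD5SIG_FLAG_IFINDEX (and it must then name an L3 master
	 * device).
	 */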

	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
	l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;

	if (optname == TCP_MD5SIG_EXT &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
		prefixlen = cmd.tcpm_prefixlen;
		if (prefixlen > 32)
			return -EINVAL;
	}

	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
		if (dev && netif_is_l3_master(dev))
			l3index = dev->ifindex;

		rcu_read_unlock();

		/* ok to reference set/not set outside of rcu;
		 * right now device MUST be an L3 master
		 */
		if (!dev || !l3index)
			return -EINVAL;
	}

	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;

	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	/* Don't allow keys for peers that have a matching TCP-AO key.
	 * See the comment in tcp_ao_add_cmd()
	 */
	if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false))
		return -EKEYREJECTED;

	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
			      cmd.tcpm_key, cmd.tcpm_keylen);
}

static int tcp_v4_md5_hash_headers(struct tcp_sigpool *hp,
				   __be32 daddr, __be32 saddr,
				   const struct tcphdr *th, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;
	struct tcphdr *_th;

	bp = hp->scratch;
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	_th = (struct tcphdr *)(bp + 1);
	memcpy(_th, th, sizeof(*th));
	_th->check = 0;

	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
	ahash_request_set_crypt(hp->req, &sg, NULL,
				sizeof(*bp) + sizeof(*th));
	return crypto_ahash_update(hp->req);
}

static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_sigpool hp;

	if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
		goto clear_hash_nostart;

	if (crypto_ahash_init(hp.req))
		goto clear_hash;
	if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(&hp, key))
		goto clear_hash;
	ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
	if (crypto_ahash_final(hp.req))
		goto clear_hash;

	tcp_sigpool_end(&hp);
	return 0;

clear_hash:
	tcp_sigpool_end(&hp);
clear_hash_nostart:
	memset(md5_hash, 0, 16);
	return 1;
}

int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
			const struct sock *sk,
			const struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct tcp_sigpool hp;
	__be32 saddr, daddr;

	if (sk) { /* valid for establish/request sockets */
		saddr = sk->sk_rcv_saddr;
		daddr = sk->sk_daddr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
		goto clear_hash_nostart;

	if (crypto_ahash_init(hp.req))
		goto clear_hash;

	if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, skb->len))
		goto clear_hash;
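	/* The digest covers the IPv4 pseudo-header and the TCP header with a
	 * zeroed checksum (hashed above), then the segment data, then the key
	 * itself, as required by RFC 2385.
	 */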
	if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(&hp, key))
		goto clear_hash;
	ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
	if (crypto_ahash_final(hp.req))
		goto clear_hash;

	tcp_sigpool_end(&hp);
	return 0;

clear_hash:
	tcp_sigpool_end(&hp);
clear_hash_nostart:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

#endif

static void tcp_v4_init_req(struct request_sock *req,
			    const struct sock *sk_listener,
			    struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	struct net *net = sock_net(sk_listener);

	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
}

static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
					  struct sk_buff *skb,
					  struct flowi *fl,
					  struct request_sock *req,
					  u32 tw_isn)
{
	tcp_v4_init_req(req, sk, skb);

	if (security_inet_conn_request(sk, skb, req))
		return NULL;

	return inet_csk_route_req(sk, &fl->u.ip4, req);
}

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};

const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.req_md5_lookup	=	tcp_v4_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#endif
#ifdef CONFIG_TCP_AO
	.ao_lookup	=	tcp_v4_ao_lookup_rsk,
	.ao_calc_key	=	tcp_v4_ao_calc_key_rsk,
	.ao_synack_hash	=	tcp_v4_ao_synack_hash,
#endif
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
#endif
	.route_req	=	tcp_v4_route_req,
	.init_seq	=	tcp_v4_init_seq,
	.init_ts_off	=	tcp_v4_init_ts_off,
	.send_synack	=	tcp_v4_send_synack,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&tcp_request_sock_ops,
				&tcp_request_sock_ipv4_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);


/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req)
{
	struct inet_request_sock *ireq;
	bool found_dup_sk = false;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	const union tcp_md5_addr *addr;
	struct tcp_md5sig_key *key;
	int l3index;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	sk_daddr_set(newsk, ireq->ir_rmt_addr);
	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
	newsk->sk_bound_dev_if = ireq->ir_iif;
	newinet->inet_saddr   = ireq->ir_loc_addr;
	inet_opt	      = rcu_dereference(ireq->ireq_opt);
	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	newinet->rcv_tos      = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	atomic_set(&newinet->inet_id, get_random_u16());

	/* Set ToS of the new socket based upon the value of incoming SYN.
	 * ECT bits are set later in tcp_init_transfer().
	 */
	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
	/* Copy over the MD5 key from the original socket */
	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	if (key && !tcp_rsk_used_ao(req)) {
		if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
			goto put_and_exit;
		sk_gso_disable(newsk);
	}
#endif
#ifdef CONFIG_TCP_AO
	if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET))
		goto put_and_exit; /* OOM, release back memory */
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
				       &found_dup_sk);
	if (likely(*own_req)) {
		tcp_move_syn(newtp, req);
		ireq->ireq_opt = NULL;
	} else {
		newinet->inet_opt = NULL;

		if (!req_unhash && found_dup_sk) {
			/* This code path should only be executed in the
			 * syncookie case
			 */
			bh_unlock_sock(newsk);
			sock_put(newsk);
			newsk = NULL;
		}
	}
	return newsk;

exit_overflow:
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	tcp_listendrop(sk);
	return NULL;
put_and_exit:
	newinet->inet_opt = NULL;
inet_csk_prepare_forced_close(newsk); 1847 tcp_done(newsk); 1848 goto exit; 1849 } 1850 EXPORT_SYMBOL(tcp_v4_syn_recv_sock); 1851 1852 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb) 1853 { 1854 #ifdef CONFIG_SYN_COOKIES 1855 const struct tcphdr *th = tcp_hdr(skb); 1856 1857 if (!th->syn) 1858 sk = cookie_v4_check(sk, skb); 1859 #endif 1860 return sk; 1861 } 1862 1863 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph, 1864 struct tcphdr *th, u32 *cookie) 1865 { 1866 u16 mss = 0; 1867 #ifdef CONFIG_SYN_COOKIES 1868 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops, 1869 &tcp_request_sock_ipv4_ops, sk, th); 1870 if (mss) { 1871 *cookie = __cookie_v4_init_sequence(iph, th, &mss); 1872 tcp_synq_overflow(sk); 1873 } 1874 #endif 1875 return mss; 1876 } 1877 1878 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, 1879 u32)); 1880 /* The socket must have it's spinlock held when we get 1881 * here, unless it is a TCP_LISTEN socket. 1882 * 1883 * We have a potential double-lock case here, so even when 1884 * doing backlog processing we use the BH locking scheme. 1885 * This is because we cannot sleep with the original spinlock 1886 * held. 1887 */ 1888 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) 1889 { 1890 enum skb_drop_reason reason; 1891 struct sock *rsk; 1892 1893 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ 1894 struct dst_entry *dst; 1895 1896 dst = rcu_dereference_protected(sk->sk_rx_dst, 1897 lockdep_sock_is_held(sk)); 1898 1899 sock_rps_save_rxhash(sk, skb); 1900 sk_mark_napi_id(sk, skb); 1901 if (dst) { 1902 if (sk->sk_rx_dst_ifindex != skb->skb_iif || 1903 !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check, 1904 dst, 0)) { 1905 RCU_INIT_POINTER(sk->sk_rx_dst, NULL); 1906 dst_release(dst); 1907 } 1908 } 1909 tcp_rcv_established(sk, skb); 1910 return 0; 1911 } 1912 1913 if (tcp_checksum_complete(skb)) 1914 goto csum_err; 1915 1916 if (sk->sk_state == TCP_LISTEN) { 1917 struct sock *nsk = tcp_v4_cookie_check(sk, skb); 1918 1919 if (!nsk) 1920 return 0; 1921 if (nsk != sk) { 1922 reason = tcp_child_process(sk, nsk, skb); 1923 if (reason) { 1924 rsk = nsk; 1925 goto reset; 1926 } 1927 return 0; 1928 } 1929 } else 1930 sock_rps_save_rxhash(sk, skb); 1931 1932 reason = tcp_rcv_state_process(sk, skb); 1933 if (reason) { 1934 rsk = sk; 1935 goto reset; 1936 } 1937 return 0; 1938 1939 reset: 1940 tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason)); 1941 discard: 1942 sk_skb_reason_drop(sk, skb, reason); 1943 /* Be careful here. If this function gets more complicated and 1944 * gcc suffers from register pressure on the x86, sk (in %ebx) 1945 * might be destroyed here. This current version compiles correctly, 1946 * but you have been warned. 
1947 */ 1948 return 0; 1949 1950 csum_err: 1951 reason = SKB_DROP_REASON_TCP_CSUM; 1952 trace_tcp_bad_csum(skb); 1953 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 1954 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 1955 goto discard; 1956 } 1957 EXPORT_SYMBOL(tcp_v4_do_rcv); 1958 1959 int tcp_v4_early_demux(struct sk_buff *skb) 1960 { 1961 struct net *net = dev_net(skb->dev); 1962 const struct iphdr *iph; 1963 const struct tcphdr *th; 1964 struct sock *sk; 1965 1966 if (skb->pkt_type != PACKET_HOST) 1967 return 0; 1968 1969 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) 1970 return 0; 1971 1972 iph = ip_hdr(skb); 1973 th = tcp_hdr(skb); 1974 1975 if (th->doff < sizeof(struct tcphdr) / 4) 1976 return 0; 1977 1978 sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, 1979 iph->saddr, th->source, 1980 iph->daddr, ntohs(th->dest), 1981 skb->skb_iif, inet_sdif(skb)); 1982 if (sk) { 1983 skb->sk = sk; 1984 skb->destructor = sock_edemux; 1985 if (sk_fullsock(sk)) { 1986 struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst); 1987 1988 if (dst) 1989 dst = dst_check(dst, 0); 1990 if (dst && 1991 sk->sk_rx_dst_ifindex == skb->skb_iif) 1992 skb_dst_set_noref(skb, dst); 1993 } 1994 } 1995 return 0; 1996 } 1997 1998 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, 1999 enum skb_drop_reason *reason) 2000 { 2001 u32 tail_gso_size, tail_gso_segs; 2002 struct skb_shared_info *shinfo; 2003 const struct tcphdr *th; 2004 struct tcphdr *thtail; 2005 struct sk_buff *tail; 2006 unsigned int hdrlen; 2007 bool fragstolen; 2008 u32 gso_segs; 2009 u32 gso_size; 2010 u64 limit; 2011 int delta; 2012 2013 /* In case all data was pulled from skb frags (in __pskb_pull_tail()), 2014 * we can fix skb->truesize to its real value to avoid future drops. 2015 * This is valid because skb is not yet charged to the socket. 2016 * It has been noticed pure SACK packets were sometimes dropped 2017 * (if cooked by drivers without copybreak feature). 2018 */ 2019 skb_condense(skb); 2020 2021 skb_dst_drop(skb); 2022 2023 if (unlikely(tcp_checksum_complete(skb))) { 2024 bh_unlock_sock(sk); 2025 trace_tcp_bad_csum(skb); 2026 *reason = SKB_DROP_REASON_TCP_CSUM; 2027 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 2028 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 2029 return true; 2030 } 2031 2032 /* Attempt coalescing to last skb in backlog, even if we are 2033 * above the limits. 2034 * This is okay because skb capacity is limited to MAX_SKB_FRAGS. 
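	 * (Coalescing is only attempted below when the new skb directly
	 *  follows the tail in sequence space, carries the same DSCP, has no
	 *  SYN/RST/URG, has ACK set like the tail, matches the tail's
	 *  ECE/CWR bits and TCP option bytes, and passes
	 *  tcp_skb_can_collapse_rx() -- see the checks that follow.)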
2035 */ 2036 th = (const struct tcphdr *)skb->data; 2037 hdrlen = th->doff * 4; 2038 2039 tail = sk->sk_backlog.tail; 2040 if (!tail) 2041 goto no_coalesce; 2042 thtail = (struct tcphdr *)tail->data; 2043 2044 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq || 2045 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield || 2046 ((TCP_SKB_CB(tail)->tcp_flags | 2047 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) || 2048 !((TCP_SKB_CB(tail)->tcp_flags & 2049 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) || 2050 ((TCP_SKB_CB(tail)->tcp_flags ^ 2051 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) || 2052 !tcp_skb_can_collapse_rx(tail, skb) || 2053 thtail->doff != th->doff || 2054 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th))) 2055 goto no_coalesce; 2056 2057 __skb_pull(skb, hdrlen); 2058 2059 shinfo = skb_shinfo(skb); 2060 gso_size = shinfo->gso_size ?: skb->len; 2061 gso_segs = shinfo->gso_segs ?: 1; 2062 2063 shinfo = skb_shinfo(tail); 2064 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen); 2065 tail_gso_segs = shinfo->gso_segs ?: 1; 2066 2067 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) { 2068 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq; 2069 2070 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) { 2071 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq; 2072 thtail->window = th->window; 2073 } 2074 2075 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and 2076 * thtail->fin, so that the fast path in tcp_rcv_established() 2077 * is not entered if we append a packet with a FIN. 2078 * SYN, RST, URG are not present. 2079 * ACK is set on both packets. 2080 * PSH : we do not really care in TCP stack, 2081 * at least for 'GRO' packets. 2082 */ 2083 thtail->fin |= th->fin; 2084 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; 2085 2086 if (TCP_SKB_CB(skb)->has_rxtstamp) { 2087 TCP_SKB_CB(tail)->has_rxtstamp = true; 2088 tail->tstamp = skb->tstamp; 2089 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp; 2090 } 2091 2092 /* Not as strict as GRO. We only need to carry mss max value */ 2093 shinfo->gso_size = max(gso_size, tail_gso_size); 2094 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF); 2095 2096 sk->sk_backlog.len += delta; 2097 __NET_INC_STATS(sock_net(sk), 2098 LINUX_MIB_TCPBACKLOGCOALESCE); 2099 kfree_skb_partial(skb, fragstolen); 2100 return false; 2101 } 2102 __skb_push(skb, hdrlen); 2103 2104 no_coalesce: 2105 /* sk->sk_backlog.len is reset only at the end of __release_sock(). 2106 * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach 2107 * sk_rcvbuf in normal conditions. 2108 */ 2109 limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1; 2110 2111 limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1; 2112 2113 /* Only socket owner can try to collapse/prune rx queues 2114 * to reduce memory overhead, so add a little headroom here. 2115 * Few sockets backlog are possibly concurrently non empty. 
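	 * (Putting the pieces together, the backlog cap computed here is
	 *  roughly 2 * sk_rcvbuf + sk_sndbuf / 2 + 64KB, clamped to UINT_MAX
	 *  by the min_t() below.)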
 */
	limit += 64 * 1024;

	limit = min_t(u64, limit, UINT_MAX);

	if (unlikely(sk_add_backlog(sk, skb, limit))) {
		bh_unlock_sock(sk);
		*reason = SKB_DROP_REASON_SOCKET_BACKLOG;
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
		return true;
	}
	return false;
}
EXPORT_SYMBOL(tcp_add_backlog);

int tcp_filter(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = (struct tcphdr *)skb->data;

	return sk_filter_trim_cap(sk, skb, th->doff * 4);
}
EXPORT_SYMBOL(tcp_filter);

static void tcp_v4_restore_cb(struct sk_buff *skb)
{
	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
		sizeof(struct inet_skb_parm));
}

static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
			   const struct tcphdr *th)
{
	/* This is tricky: we move IPCB to its correct location inside
	 * TCP_SKB_CB(). barrier() makes sure the compiler won't play
	 * fool^Waliasing games.
	 */
	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
		sizeof(struct inet_skb_parm));
	barrier();

	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
	TCP_SKB_CB(skb)->sacked = 0;
	TCP_SKB_CB(skb)->has_rxtstamp =
			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
}

/*
 *	From tcp_input.c
 */

int tcp_v4_rcv(struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);
	enum skb_drop_reason drop_reason;
	int sdif = inet_sdif(skb);
	int dif = inet_iif(skb);
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk = NULL;
	bool refcounted;
	int ret;
	u32 isn;

	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	__TCP_INC_STATS(net, TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = (const struct tcphdr *)skb->data;

	if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
		drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
		goto bad_packet;
	}
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks.
*/ 2206 2207 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo)) 2208 goto csum_error; 2209 2210 th = (const struct tcphdr *)skb->data; 2211 iph = ip_hdr(skb); 2212 lookup: 2213 sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo, 2214 skb, __tcp_hdrlen(th), th->source, 2215 th->dest, sdif, &refcounted); 2216 if (!sk) 2217 goto no_tcp_socket; 2218 2219 if (sk->sk_state == TCP_TIME_WAIT) 2220 goto do_time_wait; 2221 2222 if (sk->sk_state == TCP_NEW_SYN_RECV) { 2223 struct request_sock *req = inet_reqsk(sk); 2224 bool req_stolen = false; 2225 struct sock *nsk; 2226 2227 sk = req->rsk_listener; 2228 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) 2229 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2230 else 2231 drop_reason = tcp_inbound_hash(sk, req, skb, 2232 &iph->saddr, &iph->daddr, 2233 AF_INET, dif, sdif); 2234 if (unlikely(drop_reason)) { 2235 sk_drops_add(sk, skb); 2236 reqsk_put(req); 2237 goto discard_it; 2238 } 2239 if (tcp_checksum_complete(skb)) { 2240 reqsk_put(req); 2241 goto csum_error; 2242 } 2243 if (unlikely(sk->sk_state != TCP_LISTEN)) { 2244 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb); 2245 if (!nsk) { 2246 inet_csk_reqsk_queue_drop_and_put(sk, req); 2247 goto lookup; 2248 } 2249 sk = nsk; 2250 /* reuseport_migrate_sock() has already held one sk_refcnt 2251 * before returning. 2252 */ 2253 } else { 2254 /* We own a reference on the listener, increase it again 2255 * as we might lose it too soon. 2256 */ 2257 sock_hold(sk); 2258 } 2259 refcounted = true; 2260 nsk = NULL; 2261 if (!tcp_filter(sk, skb)) { 2262 th = (const struct tcphdr *)skb->data; 2263 iph = ip_hdr(skb); 2264 tcp_v4_fill_cb(skb, iph, th); 2265 nsk = tcp_check_req(sk, skb, req, false, &req_stolen); 2266 } else { 2267 drop_reason = SKB_DROP_REASON_SOCKET_FILTER; 2268 } 2269 if (!nsk) { 2270 reqsk_put(req); 2271 if (req_stolen) { 2272 /* Another cpu got exclusive access to req 2273 * and created a full blown socket. 2274 * Try to feed this packet to this socket 2275 * instead of discarding it. 
2276 */ 2277 tcp_v4_restore_cb(skb); 2278 sock_put(sk); 2279 goto lookup; 2280 } 2281 goto discard_and_relse; 2282 } 2283 nf_reset_ct(skb); 2284 if (nsk == sk) { 2285 reqsk_put(req); 2286 tcp_v4_restore_cb(skb); 2287 } else { 2288 drop_reason = tcp_child_process(sk, nsk, skb); 2289 if (drop_reason) { 2290 enum sk_rst_reason rst_reason; 2291 2292 rst_reason = sk_rst_convert_drop_reason(drop_reason); 2293 tcp_v4_send_reset(nsk, skb, rst_reason); 2294 goto discard_and_relse; 2295 } 2296 sock_put(sk); 2297 return 0; 2298 } 2299 } 2300 2301 process: 2302 if (static_branch_unlikely(&ip4_min_ttl)) { 2303 /* min_ttl can be changed concurrently from do_ip_setsockopt() */ 2304 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) { 2305 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 2306 drop_reason = SKB_DROP_REASON_TCP_MINTTL; 2307 goto discard_and_relse; 2308 } 2309 } 2310 2311 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { 2312 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2313 goto discard_and_relse; 2314 } 2315 2316 drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr, 2317 AF_INET, dif, sdif); 2318 if (drop_reason) 2319 goto discard_and_relse; 2320 2321 nf_reset_ct(skb); 2322 2323 if (tcp_filter(sk, skb)) { 2324 drop_reason = SKB_DROP_REASON_SOCKET_FILTER; 2325 goto discard_and_relse; 2326 } 2327 th = (const struct tcphdr *)skb->data; 2328 iph = ip_hdr(skb); 2329 tcp_v4_fill_cb(skb, iph, th); 2330 2331 skb->dev = NULL; 2332 2333 if (sk->sk_state == TCP_LISTEN) { 2334 ret = tcp_v4_do_rcv(sk, skb); 2335 goto put_and_return; 2336 } 2337 2338 sk_incoming_cpu_update(sk); 2339 2340 bh_lock_sock_nested(sk); 2341 tcp_segs_in(tcp_sk(sk), skb); 2342 ret = 0; 2343 if (!sock_owned_by_user(sk)) { 2344 ret = tcp_v4_do_rcv(sk, skb); 2345 } else { 2346 if (tcp_add_backlog(sk, skb, &drop_reason)) 2347 goto discard_and_relse; 2348 } 2349 bh_unlock_sock(sk); 2350 2351 put_and_return: 2352 if (refcounted) 2353 sock_put(sk); 2354 2355 return ret; 2356 2357 no_tcp_socket: 2358 drop_reason = SKB_DROP_REASON_NO_SOCKET; 2359 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) 2360 goto discard_it; 2361 2362 tcp_v4_fill_cb(skb, iph, th); 2363 2364 if (tcp_checksum_complete(skb)) { 2365 csum_error: 2366 drop_reason = SKB_DROP_REASON_TCP_CSUM; 2367 trace_tcp_bad_csum(skb); 2368 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); 2369 bad_packet: 2370 __TCP_INC_STATS(net, TCP_MIB_INERRS); 2371 } else { 2372 tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason)); 2373 } 2374 2375 discard_it: 2376 SKB_DR_OR(drop_reason, NOT_SPECIFIED); 2377 /* Discard frame. 
*/ 2378 sk_skb_reason_drop(sk, skb, drop_reason); 2379 return 0; 2380 2381 discard_and_relse: 2382 sk_drops_add(sk, skb); 2383 if (refcounted) 2384 sock_put(sk); 2385 goto discard_it; 2386 2387 do_time_wait: 2388 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { 2389 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2390 inet_twsk_put(inet_twsk(sk)); 2391 goto discard_it; 2392 } 2393 2394 tcp_v4_fill_cb(skb, iph, th); 2395 2396 if (tcp_checksum_complete(skb)) { 2397 inet_twsk_put(inet_twsk(sk)); 2398 goto csum_error; 2399 } 2400 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn)) { 2401 case TCP_TW_SYN: { 2402 struct sock *sk2 = inet_lookup_listener(net, 2403 net->ipv4.tcp_death_row.hashinfo, 2404 skb, __tcp_hdrlen(th), 2405 iph->saddr, th->source, 2406 iph->daddr, th->dest, 2407 inet_iif(skb), 2408 sdif); 2409 if (sk2) { 2410 inet_twsk_deschedule_put(inet_twsk(sk)); 2411 sk = sk2; 2412 tcp_v4_restore_cb(skb); 2413 refcounted = false; 2414 __this_cpu_write(tcp_tw_isn, isn); 2415 goto process; 2416 } 2417 } 2418 /* to ACK */ 2419 fallthrough; 2420 case TCP_TW_ACK: 2421 tcp_v4_timewait_ack(sk, skb); 2422 break; 2423 case TCP_TW_RST: 2424 tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET); 2425 inet_twsk_deschedule_put(inet_twsk(sk)); 2426 goto discard_it; 2427 case TCP_TW_SUCCESS:; 2428 } 2429 goto discard_it; 2430 } 2431 2432 static struct timewait_sock_ops tcp_timewait_sock_ops = { 2433 .twsk_obj_size = sizeof(struct tcp_timewait_sock), 2434 .twsk_destructor= tcp_twsk_destructor, 2435 }; 2436 2437 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) 2438 { 2439 struct dst_entry *dst = skb_dst(skb); 2440 2441 if (dst && dst_hold_safe(dst)) { 2442 rcu_assign_pointer(sk->sk_rx_dst, dst); 2443 sk->sk_rx_dst_ifindex = skb->skb_iif; 2444 } 2445 } 2446 EXPORT_SYMBOL(inet_sk_rx_dst_set); 2447 2448 const struct inet_connection_sock_af_ops ipv4_specific = { 2449 .queue_xmit = ip_queue_xmit, 2450 .send_check = tcp_v4_send_check, 2451 .rebuild_header = inet_sk_rebuild_header, 2452 .sk_rx_dst_set = inet_sk_rx_dst_set, 2453 .conn_request = tcp_v4_conn_request, 2454 .syn_recv_sock = tcp_v4_syn_recv_sock, 2455 .net_header_len = sizeof(struct iphdr), 2456 .setsockopt = ip_setsockopt, 2457 .getsockopt = ip_getsockopt, 2458 .addr2sockaddr = inet_csk_addr2sockaddr, 2459 .sockaddr_len = sizeof(struct sockaddr_in), 2460 .mtu_reduced = tcp_v4_mtu_reduced, 2461 }; 2462 EXPORT_SYMBOL(ipv4_specific); 2463 2464 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) 2465 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { 2466 #ifdef CONFIG_TCP_MD5SIG 2467 .md5_lookup = tcp_v4_md5_lookup, 2468 .calc_md5_hash = tcp_v4_md5_hash_skb, 2469 .md5_parse = tcp_v4_parse_md5_keys, 2470 #endif 2471 #ifdef CONFIG_TCP_AO 2472 .ao_lookup = tcp_v4_ao_lookup, 2473 .calc_ao_hash = tcp_v4_ao_hash_skb, 2474 .ao_parse = tcp_v4_parse_ao, 2475 .ao_calc_key_sk = tcp_v4_ao_calc_key_sk, 2476 #endif 2477 }; 2478 #endif 2479 2480 /* NOTE: A lot of things set to zero explicitly by call to 2481 * sk_alloc() so need not be done here. 
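 * tcp_v4_init_sock() below therefore only runs the generic tcp_init_sock()
 * and wires up the IPv4-specific icsk_af_ops (plus the MD5/AO af_specific
 * table when those options are configured in).  (Added note, based on the
 * code that follows.)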
2482 */ 2483 static int tcp_v4_init_sock(struct sock *sk) 2484 { 2485 struct inet_connection_sock *icsk = inet_csk(sk); 2486 2487 tcp_init_sock(sk); 2488 2489 icsk->icsk_af_ops = &ipv4_specific; 2490 2491 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) 2492 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific; 2493 #endif 2494 2495 return 0; 2496 } 2497 2498 #ifdef CONFIG_TCP_MD5SIG 2499 static void tcp_md5sig_info_free_rcu(struct rcu_head *head) 2500 { 2501 struct tcp_md5sig_info *md5sig; 2502 2503 md5sig = container_of(head, struct tcp_md5sig_info, rcu); 2504 kfree(md5sig); 2505 static_branch_slow_dec_deferred(&tcp_md5_needed); 2506 tcp_md5_release_sigpool(); 2507 } 2508 #endif 2509 2510 void tcp_v4_destroy_sock(struct sock *sk) 2511 { 2512 struct tcp_sock *tp = tcp_sk(sk); 2513 2514 trace_tcp_destroy_sock(sk); 2515 2516 tcp_clear_xmit_timers(sk); 2517 2518 tcp_cleanup_congestion_control(sk); 2519 2520 tcp_cleanup_ulp(sk); 2521 2522 /* Cleanup up the write buffer. */ 2523 tcp_write_queue_purge(sk); 2524 2525 /* Check if we want to disable active TFO */ 2526 tcp_fastopen_active_disable_ofo_check(sk); 2527 2528 /* Cleans up our, hopefully empty, out_of_order_queue. */ 2529 skb_rbtree_purge(&tp->out_of_order_queue); 2530 2531 #ifdef CONFIG_TCP_MD5SIG 2532 /* Clean up the MD5 key list, if any */ 2533 if (tp->md5sig_info) { 2534 struct tcp_md5sig_info *md5sig; 2535 2536 md5sig = rcu_dereference_protected(tp->md5sig_info, 1); 2537 tcp_clear_md5_list(sk); 2538 call_rcu(&md5sig->rcu, tcp_md5sig_info_free_rcu); 2539 rcu_assign_pointer(tp->md5sig_info, NULL); 2540 } 2541 #endif 2542 tcp_ao_destroy_sock(sk, false); 2543 2544 /* Clean up a referenced TCP bind bucket. */ 2545 if (inet_csk(sk)->icsk_bind_hash) 2546 inet_put_port(sk); 2547 2548 BUG_ON(rcu_access_pointer(tp->fastopen_rsk)); 2549 2550 /* If socket is aborted during connect operation */ 2551 tcp_free_fastopen_req(tp); 2552 tcp_fastopen_destroy_cipher(sk); 2553 tcp_saved_syn_free(tp); 2554 2555 sk_sockets_allocated_dec(sk); 2556 } 2557 EXPORT_SYMBOL(tcp_v4_destroy_sock); 2558 2559 #ifdef CONFIG_PROC_FS 2560 /* Proc filesystem TCP sock list dumping. */ 2561 2562 static unsigned short seq_file_family(const struct seq_file *seq); 2563 2564 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk) 2565 { 2566 unsigned short family = seq_file_family(seq); 2567 2568 /* AF_UNSPEC is used as a match all */ 2569 return ((family == AF_UNSPEC || family == sk->sk_family) && 2570 net_eq(sock_net(sk), seq_file_net(seq))); 2571 } 2572 2573 /* Find a non empty bucket (starting from st->bucket) 2574 * and return the first sk from it. 2575 */ 2576 static void *listening_get_first(struct seq_file *seq) 2577 { 2578 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2579 struct tcp_iter_state *st = seq->private; 2580 2581 st->offset = 0; 2582 for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) { 2583 struct inet_listen_hashbucket *ilb2; 2584 struct hlist_nulls_node *node; 2585 struct sock *sk; 2586 2587 ilb2 = &hinfo->lhash2[st->bucket]; 2588 if (hlist_nulls_empty(&ilb2->nulls_head)) 2589 continue; 2590 2591 spin_lock(&ilb2->lock); 2592 sk_nulls_for_each(sk, node, &ilb2->nulls_head) { 2593 if (seq_sk_match(seq, sk)) 2594 return sk; 2595 } 2596 spin_unlock(&ilb2->lock); 2597 } 2598 2599 return NULL; 2600 } 2601 2602 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket). 
2603 * If "cur" is the last one in the st->bucket, 2604 * call listening_get_first() to return the first sk of the next 2605 * non empty bucket. 2606 */ 2607 static void *listening_get_next(struct seq_file *seq, void *cur) 2608 { 2609 struct tcp_iter_state *st = seq->private; 2610 struct inet_listen_hashbucket *ilb2; 2611 struct hlist_nulls_node *node; 2612 struct inet_hashinfo *hinfo; 2613 struct sock *sk = cur; 2614 2615 ++st->num; 2616 ++st->offset; 2617 2618 sk = sk_nulls_next(sk); 2619 sk_nulls_for_each_from(sk, node) { 2620 if (seq_sk_match(seq, sk)) 2621 return sk; 2622 } 2623 2624 hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2625 ilb2 = &hinfo->lhash2[st->bucket]; 2626 spin_unlock(&ilb2->lock); 2627 ++st->bucket; 2628 return listening_get_first(seq); 2629 } 2630 2631 static void *listening_get_idx(struct seq_file *seq, loff_t *pos) 2632 { 2633 struct tcp_iter_state *st = seq->private; 2634 void *rc; 2635 2636 st->bucket = 0; 2637 st->offset = 0; 2638 rc = listening_get_first(seq); 2639 2640 while (rc && *pos) { 2641 rc = listening_get_next(seq, rc); 2642 --*pos; 2643 } 2644 return rc; 2645 } 2646 2647 static inline bool empty_bucket(struct inet_hashinfo *hinfo, 2648 const struct tcp_iter_state *st) 2649 { 2650 return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain); 2651 } 2652 2653 /* 2654 * Get first established socket starting from bucket given in st->bucket. 2655 * If st->bucket is zero, the very first socket in the hash is returned. 2656 */ 2657 static void *established_get_first(struct seq_file *seq) 2658 { 2659 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2660 struct tcp_iter_state *st = seq->private; 2661 2662 st->offset = 0; 2663 for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) { 2664 struct sock *sk; 2665 struct hlist_nulls_node *node; 2666 spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket); 2667 2668 cond_resched(); 2669 2670 /* Lockless fast path for the common case of empty buckets */ 2671 if (empty_bucket(hinfo, st)) 2672 continue; 2673 2674 spin_lock_bh(lock); 2675 sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) { 2676 if (seq_sk_match(seq, sk)) 2677 return sk; 2678 } 2679 spin_unlock_bh(lock); 2680 } 2681 2682 return NULL; 2683 } 2684 2685 static void *established_get_next(struct seq_file *seq, void *cur) 2686 { 2687 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2688 struct tcp_iter_state *st = seq->private; 2689 struct hlist_nulls_node *node; 2690 struct sock *sk = cur; 2691 2692 ++st->num; 2693 ++st->offset; 2694 2695 sk = sk_nulls_next(sk); 2696 2697 sk_nulls_for_each_from(sk, node) { 2698 if (seq_sk_match(seq, sk)) 2699 return sk; 2700 } 2701 2702 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 2703 ++st->bucket; 2704 return established_get_first(seq); 2705 } 2706 2707 static void *established_get_idx(struct seq_file *seq, loff_t pos) 2708 { 2709 struct tcp_iter_state *st = seq->private; 2710 void *rc; 2711 2712 st->bucket = 0; 2713 rc = established_get_first(seq); 2714 2715 while (rc && pos) { 2716 rc = established_get_next(seq, rc); 2717 --pos; 2718 } 2719 return rc; 2720 } 2721 2722 static void *tcp_get_idx(struct seq_file *seq, loff_t pos) 2723 { 2724 void *rc; 2725 struct tcp_iter_state *st = seq->private; 2726 2727 st->state = TCP_SEQ_STATE_LISTENING; 2728 rc = listening_get_idx(seq, &pos); 2729 2730 if (!rc) { 2731 st->state = TCP_SEQ_STATE_ESTABLISHED; 2732 rc = established_get_idx(seq, pos); 2733 } 2734 2735 return rc; 2736 } 2737 2738 static void 
*tcp_seek_last_pos(struct seq_file *seq) 2739 { 2740 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2741 struct tcp_iter_state *st = seq->private; 2742 int bucket = st->bucket; 2743 int offset = st->offset; 2744 int orig_num = st->num; 2745 void *rc = NULL; 2746 2747 switch (st->state) { 2748 case TCP_SEQ_STATE_LISTENING: 2749 if (st->bucket > hinfo->lhash2_mask) 2750 break; 2751 rc = listening_get_first(seq); 2752 while (offset-- && rc && bucket == st->bucket) 2753 rc = listening_get_next(seq, rc); 2754 if (rc) 2755 break; 2756 st->bucket = 0; 2757 st->state = TCP_SEQ_STATE_ESTABLISHED; 2758 fallthrough; 2759 case TCP_SEQ_STATE_ESTABLISHED: 2760 if (st->bucket > hinfo->ehash_mask) 2761 break; 2762 rc = established_get_first(seq); 2763 while (offset-- && rc && bucket == st->bucket) 2764 rc = established_get_next(seq, rc); 2765 } 2766 2767 st->num = orig_num; 2768 2769 return rc; 2770 } 2771 2772 void *tcp_seq_start(struct seq_file *seq, loff_t *pos) 2773 { 2774 struct tcp_iter_state *st = seq->private; 2775 void *rc; 2776 2777 if (*pos && *pos == st->last_pos) { 2778 rc = tcp_seek_last_pos(seq); 2779 if (rc) 2780 goto out; 2781 } 2782 2783 st->state = TCP_SEQ_STATE_LISTENING; 2784 st->num = 0; 2785 st->bucket = 0; 2786 st->offset = 0; 2787 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; 2788 2789 out: 2790 st->last_pos = *pos; 2791 return rc; 2792 } 2793 EXPORT_SYMBOL(tcp_seq_start); 2794 2795 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2796 { 2797 struct tcp_iter_state *st = seq->private; 2798 void *rc = NULL; 2799 2800 if (v == SEQ_START_TOKEN) { 2801 rc = tcp_get_idx(seq, 0); 2802 goto out; 2803 } 2804 2805 switch (st->state) { 2806 case TCP_SEQ_STATE_LISTENING: 2807 rc = listening_get_next(seq, v); 2808 if (!rc) { 2809 st->state = TCP_SEQ_STATE_ESTABLISHED; 2810 st->bucket = 0; 2811 st->offset = 0; 2812 rc = established_get_first(seq); 2813 } 2814 break; 2815 case TCP_SEQ_STATE_ESTABLISHED: 2816 rc = established_get_next(seq, v); 2817 break; 2818 } 2819 out: 2820 ++*pos; 2821 st->last_pos = *pos; 2822 return rc; 2823 } 2824 EXPORT_SYMBOL(tcp_seq_next); 2825 2826 void tcp_seq_stop(struct seq_file *seq, void *v) 2827 { 2828 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2829 struct tcp_iter_state *st = seq->private; 2830 2831 switch (st->state) { 2832 case TCP_SEQ_STATE_LISTENING: 2833 if (v != SEQ_START_TOKEN) 2834 spin_unlock(&hinfo->lhash2[st->bucket].lock); 2835 break; 2836 case TCP_SEQ_STATE_ESTABLISHED: 2837 if (v) 2838 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 2839 break; 2840 } 2841 } 2842 EXPORT_SYMBOL(tcp_seq_stop); 2843 2844 static void get_openreq4(const struct request_sock *req, 2845 struct seq_file *f, int i) 2846 { 2847 const struct inet_request_sock *ireq = inet_rsk(req); 2848 long delta = req->rsk_timer.expires - jiffies; 2849 2850 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2851 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK", 2852 i, 2853 ireq->ir_loc_addr, 2854 ireq->ir_num, 2855 ireq->ir_rmt_addr, 2856 ntohs(ireq->ir_rmt_port), 2857 TCP_SYN_RECV, 2858 0, 0, /* could print option size, but that is af dependent. 
*/ 2859 1, /* timers active (only the expire timer) */ 2860 jiffies_delta_to_clock_t(delta), 2861 req->num_timeout, 2862 from_kuid_munged(seq_user_ns(f), 2863 sock_i_uid(req->rsk_listener)), 2864 0, /* non standard timer */ 2865 0, /* open_requests have no inode */ 2866 0, 2867 req); 2868 } 2869 2870 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) 2871 { 2872 int timer_active; 2873 unsigned long timer_expires; 2874 const struct tcp_sock *tp = tcp_sk(sk); 2875 const struct inet_connection_sock *icsk = inet_csk(sk); 2876 const struct inet_sock *inet = inet_sk(sk); 2877 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; 2878 __be32 dest = inet->inet_daddr; 2879 __be32 src = inet->inet_rcv_saddr; 2880 __u16 destp = ntohs(inet->inet_dport); 2881 __u16 srcp = ntohs(inet->inet_sport); 2882 int rx_queue; 2883 int state; 2884 2885 if (icsk->icsk_pending == ICSK_TIME_RETRANS || 2886 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || 2887 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { 2888 timer_active = 1; 2889 timer_expires = icsk->icsk_timeout; 2890 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { 2891 timer_active = 4; 2892 timer_expires = icsk->icsk_timeout; 2893 } else if (timer_pending(&sk->sk_timer)) { 2894 timer_active = 2; 2895 timer_expires = sk->sk_timer.expires; 2896 } else { 2897 timer_active = 0; 2898 timer_expires = jiffies; 2899 } 2900 2901 state = inet_sk_state_load(sk); 2902 if (state == TCP_LISTEN) 2903 rx_queue = READ_ONCE(sk->sk_ack_backlog); 2904 else 2905 /* Because we don't lock the socket, 2906 * we might find a transient negative value. 2907 */ 2908 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) - 2909 READ_ONCE(tp->copied_seq), 0); 2910 2911 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " 2912 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d", 2913 i, src, srcp, dest, destp, state, 2914 READ_ONCE(tp->write_seq) - tp->snd_una, 2915 rx_queue, 2916 timer_active, 2917 jiffies_delta_to_clock_t(timer_expires - jiffies), 2918 icsk->icsk_retransmits, 2919 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)), 2920 icsk->icsk_probes_out, 2921 sock_i_ino(sk), 2922 refcount_read(&sk->sk_refcnt), sk, 2923 jiffies_to_clock_t(icsk->icsk_rto), 2924 jiffies_to_clock_t(icsk->icsk_ack.ato), 2925 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk), 2926 tcp_snd_cwnd(tp), 2927 state == TCP_LISTEN ? 2928 fastopenq->max_qlen : 2929 (tcp_in_initial_slowstart(tp) ? 
-1 : tp->snd_ssthresh)); 2930 } 2931 2932 static void get_timewait4_sock(const struct inet_timewait_sock *tw, 2933 struct seq_file *f, int i) 2934 { 2935 long delta = tw->tw_timer.expires - jiffies; 2936 __be32 dest, src; 2937 __u16 destp, srcp; 2938 2939 dest = tw->tw_daddr; 2940 src = tw->tw_rcv_saddr; 2941 destp = ntohs(tw->tw_dport); 2942 srcp = ntohs(tw->tw_sport); 2943 2944 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2945 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK", 2946 i, src, srcp, dest, destp, tw->tw_substate, 0, 0, 2947 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, 2948 refcount_read(&tw->tw_refcnt), tw); 2949 } 2950 2951 #define TMPSZ 150 2952 2953 static int tcp4_seq_show(struct seq_file *seq, void *v) 2954 { 2955 struct tcp_iter_state *st; 2956 struct sock *sk = v; 2957 2958 seq_setwidth(seq, TMPSZ - 1); 2959 if (v == SEQ_START_TOKEN) { 2960 seq_puts(seq, " sl local_address rem_address st tx_queue " 2961 "rx_queue tr tm->when retrnsmt uid timeout " 2962 "inode"); 2963 goto out; 2964 } 2965 st = seq->private; 2966 2967 if (sk->sk_state == TCP_TIME_WAIT) 2968 get_timewait4_sock(v, seq, st->num); 2969 else if (sk->sk_state == TCP_NEW_SYN_RECV) 2970 get_openreq4(v, seq, st->num); 2971 else 2972 get_tcp4_sock(v, seq, st->num); 2973 out: 2974 seq_pad(seq, '\n'); 2975 return 0; 2976 } 2977 2978 #ifdef CONFIG_BPF_SYSCALL 2979 struct bpf_tcp_iter_state { 2980 struct tcp_iter_state state; 2981 unsigned int cur_sk; 2982 unsigned int end_sk; 2983 unsigned int max_sk; 2984 struct sock **batch; 2985 bool st_bucket_done; 2986 }; 2987 2988 struct bpf_iter__tcp { 2989 __bpf_md_ptr(struct bpf_iter_meta *, meta); 2990 __bpf_md_ptr(struct sock_common *, sk_common); 2991 uid_t uid __aligned(8); 2992 }; 2993 2994 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, 2995 struct sock_common *sk_common, uid_t uid) 2996 { 2997 struct bpf_iter__tcp ctx; 2998 2999 meta->seq_num--; /* skip SEQ_START_TOKEN */ 3000 ctx.meta = meta; 3001 ctx.sk_common = sk_common; 3002 ctx.uid = uid; 3003 return bpf_iter_run_prog(prog, &ctx); 3004 } 3005 3006 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter) 3007 { 3008 while (iter->cur_sk < iter->end_sk) 3009 sock_gen_put(iter->batch[iter->cur_sk++]); 3010 } 3011 3012 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter, 3013 unsigned int new_batch_sz) 3014 { 3015 struct sock **new_batch; 3016 3017 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, 3018 GFP_USER | __GFP_NOWARN); 3019 if (!new_batch) 3020 return -ENOMEM; 3021 3022 bpf_iter_tcp_put_batch(iter); 3023 kvfree(iter->batch); 3024 iter->batch = new_batch; 3025 iter->max_sk = new_batch_sz; 3026 3027 return 0; 3028 } 3029 3030 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq, 3031 struct sock *start_sk) 3032 { 3033 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 3034 struct bpf_tcp_iter_state *iter = seq->private; 3035 struct tcp_iter_state *st = &iter->state; 3036 struct hlist_nulls_node *node; 3037 unsigned int expected = 1; 3038 struct sock *sk; 3039 3040 sock_hold(start_sk); 3041 iter->batch[iter->end_sk++] = start_sk; 3042 3043 sk = sk_nulls_next(start_sk); 3044 sk_nulls_for_each_from(sk, node) { 3045 if (seq_sk_match(seq, sk)) { 3046 if (iter->end_sk < iter->max_sk) { 3047 sock_hold(sk); 3048 iter->batch[iter->end_sk++] = sk; 3049 } 3050 expected++; 3051 } 3052 } 3053 spin_unlock(&hinfo->lhash2[st->bucket].lock); 3054 3055 return expected; 3056 } 3057 3058 static unsigned int 
bpf_iter_tcp_established_batch(struct seq_file *seq, 3059 struct sock *start_sk) 3060 { 3061 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 3062 struct bpf_tcp_iter_state *iter = seq->private; 3063 struct tcp_iter_state *st = &iter->state; 3064 struct hlist_nulls_node *node; 3065 unsigned int expected = 1; 3066 struct sock *sk; 3067 3068 sock_hold(start_sk); 3069 iter->batch[iter->end_sk++] = start_sk; 3070 3071 sk = sk_nulls_next(start_sk); 3072 sk_nulls_for_each_from(sk, node) { 3073 if (seq_sk_match(seq, sk)) { 3074 if (iter->end_sk < iter->max_sk) { 3075 sock_hold(sk); 3076 iter->batch[iter->end_sk++] = sk; 3077 } 3078 expected++; 3079 } 3080 } 3081 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 3082 3083 return expected; 3084 } 3085 3086 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq) 3087 { 3088 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 3089 struct bpf_tcp_iter_state *iter = seq->private; 3090 struct tcp_iter_state *st = &iter->state; 3091 unsigned int expected; 3092 bool resized = false; 3093 struct sock *sk; 3094 3095 /* The st->bucket is done. Directly advance to the next 3096 * bucket instead of having the tcp_seek_last_pos() to skip 3097 * one by one in the current bucket and eventually find out 3098 * it has to advance to the next bucket. 3099 */ 3100 if (iter->st_bucket_done) { 3101 st->offset = 0; 3102 st->bucket++; 3103 if (st->state == TCP_SEQ_STATE_LISTENING && 3104 st->bucket > hinfo->lhash2_mask) { 3105 st->state = TCP_SEQ_STATE_ESTABLISHED; 3106 st->bucket = 0; 3107 } 3108 } 3109 3110 again: 3111 /* Get a new batch */ 3112 iter->cur_sk = 0; 3113 iter->end_sk = 0; 3114 iter->st_bucket_done = false; 3115 3116 sk = tcp_seek_last_pos(seq); 3117 if (!sk) 3118 return NULL; /* Done */ 3119 3120 if (st->state == TCP_SEQ_STATE_LISTENING) 3121 expected = bpf_iter_tcp_listening_batch(seq, sk); 3122 else 3123 expected = bpf_iter_tcp_established_batch(seq, sk); 3124 3125 if (iter->end_sk == expected) { 3126 iter->st_bucket_done = true; 3127 return sk; 3128 } 3129 3130 if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) { 3131 resized = true; 3132 goto again; 3133 } 3134 3135 return sk; 3136 } 3137 3138 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos) 3139 { 3140 /* bpf iter does not support lseek, so it always 3141 * continue from where it was stop()-ped. 3142 */ 3143 if (*pos) 3144 return bpf_iter_tcp_batch(seq); 3145 3146 return SEQ_START_TOKEN; 3147 } 3148 3149 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3150 { 3151 struct bpf_tcp_iter_state *iter = seq->private; 3152 struct tcp_iter_state *st = &iter->state; 3153 struct sock *sk; 3154 3155 /* Whenever seq_next() is called, the iter->cur_sk is 3156 * done with seq_show(), so advance to the next sk in 3157 * the batch. 3158 */ 3159 if (iter->cur_sk < iter->end_sk) { 3160 /* Keeping st->num consistent in tcp_iter_state. 3161 * bpf_iter_tcp does not use st->num. 3162 * meta.seq_num is used instead. 3163 */ 3164 st->num++; 3165 /* Move st->offset to the next sk in the bucket such that 3166 * the future start() will resume at st->offset in 3167 * st->bucket. See tcp_seek_last_pos(). 3168 */ 3169 st->offset++; 3170 sock_gen_put(iter->batch[iter->cur_sk++]); 3171 } 3172 3173 if (iter->cur_sk < iter->end_sk) 3174 sk = iter->batch[iter->cur_sk]; 3175 else 3176 sk = bpf_iter_tcp_batch(seq); 3177 3178 ++*pos; 3179 /* Keeping st->last_pos consistent in tcp_iter_state. 
3180 * bpf iter does not do lseek, so st->last_pos always equals to *pos. 3181 */ 3182 st->last_pos = *pos; 3183 return sk; 3184 } 3185 3186 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v) 3187 { 3188 struct bpf_iter_meta meta; 3189 struct bpf_prog *prog; 3190 struct sock *sk = v; 3191 uid_t uid; 3192 int ret; 3193 3194 if (v == SEQ_START_TOKEN) 3195 return 0; 3196 3197 if (sk_fullsock(sk)) 3198 lock_sock(sk); 3199 3200 if (unlikely(sk_unhashed(sk))) { 3201 ret = SEQ_SKIP; 3202 goto unlock; 3203 } 3204 3205 if (sk->sk_state == TCP_TIME_WAIT) { 3206 uid = 0; 3207 } else if (sk->sk_state == TCP_NEW_SYN_RECV) { 3208 const struct request_sock *req = v; 3209 3210 uid = from_kuid_munged(seq_user_ns(seq), 3211 sock_i_uid(req->rsk_listener)); 3212 } else { 3213 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); 3214 } 3215 3216 meta.seq = seq; 3217 prog = bpf_iter_get_info(&meta, false); 3218 ret = tcp_prog_seq_show(prog, &meta, v, uid); 3219 3220 unlock: 3221 if (sk_fullsock(sk)) 3222 release_sock(sk); 3223 return ret; 3224 3225 } 3226 3227 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v) 3228 { 3229 struct bpf_tcp_iter_state *iter = seq->private; 3230 struct bpf_iter_meta meta; 3231 struct bpf_prog *prog; 3232 3233 if (!v) { 3234 meta.seq = seq; 3235 prog = bpf_iter_get_info(&meta, true); 3236 if (prog) 3237 (void)tcp_prog_seq_show(prog, &meta, v, 0); 3238 } 3239 3240 if (iter->cur_sk < iter->end_sk) { 3241 bpf_iter_tcp_put_batch(iter); 3242 iter->st_bucket_done = false; 3243 } 3244 } 3245 3246 static const struct seq_operations bpf_iter_tcp_seq_ops = { 3247 .show = bpf_iter_tcp_seq_show, 3248 .start = bpf_iter_tcp_seq_start, 3249 .next = bpf_iter_tcp_seq_next, 3250 .stop = bpf_iter_tcp_seq_stop, 3251 }; 3252 #endif 3253 static unsigned short seq_file_family(const struct seq_file *seq) 3254 { 3255 const struct tcp_seq_afinfo *afinfo; 3256 3257 #ifdef CONFIG_BPF_SYSCALL 3258 /* Iterated from bpf_iter. Let the bpf prog to filter instead. */ 3259 if (seq->op == &bpf_iter_tcp_seq_ops) 3260 return AF_UNSPEC; 3261 #endif 3262 3263 /* Iterated from proc fs */ 3264 afinfo = pde_data(file_inode(seq->file)); 3265 return afinfo->family; 3266 } 3267 3268 static const struct seq_operations tcp4_seq_ops = { 3269 .show = tcp4_seq_show, 3270 .start = tcp_seq_start, 3271 .next = tcp_seq_next, 3272 .stop = tcp_seq_stop, 3273 }; 3274 3275 static struct tcp_seq_afinfo tcp4_seq_afinfo = { 3276 .family = AF_INET, 3277 }; 3278 3279 static int __net_init tcp4_proc_init_net(struct net *net) 3280 { 3281 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops, 3282 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo)) 3283 return -ENOMEM; 3284 return 0; 3285 } 3286 3287 static void __net_exit tcp4_proc_exit_net(struct net *net) 3288 { 3289 remove_proc_entry("tcp", net->proc_net); 3290 } 3291 3292 static struct pernet_operations tcp4_net_ops = { 3293 .init = tcp4_proc_init_net, 3294 .exit = tcp4_proc_exit_net, 3295 }; 3296 3297 int __init tcp4_proc_init(void) 3298 { 3299 return register_pernet_subsys(&tcp4_net_ops); 3300 } 3301 3302 void tcp4_proc_exit(void) 3303 { 3304 unregister_pernet_subsys(&tcp4_net_ops); 3305 } 3306 #endif /* CONFIG_PROC_FS */ 3307 3308 /* @wake is one when sk_stream_write_space() calls us. 3309 * This sends EPOLLOUT only if notsent_bytes is half the limit. 3310 * This mimics the strategy used in sock_def_write_space(). 
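 * In other words, the stream is reported writeable when
 * (tp->write_seq - tp->snd_nxt) << wake is below tcp_notsent_lowat(tp),
 * i.e. with @wake == 1 only once the notsent bytes drop under half the limit.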
3311 */ 3312 bool tcp_stream_memory_free(const struct sock *sk, int wake) 3313 { 3314 const struct tcp_sock *tp = tcp_sk(sk); 3315 u32 notsent_bytes = READ_ONCE(tp->write_seq) - 3316 READ_ONCE(tp->snd_nxt); 3317 3318 return (notsent_bytes << wake) < tcp_notsent_lowat(tp); 3319 } 3320 EXPORT_SYMBOL(tcp_stream_memory_free); 3321 3322 struct proto tcp_prot = { 3323 .name = "TCP", 3324 .owner = THIS_MODULE, 3325 .close = tcp_close, 3326 .pre_connect = tcp_v4_pre_connect, 3327 .connect = tcp_v4_connect, 3328 .disconnect = tcp_disconnect, 3329 .accept = inet_csk_accept, 3330 .ioctl = tcp_ioctl, 3331 .init = tcp_v4_init_sock, 3332 .destroy = tcp_v4_destroy_sock, 3333 .shutdown = tcp_shutdown, 3334 .setsockopt = tcp_setsockopt, 3335 .getsockopt = tcp_getsockopt, 3336 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt, 3337 .keepalive = tcp_set_keepalive, 3338 .recvmsg = tcp_recvmsg, 3339 .sendmsg = tcp_sendmsg, 3340 .splice_eof = tcp_splice_eof, 3341 .backlog_rcv = tcp_v4_do_rcv, 3342 .release_cb = tcp_release_cb, 3343 .hash = inet_hash, 3344 .unhash = inet_unhash, 3345 .get_port = inet_csk_get_port, 3346 .put_port = inet_put_port, 3347 #ifdef CONFIG_BPF_SYSCALL 3348 .psock_update_sk_prot = tcp_bpf_update_proto, 3349 #endif 3350 .enter_memory_pressure = tcp_enter_memory_pressure, 3351 .leave_memory_pressure = tcp_leave_memory_pressure, 3352 .stream_memory_free = tcp_stream_memory_free, 3353 .sockets_allocated = &tcp_sockets_allocated, 3354 .orphan_count = &tcp_orphan_count, 3355 3356 .memory_allocated = &tcp_memory_allocated, 3357 .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, 3358 3359 .memory_pressure = &tcp_memory_pressure, 3360 .sysctl_mem = sysctl_tcp_mem, 3361 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), 3362 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), 3363 .max_header = MAX_TCP_HEADER, 3364 .obj_size = sizeof(struct tcp_sock), 3365 .slab_flags = SLAB_TYPESAFE_BY_RCU, 3366 .twsk_prot = &tcp_timewait_sock_ops, 3367 .rsk_prot = &tcp_request_sock_ops, 3368 .h.hashinfo = NULL, 3369 .no_autobind = true, 3370 .diag_destroy = tcp_abort, 3371 }; 3372 EXPORT_SYMBOL(tcp_prot); 3373 3374 static void __net_exit tcp_sk_exit(struct net *net) 3375 { 3376 if (net->ipv4.tcp_congestion_control) 3377 bpf_module_put(net->ipv4.tcp_congestion_control, 3378 net->ipv4.tcp_congestion_control->owner); 3379 } 3380 3381 static void __net_init tcp_set_hashinfo(struct net *net) 3382 { 3383 struct inet_hashinfo *hinfo; 3384 unsigned int ehash_entries; 3385 struct net *old_net; 3386 3387 if (net_eq(net, &init_net)) 3388 goto fallback; 3389 3390 old_net = current->nsproxy->net_ns; 3391 ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries); 3392 if (!ehash_entries) 3393 goto fallback; 3394 3395 ehash_entries = roundup_pow_of_two(ehash_entries); 3396 hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries); 3397 if (!hinfo) { 3398 pr_warn("Failed to allocate TCP ehash (entries: %u) " 3399 "for a netns, fallback to the global one\n", 3400 ehash_entries); 3401 fallback: 3402 hinfo = &tcp_hashinfo; 3403 ehash_entries = tcp_hashinfo.ehash_mask + 1; 3404 } 3405 3406 net->ipv4.tcp_death_row.hashinfo = hinfo; 3407 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2; 3408 net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128); 3409 } 3410 3411 static int __net_init tcp_sk_init(struct net *net) 3412 { 3413 net->ipv4.sysctl_tcp_ecn = 2; 3414 net->ipv4.sysctl_tcp_ecn_fallback = 1; 3415 3416 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; 3417 
net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS; 3418 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; 3419 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL; 3420 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS; 3421 3422 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME; 3423 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES; 3424 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL; 3425 3426 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES; 3427 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES; 3428 net->ipv4.sysctl_tcp_syncookies = 1; 3429 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH; 3430 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1; 3431 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2; 3432 net->ipv4.sysctl_tcp_orphan_retries = 0; 3433 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; 3434 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; 3435 net->ipv4.sysctl_tcp_tw_reuse = 2; 3436 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1; 3437 3438 refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1); 3439 tcp_set_hashinfo(net); 3440 3441 net->ipv4.sysctl_tcp_sack = 1; 3442 net->ipv4.sysctl_tcp_window_scaling = 1; 3443 net->ipv4.sysctl_tcp_timestamps = 1; 3444 net->ipv4.sysctl_tcp_early_retrans = 3; 3445 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION; 3446 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */ 3447 net->ipv4.sysctl_tcp_retrans_collapse = 1; 3448 net->ipv4.sysctl_tcp_max_reordering = 300; 3449 net->ipv4.sysctl_tcp_dsack = 1; 3450 net->ipv4.sysctl_tcp_app_win = 31; 3451 net->ipv4.sysctl_tcp_adv_win_scale = 1; 3452 net->ipv4.sysctl_tcp_frto = 2; 3453 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1; 3454 /* This limits the percentage of the congestion window which we 3455 * will allow a single TSO frame to consume. Building TSO frames 3456 * which are too large can cause TCP streams to be bursty. 3457 */ 3458 net->ipv4.sysctl_tcp_tso_win_divisor = 3; 3459 /* Default TSQ limit of 16 TSO segments */ 3460 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536; 3461 3462 /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. 
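	 * (Setting the limit to INT_MAX below makes the per-netns cap
	 *  effectively unlimited.)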
*/ 3463 net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX; 3464 3465 net->ipv4.sysctl_tcp_min_tso_segs = 2; 3466 net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */ 3467 net->ipv4.sysctl_tcp_min_rtt_wlen = 300; 3468 net->ipv4.sysctl_tcp_autocorking = 1; 3469 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; 3470 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; 3471 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120; 3472 if (net != &init_net) { 3473 memcpy(net->ipv4.sysctl_tcp_rmem, 3474 init_net.ipv4.sysctl_tcp_rmem, 3475 sizeof(init_net.ipv4.sysctl_tcp_rmem)); 3476 memcpy(net->ipv4.sysctl_tcp_wmem, 3477 init_net.ipv4.sysctl_tcp_wmem, 3478 sizeof(init_net.ipv4.sysctl_tcp_wmem)); 3479 } 3480 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC; 3481 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC; 3482 net->ipv4.sysctl_tcp_comp_sack_nr = 44; 3483 net->ipv4.sysctl_tcp_backlog_ack_defer = 1; 3484 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; 3485 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0; 3486 atomic_set(&net->ipv4.tfo_active_disable_times, 0); 3487 3488 /* Set default values for PLB */ 3489 net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */ 3490 net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3; 3491 net->ipv4.sysctl_tcp_plb_rehash_rounds = 12; 3492 net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60; 3493 /* Default congestion threshold for PLB to mark a round is 50% */ 3494 net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2; 3495 3496 /* Reno is always built in */ 3497 if (!net_eq(net, &init_net) && 3498 bpf_try_module_get(init_net.ipv4.tcp_congestion_control, 3499 init_net.ipv4.tcp_congestion_control->owner)) 3500 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control; 3501 else 3502 net->ipv4.tcp_congestion_control = &tcp_reno; 3503 3504 net->ipv4.sysctl_tcp_syn_linear_timeouts = 4; 3505 net->ipv4.sysctl_tcp_shrink_window = 0; 3506 3507 net->ipv4.sysctl_tcp_pingpong_thresh = 1; 3508 net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN); 3509 3510 return 0; 3511 } 3512 3513 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) 3514 { 3515 struct net *net; 3516 3517 tcp_twsk_purge(net_exit_list); 3518 3519 list_for_each_entry(net, net_exit_list, exit_list) { 3520 inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo); 3521 WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount)); 3522 tcp_fastopen_ctx_destroy(net); 3523 } 3524 } 3525 3526 static struct pernet_operations __net_initdata tcp_sk_ops = { 3527 .init = tcp_sk_init, 3528 .exit = tcp_sk_exit, 3529 .exit_batch = tcp_sk_exit_batch, 3530 }; 3531 3532 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3533 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta, 3534 struct sock_common *sk_common, uid_t uid) 3535 3536 #define INIT_BATCH_SZ 16 3537 3538 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux) 3539 { 3540 struct bpf_tcp_iter_state *iter = priv_data; 3541 int err; 3542 3543 err = bpf_iter_init_seq_net(priv_data, aux); 3544 if (err) 3545 return err; 3546 3547 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ); 3548 if (err) { 3549 bpf_iter_fini_seq_net(priv_data); 3550 return err; 3551 } 3552 3553 return 0; 3554 } 3555 3556 static void bpf_iter_fini_tcp(void *priv_data) 3557 { 3558 struct bpf_tcp_iter_state *iter = priv_data; 3559 3560 bpf_iter_fini_seq_net(priv_data); 3561 kvfree(iter->batch); 3562 } 3563 3564 static const struct bpf_iter_seq_info tcp_seq_info = { 3565 .seq_ops = 
&bpf_iter_tcp_seq_ops, 3566 .init_seq_private = bpf_iter_init_tcp, 3567 .fini_seq_private = bpf_iter_fini_tcp, 3568 .seq_priv_size = sizeof(struct bpf_tcp_iter_state), 3569 }; 3570 3571 static const struct bpf_func_proto * 3572 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id, 3573 const struct bpf_prog *prog) 3574 { 3575 switch (func_id) { 3576 case BPF_FUNC_setsockopt: 3577 return &bpf_sk_setsockopt_proto; 3578 case BPF_FUNC_getsockopt: 3579 return &bpf_sk_getsockopt_proto; 3580 default: 3581 return NULL; 3582 } 3583 } 3584 3585 static struct bpf_iter_reg tcp_reg_info = { 3586 .target = "tcp", 3587 .ctx_arg_info_size = 1, 3588 .ctx_arg_info = { 3589 { offsetof(struct bpf_iter__tcp, sk_common), 3590 PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED }, 3591 }, 3592 .get_func_proto = bpf_iter_tcp_get_func_proto, 3593 .seq_info = &tcp_seq_info, 3594 }; 3595 3596 static void __init bpf_iter_register(void) 3597 { 3598 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON]; 3599 if (bpf_iter_reg_target(&tcp_reg_info)) 3600 pr_warn("Warning: could not register bpf iterator tcp\n"); 3601 } 3602 3603 #endif 3604 3605 void __init tcp_v4_init(void) 3606 { 3607 int cpu, res; 3608 3609 for_each_possible_cpu(cpu) { 3610 struct sock *sk; 3611 3612 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW, 3613 IPPROTO_TCP, &init_net); 3614 if (res) 3615 panic("Failed to create the TCP control socket.\n"); 3616 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); 3617 3618 /* Please enforce IP_DF and IPID==0 for RST and 3619 * ACK sent in SYN-RECV and TIME-WAIT state. 3620 */ 3621 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO; 3622 3623 sk->sk_clockid = CLOCK_MONOTONIC; 3624 3625 per_cpu(ipv4_tcp_sk.sock, cpu) = sk; 3626 } 3627 if (register_pernet_subsys(&tcp_sk_ops)) 3628 panic("Failed to create the TCP control socket.\n"); 3629 3630 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3631 bpf_iter_register(); 3632 #endif 3633 } 3634
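
/* Worked example for tcp_set_hashinfo() above (added illustration, using an
 * assumed sysctl value): if the parent netns sets tcp_child_ehash_entries
 * (read back as sysctl_tcp_child_ehash_entries) to 500 before a child netns
 * is created, the value is rounded up to 512 ehash slots, which yields
 * tcp_death_row.sysctl_max_tw_buckets = 256 and
 * sysctl_max_syn_backlog = max(128, 512 / 128) = 128 for that netns.
 * If the sysctl is 0 or the allocation fails, the child shares the global
 * tcp_hashinfo instead.
 */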