1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * INET An implementation of the TCP/IP protocol suite for the LINUX 4 * operating system. INET is implemented using the BSD Socket 5 * interface as the means of communication with the user level. 6 * 7 * Implementation of the Transmission Control Protocol(TCP). 8 * 9 * IPv4 specific functions 10 * 11 * code split from: 12 * linux/ipv4/tcp.c 13 * linux/ipv4/tcp_input.c 14 * linux/ipv4/tcp_output.c 15 * 16 * See tcp.c for author information 17 */ 18 19 /* 20 * Changes: 21 * David S. Miller : New socket lookup architecture. 22 * This code is dedicated to John Dyson. 23 * David S. Miller : Change semantics of established hash, 24 * half is devoted to TIME_WAIT sockets 25 * and the rest go in the other half. 26 * Andi Kleen : Add support for syncookies and fixed 27 * some bugs: ip options weren't passed to 28 * the TCP layer, missed a check for an 29 * ACK bit. 30 * Andi Kleen : Implemented fast path mtu discovery. 31 * Fixed many serious bugs in the 32 * request_sock handling and moved 33 * most of it into the af independent code. 34 * Added tail drop and some other bugfixes. 35 * Added new listen semantics. 36 * Mike McLagan : Routing by source 37 * Juan Jose Ciarlante: ip_dynaddr bits 38 * Andi Kleen: various fixes. 39 * Vitaly E. Lavrov : Transparent proxy revived after year 40 * coma. 41 * Andi Kleen : Fix new listen. 42 * Andi Kleen : Fix accept error reporting. 43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which 44 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind 45 * a single port at the same time. 46 */ 47 48 #define pr_fmt(fmt) "TCP: " fmt 49 50 #include <linux/bottom_half.h> 51 #include <linux/types.h> 52 #include <linux/fcntl.h> 53 #include <linux/module.h> 54 #include <linux/random.h> 55 #include <linux/cache.h> 56 #include <linux/jhash.h> 57 #include <linux/init.h> 58 #include <linux/times.h> 59 #include <linux/slab.h> 60 #include <linux/sched.h> 61 62 #include <net/net_namespace.h> 63 #include <net/icmp.h> 64 #include <net/inet_hashtables.h> 65 #include <net/tcp.h> 66 #include <net/transp_v6.h> 67 #include <net/ipv6.h> 68 #include <net/inet_common.h> 69 #include <net/timewait_sock.h> 70 #include <net/xfrm.h> 71 #include <net/secure_seq.h> 72 #include <net/busy_poll.h> 73 74 #include <linux/inet.h> 75 #include <linux/ipv6.h> 76 #include <linux/stddef.h> 77 #include <linux/proc_fs.h> 78 #include <linux/seq_file.h> 79 #include <linux/inetdevice.h> 80 #include <linux/btf_ids.h> 81 82 #include <crypto/hash.h> 83 #include <linux/scatterlist.h> 84 85 #include <trace/events/tcp.h> 86 87 #ifdef CONFIG_TCP_MD5SIG 88 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, 89 __be32 daddr, __be32 saddr, const struct tcphdr *th); 90 #endif 91 92 struct inet_hashinfo tcp_hashinfo; 93 EXPORT_SYMBOL(tcp_hashinfo); 94 95 static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk); 96 97 static u32 tcp_v4_init_seq(const struct sk_buff *skb) 98 { 99 return secure_tcp_seq(ip_hdr(skb)->daddr, 100 ip_hdr(skb)->saddr, 101 tcp_hdr(skb)->dest, 102 tcp_hdr(skb)->source); 103 } 104 105 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb) 106 { 107 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr); 108 } 109 110 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) 111 { 112 int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse); 113 const struct inet_timewait_sock *tw = inet_twsk(sktw); 114 const struct tcp_timewait_sock *tcptw = 
tcp_twsk(sktw); 115 struct tcp_sock *tp = tcp_sk(sk); 116 117 if (reuse == 2) { 118 /* Still does not detect *everything* that goes through 119 * lo, since we require a loopback src or dst address 120 * or direct binding to 'lo' interface. 121 */ 122 bool loopback = false; 123 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX) 124 loopback = true; 125 #if IS_ENABLED(CONFIG_IPV6) 126 if (tw->tw_family == AF_INET6) { 127 if (ipv6_addr_loopback(&tw->tw_v6_daddr) || 128 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) || 129 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) || 130 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr)) 131 loopback = true; 132 } else 133 #endif 134 { 135 if (ipv4_is_loopback(tw->tw_daddr) || 136 ipv4_is_loopback(tw->tw_rcv_saddr)) 137 loopback = true; 138 } 139 if (!loopback) 140 reuse = 0; 141 } 142 143 /* With PAWS, it is safe from the viewpoint 144 of data integrity. Even without PAWS it is safe provided sequence 145 spaces do not overlap i.e. at data rates <= 80Mbit/sec. 146 147 Actually, the idea is close to VJ's one, only timestamp cache is 148 held not per host, but per port pair and TW bucket is used as state 149 holder. 150 151 If TW bucket has been already destroyed we fall back to VJ's scheme 152 and use initial timestamp retrieved from peer table. 153 */ 154 if (tcptw->tw_ts_recent_stamp && 155 (!twp || (reuse && time_after32(ktime_get_seconds(), 156 tcptw->tw_ts_recent_stamp)))) { 157 /* In case of repair and re-using TIME-WAIT sockets we still 158 * want to be sure that it is safe as above but honor the 159 * sequence numbers and time stamps set as part of the repair 160 * process. 161 * 162 * Without this check re-using a TIME-WAIT socket with TCP 163 * repair would accumulate a -1 on the repair assigned 164 * sequence number. The first time it is reused the sequence 165 * is -1, the second time -2, etc. This fixes that issue 166 * without appearing to create any others. 167 */ 168 if (likely(!tp->repair)) { 169 u32 seq = tcptw->tw_snd_nxt + 65535 + 2; 170 171 if (!seq) 172 seq = 1; 173 WRITE_ONCE(tp->write_seq, seq); 174 tp->rx_opt.ts_recent = tcptw->tw_ts_recent; 175 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; 176 } 177 sock_hold(sktw); 178 return 1; 179 } 180 181 return 0; 182 } 183 EXPORT_SYMBOL_GPL(tcp_twsk_unique); 184 185 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr, 186 int addr_len) 187 { 188 /* This check is replicated from tcp_v4_connect() and intended to 189 * prevent BPF program called below from accessing bytes that are out 190 * of the bound specified by user in addr_len. 191 */ 192 if (addr_len < sizeof(struct sockaddr_in)) 193 return -EINVAL; 194 195 sock_owned_by_me(sk); 196 197 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len); 198 } 199 200 /* This will initiate an outgoing connection. 
*/ 201 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) 202 { 203 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; 204 struct inet_timewait_death_row *tcp_death_row; 205 struct inet_sock *inet = inet_sk(sk); 206 struct tcp_sock *tp = tcp_sk(sk); 207 struct ip_options_rcu *inet_opt; 208 struct net *net = sock_net(sk); 209 __be16 orig_sport, orig_dport; 210 __be32 daddr, nexthop; 211 struct flowi4 *fl4; 212 struct rtable *rt; 213 int err; 214 215 if (addr_len < sizeof(struct sockaddr_in)) 216 return -EINVAL; 217 218 if (usin->sin_family != AF_INET) 219 return -EAFNOSUPPORT; 220 221 nexthop = daddr = usin->sin_addr.s_addr; 222 inet_opt = rcu_dereference_protected(inet->inet_opt, 223 lockdep_sock_is_held(sk)); 224 if (inet_opt && inet_opt->opt.srr) { 225 if (!daddr) 226 return -EINVAL; 227 nexthop = inet_opt->opt.faddr; 228 } 229 230 orig_sport = inet->inet_sport; 231 orig_dport = usin->sin_port; 232 fl4 = &inet->cork.fl.u.ip4; 233 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr, 234 sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport, 235 orig_dport, sk); 236 if (IS_ERR(rt)) { 237 err = PTR_ERR(rt); 238 if (err == -ENETUNREACH) 239 IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); 240 return err; 241 } 242 243 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { 244 ip_rt_put(rt); 245 return -ENETUNREACH; 246 } 247 248 if (!inet_opt || !inet_opt->opt.srr) 249 daddr = fl4->daddr; 250 251 tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; 252 253 if (!inet->inet_saddr) { 254 err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET); 255 if (err) { 256 ip_rt_put(rt); 257 return err; 258 } 259 } else { 260 sk_rcv_saddr_set(sk, inet->inet_saddr); 261 } 262 263 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) { 264 /* Reset inherited state */ 265 tp->rx_opt.ts_recent = 0; 266 tp->rx_opt.ts_recent_stamp = 0; 267 if (likely(!tp->repair)) 268 WRITE_ONCE(tp->write_seq, 0); 269 } 270 271 inet->inet_dport = usin->sin_port; 272 sk_daddr_set(sk, daddr); 273 274 inet_csk(sk)->icsk_ext_hdr_len = 0; 275 if (inet_opt) 276 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 277 278 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT; 279 280 /* Socket identity is still unknown (sport may be zero). 281 * However we set state to SYN-SENT and not releasing socket 282 * lock select source port, enter ourselves into the hash tables and 283 * complete initialization after this. 284 */ 285 tcp_set_state(sk, TCP_SYN_SENT); 286 err = inet_hash_connect(tcp_death_row, sk); 287 if (err) 288 goto failure; 289 290 sk_set_txhash(sk); 291 292 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport, 293 inet->inet_sport, inet->inet_dport, sk); 294 if (IS_ERR(rt)) { 295 err = PTR_ERR(rt); 296 rt = NULL; 297 goto failure; 298 } 299 tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst); 300 /* OK, now commit destination to socket. 
*/ 301 sk->sk_gso_type = SKB_GSO_TCPV4; 302 sk_setup_caps(sk, &rt->dst); 303 rt = NULL; 304 305 if (likely(!tp->repair)) { 306 if (!tp->write_seq) 307 WRITE_ONCE(tp->write_seq, 308 secure_tcp_seq(inet->inet_saddr, 309 inet->inet_daddr, 310 inet->inet_sport, 311 usin->sin_port)); 312 WRITE_ONCE(tp->tsoffset, 313 secure_tcp_ts_off(net, inet->inet_saddr, 314 inet->inet_daddr)); 315 } 316 317 atomic_set(&inet->inet_id, get_random_u16()); 318 319 if (tcp_fastopen_defer_connect(sk, &err)) 320 return err; 321 if (err) 322 goto failure; 323 324 err = tcp_connect(sk); 325 326 if (err) 327 goto failure; 328 329 return 0; 330 331 failure: 332 /* 333 * This unhashes the socket and releases the local port, 334 * if necessary. 335 */ 336 tcp_set_state(sk, TCP_CLOSE); 337 inet_bhash2_reset_saddr(sk); 338 ip_rt_put(rt); 339 sk->sk_route_caps = 0; 340 inet->inet_dport = 0; 341 return err; 342 } 343 EXPORT_SYMBOL(tcp_v4_connect); 344 345 /* 346 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191. 347 * It can be called through tcp_release_cb() if socket was owned by user 348 * at the time tcp_v4_err() was called to handle ICMP message. 349 */ 350 void tcp_v4_mtu_reduced(struct sock *sk) 351 { 352 struct inet_sock *inet = inet_sk(sk); 353 struct dst_entry *dst; 354 u32 mtu; 355 356 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) 357 return; 358 mtu = READ_ONCE(tcp_sk(sk)->mtu_info); 359 dst = inet_csk_update_pmtu(sk, mtu); 360 if (!dst) 361 return; 362 363 /* Something is about to be wrong... Remember soft error 364 * for the case, if this connection will not able to recover. 365 */ 366 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst)) 367 WRITE_ONCE(sk->sk_err_soft, EMSGSIZE); 368 369 mtu = dst_mtu(dst); 370 371 if (inet->pmtudisc != IP_PMTUDISC_DONT && 372 ip_sk_accept_pmtu(sk) && 373 inet_csk(sk)->icsk_pmtu_cookie > mtu) { 374 tcp_sync_mss(sk, mtu); 375 376 /* Resend the TCP packet because it's 377 * clear that the old packet has been 378 * dropped. This is the new "fast" path mtu 379 * discovery. 380 */ 381 tcp_simple_retransmit(sk); 382 } /* else let the usual retransmit timer handle it */ 383 } 384 EXPORT_SYMBOL(tcp_v4_mtu_reduced); 385 386 static void do_redirect(struct sk_buff *skb, struct sock *sk) 387 { 388 struct dst_entry *dst = __sk_dst_check(sk, 0); 389 390 if (dst) 391 dst->ops->redirect(dst, sk, skb); 392 } 393 394 395 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */ 396 void tcp_req_err(struct sock *sk, u32 seq, bool abort) 397 { 398 struct request_sock *req = inet_reqsk(sk); 399 struct net *net = sock_net(sk); 400 401 /* ICMPs are not backlogged, hence we cannot get 402 * an established socket here. 403 */ 404 if (seq != tcp_rsk(req)->snt_isn) { 405 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); 406 } else if (abort) { 407 /* 408 * Still in SYN_RECV, just remove it silently. 409 * There is no good way to pass the error to the newly 410 * created socket, and POSIX does not want network 411 * errors returned from accept(). 
412 */ 413 inet_csk_reqsk_queue_drop(req->rsk_listener, req); 414 tcp_listendrop(req->rsk_listener); 415 } 416 reqsk_put(req); 417 } 418 EXPORT_SYMBOL(tcp_req_err); 419 420 /* TCP-LD (RFC 6069) logic */ 421 void tcp_ld_RTO_revert(struct sock *sk, u32 seq) 422 { 423 struct inet_connection_sock *icsk = inet_csk(sk); 424 struct tcp_sock *tp = tcp_sk(sk); 425 struct sk_buff *skb; 426 s32 remaining; 427 u32 delta_us; 428 429 if (sock_owned_by_user(sk)) 430 return; 431 432 if (seq != tp->snd_una || !icsk->icsk_retransmits || 433 !icsk->icsk_backoff) 434 return; 435 436 skb = tcp_rtx_queue_head(sk); 437 if (WARN_ON_ONCE(!skb)) 438 return; 439 440 icsk->icsk_backoff--; 441 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT; 442 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); 443 444 tcp_mstamp_refresh(tp); 445 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb)); 446 remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us); 447 448 if (remaining > 0) { 449 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, 450 remaining, TCP_RTO_MAX); 451 } else { 452 /* RTO revert clocked out retransmission. 453 * Will retransmit now. 454 */ 455 tcp_retransmit_timer(sk); 456 } 457 } 458 EXPORT_SYMBOL(tcp_ld_RTO_revert); 459 460 /* 461 * This routine is called by the ICMP module when it gets some 462 * sort of error condition. If err < 0 then the socket should 463 * be closed and the error returned to the user. If err > 0 464 * it's just the icmp type << 8 | icmp code. After adjustment 465 * header points to the first 8 bytes of the tcp header. We need 466 * to find the appropriate port. 467 * 468 * The locking strategy used here is very "optimistic". When 469 * someone else accesses the socket the ICMP is just dropped 470 * and for some paths there is no check at all. 471 * A more general error queue to queue errors for later handling 472 * is probably better. 473 * 474 */ 475 476 int tcp_v4_err(struct sk_buff *skb, u32 info) 477 { 478 const struct iphdr *iph = (const struct iphdr *)skb->data; 479 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2)); 480 struct tcp_sock *tp; 481 const int type = icmp_hdr(skb)->type; 482 const int code = icmp_hdr(skb)->code; 483 struct sock *sk; 484 struct request_sock *fastopen; 485 u32 seq, snd_una; 486 int err; 487 struct net *net = dev_net(skb->dev); 488 489 sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, 490 iph->daddr, th->dest, iph->saddr, 491 ntohs(th->source), inet_iif(skb), 0); 492 if (!sk) { 493 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); 494 return -ENOENT; 495 } 496 if (sk->sk_state == TCP_TIME_WAIT) { 497 inet_twsk_put(inet_twsk(sk)); 498 return 0; 499 } 500 seq = ntohl(th->seq); 501 if (sk->sk_state == TCP_NEW_SYN_RECV) { 502 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB || 503 type == ICMP_TIME_EXCEEDED || 504 (type == ICMP_DEST_UNREACH && 505 (code == ICMP_NET_UNREACH || 506 code == ICMP_HOST_UNREACH))); 507 return 0; 508 } 509 510 bh_lock_sock(sk); 511 /* If too many ICMPs get dropped on busy 512 * servers this needs to be solved differently. 513 * We do take care of PMTU discovery (RFC1191) special case : 514 * we can receive locally generated ICMP messages while socket is held. 
515 */ 516 if (sock_owned_by_user(sk)) { 517 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)) 518 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS); 519 } 520 if (sk->sk_state == TCP_CLOSE) 521 goto out; 522 523 if (static_branch_unlikely(&ip4_min_ttl)) { 524 /* min_ttl can be changed concurrently from do_ip_setsockopt() */ 525 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) { 526 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 527 goto out; 528 } 529 } 530 531 tp = tcp_sk(sk); 532 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ 533 fastopen = rcu_dereference(tp->fastopen_rsk); 534 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una; 535 if (sk->sk_state != TCP_LISTEN && 536 !between(seq, snd_una, tp->snd_nxt)) { 537 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); 538 goto out; 539 } 540 541 switch (type) { 542 case ICMP_REDIRECT: 543 if (!sock_owned_by_user(sk)) 544 do_redirect(skb, sk); 545 goto out; 546 case ICMP_SOURCE_QUENCH: 547 /* Just silently ignore these. */ 548 goto out; 549 case ICMP_PARAMETERPROB: 550 err = EPROTO; 551 break; 552 case ICMP_DEST_UNREACH: 553 if (code > NR_ICMP_UNREACH) 554 goto out; 555 556 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ 557 /* We are not interested in TCP_LISTEN and open_requests 558 * (SYN-ACKs send out by Linux are always <576bytes so 559 * they should go through unfragmented). 560 */ 561 if (sk->sk_state == TCP_LISTEN) 562 goto out; 563 564 WRITE_ONCE(tp->mtu_info, info); 565 if (!sock_owned_by_user(sk)) { 566 tcp_v4_mtu_reduced(sk); 567 } else { 568 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags)) 569 sock_hold(sk); 570 } 571 goto out; 572 } 573 574 err = icmp_err_convert[code].errno; 575 /* check if this ICMP message allows revert of backoff. 576 * (see RFC 6069) 577 */ 578 if (!fastopen && 579 (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH)) 580 tcp_ld_RTO_revert(sk, seq); 581 break; 582 case ICMP_TIME_EXCEEDED: 583 err = EHOSTUNREACH; 584 break; 585 default: 586 goto out; 587 } 588 589 switch (sk->sk_state) { 590 case TCP_SYN_SENT: 591 case TCP_SYN_RECV: 592 /* Only in fast or simultaneous open. If a fast open socket is 593 * already accepted it is treated as a connected one below. 594 */ 595 if (fastopen && !fastopen->sk) 596 break; 597 598 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th); 599 600 if (!sock_owned_by_user(sk)) { 601 WRITE_ONCE(sk->sk_err, err); 602 603 sk_error_report(sk); 604 605 tcp_done(sk); 606 } else { 607 WRITE_ONCE(sk->sk_err_soft, err); 608 } 609 goto out; 610 } 611 612 /* If we've already connected we will keep trying 613 * until we time out, or the user gives up. 614 * 615 * rfc1122 4.2.3.9 allows to consider as hard errors 616 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too, 617 * but it is obsoleted by pmtu discovery). 618 * 619 * Note, that in modern internet, where routing is unreliable 620 * and in each dark corner broken firewalls sit, sending random 621 * errors ordered by their masters even this two messages finally lose 622 * their original sense (even Linux sends invalid PORT_UNREACHs) 623 * 624 * Now we are in compliance with RFCs. 
625 * --ANK (980905) 626 */ 627 628 if (!sock_owned_by_user(sk) && 629 inet_test_bit(RECVERR, sk)) { 630 WRITE_ONCE(sk->sk_err, err); 631 sk_error_report(sk); 632 } else { /* Only an error on timeout */ 633 WRITE_ONCE(sk->sk_err_soft, err); 634 } 635 636 out: 637 bh_unlock_sock(sk); 638 sock_put(sk); 639 return 0; 640 } 641 642 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr) 643 { 644 struct tcphdr *th = tcp_hdr(skb); 645 646 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0); 647 skb->csum_start = skb_transport_header(skb) - skb->head; 648 skb->csum_offset = offsetof(struct tcphdr, check); 649 } 650 651 /* This routine computes an IPv4 TCP checksum. */ 652 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb) 653 { 654 const struct inet_sock *inet = inet_sk(sk); 655 656 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr); 657 } 658 EXPORT_SYMBOL(tcp_v4_send_check); 659 660 /* 661 * This routine will send an RST to the other tcp. 662 * 663 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.) 664 * for reset. 665 * Answer: if a packet caused RST, it is not for a socket 666 * existing in our system, if it is matched to a socket, 667 * it is just duplicate segment or bug in other side's TCP. 668 * So that we build reply only basing on parameters 669 * arrived with segment. 670 * Exception: precedence violation. We do not implement it in any case. 671 */ 672 673 #ifdef CONFIG_TCP_MD5SIG 674 #define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED 675 #else 676 #define OPTION_BYTES sizeof(__be32) 677 #endif 678 679 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) 680 { 681 const struct tcphdr *th = tcp_hdr(skb); 682 struct { 683 struct tcphdr th; 684 __be32 opt[OPTION_BYTES / sizeof(__be32)]; 685 } rep; 686 struct ip_reply_arg arg; 687 #ifdef CONFIG_TCP_MD5SIG 688 struct tcp_md5sig_key *key = NULL; 689 const __u8 *hash_location = NULL; 690 unsigned char newhash[16]; 691 int genhash; 692 struct sock *sk1 = NULL; 693 #endif 694 u64 transmit_time = 0; 695 struct sock *ctl_sk; 696 struct net *net; 697 u32 txhash = 0; 698 699 /* Never send a reset in response to a reset. */ 700 if (th->rst) 701 return; 702 703 /* If sk not NULL, it means we did a successful lookup and incoming 704 * route had to be correct. prequeue might have dropped our dst. 705 */ 706 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL) 707 return; 708 709 /* Swap the send and the receive. */ 710 memset(&rep, 0, sizeof(rep)); 711 rep.th.dest = th->source; 712 rep.th.source = th->dest; 713 rep.th.doff = sizeof(struct tcphdr) / 4; 714 rep.th.rst = 1; 715 716 if (th->ack) { 717 rep.th.seq = th->ack_seq; 718 } else { 719 rep.th.ack = 1; 720 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin + 721 skb->len - (th->doff << 2)); 722 } 723 724 memset(&arg, 0, sizeof(arg)); 725 arg.iov[0].iov_base = (unsigned char *)&rep; 726 arg.iov[0].iov_len = sizeof(rep.th); 727 728 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev); 729 #ifdef CONFIG_TCP_MD5SIG 730 rcu_read_lock(); 731 hash_location = tcp_parse_md5sig_option(th); 732 if (sk && sk_fullsock(sk)) { 733 const union tcp_md5_addr *addr; 734 int l3index; 735 736 /* sdif set, means packet ingressed via a device 737 * in an L3 domain and inet_iif is set to it. 738 */ 739 l3index = tcp_v4_sdif(skb) ? 
inet_iif(skb) : 0; 740 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 741 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 742 } else if (hash_location) { 743 const union tcp_md5_addr *addr; 744 int sdif = tcp_v4_sdif(skb); 745 int dif = inet_iif(skb); 746 int l3index; 747 748 /* 749 * active side is lost. Try to find listening socket through 750 * source port, and then find md5 key through listening socket. 751 * we are not loose security here: 752 * Incoming packet is checked with md5 hash with finding key, 753 * no RST generated if md5 hash doesn't match. 754 */ 755 sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo, 756 NULL, 0, ip_hdr(skb)->saddr, 757 th->source, ip_hdr(skb)->daddr, 758 ntohs(th->source), dif, sdif); 759 /* don't send rst if it can't find key */ 760 if (!sk1) 761 goto out; 762 763 /* sdif set, means packet ingressed via a device 764 * in an L3 domain and dif is set to it. 765 */ 766 l3index = sdif ? dif : 0; 767 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 768 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET); 769 if (!key) 770 goto out; 771 772 773 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb); 774 if (genhash || memcmp(hash_location, newhash, 16) != 0) 775 goto out; 776 777 } 778 779 if (key) { 780 rep.opt[0] = htonl((TCPOPT_NOP << 24) | 781 (TCPOPT_NOP << 16) | 782 (TCPOPT_MD5SIG << 8) | 783 TCPOLEN_MD5SIG); 784 /* Update length and the length the header thinks exists */ 785 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; 786 rep.th.doff = arg.iov[0].iov_len / 4; 787 788 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1], 789 key, ip_hdr(skb)->saddr, 790 ip_hdr(skb)->daddr, &rep.th); 791 } 792 #endif 793 /* Can't co-exist with TCPMD5, hence check rep.opt[0] */ 794 if (rep.opt[0] == 0) { 795 __be32 mrst = mptcp_reset_option(skb); 796 797 if (mrst) { 798 rep.opt[0] = mrst; 799 arg.iov[0].iov_len += sizeof(mrst); 800 rep.th.doff = arg.iov[0].iov_len / 4; 801 } 802 } 803 804 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 805 ip_hdr(skb)->saddr, /* XXX */ 806 arg.iov[0].iov_len, IPPROTO_TCP, 0); 807 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 808 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0; 809 810 /* When socket is gone, all binding information is lost. 811 * routing might fail in this case. No choice here, if we choose to force 812 * input interface, we will misroute in case of asymmetric route. 813 */ 814 if (sk) { 815 arg.bound_dev_if = sk->sk_bound_dev_if; 816 if (sk_fullsock(sk)) 817 trace_tcp_send_reset(sk, skb); 818 } 819 820 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) != 821 offsetof(struct inet_timewait_sock, tw_bound_dev_if)); 822 823 arg.tos = ip_hdr(skb)->tos; 824 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); 825 local_bh_disable(); 826 ctl_sk = this_cpu_read(ipv4_tcp_sk); 827 sock_net_set(ctl_sk, net); 828 if (sk) { 829 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 830 inet_twsk(sk)->tw_mark : sk->sk_mark; 831 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? 832 inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority); 833 transmit_time = tcp_transmit_time(sk); 834 xfrm_sk_clone_policy(ctl_sk, sk); 835 txhash = (sk->sk_state == TCP_TIME_WAIT) ? 
836 inet_twsk(sk)->tw_txhash : sk->sk_txhash; 837 } else { 838 ctl_sk->sk_mark = 0; 839 ctl_sk->sk_priority = 0; 840 } 841 ip_send_unicast_reply(ctl_sk, 842 skb, &TCP_SKB_CB(skb)->header.h4.opt, 843 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 844 &arg, arg.iov[0].iov_len, 845 transmit_time, txhash); 846 847 xfrm_sk_free_policy(ctl_sk); 848 sock_net_set(ctl_sk, &init_net); 849 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); 850 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS); 851 local_bh_enable(); 852 853 #ifdef CONFIG_TCP_MD5SIG 854 out: 855 rcu_read_unlock(); 856 #endif 857 } 858 859 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states 860 outside socket context is ugly, certainly. What can I do? 861 */ 862 863 static void tcp_v4_send_ack(const struct sock *sk, 864 struct sk_buff *skb, u32 seq, u32 ack, 865 u32 win, u32 tsval, u32 tsecr, int oif, 866 struct tcp_md5sig_key *key, 867 int reply_flags, u8 tos, u32 txhash) 868 { 869 const struct tcphdr *th = tcp_hdr(skb); 870 struct { 871 struct tcphdr th; 872 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2) 873 #ifdef CONFIG_TCP_MD5SIG 874 + (TCPOLEN_MD5SIG_ALIGNED >> 2) 875 #endif 876 ]; 877 } rep; 878 struct net *net = sock_net(sk); 879 struct ip_reply_arg arg; 880 struct sock *ctl_sk; 881 u64 transmit_time; 882 883 memset(&rep.th, 0, sizeof(struct tcphdr)); 884 memset(&arg, 0, sizeof(arg)); 885 886 arg.iov[0].iov_base = (unsigned char *)&rep; 887 arg.iov[0].iov_len = sizeof(rep.th); 888 if (tsecr) { 889 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | 890 (TCPOPT_TIMESTAMP << 8) | 891 TCPOLEN_TIMESTAMP); 892 rep.opt[1] = htonl(tsval); 893 rep.opt[2] = htonl(tsecr); 894 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED; 895 } 896 897 /* Swap the send and the receive. */ 898 rep.th.dest = th->source; 899 rep.th.source = th->dest; 900 rep.th.doff = arg.iov[0].iov_len / 4; 901 rep.th.seq = htonl(seq); 902 rep.th.ack_seq = htonl(ack); 903 rep.th.ack = 1; 904 rep.th.window = htons(win); 905 906 #ifdef CONFIG_TCP_MD5SIG 907 if (key) { 908 int offset = (tsecr) ? 3 : 0; 909 910 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) | 911 (TCPOPT_NOP << 16) | 912 (TCPOPT_MD5SIG << 8) | 913 TCPOLEN_MD5SIG); 914 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; 915 rep.th.doff = arg.iov[0].iov_len/4; 916 917 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset], 918 key, ip_hdr(skb)->saddr, 919 ip_hdr(skb)->daddr, &rep.th); 920 } 921 #endif 922 arg.flags = reply_flags; 923 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 924 ip_hdr(skb)->saddr, /* XXX */ 925 arg.iov[0].iov_len, IPPROTO_TCP, 0); 926 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 927 if (oif) 928 arg.bound_dev_if = oif; 929 arg.tos = tos; 930 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL); 931 local_bh_disable(); 932 ctl_sk = this_cpu_read(ipv4_tcp_sk); 933 sock_net_set(ctl_sk, net); 934 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 935 inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark); 936 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? 
937 inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority); 938 transmit_time = tcp_transmit_time(sk); 939 ip_send_unicast_reply(ctl_sk, 940 skb, &TCP_SKB_CB(skb)->header.h4.opt, 941 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 942 &arg, arg.iov[0].iov_len, 943 transmit_time, txhash); 944 945 sock_net_set(ctl_sk, &init_net); 946 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); 947 local_bh_enable(); 948 } 949 950 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) 951 { 952 struct inet_timewait_sock *tw = inet_twsk(sk); 953 struct tcp_timewait_sock *tcptw = tcp_twsk(sk); 954 955 tcp_v4_send_ack(sk, skb, 956 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, 957 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, 958 tcp_tw_tsval(tcptw), 959 tcptw->tw_ts_recent, 960 tw->tw_bound_dev_if, 961 tcp_twsk_md5_key(tcptw), 962 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0, 963 tw->tw_tos, 964 tw->tw_txhash 965 ); 966 967 inet_twsk_put(tw); 968 } 969 970 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, 971 struct request_sock *req) 972 { 973 const union tcp_md5_addr *addr; 974 int l3index; 975 976 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV 977 * sk->sk_state == TCP_SYN_RECV -> for Fast Open. 978 */ 979 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 : 980 tcp_sk(sk)->snd_nxt; 981 982 /* RFC 7323 2.3 983 * The window field (SEG.WND) of every outgoing segment, with the 984 * exception of <SYN> segments, MUST be right-shifted by 985 * Rcv.Wind.Shift bits: 986 */ 987 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 988 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; 989 tcp_v4_send_ack(sk, skb, seq, 990 tcp_rsk(req)->rcv_nxt, 991 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, 992 tcp_rsk_tsval(tcp_rsk(req)), 993 READ_ONCE(req->ts_recent), 994 0, 995 tcp_md5_do_lookup(sk, l3index, addr, AF_INET), 996 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0, 997 ip_hdr(skb)->tos, 998 READ_ONCE(tcp_rsk(req)->txhash)); 999 } 1000 1001 /* 1002 * Send a SYN-ACK after having received a SYN. 1003 * This still operates on a request_sock only, not on a big 1004 * socket. 1005 */ 1006 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, 1007 struct flowi *fl, 1008 struct request_sock *req, 1009 struct tcp_fastopen_cookie *foc, 1010 enum tcp_synack_type synack_type, 1011 struct sk_buff *syn_skb) 1012 { 1013 const struct inet_request_sock *ireq = inet_rsk(req); 1014 struct flowi4 fl4; 1015 int err = -1; 1016 struct sk_buff *skb; 1017 u8 tos; 1018 1019 /* First, grab a route. */ 1020 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) 1021 return -1; 1022 1023 skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb); 1024 1025 if (skb) { 1026 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); 1027 1028 tos = READ_ONCE(inet_sk(sk)->tos); 1029 1030 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)) 1031 tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) | 1032 (tos & INET_ECN_MASK); 1033 1034 if (!INET_ECN_is_capable(tos) && 1035 tcp_bpf_ca_needs_ecn((struct sock *)req)) 1036 tos |= INET_ECN_ECT_0; 1037 1038 rcu_read_lock(); 1039 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, 1040 ireq->ir_rmt_addr, 1041 rcu_dereference(ireq->ireq_opt), 1042 tos); 1043 rcu_read_unlock(); 1044 err = net_xmit_eval(err); 1045 } 1046 1047 return err; 1048 } 1049 1050 /* 1051 * IPv4 request_sock destructor. 
1052 */ 1053 static void tcp_v4_reqsk_destructor(struct request_sock *req) 1054 { 1055 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1)); 1056 } 1057 1058 #ifdef CONFIG_TCP_MD5SIG 1059 /* 1060 * RFC2385 MD5 checksumming requires a mapping of 1061 * IP address->MD5 Key. 1062 * We need to maintain these in the sk structure. 1063 */ 1064 1065 DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ); 1066 EXPORT_SYMBOL(tcp_md5_needed); 1067 1068 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new) 1069 { 1070 if (!old) 1071 return true; 1072 1073 /* l3index always overrides non-l3index */ 1074 if (old->l3index && new->l3index == 0) 1075 return false; 1076 if (old->l3index == 0 && new->l3index) 1077 return true; 1078 1079 return old->prefixlen < new->prefixlen; 1080 } 1081 1082 /* Find the Key structure for an address. */ 1083 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index, 1084 const union tcp_md5_addr *addr, 1085 int family) 1086 { 1087 const struct tcp_sock *tp = tcp_sk(sk); 1088 struct tcp_md5sig_key *key; 1089 const struct tcp_md5sig_info *md5sig; 1090 __be32 mask; 1091 struct tcp_md5sig_key *best_match = NULL; 1092 bool match; 1093 1094 /* caller either holds rcu_read_lock() or socket lock */ 1095 md5sig = rcu_dereference_check(tp->md5sig_info, 1096 lockdep_sock_is_held(sk)); 1097 if (!md5sig) 1098 return NULL; 1099 1100 hlist_for_each_entry_rcu(key, &md5sig->head, node, 1101 lockdep_sock_is_held(sk)) { 1102 if (key->family != family) 1103 continue; 1104 if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index) 1105 continue; 1106 if (family == AF_INET) { 1107 mask = inet_make_mask(key->prefixlen); 1108 match = (key->addr.a4.s_addr & mask) == 1109 (addr->a4.s_addr & mask); 1110 #if IS_ENABLED(CONFIG_IPV6) 1111 } else if (family == AF_INET6) { 1112 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6, 1113 key->prefixlen); 1114 #endif 1115 } else { 1116 match = false; 1117 } 1118 1119 if (match && better_md5_match(best_match, key)) 1120 best_match = key; 1121 } 1122 return best_match; 1123 } 1124 EXPORT_SYMBOL(__tcp_md5_do_lookup); 1125 1126 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk, 1127 const union tcp_md5_addr *addr, 1128 int family, u8 prefixlen, 1129 int l3index, u8 flags) 1130 { 1131 const struct tcp_sock *tp = tcp_sk(sk); 1132 struct tcp_md5sig_key *key; 1133 unsigned int size = sizeof(struct in_addr); 1134 const struct tcp_md5sig_info *md5sig; 1135 1136 /* caller either holds rcu_read_lock() or socket lock */ 1137 md5sig = rcu_dereference_check(tp->md5sig_info, 1138 lockdep_sock_is_held(sk)); 1139 if (!md5sig) 1140 return NULL; 1141 #if IS_ENABLED(CONFIG_IPV6) 1142 if (family == AF_INET6) 1143 size = sizeof(struct in6_addr); 1144 #endif 1145 hlist_for_each_entry_rcu(key, &md5sig->head, node, 1146 lockdep_sock_is_held(sk)) { 1147 if (key->family != family) 1148 continue; 1149 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX)) 1150 continue; 1151 if (key->l3index != l3index) 1152 continue; 1153 if (!memcmp(&key->addr, addr, size) && 1154 key->prefixlen == prefixlen) 1155 return key; 1156 } 1157 return NULL; 1158 } 1159 1160 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, 1161 const struct sock *addr_sk) 1162 { 1163 const union tcp_md5_addr *addr; 1164 int l3index; 1165 1166 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), 1167 addr_sk->sk_bound_dev_if); 1168 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr; 1169 
return tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1170 } 1171 EXPORT_SYMBOL(tcp_v4_md5_lookup); 1172 1173 static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp) 1174 { 1175 struct tcp_sock *tp = tcp_sk(sk); 1176 struct tcp_md5sig_info *md5sig; 1177 1178 md5sig = kmalloc(sizeof(*md5sig), gfp); 1179 if (!md5sig) 1180 return -ENOMEM; 1181 1182 sk_gso_disable(sk); 1183 INIT_HLIST_HEAD(&md5sig->head); 1184 rcu_assign_pointer(tp->md5sig_info, md5sig); 1185 return 0; 1186 } 1187 1188 /* This can be called on a newly created socket, from other files */ 1189 static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, 1190 int family, u8 prefixlen, int l3index, u8 flags, 1191 const u8 *newkey, u8 newkeylen, gfp_t gfp) 1192 { 1193 /* Add Key to the list */ 1194 struct tcp_md5sig_key *key; 1195 struct tcp_sock *tp = tcp_sk(sk); 1196 struct tcp_md5sig_info *md5sig; 1197 1198 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags); 1199 if (key) { 1200 /* Pre-existing entry - just update that one. 1201 * Note that the key might be used concurrently. 1202 * data_race() is telling kcsan that we do not care of 1203 * key mismatches, since changing MD5 key on live flows 1204 * can lead to packet drops. 1205 */ 1206 data_race(memcpy(key->key, newkey, newkeylen)); 1207 1208 /* Pairs with READ_ONCE() in tcp_md5_hash_key(). 1209 * Also note that a reader could catch new key->keylen value 1210 * but old key->key[], this is the reason we use __GFP_ZERO 1211 * at sock_kmalloc() time below these lines. 1212 */ 1213 WRITE_ONCE(key->keylen, newkeylen); 1214 1215 return 0; 1216 } 1217 1218 md5sig = rcu_dereference_protected(tp->md5sig_info, 1219 lockdep_sock_is_held(sk)); 1220 1221 key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO); 1222 if (!key) 1223 return -ENOMEM; 1224 if (!tcp_alloc_md5sig_pool()) { 1225 sock_kfree_s(sk, key, sizeof(*key)); 1226 return -ENOMEM; 1227 } 1228 1229 memcpy(key->key, newkey, newkeylen); 1230 key->keylen = newkeylen; 1231 key->family = family; 1232 key->prefixlen = prefixlen; 1233 key->l3index = l3index; 1234 key->flags = flags; 1235 memcpy(&key->addr, addr, 1236 (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? 
sizeof(struct in6_addr) : 1237 sizeof(struct in_addr)); 1238 hlist_add_head_rcu(&key->node, &md5sig->head); 1239 return 0; 1240 } 1241 1242 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, 1243 int family, u8 prefixlen, int l3index, u8 flags, 1244 const u8 *newkey, u8 newkeylen) 1245 { 1246 struct tcp_sock *tp = tcp_sk(sk); 1247 1248 if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) { 1249 if (tcp_md5sig_info_add(sk, GFP_KERNEL)) 1250 return -ENOMEM; 1251 1252 if (!static_branch_inc(&tcp_md5_needed.key)) { 1253 struct tcp_md5sig_info *md5sig; 1254 1255 md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk)); 1256 rcu_assign_pointer(tp->md5sig_info, NULL); 1257 kfree_rcu(md5sig, rcu); 1258 return -EUSERS; 1259 } 1260 } 1261 1262 return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags, 1263 newkey, newkeylen, GFP_KERNEL); 1264 } 1265 EXPORT_SYMBOL(tcp_md5_do_add); 1266 1267 int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr, 1268 int family, u8 prefixlen, int l3index, 1269 struct tcp_md5sig_key *key) 1270 { 1271 struct tcp_sock *tp = tcp_sk(sk); 1272 1273 if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) { 1274 if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC))) 1275 return -ENOMEM; 1276 1277 if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) { 1278 struct tcp_md5sig_info *md5sig; 1279 1280 md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk)); 1281 net_warn_ratelimited("Too many TCP-MD5 keys in the system\n"); 1282 rcu_assign_pointer(tp->md5sig_info, NULL); 1283 kfree_rcu(md5sig, rcu); 1284 return -EUSERS; 1285 } 1286 } 1287 1288 return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, 1289 key->flags, key->key, key->keylen, 1290 sk_gfp_mask(sk, GFP_ATOMIC)); 1291 } 1292 EXPORT_SYMBOL(tcp_md5_key_copy); 1293 1294 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family, 1295 u8 prefixlen, int l3index, u8 flags) 1296 { 1297 struct tcp_md5sig_key *key; 1298 1299 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags); 1300 if (!key) 1301 return -ENOENT; 1302 hlist_del_rcu(&key->node); 1303 atomic_sub(sizeof(*key), &sk->sk_omem_alloc); 1304 kfree_rcu(key, rcu); 1305 return 0; 1306 } 1307 EXPORT_SYMBOL(tcp_md5_do_del); 1308 1309 static void tcp_clear_md5_list(struct sock *sk) 1310 { 1311 struct tcp_sock *tp = tcp_sk(sk); 1312 struct tcp_md5sig_key *key; 1313 struct hlist_node *n; 1314 struct tcp_md5sig_info *md5sig; 1315 1316 md5sig = rcu_dereference_protected(tp->md5sig_info, 1); 1317 1318 hlist_for_each_entry_safe(key, n, &md5sig->head, node) { 1319 hlist_del_rcu(&key->node); 1320 atomic_sub(sizeof(*key), &sk->sk_omem_alloc); 1321 kfree_rcu(key, rcu); 1322 } 1323 } 1324 1325 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, 1326 sockptr_t optval, int optlen) 1327 { 1328 struct tcp_md5sig cmd; 1329 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr; 1330 const union tcp_md5_addr *addr; 1331 u8 prefixlen = 32; 1332 int l3index = 0; 1333 u8 flags; 1334 1335 if (optlen < sizeof(cmd)) 1336 return -EINVAL; 1337 1338 if (copy_from_sockptr(&cmd, optval, sizeof(cmd))) 1339 return -EFAULT; 1340 1341 if (sin->sin_family != AF_INET) 1342 return -EINVAL; 1343 1344 flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX; 1345 1346 if (optname == TCP_MD5SIG_EXT && 1347 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) { 1348 prefixlen = cmd.tcpm_prefixlen; 1349 if (prefixlen > 32) 1350 return 
-EINVAL; 1351 } 1352 1353 if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex && 1354 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) { 1355 struct net_device *dev; 1356 1357 rcu_read_lock(); 1358 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex); 1359 if (dev && netif_is_l3_master(dev)) 1360 l3index = dev->ifindex; 1361 1362 rcu_read_unlock(); 1363 1364 /* ok to reference set/not set outside of rcu; 1365 * right now device MUST be an L3 master 1366 */ 1367 if (!dev || !l3index) 1368 return -EINVAL; 1369 } 1370 1371 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr; 1372 1373 if (!cmd.tcpm_keylen) 1374 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags); 1375 1376 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) 1377 return -EINVAL; 1378 1379 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags, 1380 cmd.tcpm_key, cmd.tcpm_keylen); 1381 } 1382 1383 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp, 1384 __be32 daddr, __be32 saddr, 1385 const struct tcphdr *th, int nbytes) 1386 { 1387 struct tcp4_pseudohdr *bp; 1388 struct scatterlist sg; 1389 struct tcphdr *_th; 1390 1391 bp = hp->scratch; 1392 bp->saddr = saddr; 1393 bp->daddr = daddr; 1394 bp->pad = 0; 1395 bp->protocol = IPPROTO_TCP; 1396 bp->len = cpu_to_be16(nbytes); 1397 1398 _th = (struct tcphdr *)(bp + 1); 1399 memcpy(_th, th, sizeof(*th)); 1400 _th->check = 0; 1401 1402 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th)); 1403 ahash_request_set_crypt(hp->md5_req, &sg, NULL, 1404 sizeof(*bp) + sizeof(*th)); 1405 return crypto_ahash_update(hp->md5_req); 1406 } 1407 1408 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, 1409 __be32 daddr, __be32 saddr, const struct tcphdr *th) 1410 { 1411 struct tcp_md5sig_pool *hp; 1412 struct ahash_request *req; 1413 1414 hp = tcp_get_md5sig_pool(); 1415 if (!hp) 1416 goto clear_hash_noput; 1417 req = hp->md5_req; 1418 1419 if (crypto_ahash_init(req)) 1420 goto clear_hash; 1421 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2)) 1422 goto clear_hash; 1423 if (tcp_md5_hash_key(hp, key)) 1424 goto clear_hash; 1425 ahash_request_set_crypt(req, NULL, md5_hash, 0); 1426 if (crypto_ahash_final(req)) 1427 goto clear_hash; 1428 1429 tcp_put_md5sig_pool(); 1430 return 0; 1431 1432 clear_hash: 1433 tcp_put_md5sig_pool(); 1434 clear_hash_noput: 1435 memset(md5_hash, 0, 16); 1436 return 1; 1437 } 1438 1439 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, 1440 const struct sock *sk, 1441 const struct sk_buff *skb) 1442 { 1443 struct tcp_md5sig_pool *hp; 1444 struct ahash_request *req; 1445 const struct tcphdr *th = tcp_hdr(skb); 1446 __be32 saddr, daddr; 1447 1448 if (sk) { /* valid for establish/request sockets */ 1449 saddr = sk->sk_rcv_saddr; 1450 daddr = sk->sk_daddr; 1451 } else { 1452 const struct iphdr *iph = ip_hdr(skb); 1453 saddr = iph->saddr; 1454 daddr = iph->daddr; 1455 } 1456 1457 hp = tcp_get_md5sig_pool(); 1458 if (!hp) 1459 goto clear_hash_noput; 1460 req = hp->md5_req; 1461 1462 if (crypto_ahash_init(req)) 1463 goto clear_hash; 1464 1465 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len)) 1466 goto clear_hash; 1467 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2)) 1468 goto clear_hash; 1469 if (tcp_md5_hash_key(hp, key)) 1470 goto clear_hash; 1471 ahash_request_set_crypt(req, NULL, md5_hash, 0); 1472 if (crypto_ahash_final(req)) 1473 goto clear_hash; 1474 1475 tcp_put_md5sig_pool(); 1476 return 0; 1477 1478 clear_hash: 1479 tcp_put_md5sig_pool(); 1480 clear_hash_noput: 1481 
memset(md5_hash, 0, 16); 1482 return 1; 1483 } 1484 EXPORT_SYMBOL(tcp_v4_md5_hash_skb); 1485 1486 #endif 1487 1488 static void tcp_v4_init_req(struct request_sock *req, 1489 const struct sock *sk_listener, 1490 struct sk_buff *skb) 1491 { 1492 struct inet_request_sock *ireq = inet_rsk(req); 1493 struct net *net = sock_net(sk_listener); 1494 1495 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); 1496 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); 1497 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb)); 1498 } 1499 1500 static struct dst_entry *tcp_v4_route_req(const struct sock *sk, 1501 struct sk_buff *skb, 1502 struct flowi *fl, 1503 struct request_sock *req) 1504 { 1505 tcp_v4_init_req(req, sk, skb); 1506 1507 if (security_inet_conn_request(sk, skb, req)) 1508 return NULL; 1509 1510 return inet_csk_route_req(sk, &fl->u.ip4, req); 1511 } 1512 1513 struct request_sock_ops tcp_request_sock_ops __read_mostly = { 1514 .family = PF_INET, 1515 .obj_size = sizeof(struct tcp_request_sock), 1516 .rtx_syn_ack = tcp_rtx_synack, 1517 .send_ack = tcp_v4_reqsk_send_ack, 1518 .destructor = tcp_v4_reqsk_destructor, 1519 .send_reset = tcp_v4_send_reset, 1520 .syn_ack_timeout = tcp_syn_ack_timeout, 1521 }; 1522 1523 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { 1524 .mss_clamp = TCP_MSS_DEFAULT, 1525 #ifdef CONFIG_TCP_MD5SIG 1526 .req_md5_lookup = tcp_v4_md5_lookup, 1527 .calc_md5_hash = tcp_v4_md5_hash_skb, 1528 #endif 1529 #ifdef CONFIG_SYN_COOKIES 1530 .cookie_init_seq = cookie_v4_init_sequence, 1531 #endif 1532 .route_req = tcp_v4_route_req, 1533 .init_seq = tcp_v4_init_seq, 1534 .init_ts_off = tcp_v4_init_ts_off, 1535 .send_synack = tcp_v4_send_synack, 1536 }; 1537 1538 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) 1539 { 1540 /* Never answer to SYNs send to broadcast or multicast */ 1541 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) 1542 goto drop; 1543 1544 return tcp_conn_request(&tcp_request_sock_ops, 1545 &tcp_request_sock_ipv4_ops, sk, skb); 1546 1547 drop: 1548 tcp_listendrop(sk); 1549 return 0; 1550 } 1551 EXPORT_SYMBOL(tcp_v4_conn_request); 1552 1553 1554 /* 1555 * The three way handshake has completed - we got a valid synack - 1556 * now create the new socket. 
1557 */ 1558 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, 1559 struct request_sock *req, 1560 struct dst_entry *dst, 1561 struct request_sock *req_unhash, 1562 bool *own_req) 1563 { 1564 struct inet_request_sock *ireq; 1565 bool found_dup_sk = false; 1566 struct inet_sock *newinet; 1567 struct tcp_sock *newtp; 1568 struct sock *newsk; 1569 #ifdef CONFIG_TCP_MD5SIG 1570 const union tcp_md5_addr *addr; 1571 struct tcp_md5sig_key *key; 1572 int l3index; 1573 #endif 1574 struct ip_options_rcu *inet_opt; 1575 1576 if (sk_acceptq_is_full(sk)) 1577 goto exit_overflow; 1578 1579 newsk = tcp_create_openreq_child(sk, req, skb); 1580 if (!newsk) 1581 goto exit_nonewsk; 1582 1583 newsk->sk_gso_type = SKB_GSO_TCPV4; 1584 inet_sk_rx_dst_set(newsk, skb); 1585 1586 newtp = tcp_sk(newsk); 1587 newinet = inet_sk(newsk); 1588 ireq = inet_rsk(req); 1589 sk_daddr_set(newsk, ireq->ir_rmt_addr); 1590 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr); 1591 newsk->sk_bound_dev_if = ireq->ir_iif; 1592 newinet->inet_saddr = ireq->ir_loc_addr; 1593 inet_opt = rcu_dereference(ireq->ireq_opt); 1594 RCU_INIT_POINTER(newinet->inet_opt, inet_opt); 1595 newinet->mc_index = inet_iif(skb); 1596 newinet->mc_ttl = ip_hdr(skb)->ttl; 1597 newinet->rcv_tos = ip_hdr(skb)->tos; 1598 inet_csk(newsk)->icsk_ext_hdr_len = 0; 1599 if (inet_opt) 1600 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 1601 atomic_set(&newinet->inet_id, get_random_u16()); 1602 1603 /* Set ToS of the new socket based upon the value of incoming SYN. 1604 * ECT bits are set later in tcp_init_transfer(). 1605 */ 1606 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)) 1607 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK; 1608 1609 if (!dst) { 1610 dst = inet_csk_route_child_sock(sk, newsk, req); 1611 if (!dst) 1612 goto put_and_exit; 1613 } else { 1614 /* syncookie case : see end of cookie_v4_check() */ 1615 } 1616 sk_setup_caps(newsk, dst); 1617 1618 tcp_ca_openreq_child(newsk, dst); 1619 1620 tcp_sync_mss(newsk, dst_mtu(dst)); 1621 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst)); 1622 1623 tcp_initialize_rcv_mss(newsk); 1624 1625 #ifdef CONFIG_TCP_MD5SIG 1626 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif); 1627 /* Copy over the MD5 key from the original socket */ 1628 addr = (union tcp_md5_addr *)&newinet->inet_daddr; 1629 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1630 if (key) { 1631 if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key)) 1632 goto put_and_exit; 1633 sk_gso_disable(newsk); 1634 } 1635 #endif 1636 1637 if (__inet_inherit_port(sk, newsk) < 0) 1638 goto put_and_exit; 1639 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), 1640 &found_dup_sk); 1641 if (likely(*own_req)) { 1642 tcp_move_syn(newtp, req); 1643 ireq->ireq_opt = NULL; 1644 } else { 1645 newinet->inet_opt = NULL; 1646 1647 if (!req_unhash && found_dup_sk) { 1648 /* This code path should only be executed in the 1649 * syncookie case only 1650 */ 1651 bh_unlock_sock(newsk); 1652 sock_put(newsk); 1653 newsk = NULL; 1654 } 1655 } 1656 return newsk; 1657 1658 exit_overflow: 1659 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); 1660 exit_nonewsk: 1661 dst_release(dst); 1662 exit: 1663 tcp_listendrop(sk); 1664 return NULL; 1665 put_and_exit: 1666 newinet->inet_opt = NULL; 1667 inet_csk_prepare_forced_close(newsk); 1668 tcp_done(newsk); 1669 goto exit; 1670 } 1671 EXPORT_SYMBOL(tcp_v4_syn_recv_sock); 1672 1673 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct 
sk_buff *skb) 1674 { 1675 #ifdef CONFIG_SYN_COOKIES 1676 const struct tcphdr *th = tcp_hdr(skb); 1677 1678 if (!th->syn) 1679 sk = cookie_v4_check(sk, skb); 1680 #endif 1681 return sk; 1682 } 1683 1684 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph, 1685 struct tcphdr *th, u32 *cookie) 1686 { 1687 u16 mss = 0; 1688 #ifdef CONFIG_SYN_COOKIES 1689 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops, 1690 &tcp_request_sock_ipv4_ops, sk, th); 1691 if (mss) { 1692 *cookie = __cookie_v4_init_sequence(iph, th, &mss); 1693 tcp_synq_overflow(sk); 1694 } 1695 #endif 1696 return mss; 1697 } 1698 1699 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, 1700 u32)); 1701 /* The socket must have it's spinlock held when we get 1702 * here, unless it is a TCP_LISTEN socket. 1703 * 1704 * We have a potential double-lock case here, so even when 1705 * doing backlog processing we use the BH locking scheme. 1706 * This is because we cannot sleep with the original spinlock 1707 * held. 1708 */ 1709 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) 1710 { 1711 enum skb_drop_reason reason; 1712 struct sock *rsk; 1713 1714 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ 1715 struct dst_entry *dst; 1716 1717 dst = rcu_dereference_protected(sk->sk_rx_dst, 1718 lockdep_sock_is_held(sk)); 1719 1720 sock_rps_save_rxhash(sk, skb); 1721 sk_mark_napi_id(sk, skb); 1722 if (dst) { 1723 if (sk->sk_rx_dst_ifindex != skb->skb_iif || 1724 !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check, 1725 dst, 0)) { 1726 RCU_INIT_POINTER(sk->sk_rx_dst, NULL); 1727 dst_release(dst); 1728 } 1729 } 1730 tcp_rcv_established(sk, skb); 1731 return 0; 1732 } 1733 1734 reason = SKB_DROP_REASON_NOT_SPECIFIED; 1735 if (tcp_checksum_complete(skb)) 1736 goto csum_err; 1737 1738 if (sk->sk_state == TCP_LISTEN) { 1739 struct sock *nsk = tcp_v4_cookie_check(sk, skb); 1740 1741 if (!nsk) 1742 goto discard; 1743 if (nsk != sk) { 1744 if (tcp_child_process(sk, nsk, skb)) { 1745 rsk = nsk; 1746 goto reset; 1747 } 1748 return 0; 1749 } 1750 } else 1751 sock_rps_save_rxhash(sk, skb); 1752 1753 if (tcp_rcv_state_process(sk, skb)) { 1754 rsk = sk; 1755 goto reset; 1756 } 1757 return 0; 1758 1759 reset: 1760 tcp_v4_send_reset(rsk, skb); 1761 discard: 1762 kfree_skb_reason(skb, reason); 1763 /* Be careful here. If this function gets more complicated and 1764 * gcc suffers from register pressure on the x86, sk (in %ebx) 1765 * might be destroyed here. This current version compiles correctly, 1766 * but you have been warned. 
1767 */ 1768 return 0; 1769 1770 csum_err: 1771 reason = SKB_DROP_REASON_TCP_CSUM; 1772 trace_tcp_bad_csum(skb); 1773 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 1774 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 1775 goto discard; 1776 } 1777 EXPORT_SYMBOL(tcp_v4_do_rcv); 1778 1779 int tcp_v4_early_demux(struct sk_buff *skb) 1780 { 1781 struct net *net = dev_net(skb->dev); 1782 const struct iphdr *iph; 1783 const struct tcphdr *th; 1784 struct sock *sk; 1785 1786 if (skb->pkt_type != PACKET_HOST) 1787 return 0; 1788 1789 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) 1790 return 0; 1791 1792 iph = ip_hdr(skb); 1793 th = tcp_hdr(skb); 1794 1795 if (th->doff < sizeof(struct tcphdr) / 4) 1796 return 0; 1797 1798 sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, 1799 iph->saddr, th->source, 1800 iph->daddr, ntohs(th->dest), 1801 skb->skb_iif, inet_sdif(skb)); 1802 if (sk) { 1803 skb->sk = sk; 1804 skb->destructor = sock_edemux; 1805 if (sk_fullsock(sk)) { 1806 struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst); 1807 1808 if (dst) 1809 dst = dst_check(dst, 0); 1810 if (dst && 1811 sk->sk_rx_dst_ifindex == skb->skb_iif) 1812 skb_dst_set_noref(skb, dst); 1813 } 1814 } 1815 return 0; 1816 } 1817 1818 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, 1819 enum skb_drop_reason *reason) 1820 { 1821 u32 limit, tail_gso_size, tail_gso_segs; 1822 struct skb_shared_info *shinfo; 1823 const struct tcphdr *th; 1824 struct tcphdr *thtail; 1825 struct sk_buff *tail; 1826 unsigned int hdrlen; 1827 bool fragstolen; 1828 u32 gso_segs; 1829 u32 gso_size; 1830 int delta; 1831 1832 /* In case all data was pulled from skb frags (in __pskb_pull_tail()), 1833 * we can fix skb->truesize to its real value to avoid future drops. 1834 * This is valid because skb is not yet charged to the socket. 1835 * It has been noticed pure SACK packets were sometimes dropped 1836 * (if cooked by drivers without copybreak feature). 1837 */ 1838 skb_condense(skb); 1839 1840 skb_dst_drop(skb); 1841 1842 if (unlikely(tcp_checksum_complete(skb))) { 1843 bh_unlock_sock(sk); 1844 trace_tcp_bad_csum(skb); 1845 *reason = SKB_DROP_REASON_TCP_CSUM; 1846 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 1847 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 1848 return true; 1849 } 1850 1851 /* Attempt coalescing to last skb in backlog, even if we are 1852 * above the limits. 1853 * This is okay because skb capacity is limited to MAX_SKB_FRAGS. 
1854 */ 1855 th = (const struct tcphdr *)skb->data; 1856 hdrlen = th->doff * 4; 1857 1858 tail = sk->sk_backlog.tail; 1859 if (!tail) 1860 goto no_coalesce; 1861 thtail = (struct tcphdr *)tail->data; 1862 1863 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq || 1864 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield || 1865 ((TCP_SKB_CB(tail)->tcp_flags | 1866 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) || 1867 !((TCP_SKB_CB(tail)->tcp_flags & 1868 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) || 1869 ((TCP_SKB_CB(tail)->tcp_flags ^ 1870 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) || 1871 #ifdef CONFIG_TLS_DEVICE 1872 tail->decrypted != skb->decrypted || 1873 #endif 1874 !mptcp_skb_can_collapse(tail, skb) || 1875 thtail->doff != th->doff || 1876 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th))) 1877 goto no_coalesce; 1878 1879 __skb_pull(skb, hdrlen); 1880 1881 shinfo = skb_shinfo(skb); 1882 gso_size = shinfo->gso_size ?: skb->len; 1883 gso_segs = shinfo->gso_segs ?: 1; 1884 1885 shinfo = skb_shinfo(tail); 1886 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen); 1887 tail_gso_segs = shinfo->gso_segs ?: 1; 1888 1889 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) { 1890 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq; 1891 1892 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) { 1893 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq; 1894 thtail->window = th->window; 1895 } 1896 1897 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and 1898 * thtail->fin, so that the fast path in tcp_rcv_established() 1899 * is not entered if we append a packet with a FIN. 1900 * SYN, RST, URG are not present. 1901 * ACK is set on both packets. 1902 * PSH : we do not really care in TCP stack, 1903 * at least for 'GRO' packets. 1904 */ 1905 thtail->fin |= th->fin; 1906 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; 1907 1908 if (TCP_SKB_CB(skb)->has_rxtstamp) { 1909 TCP_SKB_CB(tail)->has_rxtstamp = true; 1910 tail->tstamp = skb->tstamp; 1911 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp; 1912 } 1913 1914 /* Not as strict as GRO. We only need to carry mss max value */ 1915 shinfo->gso_size = max(gso_size, tail_gso_size); 1916 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF); 1917 1918 sk->sk_backlog.len += delta; 1919 __NET_INC_STATS(sock_net(sk), 1920 LINUX_MIB_TCPBACKLOGCOALESCE); 1921 kfree_skb_partial(skb, fragstolen); 1922 return false; 1923 } 1924 __skb_push(skb, hdrlen); 1925 1926 no_coalesce: 1927 limit = (u32)READ_ONCE(sk->sk_rcvbuf) + (u32)(READ_ONCE(sk->sk_sndbuf) >> 1); 1928 1929 /* Only socket owner can try to collapse/prune rx queues 1930 * to reduce memory overhead, so add a little headroom here. 1931 * Few sockets backlog are possibly concurrently non empty. 
*/ 1933 limit += 64 * 1024; 1934 1935 if (unlikely(sk_add_backlog(sk, skb, limit))) { 1936 bh_unlock_sock(sk); 1937 *reason = SKB_DROP_REASON_SOCKET_BACKLOG; 1938 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP); 1939 return true; 1940 } 1941 return false; 1942 } 1943 EXPORT_SYMBOL(tcp_add_backlog); 1944 1945 int tcp_filter(struct sock *sk, struct sk_buff *skb) 1946 { 1947 struct tcphdr *th = (struct tcphdr *)skb->data; 1948 1949 return sk_filter_trim_cap(sk, skb, th->doff * 4); 1950 } 1951 EXPORT_SYMBOL(tcp_filter); 1952 1953 static void tcp_v4_restore_cb(struct sk_buff *skb) 1954 { 1955 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4, 1956 sizeof(struct inet_skb_parm)); 1957 } 1958 1959 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, 1960 const struct tcphdr *th) 1961 { 1962 /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB(); 1963 * barrier() makes sure the compiler won't play aliasing games. 1964 */ 1965 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb), 1966 sizeof(struct inet_skb_parm)); 1967 barrier(); 1968 1969 TCP_SKB_CB(skb)->seq = ntohl(th->seq); 1970 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + 1971 skb->len - th->doff * 4); 1972 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); 1973 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); 1974 TCP_SKB_CB(skb)->tcp_tw_isn = 0; 1975 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); 1976 TCP_SKB_CB(skb)->sacked = 0; 1977 TCP_SKB_CB(skb)->has_rxtstamp = 1978 skb->tstamp || skb_hwtstamps(skb)->hwtstamp; 1979 } 1980 1981 /* 1982 * From tcp_input.c 1983 */ 1984 1985 int tcp_v4_rcv(struct sk_buff *skb) 1986 { 1987 struct net *net = dev_net(skb->dev); 1988 enum skb_drop_reason drop_reason; 1989 int sdif = inet_sdif(skb); 1990 int dif = inet_iif(skb); 1991 const struct iphdr *iph; 1992 const struct tcphdr *th; 1993 bool refcounted; 1994 struct sock *sk; 1995 int ret; 1996 1997 drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; 1998 if (skb->pkt_type != PACKET_HOST) 1999 goto discard_it; 2000 2001 /* Count it even if it's bad */ 2002 __TCP_INC_STATS(net, TCP_MIB_INSEGS); 2003 2004 if (!pskb_may_pull(skb, sizeof(struct tcphdr))) 2005 goto discard_it; 2006 2007 th = (const struct tcphdr *)skb->data; 2008 2009 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) { 2010 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL; 2011 goto bad_packet; 2012 } 2013 if (!pskb_may_pull(skb, th->doff * 4)) 2014 goto discard_it; 2015 2016 /* A note on the checks we skip here: packet length and doff are 2017 * validated later by header prediction, provided the th->doff == 0 2018 * case has been eliminated, which the test above guarantees. 2019 * So, we defer those checks.
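* skb_checksum_init() below either validates the checksum using hardware assistance or primes skb->csum with the pseudo-header sum so that tcp_checksum_complete() can finish the verification later, once a socket has been found.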
*/ 2020 2021 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo)) 2022 goto csum_error; 2023 2024 th = (const struct tcphdr *)skb->data; 2025 iph = ip_hdr(skb); 2026 lookup: 2027 sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo, 2028 skb, __tcp_hdrlen(th), th->source, 2029 th->dest, sdif, &refcounted); 2030 if (!sk) 2031 goto no_tcp_socket; 2032 2033 process: 2034 if (sk->sk_state == TCP_TIME_WAIT) 2035 goto do_time_wait; 2036 2037 if (sk->sk_state == TCP_NEW_SYN_RECV) { 2038 struct request_sock *req = inet_reqsk(sk); 2039 bool req_stolen = false; 2040 struct sock *nsk; 2041 2042 sk = req->rsk_listener; 2043 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) 2044 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2045 else 2046 drop_reason = tcp_inbound_md5_hash(sk, skb, 2047 &iph->saddr, &iph->daddr, 2048 AF_INET, dif, sdif); 2049 if (unlikely(drop_reason)) { 2050 sk_drops_add(sk, skb); 2051 reqsk_put(req); 2052 goto discard_it; 2053 } 2054 if (tcp_checksum_complete(skb)) { 2055 reqsk_put(req); 2056 goto csum_error; 2057 } 2058 if (unlikely(sk->sk_state != TCP_LISTEN)) { 2059 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb); 2060 if (!nsk) { 2061 inet_csk_reqsk_queue_drop_and_put(sk, req); 2062 goto lookup; 2063 } 2064 sk = nsk; 2065 /* reuseport_migrate_sock() has already held one sk_refcnt 2066 * before returning. 2067 */ 2068 } else { 2069 /* We own a reference on the listener, increase it again 2070 * as we might lose it too soon. 2071 */ 2072 sock_hold(sk); 2073 } 2074 refcounted = true; 2075 nsk = NULL; 2076 if (!tcp_filter(sk, skb)) { 2077 th = (const struct tcphdr *)skb->data; 2078 iph = ip_hdr(skb); 2079 tcp_v4_fill_cb(skb, iph, th); 2080 nsk = tcp_check_req(sk, skb, req, false, &req_stolen); 2081 } else { 2082 drop_reason = SKB_DROP_REASON_SOCKET_FILTER; 2083 } 2084 if (!nsk) { 2085 reqsk_put(req); 2086 if (req_stolen) { 2087 /* Another cpu got exclusive access to req 2088 * and created a full blown socket. 2089 * Try to feed this packet to this socket 2090 * instead of discarding it. 
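* tcp_v4_restore_cb() below puts the original IPCB() back in place (tcp_v4_fill_cb() overwrote it) before we drop our listener reference and retry the lookup.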
2091 */ 2092 tcp_v4_restore_cb(skb); 2093 sock_put(sk); 2094 goto lookup; 2095 } 2096 goto discard_and_relse; 2097 } 2098 nf_reset_ct(skb); 2099 if (nsk == sk) { 2100 reqsk_put(req); 2101 tcp_v4_restore_cb(skb); 2102 } else if (tcp_child_process(sk, nsk, skb)) { 2103 tcp_v4_send_reset(nsk, skb); 2104 goto discard_and_relse; 2105 } else { 2106 sock_put(sk); 2107 return 0; 2108 } 2109 } 2110 2111 if (static_branch_unlikely(&ip4_min_ttl)) { 2112 /* min_ttl can be changed concurrently from do_ip_setsockopt() */ 2113 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) { 2114 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 2115 drop_reason = SKB_DROP_REASON_TCP_MINTTL; 2116 goto discard_and_relse; 2117 } 2118 } 2119 2120 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { 2121 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2122 goto discard_and_relse; 2123 } 2124 2125 drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr, 2126 &iph->daddr, AF_INET, dif, sdif); 2127 if (drop_reason) 2128 goto discard_and_relse; 2129 2130 nf_reset_ct(skb); 2131 2132 if (tcp_filter(sk, skb)) { 2133 drop_reason = SKB_DROP_REASON_SOCKET_FILTER; 2134 goto discard_and_relse; 2135 } 2136 th = (const struct tcphdr *)skb->data; 2137 iph = ip_hdr(skb); 2138 tcp_v4_fill_cb(skb, iph, th); 2139 2140 skb->dev = NULL; 2141 2142 if (sk->sk_state == TCP_LISTEN) { 2143 ret = tcp_v4_do_rcv(sk, skb); 2144 goto put_and_return; 2145 } 2146 2147 sk_incoming_cpu_update(sk); 2148 2149 bh_lock_sock_nested(sk); 2150 tcp_segs_in(tcp_sk(sk), skb); 2151 ret = 0; 2152 if (!sock_owned_by_user(sk)) { 2153 ret = tcp_v4_do_rcv(sk, skb); 2154 } else { 2155 if (tcp_add_backlog(sk, skb, &drop_reason)) 2156 goto discard_and_relse; 2157 } 2158 bh_unlock_sock(sk); 2159 2160 put_and_return: 2161 if (refcounted) 2162 sock_put(sk); 2163 2164 return ret; 2165 2166 no_tcp_socket: 2167 drop_reason = SKB_DROP_REASON_NO_SOCKET; 2168 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) 2169 goto discard_it; 2170 2171 tcp_v4_fill_cb(skb, iph, th); 2172 2173 if (tcp_checksum_complete(skb)) { 2174 csum_error: 2175 drop_reason = SKB_DROP_REASON_TCP_CSUM; 2176 trace_tcp_bad_csum(skb); 2177 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); 2178 bad_packet: 2179 __TCP_INC_STATS(net, TCP_MIB_INERRS); 2180 } else { 2181 tcp_v4_send_reset(NULL, skb); 2182 } 2183 2184 discard_it: 2185 SKB_DR_OR(drop_reason, NOT_SPECIFIED); 2186 /* Discard frame. 
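* kfree_skb_reason() feeds drop_reason to the skb drop tracepoint, so the dropped frame stays visible to drop monitoring.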
*/ 2187 kfree_skb_reason(skb, drop_reason); 2188 return 0; 2189 2190 discard_and_relse: 2191 sk_drops_add(sk, skb); 2192 if (refcounted) 2193 sock_put(sk); 2194 goto discard_it; 2195 2196 do_time_wait: 2197 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { 2198 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2199 inet_twsk_put(inet_twsk(sk)); 2200 goto discard_it; 2201 } 2202 2203 tcp_v4_fill_cb(skb, iph, th); 2204 2205 if (tcp_checksum_complete(skb)) { 2206 inet_twsk_put(inet_twsk(sk)); 2207 goto csum_error; 2208 } 2209 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) { 2210 case TCP_TW_SYN: { 2211 struct sock *sk2 = inet_lookup_listener(net, 2212 net->ipv4.tcp_death_row.hashinfo, 2213 skb, __tcp_hdrlen(th), 2214 iph->saddr, th->source, 2215 iph->daddr, th->dest, 2216 inet_iif(skb), 2217 sdif); 2218 if (sk2) { 2219 inet_twsk_deschedule_put(inet_twsk(sk)); 2220 sk = sk2; 2221 tcp_v4_restore_cb(skb); 2222 refcounted = false; 2223 goto process; 2224 } 2225 } 2226 /* to ACK */ 2227 fallthrough; 2228 case TCP_TW_ACK: 2229 tcp_v4_timewait_ack(sk, skb); 2230 break; 2231 case TCP_TW_RST: 2232 tcp_v4_send_reset(sk, skb); 2233 inet_twsk_deschedule_put(inet_twsk(sk)); 2234 goto discard_it; 2235 case TCP_TW_SUCCESS:; 2236 } 2237 goto discard_it; 2238 } 2239 2240 static struct timewait_sock_ops tcp_timewait_sock_ops = { 2241 .twsk_obj_size = sizeof(struct tcp_timewait_sock), 2242 .twsk_unique = tcp_twsk_unique, 2243 .twsk_destructor= tcp_twsk_destructor, 2244 }; 2245 2246 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) 2247 { 2248 struct dst_entry *dst = skb_dst(skb); 2249 2250 if (dst && dst_hold_safe(dst)) { 2251 rcu_assign_pointer(sk->sk_rx_dst, dst); 2252 sk->sk_rx_dst_ifindex = skb->skb_iif; 2253 } 2254 } 2255 EXPORT_SYMBOL(inet_sk_rx_dst_set); 2256 2257 const struct inet_connection_sock_af_ops ipv4_specific = { 2258 .queue_xmit = ip_queue_xmit, 2259 .send_check = tcp_v4_send_check, 2260 .rebuild_header = inet_sk_rebuild_header, 2261 .sk_rx_dst_set = inet_sk_rx_dst_set, 2262 .conn_request = tcp_v4_conn_request, 2263 .syn_recv_sock = tcp_v4_syn_recv_sock, 2264 .net_header_len = sizeof(struct iphdr), 2265 .setsockopt = ip_setsockopt, 2266 .getsockopt = ip_getsockopt, 2267 .addr2sockaddr = inet_csk_addr2sockaddr, 2268 .sockaddr_len = sizeof(struct sockaddr_in), 2269 .mtu_reduced = tcp_v4_mtu_reduced, 2270 }; 2271 EXPORT_SYMBOL(ipv4_specific); 2272 2273 #ifdef CONFIG_TCP_MD5SIG 2274 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { 2275 .md5_lookup = tcp_v4_md5_lookup, 2276 .calc_md5_hash = tcp_v4_md5_hash_skb, 2277 .md5_parse = tcp_v4_parse_md5_keys, 2278 }; 2279 #endif 2280 2281 /* NOTE: A lot of things set to zero explicitly by call to 2282 * sk_alloc() so need not be done here. 2283 */ 2284 static int tcp_v4_init_sock(struct sock *sk) 2285 { 2286 struct inet_connection_sock *icsk = inet_csk(sk); 2287 2288 tcp_init_sock(sk); 2289 2290 icsk->icsk_af_ops = &ipv4_specific; 2291 2292 #ifdef CONFIG_TCP_MD5SIG 2293 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific; 2294 #endif 2295 2296 return 0; 2297 } 2298 2299 void tcp_v4_destroy_sock(struct sock *sk) 2300 { 2301 struct tcp_sock *tp = tcp_sk(sk); 2302 2303 trace_tcp_destroy_sock(sk); 2304 2305 tcp_clear_xmit_timers(sk); 2306 2307 tcp_cleanup_congestion_control(sk); 2308 2309 tcp_cleanup_ulp(sk); 2310 2311 /* Cleanup up the write buffer. 
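* (both the send queue and the retransmit rb-tree are purged here).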
*/ 2312 tcp_write_queue_purge(sk); 2313 2314 /* Check if we want to disable active TFO */ 2315 tcp_fastopen_active_disable_ofo_check(sk); 2316 2317 /* Cleans up our, hopefully empty, out_of_order_queue. */ 2318 skb_rbtree_purge(&tp->out_of_order_queue); 2319 2320 #ifdef CONFIG_TCP_MD5SIG 2321 /* Clean up the MD5 key list, if any */ 2322 if (tp->md5sig_info) { 2323 tcp_clear_md5_list(sk); 2324 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu); 2325 tp->md5sig_info = NULL; 2326 static_branch_slow_dec_deferred(&tcp_md5_needed); 2327 } 2328 #endif 2329 2330 /* Clean up a referenced TCP bind bucket. */ 2331 if (inet_csk(sk)->icsk_bind_hash) 2332 inet_put_port(sk); 2333 2334 BUG_ON(rcu_access_pointer(tp->fastopen_rsk)); 2335 2336 /* If socket is aborted during connect operation */ 2337 tcp_free_fastopen_req(tp); 2338 tcp_fastopen_destroy_cipher(sk); 2339 tcp_saved_syn_free(tp); 2340 2341 sk_sockets_allocated_dec(sk); 2342 } 2343 EXPORT_SYMBOL(tcp_v4_destroy_sock); 2344 2345 #ifdef CONFIG_PROC_FS 2346 /* Proc filesystem TCP sock list dumping. */ 2347 2348 static unsigned short seq_file_family(const struct seq_file *seq); 2349 2350 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk) 2351 { 2352 unsigned short family = seq_file_family(seq); 2353 2354 /* AF_UNSPEC is used as a match all */ 2355 return ((family == AF_UNSPEC || family == sk->sk_family) && 2356 net_eq(sock_net(sk), seq_file_net(seq))); 2357 } 2358 2359 /* Find a non empty bucket (starting from st->bucket) 2360 * and return the first sk from it. 2361 */ 2362 static void *listening_get_first(struct seq_file *seq) 2363 { 2364 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2365 struct tcp_iter_state *st = seq->private; 2366 2367 st->offset = 0; 2368 for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) { 2369 struct inet_listen_hashbucket *ilb2; 2370 struct hlist_nulls_node *node; 2371 struct sock *sk; 2372 2373 ilb2 = &hinfo->lhash2[st->bucket]; 2374 if (hlist_nulls_empty(&ilb2->nulls_head)) 2375 continue; 2376 2377 spin_lock(&ilb2->lock); 2378 sk_nulls_for_each(sk, node, &ilb2->nulls_head) { 2379 if (seq_sk_match(seq, sk)) 2380 return sk; 2381 } 2382 spin_unlock(&ilb2->lock); 2383 } 2384 2385 return NULL; 2386 } 2387 2388 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket). 2389 * If "cur" is the last one in the st->bucket, 2390 * call listening_get_first() to return the first sk of the next 2391 * non empty bucket. 
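* The current bucket's lock is released before moving on; listening_get_first() takes the next bucket's lock itself.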
2392 */ 2393 static void *listening_get_next(struct seq_file *seq, void *cur) 2394 { 2395 struct tcp_iter_state *st = seq->private; 2396 struct inet_listen_hashbucket *ilb2; 2397 struct hlist_nulls_node *node; 2398 struct inet_hashinfo *hinfo; 2399 struct sock *sk = cur; 2400 2401 ++st->num; 2402 ++st->offset; 2403 2404 sk = sk_nulls_next(sk); 2405 sk_nulls_for_each_from(sk, node) { 2406 if (seq_sk_match(seq, sk)) 2407 return sk; 2408 } 2409 2410 hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2411 ilb2 = &hinfo->lhash2[st->bucket]; 2412 spin_unlock(&ilb2->lock); 2413 ++st->bucket; 2414 return listening_get_first(seq); 2415 } 2416 2417 static void *listening_get_idx(struct seq_file *seq, loff_t *pos) 2418 { 2419 struct tcp_iter_state *st = seq->private; 2420 void *rc; 2421 2422 st->bucket = 0; 2423 st->offset = 0; 2424 rc = listening_get_first(seq); 2425 2426 while (rc && *pos) { 2427 rc = listening_get_next(seq, rc); 2428 --*pos; 2429 } 2430 return rc; 2431 } 2432 2433 static inline bool empty_bucket(struct inet_hashinfo *hinfo, 2434 const struct tcp_iter_state *st) 2435 { 2436 return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain); 2437 } 2438 2439 /* 2440 * Get first established socket starting from bucket given in st->bucket. 2441 * If st->bucket is zero, the very first socket in the hash is returned. 2442 */ 2443 static void *established_get_first(struct seq_file *seq) 2444 { 2445 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2446 struct tcp_iter_state *st = seq->private; 2447 2448 st->offset = 0; 2449 for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) { 2450 struct sock *sk; 2451 struct hlist_nulls_node *node; 2452 spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket); 2453 2454 cond_resched(); 2455 2456 /* Lockless fast path for the common case of empty buckets */ 2457 if (empty_bucket(hinfo, st)) 2458 continue; 2459 2460 spin_lock_bh(lock); 2461 sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) { 2462 if (seq_sk_match(seq, sk)) 2463 return sk; 2464 } 2465 spin_unlock_bh(lock); 2466 } 2467 2468 return NULL; 2469 } 2470 2471 static void *established_get_next(struct seq_file *seq, void *cur) 2472 { 2473 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2474 struct tcp_iter_state *st = seq->private; 2475 struct hlist_nulls_node *node; 2476 struct sock *sk = cur; 2477 2478 ++st->num; 2479 ++st->offset; 2480 2481 sk = sk_nulls_next(sk); 2482 2483 sk_nulls_for_each_from(sk, node) { 2484 if (seq_sk_match(seq, sk)) 2485 return sk; 2486 } 2487 2488 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 2489 ++st->bucket; 2490 return established_get_first(seq); 2491 } 2492 2493 static void *established_get_idx(struct seq_file *seq, loff_t pos) 2494 { 2495 struct tcp_iter_state *st = seq->private; 2496 void *rc; 2497 2498 st->bucket = 0; 2499 rc = established_get_first(seq); 2500 2501 while (rc && pos) { 2502 rc = established_get_next(seq, rc); 2503 --pos; 2504 } 2505 return rc; 2506 } 2507 2508 static void *tcp_get_idx(struct seq_file *seq, loff_t pos) 2509 { 2510 void *rc; 2511 struct tcp_iter_state *st = seq->private; 2512 2513 st->state = TCP_SEQ_STATE_LISTENING; 2514 rc = listening_get_idx(seq, &pos); 2515 2516 if (!rc) { 2517 st->state = TCP_SEQ_STATE_ESTABLISHED; 2518 rc = established_get_idx(seq, pos); 2519 } 2520 2521 return rc; 2522 } 2523 2524 static void *tcp_seek_last_pos(struct seq_file *seq) 2525 { 2526 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2527 struct 
tcp_iter_state *st = seq->private; 2528 int bucket = st->bucket; 2529 int offset = st->offset; 2530 int orig_num = st->num; 2531 void *rc = NULL; 2532 2533 switch (st->state) { 2534 case TCP_SEQ_STATE_LISTENING: 2535 if (st->bucket > hinfo->lhash2_mask) 2536 break; 2537 rc = listening_get_first(seq); 2538 while (offset-- && rc && bucket == st->bucket) 2539 rc = listening_get_next(seq, rc); 2540 if (rc) 2541 break; 2542 st->bucket = 0; 2543 st->state = TCP_SEQ_STATE_ESTABLISHED; 2544 fallthrough; 2545 case TCP_SEQ_STATE_ESTABLISHED: 2546 if (st->bucket > hinfo->ehash_mask) 2547 break; 2548 rc = established_get_first(seq); 2549 while (offset-- && rc && bucket == st->bucket) 2550 rc = established_get_next(seq, rc); 2551 } 2552 2553 st->num = orig_num; 2554 2555 return rc; 2556 } 2557 2558 void *tcp_seq_start(struct seq_file *seq, loff_t *pos) 2559 { 2560 struct tcp_iter_state *st = seq->private; 2561 void *rc; 2562 2563 if (*pos && *pos == st->last_pos) { 2564 rc = tcp_seek_last_pos(seq); 2565 if (rc) 2566 goto out; 2567 } 2568 2569 st->state = TCP_SEQ_STATE_LISTENING; 2570 st->num = 0; 2571 st->bucket = 0; 2572 st->offset = 0; 2573 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; 2574 2575 out: 2576 st->last_pos = *pos; 2577 return rc; 2578 } 2579 EXPORT_SYMBOL(tcp_seq_start); 2580 2581 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2582 { 2583 struct tcp_iter_state *st = seq->private; 2584 void *rc = NULL; 2585 2586 if (v == SEQ_START_TOKEN) { 2587 rc = tcp_get_idx(seq, 0); 2588 goto out; 2589 } 2590 2591 switch (st->state) { 2592 case TCP_SEQ_STATE_LISTENING: 2593 rc = listening_get_next(seq, v); 2594 if (!rc) { 2595 st->state = TCP_SEQ_STATE_ESTABLISHED; 2596 st->bucket = 0; 2597 st->offset = 0; 2598 rc = established_get_first(seq); 2599 } 2600 break; 2601 case TCP_SEQ_STATE_ESTABLISHED: 2602 rc = established_get_next(seq, v); 2603 break; 2604 } 2605 out: 2606 ++*pos; 2607 st->last_pos = *pos; 2608 return rc; 2609 } 2610 EXPORT_SYMBOL(tcp_seq_next); 2611 2612 void tcp_seq_stop(struct seq_file *seq, void *v) 2613 { 2614 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2615 struct tcp_iter_state *st = seq->private; 2616 2617 switch (st->state) { 2618 case TCP_SEQ_STATE_LISTENING: 2619 if (v != SEQ_START_TOKEN) 2620 spin_unlock(&hinfo->lhash2[st->bucket].lock); 2621 break; 2622 case TCP_SEQ_STATE_ESTABLISHED: 2623 if (v) 2624 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 2625 break; 2626 } 2627 } 2628 EXPORT_SYMBOL(tcp_seq_stop); 2629 2630 static void get_openreq4(const struct request_sock *req, 2631 struct seq_file *f, int i) 2632 { 2633 const struct inet_request_sock *ireq = inet_rsk(req); 2634 long delta = req->rsk_timer.expires - jiffies; 2635 2636 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2637 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK", 2638 i, 2639 ireq->ir_loc_addr, 2640 ireq->ir_num, 2641 ireq->ir_rmt_addr, 2642 ntohs(ireq->ir_rmt_port), 2643 TCP_SYN_RECV, 2644 0, 0, /* could print option size, but that is af dependent. 
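* (these two zeroes stand in for the tx_queue:rx_queue column, which is not tracked for request socks)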
*/ 2645 1, /* timers active (only the expire timer) */ 2646 jiffies_delta_to_clock_t(delta), 2647 req->num_timeout, 2648 from_kuid_munged(seq_user_ns(f), 2649 sock_i_uid(req->rsk_listener)), 2650 0, /* non standard timer */ 2651 0, /* open_requests have no inode */ 2652 0, 2653 req); 2654 } 2655 2656 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) 2657 { 2658 int timer_active; 2659 unsigned long timer_expires; 2660 const struct tcp_sock *tp = tcp_sk(sk); 2661 const struct inet_connection_sock *icsk = inet_csk(sk); 2662 const struct inet_sock *inet = inet_sk(sk); 2663 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; 2664 __be32 dest = inet->inet_daddr; 2665 __be32 src = inet->inet_rcv_saddr; 2666 __u16 destp = ntohs(inet->inet_dport); 2667 __u16 srcp = ntohs(inet->inet_sport); 2668 int rx_queue; 2669 int state; 2670 2671 if (icsk->icsk_pending == ICSK_TIME_RETRANS || 2672 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || 2673 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { 2674 timer_active = 1; 2675 timer_expires = icsk->icsk_timeout; 2676 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { 2677 timer_active = 4; 2678 timer_expires = icsk->icsk_timeout; 2679 } else if (timer_pending(&sk->sk_timer)) { 2680 timer_active = 2; 2681 timer_expires = sk->sk_timer.expires; 2682 } else { 2683 timer_active = 0; 2684 timer_expires = jiffies; 2685 } 2686 2687 state = inet_sk_state_load(sk); 2688 if (state == TCP_LISTEN) 2689 rx_queue = READ_ONCE(sk->sk_ack_backlog); 2690 else 2691 /* Because we don't lock the socket, 2692 * we might find a transient negative value. 2693 */ 2694 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) - 2695 READ_ONCE(tp->copied_seq), 0); 2696 2697 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " 2698 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d", 2699 i, src, srcp, dest, destp, state, 2700 READ_ONCE(tp->write_seq) - tp->snd_una, 2701 rx_queue, 2702 timer_active, 2703 jiffies_delta_to_clock_t(timer_expires - jiffies), 2704 icsk->icsk_retransmits, 2705 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)), 2706 icsk->icsk_probes_out, 2707 sock_i_ino(sk), 2708 refcount_read(&sk->sk_refcnt), sk, 2709 jiffies_to_clock_t(icsk->icsk_rto), 2710 jiffies_to_clock_t(icsk->icsk_ack.ato), 2711 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk), 2712 tcp_snd_cwnd(tp), 2713 state == TCP_LISTEN ? 2714 fastopenq->max_qlen : 2715 (tcp_in_initial_slowstart(tp) ? 
-1 : tp->snd_ssthresh)); 2716 } 2717 2718 static void get_timewait4_sock(const struct inet_timewait_sock *tw, 2719 struct seq_file *f, int i) 2720 { 2721 long delta = tw->tw_timer.expires - jiffies; 2722 __be32 dest, src; 2723 __u16 destp, srcp; 2724 2725 dest = tw->tw_daddr; 2726 src = tw->tw_rcv_saddr; 2727 destp = ntohs(tw->tw_dport); 2728 srcp = ntohs(tw->tw_sport); 2729 2730 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2731 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK", 2732 i, src, srcp, dest, destp, tw->tw_substate, 0, 0, 2733 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, 2734 refcount_read(&tw->tw_refcnt), tw); 2735 } 2736 2737 #define TMPSZ 150 2738 2739 static int tcp4_seq_show(struct seq_file *seq, void *v) 2740 { 2741 struct tcp_iter_state *st; 2742 struct sock *sk = v; 2743 2744 seq_setwidth(seq, TMPSZ - 1); 2745 if (v == SEQ_START_TOKEN) { 2746 seq_puts(seq, " sl local_address rem_address st tx_queue " 2747 "rx_queue tr tm->when retrnsmt uid timeout " 2748 "inode"); 2749 goto out; 2750 } 2751 st = seq->private; 2752 2753 if (sk->sk_state == TCP_TIME_WAIT) 2754 get_timewait4_sock(v, seq, st->num); 2755 else if (sk->sk_state == TCP_NEW_SYN_RECV) 2756 get_openreq4(v, seq, st->num); 2757 else 2758 get_tcp4_sock(v, seq, st->num); 2759 out: 2760 seq_pad(seq, '\n'); 2761 return 0; 2762 } 2763 2764 #ifdef CONFIG_BPF_SYSCALL 2765 struct bpf_tcp_iter_state { 2766 struct tcp_iter_state state; 2767 unsigned int cur_sk; 2768 unsigned int end_sk; 2769 unsigned int max_sk; 2770 struct sock **batch; 2771 bool st_bucket_done; 2772 }; 2773 2774 struct bpf_iter__tcp { 2775 __bpf_md_ptr(struct bpf_iter_meta *, meta); 2776 __bpf_md_ptr(struct sock_common *, sk_common); 2777 uid_t uid __aligned(8); 2778 }; 2779 2780 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, 2781 struct sock_common *sk_common, uid_t uid) 2782 { 2783 struct bpf_iter__tcp ctx; 2784 2785 meta->seq_num--; /* skip SEQ_START_TOKEN */ 2786 ctx.meta = meta; 2787 ctx.sk_common = sk_common; 2788 ctx.uid = uid; 2789 return bpf_iter_run_prog(prog, &ctx); 2790 } 2791 2792 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter) 2793 { 2794 while (iter->cur_sk < iter->end_sk) 2795 sock_gen_put(iter->batch[iter->cur_sk++]); 2796 } 2797 2798 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter, 2799 unsigned int new_batch_sz) 2800 { 2801 struct sock **new_batch; 2802 2803 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, 2804 GFP_USER | __GFP_NOWARN); 2805 if (!new_batch) 2806 return -ENOMEM; 2807 2808 bpf_iter_tcp_put_batch(iter); 2809 kvfree(iter->batch); 2810 iter->batch = new_batch; 2811 iter->max_sk = new_batch_sz; 2812 2813 return 0; 2814 } 2815 2816 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq, 2817 struct sock *start_sk) 2818 { 2819 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2820 struct bpf_tcp_iter_state *iter = seq->private; 2821 struct tcp_iter_state *st = &iter->state; 2822 struct hlist_nulls_node *node; 2823 unsigned int expected = 1; 2824 struct sock *sk; 2825 2826 sock_hold(start_sk); 2827 iter->batch[iter->end_sk++] = start_sk; 2828 2829 sk = sk_nulls_next(start_sk); 2830 sk_nulls_for_each_from(sk, node) { 2831 if (seq_sk_match(seq, sk)) { 2832 if (iter->end_sk < iter->max_sk) { 2833 sock_hold(sk); 2834 iter->batch[iter->end_sk++] = sk; 2835 } 2836 expected++; 2837 } 2838 } 2839 spin_unlock(&hinfo->lhash2[st->bucket].lock); 2840 2841 return expected; 2842 } 2843 2844 static unsigned int 
bpf_iter_tcp_established_batch(struct seq_file *seq, 2845 struct sock *start_sk) 2846 { 2847 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2848 struct bpf_tcp_iter_state *iter = seq->private; 2849 struct tcp_iter_state *st = &iter->state; 2850 struct hlist_nulls_node *node; 2851 unsigned int expected = 1; 2852 struct sock *sk; 2853 2854 sock_hold(start_sk); 2855 iter->batch[iter->end_sk++] = start_sk; 2856 2857 sk = sk_nulls_next(start_sk); 2858 sk_nulls_for_each_from(sk, node) { 2859 if (seq_sk_match(seq, sk)) { 2860 if (iter->end_sk < iter->max_sk) { 2861 sock_hold(sk); 2862 iter->batch[iter->end_sk++] = sk; 2863 } 2864 expected++; 2865 } 2866 } 2867 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); 2868 2869 return expected; 2870 } 2871 2872 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq) 2873 { 2874 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; 2875 struct bpf_tcp_iter_state *iter = seq->private; 2876 struct tcp_iter_state *st = &iter->state; 2877 unsigned int expected; 2878 bool resized = false; 2879 struct sock *sk; 2880 2881 /* The st->bucket is done. Directly advance to the next 2882 * bucket instead of having the tcp_seek_last_pos() to skip 2883 * one by one in the current bucket and eventually find out 2884 * it has to advance to the next bucket. 2885 */ 2886 if (iter->st_bucket_done) { 2887 st->offset = 0; 2888 st->bucket++; 2889 if (st->state == TCP_SEQ_STATE_LISTENING && 2890 st->bucket > hinfo->lhash2_mask) { 2891 st->state = TCP_SEQ_STATE_ESTABLISHED; 2892 st->bucket = 0; 2893 } 2894 } 2895 2896 again: 2897 /* Get a new batch */ 2898 iter->cur_sk = 0; 2899 iter->end_sk = 0; 2900 iter->st_bucket_done = false; 2901 2902 sk = tcp_seek_last_pos(seq); 2903 if (!sk) 2904 return NULL; /* Done */ 2905 2906 if (st->state == TCP_SEQ_STATE_LISTENING) 2907 expected = bpf_iter_tcp_listening_batch(seq, sk); 2908 else 2909 expected = bpf_iter_tcp_established_batch(seq, sk); 2910 2911 if (iter->end_sk == expected) { 2912 iter->st_bucket_done = true; 2913 return sk; 2914 } 2915 2916 if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) { 2917 resized = true; 2918 goto again; 2919 } 2920 2921 return sk; 2922 } 2923 2924 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos) 2925 { 2926 /* bpf iter does not support lseek, so it always 2927 * continue from where it was stop()-ped. 2928 */ 2929 if (*pos) 2930 return bpf_iter_tcp_batch(seq); 2931 2932 return SEQ_START_TOKEN; 2933 } 2934 2935 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2936 { 2937 struct bpf_tcp_iter_state *iter = seq->private; 2938 struct tcp_iter_state *st = &iter->state; 2939 struct sock *sk; 2940 2941 /* Whenever seq_next() is called, the iter->cur_sk is 2942 * done with seq_show(), so advance to the next sk in 2943 * the batch. 2944 */ 2945 if (iter->cur_sk < iter->end_sk) { 2946 /* Keeping st->num consistent in tcp_iter_state. 2947 * bpf_iter_tcp does not use st->num. 2948 * meta.seq_num is used instead. 2949 */ 2950 st->num++; 2951 /* Move st->offset to the next sk in the bucket such that 2952 * the future start() will resume at st->offset in 2953 * st->bucket. See tcp_seek_last_pos(). 2954 */ 2955 st->offset++; 2956 sock_gen_put(iter->batch[iter->cur_sk++]); 2957 } 2958 2959 if (iter->cur_sk < iter->end_sk) 2960 sk = iter->batch[iter->cur_sk]; 2961 else 2962 sk = bpf_iter_tcp_batch(seq); 2963 2964 ++*pos; 2965 /* Keeping st->last_pos consistent in tcp_iter_state. 
2966 * bpf iter does not do lseek, so st->last_pos always equals to *pos. 2967 */ 2968 st->last_pos = *pos; 2969 return sk; 2970 } 2971 2972 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v) 2973 { 2974 struct bpf_iter_meta meta; 2975 struct bpf_prog *prog; 2976 struct sock *sk = v; 2977 uid_t uid; 2978 int ret; 2979 2980 if (v == SEQ_START_TOKEN) 2981 return 0; 2982 2983 if (sk_fullsock(sk)) 2984 lock_sock(sk); 2985 2986 if (unlikely(sk_unhashed(sk))) { 2987 ret = SEQ_SKIP; 2988 goto unlock; 2989 } 2990 2991 if (sk->sk_state == TCP_TIME_WAIT) { 2992 uid = 0; 2993 } else if (sk->sk_state == TCP_NEW_SYN_RECV) { 2994 const struct request_sock *req = v; 2995 2996 uid = from_kuid_munged(seq_user_ns(seq), 2997 sock_i_uid(req->rsk_listener)); 2998 } else { 2999 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); 3000 } 3001 3002 meta.seq = seq; 3003 prog = bpf_iter_get_info(&meta, false); 3004 ret = tcp_prog_seq_show(prog, &meta, v, uid); 3005 3006 unlock: 3007 if (sk_fullsock(sk)) 3008 release_sock(sk); 3009 return ret; 3010 3011 } 3012 3013 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v) 3014 { 3015 struct bpf_tcp_iter_state *iter = seq->private; 3016 struct bpf_iter_meta meta; 3017 struct bpf_prog *prog; 3018 3019 if (!v) { 3020 meta.seq = seq; 3021 prog = bpf_iter_get_info(&meta, true); 3022 if (prog) 3023 (void)tcp_prog_seq_show(prog, &meta, v, 0); 3024 } 3025 3026 if (iter->cur_sk < iter->end_sk) { 3027 bpf_iter_tcp_put_batch(iter); 3028 iter->st_bucket_done = false; 3029 } 3030 } 3031 3032 static const struct seq_operations bpf_iter_tcp_seq_ops = { 3033 .show = bpf_iter_tcp_seq_show, 3034 .start = bpf_iter_tcp_seq_start, 3035 .next = bpf_iter_tcp_seq_next, 3036 .stop = bpf_iter_tcp_seq_stop, 3037 }; 3038 #endif 3039 static unsigned short seq_file_family(const struct seq_file *seq) 3040 { 3041 const struct tcp_seq_afinfo *afinfo; 3042 3043 #ifdef CONFIG_BPF_SYSCALL 3044 /* Iterated from bpf_iter. Let the bpf prog to filter instead. */ 3045 if (seq->op == &bpf_iter_tcp_seq_ops) 3046 return AF_UNSPEC; 3047 #endif 3048 3049 /* Iterated from proc fs */ 3050 afinfo = pde_data(file_inode(seq->file)); 3051 return afinfo->family; 3052 } 3053 3054 static const struct seq_operations tcp4_seq_ops = { 3055 .show = tcp4_seq_show, 3056 .start = tcp_seq_start, 3057 .next = tcp_seq_next, 3058 .stop = tcp_seq_stop, 3059 }; 3060 3061 static struct tcp_seq_afinfo tcp4_seq_afinfo = { 3062 .family = AF_INET, 3063 }; 3064 3065 static int __net_init tcp4_proc_init_net(struct net *net) 3066 { 3067 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops, 3068 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo)) 3069 return -ENOMEM; 3070 return 0; 3071 } 3072 3073 static void __net_exit tcp4_proc_exit_net(struct net *net) 3074 { 3075 remove_proc_entry("tcp", net->proc_net); 3076 } 3077 3078 static struct pernet_operations tcp4_net_ops = { 3079 .init = tcp4_proc_init_net, 3080 .exit = tcp4_proc_exit_net, 3081 }; 3082 3083 int __init tcp4_proc_init(void) 3084 { 3085 return register_pernet_subsys(&tcp4_net_ops); 3086 } 3087 3088 void tcp4_proc_exit(void) 3089 { 3090 unregister_pernet_subsys(&tcp4_net_ops); 3091 } 3092 #endif /* CONFIG_PROC_FS */ 3093 3094 /* @wake is one when sk_stream_write_space() calls us. 3095 * This sends EPOLLOUT only if notsent_bytes is half the limit. 3096 * This mimics the strategy used in sock_def_write_space(). 
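* With @wake set, (notsent_bytes << 1) < lowat is equivalent to requiring notsent_bytes to fall below half of tcp_notsent_lowat() before EPOLLOUT is reported.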
3097 */ 3098 bool tcp_stream_memory_free(const struct sock *sk, int wake) 3099 { 3100 const struct tcp_sock *tp = tcp_sk(sk); 3101 u32 notsent_bytes = READ_ONCE(tp->write_seq) - 3102 READ_ONCE(tp->snd_nxt); 3103 3104 return (notsent_bytes << wake) < tcp_notsent_lowat(tp); 3105 } 3106 EXPORT_SYMBOL(tcp_stream_memory_free); 3107 3108 struct proto tcp_prot = { 3109 .name = "TCP", 3110 .owner = THIS_MODULE, 3111 .close = tcp_close, 3112 .pre_connect = tcp_v4_pre_connect, 3113 .connect = tcp_v4_connect, 3114 .disconnect = tcp_disconnect, 3115 .accept = inet_csk_accept, 3116 .ioctl = tcp_ioctl, 3117 .init = tcp_v4_init_sock, 3118 .destroy = tcp_v4_destroy_sock, 3119 .shutdown = tcp_shutdown, 3120 .setsockopt = tcp_setsockopt, 3121 .getsockopt = tcp_getsockopt, 3122 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt, 3123 .keepalive = tcp_set_keepalive, 3124 .recvmsg = tcp_recvmsg, 3125 .sendmsg = tcp_sendmsg, 3126 .splice_eof = tcp_splice_eof, 3127 .backlog_rcv = tcp_v4_do_rcv, 3128 .release_cb = tcp_release_cb, 3129 .hash = inet_hash, 3130 .unhash = inet_unhash, 3131 .get_port = inet_csk_get_port, 3132 .put_port = inet_put_port, 3133 #ifdef CONFIG_BPF_SYSCALL 3134 .psock_update_sk_prot = tcp_bpf_update_proto, 3135 #endif 3136 .enter_memory_pressure = tcp_enter_memory_pressure, 3137 .leave_memory_pressure = tcp_leave_memory_pressure, 3138 .stream_memory_free = tcp_stream_memory_free, 3139 .sockets_allocated = &tcp_sockets_allocated, 3140 .orphan_count = &tcp_orphan_count, 3141 3142 .memory_allocated = &tcp_memory_allocated, 3143 .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, 3144 3145 .memory_pressure = &tcp_memory_pressure, 3146 .sysctl_mem = sysctl_tcp_mem, 3147 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), 3148 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), 3149 .max_header = MAX_TCP_HEADER, 3150 .obj_size = sizeof(struct tcp_sock), 3151 .slab_flags = SLAB_TYPESAFE_BY_RCU, 3152 .twsk_prot = &tcp_timewait_sock_ops, 3153 .rsk_prot = &tcp_request_sock_ops, 3154 .h.hashinfo = NULL, 3155 .no_autobind = true, 3156 .diag_destroy = tcp_abort, 3157 }; 3158 EXPORT_SYMBOL(tcp_prot); 3159 3160 static void __net_exit tcp_sk_exit(struct net *net) 3161 { 3162 if (net->ipv4.tcp_congestion_control) 3163 bpf_module_put(net->ipv4.tcp_congestion_control, 3164 net->ipv4.tcp_congestion_control->owner); 3165 } 3166 3167 static void __net_init tcp_set_hashinfo(struct net *net) 3168 { 3169 struct inet_hashinfo *hinfo; 3170 unsigned int ehash_entries; 3171 struct net *old_net; 3172 3173 if (net_eq(net, &init_net)) 3174 goto fallback; 3175 3176 old_net = current->nsproxy->net_ns; 3177 ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries); 3178 if (!ehash_entries) 3179 goto fallback; 3180 3181 ehash_entries = roundup_pow_of_two(ehash_entries); 3182 hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries); 3183 if (!hinfo) { 3184 pr_warn("Failed to allocate TCP ehash (entries: %u) " 3185 "for a netns, fallback to the global one\n", 3186 ehash_entries); 3187 fallback: 3188 hinfo = &tcp_hashinfo; 3189 ehash_entries = tcp_hashinfo.ehash_mask + 1; 3190 } 3191 3192 net->ipv4.tcp_death_row.hashinfo = hinfo; 3193 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2; 3194 net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128); 3195 } 3196 3197 static int __net_init tcp_sk_init(struct net *net) 3198 { 3199 net->ipv4.sysctl_tcp_ecn = 2; 3200 net->ipv4.sysctl_tcp_ecn_fallback = 1; 3201 3202 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; 3203 
net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS; 3204 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; 3205 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL; 3206 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS; 3207 3208 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME; 3209 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES; 3210 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL; 3211 3212 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES; 3213 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES; 3214 net->ipv4.sysctl_tcp_syncookies = 1; 3215 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH; 3216 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1; 3217 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2; 3218 net->ipv4.sysctl_tcp_orphan_retries = 0; 3219 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; 3220 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; 3221 net->ipv4.sysctl_tcp_tw_reuse = 2; 3222 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1; 3223 3224 refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1); 3225 tcp_set_hashinfo(net); 3226 3227 net->ipv4.sysctl_tcp_sack = 1; 3228 net->ipv4.sysctl_tcp_window_scaling = 1; 3229 net->ipv4.sysctl_tcp_timestamps = 1; 3230 net->ipv4.sysctl_tcp_early_retrans = 3; 3231 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION; 3232 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */ 3233 net->ipv4.sysctl_tcp_retrans_collapse = 1; 3234 net->ipv4.sysctl_tcp_max_reordering = 300; 3235 net->ipv4.sysctl_tcp_dsack = 1; 3236 net->ipv4.sysctl_tcp_app_win = 31; 3237 net->ipv4.sysctl_tcp_adv_win_scale = 1; 3238 net->ipv4.sysctl_tcp_frto = 2; 3239 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1; 3240 /* This limits the percentage of the congestion window which we 3241 * will allow a single TSO frame to consume. Building TSO frames 3242 * which are too large can cause TCP streams to be bursty. 3243 */ 3244 net->ipv4.sysctl_tcp_tso_win_divisor = 3; 3245 /* Default TSQ limit of 16 TSO segments */ 3246 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536; 3247 3248 /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. 
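* (INT_MAX effectively disables the limit; lowering this sysctl re-enables per-netns rate limiting of challenge ACKs).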
*/ 3249 net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX; 3250 3251 net->ipv4.sysctl_tcp_min_tso_segs = 2; 3252 net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */ 3253 net->ipv4.sysctl_tcp_min_rtt_wlen = 300; 3254 net->ipv4.sysctl_tcp_autocorking = 1; 3255 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; 3256 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; 3257 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120; 3258 if (net != &init_net) { 3259 memcpy(net->ipv4.sysctl_tcp_rmem, 3260 init_net.ipv4.sysctl_tcp_rmem, 3261 sizeof(init_net.ipv4.sysctl_tcp_rmem)); 3262 memcpy(net->ipv4.sysctl_tcp_wmem, 3263 init_net.ipv4.sysctl_tcp_wmem, 3264 sizeof(init_net.ipv4.sysctl_tcp_wmem)); 3265 } 3266 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC; 3267 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC; 3268 net->ipv4.sysctl_tcp_comp_sack_nr = 44; 3269 net->ipv4.sysctl_tcp_backlog_ack_defer = 1; 3270 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; 3271 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0; 3272 atomic_set(&net->ipv4.tfo_active_disable_times, 0); 3273 3274 /* Set default values for PLB */ 3275 net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */ 3276 net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3; 3277 net->ipv4.sysctl_tcp_plb_rehash_rounds = 12; 3278 net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60; 3279 /* Default congestion threshold for PLB to mark a round is 50% */ 3280 net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2; 3281 3282 /* Reno is always built in */ 3283 if (!net_eq(net, &init_net) && 3284 bpf_try_module_get(init_net.ipv4.tcp_congestion_control, 3285 init_net.ipv4.tcp_congestion_control->owner)) 3286 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control; 3287 else 3288 net->ipv4.tcp_congestion_control = &tcp_reno; 3289 3290 net->ipv4.sysctl_tcp_syn_linear_timeouts = 4; 3291 net->ipv4.sysctl_tcp_shrink_window = 0; 3292 3293 net->ipv4.sysctl_tcp_pingpong_thresh = 1; 3294 3295 return 0; 3296 } 3297 3298 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) 3299 { 3300 struct net *net; 3301 3302 tcp_twsk_purge(net_exit_list, AF_INET); 3303 3304 list_for_each_entry(net, net_exit_list, exit_list) { 3305 inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo); 3306 WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount)); 3307 tcp_fastopen_ctx_destroy(net); 3308 } 3309 } 3310 3311 static struct pernet_operations __net_initdata tcp_sk_ops = { 3312 .init = tcp_sk_init, 3313 .exit = tcp_sk_exit, 3314 .exit_batch = tcp_sk_exit_batch, 3315 }; 3316 3317 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3318 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta, 3319 struct sock_common *sk_common, uid_t uid) 3320 3321 #define INIT_BATCH_SZ 16 3322 3323 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux) 3324 { 3325 struct bpf_tcp_iter_state *iter = priv_data; 3326 int err; 3327 3328 err = bpf_iter_init_seq_net(priv_data, aux); 3329 if (err) 3330 return err; 3331 3332 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ); 3333 if (err) { 3334 bpf_iter_fini_seq_net(priv_data); 3335 return err; 3336 } 3337 3338 return 0; 3339 } 3340 3341 static void bpf_iter_fini_tcp(void *priv_data) 3342 { 3343 struct bpf_tcp_iter_state *iter = priv_data; 3344 3345 bpf_iter_fini_seq_net(priv_data); 3346 kvfree(iter->batch); 3347 } 3348 3349 static const struct bpf_iter_seq_info tcp_seq_info = { 3350 .seq_ops = &bpf_iter_tcp_seq_ops, 3351 .init_seq_private = 
bpf_iter_init_tcp, 3352 .fini_seq_private = bpf_iter_fini_tcp, 3353 .seq_priv_size = sizeof(struct bpf_tcp_iter_state), 3354 }; 3355 3356 static const struct bpf_func_proto * 3357 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id, 3358 const struct bpf_prog *prog) 3359 { 3360 switch (func_id) { 3361 case BPF_FUNC_setsockopt: 3362 return &bpf_sk_setsockopt_proto; 3363 case BPF_FUNC_getsockopt: 3364 return &bpf_sk_getsockopt_proto; 3365 default: 3366 return NULL; 3367 } 3368 } 3369 3370 static struct bpf_iter_reg tcp_reg_info = { 3371 .target = "tcp", 3372 .ctx_arg_info_size = 1, 3373 .ctx_arg_info = { 3374 { offsetof(struct bpf_iter__tcp, sk_common), 3375 PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED }, 3376 }, 3377 .get_func_proto = bpf_iter_tcp_get_func_proto, 3378 .seq_info = &tcp_seq_info, 3379 }; 3380 3381 static void __init bpf_iter_register(void) 3382 { 3383 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON]; 3384 if (bpf_iter_reg_target(&tcp_reg_info)) 3385 pr_warn("Warning: could not register bpf iterator tcp\n"); 3386 } 3387 3388 #endif 3389 3390 void __init tcp_v4_init(void) 3391 { 3392 int cpu, res; 3393 3394 for_each_possible_cpu(cpu) { 3395 struct sock *sk; 3396 3397 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW, 3398 IPPROTO_TCP, &init_net); 3399 if (res) 3400 panic("Failed to create the TCP control socket.\n"); 3401 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); 3402 3403 /* Please enforce IP_DF and IPID==0 for RST and 3404 * ACK sent in SYN-RECV and TIME-WAIT state. 3405 */ 3406 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO; 3407 3408 per_cpu(ipv4_tcp_sk, cpu) = sk; 3409 } 3410 if (register_pernet_subsys(&tcp_sk_ops)) 3411 panic("Failed to create the TCP control socket.\n"); 3412 3413 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3414 bpf_iter_register(); 3415 #endif 3416 } 3417