// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		  linux/ipv4/tcp.c
 *		  linux/ipv4/tcp_input.c
 *		  linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *		Juan Jose Ciarlante:	ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *		Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *		Andi Kleen		:	Fix new listen.
 *		Andi Kleen		:	Fix accept error reporting.
 *		YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *		Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *						a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/inetdevice.h>
#include <linux/btf_ids.h>

#include <crypto/hash.h>
#include <linux/scatterlist.h>

#include <trace/events/tcp.h>

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);

static u32 tcp_v4_init_seq(const struct sk_buff *skb)
{
	return secure_tcp_seq(ip_hdr(skb)->daddr,
			      ip_hdr(skb)->saddr,
			      tcp_hdr(skb)->dest,
			      tcp_hdr(skb)->source);
}

static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
{
	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
}

int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
	const struct inet_timewait_sock *tw = inet_twsk(sktw);
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);

	if (reuse == 2) {
		/* Still does not detect *everything* that goes through
		 * lo, since we require a loopback src or dst address
		 * or direct binding to 'lo' interface.
		 */
		bool loopback = false;
		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
			loopback = true;
#if IS_ENABLED(CONFIG_IPV6)
		if (tw->tw_family == AF_INET6) {
			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
				loopback = true;
		} else
#endif
		{
			if (ipv4_is_loopback(tw->tw_daddr) ||
			    ipv4_is_loopback(tw->tw_rcv_saddr))
				loopback = true;
		}
		if (!loopback)
			reuse = 0;
	}

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (!twp || (reuse && time_after32(ktime_get_seconds(),
					    tcptw->tw_ts_recent_stamp)))) {
		/* In case of repair and re-using TIME-WAIT sockets we still
		 * want to be sure that it is safe as above but honor the
		 * sequence numbers and time stamps set as part of the repair
		 * process.
		 *
		 * Without this check re-using a TIME-WAIT socket with TCP
		 * repair would accumulate a -1 on the repair assigned
		 * sequence number. The first time it is reused the sequence
		 * is -1, the second time -2, etc. This fixes that issue
		 * without appearing to create any others.
		 */
		if (likely(!tp->repair)) {
			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;

			if (!seq)
				seq = 1;
			WRITE_ONCE(tp->write_seq, seq);
			tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		}
		sock_hold(sktw);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);

static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
			      int addr_len)
{
	/* This check is replicated from tcp_v4_connect() and intended to
	 * prevent BPF program called below from accessing bytes that are out
	 * of the bound specified by user in addr_len.
	 */
	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	sock_owned_by_me(sk);

	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
}

/* This will initiate an outgoing connection. */
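/* Note: for an AF_INET SOCK_STREAM socket, userspace typically reaches this
 * through connect(2) (inet_stream_connect() -> sk->sk_prot->connect()).
 * Illustrative userspace sketch (not part of this file; the address and
 * port below are placeholders):
 *
 *	struct sockaddr_in dst = { .sin_family = AF_INET,
 *				   .sin_port = htons(80) };
 *	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *	connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 */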
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct inet_bind_hashbucket *prev_addr_hashbucket = NULL;
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	__be32 daddr, nexthop, prev_sk_rcv_saddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;
	struct inet_timewait_death_row *tcp_death_row = sock_net(sk)->ipv4.tcp_death_row;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
			      orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	if (!inet->inet_saddr) {
		if (inet_csk(sk)->icsk_bind2_hash) {
			prev_addr_hashbucket = inet_bhashfn_portaddr(&tcp_hashinfo,
								     sk, sock_net(sk),
								     inet->inet_num);
			prev_sk_rcv_saddr = sk->sk_rcv_saddr;
		}
		inet->inet_saddr = fl4->saddr;
	}

	sk_rcv_saddr_set(sk, inet->inet_saddr);

	if (prev_addr_hashbucket) {
		err = inet_bhash2_update_saddr(prev_addr_hashbucket, sk);
		if (err) {
			inet->inet_saddr = 0;
			sk_rcv_saddr_set(sk, prev_sk_rcv_saddr);
			ip_rt_put(rt);
			return err;
		}
	}

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			WRITE_ONCE(tp->write_seq, 0);
	}

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the socket
	 * lock, select a source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket. */
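	/* ("Committing" here means caching the route on the socket:
	 * sk_setup_caps() below stores the dst and derives sk_route_caps,
	 * i.e. checksum/GSO capabilities, from the output device.)
	 */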
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	rt = NULL;

	if (likely(!tp->repair)) {
		if (!tp->write_seq)
			WRITE_ONCE(tp->write_seq,
				   secure_tcp_seq(inet->inet_saddr,
						  inet->inet_daddr,
						  inet->inet_sport,
						  usin->sin_port));
		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
						 inet->inet_saddr,
						 inet->inet_daddr);
	}

	inet->inet_id = prandom_u32();

	if (tcp_fastopen_defer_connect(sk, &err))
		return err;
	if (err)
		goto failure;

	err = tcp_connect(sk);

	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);

/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct dst_entry *dst;
	u32 mtu;

	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
		return;
	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to go wrong... Remember the soft error
	 * in case this connection is not able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);

static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}


/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);

	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	 */
	if (seq != tcp_rsk(req)->snt_isn) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
	} else if (abort) {
		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		tcp_listendrop(req->rsk_listener);
	}
	reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);

/* TCP-LD (RFC 6069) logic */
void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	s32 remaining;
	u32 delta_us;

	if (sock_owned_by_user(sk))
		return;

	if (seq != tp->snd_una || !icsk->icsk_retransmits ||
	    !icsk->icsk_backoff)
		return;

	skb = tcp_rtx_queue_head(sk);
	if (WARN_ON_ONCE(!skb))
		return;

	icsk->icsk_backoff--;
	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

	tcp_mstamp_refresh(tp);
	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);

	if (remaining > 0) {
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
					  remaining, TCP_RTO_MAX);
	} else {
		/* RTO revert clocked out retransmission.
		 * Will retransmit now.
		 */
		tcp_retransmit_timer(sk);
	}
}
EXPORT_SYMBOL(tcp_ld_RTO_revert);

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition. If err < 0 then the socket should
 * be closed and the error returned to the user. If err > 0
 * it's just the icmp type << 8 | icmp code. After adjustment
 * header points to the first 8 bytes of the tcp header. We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

int tcp_v4_err(struct sk_buff *skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct sock *sk;
	struct request_sock *fastopen;
	u32 seq, snd_una;
	int err;
	struct net *net = dev_net(skb->dev);

	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
				       th->dest, iph->saddr, ntohs(th->source),
				       inet_iif(skb), 0);
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return -ENOENT;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return 0;
	}
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
				     type == ICMP_TIME_EXCEEDED ||
				     (type == ICMP_DEST_UNREACH &&
				      (code == ICMP_NET_UNREACH ||
				       code == ICMP_HOST_UNREACH)));
		return 0;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
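	/* (In the owned case, the ICMP_FRAG_NEEDED handling below only records
	 * tp->mtu_info and sets TCP_MTU_REDUCED_DEFERRED, so tcp_release_cb()
	 * can call tcp_v4_mtu_reduced() once the owner releases the socket.)
	 */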
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (static_branch_unlikely(&ip4_min_ttl)) {
		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
			goto out;
		}
	}

	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = rcu_dereference(tp->fastopen_rsk);
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		if (!sock_owned_by_user(sk))
			do_redirect(skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs sent out by Linux are always <576 bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			WRITE_ONCE(tp->mtu_info, info);
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if this ICMP message allows revert of backoff.
		 * (see RFC 6069)
		 */
		if (!fastopen &&
		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
			tcp_ld_RTO_revert(sk, seq);
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket is
		 * already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);

		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even these two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 *							--ANK (980905)
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
	return 0;
}

void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
	skb->csum_start = skb_transport_header(skb) - skb->head;
	skb->csum_offset = offsetof(struct tcphdr, check);
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);

/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
 *		      for a reset.
 *	Answer: if a packet caused the RST, it is not for a socket
 *		existing in our system; if it is matched to a socket,
 *		it is just a duplicate segment or a bug in the other side's TCP.
 *		So we build the reply based only on parameters that
 *		arrived with the segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */

#ifdef CONFIG_TCP_MD5SIG
#define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
#else
#define OPTION_BYTES sizeof(__be32)
#endif

static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[OPTION_BYTES / sizeof(__be32)];
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	int genhash;
	struct sock *sk1 = NULL;
#endif
	u64 transmit_time = 0;
	struct sock *ctl_sk;
	struct net *net;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest = th->source;
	rep.th.source = th->dest;
	rep.th.doff = sizeof(struct tcphdr) / 4;
	rep.th.rst = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len = sizeof(rep.th);

	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
	rcu_read_lock();
	hash_location = tcp_parse_md5sig_option(th);
	if (sk && sk_fullsock(sk)) {
		const union tcp_md5_addr *addr;
		int l3index;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and inet_iif is set to it.
		 */
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	} else if (hash_location) {
		const union tcp_md5_addr *addr;
		int sdif = tcp_v4_sdif(skb);
		int dif = inet_iif(skb);
		int l3index;

		/*
		 * active side is lost. Try to find listening socket through
		 * source port, and then find md5 key through listening socket.
		 * We are not loosening security here:
		 * the incoming packet is checked against the md5 hash with the
		 * key we find, and no RST is generated if the hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
					     ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), dif, sdif);
		/* don't send rst if it can't find key */
		if (!sk1)
			goto out;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and dif is set to it.
		 */
		l3index = sdif ? dif : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
		if (!key)
			goto out;


		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
			goto out;

	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
	if (rep.opt[0] == 0) {
		__be32 mrst = mptcp_reset_option(skb);

		if (mrst) {
			rep.opt[0] = mrst;
			arg.iov[0].iov_len += sizeof(mrst);
			rep.th.doff = arg.iov[0].iov_len / 4;
		}
	}

	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
	if (sk) {
		arg.bound_dev_if = sk->sk_bound_dev_if;
		if (sk_fullsock(sk))
			trace_tcp_send_reset(sk, skb);
	}

	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

	arg.tos = ip_hdr(skb)->tos;
	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ctl_sk = this_cpu_read(ipv4_tcp_sk);
	sock_net_set(ctl_sk, net);
	if (sk) {
		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_mark : sk->sk_mark;
		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_priority : sk->sk_priority;
		transmit_time = tcp_transmit_time(sk);
		xfrm_sk_clone_policy(ctl_sk, sk);
	}
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time);

	ctl_sk->sk_mark = 0;
	xfrm_sk_free_policy(ctl_sk);
	sock_net_set(ctl_sk, &init_net);
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
	local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
	rcu_read_unlock();
#endif
}

/* The code following below, sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context, is certainly ugly. What can I do?
 */

static void tcp_v4_send_ack(const struct sock *sk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct net *net = sock_net(sk);
	struct ip_reply_arg arg;
	struct sock *ctl_sk;
	u64 transmit_time;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len = sizeof(rep.th);
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest = th->source;
	rep.th.source = th->dest;
	rep.th.doff = arg.iov[0].iov_len / 4;
	rep.th.seq = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack = 1;
	rep.th.window = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ctl_sk = this_cpu_read(ipv4_tcp_sk);
	sock_net_set(ctl_sk, net);
	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_mark : sk->sk_mark;
	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_priority : sk->sk_priority;
	transmit_time = tcp_transmit_time(sk);
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time);

	ctl_sk->sk_mark = 0;
	sock_net_set(ctl_sk, &init_net);
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	local_bh_enable();
}

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(sk, skb,
			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos
			);

	inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	const union tcp_md5_addr *addr;
	int l3index;

	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
					     tcp_sk(sk)->snd_nxt;

	/* RFC 7323 2.3
	 * The window field (SEG.WND) of every outgoing segment, with the
	 * exception of <SYN> segments, MUST be right-shifted by
	 * Rcv.Wind.Shift bits:
	 */
	addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
	l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
	tcp_v4_send_ack(sk, skb, seq,
			tcp_rsk(req)->rcv_nxt,
			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
			req->ts_recent,
			0,
			tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos);
}

/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
			      struct flowi *fl,
			      struct request_sock *req,
			      struct tcp_fastopen_cookie *foc,
			      enum tcp_synack_type synack_type,
			      struct sk_buff *syn_skb)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;
	u8 tos;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

		tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
				(tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
				(inet_sk(sk)->tos & INET_ECN_MASK) :
				inet_sk(sk)->tos;

		if (!INET_ECN_is_capable(tos) &&
		    tcp_bpf_ca_needs_ecn((struct sock *)req))
			tos |= INET_ECN_ECT_0;

		rcu_read_lock();
		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
					    rcu_dereference(ireq->ireq_opt),
					    tos);
		rcu_read_unlock();
		err = net_xmit_eval(err);
	}

	return err;
}

/*
 *	IPv4 request_sock destructor.
 */
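/* (It only needs to free the IP options, if any, that tcp_v4_init_req()
 * saved into ireq->ireq_opt via tcp_v4_save_options().)
 */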
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
EXPORT_SYMBOL(tcp_md5_needed);

static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
{
	if (!old)
		return true;

	/* l3index always overrides non-l3index */
	if (old->l3index && new->l3index == 0)
		return false;
	if (old->l3index == 0 && new->l3index)
		return true;

	return old->prefixlen < new->prefixlen;
}

/* Find the Key structure for an address.  */
struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
					   const union tcp_md5_addr *addr,
					   int family)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	const struct tcp_md5sig_info *md5sig;
	__be32 mask;
	struct tcp_md5sig_key *best_match = NULL;
	bool match;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;

	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
			continue;
		if (family == AF_INET) {
			mask = inet_make_mask(key->prefixlen);
			match = (key->addr.a4.s_addr & mask) ==
				(addr->a4.s_addr & mask);
#if IS_ENABLED(CONFIG_IPV6)
		} else if (family == AF_INET6) {
			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
						  key->prefixlen);
#endif
		} else {
			match = false;
		}

		if (match && better_md5_match(best_match, key))
			best_match = key;
	}
	return best_match;
}
EXPORT_SYMBOL(__tcp_md5_do_lookup);

static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
						      const union tcp_md5_addr *addr,
						      int family, u8 prefixlen,
						      int l3index, u8 flags)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	const struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
			continue;
		if (key->l3index != l3index)
			continue;
		if (!memcmp(&key->addr, addr, size) &&
		    key->prefixlen == prefixlen)
			return key;
	}
	return NULL;
}

struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
					 const struct sock *addr_sk)
{
	const union tcp_md5_addr *addr;
	int l3index;

	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
						 addr_sk->sk_bound_dev_if);
	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, u8 prefixlen, int l3index, u8 flags,
		   const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
	if (key) {
		/* Pre-existing entry - just update that one.
		 * Note that the key might be used concurrently.
		 * data_race() is telling kcsan that we do not care about
		 * key mismatches, since changing MD5 key on live flows
		 * can lead to packet drops.
		 */
		data_race(memcpy(key->key, newkey, newkeylen));

		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
		 * Also note that a reader could catch new key->keylen value
		 * but old key->key[], this is the reason we use __GFP_ZERO
		 * at sock_kmalloc() time below these lines.
		 */
		WRITE_ONCE(key->keylen, newkeylen);

		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   lockdep_sock_is_held(sk));
	if (!md5sig) {
		md5sig = kmalloc(sizeof(*md5sig), gfp);
		if (!md5sig)
			return -ENOMEM;

		sk_gso_disable(sk);
		INIT_HLIST_HEAD(&md5sig->head);
		rcu_assign_pointer(tp->md5sig_info, md5sig);
	}

	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
	if (!key)
		return -ENOMEM;
	if (!tcp_alloc_md5sig_pool()) {
		sock_kfree_s(sk, key, sizeof(*key));
		return -ENOMEM;
	}

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	key->prefixlen = prefixlen;
	key->l3index = l3index;
	key->flags = flags;
	memcpy(&key->addr, addr,
	       (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
								 sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);

int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
		   u8 prefixlen, int l3index, u8 flags)
{
	struct tcp_md5sig_key *key;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);

static void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
	}
}

static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
				 sockptr_t optval, int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
	const union tcp_md5_addr *addr;
	u8 prefixlen = 32;
	int l3index = 0;
	u8 flags;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;

	if (optname == TCP_MD5SIG_EXT &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
		prefixlen = cmd.tcpm_prefixlen;
		if (prefixlen > 32)
			return -EINVAL;
	}

	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
		if (dev && netif_is_l3_master(dev))
			l3index = dev->ifindex;

		rcu_read_unlock();

		/* ok to reference set/not set outside of rcu;
		 * right now device MUST be an L3 master
		 */
		if (!dev || !l3index)
			return -EINVAL;
	}

	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;

	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
			      cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
}

static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
				   __be32 daddr, __be32 saddr,
				   const struct tcphdr *th, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;
	struct tcphdr *_th;

	bp = hp->scratch;
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	_th = (struct tcphdr *)(bp + 1);
	memcpy(_th, th, sizeof(*th));
	_th->check = 0;

	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
				sizeof(*bp) + sizeof(*th));
	return crypto_ahash_update(hp->md5_req);
}

static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;
	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}

int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
			const struct sock *sk,
			const struct sk_buff *skb)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;

	if (sk) { /* valid for establish/request sockets */
		saddr = sk->sk_rcv_saddr;
		daddr = sk->sk_daddr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;

	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
		goto clear_hash;
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

#endif

static void tcp_v4_init_req(struct request_sock *req,
			    const struct sock *sk_listener,
			    struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	struct net *net = sock_net(sk_listener);

	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
}

static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
					  struct sk_buff *skb,
					  struct flowi *fl,
					  struct request_sock *req)
{
	tcp_v4_init_req(req, sk, skb);

	if (security_inet_conn_request(sk, skb, req))
		return NULL;

	return inet_csk_route_req(sk, &fl->u.ip4, req);
}

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};

const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.req_md5_lookup	=	tcp_v4_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#endif
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
#endif
	.route_req	=	tcp_v4_route_req,
	.init_seq	=	tcp_v4_init_seq,
	.init_ts_off	=	tcp_v4_init_ts_off,
	.send_synack	=	tcp_v4_send_synack,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&tcp_request_sock_ops,
				&tcp_request_sock_ipv4_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);


/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req)
{
	struct inet_request_sock *ireq;
	bool found_dup_sk = false;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	const union tcp_md5_addr *addr;
	struct tcp_md5sig_key *key;
	int l3index;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp = tcp_sk(newsk);
	newinet = inet_sk(newsk);
	ireq = inet_rsk(req);
	sk_daddr_set(newsk, ireq->ir_rmt_addr);
	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
	newsk->sk_bound_dev_if = ireq->ir_iif;
	newinet->inet_saddr = ireq->ir_loc_addr;
	inet_opt = rcu_dereference(ireq->ireq_opt);
	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
	newinet->mc_index = inet_iif(skb);
	newinet->mc_ttl = ip_hdr(skb)->ttl;
	newinet->rcv_tos = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	newinet->inet_id = prandom_u32();

	/* Set ToS of the new socket based upon the value of incoming SYN.
	 * ECT bits are set later in tcp_init_transfer().
	 */
	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
	/* Copy over the MD5 key from the original socket */
	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	if (key) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
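		/* Illustrative userspace sketch (not part of this file): such
		 * a key is normally installed on the listener beforehand with
		 * setsockopt(TCP_MD5SIG), roughly as follows, where peer_addr,
		 * key_bytes and key_len are placeholders:
		 *
		 *	struct tcp_md5sig md5 = { };
		 *	struct sockaddr_in *a = (struct sockaddr_in *)&md5.tcpm_addr;
		 *
		 *	a->sin_family = AF_INET;
		 *	a->sin_addr.s_addr = peer_addr;
		 *	md5.tcpm_keylen = key_len;
		 *	memcpy(md5.tcpm_key, key_bytes, key_len);
		 *	setsockopt(listen_fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
		 */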
		tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags,
			       key->key, key->keylen, GFP_ATOMIC);
		sk_gso_disable(newsk);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
				       &found_dup_sk);
	if (likely(*own_req)) {
		tcp_move_syn(newtp, req);
		ireq->ireq_opt = NULL;
	} else {
		newinet->inet_opt = NULL;

		if (!req_unhash && found_dup_sk) {
			/* This code path should only be executed in the
			 * syncookie case
			 */
			bh_unlock_sock(newsk);
			sock_put(newsk);
			newsk = NULL;
		}
	}
	return newsk;

exit_overflow:
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	tcp_listendrop(sk);
	return NULL;
put_and_exit:
	newinet->inet_opt = NULL;
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	const struct tcphdr *th = tcp_hdr(skb);

	if (!th->syn)
		sk = cookie_v4_check(sk, skb);
#endif
	return sk;
}

u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
			 struct tcphdr *th, u32 *cookie)
{
	u16 mss = 0;
#ifdef CONFIG_SYN_COOKIES
	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
				    &tcp_request_sock_ipv4_ops, sk, th);
	if (mss) {
		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
		tcp_synq_overflow(sk);
	}
#endif
	return mss;
}

INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
							   u32));
/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	enum skb_drop_reason reason;
	struct sock *rsk;

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst;

		dst = rcu_dereference_protected(sk->sk_rx_dst,
						lockdep_sock_is_held(sk));

		sock_rps_save_rxhash(sk, skb);
		sk_mark_napi_id(sk, skb);
		if (dst) {
			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
					     dst, 0)) {
				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
				dst_release(dst);
			}
		}
		tcp_rcv_established(sk, skb);
		return 0;
	}

	reason = SKB_DROP_REASON_NOT_SPECIFIED;
	if (tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_cookie_check(sk, skb);

		if (!nsk)
			goto discard;
		if (nsk != sk) {
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	if (tcp_rcv_state_process(sk, skb)) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb);
discard:
	kfree_skb_reason(skb, reason);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	reason = SKB_DROP_REASON_TCP_CSUM;
	trace_tcp_bad_csum(skb);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);

int tcp_v4_early_demux(struct sk_buff *skb)
{
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;

	if (skb->pkt_type != PACKET_HOST)
		return 0;

	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
		return 0;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	if (th->doff < sizeof(struct tcphdr) / 4)
		return 0;

	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
				       iph->saddr, th->source,
				       iph->daddr, ntohs(th->dest),
				       skb->skb_iif, inet_sdif(skb));
	if (sk) {
		skb->sk = sk;
		skb->destructor = sock_edemux;
		if (sk_fullsock(sk)) {
			struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);

			if (dst)
				dst = dst_check(dst, 0);
			if (dst &&
			    sk->sk_rx_dst_ifindex == skb->skb_iif)
				skb_dst_set_noref(skb, dst);
		}
	}
	return 0;
}

bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
		     enum skb_drop_reason *reason)
{
	u32 limit, tail_gso_size, tail_gso_segs;
	struct skb_shared_info *shinfo;
	const struct tcphdr *th;
	struct tcphdr *thtail;
	struct sk_buff *tail;
	unsigned int hdrlen;
	bool fragstolen;
	u32 gso_segs;
	u32 gso_size;
	int delta;

	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
	 * we can fix skb->truesize to its real value to avoid future drops.
	 * This is valid because skb is not yet charged to the socket.
	 * It has been noticed pure SACK packets were sometimes dropped
	 * (if cooked by drivers without copybreak feature).
	 */
	skb_condense(skb);

	skb_dst_drop(skb);

	if (unlikely(tcp_checksum_complete(skb))) {
		bh_unlock_sock(sk);
		trace_tcp_bad_csum(skb);
		*reason = SKB_DROP_REASON_TCP_CSUM;
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
		return true;
	}

	/* Attempt coalescing to last skb in backlog, even if we are
	 * above the limits.
	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
	 */
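	/* (Coalescing requires the new segment to be exactly contiguous with
	 * the backlog tail, carry the same DSCP, have compatible flags - ACK
	 * set, no SYN/RST/URG, matching ECE/CWR - and byte-identical TCP
	 * options; anything else falls through to no_coalesce.)
	 */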
	th = (const struct tcphdr *)skb->data;
	hdrlen = th->doff * 4;

	tail = sk->sk_backlog.tail;
	if (!tail)
		goto no_coalesce;
	thtail = (struct tcphdr *)tail->data;

	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
	    ((TCP_SKB_CB(tail)->tcp_flags |
	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
	    !((TCP_SKB_CB(tail)->tcp_flags &
	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
	    ((TCP_SKB_CB(tail)->tcp_flags ^
	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
#ifdef CONFIG_TLS_DEVICE
	    tail->decrypted != skb->decrypted ||
#endif
	    thtail->doff != th->doff ||
	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
		goto no_coalesce;

	__skb_pull(skb, hdrlen);

	shinfo = skb_shinfo(skb);
	gso_size = shinfo->gso_size ?: skb->len;
	gso_segs = shinfo->gso_segs ?: 1;

	shinfo = skb_shinfo(tail);
	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
	tail_gso_segs = shinfo->gso_segs ?: 1;

	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;

		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
			thtail->window = th->window;
		}

		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
		 * thtail->fin, so that the fast path in tcp_rcv_established()
		 * is not entered if we append a packet with a FIN.
		 * SYN, RST, URG are not present.
		 * ACK is set on both packets.
		 * PSH : we do not really care in TCP stack,
		 *       at least for 'GRO' packets.
		 */
		thtail->fin |= th->fin;
		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;

		if (TCP_SKB_CB(skb)->has_rxtstamp) {
			TCP_SKB_CB(tail)->has_rxtstamp = true;
			tail->tstamp = skb->tstamp;
			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
		}

		/* Not as strict as GRO. We only need to carry mss max value */
		shinfo->gso_size = max(gso_size, tail_gso_size);
		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);

		sk->sk_backlog.len += delta;
		__NET_INC_STATS(sock_net(sk),
				LINUX_MIB_TCPBACKLOGCOALESCE);
		kfree_skb_partial(skb, fragstolen);
		return false;
	}
	__skb_push(skb, hdrlen);

no_coalesce:
	/* Only socket owner can try to collapse/prune rx queues
	 * to reduce memory overhead, so add a little headroom here.
	 * Few sockets backlog are possibly concurrently non empty.
	 */
	limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf) + 64*1024;

	if (unlikely(sk_add_backlog(sk, skb, limit))) {
		bh_unlock_sock(sk);
		*reason = SKB_DROP_REASON_SOCKET_BACKLOG;
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
		return true;
	}
	return false;
}
EXPORT_SYMBOL(tcp_add_backlog);

int tcp_filter(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = (struct tcphdr *)skb->data;

	return sk_filter_trim_cap(sk, skb, th->doff * 4);
}
EXPORT_SYMBOL(tcp_filter);

static void tcp_v4_restore_cb(struct sk_buff *skb)
{
	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
		sizeof(struct inet_skb_parm));
}

static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
			   const struct tcphdr *th)
{
	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
	 * barrier() makes sure compiler won't play fool^Waliasing games.
	 */
	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
		sizeof(struct inet_skb_parm));
	barrier();

	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
	TCP_SKB_CB(skb)->sacked = 0;
	TCP_SKB_CB(skb)->has_rxtstamp =
			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
}

/*
 *	From tcp_input.c
 */

int tcp_v4_rcv(struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);
	enum skb_drop_reason drop_reason;
	int sdif = inet_sdif(skb);
	int dif = inet_iif(skb);
	const struct iphdr *iph;
	const struct tcphdr *th;
	bool refcounted;
	struct sock *sk;
	int ret;

	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	__TCP_INC_STATS(net, TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = (const struct tcphdr *)skb->data;

	if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
		drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
		goto bad_packet;
	}
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */
*/ 1966 1967 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo)) 1968 goto csum_error; 1969 1970 th = (const struct tcphdr *)skb->data; 1971 iph = ip_hdr(skb); 1972 lookup: 1973 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source, 1974 th->dest, sdif, &refcounted); 1975 if (!sk) 1976 goto no_tcp_socket; 1977 1978 process: 1979 if (sk->sk_state == TCP_TIME_WAIT) 1980 goto do_time_wait; 1981 1982 if (sk->sk_state == TCP_NEW_SYN_RECV) { 1983 struct request_sock *req = inet_reqsk(sk); 1984 bool req_stolen = false; 1985 struct sock *nsk; 1986 1987 sk = req->rsk_listener; 1988 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) 1989 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 1990 else 1991 drop_reason = tcp_inbound_md5_hash(sk, skb, 1992 &iph->saddr, &iph->daddr, 1993 AF_INET, dif, sdif); 1994 if (unlikely(drop_reason)) { 1995 sk_drops_add(sk, skb); 1996 reqsk_put(req); 1997 goto discard_it; 1998 } 1999 if (tcp_checksum_complete(skb)) { 2000 reqsk_put(req); 2001 goto csum_error; 2002 } 2003 if (unlikely(sk->sk_state != TCP_LISTEN)) { 2004 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb); 2005 if (!nsk) { 2006 inet_csk_reqsk_queue_drop_and_put(sk, req); 2007 goto lookup; 2008 } 2009 sk = nsk; 2010 /* reuseport_migrate_sock() has already held one sk_refcnt 2011 * before returning. 2012 */ 2013 } else { 2014 /* We own a reference on the listener, increase it again 2015 * as we might lose it too soon. 2016 */ 2017 sock_hold(sk); 2018 } 2019 refcounted = true; 2020 nsk = NULL; 2021 if (!tcp_filter(sk, skb)) { 2022 th = (const struct tcphdr *)skb->data; 2023 iph = ip_hdr(skb); 2024 tcp_v4_fill_cb(skb, iph, th); 2025 nsk = tcp_check_req(sk, skb, req, false, &req_stolen); 2026 } else { 2027 drop_reason = SKB_DROP_REASON_SOCKET_FILTER; 2028 } 2029 if (!nsk) { 2030 reqsk_put(req); 2031 if (req_stolen) { 2032 /* Another cpu got exclusive access to req 2033 * and created a full blown socket. 2034 * Try to feed this packet to this socket 2035 * instead of discarding it. 
2036 */ 2037 tcp_v4_restore_cb(skb); 2038 sock_put(sk); 2039 goto lookup; 2040 } 2041 goto discard_and_relse; 2042 } 2043 nf_reset_ct(skb); 2044 if (nsk == sk) { 2045 reqsk_put(req); 2046 tcp_v4_restore_cb(skb); 2047 } else if (tcp_child_process(sk, nsk, skb)) { 2048 tcp_v4_send_reset(nsk, skb); 2049 goto discard_and_relse; 2050 } else { 2051 sock_put(sk); 2052 return 0; 2053 } 2054 } 2055 2056 if (static_branch_unlikely(&ip4_min_ttl)) { 2057 /* min_ttl can be changed concurrently from do_ip_setsockopt() */ 2058 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) { 2059 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 2060 goto discard_and_relse; 2061 } 2062 } 2063 2064 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { 2065 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2066 goto discard_and_relse; 2067 } 2068 2069 drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr, 2070 &iph->daddr, AF_INET, dif, sdif); 2071 if (drop_reason) 2072 goto discard_and_relse; 2073 2074 nf_reset_ct(skb); 2075 2076 if (tcp_filter(sk, skb)) { 2077 drop_reason = SKB_DROP_REASON_SOCKET_FILTER; 2078 goto discard_and_relse; 2079 } 2080 th = (const struct tcphdr *)skb->data; 2081 iph = ip_hdr(skb); 2082 tcp_v4_fill_cb(skb, iph, th); 2083 2084 skb->dev = NULL; 2085 2086 if (sk->sk_state == TCP_LISTEN) { 2087 ret = tcp_v4_do_rcv(sk, skb); 2088 goto put_and_return; 2089 } 2090 2091 sk_incoming_cpu_update(sk); 2092 2093 bh_lock_sock_nested(sk); 2094 tcp_segs_in(tcp_sk(sk), skb); 2095 ret = 0; 2096 if (!sock_owned_by_user(sk)) { 2097 ret = tcp_v4_do_rcv(sk, skb); 2098 } else { 2099 if (tcp_add_backlog(sk, skb, &drop_reason)) 2100 goto discard_and_relse; 2101 } 2102 bh_unlock_sock(sk); 2103 2104 put_and_return: 2105 if (refcounted) 2106 sock_put(sk); 2107 2108 return ret; 2109 2110 no_tcp_socket: 2111 drop_reason = SKB_DROP_REASON_NO_SOCKET; 2112 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) 2113 goto discard_it; 2114 2115 tcp_v4_fill_cb(skb, iph, th); 2116 2117 if (tcp_checksum_complete(skb)) { 2118 csum_error: 2119 drop_reason = SKB_DROP_REASON_TCP_CSUM; 2120 trace_tcp_bad_csum(skb); 2121 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); 2122 bad_packet: 2123 __TCP_INC_STATS(net, TCP_MIB_INERRS); 2124 } else { 2125 tcp_v4_send_reset(NULL, skb); 2126 } 2127 2128 discard_it: 2129 SKB_DR_OR(drop_reason, NOT_SPECIFIED); 2130 /* Discard frame. 
*/ 2131 kfree_skb_reason(skb, drop_reason); 2132 return 0; 2133 2134 discard_and_relse: 2135 sk_drops_add(sk, skb); 2136 if (refcounted) 2137 sock_put(sk); 2138 goto discard_it; 2139 2140 do_time_wait: 2141 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { 2142 drop_reason = SKB_DROP_REASON_XFRM_POLICY; 2143 inet_twsk_put(inet_twsk(sk)); 2144 goto discard_it; 2145 } 2146 2147 tcp_v4_fill_cb(skb, iph, th); 2148 2149 if (tcp_checksum_complete(skb)) { 2150 inet_twsk_put(inet_twsk(sk)); 2151 goto csum_error; 2152 } 2153 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) { 2154 case TCP_TW_SYN: { 2155 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev), 2156 &tcp_hashinfo, skb, 2157 __tcp_hdrlen(th), 2158 iph->saddr, th->source, 2159 iph->daddr, th->dest, 2160 inet_iif(skb), 2161 sdif); 2162 if (sk2) { 2163 inet_twsk_deschedule_put(inet_twsk(sk)); 2164 sk = sk2; 2165 tcp_v4_restore_cb(skb); 2166 refcounted = false; 2167 goto process; 2168 } 2169 } 2170 /* to ACK */ 2171 fallthrough; 2172 case TCP_TW_ACK: 2173 tcp_v4_timewait_ack(sk, skb); 2174 break; 2175 case TCP_TW_RST: 2176 tcp_v4_send_reset(sk, skb); 2177 inet_twsk_deschedule_put(inet_twsk(sk)); 2178 goto discard_it; 2179 case TCP_TW_SUCCESS:; 2180 } 2181 goto discard_it; 2182 } 2183 2184 static struct timewait_sock_ops tcp_timewait_sock_ops = { 2185 .twsk_obj_size = sizeof(struct tcp_timewait_sock), 2186 .twsk_unique = tcp_twsk_unique, 2187 .twsk_destructor= tcp_twsk_destructor, 2188 }; 2189 2190 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) 2191 { 2192 struct dst_entry *dst = skb_dst(skb); 2193 2194 if (dst && dst_hold_safe(dst)) { 2195 rcu_assign_pointer(sk->sk_rx_dst, dst); 2196 sk->sk_rx_dst_ifindex = skb->skb_iif; 2197 } 2198 } 2199 EXPORT_SYMBOL(inet_sk_rx_dst_set); 2200 2201 const struct inet_connection_sock_af_ops ipv4_specific = { 2202 .queue_xmit = ip_queue_xmit, 2203 .send_check = tcp_v4_send_check, 2204 .rebuild_header = inet_sk_rebuild_header, 2205 .sk_rx_dst_set = inet_sk_rx_dst_set, 2206 .conn_request = tcp_v4_conn_request, 2207 .syn_recv_sock = tcp_v4_syn_recv_sock, 2208 .net_header_len = sizeof(struct iphdr), 2209 .setsockopt = ip_setsockopt, 2210 .getsockopt = ip_getsockopt, 2211 .addr2sockaddr = inet_csk_addr2sockaddr, 2212 .sockaddr_len = sizeof(struct sockaddr_in), 2213 .mtu_reduced = tcp_v4_mtu_reduced, 2214 }; 2215 EXPORT_SYMBOL(ipv4_specific); 2216 2217 #ifdef CONFIG_TCP_MD5SIG 2218 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { 2219 .md5_lookup = tcp_v4_md5_lookup, 2220 .calc_md5_hash = tcp_v4_md5_hash_skb, 2221 .md5_parse = tcp_v4_parse_md5_keys, 2222 }; 2223 #endif 2224 2225 /* NOTE: A lot of things set to zero explicitly by call to 2226 * sk_alloc() so need not be done here. 2227 */ 2228 static int tcp_v4_init_sock(struct sock *sk) 2229 { 2230 struct inet_connection_sock *icsk = inet_csk(sk); 2231 2232 tcp_init_sock(sk); 2233 2234 icsk->icsk_af_ops = &ipv4_specific; 2235 2236 #ifdef CONFIG_TCP_MD5SIG 2237 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific; 2238 #endif 2239 2240 return 0; 2241 } 2242 2243 void tcp_v4_destroy_sock(struct sock *sk) 2244 { 2245 struct tcp_sock *tp = tcp_sk(sk); 2246 2247 trace_tcp_destroy_sock(sk); 2248 2249 tcp_clear_xmit_timers(sk); 2250 2251 tcp_cleanup_congestion_control(sk); 2252 2253 tcp_cleanup_ulp(sk); 2254 2255 /* Cleanup up the write buffer. 
*/ 2256 tcp_write_queue_purge(sk); 2257 2258 /* Check if we want to disable active TFO */ 2259 tcp_fastopen_active_disable_ofo_check(sk); 2260 2261 /* Cleans up our, hopefully empty, out_of_order_queue. */ 2262 skb_rbtree_purge(&tp->out_of_order_queue); 2263 2264 #ifdef CONFIG_TCP_MD5SIG 2265 /* Clean up the MD5 key list, if any */ 2266 if (tp->md5sig_info) { 2267 tcp_clear_md5_list(sk); 2268 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu); 2269 tp->md5sig_info = NULL; 2270 } 2271 #endif 2272 2273 /* Clean up a referenced TCP bind bucket. */ 2274 if (inet_csk(sk)->icsk_bind_hash) 2275 inet_put_port(sk); 2276 2277 BUG_ON(rcu_access_pointer(tp->fastopen_rsk)); 2278 2279 /* If socket is aborted during connect operation */ 2280 tcp_free_fastopen_req(tp); 2281 tcp_fastopen_destroy_cipher(sk); 2282 tcp_saved_syn_free(tp); 2283 2284 sk_sockets_allocated_dec(sk); 2285 } 2286 EXPORT_SYMBOL(tcp_v4_destroy_sock); 2287 2288 #ifdef CONFIG_PROC_FS 2289 /* Proc filesystem TCP sock list dumping. */ 2290 2291 static unsigned short seq_file_family(const struct seq_file *seq); 2292 2293 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk) 2294 { 2295 unsigned short family = seq_file_family(seq); 2296 2297 /* AF_UNSPEC is used as a match all */ 2298 return ((family == AF_UNSPEC || family == sk->sk_family) && 2299 net_eq(sock_net(sk), seq_file_net(seq))); 2300 } 2301 2302 /* Find a non empty bucket (starting from st->bucket) 2303 * and return the first sk from it. 2304 */ 2305 static void *listening_get_first(struct seq_file *seq) 2306 { 2307 struct tcp_iter_state *st = seq->private; 2308 2309 st->offset = 0; 2310 for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) { 2311 struct inet_listen_hashbucket *ilb2; 2312 struct hlist_nulls_node *node; 2313 struct sock *sk; 2314 2315 ilb2 = &tcp_hashinfo.lhash2[st->bucket]; 2316 if (hlist_nulls_empty(&ilb2->nulls_head)) 2317 continue; 2318 2319 spin_lock(&ilb2->lock); 2320 sk_nulls_for_each(sk, node, &ilb2->nulls_head) { 2321 if (seq_sk_match(seq, sk)) 2322 return sk; 2323 } 2324 spin_unlock(&ilb2->lock); 2325 } 2326 2327 return NULL; 2328 } 2329 2330 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket). 2331 * If "cur" is the last one in the st->bucket, 2332 * call listening_get_first() to return the first sk of the next 2333 * non empty bucket. 2334 */ 2335 static void *listening_get_next(struct seq_file *seq, void *cur) 2336 { 2337 struct tcp_iter_state *st = seq->private; 2338 struct inet_listen_hashbucket *ilb2; 2339 struct hlist_nulls_node *node; 2340 struct sock *sk = cur; 2341 2342 ++st->num; 2343 ++st->offset; 2344 2345 sk = sk_nulls_next(sk); 2346 sk_nulls_for_each_from(sk, node) { 2347 if (seq_sk_match(seq, sk)) 2348 return sk; 2349 } 2350 2351 ilb2 = &tcp_hashinfo.lhash2[st->bucket]; 2352 spin_unlock(&ilb2->lock); 2353 ++st->bucket; 2354 return listening_get_first(seq); 2355 } 2356 2357 static void *listening_get_idx(struct seq_file *seq, loff_t *pos) 2358 { 2359 struct tcp_iter_state *st = seq->private; 2360 void *rc; 2361 2362 st->bucket = 0; 2363 st->offset = 0; 2364 rc = listening_get_first(seq); 2365 2366 while (rc && *pos) { 2367 rc = listening_get_next(seq, rc); 2368 --*pos; 2369 } 2370 return rc; 2371 } 2372 2373 static inline bool empty_bucket(const struct tcp_iter_state *st) 2374 { 2375 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain); 2376 } 2377 2378 /* 2379 * Get first established socket starting from bucket given in st->bucket. 
2380 * If st->bucket is zero, the very first socket in the hash is returned. 2381 */ 2382 static void *established_get_first(struct seq_file *seq) 2383 { 2384 struct tcp_iter_state *st = seq->private; 2385 2386 st->offset = 0; 2387 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) { 2388 struct sock *sk; 2389 struct hlist_nulls_node *node; 2390 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket); 2391 2392 /* Lockless fast path for the common case of empty buckets */ 2393 if (empty_bucket(st)) 2394 continue; 2395 2396 spin_lock_bh(lock); 2397 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { 2398 if (seq_sk_match(seq, sk)) 2399 return sk; 2400 } 2401 spin_unlock_bh(lock); 2402 } 2403 2404 return NULL; 2405 } 2406 2407 static void *established_get_next(struct seq_file *seq, void *cur) 2408 { 2409 struct sock *sk = cur; 2410 struct hlist_nulls_node *node; 2411 struct tcp_iter_state *st = seq->private; 2412 2413 ++st->num; 2414 ++st->offset; 2415 2416 sk = sk_nulls_next(sk); 2417 2418 sk_nulls_for_each_from(sk, node) { 2419 if (seq_sk_match(seq, sk)) 2420 return sk; 2421 } 2422 2423 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); 2424 ++st->bucket; 2425 return established_get_first(seq); 2426 } 2427 2428 static void *established_get_idx(struct seq_file *seq, loff_t pos) 2429 { 2430 struct tcp_iter_state *st = seq->private; 2431 void *rc; 2432 2433 st->bucket = 0; 2434 rc = established_get_first(seq); 2435 2436 while (rc && pos) { 2437 rc = established_get_next(seq, rc); 2438 --pos; 2439 } 2440 return rc; 2441 } 2442 2443 static void *tcp_get_idx(struct seq_file *seq, loff_t pos) 2444 { 2445 void *rc; 2446 struct tcp_iter_state *st = seq->private; 2447 2448 st->state = TCP_SEQ_STATE_LISTENING; 2449 rc = listening_get_idx(seq, &pos); 2450 2451 if (!rc) { 2452 st->state = TCP_SEQ_STATE_ESTABLISHED; 2453 rc = established_get_idx(seq, pos); 2454 } 2455 2456 return rc; 2457 } 2458 2459 static void *tcp_seek_last_pos(struct seq_file *seq) 2460 { 2461 struct tcp_iter_state *st = seq->private; 2462 int bucket = st->bucket; 2463 int offset = st->offset; 2464 int orig_num = st->num; 2465 void *rc = NULL; 2466 2467 switch (st->state) { 2468 case TCP_SEQ_STATE_LISTENING: 2469 if (st->bucket > tcp_hashinfo.lhash2_mask) 2470 break; 2471 st->state = TCP_SEQ_STATE_LISTENING; 2472 rc = listening_get_first(seq); 2473 while (offset-- && rc && bucket == st->bucket) 2474 rc = listening_get_next(seq, rc); 2475 if (rc) 2476 break; 2477 st->bucket = 0; 2478 st->state = TCP_SEQ_STATE_ESTABLISHED; 2479 fallthrough; 2480 case TCP_SEQ_STATE_ESTABLISHED: 2481 if (st->bucket > tcp_hashinfo.ehash_mask) 2482 break; 2483 rc = established_get_first(seq); 2484 while (offset-- && rc && bucket == st->bucket) 2485 rc = established_get_next(seq, rc); 2486 } 2487 2488 st->num = orig_num; 2489 2490 return rc; 2491 } 2492 2493 void *tcp_seq_start(struct seq_file *seq, loff_t *pos) 2494 { 2495 struct tcp_iter_state *st = seq->private; 2496 void *rc; 2497 2498 if (*pos && *pos == st->last_pos) { 2499 rc = tcp_seek_last_pos(seq); 2500 if (rc) 2501 goto out; 2502 } 2503 2504 st->state = TCP_SEQ_STATE_LISTENING; 2505 st->num = 0; 2506 st->bucket = 0; 2507 st->offset = 0; 2508 rc = *pos ? 
tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; 2509 2510 out: 2511 st->last_pos = *pos; 2512 return rc; 2513 } 2514 EXPORT_SYMBOL(tcp_seq_start); 2515 2516 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2517 { 2518 struct tcp_iter_state *st = seq->private; 2519 void *rc = NULL; 2520 2521 if (v == SEQ_START_TOKEN) { 2522 rc = tcp_get_idx(seq, 0); 2523 goto out; 2524 } 2525 2526 switch (st->state) { 2527 case TCP_SEQ_STATE_LISTENING: 2528 rc = listening_get_next(seq, v); 2529 if (!rc) { 2530 st->state = TCP_SEQ_STATE_ESTABLISHED; 2531 st->bucket = 0; 2532 st->offset = 0; 2533 rc = established_get_first(seq); 2534 } 2535 break; 2536 case TCP_SEQ_STATE_ESTABLISHED: 2537 rc = established_get_next(seq, v); 2538 break; 2539 } 2540 out: 2541 ++*pos; 2542 st->last_pos = *pos; 2543 return rc; 2544 } 2545 EXPORT_SYMBOL(tcp_seq_next); 2546 2547 void tcp_seq_stop(struct seq_file *seq, void *v) 2548 { 2549 struct tcp_iter_state *st = seq->private; 2550 2551 switch (st->state) { 2552 case TCP_SEQ_STATE_LISTENING: 2553 if (v != SEQ_START_TOKEN) 2554 spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock); 2555 break; 2556 case TCP_SEQ_STATE_ESTABLISHED: 2557 if (v) 2558 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); 2559 break; 2560 } 2561 } 2562 EXPORT_SYMBOL(tcp_seq_stop); 2563 2564 static void get_openreq4(const struct request_sock *req, 2565 struct seq_file *f, int i) 2566 { 2567 const struct inet_request_sock *ireq = inet_rsk(req); 2568 long delta = req->rsk_timer.expires - jiffies; 2569 2570 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2571 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK", 2572 i, 2573 ireq->ir_loc_addr, 2574 ireq->ir_num, 2575 ireq->ir_rmt_addr, 2576 ntohs(ireq->ir_rmt_port), 2577 TCP_SYN_RECV, 2578 0, 0, /* could print option size, but that is af dependent. */ 2579 1, /* timers active (only the expire timer) */ 2580 jiffies_delta_to_clock_t(delta), 2581 req->num_timeout, 2582 from_kuid_munged(seq_user_ns(f), 2583 sock_i_uid(req->rsk_listener)), 2584 0, /* non standard timer */ 2585 0, /* open_requests have no inode */ 2586 0, 2587 req); 2588 } 2589 2590 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) 2591 { 2592 int timer_active; 2593 unsigned long timer_expires; 2594 const struct tcp_sock *tp = tcp_sk(sk); 2595 const struct inet_connection_sock *icsk = inet_csk(sk); 2596 const struct inet_sock *inet = inet_sk(sk); 2597 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; 2598 __be32 dest = inet->inet_daddr; 2599 __be32 src = inet->inet_rcv_saddr; 2600 __u16 destp = ntohs(inet->inet_dport); 2601 __u16 srcp = ntohs(inet->inet_sport); 2602 int rx_queue; 2603 int state; 2604 2605 if (icsk->icsk_pending == ICSK_TIME_RETRANS || 2606 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || 2607 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { 2608 timer_active = 1; 2609 timer_expires = icsk->icsk_timeout; 2610 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { 2611 timer_active = 4; 2612 timer_expires = icsk->icsk_timeout; 2613 } else if (timer_pending(&sk->sk_timer)) { 2614 timer_active = 2; 2615 timer_expires = sk->sk_timer.expires; 2616 } else { 2617 timer_active = 0; 2618 timer_expires = jiffies; 2619 } 2620 2621 state = inet_sk_state_load(sk); 2622 if (state == TCP_LISTEN) 2623 rx_queue = READ_ONCE(sk->sk_ack_backlog); 2624 else 2625 /* Because we don't lock the socket, 2626 * we might find a transient negative value. 
2627 */ 2628 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) - 2629 READ_ONCE(tp->copied_seq), 0); 2630 2631 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " 2632 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d", 2633 i, src, srcp, dest, destp, state, 2634 READ_ONCE(tp->write_seq) - tp->snd_una, 2635 rx_queue, 2636 timer_active, 2637 jiffies_delta_to_clock_t(timer_expires - jiffies), 2638 icsk->icsk_retransmits, 2639 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)), 2640 icsk->icsk_probes_out, 2641 sock_i_ino(sk), 2642 refcount_read(&sk->sk_refcnt), sk, 2643 jiffies_to_clock_t(icsk->icsk_rto), 2644 jiffies_to_clock_t(icsk->icsk_ack.ato), 2645 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk), 2646 tcp_snd_cwnd(tp), 2647 state == TCP_LISTEN ? 2648 fastopenq->max_qlen : 2649 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh)); 2650 } 2651 2652 static void get_timewait4_sock(const struct inet_timewait_sock *tw, 2653 struct seq_file *f, int i) 2654 { 2655 long delta = tw->tw_timer.expires - jiffies; 2656 __be32 dest, src; 2657 __u16 destp, srcp; 2658 2659 dest = tw->tw_daddr; 2660 src = tw->tw_rcv_saddr; 2661 destp = ntohs(tw->tw_dport); 2662 srcp = ntohs(tw->tw_sport); 2663 2664 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2665 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK", 2666 i, src, srcp, dest, destp, tw->tw_substate, 0, 0, 2667 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, 2668 refcount_read(&tw->tw_refcnt), tw); 2669 } 2670 2671 #define TMPSZ 150 2672 2673 static int tcp4_seq_show(struct seq_file *seq, void *v) 2674 { 2675 struct tcp_iter_state *st; 2676 struct sock *sk = v; 2677 2678 seq_setwidth(seq, TMPSZ - 1); 2679 if (v == SEQ_START_TOKEN) { 2680 seq_puts(seq, " sl local_address rem_address st tx_queue " 2681 "rx_queue tr tm->when retrnsmt uid timeout " 2682 "inode"); 2683 goto out; 2684 } 2685 st = seq->private; 2686 2687 if (sk->sk_state == TCP_TIME_WAIT) 2688 get_timewait4_sock(v, seq, st->num); 2689 else if (sk->sk_state == TCP_NEW_SYN_RECV) 2690 get_openreq4(v, seq, st->num); 2691 else 2692 get_tcp4_sock(v, seq, st->num); 2693 out: 2694 seq_pad(seq, '\n'); 2695 return 0; 2696 } 2697 2698 #ifdef CONFIG_BPF_SYSCALL 2699 struct bpf_tcp_iter_state { 2700 struct tcp_iter_state state; 2701 unsigned int cur_sk; 2702 unsigned int end_sk; 2703 unsigned int max_sk; 2704 struct sock **batch; 2705 bool st_bucket_done; 2706 }; 2707 2708 struct bpf_iter__tcp { 2709 __bpf_md_ptr(struct bpf_iter_meta *, meta); 2710 __bpf_md_ptr(struct sock_common *, sk_common); 2711 uid_t uid __aligned(8); 2712 }; 2713 2714 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, 2715 struct sock_common *sk_common, uid_t uid) 2716 { 2717 struct bpf_iter__tcp ctx; 2718 2719 meta->seq_num--; /* skip SEQ_START_TOKEN */ 2720 ctx.meta = meta; 2721 ctx.sk_common = sk_common; 2722 ctx.uid = uid; 2723 return bpf_iter_run_prog(prog, &ctx); 2724 } 2725 2726 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter) 2727 { 2728 while (iter->cur_sk < iter->end_sk) 2729 sock_put(iter->batch[iter->cur_sk++]); 2730 } 2731 2732 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter, 2733 unsigned int new_batch_sz) 2734 { 2735 struct sock **new_batch; 2736 2737 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, 2738 GFP_USER | __GFP_NOWARN); 2739 if (!new_batch) 2740 return -ENOMEM; 2741 2742 bpf_iter_tcp_put_batch(iter); 2743 kvfree(iter->batch); 2744 iter->batch = new_batch; 2745 iter->max_sk = new_batch_sz; 2746 2747 return 0; 2748 } 2749 2750 
static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq, 2751 struct sock *start_sk) 2752 { 2753 struct bpf_tcp_iter_state *iter = seq->private; 2754 struct tcp_iter_state *st = &iter->state; 2755 struct hlist_nulls_node *node; 2756 unsigned int expected = 1; 2757 struct sock *sk; 2758 2759 sock_hold(start_sk); 2760 iter->batch[iter->end_sk++] = start_sk; 2761 2762 sk = sk_nulls_next(start_sk); 2763 sk_nulls_for_each_from(sk, node) { 2764 if (seq_sk_match(seq, sk)) { 2765 if (iter->end_sk < iter->max_sk) { 2766 sock_hold(sk); 2767 iter->batch[iter->end_sk++] = sk; 2768 } 2769 expected++; 2770 } 2771 } 2772 spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock); 2773 2774 return expected; 2775 } 2776 2777 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq, 2778 struct sock *start_sk) 2779 { 2780 struct bpf_tcp_iter_state *iter = seq->private; 2781 struct tcp_iter_state *st = &iter->state; 2782 struct hlist_nulls_node *node; 2783 unsigned int expected = 1; 2784 struct sock *sk; 2785 2786 sock_hold(start_sk); 2787 iter->batch[iter->end_sk++] = start_sk; 2788 2789 sk = sk_nulls_next(start_sk); 2790 sk_nulls_for_each_from(sk, node) { 2791 if (seq_sk_match(seq, sk)) { 2792 if (iter->end_sk < iter->max_sk) { 2793 sock_hold(sk); 2794 iter->batch[iter->end_sk++] = sk; 2795 } 2796 expected++; 2797 } 2798 } 2799 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); 2800 2801 return expected; 2802 } 2803 2804 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq) 2805 { 2806 struct bpf_tcp_iter_state *iter = seq->private; 2807 struct tcp_iter_state *st = &iter->state; 2808 unsigned int expected; 2809 bool resized = false; 2810 struct sock *sk; 2811 2812 /* The st->bucket is done. Directly advance to the next 2813 * bucket instead of having the tcp_seek_last_pos() to skip 2814 * one by one in the current bucket and eventually find out 2815 * it has to advance to the next bucket. 2816 */ 2817 if (iter->st_bucket_done) { 2818 st->offset = 0; 2819 st->bucket++; 2820 if (st->state == TCP_SEQ_STATE_LISTENING && 2821 st->bucket > tcp_hashinfo.lhash2_mask) { 2822 st->state = TCP_SEQ_STATE_ESTABLISHED; 2823 st->bucket = 0; 2824 } 2825 } 2826 2827 again: 2828 /* Get a new batch */ 2829 iter->cur_sk = 0; 2830 iter->end_sk = 0; 2831 iter->st_bucket_done = false; 2832 2833 sk = tcp_seek_last_pos(seq); 2834 if (!sk) 2835 return NULL; /* Done */ 2836 2837 if (st->state == TCP_SEQ_STATE_LISTENING) 2838 expected = bpf_iter_tcp_listening_batch(seq, sk); 2839 else 2840 expected = bpf_iter_tcp_established_batch(seq, sk); 2841 2842 if (iter->end_sk == expected) { 2843 iter->st_bucket_done = true; 2844 return sk; 2845 } 2846 2847 if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) { 2848 resized = true; 2849 goto again; 2850 } 2851 2852 return sk; 2853 } 2854 2855 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos) 2856 { 2857 /* bpf iter does not support lseek, so it always 2858 * continue from where it was stop()-ped. 2859 */ 2860 if (*pos) 2861 return bpf_iter_tcp_batch(seq); 2862 2863 return SEQ_START_TOKEN; 2864 } 2865 2866 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2867 { 2868 struct bpf_tcp_iter_state *iter = seq->private; 2869 struct tcp_iter_state *st = &iter->state; 2870 struct sock *sk; 2871 2872 /* Whenever seq_next() is called, the iter->cur_sk is 2873 * done with seq_show(), so advance to the next sk in 2874 * the batch. 
2875 */ 2876 if (iter->cur_sk < iter->end_sk) { 2877 /* Keeping st->num consistent in tcp_iter_state. 2878 * bpf_iter_tcp does not use st->num. 2879 * meta.seq_num is used instead. 2880 */ 2881 st->num++; 2882 /* Move st->offset to the next sk in the bucket such that 2883 * the future start() will resume at st->offset in 2884 * st->bucket. See tcp_seek_last_pos(). 2885 */ 2886 st->offset++; 2887 sock_put(iter->batch[iter->cur_sk++]); 2888 } 2889 2890 if (iter->cur_sk < iter->end_sk) 2891 sk = iter->batch[iter->cur_sk]; 2892 else 2893 sk = bpf_iter_tcp_batch(seq); 2894 2895 ++*pos; 2896 /* Keeping st->last_pos consistent in tcp_iter_state. 2897 * bpf iter does not do lseek, so st->last_pos always equals to *pos. 2898 */ 2899 st->last_pos = *pos; 2900 return sk; 2901 } 2902 2903 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v) 2904 { 2905 struct bpf_iter_meta meta; 2906 struct bpf_prog *prog; 2907 struct sock *sk = v; 2908 bool slow; 2909 uid_t uid; 2910 int ret; 2911 2912 if (v == SEQ_START_TOKEN) 2913 return 0; 2914 2915 if (sk_fullsock(sk)) 2916 slow = lock_sock_fast(sk); 2917 2918 if (unlikely(sk_unhashed(sk))) { 2919 ret = SEQ_SKIP; 2920 goto unlock; 2921 } 2922 2923 if (sk->sk_state == TCP_TIME_WAIT) { 2924 uid = 0; 2925 } else if (sk->sk_state == TCP_NEW_SYN_RECV) { 2926 const struct request_sock *req = v; 2927 2928 uid = from_kuid_munged(seq_user_ns(seq), 2929 sock_i_uid(req->rsk_listener)); 2930 } else { 2931 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); 2932 } 2933 2934 meta.seq = seq; 2935 prog = bpf_iter_get_info(&meta, false); 2936 ret = tcp_prog_seq_show(prog, &meta, v, uid); 2937 2938 unlock: 2939 if (sk_fullsock(sk)) 2940 unlock_sock_fast(sk, slow); 2941 return ret; 2942 2943 } 2944 2945 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v) 2946 { 2947 struct bpf_tcp_iter_state *iter = seq->private; 2948 struct bpf_iter_meta meta; 2949 struct bpf_prog *prog; 2950 2951 if (!v) { 2952 meta.seq = seq; 2953 prog = bpf_iter_get_info(&meta, true); 2954 if (prog) 2955 (void)tcp_prog_seq_show(prog, &meta, v, 0); 2956 } 2957 2958 if (iter->cur_sk < iter->end_sk) { 2959 bpf_iter_tcp_put_batch(iter); 2960 iter->st_bucket_done = false; 2961 } 2962 } 2963 2964 static const struct seq_operations bpf_iter_tcp_seq_ops = { 2965 .show = bpf_iter_tcp_seq_show, 2966 .start = bpf_iter_tcp_seq_start, 2967 .next = bpf_iter_tcp_seq_next, 2968 .stop = bpf_iter_tcp_seq_stop, 2969 }; 2970 #endif 2971 static unsigned short seq_file_family(const struct seq_file *seq) 2972 { 2973 const struct tcp_seq_afinfo *afinfo; 2974 2975 #ifdef CONFIG_BPF_SYSCALL 2976 /* Iterated from bpf_iter. Let the bpf prog to filter instead. 
*/ 2977 if (seq->op == &bpf_iter_tcp_seq_ops) 2978 return AF_UNSPEC; 2979 #endif 2980 2981 /* Iterated from proc fs */ 2982 afinfo = pde_data(file_inode(seq->file)); 2983 return afinfo->family; 2984 } 2985 2986 static const struct seq_operations tcp4_seq_ops = { 2987 .show = tcp4_seq_show, 2988 .start = tcp_seq_start, 2989 .next = tcp_seq_next, 2990 .stop = tcp_seq_stop, 2991 }; 2992 2993 static struct tcp_seq_afinfo tcp4_seq_afinfo = { 2994 .family = AF_INET, 2995 }; 2996 2997 static int __net_init tcp4_proc_init_net(struct net *net) 2998 { 2999 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops, 3000 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo)) 3001 return -ENOMEM; 3002 return 0; 3003 } 3004 3005 static void __net_exit tcp4_proc_exit_net(struct net *net) 3006 { 3007 remove_proc_entry("tcp", net->proc_net); 3008 } 3009 3010 static struct pernet_operations tcp4_net_ops = { 3011 .init = tcp4_proc_init_net, 3012 .exit = tcp4_proc_exit_net, 3013 }; 3014 3015 int __init tcp4_proc_init(void) 3016 { 3017 return register_pernet_subsys(&tcp4_net_ops); 3018 } 3019 3020 void tcp4_proc_exit(void) 3021 { 3022 unregister_pernet_subsys(&tcp4_net_ops); 3023 } 3024 #endif /* CONFIG_PROC_FS */ 3025 3026 /* @wake is one when sk_stream_write_space() calls us. 3027 * This sends EPOLLOUT only if notsent_bytes is half the limit. 3028 * This mimics the strategy used in sock_def_write_space(). 3029 */ 3030 bool tcp_stream_memory_free(const struct sock *sk, int wake) 3031 { 3032 const struct tcp_sock *tp = tcp_sk(sk); 3033 u32 notsent_bytes = READ_ONCE(tp->write_seq) - 3034 READ_ONCE(tp->snd_nxt); 3035 3036 return (notsent_bytes << wake) < tcp_notsent_lowat(tp); 3037 } 3038 EXPORT_SYMBOL(tcp_stream_memory_free); 3039 3040 struct proto tcp_prot = { 3041 .name = "TCP", 3042 .owner = THIS_MODULE, 3043 .close = tcp_close, 3044 .pre_connect = tcp_v4_pre_connect, 3045 .connect = tcp_v4_connect, 3046 .disconnect = tcp_disconnect, 3047 .accept = inet_csk_accept, 3048 .ioctl = tcp_ioctl, 3049 .init = tcp_v4_init_sock, 3050 .destroy = tcp_v4_destroy_sock, 3051 .shutdown = tcp_shutdown, 3052 .setsockopt = tcp_setsockopt, 3053 .getsockopt = tcp_getsockopt, 3054 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt, 3055 .keepalive = tcp_set_keepalive, 3056 .recvmsg = tcp_recvmsg, 3057 .sendmsg = tcp_sendmsg, 3058 .sendpage = tcp_sendpage, 3059 .backlog_rcv = tcp_v4_do_rcv, 3060 .release_cb = tcp_release_cb, 3061 .hash = inet_hash, 3062 .unhash = inet_unhash, 3063 .get_port = inet_csk_get_port, 3064 .put_port = inet_put_port, 3065 #ifdef CONFIG_BPF_SYSCALL 3066 .psock_update_sk_prot = tcp_bpf_update_proto, 3067 #endif 3068 .enter_memory_pressure = tcp_enter_memory_pressure, 3069 .leave_memory_pressure = tcp_leave_memory_pressure, 3070 .stream_memory_free = tcp_stream_memory_free, 3071 .sockets_allocated = &tcp_sockets_allocated, 3072 .orphan_count = &tcp_orphan_count, 3073 3074 .memory_allocated = &tcp_memory_allocated, 3075 .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, 3076 3077 .memory_pressure = &tcp_memory_pressure, 3078 .sysctl_mem = sysctl_tcp_mem, 3079 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), 3080 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), 3081 .max_header = MAX_TCP_HEADER, 3082 .obj_size = sizeof(struct tcp_sock), 3083 .slab_flags = SLAB_TYPESAFE_BY_RCU, 3084 .twsk_prot = &tcp_timewait_sock_ops, 3085 .rsk_prot = &tcp_request_sock_ops, 3086 .h.hashinfo = &tcp_hashinfo, 3087 .no_autobind = true, 3088 .diag_destroy = tcp_abort, 3089 }; 3090 
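/*
 * Illustration (not kernel code): tcp_stream_memory_free() above ties
 * EPOLLOUT wakeups to the amount of not-yet-sent data, bounded by the
 * TCP_NOTSENT_LOWAT socket option or the net.ipv4.tcp_notsent_lowat sysctl
 * (initialised to UINT_MAX in tcp_sk_init() below).  A minimal userspace
 * sketch of setting that option follows; the 128 KB value is an arbitrary
 * example, and it assumes a libc that exposes TCP_NOTSENT_LOWAT in
 * <netinet/tcp.h>.
 */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int lowat = 128 * 1024;	/* report writable only while < 128 KB is unsent */
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0)
		return 1;
	if (setsockopt(fd, IPPROTO_TCP, TCP_NOTSENT_LOWAT,
		       &lowat, sizeof(lowat)) < 0)
		perror("setsockopt(TCP_NOTSENT_LOWAT)");
	close(fd);
	return 0;
}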
EXPORT_SYMBOL(tcp_prot); 3091 3092 static void __net_exit tcp_sk_exit(struct net *net) 3093 { 3094 struct inet_timewait_death_row *tcp_death_row = net->ipv4.tcp_death_row; 3095 3096 if (net->ipv4.tcp_congestion_control) 3097 bpf_module_put(net->ipv4.tcp_congestion_control, 3098 net->ipv4.tcp_congestion_control->owner); 3099 if (refcount_dec_and_test(&tcp_death_row->tw_refcount)) 3100 kfree(tcp_death_row); 3101 } 3102 3103 static int __net_init tcp_sk_init(struct net *net) 3104 { 3105 int cnt; 3106 3107 net->ipv4.sysctl_tcp_ecn = 2; 3108 net->ipv4.sysctl_tcp_ecn_fallback = 1; 3109 3110 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; 3111 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS; 3112 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; 3113 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL; 3114 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS; 3115 3116 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME; 3117 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES; 3118 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL; 3119 3120 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES; 3121 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES; 3122 net->ipv4.sysctl_tcp_syncookies = 1; 3123 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH; 3124 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1; 3125 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2; 3126 net->ipv4.sysctl_tcp_orphan_retries = 0; 3127 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; 3128 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; 3129 net->ipv4.sysctl_tcp_tw_reuse = 2; 3130 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1; 3131 3132 net->ipv4.tcp_death_row = kzalloc(sizeof(struct inet_timewait_death_row), GFP_KERNEL); 3133 if (!net->ipv4.tcp_death_row) 3134 return -ENOMEM; 3135 refcount_set(&net->ipv4.tcp_death_row->tw_refcount, 1); 3136 cnt = tcp_hashinfo.ehash_mask + 1; 3137 net->ipv4.tcp_death_row->sysctl_max_tw_buckets = cnt / 2; 3138 net->ipv4.tcp_death_row->hashinfo = &tcp_hashinfo; 3139 3140 net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128); 3141 net->ipv4.sysctl_tcp_sack = 1; 3142 net->ipv4.sysctl_tcp_window_scaling = 1; 3143 net->ipv4.sysctl_tcp_timestamps = 1; 3144 net->ipv4.sysctl_tcp_early_retrans = 3; 3145 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION; 3146 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */ 3147 net->ipv4.sysctl_tcp_retrans_collapse = 1; 3148 net->ipv4.sysctl_tcp_max_reordering = 300; 3149 net->ipv4.sysctl_tcp_dsack = 1; 3150 net->ipv4.sysctl_tcp_app_win = 31; 3151 net->ipv4.sysctl_tcp_adv_win_scale = 1; 3152 net->ipv4.sysctl_tcp_frto = 2; 3153 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1; 3154 /* This limits the percentage of the congestion window which we 3155 * will allow a single TSO frame to consume. Building TSO frames 3156 * which are too large can cause TCP streams to be bursty. 3157 */ 3158 net->ipv4.sysctl_tcp_tso_win_divisor = 3; 3159 /* Default TSQ limit of 16 TSO segments */ 3160 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536; 3161 3162 /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. 
*/ 3163 net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX; 3164 3165 net->ipv4.sysctl_tcp_min_tso_segs = 2; 3166 net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */ 3167 net->ipv4.sysctl_tcp_min_rtt_wlen = 300; 3168 net->ipv4.sysctl_tcp_autocorking = 1; 3169 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; 3170 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; 3171 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120; 3172 if (net != &init_net) { 3173 memcpy(net->ipv4.sysctl_tcp_rmem, 3174 init_net.ipv4.sysctl_tcp_rmem, 3175 sizeof(init_net.ipv4.sysctl_tcp_rmem)); 3176 memcpy(net->ipv4.sysctl_tcp_wmem, 3177 init_net.ipv4.sysctl_tcp_wmem, 3178 sizeof(init_net.ipv4.sysctl_tcp_wmem)); 3179 } 3180 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC; 3181 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC; 3182 net->ipv4.sysctl_tcp_comp_sack_nr = 44; 3183 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; 3184 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0; 3185 atomic_set(&net->ipv4.tfo_active_disable_times, 0); 3186 3187 /* Reno is always built in */ 3188 if (!net_eq(net, &init_net) && 3189 bpf_try_module_get(init_net.ipv4.tcp_congestion_control, 3190 init_net.ipv4.tcp_congestion_control->owner)) 3191 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control; 3192 else 3193 net->ipv4.tcp_congestion_control = &tcp_reno; 3194 3195 return 0; 3196 } 3197 3198 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) 3199 { 3200 struct net *net; 3201 3202 inet_twsk_purge(&tcp_hashinfo, AF_INET); 3203 3204 list_for_each_entry(net, net_exit_list, exit_list) 3205 tcp_fastopen_ctx_destroy(net); 3206 } 3207 3208 static struct pernet_operations __net_initdata tcp_sk_ops = { 3209 .init = tcp_sk_init, 3210 .exit = tcp_sk_exit, 3211 .exit_batch = tcp_sk_exit_batch, 3212 }; 3213 3214 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3215 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta, 3216 struct sock_common *sk_common, uid_t uid) 3217 3218 #define INIT_BATCH_SZ 16 3219 3220 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux) 3221 { 3222 struct bpf_tcp_iter_state *iter = priv_data; 3223 int err; 3224 3225 err = bpf_iter_init_seq_net(priv_data, aux); 3226 if (err) 3227 return err; 3228 3229 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ); 3230 if (err) { 3231 bpf_iter_fini_seq_net(priv_data); 3232 return err; 3233 } 3234 3235 return 0; 3236 } 3237 3238 static void bpf_iter_fini_tcp(void *priv_data) 3239 { 3240 struct bpf_tcp_iter_state *iter = priv_data; 3241 3242 bpf_iter_fini_seq_net(priv_data); 3243 kvfree(iter->batch); 3244 } 3245 3246 static const struct bpf_iter_seq_info tcp_seq_info = { 3247 .seq_ops = &bpf_iter_tcp_seq_ops, 3248 .init_seq_private = bpf_iter_init_tcp, 3249 .fini_seq_private = bpf_iter_fini_tcp, 3250 .seq_priv_size = sizeof(struct bpf_tcp_iter_state), 3251 }; 3252 3253 static const struct bpf_func_proto * 3254 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id, 3255 const struct bpf_prog *prog) 3256 { 3257 switch (func_id) { 3258 case BPF_FUNC_setsockopt: 3259 return &bpf_sk_setsockopt_proto; 3260 case BPF_FUNC_getsockopt: 3261 return &bpf_sk_getsockopt_proto; 3262 default: 3263 return NULL; 3264 } 3265 } 3266 3267 static struct bpf_iter_reg tcp_reg_info = { 3268 .target = "tcp", 3269 .ctx_arg_info_size = 1, 3270 .ctx_arg_info = { 3271 { offsetof(struct bpf_iter__tcp, sk_common), 3272 PTR_TO_BTF_ID_OR_NULL }, 3273 }, 3274 .get_func_proto = bpf_iter_tcp_get_func_proto, 3275 .seq_info = &tcp_seq_info, 
3276 }; 3277 3278 static void __init bpf_iter_register(void) 3279 { 3280 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON]; 3281 if (bpf_iter_reg_target(&tcp_reg_info)) 3282 pr_warn("Warning: could not register bpf iterator tcp\n"); 3283 } 3284 3285 #endif 3286 3287 void __init tcp_v4_init(void) 3288 { 3289 int cpu, res; 3290 3291 for_each_possible_cpu(cpu) { 3292 struct sock *sk; 3293 3294 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW, 3295 IPPROTO_TCP, &init_net); 3296 if (res) 3297 panic("Failed to create the TCP control socket.\n"); 3298 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); 3299 3300 /* Please enforce IP_DF and IPID==0 for RST and 3301 * ACK sent in SYN-RECV and TIME-WAIT state. 3302 */ 3303 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO; 3304 3305 per_cpu(ipv4_tcp_sk, cpu) = sk; 3306 } 3307 if (register_pernet_subsys(&tcp_sk_ops)) 3308 panic("Failed to create the TCP control socket.\n"); 3309 3310 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3311 bpf_iter_register(); 3312 #endif 3313 } 3314
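/*
 * Illustration (not kernel code): a small userspace reader for the
 * /proc/net/tcp format produced by tcp4_seq_show()/get_tcp4_sock() above
 * (hex address:port pairs followed by the hex socket state).  It is only a
 * sketch of how that seq_printf() layout can be consumed.
 */
#include <arpa/inet.h>
#include <stdio.h>

int main(void)
{
	char line[512];
	FILE *f = fopen("/proc/net/tcp", "r");

	if (!f)
		return 1;
	if (!fgets(line, sizeof(line), f)) {	/* skip the header line */
		fclose(f);
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		unsigned int laddr, lport, raddr, rport, state;
		char lip[INET_ADDRSTRLEN], rip[INET_ADDRSTRLEN];
		struct in_addr la, ra;

		if (sscanf(line, " %*d: %8X:%4X %8X:%4X %2X",
			   &laddr, &lport, &raddr, &rport, &state) != 5)
			continue;
		/* The hex fields are the raw 32-bit values, so storing them
		 * back reproduces the network-byte-order addresses.
		 */
		la.s_addr = laddr;
		ra.s_addr = raddr;
		inet_ntop(AF_INET, &la, lip, sizeof(lip));
		inet_ntop(AF_INET, &ra, rip, sizeof(rip));
		printf("%s:%u -> %s:%u state %02X\n",
		       lip, lport, rip, rport, state);
	}
	fclose(f);
	return 0;
}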