1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * INET An implementation of the TCP/IP protocol suite for the LINUX 4 * operating system. INET is implemented using the BSD Socket 5 * interface as the means of communication with the user level. 6 * 7 * Implementation of the Transmission Control Protocol(TCP). 8 * 9 * IPv4 specific functions 10 * 11 * code split from: 12 * linux/ipv4/tcp.c 13 * linux/ipv4/tcp_input.c 14 * linux/ipv4/tcp_output.c 15 * 16 * See tcp.c for author information 17 */ 18 19 /* 20 * Changes: 21 * David S. Miller : New socket lookup architecture. 22 * This code is dedicated to John Dyson. 23 * David S. Miller : Change semantics of established hash, 24 * half is devoted to TIME_WAIT sockets 25 * and the rest go in the other half. 26 * Andi Kleen : Add support for syncookies and fixed 27 * some bugs: ip options weren't passed to 28 * the TCP layer, missed a check for an 29 * ACK bit. 30 * Andi Kleen : Implemented fast path mtu discovery. 31 * Fixed many serious bugs in the 32 * request_sock handling and moved 33 * most of it into the af independent code. 34 * Added tail drop and some other bugfixes. 35 * Added new listen semantics. 36 * Mike McLagan : Routing by source 37 * Juan Jose Ciarlante: ip_dynaddr bits 38 * Andi Kleen: various fixes. 39 * Vitaly E. Lavrov : Transparent proxy revived after year 40 * coma. 41 * Andi Kleen : Fix new listen. 42 * Andi Kleen : Fix accept error reporting. 43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which 44 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind 45 * a single port at the same time. 
46 */ 47 48 #define pr_fmt(fmt) "TCP: " fmt 49 50 #include <linux/bottom_half.h> 51 #include <linux/types.h> 52 #include <linux/fcntl.h> 53 #include <linux/module.h> 54 #include <linux/random.h> 55 #include <linux/cache.h> 56 #include <linux/jhash.h> 57 #include <linux/init.h> 58 #include <linux/times.h> 59 #include <linux/slab.h> 60 61 #include <net/net_namespace.h> 62 #include <net/icmp.h> 63 #include <net/inet_hashtables.h> 64 #include <net/tcp.h> 65 #include <net/transp_v6.h> 66 #include <net/ipv6.h> 67 #include <net/inet_common.h> 68 #include <net/timewait_sock.h> 69 #include <net/xfrm.h> 70 #include <net/secure_seq.h> 71 #include <net/busy_poll.h> 72 73 #include <linux/inet.h> 74 #include <linux/ipv6.h> 75 #include <linux/stddef.h> 76 #include <linux/proc_fs.h> 77 #include <linux/seq_file.h> 78 #include <linux/inetdevice.h> 79 #include <linux/btf_ids.h> 80 81 #include <crypto/hash.h> 82 #include <linux/scatterlist.h> 83 84 #include <trace/events/tcp.h> 85 86 #ifdef CONFIG_TCP_MD5SIG 87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, 88 __be32 daddr, __be32 saddr, const struct tcphdr *th); 89 #endif 90 91 struct inet_hashinfo tcp_hashinfo; 92 EXPORT_SYMBOL(tcp_hashinfo); 93 94 static u32 tcp_v4_init_seq(const struct sk_buff *skb) 95 { 96 return secure_tcp_seq(ip_hdr(skb)->daddr, 97 ip_hdr(skb)->saddr, 98 tcp_hdr(skb)->dest, 99 tcp_hdr(skb)->source); 100 } 101 102 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb) 103 { 104 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr); 105 } 106 107 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) 108 { 109 const struct inet_timewait_sock *tw = inet_twsk(sktw); 110 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw); 111 struct tcp_sock *tp = tcp_sk(sk); 112 int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse; 113 114 if (reuse == 2) { 115 /* Still does not detect *everything* that goes through 116 * lo, since we require a 
loopback src or dst address 117 * or direct binding to 'lo' interface. 118 */ 119 bool loopback = false; 120 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX) 121 loopback = true; 122 #if IS_ENABLED(CONFIG_IPV6) 123 if (tw->tw_family == AF_INET6) { 124 if (ipv6_addr_loopback(&tw->tw_v6_daddr) || 125 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) || 126 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) || 127 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr)) 128 loopback = true; 129 } else 130 #endif 131 { 132 if (ipv4_is_loopback(tw->tw_daddr) || 133 ipv4_is_loopback(tw->tw_rcv_saddr)) 134 loopback = true; 135 } 136 if (!loopback) 137 reuse = 0; 138 } 139 140 /* With PAWS, it is safe from the viewpoint 141 of data integrity. Even without PAWS it is safe provided sequence 142 spaces do not overlap i.e. at data rates <= 80Mbit/sec. 143 144 Actually, the idea is close to VJ's one, only timestamp cache is 145 held not per host, but per port pair and TW bucket is used as state 146 holder. 147 148 If TW bucket has been already destroyed we fall back to VJ's scheme 149 and use initial timestamp retrieved from peer table. 150 */ 151 if (tcptw->tw_ts_recent_stamp && 152 (!twp || (reuse && time_after32(ktime_get_seconds(), 153 tcptw->tw_ts_recent_stamp)))) { 154 /* In case of repair and re-using TIME-WAIT sockets we still 155 * want to be sure that it is safe as above but honor the 156 * sequence numbers and time stamps set as part of the repair 157 * process. 158 * 159 * Without this check re-using a TIME-WAIT socket with TCP 160 * repair would accumulate a -1 on the repair assigned 161 * sequence number. The first time it is reused the sequence 162 * is -1, the second time -2, etc. This fixes that issue 163 * without appearing to create any others. 
164 */ 165 if (likely(!tp->repair)) { 166 u32 seq = tcptw->tw_snd_nxt + 65535 + 2; 167 168 if (!seq) 169 seq = 1; 170 WRITE_ONCE(tp->write_seq, seq); 171 tp->rx_opt.ts_recent = tcptw->tw_ts_recent; 172 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; 173 } 174 sock_hold(sktw); 175 return 1; 176 } 177 178 return 0; 179 } 180 EXPORT_SYMBOL_GPL(tcp_twsk_unique); 181 182 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr, 183 int addr_len) 184 { 185 /* This check is replicated from tcp_v4_connect() and intended to 186 * prevent BPF program called below from accessing bytes that are out 187 * of the bound specified by user in addr_len. 188 */ 189 if (addr_len < sizeof(struct sockaddr_in)) 190 return -EINVAL; 191 192 sock_owned_by_me(sk); 193 194 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr); 195 } 196 197 /* This will initiate an outgoing connection. */ 198 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) 199 { 200 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; 201 struct inet_sock *inet = inet_sk(sk); 202 struct tcp_sock *tp = tcp_sk(sk); 203 __be16 orig_sport, orig_dport; 204 __be32 daddr, nexthop; 205 struct flowi4 *fl4; 206 struct rtable *rt; 207 int err; 208 struct ip_options_rcu *inet_opt; 209 struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; 210 211 if (addr_len < sizeof(struct sockaddr_in)) 212 return -EINVAL; 213 214 if (usin->sin_family != AF_INET) 215 return -EAFNOSUPPORT; 216 217 nexthop = daddr = usin->sin_addr.s_addr; 218 inet_opt = rcu_dereference_protected(inet->inet_opt, 219 lockdep_sock_is_held(sk)); 220 if (inet_opt && inet_opt->opt.srr) { 221 if (!daddr) 222 return -EINVAL; 223 nexthop = inet_opt->opt.faddr; 224 } 225 226 orig_sport = inet->inet_sport; 227 orig_dport = usin->sin_port; 228 fl4 = &inet->cork.fl.u.ip4; 229 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr, 230 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, 231 IPPROTO_TCP, 232 orig_sport, 
orig_dport, sk); 233 if (IS_ERR(rt)) { 234 err = PTR_ERR(rt); 235 if (err == -ENETUNREACH) 236 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); 237 return err; 238 } 239 240 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { 241 ip_rt_put(rt); 242 return -ENETUNREACH; 243 } 244 245 if (!inet_opt || !inet_opt->opt.srr) 246 daddr = fl4->daddr; 247 248 if (!inet->inet_saddr) 249 inet->inet_saddr = fl4->saddr; 250 sk_rcv_saddr_set(sk, inet->inet_saddr); 251 252 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) { 253 /* Reset inherited state */ 254 tp->rx_opt.ts_recent = 0; 255 tp->rx_opt.ts_recent_stamp = 0; 256 if (likely(!tp->repair)) 257 WRITE_ONCE(tp->write_seq, 0); 258 } 259 260 inet->inet_dport = usin->sin_port; 261 sk_daddr_set(sk, daddr); 262 263 inet_csk(sk)->icsk_ext_hdr_len = 0; 264 if (inet_opt) 265 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 266 267 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT; 268 269 /* Socket identity is still unknown (sport may be zero). 270 * However we set state to SYN-SENT and not releasing socket 271 * lock select source port, enter ourselves into the hash tables and 272 * complete initialization after this. 273 */ 274 tcp_set_state(sk, TCP_SYN_SENT); 275 err = inet_hash_connect(tcp_death_row, sk); 276 if (err) 277 goto failure; 278 279 sk_set_txhash(sk); 280 281 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport, 282 inet->inet_sport, inet->inet_dport, sk); 283 if (IS_ERR(rt)) { 284 err = PTR_ERR(rt); 285 rt = NULL; 286 goto failure; 287 } 288 /* OK, now commit destination to socket. 
*/ 289 sk->sk_gso_type = SKB_GSO_TCPV4; 290 sk_setup_caps(sk, &rt->dst); 291 rt = NULL; 292 293 if (likely(!tp->repair)) { 294 if (!tp->write_seq) 295 WRITE_ONCE(tp->write_seq, 296 secure_tcp_seq(inet->inet_saddr, 297 inet->inet_daddr, 298 inet->inet_sport, 299 usin->sin_port)); 300 tp->tsoffset = secure_tcp_ts_off(sock_net(sk), 301 inet->inet_saddr, 302 inet->inet_daddr); 303 } 304 305 inet->inet_id = prandom_u32(); 306 307 if (tcp_fastopen_defer_connect(sk, &err)) 308 return err; 309 if (err) 310 goto failure; 311 312 err = tcp_connect(sk); 313 314 if (err) 315 goto failure; 316 317 return 0; 318 319 failure: 320 /* 321 * This unhashes the socket and releases the local port, 322 * if necessary. 323 */ 324 tcp_set_state(sk, TCP_CLOSE); 325 ip_rt_put(rt); 326 sk->sk_route_caps = 0; 327 inet->inet_dport = 0; 328 return err; 329 } 330 EXPORT_SYMBOL(tcp_v4_connect); 331 332 /* 333 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191. 334 * It can be called through tcp_release_cb() if socket was owned by user 335 * at the time tcp_v4_err() was called to handle ICMP message. 336 */ 337 void tcp_v4_mtu_reduced(struct sock *sk) 338 { 339 struct inet_sock *inet = inet_sk(sk); 340 struct dst_entry *dst; 341 u32 mtu; 342 343 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) 344 return; 345 mtu = tcp_sk(sk)->mtu_info; 346 dst = inet_csk_update_pmtu(sk, mtu); 347 if (!dst) 348 return; 349 350 /* Something is about to be wrong... Remember soft error 351 * for the case, if this connection will not able to recover. 352 */ 353 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst)) 354 sk->sk_err_soft = EMSGSIZE; 355 356 mtu = dst_mtu(dst); 357 358 if (inet->pmtudisc != IP_PMTUDISC_DONT && 359 ip_sk_accept_pmtu(sk) && 360 inet_csk(sk)->icsk_pmtu_cookie > mtu) { 361 tcp_sync_mss(sk, mtu); 362 363 /* Resend the TCP packet because it's 364 * clear that the old packet has been 365 * dropped. This is the new "fast" path mtu 366 * discovery. 
367 */ 368 tcp_simple_retransmit(sk); 369 } /* else let the usual retransmit timer handle it */ 370 } 371 EXPORT_SYMBOL(tcp_v4_mtu_reduced); 372 373 static void do_redirect(struct sk_buff *skb, struct sock *sk) 374 { 375 struct dst_entry *dst = __sk_dst_check(sk, 0); 376 377 if (dst) 378 dst->ops->redirect(dst, sk, skb); 379 } 380 381 382 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */ 383 void tcp_req_err(struct sock *sk, u32 seq, bool abort) 384 { 385 struct request_sock *req = inet_reqsk(sk); 386 struct net *net = sock_net(sk); 387 388 /* ICMPs are not backlogged, hence we cannot get 389 * an established socket here. 390 */ 391 if (seq != tcp_rsk(req)->snt_isn) { 392 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); 393 } else if (abort) { 394 /* 395 * Still in SYN_RECV, just remove it silently. 396 * There is no good way to pass the error to the newly 397 * created socket, and POSIX does not want network 398 * errors returned from accept(). 399 */ 400 inet_csk_reqsk_queue_drop(req->rsk_listener, req); 401 tcp_listendrop(req->rsk_listener); 402 } 403 reqsk_put(req); 404 } 405 EXPORT_SYMBOL(tcp_req_err); 406 407 /* TCP-LD (RFC 6069) logic */ 408 void tcp_ld_RTO_revert(struct sock *sk, u32 seq) 409 { 410 struct inet_connection_sock *icsk = inet_csk(sk); 411 struct tcp_sock *tp = tcp_sk(sk); 412 struct sk_buff *skb; 413 s32 remaining; 414 u32 delta_us; 415 416 if (sock_owned_by_user(sk)) 417 return; 418 419 if (seq != tp->snd_una || !icsk->icsk_retransmits || 420 !icsk->icsk_backoff) 421 return; 422 423 skb = tcp_rtx_queue_head(sk); 424 if (WARN_ON_ONCE(!skb)) 425 return; 426 427 icsk->icsk_backoff--; 428 icsk->icsk_rto = tp->srtt_us ? 
__tcp_set_rto(tp) : TCP_TIMEOUT_INIT; 429 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); 430 431 tcp_mstamp_refresh(tp); 432 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb)); 433 remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us); 434 435 if (remaining > 0) { 436 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, 437 remaining, TCP_RTO_MAX); 438 } else { 439 /* RTO revert clocked out retransmission. 440 * Will retransmit now. 441 */ 442 tcp_retransmit_timer(sk); 443 } 444 } 445 EXPORT_SYMBOL(tcp_ld_RTO_revert); 446 447 /* 448 * This routine is called by the ICMP module when it gets some 449 * sort of error condition. If err < 0 then the socket should 450 * be closed and the error returned to the user. If err > 0 451 * it's just the icmp type << 8 | icmp code. After adjustment 452 * header points to the first 8 bytes of the tcp header. We need 453 * to find the appropriate port. 454 * 455 * The locking strategy used here is very "optimistic". When 456 * someone else accesses the socket the ICMP is just dropped 457 * and for some paths there is no check at all. 458 * A more general error queue to queue errors for later handling 459 * is probably better. 
460 * 461 */ 462 463 int tcp_v4_err(struct sk_buff *skb, u32 info) 464 { 465 const struct iphdr *iph = (const struct iphdr *)skb->data; 466 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2)); 467 struct tcp_sock *tp; 468 struct inet_sock *inet; 469 const int type = icmp_hdr(skb)->type; 470 const int code = icmp_hdr(skb)->code; 471 struct sock *sk; 472 struct request_sock *fastopen; 473 u32 seq, snd_una; 474 int err; 475 struct net *net = dev_net(skb->dev); 476 477 sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr, 478 th->dest, iph->saddr, ntohs(th->source), 479 inet_iif(skb), 0); 480 if (!sk) { 481 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); 482 return -ENOENT; 483 } 484 if (sk->sk_state == TCP_TIME_WAIT) { 485 inet_twsk_put(inet_twsk(sk)); 486 return 0; 487 } 488 seq = ntohl(th->seq); 489 if (sk->sk_state == TCP_NEW_SYN_RECV) { 490 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB || 491 type == ICMP_TIME_EXCEEDED || 492 (type == ICMP_DEST_UNREACH && 493 (code == ICMP_NET_UNREACH || 494 code == ICMP_HOST_UNREACH))); 495 return 0; 496 } 497 498 bh_lock_sock(sk); 499 /* If too many ICMPs get dropped on busy 500 * servers this needs to be solved differently. 501 * We do take care of PMTU discovery (RFC1191) special case : 502 * we can receive locally generated ICMP messages while socket is held. 503 */ 504 if (sock_owned_by_user(sk)) { 505 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)) 506 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS); 507 } 508 if (sk->sk_state == TCP_CLOSE) 509 goto out; 510 511 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) { 512 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 513 goto out; 514 } 515 516 tp = tcp_sk(sk); 517 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ 518 fastopen = rcu_dereference(tp->fastopen_rsk); 519 snd_una = fastopen ? 
tcp_rsk(fastopen)->snt_isn : tp->snd_una; 520 if (sk->sk_state != TCP_LISTEN && 521 !between(seq, snd_una, tp->snd_nxt)) { 522 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); 523 goto out; 524 } 525 526 switch (type) { 527 case ICMP_REDIRECT: 528 if (!sock_owned_by_user(sk)) 529 do_redirect(skb, sk); 530 goto out; 531 case ICMP_SOURCE_QUENCH: 532 /* Just silently ignore these. */ 533 goto out; 534 case ICMP_PARAMETERPROB: 535 err = EPROTO; 536 break; 537 case ICMP_DEST_UNREACH: 538 if (code > NR_ICMP_UNREACH) 539 goto out; 540 541 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ 542 /* We are not interested in TCP_LISTEN and open_requests 543 * (SYN-ACKs send out by Linux are always <576bytes so 544 * they should go through unfragmented). 545 */ 546 if (sk->sk_state == TCP_LISTEN) 547 goto out; 548 549 tp->mtu_info = info; 550 if (!sock_owned_by_user(sk)) { 551 tcp_v4_mtu_reduced(sk); 552 } else { 553 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags)) 554 sock_hold(sk); 555 } 556 goto out; 557 } 558 559 err = icmp_err_convert[code].errno; 560 /* check if this ICMP message allows revert of backoff. 561 * (see RFC 6069) 562 */ 563 if (!fastopen && 564 (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH)) 565 tcp_ld_RTO_revert(sk, seq); 566 break; 567 case ICMP_TIME_EXCEEDED: 568 err = EHOSTUNREACH; 569 break; 570 default: 571 goto out; 572 } 573 574 switch (sk->sk_state) { 575 case TCP_SYN_SENT: 576 case TCP_SYN_RECV: 577 /* Only in fast or simultaneous open. If a fast open socket is 578 * already accepted it is treated as a connected one below. 
579 */ 580 if (fastopen && !fastopen->sk) 581 break; 582 583 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th); 584 585 if (!sock_owned_by_user(sk)) { 586 sk->sk_err = err; 587 588 sk->sk_error_report(sk); 589 590 tcp_done(sk); 591 } else { 592 sk->sk_err_soft = err; 593 } 594 goto out; 595 } 596 597 /* If we've already connected we will keep trying 598 * until we time out, or the user gives up. 599 * 600 * rfc1122 4.2.3.9 allows to consider as hard errors 601 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too, 602 * but it is obsoleted by pmtu discovery). 603 * 604 * Note, that in modern internet, where routing is unreliable 605 * and in each dark corner broken firewalls sit, sending random 606 * errors ordered by their masters even this two messages finally lose 607 * their original sense (even Linux sends invalid PORT_UNREACHs) 608 * 609 * Now we are in compliance with RFCs. 610 * --ANK (980905) 611 */ 612 613 inet = inet_sk(sk); 614 if (!sock_owned_by_user(sk) && inet->recverr) { 615 sk->sk_err = err; 616 sk->sk_error_report(sk); 617 } else { /* Only an error on timeout */ 618 sk->sk_err_soft = err; 619 } 620 621 out: 622 bh_unlock_sock(sk); 623 sock_put(sk); 624 return 0; 625 } 626 627 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr) 628 { 629 struct tcphdr *th = tcp_hdr(skb); 630 631 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0); 632 skb->csum_start = skb_transport_header(skb) - skb->head; 633 skb->csum_offset = offsetof(struct tcphdr, check); 634 } 635 636 /* This routine computes an IPv4 TCP checksum. */ 637 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb) 638 { 639 const struct inet_sock *inet = inet_sk(sk); 640 641 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr); 642 } 643 EXPORT_SYMBOL(tcp_v4_send_check); 644 645 /* 646 * This routine will send an RST to the other tcp. 647 * 648 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.) 649 * for reset. 
650 * Answer: if a packet caused RST, it is not for a socket 651 * existing in our system, if it is matched to a socket, 652 * it is just duplicate segment or bug in other side's TCP. 653 * So that we build reply only basing on parameters 654 * arrived with segment. 655 * Exception: precedence violation. We do not implement it in any case. 656 */ 657 658 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) 659 { 660 const struct tcphdr *th = tcp_hdr(skb); 661 struct { 662 struct tcphdr th; 663 #ifdef CONFIG_TCP_MD5SIG 664 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)]; 665 #endif 666 } rep; 667 struct ip_reply_arg arg; 668 #ifdef CONFIG_TCP_MD5SIG 669 struct tcp_md5sig_key *key = NULL; 670 const __u8 *hash_location = NULL; 671 unsigned char newhash[16]; 672 int genhash; 673 struct sock *sk1 = NULL; 674 #endif 675 u64 transmit_time = 0; 676 struct sock *ctl_sk; 677 struct net *net; 678 679 /* Never send a reset in response to a reset. */ 680 if (th->rst) 681 return; 682 683 /* If sk not NULL, it means we did a successful lookup and incoming 684 * route had to be correct. prequeue might have dropped our dst. 685 */ 686 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL) 687 return; 688 689 /* Swap the send and the receive. */ 690 memset(&rep, 0, sizeof(rep)); 691 rep.th.dest = th->source; 692 rep.th.source = th->dest; 693 rep.th.doff = sizeof(struct tcphdr) / 4; 694 rep.th.rst = 1; 695 696 if (th->ack) { 697 rep.th.seq = th->ack_seq; 698 } else { 699 rep.th.ack = 1; 700 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin + 701 skb->len - (th->doff << 2)); 702 } 703 704 memset(&arg, 0, sizeof(arg)); 705 arg.iov[0].iov_base = (unsigned char *)&rep; 706 arg.iov[0].iov_len = sizeof(rep.th); 707 708 net = sk ? 
sock_net(sk) : dev_net(skb_dst(skb)->dev); 709 #ifdef CONFIG_TCP_MD5SIG 710 rcu_read_lock(); 711 hash_location = tcp_parse_md5sig_option(th); 712 if (sk && sk_fullsock(sk)) { 713 const union tcp_md5_addr *addr; 714 int l3index; 715 716 /* sdif set, means packet ingressed via a device 717 * in an L3 domain and inet_iif is set to it. 718 */ 719 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; 720 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 721 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 722 } else if (hash_location) { 723 const union tcp_md5_addr *addr; 724 int sdif = tcp_v4_sdif(skb); 725 int dif = inet_iif(skb); 726 int l3index; 727 728 /* 729 * active side is lost. Try to find listening socket through 730 * source port, and then find md5 key through listening socket. 731 * we are not loose security here: 732 * Incoming packet is checked with md5 hash with finding key, 733 * no RST generated if md5 hash doesn't match. 734 */ 735 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0, 736 ip_hdr(skb)->saddr, 737 th->source, ip_hdr(skb)->daddr, 738 ntohs(th->source), dif, sdif); 739 /* don't send rst if it can't find key */ 740 if (!sk1) 741 goto out; 742 743 /* sdif set, means packet ingressed via a device 744 * in an L3 domain and dif is set to it. 745 */ 746 l3index = sdif ? 
dif : 0; 747 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 748 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET); 749 if (!key) 750 goto out; 751 752 753 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb); 754 if (genhash || memcmp(hash_location, newhash, 16) != 0) 755 goto out; 756 757 } 758 759 if (key) { 760 rep.opt[0] = htonl((TCPOPT_NOP << 24) | 761 (TCPOPT_NOP << 16) | 762 (TCPOPT_MD5SIG << 8) | 763 TCPOLEN_MD5SIG); 764 /* Update length and the length the header thinks exists */ 765 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; 766 rep.th.doff = arg.iov[0].iov_len / 4; 767 768 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1], 769 key, ip_hdr(skb)->saddr, 770 ip_hdr(skb)->daddr, &rep.th); 771 } 772 #endif 773 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 774 ip_hdr(skb)->saddr, /* XXX */ 775 arg.iov[0].iov_len, IPPROTO_TCP, 0); 776 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 777 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0; 778 779 /* When socket is gone, all binding information is lost. 780 * routing might fail in this case. No choice here, if we choose to force 781 * input interface, we will misroute in case of asymmetric route. 782 */ 783 if (sk) { 784 arg.bound_dev_if = sk->sk_bound_dev_if; 785 if (sk_fullsock(sk)) 786 trace_tcp_send_reset(sk, skb); 787 } 788 789 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) != 790 offsetof(struct inet_timewait_sock, tw_bound_dev_if)); 791 792 arg.tos = ip_hdr(skb)->tos; 793 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); 794 local_bh_disable(); 795 ctl_sk = this_cpu_read(*net->ipv4.tcp_sk); 796 if (sk) { 797 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 798 inet_twsk(sk)->tw_mark : sk->sk_mark; 799 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? 
800 inet_twsk(sk)->tw_priority : sk->sk_priority; 801 transmit_time = tcp_transmit_time(sk); 802 } 803 ip_send_unicast_reply(ctl_sk, 804 skb, &TCP_SKB_CB(skb)->header.h4.opt, 805 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 806 &arg, arg.iov[0].iov_len, 807 transmit_time); 808 809 ctl_sk->sk_mark = 0; 810 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); 811 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS); 812 local_bh_enable(); 813 814 #ifdef CONFIG_TCP_MD5SIG 815 out: 816 rcu_read_unlock(); 817 #endif 818 } 819 820 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states 821 outside socket context is ugly, certainly. What can I do? 822 */ 823 824 static void tcp_v4_send_ack(const struct sock *sk, 825 struct sk_buff *skb, u32 seq, u32 ack, 826 u32 win, u32 tsval, u32 tsecr, int oif, 827 struct tcp_md5sig_key *key, 828 int reply_flags, u8 tos) 829 { 830 const struct tcphdr *th = tcp_hdr(skb); 831 struct { 832 struct tcphdr th; 833 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2) 834 #ifdef CONFIG_TCP_MD5SIG 835 + (TCPOLEN_MD5SIG_ALIGNED >> 2) 836 #endif 837 ]; 838 } rep; 839 struct net *net = sock_net(sk); 840 struct ip_reply_arg arg; 841 struct sock *ctl_sk; 842 u64 transmit_time; 843 844 memset(&rep.th, 0, sizeof(struct tcphdr)); 845 memset(&arg, 0, sizeof(arg)); 846 847 arg.iov[0].iov_base = (unsigned char *)&rep; 848 arg.iov[0].iov_len = sizeof(rep.th); 849 if (tsecr) { 850 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | 851 (TCPOPT_TIMESTAMP << 8) | 852 TCPOLEN_TIMESTAMP); 853 rep.opt[1] = htonl(tsval); 854 rep.opt[2] = htonl(tsecr); 855 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED; 856 } 857 858 /* Swap the send and the receive. */ 859 rep.th.dest = th->source; 860 rep.th.source = th->dest; 861 rep.th.doff = arg.iov[0].iov_len / 4; 862 rep.th.seq = htonl(seq); 863 rep.th.ack_seq = htonl(ack); 864 rep.th.ack = 1; 865 rep.th.window = htons(win); 866 867 #ifdef CONFIG_TCP_MD5SIG 868 if (key) { 869 int offset = (tsecr) ? 
3 : 0; 870 871 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) | 872 (TCPOPT_NOP << 16) | 873 (TCPOPT_MD5SIG << 8) | 874 TCPOLEN_MD5SIG); 875 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; 876 rep.th.doff = arg.iov[0].iov_len/4; 877 878 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset], 879 key, ip_hdr(skb)->saddr, 880 ip_hdr(skb)->daddr, &rep.th); 881 } 882 #endif 883 arg.flags = reply_flags; 884 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 885 ip_hdr(skb)->saddr, /* XXX */ 886 arg.iov[0].iov_len, IPPROTO_TCP, 0); 887 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 888 if (oif) 889 arg.bound_dev_if = oif; 890 arg.tos = tos; 891 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL); 892 local_bh_disable(); 893 ctl_sk = this_cpu_read(*net->ipv4.tcp_sk); 894 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 895 inet_twsk(sk)->tw_mark : sk->sk_mark; 896 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? 897 inet_twsk(sk)->tw_priority : sk->sk_priority; 898 transmit_time = tcp_transmit_time(sk); 899 ip_send_unicast_reply(ctl_sk, 900 skb, &TCP_SKB_CB(skb)->header.h4.opt, 901 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 902 &arg, arg.iov[0].iov_len, 903 transmit_time); 904 905 ctl_sk->sk_mark = 0; 906 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); 907 local_bh_enable(); 908 } 909 910 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) 911 { 912 struct inet_timewait_sock *tw = inet_twsk(sk); 913 struct tcp_timewait_sock *tcptw = tcp_twsk(sk); 914 915 tcp_v4_send_ack(sk, skb, 916 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, 917 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, 918 tcp_time_stamp_raw() + tcptw->tw_ts_offset, 919 tcptw->tw_ts_recent, 920 tw->tw_bound_dev_if, 921 tcp_twsk_md5_key(tcptw), 922 tw->tw_transparent ? 
IP_REPLY_ARG_NOSRCCHECK : 0, 923 tw->tw_tos 924 ); 925 926 inet_twsk_put(tw); 927 } 928 929 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, 930 struct request_sock *req) 931 { 932 const union tcp_md5_addr *addr; 933 int l3index; 934 935 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV 936 * sk->sk_state == TCP_SYN_RECV -> for Fast Open. 937 */ 938 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 : 939 tcp_sk(sk)->snd_nxt; 940 941 /* RFC 7323 2.3 942 * The window field (SEG.WND) of every outgoing segment, with the 943 * exception of <SYN> segments, MUST be right-shifted by 944 * Rcv.Wind.Shift bits: 945 */ 946 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 947 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; 948 tcp_v4_send_ack(sk, skb, seq, 949 tcp_rsk(req)->rcv_nxt, 950 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, 951 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off, 952 req->ts_recent, 953 0, 954 tcp_md5_do_lookup(sk, l3index, addr, AF_INET), 955 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0, 956 ip_hdr(skb)->tos); 957 } 958 959 /* 960 * Send a SYN-ACK after having received a SYN. 961 * This still operates on a request_sock only, not on a big 962 * socket. 963 */ 964 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, 965 struct flowi *fl, 966 struct request_sock *req, 967 struct tcp_fastopen_cookie *foc, 968 enum tcp_synack_type synack_type, 969 struct sk_buff *syn_skb) 970 { 971 const struct inet_request_sock *ireq = inet_rsk(req); 972 struct flowi4 fl4; 973 int err = -1; 974 struct sk_buff *skb; 975 976 /* First, grab a route. 
*/ 977 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) 978 return -1; 979 980 skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb); 981 982 if (skb) { 983 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); 984 985 rcu_read_lock(); 986 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, 987 ireq->ir_rmt_addr, 988 rcu_dereference(ireq->ireq_opt)); 989 rcu_read_unlock(); 990 err = net_xmit_eval(err); 991 } 992 993 return err; 994 } 995 996 /* 997 * IPv4 request_sock destructor. 998 */ 999 static void tcp_v4_reqsk_destructor(struct request_sock *req) 1000 { 1001 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1)); 1002 } 1003 1004 #ifdef CONFIG_TCP_MD5SIG 1005 /* 1006 * RFC2385 MD5 checksumming requires a mapping of 1007 * IP address->MD5 Key. 1008 * We need to maintain these in the sk structure. 1009 */ 1010 1011 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed); 1012 EXPORT_SYMBOL(tcp_md5_needed); 1013 1014 /* Find the Key structure for an address. 
*/ 1015 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index, 1016 const union tcp_md5_addr *addr, 1017 int family) 1018 { 1019 const struct tcp_sock *tp = tcp_sk(sk); 1020 struct tcp_md5sig_key *key; 1021 const struct tcp_md5sig_info *md5sig; 1022 __be32 mask; 1023 struct tcp_md5sig_key *best_match = NULL; 1024 bool match; 1025 1026 /* caller either holds rcu_read_lock() or socket lock */ 1027 md5sig = rcu_dereference_check(tp->md5sig_info, 1028 lockdep_sock_is_held(sk)); 1029 if (!md5sig) 1030 return NULL; 1031 1032 hlist_for_each_entry_rcu(key, &md5sig->head, node, 1033 lockdep_sock_is_held(sk)) { 1034 if (key->family != family) 1035 continue; 1036 if (key->l3index && key->l3index != l3index) 1037 continue; 1038 if (family == AF_INET) { 1039 mask = inet_make_mask(key->prefixlen); 1040 match = (key->addr.a4.s_addr & mask) == 1041 (addr->a4.s_addr & mask); 1042 #if IS_ENABLED(CONFIG_IPV6) 1043 } else if (family == AF_INET6) { 1044 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6, 1045 key->prefixlen); 1046 #endif 1047 } else { 1048 match = false; 1049 } 1050 1051 if (match && (!best_match || 1052 key->prefixlen > best_match->prefixlen)) 1053 best_match = key; 1054 } 1055 return best_match; 1056 } 1057 EXPORT_SYMBOL(__tcp_md5_do_lookup); 1058 1059 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk, 1060 const union tcp_md5_addr *addr, 1061 int family, u8 prefixlen, 1062 int l3index) 1063 { 1064 const struct tcp_sock *tp = tcp_sk(sk); 1065 struct tcp_md5sig_key *key; 1066 unsigned int size = sizeof(struct in_addr); 1067 const struct tcp_md5sig_info *md5sig; 1068 1069 /* caller either holds rcu_read_lock() or socket lock */ 1070 md5sig = rcu_dereference_check(tp->md5sig_info, 1071 lockdep_sock_is_held(sk)); 1072 if (!md5sig) 1073 return NULL; 1074 #if IS_ENABLED(CONFIG_IPV6) 1075 if (family == AF_INET6) 1076 size = sizeof(struct in6_addr); 1077 #endif 1078 hlist_for_each_entry_rcu(key, &md5sig->head, node, 1079 
lockdep_sock_is_held(sk)) { 1080 if (key->family != family) 1081 continue; 1082 if (key->l3index && key->l3index != l3index) 1083 continue; 1084 if (!memcmp(&key->addr, addr, size) && 1085 key->prefixlen == prefixlen) 1086 return key; 1087 } 1088 return NULL; 1089 } 1090 1091 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, 1092 const struct sock *addr_sk) 1093 { 1094 const union tcp_md5_addr *addr; 1095 int l3index; 1096 1097 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), 1098 addr_sk->sk_bound_dev_if); 1099 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr; 1100 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1101 } 1102 EXPORT_SYMBOL(tcp_v4_md5_lookup); 1103 1104 /* This can be called on a newly created socket, from other files */ 1105 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, 1106 int family, u8 prefixlen, int l3index, 1107 const u8 *newkey, u8 newkeylen, gfp_t gfp) 1108 { 1109 /* Add Key to the list */ 1110 struct tcp_md5sig_key *key; 1111 struct tcp_sock *tp = tcp_sk(sk); 1112 struct tcp_md5sig_info *md5sig; 1113 1114 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index); 1115 if (key) { 1116 /* Pre-existing entry - just update that one. 1117 * Note that the key might be used concurrently. 1118 * data_race() is telling kcsan that we do not care of 1119 * key mismatches, since changing MD5 key on live flows 1120 * can lead to packet drops. 1121 */ 1122 data_race(memcpy(key->key, newkey, newkeylen)); 1123 1124 /* Pairs with READ_ONCE() in tcp_md5_hash_key(). 1125 * Also note that a reader could catch new key->keylen value 1126 * but old key->key[], this is the reason we use __GFP_ZERO 1127 * at sock_kmalloc() time below these lines. 
1128 */ 1129 WRITE_ONCE(key->keylen, newkeylen); 1130 1131 return 0; 1132 } 1133 1134 md5sig = rcu_dereference_protected(tp->md5sig_info, 1135 lockdep_sock_is_held(sk)); 1136 if (!md5sig) { 1137 md5sig = kmalloc(sizeof(*md5sig), gfp); 1138 if (!md5sig) 1139 return -ENOMEM; 1140 1141 sk_nocaps_add(sk, NETIF_F_GSO_MASK); 1142 INIT_HLIST_HEAD(&md5sig->head); 1143 rcu_assign_pointer(tp->md5sig_info, md5sig); 1144 } 1145 1146 key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO); 1147 if (!key) 1148 return -ENOMEM; 1149 if (!tcp_alloc_md5sig_pool()) { 1150 sock_kfree_s(sk, key, sizeof(*key)); 1151 return -ENOMEM; 1152 } 1153 1154 memcpy(key->key, newkey, newkeylen); 1155 key->keylen = newkeylen; 1156 key->family = family; 1157 key->prefixlen = prefixlen; 1158 key->l3index = l3index; 1159 memcpy(&key->addr, addr, 1160 (family == AF_INET6) ? sizeof(struct in6_addr) : 1161 sizeof(struct in_addr)); 1162 hlist_add_head_rcu(&key->node, &md5sig->head); 1163 return 0; 1164 } 1165 EXPORT_SYMBOL(tcp_md5_do_add); 1166 1167 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family, 1168 u8 prefixlen, int l3index) 1169 { 1170 struct tcp_md5sig_key *key; 1171 1172 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index); 1173 if (!key) 1174 return -ENOENT; 1175 hlist_del_rcu(&key->node); 1176 atomic_sub(sizeof(*key), &sk->sk_omem_alloc); 1177 kfree_rcu(key, rcu); 1178 return 0; 1179 } 1180 EXPORT_SYMBOL(tcp_md5_do_del); 1181 1182 static void tcp_clear_md5_list(struct sock *sk) 1183 { 1184 struct tcp_sock *tp = tcp_sk(sk); 1185 struct tcp_md5sig_key *key; 1186 struct hlist_node *n; 1187 struct tcp_md5sig_info *md5sig; 1188 1189 md5sig = rcu_dereference_protected(tp->md5sig_info, 1); 1190 1191 hlist_for_each_entry_safe(key, n, &md5sig->head, node) { 1192 hlist_del_rcu(&key->node); 1193 atomic_sub(sizeof(*key), &sk->sk_omem_alloc); 1194 kfree_rcu(key, rcu); 1195 } 1196 } 1197 1198 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, 1199 
sockptr_t optval, int optlen)
{
	/* Handler for the TCP_MD5SIG / TCP_MD5SIG_EXT socket options
	 * (installed as .md5_parse in tcp_sock_ipv4_specific).
	 *
	 * Copies a struct tcp_md5sig from the caller, validates it and then
	 * either deletes (tcpm_keylen == 0) or adds/updates the key for the
	 * given IPv4 address.
	 *
	 * Returns 0 or the result of tcp_md5_do_add()/tcp_md5_do_del();
	 * -EINVAL for malformed requests, -EFAULT if the copy fails.
	 */
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
	const union tcp_md5_addr *addr;
	u8 prefixlen = 32;	/* default: host key (full /32 match) */
	int l3index = 0;	/* default: key not tied to an L3 domain */

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	/* Prefix length and ifindex are only honoured for the _EXT option. */
	if (optname == TCP_MD5SIG_EXT &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
		prefixlen = cmd.tcpm_prefixlen;
		if (prefixlen > 32)
			return -EINVAL;
	}

	if (optname == TCP_MD5SIG_EXT &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
		if (dev && netif_is_l3_master(dev))
			l3index = dev->ifindex;

		rcu_read_unlock();

		/* ok to reference set/not set outside of rcu;
		 * right now device MUST be an L3 master
		 */
		if (!dev || !l3index)
			return -EINVAL;
	}

	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;

	/* A zero key length is the request to delete the key. */
	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index,
			      cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
}

/* Feed the IPv4 pseudo-header followed by the fixed TCP header (with its
 * checksum field zeroed) into the hash request of @hp.
 *
 * The data is staged in hp->scratch, so @th itself is never modified.
 * @nbytes is stored as the pseudo-header length field.
 * Returns the result of crypto_ahash_update().
 */
static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
				   __be32 daddr, __be32 saddr,
				   const struct tcphdr *th, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;
	struct tcphdr *_th;

	bp = hp->scratch;
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	/* The TCP header copy lives directly behind the pseudo-header. */
	_th = (struct tcphdr *)(bp + 1);
	memcpy(_th, th, sizeof(*th));
	_th->check = 0;

	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
				sizeof(*bp) + sizeof(*th));
	return crypto_ahash_update(hp->md5_req);
}

/* Compute the MD5 signature over pseudo-header + TCP header + key only
 * (no segment payload is hashed here).
 *
 * @md5_hash receives the 16 byte digest. Returns 0 on success; on any
 * failure returns 1 and @md5_hash is zero-filled.
 */
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;
	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	/* every successful tcp_get_md5sig_pool() is paired with a put */
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}

/* Compute the MD5 signature of a whole segment: pseudo-header, TCP header,
 * payload (everything past the TCP header) and finally the key.
 *
 * When @sk is non-NULL the addresses are taken from the socket, otherwise
 * from the IP header of @skb.
 *
 * @md5_hash receives the 16 byte digest. Returns 0 on success; on any
 * failure returns 1 and @md5_hash is zero-filled.
 */
int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
			const struct sock *sk,
			const struct sk_buff *skb)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;

	if (sk) { /* valid for establish/request sockets */
		saddr = sk->sk_rcv_saddr;
		daddr = sk->sk_daddr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;

	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
		goto clear_hash;
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	/* every successful tcp_get_md5sig_pool() is paired with a put */
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

#endif

/* Validate the MD5 signature option of an incoming segment against the
 * keys configured on @sk.
 *
 * Returns true when the segment must be dropped, false when it may be
 * processed. Without CONFIG_TCP_MD5SIG this always returns false.
 *
 * Called with rcu_read_lock()
 */
static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
				    const struct sk_buff *skb,
				    int dif, int sdif)
{
#ifdef CONFIG_TCP_MD5SIG
	/*
	 * This gets called for each TCP segment that arrives
	 * so we want to be efficient.
	 * We have 3 drop cases:
	 * o No MD5 hash and one expected.
	 * o MD5 hash and we're not expecting one.
	 * o MD5 hash and it's wrong.
	 */
	const __u8 *hash_location = NULL;
	struct tcp_md5sig_key *hash_expected;
	const struct iphdr *iph = ip_hdr(skb);
	const struct tcphdr *th = tcp_hdr(skb);
	const union tcp_md5_addr *addr;
	unsigned char newhash[16];
	int genhash, l3index;

	/* sdif set, means packet ingressed via a device
	 * in an L3 domain and dif is set to the l3mdev
	 */
	l3index = sdif ? dif : 0;

	/* The key is selected by the sender's (source) address. */
	addr = (union tcp_md5_addr *)&iph->saddr;
	hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	hash_location = tcp_parse_md5sig_option(th);

	/* We've parsed the options - do we have a hash? */
	if (!hash_expected && !hash_location)
		return false;

	if (hash_expected && !hash_location) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
		return true;
	}

	if (!hash_expected && hash_location) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
		return true;
	}

	/* Okay, so this is hash_expected and hash_location -
	 * so we need to calculate the checksum.
	 */
	genhash = tcp_v4_md5_hash_skb(newhash,
				      hash_expected,
				      NULL, skb);

	/* Drop if the hash could not be computed or does not match. */
	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
				     &iph->saddr, ntohs(th->source),
				     &iph->daddr, ntohs(th->dest),
				     genhash ? " tcp_v4_calc_md5_hash failed"
				     : "", l3index);
		return true;
	}
	return false;
#endif
	return false;
}

/* Initialise the IPv4 specific part of a new request sock from the packet
 * in @skb: the request's local address is the packet's destination, its
 * remote address is the packet's source, and the IP options returned by
 * tcp_v4_save_options() are attached to the request.
 */
static void tcp_v4_init_req(struct request_sock *req,
			    const struct sock *sk_listener,
			    struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	struct net *net = sock_net(sk_listener);

	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
}

/* .route_req callback: route the reply for @req using the IPv4 member of
 * the generic flow key @fl.
 */
static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
					  struct flowi *fl,
					  const struct request_sock *req)
{
	return inet_csk_route_req(sk, &fl->u.ip4, req);
}

/* Generic request_sock operations for TCP over IPv4. */
struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};

/* TCP specific request_sock operations for IPv4; the MD5 and syncookie
 * hooks are only present when the matching config option is enabled.
 */
const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.req_md5_lookup	=	tcp_v4_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#endif
	.init_req	=	tcp_v4_init_req,
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
#endif
	.route_req	=	tcp_v4_route_req,
	.init_seq	=	tcp_v4_init_seq,
	.init_ts_off	=	tcp_v4_init_ts_off,
.send_synack	=	tcp_v4_send_synack,
};

/* Entry point for an incoming connection request on an IPv4 listener.
 * Requests that arrived via a broadcast or multicast route are counted as
 * listen drops; everything else is handed to the generic
 * tcp_conn_request(). The drop path returns 0.
 */
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&tcp_request_sock_ops,
				&tcp_request_sock_ipv4_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);


/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 *
 * Builds the child socket for @req on listener @sk. If @dst is NULL the
 * route is looked up here. On success the new socket is returned and
 * *own_req reports the result of inet_ehash_nolisten(). On failure NULL is
 * returned and a listen drop is counted.
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	const union tcp_md5_addr *addr;
	struct tcp_md5sig_key *key;
	int l3index;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	/* Copy the connection identity from the request to the child. */
	sk_daddr_set(newsk, ireq->ir_rmt_addr);
	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
	newsk->sk_bound_dev_if = ireq->ir_iif;
	newinet->inet_saddr   = ireq->ir_loc_addr;
	/* The child shares the request's IP options pointer for now; which
	 * of the two keeps it is decided after the ehash insertion below.
	 */
	inet_opt	      = rcu_dereference(ireq->ireq_opt);
	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	newinet->rcv_tos      = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	newinet->inet_id = prandom_u32();

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
	/* Copy over the MD5 key from the original socket */
	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	if (key) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		/* The child always gets a /32 (host) key. */
		tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index,
			       key->key, key->keylen, GFP_ATOMIC);
		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
	if (likely(*own_req)) {
		tcp_move_syn(newtp, req);
		/* The child now owns the options; detach them from the
		 * request so its destructor does not free them.
		 */
		ireq->ireq_opt = NULL;
	} else {
		/* The request keeps the options; the child must not
		 * reference them any more.
		 */
		newinet->inet_opt = NULL;
	}
	return newsk;

exit_overflow:
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	tcp_listendrop(sk);
	return NULL;
put_and_exit:
	/* Options still belong to the request: drop the child's pointer
	 * before tearing the child down.
	 */
	newinet->inet_opt = NULL;
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

/* With CONFIG_SYN_COOKIES, a non-SYN segment is passed to
 * cookie_v4_check() and its result replaces @sk; otherwise @sk is returned
 * unchanged.
 */
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	const struct tcphdr *th = tcp_hdr(skb);

	if (!th->syn)
		sk = cookie_v4_check(sk, skb);
#endif
return sk;
}

/* Compute a syncookie for the SYN described by @iph/@th.
 *
 * Returns the MSS to use, or 0 when no cookie was generated (always 0
 * without CONFIG_SYN_COOKIES). *cookie is only written when the returned
 * MSS is non-zero.
 */
u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
			 struct tcphdr *th, u32 *cookie)
{
	u16 mss = 0;
#ifdef CONFIG_SYN_COOKIES
	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
				    &tcp_request_sock_ipv4_ops, sk, th);
	if (mss) {
		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
		tcp_synq_overflow(sk);
	}
#endif
	return mss;
}

/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 *
 * Every path visible here returns 0; the skb is either handed on to the
 * state machine or freed at the discard label.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct sock *rsk;

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst = sk->sk_rx_dst;

		sock_rps_save_rxhash(sk, skb);
		sk_mark_napi_id(sk, skb);
		if (dst) {
			/* Drop the cached input route if the packet came in
			 * on another interface or the route is no longer
			 * valid.
			 */
			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
			    !dst->ops->check(dst, 0)) {
				dst_release(dst);
				sk->sk_rx_dst = NULL;
			}
		}
		tcp_rcv_established(sk, skb);
		return 0;
	}

	if (tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_cookie_check(sk, skb);

		if (!nsk)
			goto discard;
		if (nsk != sk) {
			/* A different socket came back from the cookie
			 * check: let it process the segment as a child.
			 */
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	if (tcp_rcv_state_process(sk, skb)) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here.
This current version compiles correctly, 1678 * but you have been warned. 1679 */ 1680 return 0; 1681 1682 csum_err: 1683 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 1684 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 1685 goto discard; 1686 } 1687 EXPORT_SYMBOL(tcp_v4_do_rcv); 1688 1689 int tcp_v4_early_demux(struct sk_buff *skb) 1690 { 1691 const struct iphdr *iph; 1692 const struct tcphdr *th; 1693 struct sock *sk; 1694 1695 if (skb->pkt_type != PACKET_HOST) 1696 return 0; 1697 1698 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) 1699 return 0; 1700 1701 iph = ip_hdr(skb); 1702 th = tcp_hdr(skb); 1703 1704 if (th->doff < sizeof(struct tcphdr) / 4) 1705 return 0; 1706 1707 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo, 1708 iph->saddr, th->source, 1709 iph->daddr, ntohs(th->dest), 1710 skb->skb_iif, inet_sdif(skb)); 1711 if (sk) { 1712 skb->sk = sk; 1713 skb->destructor = sock_edemux; 1714 if (sk_fullsock(sk)) { 1715 struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst); 1716 1717 if (dst) 1718 dst = dst_check(dst, 0); 1719 if (dst && 1720 inet_sk(sk)->rx_dst_ifindex == skb->skb_iif) 1721 skb_dst_set_noref(skb, dst); 1722 } 1723 } 1724 return 0; 1725 } 1726 1727 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) 1728 { 1729 u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf); 1730 struct skb_shared_info *shinfo; 1731 const struct tcphdr *th; 1732 struct tcphdr *thtail; 1733 struct sk_buff *tail; 1734 unsigned int hdrlen; 1735 bool fragstolen; 1736 u32 gso_segs; 1737 int delta; 1738 1739 /* In case all data was pulled from skb frags (in __pskb_pull_tail()), 1740 * we can fix skb->truesize to its real value to avoid future drops. 1741 * This is valid because skb is not yet charged to the socket. 1742 * It has been noticed pure SACK packets were sometimes dropped 1743 * (if cooked by drivers without copybreak feature). 
1744 */ 1745 skb_condense(skb); 1746 1747 skb_dst_drop(skb); 1748 1749 if (unlikely(tcp_checksum_complete(skb))) { 1750 bh_unlock_sock(sk); 1751 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 1752 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 1753 return true; 1754 } 1755 1756 /* Attempt coalescing to last skb in backlog, even if we are 1757 * above the limits. 1758 * This is okay because skb capacity is limited to MAX_SKB_FRAGS. 1759 */ 1760 th = (const struct tcphdr *)skb->data; 1761 hdrlen = th->doff * 4; 1762 shinfo = skb_shinfo(skb); 1763 1764 if (!shinfo->gso_size) 1765 shinfo->gso_size = skb->len - hdrlen; 1766 1767 if (!shinfo->gso_segs) 1768 shinfo->gso_segs = 1; 1769 1770 tail = sk->sk_backlog.tail; 1771 if (!tail) 1772 goto no_coalesce; 1773 thtail = (struct tcphdr *)tail->data; 1774 1775 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq || 1776 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield || 1777 ((TCP_SKB_CB(tail)->tcp_flags | 1778 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) || 1779 !((TCP_SKB_CB(tail)->tcp_flags & 1780 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) || 1781 ((TCP_SKB_CB(tail)->tcp_flags ^ 1782 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) || 1783 #ifdef CONFIG_TLS_DEVICE 1784 tail->decrypted != skb->decrypted || 1785 #endif 1786 thtail->doff != th->doff || 1787 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th))) 1788 goto no_coalesce; 1789 1790 __skb_pull(skb, hdrlen); 1791 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) { 1792 thtail->window = th->window; 1793 1794 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq; 1795 1796 if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq)) 1797 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq; 1798 1799 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and 1800 * thtail->fin, so that the fast path in tcp_rcv_established() 1801 * is not entered if we append a packet with a FIN. 1802 * SYN, RST, URG are not present. 
1803 * ACK is set on both packets. 1804 * PSH : we do not really care in TCP stack, 1805 * at least for 'GRO' packets. 1806 */ 1807 thtail->fin |= th->fin; 1808 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; 1809 1810 if (TCP_SKB_CB(skb)->has_rxtstamp) { 1811 TCP_SKB_CB(tail)->has_rxtstamp = true; 1812 tail->tstamp = skb->tstamp; 1813 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp; 1814 } 1815 1816 /* Not as strict as GRO. We only need to carry mss max value */ 1817 skb_shinfo(tail)->gso_size = max(shinfo->gso_size, 1818 skb_shinfo(tail)->gso_size); 1819 1820 gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs; 1821 skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF); 1822 1823 sk->sk_backlog.len += delta; 1824 __NET_INC_STATS(sock_net(sk), 1825 LINUX_MIB_TCPBACKLOGCOALESCE); 1826 kfree_skb_partial(skb, fragstolen); 1827 return false; 1828 } 1829 __skb_push(skb, hdrlen); 1830 1831 no_coalesce: 1832 /* Only socket owner can try to collapse/prune rx queues 1833 * to reduce memory overhead, so add a little headroom here. 1834 * Few sockets backlog are possibly concurrently non empty. 
1835 */ 1836 limit += 64*1024; 1837 1838 if (unlikely(sk_add_backlog(sk, skb, limit))) { 1839 bh_unlock_sock(sk); 1840 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP); 1841 return true; 1842 } 1843 return false; 1844 } 1845 EXPORT_SYMBOL(tcp_add_backlog); 1846 1847 int tcp_filter(struct sock *sk, struct sk_buff *skb) 1848 { 1849 struct tcphdr *th = (struct tcphdr *)skb->data; 1850 1851 return sk_filter_trim_cap(sk, skb, th->doff * 4); 1852 } 1853 EXPORT_SYMBOL(tcp_filter); 1854 1855 static void tcp_v4_restore_cb(struct sk_buff *skb) 1856 { 1857 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4, 1858 sizeof(struct inet_skb_parm)); 1859 } 1860 1861 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, 1862 const struct tcphdr *th) 1863 { 1864 /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB() 1865 * barrier() makes sure compiler wont play fool^Waliasing games. 1866 */ 1867 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb), 1868 sizeof(struct inet_skb_parm)); 1869 barrier(); 1870 1871 TCP_SKB_CB(skb)->seq = ntohl(th->seq); 1872 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + 1873 skb->len - th->doff * 4); 1874 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); 1875 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); 1876 TCP_SKB_CB(skb)->tcp_tw_isn = 0; 1877 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); 1878 TCP_SKB_CB(skb)->sacked = 0; 1879 TCP_SKB_CB(skb)->has_rxtstamp = 1880 skb->tstamp || skb_hwtstamps(skb)->hwtstamp; 1881 } 1882 1883 /* 1884 * From tcp_input.c 1885 */ 1886 1887 int tcp_v4_rcv(struct sk_buff *skb) 1888 { 1889 struct net *net = dev_net(skb->dev); 1890 struct sk_buff *skb_to_free; 1891 int sdif = inet_sdif(skb); 1892 int dif = inet_iif(skb); 1893 const struct iphdr *iph; 1894 const struct tcphdr *th; 1895 bool refcounted; 1896 struct sock *sk; 1897 int ret; 1898 1899 if (skb->pkt_type != PACKET_HOST) 1900 goto discard_it; 1901 1902 /* Count it even if it's bad */ 1903 
__TCP_INC_STATS(net, TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = (const struct tcphdr *)skb->data;

	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */

	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
		goto csum_error;

	/* Reload the header pointers after the pulls above. */
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
lookup:
	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
			       th->dest, sdif, &refcounted);
	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		/* The lookup returned a request sock: continue on behalf
		 * of its listener.
		 */
		struct request_sock *req = inet_reqsk(sk);
		bool req_stolen = false;
		struct sock *nsk;

		sk = req->rsk_listener;
		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
			sk_drops_add(sk, skb);
			reqsk_put(req);
			goto discard_it;
		}
		if (tcp_checksum_complete(skb)) {
			reqsk_put(req);
			goto csum_error;
		}
		if (unlikely(sk->sk_state != TCP_LISTEN)) {
			/* Listener is gone: drop the request and look the
			 * packet up again.
			 */
			inet_csk_reqsk_queue_drop_and_put(sk, req);
			goto lookup;
		}
		/* We own a reference on the listener, increase it again
		 * as we might lose it too soon.
		 */
		sock_hold(sk);
		refcounted = true;
		nsk = NULL;
		if (!tcp_filter(sk, skb)) {
			th = (const struct tcphdr *)skb->data;
			iph = ip_hdr(skb);
			tcp_v4_fill_cb(skb, iph, th);
			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
		}
		if (!nsk) {
			reqsk_put(req);
			if (req_stolen) {
				/* Another cpu got exclusive access to req
				 * and created a full blown socket.
				 * Try to feed this packet to this socket
				 * instead of discarding it.
				 */
				tcp_v4_restore_cb(skb);
				sock_put(sk);
				goto lookup;
			}
			goto discard_and_relse;
		}
		if (nsk == sk) {
			/* Stay on the listener: undo the cb conversion,
			 * it is redone further down.
			 */
			reqsk_put(req);
			tcp_v4_restore_cb(skb);
		} else if (tcp_child_process(sk, nsk, skb)) {
			tcp_v4_send_reset(nsk, skb);
			goto discard_and_relse;
		} else {
			sock_put(sk);
			return 0;
		}
	}
	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto discard_and_relse;
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;

	if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
		goto discard_and_relse;

	nf_reset_ct(skb);

	if (tcp_filter(sk, skb))
		goto discard_and_relse;
	/* Reload the header pointers after the filter ran. */
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
	tcp_v4_fill_cb(skb, iph, th);

	skb->dev = NULL;

	if (sk->sk_state == TCP_LISTEN) {
		/* Listeners are processed without taking the socket lock. */
		ret = tcp_v4_do_rcv(sk, skb);
		goto put_and_return;
	}

	sk_incoming_cpu_update(sk);

	bh_lock_sock_nested(sk);
	tcp_segs_in(tcp_sk(sk), skb);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
		/* Detach the cached rx skb first; it is freed after the
		 * socket lock has been released.
		 */
		skb_to_free = sk->sk_rx_skb_cache;
		sk->sk_rx_skb_cache = NULL;
		ret = tcp_v4_do_rcv(sk, skb);
	} else {
		/* tcp_add_backlog() has already unlocked the socket when
		 * it returns true.
		 */
		if (tcp_add_backlog(sk, skb))
			goto discard_and_relse;
		skb_to_free = NULL;
	}
	bh_unlock_sock(sk);
	if (skb_to_free)
		__kfree_skb(skb_to_free);

put_and_return:
	if (refcounted)
		sock_put(sk);

	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	tcp_v4_fill_cb(skb, iph, th);

	/* Note: csum_error and bad_packet are jump targets inside this
	 * if-body; a bad checksum only bumps counters, a good one is
	 * answered with a reset.
	 */
	if (tcp_checksum_complete(skb)) {
csum_error:
		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
bad_packet:
		__TCP_INC_STATS(net, TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sk_drops_add(sk, skb);
	if (refcounted)
		sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	tcp_v4_fill_cb(skb, iph, th);

	if (tcp_checksum_complete(skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto csum_error;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		/* If a listener exists for this segment, retire the
		 * timewait socket and process the packet on the listener.
		 */
		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
							&tcp_hashinfo, skb,
							__tcp_hdrlen(th),
							iph->saddr, th->source,
							iph->daddr, th->dest,
							inet_iif(skb),
							sdif);
		if (sk2) {
			inet_twsk_deschedule_put(inet_twsk(sk));
			sk = sk2;
			tcp_v4_restore_cb(skb);
			refcounted = false;
			goto process;
		}
	}
		/* to ACK */
		fallthrough;
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		tcp_v4_send_reset(sk, skb);
		inet_twsk_deschedule_put(inet_twsk(sk));
		goto discard_it;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}

/* Timewait socket operations for TCP. */
static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
	.twsk_unique	= tcp_twsk_unique,
	.twsk_destructor= tcp_twsk_destructor,
};

/* Cache the input route of @skb on @sk, together with the incoming
 * interface index. Nothing is cached when the skb has no dst or when a
 * reference on it cannot be taken.
 */
void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	if (dst && dst_hold_safe(dst)) {
		sk->sk_rx_dst = dst;
		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
	}
}
EXPORT_SYMBOL(inet_sk_rx_dst_set);

/* Address family specific connection socket operations for IPv4. */
const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
.conn_request = tcp_v4_conn_request, 2133 .syn_recv_sock = tcp_v4_syn_recv_sock, 2134 .net_header_len = sizeof(struct iphdr), 2135 .setsockopt = ip_setsockopt, 2136 .getsockopt = ip_getsockopt, 2137 .addr2sockaddr = inet_csk_addr2sockaddr, 2138 .sockaddr_len = sizeof(struct sockaddr_in), 2139 .mtu_reduced = tcp_v4_mtu_reduced, 2140 }; 2141 EXPORT_SYMBOL(ipv4_specific); 2142 2143 #ifdef CONFIG_TCP_MD5SIG 2144 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { 2145 .md5_lookup = tcp_v4_md5_lookup, 2146 .calc_md5_hash = tcp_v4_md5_hash_skb, 2147 .md5_parse = tcp_v4_parse_md5_keys, 2148 }; 2149 #endif 2150 2151 /* NOTE: A lot of things set to zero explicitly by call to 2152 * sk_alloc() so need not be done here. 2153 */ 2154 static int tcp_v4_init_sock(struct sock *sk) 2155 { 2156 struct inet_connection_sock *icsk = inet_csk(sk); 2157 2158 tcp_init_sock(sk); 2159 2160 icsk->icsk_af_ops = &ipv4_specific; 2161 2162 #ifdef CONFIG_TCP_MD5SIG 2163 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific; 2164 #endif 2165 2166 return 0; 2167 } 2168 2169 void tcp_v4_destroy_sock(struct sock *sk) 2170 { 2171 struct tcp_sock *tp = tcp_sk(sk); 2172 2173 trace_tcp_destroy_sock(sk); 2174 2175 tcp_clear_xmit_timers(sk); 2176 2177 tcp_cleanup_congestion_control(sk); 2178 2179 tcp_cleanup_ulp(sk); 2180 2181 /* Cleanup up the write buffer. */ 2182 tcp_write_queue_purge(sk); 2183 2184 /* Check if we want to disable active TFO */ 2185 tcp_fastopen_active_disable_ofo_check(sk); 2186 2187 /* Cleans up our, hopefully empty, out_of_order_queue. */ 2188 skb_rbtree_purge(&tp->out_of_order_queue); 2189 2190 #ifdef CONFIG_TCP_MD5SIG 2191 /* Clean up the MD5 key list, if any */ 2192 if (tp->md5sig_info) { 2193 tcp_clear_md5_list(sk); 2194 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu); 2195 tp->md5sig_info = NULL; 2196 } 2197 #endif 2198 2199 /* Clean up a referenced TCP bind bucket. 
*/ 2200 if (inet_csk(sk)->icsk_bind_hash) 2201 inet_put_port(sk); 2202 2203 BUG_ON(rcu_access_pointer(tp->fastopen_rsk)); 2204 2205 /* If socket is aborted during connect operation */ 2206 tcp_free_fastopen_req(tp); 2207 tcp_fastopen_destroy_cipher(sk); 2208 tcp_saved_syn_free(tp); 2209 2210 sk_sockets_allocated_dec(sk); 2211 } 2212 EXPORT_SYMBOL(tcp_v4_destroy_sock); 2213 2214 #ifdef CONFIG_PROC_FS 2215 /* Proc filesystem TCP sock list dumping. */ 2216 2217 /* 2218 * Get next listener socket follow cur. If cur is NULL, get first socket 2219 * starting from bucket given in st->bucket; when st->bucket is zero the 2220 * very first socket in the hash table is returned. 2221 */ 2222 static void *listening_get_next(struct seq_file *seq, void *cur) 2223 { 2224 struct tcp_seq_afinfo *afinfo; 2225 struct tcp_iter_state *st = seq->private; 2226 struct net *net = seq_file_net(seq); 2227 struct inet_listen_hashbucket *ilb; 2228 struct hlist_nulls_node *node; 2229 struct sock *sk = cur; 2230 2231 if (st->bpf_seq_afinfo) 2232 afinfo = st->bpf_seq_afinfo; 2233 else 2234 afinfo = PDE_DATA(file_inode(seq->file)); 2235 2236 if (!sk) { 2237 get_head: 2238 ilb = &tcp_hashinfo.listening_hash[st->bucket]; 2239 spin_lock(&ilb->lock); 2240 sk = sk_nulls_head(&ilb->nulls_head); 2241 st->offset = 0; 2242 goto get_sk; 2243 } 2244 ilb = &tcp_hashinfo.listening_hash[st->bucket]; 2245 ++st->num; 2246 ++st->offset; 2247 2248 sk = sk_nulls_next(sk); 2249 get_sk: 2250 sk_nulls_for_each_from(sk, node) { 2251 if (!net_eq(sock_net(sk), net)) 2252 continue; 2253 if (afinfo->family == AF_UNSPEC || 2254 sk->sk_family == afinfo->family) 2255 return sk; 2256 } 2257 spin_unlock(&ilb->lock); 2258 st->offset = 0; 2259 if (++st->bucket < INET_LHTABLE_SIZE) 2260 goto get_head; 2261 return NULL; 2262 } 2263 2264 static void *listening_get_idx(struct seq_file *seq, loff_t *pos) 2265 { 2266 struct tcp_iter_state *st = seq->private; 2267 void *rc; 2268 2269 st->bucket = 0; 2270 st->offset = 0; 2271 rc = 
listening_get_next(seq, NULL); 2272 2273 while (rc && *pos) { 2274 rc = listening_get_next(seq, rc); 2275 --*pos; 2276 } 2277 return rc; 2278 } 2279 2280 static inline bool empty_bucket(const struct tcp_iter_state *st) 2281 { 2282 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain); 2283 } 2284 2285 /* 2286 * Get first established socket starting from bucket given in st->bucket. 2287 * If st->bucket is zero, the very first socket in the hash is returned. 2288 */ 2289 static void *established_get_first(struct seq_file *seq) 2290 { 2291 struct tcp_seq_afinfo *afinfo; 2292 struct tcp_iter_state *st = seq->private; 2293 struct net *net = seq_file_net(seq); 2294 void *rc = NULL; 2295 2296 if (st->bpf_seq_afinfo) 2297 afinfo = st->bpf_seq_afinfo; 2298 else 2299 afinfo = PDE_DATA(file_inode(seq->file)); 2300 2301 st->offset = 0; 2302 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) { 2303 struct sock *sk; 2304 struct hlist_nulls_node *node; 2305 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket); 2306 2307 /* Lockless fast path for the common case of empty buckets */ 2308 if (empty_bucket(st)) 2309 continue; 2310 2311 spin_lock_bh(lock); 2312 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { 2313 if ((afinfo->family != AF_UNSPEC && 2314 sk->sk_family != afinfo->family) || 2315 !net_eq(sock_net(sk), net)) { 2316 continue; 2317 } 2318 rc = sk; 2319 goto out; 2320 } 2321 spin_unlock_bh(lock); 2322 } 2323 out: 2324 return rc; 2325 } 2326 2327 static void *established_get_next(struct seq_file *seq, void *cur) 2328 { 2329 struct tcp_seq_afinfo *afinfo; 2330 struct sock *sk = cur; 2331 struct hlist_nulls_node *node; 2332 struct tcp_iter_state *st = seq->private; 2333 struct net *net = seq_file_net(seq); 2334 2335 if (st->bpf_seq_afinfo) 2336 afinfo = st->bpf_seq_afinfo; 2337 else 2338 afinfo = PDE_DATA(file_inode(seq->file)); 2339 2340 ++st->num; 2341 ++st->offset; 2342 2343 sk = sk_nulls_next(sk); 2344 2345 
sk_nulls_for_each_from(sk, node) { 2346 if ((afinfo->family == AF_UNSPEC || 2347 sk->sk_family == afinfo->family) && 2348 net_eq(sock_net(sk), net)) 2349 return sk; 2350 } 2351 2352 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); 2353 ++st->bucket; 2354 return established_get_first(seq); 2355 } 2356 2357 static void *established_get_idx(struct seq_file *seq, loff_t pos) 2358 { 2359 struct tcp_iter_state *st = seq->private; 2360 void *rc; 2361 2362 st->bucket = 0; 2363 rc = established_get_first(seq); 2364 2365 while (rc && pos) { 2366 rc = established_get_next(seq, rc); 2367 --pos; 2368 } 2369 return rc; 2370 } 2371 2372 static void *tcp_get_idx(struct seq_file *seq, loff_t pos) 2373 { 2374 void *rc; 2375 struct tcp_iter_state *st = seq->private; 2376 2377 st->state = TCP_SEQ_STATE_LISTENING; 2378 rc = listening_get_idx(seq, &pos); 2379 2380 if (!rc) { 2381 st->state = TCP_SEQ_STATE_ESTABLISHED; 2382 rc = established_get_idx(seq, pos); 2383 } 2384 2385 return rc; 2386 } 2387 2388 static void *tcp_seek_last_pos(struct seq_file *seq) 2389 { 2390 struct tcp_iter_state *st = seq->private; 2391 int offset = st->offset; 2392 int orig_num = st->num; 2393 void *rc = NULL; 2394 2395 switch (st->state) { 2396 case TCP_SEQ_STATE_LISTENING: 2397 if (st->bucket >= INET_LHTABLE_SIZE) 2398 break; 2399 st->state = TCP_SEQ_STATE_LISTENING; 2400 rc = listening_get_next(seq, NULL); 2401 while (offset-- && rc) 2402 rc = listening_get_next(seq, rc); 2403 if (rc) 2404 break; 2405 st->bucket = 0; 2406 st->state = TCP_SEQ_STATE_ESTABLISHED; 2407 fallthrough; 2408 case TCP_SEQ_STATE_ESTABLISHED: 2409 if (st->bucket > tcp_hashinfo.ehash_mask) 2410 break; 2411 rc = established_get_first(seq); 2412 while (offset-- && rc) 2413 rc = established_get_next(seq, rc); 2414 } 2415 2416 st->num = orig_num; 2417 2418 return rc; 2419 } 2420 2421 void *tcp_seq_start(struct seq_file *seq, loff_t *pos) 2422 { 2423 struct tcp_iter_state *st = seq->private; 2424 void *rc; 2425 2426 if (*pos && 
*pos == st->last_pos) { 2427 rc = tcp_seek_last_pos(seq); 2428 if (rc) 2429 goto out; 2430 } 2431 2432 st->state = TCP_SEQ_STATE_LISTENING; 2433 st->num = 0; 2434 st->bucket = 0; 2435 st->offset = 0; 2436 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; 2437 2438 out: 2439 st->last_pos = *pos; 2440 return rc; 2441 } 2442 EXPORT_SYMBOL(tcp_seq_start); 2443 2444 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2445 { 2446 struct tcp_iter_state *st = seq->private; 2447 void *rc = NULL; 2448 2449 if (v == SEQ_START_TOKEN) { 2450 rc = tcp_get_idx(seq, 0); 2451 goto out; 2452 } 2453 2454 switch (st->state) { 2455 case TCP_SEQ_STATE_LISTENING: 2456 rc = listening_get_next(seq, v); 2457 if (!rc) { 2458 st->state = TCP_SEQ_STATE_ESTABLISHED; 2459 st->bucket = 0; 2460 st->offset = 0; 2461 rc = established_get_first(seq); 2462 } 2463 break; 2464 case TCP_SEQ_STATE_ESTABLISHED: 2465 rc = established_get_next(seq, v); 2466 break; 2467 } 2468 out: 2469 ++*pos; 2470 st->last_pos = *pos; 2471 return rc; 2472 } 2473 EXPORT_SYMBOL(tcp_seq_next); 2474 2475 void tcp_seq_stop(struct seq_file *seq, void *v) 2476 { 2477 struct tcp_iter_state *st = seq->private; 2478 2479 switch (st->state) { 2480 case TCP_SEQ_STATE_LISTENING: 2481 if (v != SEQ_START_TOKEN) 2482 spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock); 2483 break; 2484 case TCP_SEQ_STATE_ESTABLISHED: 2485 if (v) 2486 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); 2487 break; 2488 } 2489 } 2490 EXPORT_SYMBOL(tcp_seq_stop); 2491 2492 static void get_openreq4(const struct request_sock *req, 2493 struct seq_file *f, int i) 2494 { 2495 const struct inet_request_sock *ireq = inet_rsk(req); 2496 long delta = req->rsk_timer.expires - jiffies; 2497 2498 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2499 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK", 2500 i, 2501 ireq->ir_loc_addr, 2502 ireq->ir_num, 2503 ireq->ir_rmt_addr, 2504 ntohs(ireq->ir_rmt_port), 2505 TCP_SYN_RECV, 2506 0, 0, /* could 
print option size, but that is af dependent. */ 2507 1, /* timers active (only the expire timer) */ 2508 jiffies_delta_to_clock_t(delta), 2509 req->num_timeout, 2510 from_kuid_munged(seq_user_ns(f), 2511 sock_i_uid(req->rsk_listener)), 2512 0, /* non standard timer */ 2513 0, /* open_requests have no inode */ 2514 0, 2515 req); 2516 } 2517 2518 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) 2519 { 2520 int timer_active; 2521 unsigned long timer_expires; 2522 const struct tcp_sock *tp = tcp_sk(sk); 2523 const struct inet_connection_sock *icsk = inet_csk(sk); 2524 const struct inet_sock *inet = inet_sk(sk); 2525 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; 2526 __be32 dest = inet->inet_daddr; 2527 __be32 src = inet->inet_rcv_saddr; 2528 __u16 destp = ntohs(inet->inet_dport); 2529 __u16 srcp = ntohs(inet->inet_sport); 2530 int rx_queue; 2531 int state; 2532 2533 if (icsk->icsk_pending == ICSK_TIME_RETRANS || 2534 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || 2535 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { 2536 timer_active = 1; 2537 timer_expires = icsk->icsk_timeout; 2538 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { 2539 timer_active = 4; 2540 timer_expires = icsk->icsk_timeout; 2541 } else if (timer_pending(&sk->sk_timer)) { 2542 timer_active = 2; 2543 timer_expires = sk->sk_timer.expires; 2544 } else { 2545 timer_active = 0; 2546 timer_expires = jiffies; 2547 } 2548 2549 state = inet_sk_state_load(sk); 2550 if (state == TCP_LISTEN) 2551 rx_queue = READ_ONCE(sk->sk_ack_backlog); 2552 else 2553 /* Because we don't lock the socket, 2554 * we might find a transient negative value. 
2555 */ 2556 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) - 2557 READ_ONCE(tp->copied_seq), 0); 2558 2559 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " 2560 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d", 2561 i, src, srcp, dest, destp, state, 2562 READ_ONCE(tp->write_seq) - tp->snd_una, 2563 rx_queue, 2564 timer_active, 2565 jiffies_delta_to_clock_t(timer_expires - jiffies), 2566 icsk->icsk_retransmits, 2567 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)), 2568 icsk->icsk_probes_out, 2569 sock_i_ino(sk), 2570 refcount_read(&sk->sk_refcnt), sk, 2571 jiffies_to_clock_t(icsk->icsk_rto), 2572 jiffies_to_clock_t(icsk->icsk_ack.ato), 2573 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk), 2574 tp->snd_cwnd, 2575 state == TCP_LISTEN ? 2576 fastopenq->max_qlen : 2577 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh)); 2578 } 2579 2580 static void get_timewait4_sock(const struct inet_timewait_sock *tw, 2581 struct seq_file *f, int i) 2582 { 2583 long delta = tw->tw_timer.expires - jiffies; 2584 __be32 dest, src; 2585 __u16 destp, srcp; 2586 2587 dest = tw->tw_daddr; 2588 src = tw->tw_rcv_saddr; 2589 destp = ntohs(tw->tw_dport); 2590 srcp = ntohs(tw->tw_sport); 2591 2592 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2593 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK", 2594 i, src, srcp, dest, destp, tw->tw_substate, 0, 0, 2595 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, 2596 refcount_read(&tw->tw_refcnt), tw); 2597 } 2598 2599 #define TMPSZ 150 2600 2601 static int tcp4_seq_show(struct seq_file *seq, void *v) 2602 { 2603 struct tcp_iter_state *st; 2604 struct sock *sk = v; 2605 2606 seq_setwidth(seq, TMPSZ - 1); 2607 if (v == SEQ_START_TOKEN) { 2608 seq_puts(seq, " sl local_address rem_address st tx_queue " 2609 "rx_queue tr tm->when retrnsmt uid timeout " 2610 "inode"); 2611 goto out; 2612 } 2613 st = seq->private; 2614 2615 if (sk->sk_state == TCP_TIME_WAIT) 2616 get_timewait4_sock(v, seq, st->num); 2617 else if (sk->sk_state == 
TCP_NEW_SYN_RECV) 2618 get_openreq4(v, seq, st->num); 2619 else 2620 get_tcp4_sock(v, seq, st->num); 2621 out: 2622 seq_pad(seq, '\n'); 2623 return 0; 2624 } 2625 2626 #ifdef CONFIG_BPF_SYSCALL 2627 struct bpf_iter__tcp { 2628 __bpf_md_ptr(struct bpf_iter_meta *, meta); 2629 __bpf_md_ptr(struct sock_common *, sk_common); 2630 uid_t uid __aligned(8); 2631 }; 2632 2633 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, 2634 struct sock_common *sk_common, uid_t uid) 2635 { 2636 struct bpf_iter__tcp ctx; 2637 2638 meta->seq_num--; /* skip SEQ_START_TOKEN */ 2639 ctx.meta = meta; 2640 ctx.sk_common = sk_common; 2641 ctx.uid = uid; 2642 return bpf_iter_run_prog(prog, &ctx); 2643 } 2644 2645 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v) 2646 { 2647 struct bpf_iter_meta meta; 2648 struct bpf_prog *prog; 2649 struct sock *sk = v; 2650 uid_t uid; 2651 2652 if (v == SEQ_START_TOKEN) 2653 return 0; 2654 2655 if (sk->sk_state == TCP_TIME_WAIT) { 2656 uid = 0; 2657 } else if (sk->sk_state == TCP_NEW_SYN_RECV) { 2658 const struct request_sock *req = v; 2659 2660 uid = from_kuid_munged(seq_user_ns(seq), 2661 sock_i_uid(req->rsk_listener)); 2662 } else { 2663 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); 2664 } 2665 2666 meta.seq = seq; 2667 prog = bpf_iter_get_info(&meta, false); 2668 return tcp_prog_seq_show(prog, &meta, v, uid); 2669 } 2670 2671 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v) 2672 { 2673 struct bpf_iter_meta meta; 2674 struct bpf_prog *prog; 2675 2676 if (!v) { 2677 meta.seq = seq; 2678 prog = bpf_iter_get_info(&meta, true); 2679 if (prog) 2680 (void)tcp_prog_seq_show(prog, &meta, v, 0); 2681 } 2682 2683 tcp_seq_stop(seq, v); 2684 } 2685 2686 static const struct seq_operations bpf_iter_tcp_seq_ops = { 2687 .show = bpf_iter_tcp_seq_show, 2688 .start = tcp_seq_start, 2689 .next = tcp_seq_next, 2690 .stop = bpf_iter_tcp_seq_stop, 2691 }; 2692 #endif 2693 2694 static const struct 
seq_operations tcp4_seq_ops = { 2695 .show = tcp4_seq_show, 2696 .start = tcp_seq_start, 2697 .next = tcp_seq_next, 2698 .stop = tcp_seq_stop, 2699 }; 2700 2701 static struct tcp_seq_afinfo tcp4_seq_afinfo = { 2702 .family = AF_INET, 2703 }; 2704 2705 static int __net_init tcp4_proc_init_net(struct net *net) 2706 { 2707 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops, 2708 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo)) 2709 return -ENOMEM; 2710 return 0; 2711 } 2712 2713 static void __net_exit tcp4_proc_exit_net(struct net *net) 2714 { 2715 remove_proc_entry("tcp", net->proc_net); 2716 } 2717 2718 static struct pernet_operations tcp4_net_ops = { 2719 .init = tcp4_proc_init_net, 2720 .exit = tcp4_proc_exit_net, 2721 }; 2722 2723 int __init tcp4_proc_init(void) 2724 { 2725 return register_pernet_subsys(&tcp4_net_ops); 2726 } 2727 2728 void tcp4_proc_exit(void) 2729 { 2730 unregister_pernet_subsys(&tcp4_net_ops); 2731 } 2732 #endif /* CONFIG_PROC_FS */ 2733 2734 struct proto tcp_prot = { 2735 .name = "TCP", 2736 .owner = THIS_MODULE, 2737 .close = tcp_close, 2738 .pre_connect = tcp_v4_pre_connect, 2739 .connect = tcp_v4_connect, 2740 .disconnect = tcp_disconnect, 2741 .accept = inet_csk_accept, 2742 .ioctl = tcp_ioctl, 2743 .init = tcp_v4_init_sock, 2744 .destroy = tcp_v4_destroy_sock, 2745 .shutdown = tcp_shutdown, 2746 .setsockopt = tcp_setsockopt, 2747 .getsockopt = tcp_getsockopt, 2748 .keepalive = tcp_set_keepalive, 2749 .recvmsg = tcp_recvmsg, 2750 .sendmsg = tcp_sendmsg, 2751 .sendpage = tcp_sendpage, 2752 .backlog_rcv = tcp_v4_do_rcv, 2753 .release_cb = tcp_release_cb, 2754 .hash = inet_hash, 2755 .unhash = inet_unhash, 2756 .get_port = inet_csk_get_port, 2757 .enter_memory_pressure = tcp_enter_memory_pressure, 2758 .leave_memory_pressure = tcp_leave_memory_pressure, 2759 .stream_memory_free = tcp_stream_memory_free, 2760 .sockets_allocated = &tcp_sockets_allocated, 2761 .orphan_count = &tcp_orphan_count, 2762 .memory_allocated = 
&tcp_memory_allocated, 2763 .memory_pressure = &tcp_memory_pressure, 2764 .sysctl_mem = sysctl_tcp_mem, 2765 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), 2766 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), 2767 .max_header = MAX_TCP_HEADER, 2768 .obj_size = sizeof(struct tcp_sock), 2769 .slab_flags = SLAB_TYPESAFE_BY_RCU, 2770 .twsk_prot = &tcp_timewait_sock_ops, 2771 .rsk_prot = &tcp_request_sock_ops, 2772 .h.hashinfo = &tcp_hashinfo, 2773 .no_autobind = true, 2774 .diag_destroy = tcp_abort, 2775 }; 2776 EXPORT_SYMBOL(tcp_prot); 2777 2778 static void __net_exit tcp_sk_exit(struct net *net) 2779 { 2780 int cpu; 2781 2782 if (net->ipv4.tcp_congestion_control) 2783 bpf_module_put(net->ipv4.tcp_congestion_control, 2784 net->ipv4.tcp_congestion_control->owner); 2785 2786 for_each_possible_cpu(cpu) 2787 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu)); 2788 free_percpu(net->ipv4.tcp_sk); 2789 } 2790 2791 static int __net_init tcp_sk_init(struct net *net) 2792 { 2793 int res, cpu, cnt; 2794 2795 net->ipv4.tcp_sk = alloc_percpu(struct sock *); 2796 if (!net->ipv4.tcp_sk) 2797 return -ENOMEM; 2798 2799 for_each_possible_cpu(cpu) { 2800 struct sock *sk; 2801 2802 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW, 2803 IPPROTO_TCP, net); 2804 if (res) 2805 goto fail; 2806 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); 2807 2808 /* Please enforce IP_DF and IPID==0 for RST and 2809 * ACK sent in SYN-RECV and TIME-WAIT state. 
2810 */ 2811 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO; 2812 2813 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk; 2814 } 2815 2816 net->ipv4.sysctl_tcp_ecn = 2; 2817 net->ipv4.sysctl_tcp_ecn_fallback = 1; 2818 2819 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; 2820 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS; 2821 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; 2822 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL; 2823 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS; 2824 2825 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME; 2826 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES; 2827 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL; 2828 2829 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES; 2830 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES; 2831 net->ipv4.sysctl_tcp_syncookies = 1; 2832 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH; 2833 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1; 2834 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2; 2835 net->ipv4.sysctl_tcp_orphan_retries = 0; 2836 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; 2837 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; 2838 net->ipv4.sysctl_tcp_tw_reuse = 2; 2839 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1; 2840 2841 cnt = tcp_hashinfo.ehash_mask + 1; 2842 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2; 2843 net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo; 2844 2845 net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128); 2846 net->ipv4.sysctl_tcp_sack = 1; 2847 net->ipv4.sysctl_tcp_window_scaling = 1; 2848 net->ipv4.sysctl_tcp_timestamps = 1; 2849 net->ipv4.sysctl_tcp_early_retrans = 3; 2850 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION; 2851 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. 
*/ 2852 net->ipv4.sysctl_tcp_retrans_collapse = 1; 2853 net->ipv4.sysctl_tcp_max_reordering = 300; 2854 net->ipv4.sysctl_tcp_dsack = 1; 2855 net->ipv4.sysctl_tcp_app_win = 31; 2856 net->ipv4.sysctl_tcp_adv_win_scale = 1; 2857 net->ipv4.sysctl_tcp_frto = 2; 2858 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1; 2859 /* This limits the percentage of the congestion window which we 2860 * will allow a single TSO frame to consume. Building TSO frames 2861 * which are too large can cause TCP streams to be bursty. 2862 */ 2863 net->ipv4.sysctl_tcp_tso_win_divisor = 3; 2864 /* Default TSQ limit of 16 TSO segments */ 2865 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536; 2866 /* rfc5961 challenge ack rate limiting */ 2867 net->ipv4.sysctl_tcp_challenge_ack_limit = 1000; 2868 net->ipv4.sysctl_tcp_min_tso_segs = 2; 2869 net->ipv4.sysctl_tcp_min_rtt_wlen = 300; 2870 net->ipv4.sysctl_tcp_autocorking = 1; 2871 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; 2872 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; 2873 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120; 2874 if (net != &init_net) { 2875 memcpy(net->ipv4.sysctl_tcp_rmem, 2876 init_net.ipv4.sysctl_tcp_rmem, 2877 sizeof(init_net.ipv4.sysctl_tcp_rmem)); 2878 memcpy(net->ipv4.sysctl_tcp_wmem, 2879 init_net.ipv4.sysctl_tcp_wmem, 2880 sizeof(init_net.ipv4.sysctl_tcp_wmem)); 2881 } 2882 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC; 2883 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC; 2884 net->ipv4.sysctl_tcp_comp_sack_nr = 44; 2885 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; 2886 spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); 2887 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60; 2888 atomic_set(&net->ipv4.tfo_active_disable_times, 0); 2889 2890 /* Reno is always built in */ 2891 if (!net_eq(net, &init_net) && 2892 bpf_try_module_get(init_net.ipv4.tcp_congestion_control, 2893 init_net.ipv4.tcp_congestion_control->owner)) 2894 net->ipv4.tcp_congestion_control = 
init_net.ipv4.tcp_congestion_control; 2895 else 2896 net->ipv4.tcp_congestion_control = &tcp_reno; 2897 2898 return 0; 2899 fail: 2900 tcp_sk_exit(net); 2901 2902 return res; 2903 } 2904 2905 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) 2906 { 2907 struct net *net; 2908 2909 inet_twsk_purge(&tcp_hashinfo, AF_INET); 2910 2911 list_for_each_entry(net, net_exit_list, exit_list) 2912 tcp_fastopen_ctx_destroy(net); 2913 } 2914 2915 static struct pernet_operations __net_initdata tcp_sk_ops = { 2916 .init = tcp_sk_init, 2917 .exit = tcp_sk_exit, 2918 .exit_batch = tcp_sk_exit_batch, 2919 }; 2920 2921 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 2922 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta, 2923 struct sock_common *sk_common, uid_t uid) 2924 2925 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux) 2926 { 2927 struct tcp_iter_state *st = priv_data; 2928 struct tcp_seq_afinfo *afinfo; 2929 int ret; 2930 2931 afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN); 2932 if (!afinfo) 2933 return -ENOMEM; 2934 2935 afinfo->family = AF_UNSPEC; 2936 st->bpf_seq_afinfo = afinfo; 2937 ret = bpf_iter_init_seq_net(priv_data, aux); 2938 if (ret) 2939 kfree(afinfo); 2940 return ret; 2941 } 2942 2943 static void bpf_iter_fini_tcp(void *priv_data) 2944 { 2945 struct tcp_iter_state *st = priv_data; 2946 2947 kfree(st->bpf_seq_afinfo); 2948 bpf_iter_fini_seq_net(priv_data); 2949 } 2950 2951 static const struct bpf_iter_seq_info tcp_seq_info = { 2952 .seq_ops = &bpf_iter_tcp_seq_ops, 2953 .init_seq_private = bpf_iter_init_tcp, 2954 .fini_seq_private = bpf_iter_fini_tcp, 2955 .seq_priv_size = sizeof(struct tcp_iter_state), 2956 }; 2957 2958 static struct bpf_iter_reg tcp_reg_info = { 2959 .target = "tcp", 2960 .ctx_arg_info_size = 1, 2961 .ctx_arg_info = { 2962 { offsetof(struct bpf_iter__tcp, sk_common), 2963 PTR_TO_BTF_ID_OR_NULL }, 2964 }, 2965 .seq_info = &tcp_seq_info, 2966 }; 2967 2968 static void 
__init bpf_iter_register(void) 2969 { 2970 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON]; 2971 if (bpf_iter_reg_target(&tcp_reg_info)) 2972 pr_warn("Warning: could not register bpf iterator tcp\n"); 2973 } 2974 2975 #endif 2976 2977 void __init tcp_v4_init(void) 2978 { 2979 if (register_pernet_subsys(&tcp_sk_ops)) 2980 panic("Failed to create the TCP control socket.\n"); 2981 2982 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 2983 bpf_iter_register(); 2984 #endif 2985 } 2986