1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * INET An implementation of the TCP/IP protocol suite for the LINUX 4 * operating system. INET is implemented using the BSD Socket 5 * interface as the means of communication with the user level. 6 * 7 * Implementation of the Transmission Control Protocol(TCP). 8 * 9 * IPv4 specific functions 10 * 11 * code split from: 12 * linux/ipv4/tcp.c 13 * linux/ipv4/tcp_input.c 14 * linux/ipv4/tcp_output.c 15 * 16 * See tcp.c for author information 17 */ 18 19 /* 20 * Changes: 21 * David S. Miller : New socket lookup architecture. 22 * This code is dedicated to John Dyson. 23 * David S. Miller : Change semantics of established hash, 24 * half is devoted to TIME_WAIT sockets 25 * and the rest go in the other half. 26 * Andi Kleen : Add support for syncookies and fixed 27 * some bugs: ip options weren't passed to 28 * the TCP layer, missed a check for an 29 * ACK bit. 30 * Andi Kleen : Implemented fast path mtu discovery. 31 * Fixed many serious bugs in the 32 * request_sock handling and moved 33 * most of it into the af independent code. 34 * Added tail drop and some other bugfixes. 35 * Added new listen semantics. 36 * Mike McLagan : Routing by source 37 * Juan Jose Ciarlante: ip_dynaddr bits 38 * Andi Kleen: various fixes. 39 * Vitaly E. Lavrov : Transparent proxy revived after year 40 * coma. 41 * Andi Kleen : Fix new listen. 42 * Andi Kleen : Fix accept error reporting. 43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which 44 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind 45 * a single port at the same time. 46 */ 47 48 #define pr_fmt(fmt) "TCP: " fmt 49 50 #include <linux/bottom_half.h> 51 #include <linux/types.h> 52 #include <linux/fcntl.h> 53 #include <linux/module.h> 54 #include <linux/random.h> 55 #include <linux/cache.h> 56 #include <linux/jhash.h> 57 #include <linux/init.h> 58 #include <linux/times.h> 59 #include <linux/slab.h> 60 61 #include <net/net_namespace.h> 62 #include <net/icmp.h> 63 #include <net/inet_hashtables.h> 64 #include <net/tcp.h> 65 #include <net/transp_v6.h> 66 #include <net/ipv6.h> 67 #include <net/inet_common.h> 68 #include <net/timewait_sock.h> 69 #include <net/xfrm.h> 70 #include <net/secure_seq.h> 71 #include <net/busy_poll.h> 72 73 #include <linux/inet.h> 74 #include <linux/ipv6.h> 75 #include <linux/stddef.h> 76 #include <linux/proc_fs.h> 77 #include <linux/seq_file.h> 78 #include <linux/inetdevice.h> 79 80 #include <crypto/hash.h> 81 #include <linux/scatterlist.h> 82 83 #include <trace/events/tcp.h> 84 85 #ifdef CONFIG_TCP_MD5SIG 86 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, 87 __be32 daddr, __be32 saddr, const struct tcphdr *th); 88 #endif 89 90 struct inet_hashinfo tcp_hashinfo; 91 EXPORT_SYMBOL(tcp_hashinfo); 92 93 static u32 tcp_v4_init_seq(const struct sk_buff *skb) 94 { 95 return secure_tcp_seq(ip_hdr(skb)->daddr, 96 ip_hdr(skb)->saddr, 97 tcp_hdr(skb)->dest, 98 tcp_hdr(skb)->source); 99 } 100 101 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb) 102 { 103 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr); 104 } 105 106 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) 107 { 108 const struct inet_timewait_sock *tw = inet_twsk(sktw); 109 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw); 110 struct tcp_sock *tp = tcp_sk(sk); 111 int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse; 112 113 if (reuse == 2) { 114 /* Still does not detect *everything* that goes 
through 115 * lo, since we require a loopback src or dst address 116 * or direct binding to 'lo' interface. 117 */ 118 bool loopback = false; 119 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX) 120 loopback = true; 121 #if IS_ENABLED(CONFIG_IPV6) 122 if (tw->tw_family == AF_INET6) { 123 if (ipv6_addr_loopback(&tw->tw_v6_daddr) || 124 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) || 125 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) || 126 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr)) 127 loopback = true; 128 } else 129 #endif 130 { 131 if (ipv4_is_loopback(tw->tw_daddr) || 132 ipv4_is_loopback(tw->tw_rcv_saddr)) 133 loopback = true; 134 } 135 if (!loopback) 136 reuse = 0; 137 } 138 139 /* With PAWS, it is safe from the viewpoint 140 of data integrity. Even without PAWS it is safe provided sequence 141 spaces do not overlap i.e. at data rates <= 80Mbit/sec. 142 143 Actually, the idea is close to VJ's one, only timestamp cache is 144 held not per host, but per port pair and TW bucket is used as state 145 holder. 146 147 If TW bucket has been already destroyed we fall back to VJ's scheme 148 and use initial timestamp retrieved from peer table. 149 */ 150 if (tcptw->tw_ts_recent_stamp && 151 (!twp || (reuse && time_after32(ktime_get_seconds(), 152 tcptw->tw_ts_recent_stamp)))) { 153 /* In case of repair and re-using TIME-WAIT sockets we still 154 * want to be sure that it is safe as above but honor the 155 * sequence numbers and time stamps set as part of the repair 156 * process. 157 * 158 * Without this check re-using a TIME-WAIT socket with TCP 159 * repair would accumulate a -1 on the repair assigned 160 * sequence number. The first time it is reused the sequence 161 * is -1, the second time -2, etc. This fixes that issue 162 * without appearing to create any others. 163 */ 164 if (likely(!tp->repair)) { 165 u32 seq = tcptw->tw_snd_nxt + 65535 + 2; 166 167 if (!seq) 168 seq = 1; 169 WRITE_ONCE(tp->write_seq, seq); 170 tp->rx_opt.ts_recent = tcptw->tw_ts_recent; 171 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; 172 } 173 sock_hold(sktw); 174 return 1; 175 } 176 177 return 0; 178 } 179 EXPORT_SYMBOL_GPL(tcp_twsk_unique); 180 181 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr, 182 int addr_len) 183 { 184 /* This check is replicated from tcp_v4_connect() and intended to 185 * prevent BPF program called below from accessing bytes that are out 186 * of the bound specified by user in addr_len. 187 */ 188 if (addr_len < sizeof(struct sockaddr_in)) 189 return -EINVAL; 190 191 sock_owned_by_me(sk); 192 193 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr); 194 } 195 196 /* This will initiate an outgoing connection. 
*/ 197 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) 198 { 199 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; 200 struct inet_sock *inet = inet_sk(sk); 201 struct tcp_sock *tp = tcp_sk(sk); 202 __be16 orig_sport, orig_dport; 203 __be32 daddr, nexthop; 204 struct flowi4 *fl4; 205 struct rtable *rt; 206 int err; 207 struct ip_options_rcu *inet_opt; 208 struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; 209 210 if (addr_len < sizeof(struct sockaddr_in)) 211 return -EINVAL; 212 213 if (usin->sin_family != AF_INET) 214 return -EAFNOSUPPORT; 215 216 nexthop = daddr = usin->sin_addr.s_addr; 217 inet_opt = rcu_dereference_protected(inet->inet_opt, 218 lockdep_sock_is_held(sk)); 219 if (inet_opt && inet_opt->opt.srr) { 220 if (!daddr) 221 return -EINVAL; 222 nexthop = inet_opt->opt.faddr; 223 } 224 225 orig_sport = inet->inet_sport; 226 orig_dport = usin->sin_port; 227 fl4 = &inet->cork.fl.u.ip4; 228 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr, 229 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, 230 IPPROTO_TCP, 231 orig_sport, orig_dport, sk); 232 if (IS_ERR(rt)) { 233 err = PTR_ERR(rt); 234 if (err == -ENETUNREACH) 235 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); 236 return err; 237 } 238 239 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { 240 ip_rt_put(rt); 241 return -ENETUNREACH; 242 } 243 244 if (!inet_opt || !inet_opt->opt.srr) 245 daddr = fl4->daddr; 246 247 if (!inet->inet_saddr) 248 inet->inet_saddr = fl4->saddr; 249 sk_rcv_saddr_set(sk, inet->inet_saddr); 250 251 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) { 252 /* Reset inherited state */ 253 tp->rx_opt.ts_recent = 0; 254 tp->rx_opt.ts_recent_stamp = 0; 255 if (likely(!tp->repair)) 256 WRITE_ONCE(tp->write_seq, 0); 257 } 258 259 inet->inet_dport = usin->sin_port; 260 sk_daddr_set(sk, daddr); 261 262 inet_csk(sk)->icsk_ext_hdr_len = 0; 263 if (inet_opt) 264 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 265 266 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT; 267 268 /* Socket identity is still unknown (sport may be zero). 269 * However we set state to SYN-SENT and not releasing socket 270 * lock select source port, enter ourselves into the hash tables and 271 * complete initialization after this. 272 */ 273 tcp_set_state(sk, TCP_SYN_SENT); 274 err = inet_hash_connect(tcp_death_row, sk); 275 if (err) 276 goto failure; 277 278 sk_set_txhash(sk); 279 280 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport, 281 inet->inet_sport, inet->inet_dport, sk); 282 if (IS_ERR(rt)) { 283 err = PTR_ERR(rt); 284 rt = NULL; 285 goto failure; 286 } 287 /* OK, now commit destination to socket. */ 288 sk->sk_gso_type = SKB_GSO_TCPV4; 289 sk_setup_caps(sk, &rt->dst); 290 rt = NULL; 291 292 if (likely(!tp->repair)) { 293 if (!tp->write_seq) 294 WRITE_ONCE(tp->write_seq, 295 secure_tcp_seq(inet->inet_saddr, 296 inet->inet_daddr, 297 inet->inet_sport, 298 usin->sin_port)); 299 tp->tsoffset = secure_tcp_ts_off(sock_net(sk), 300 inet->inet_saddr, 301 inet->inet_daddr); 302 } 303 304 inet->inet_id = prandom_u32(); 305 306 if (tcp_fastopen_defer_connect(sk, &err)) 307 return err; 308 if (err) 309 goto failure; 310 311 err = tcp_connect(sk); 312 313 if (err) 314 goto failure; 315 316 return 0; 317 318 failure: 319 /* 320 * This unhashes the socket and releases the local port, 321 * if necessary. 
322 */ 323 tcp_set_state(sk, TCP_CLOSE); 324 ip_rt_put(rt); 325 sk->sk_route_caps = 0; 326 inet->inet_dport = 0; 327 return err; 328 } 329 EXPORT_SYMBOL(tcp_v4_connect); 330 331 /* 332 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191. 333 * It can be called through tcp_release_cb() if socket was owned by user 334 * at the time tcp_v4_err() was called to handle ICMP message. 335 */ 336 void tcp_v4_mtu_reduced(struct sock *sk) 337 { 338 struct inet_sock *inet = inet_sk(sk); 339 struct dst_entry *dst; 340 u32 mtu; 341 342 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) 343 return; 344 mtu = tcp_sk(sk)->mtu_info; 345 dst = inet_csk_update_pmtu(sk, mtu); 346 if (!dst) 347 return; 348 349 /* Something is about to be wrong... Remember soft error 350 * for the case, if this connection will not able to recover. 351 */ 352 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst)) 353 sk->sk_err_soft = EMSGSIZE; 354 355 mtu = dst_mtu(dst); 356 357 if (inet->pmtudisc != IP_PMTUDISC_DONT && 358 ip_sk_accept_pmtu(sk) && 359 inet_csk(sk)->icsk_pmtu_cookie > mtu) { 360 tcp_sync_mss(sk, mtu); 361 362 /* Resend the TCP packet because it's 363 * clear that the old packet has been 364 * dropped. This is the new "fast" path mtu 365 * discovery. 366 */ 367 tcp_simple_retransmit(sk); 368 } /* else let the usual retransmit timer handle it */ 369 } 370 EXPORT_SYMBOL(tcp_v4_mtu_reduced); 371 372 static void do_redirect(struct sk_buff *skb, struct sock *sk) 373 { 374 struct dst_entry *dst = __sk_dst_check(sk, 0); 375 376 if (dst) 377 dst->ops->redirect(dst, sk, skb); 378 } 379 380 381 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */ 382 void tcp_req_err(struct sock *sk, u32 seq, bool abort) 383 { 384 struct request_sock *req = inet_reqsk(sk); 385 struct net *net = sock_net(sk); 386 387 /* ICMPs are not backlogged, hence we cannot get 388 * an established socket here. 389 */ 390 if (seq != tcp_rsk(req)->snt_isn) { 391 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); 392 } else if (abort) { 393 /* 394 * Still in SYN_RECV, just remove it silently. 395 * There is no good way to pass the error to the newly 396 * created socket, and POSIX does not want network 397 * errors returned from accept(). 398 */ 399 inet_csk_reqsk_queue_drop(req->rsk_listener, req); 400 tcp_listendrop(req->rsk_listener); 401 } 402 reqsk_put(req); 403 } 404 EXPORT_SYMBOL(tcp_req_err); 405 406 /* TCP-LD (RFC 6069) logic */ 407 void tcp_ld_RTO_revert(struct sock *sk, u32 seq) 408 { 409 struct inet_connection_sock *icsk = inet_csk(sk); 410 struct tcp_sock *tp = tcp_sk(sk); 411 struct sk_buff *skb; 412 s32 remaining; 413 u32 delta_us; 414 415 if (sock_owned_by_user(sk)) 416 return; 417 418 if (seq != tp->snd_una || !icsk->icsk_retransmits || 419 !icsk->icsk_backoff) 420 return; 421 422 skb = tcp_rtx_queue_head(sk); 423 if (WARN_ON_ONCE(!skb)) 424 return; 425 426 icsk->icsk_backoff--; 427 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT; 428 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); 429 430 tcp_mstamp_refresh(tp); 431 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb)); 432 remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us); 433 434 if (remaining > 0) { 435 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, 436 remaining, TCP_RTO_MAX); 437 } else { 438 /* RTO revert clocked out retransmission. 439 * Will retransmit now. 
440 */ 441 tcp_retransmit_timer(sk); 442 } 443 } 444 EXPORT_SYMBOL(tcp_ld_RTO_revert); 445 446 /* 447 * This routine is called by the ICMP module when it gets some 448 * sort of error condition. If err < 0 then the socket should 449 * be closed and the error returned to the user. If err > 0 450 * it's just the icmp type << 8 | icmp code. After adjustment 451 * header points to the first 8 bytes of the tcp header. We need 452 * to find the appropriate port. 453 * 454 * The locking strategy used here is very "optimistic". When 455 * someone else accesses the socket the ICMP is just dropped 456 * and for some paths there is no check at all. 457 * A more general error queue to queue errors for later handling 458 * is probably better. 459 * 460 */ 461 462 int tcp_v4_err(struct sk_buff *skb, u32 info) 463 { 464 const struct iphdr *iph = (const struct iphdr *)skb->data; 465 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2)); 466 struct tcp_sock *tp; 467 struct inet_sock *inet; 468 const int type = icmp_hdr(skb)->type; 469 const int code = icmp_hdr(skb)->code; 470 struct sock *sk; 471 struct request_sock *fastopen; 472 u32 seq, snd_una; 473 int err; 474 struct net *net = dev_net(skb->dev); 475 476 sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr, 477 th->dest, iph->saddr, ntohs(th->source), 478 inet_iif(skb), 0); 479 if (!sk) { 480 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); 481 return -ENOENT; 482 } 483 if (sk->sk_state == TCP_TIME_WAIT) { 484 inet_twsk_put(inet_twsk(sk)); 485 return 0; 486 } 487 seq = ntohl(th->seq); 488 if (sk->sk_state == TCP_NEW_SYN_RECV) { 489 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB || 490 type == ICMP_TIME_EXCEEDED || 491 (type == ICMP_DEST_UNREACH && 492 (code == ICMP_NET_UNREACH || 493 code == ICMP_HOST_UNREACH))); 494 return 0; 495 } 496 497 bh_lock_sock(sk); 498 /* If too many ICMPs get dropped on busy 499 * servers this needs to be solved differently. 500 * We do take care of PMTU discovery (RFC1191) special case : 501 * we can receive locally generated ICMP messages while socket is held. 502 */ 503 if (sock_owned_by_user(sk)) { 504 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)) 505 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS); 506 } 507 if (sk->sk_state == TCP_CLOSE) 508 goto out; 509 510 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) { 511 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 512 goto out; 513 } 514 515 tp = tcp_sk(sk); 516 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ 517 fastopen = rcu_dereference(tp->fastopen_rsk); 518 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una; 519 if (sk->sk_state != TCP_LISTEN && 520 !between(seq, snd_una, tp->snd_nxt)) { 521 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); 522 goto out; 523 } 524 525 switch (type) { 526 case ICMP_REDIRECT: 527 if (!sock_owned_by_user(sk)) 528 do_redirect(skb, sk); 529 goto out; 530 case ICMP_SOURCE_QUENCH: 531 /* Just silently ignore these. */ 532 goto out; 533 case ICMP_PARAMETERPROB: 534 err = EPROTO; 535 break; 536 case ICMP_DEST_UNREACH: 537 if (code > NR_ICMP_UNREACH) 538 goto out; 539 540 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ 541 /* We are not interested in TCP_LISTEN and open_requests 542 * (SYN-ACKs send out by Linux are always <576bytes so 543 * they should go through unfragmented). 
544 */ 545 if (sk->sk_state == TCP_LISTEN) 546 goto out; 547 548 tp->mtu_info = info; 549 if (!sock_owned_by_user(sk)) { 550 tcp_v4_mtu_reduced(sk); 551 } else { 552 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags)) 553 sock_hold(sk); 554 } 555 goto out; 556 } 557 558 err = icmp_err_convert[code].errno; 559 /* check if this ICMP message allows revert of backoff. 560 * (see RFC 6069) 561 */ 562 if (!fastopen && 563 (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH)) 564 tcp_ld_RTO_revert(sk, seq); 565 break; 566 case ICMP_TIME_EXCEEDED: 567 err = EHOSTUNREACH; 568 break; 569 default: 570 goto out; 571 } 572 573 switch (sk->sk_state) { 574 case TCP_SYN_SENT: 575 case TCP_SYN_RECV: 576 /* Only in fast or simultaneous open. If a fast open socket is 577 * is already accepted it is treated as a connected one below. 578 */ 579 if (fastopen && !fastopen->sk) 580 break; 581 582 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th); 583 584 if (!sock_owned_by_user(sk)) { 585 sk->sk_err = err; 586 587 sk->sk_error_report(sk); 588 589 tcp_done(sk); 590 } else { 591 sk->sk_err_soft = err; 592 } 593 goto out; 594 } 595 596 /* If we've already connected we will keep trying 597 * until we time out, or the user gives up. 598 * 599 * rfc1122 4.2.3.9 allows to consider as hard errors 600 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too, 601 * but it is obsoleted by pmtu discovery). 602 * 603 * Note, that in modern internet, where routing is unreliable 604 * and in each dark corner broken firewalls sit, sending random 605 * errors ordered by their masters even this two messages finally lose 606 * their original sense (even Linux sends invalid PORT_UNREACHs) 607 * 608 * Now we are in compliance with RFCs. 609 * --ANK (980905) 610 */ 611 612 inet = inet_sk(sk); 613 if (!sock_owned_by_user(sk) && inet->recverr) { 614 sk->sk_err = err; 615 sk->sk_error_report(sk); 616 } else { /* Only an error on timeout */ 617 sk->sk_err_soft = err; 618 } 619 620 out: 621 bh_unlock_sock(sk); 622 sock_put(sk); 623 return 0; 624 } 625 626 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr) 627 { 628 struct tcphdr *th = tcp_hdr(skb); 629 630 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0); 631 skb->csum_start = skb_transport_header(skb) - skb->head; 632 skb->csum_offset = offsetof(struct tcphdr, check); 633 } 634 635 /* This routine computes an IPv4 TCP checksum. */ 636 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb) 637 { 638 const struct inet_sock *inet = inet_sk(sk); 639 640 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr); 641 } 642 EXPORT_SYMBOL(tcp_v4_send_check); 643 644 /* 645 * This routine will send an RST to the other tcp. 646 * 647 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.) 648 * for reset. 649 * Answer: if a packet caused RST, it is not for a socket 650 * existing in our system, if it is matched to a socket, 651 * it is just duplicate segment or bug in other side's TCP. 652 * So that we build reply only basing on parameters 653 * arrived with segment. 654 * Exception: precedence violation. We do not implement it in any case. 
655 */ 656 657 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) 658 { 659 const struct tcphdr *th = tcp_hdr(skb); 660 struct { 661 struct tcphdr th; 662 #ifdef CONFIG_TCP_MD5SIG 663 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)]; 664 #endif 665 } rep; 666 struct ip_reply_arg arg; 667 #ifdef CONFIG_TCP_MD5SIG 668 struct tcp_md5sig_key *key = NULL; 669 const __u8 *hash_location = NULL; 670 unsigned char newhash[16]; 671 int genhash; 672 struct sock *sk1 = NULL; 673 #endif 674 u64 transmit_time = 0; 675 struct sock *ctl_sk; 676 struct net *net; 677 678 /* Never send a reset in response to a reset. */ 679 if (th->rst) 680 return; 681 682 /* If sk not NULL, it means we did a successful lookup and incoming 683 * route had to be correct. prequeue might have dropped our dst. 684 */ 685 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL) 686 return; 687 688 /* Swap the send and the receive. */ 689 memset(&rep, 0, sizeof(rep)); 690 rep.th.dest = th->source; 691 rep.th.source = th->dest; 692 rep.th.doff = sizeof(struct tcphdr) / 4; 693 rep.th.rst = 1; 694 695 if (th->ack) { 696 rep.th.seq = th->ack_seq; 697 } else { 698 rep.th.ack = 1; 699 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin + 700 skb->len - (th->doff << 2)); 701 } 702 703 memset(&arg, 0, sizeof(arg)); 704 arg.iov[0].iov_base = (unsigned char *)&rep; 705 arg.iov[0].iov_len = sizeof(rep.th); 706 707 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev); 708 #ifdef CONFIG_TCP_MD5SIG 709 rcu_read_lock(); 710 hash_location = tcp_parse_md5sig_option(th); 711 if (sk && sk_fullsock(sk)) { 712 const union tcp_md5_addr *addr; 713 int l3index; 714 715 /* sdif set, means packet ingressed via a device 716 * in an L3 domain and inet_iif is set to it. 717 */ 718 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; 719 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 720 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 721 } else if (hash_location) { 722 const union tcp_md5_addr *addr; 723 int sdif = tcp_v4_sdif(skb); 724 int dif = inet_iif(skb); 725 int l3index; 726 727 /* 728 * active side is lost. Try to find listening socket through 729 * source port, and then find md5 key through listening socket. 730 * we are not loose security here: 731 * Incoming packet is checked with md5 hash with finding key, 732 * no RST generated if md5 hash doesn't match. 733 */ 734 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0, 735 ip_hdr(skb)->saddr, 736 th->source, ip_hdr(skb)->daddr, 737 ntohs(th->source), dif, sdif); 738 /* don't send rst if it can't find key */ 739 if (!sk1) 740 goto out; 741 742 /* sdif set, means packet ingressed via a device 743 * in an L3 domain and dif is set to it. 744 */ 745 l3index = sdif ? 
dif : 0; 746 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 747 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET); 748 if (!key) 749 goto out; 750 751 752 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb); 753 if (genhash || memcmp(hash_location, newhash, 16) != 0) 754 goto out; 755 756 } 757 758 if (key) { 759 rep.opt[0] = htonl((TCPOPT_NOP << 24) | 760 (TCPOPT_NOP << 16) | 761 (TCPOPT_MD5SIG << 8) | 762 TCPOLEN_MD5SIG); 763 /* Update length and the length the header thinks exists */ 764 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; 765 rep.th.doff = arg.iov[0].iov_len / 4; 766 767 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1], 768 key, ip_hdr(skb)->saddr, 769 ip_hdr(skb)->daddr, &rep.th); 770 } 771 #endif 772 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 773 ip_hdr(skb)->saddr, /* XXX */ 774 arg.iov[0].iov_len, IPPROTO_TCP, 0); 775 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 776 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0; 777 778 /* When socket is gone, all binding information is lost. 779 * routing might fail in this case. No choice here, if we choose to force 780 * input interface, we will misroute in case of asymmetric route. 781 */ 782 if (sk) { 783 arg.bound_dev_if = sk->sk_bound_dev_if; 784 if (sk_fullsock(sk)) 785 trace_tcp_send_reset(sk, skb); 786 } 787 788 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) != 789 offsetof(struct inet_timewait_sock, tw_bound_dev_if)); 790 791 arg.tos = ip_hdr(skb)->tos; 792 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); 793 local_bh_disable(); 794 ctl_sk = this_cpu_read(*net->ipv4.tcp_sk); 795 if (sk) { 796 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 797 inet_twsk(sk)->tw_mark : sk->sk_mark; 798 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? 799 inet_twsk(sk)->tw_priority : sk->sk_priority; 800 transmit_time = tcp_transmit_time(sk); 801 } 802 ip_send_unicast_reply(ctl_sk, 803 skb, &TCP_SKB_CB(skb)->header.h4.opt, 804 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 805 &arg, arg.iov[0].iov_len, 806 transmit_time); 807 808 ctl_sk->sk_mark = 0; 809 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); 810 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS); 811 local_bh_enable(); 812 813 #ifdef CONFIG_TCP_MD5SIG 814 out: 815 rcu_read_unlock(); 816 #endif 817 } 818 819 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states 820 outside socket context is ugly, certainly. What can I do? 821 */ 822 823 static void tcp_v4_send_ack(const struct sock *sk, 824 struct sk_buff *skb, u32 seq, u32 ack, 825 u32 win, u32 tsval, u32 tsecr, int oif, 826 struct tcp_md5sig_key *key, 827 int reply_flags, u8 tos) 828 { 829 const struct tcphdr *th = tcp_hdr(skb); 830 struct { 831 struct tcphdr th; 832 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2) 833 #ifdef CONFIG_TCP_MD5SIG 834 + (TCPOLEN_MD5SIG_ALIGNED >> 2) 835 #endif 836 ]; 837 } rep; 838 struct net *net = sock_net(sk); 839 struct ip_reply_arg arg; 840 struct sock *ctl_sk; 841 u64 transmit_time; 842 843 memset(&rep.th, 0, sizeof(struct tcphdr)); 844 memset(&arg, 0, sizeof(arg)); 845 846 arg.iov[0].iov_base = (unsigned char *)&rep; 847 arg.iov[0].iov_len = sizeof(rep.th); 848 if (tsecr) { 849 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | 850 (TCPOPT_TIMESTAMP << 8) | 851 TCPOLEN_TIMESTAMP); 852 rep.opt[1] = htonl(tsval); 853 rep.opt[2] = htonl(tsecr); 854 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED; 855 } 856 857 /* Swap the send and the receive. 
*/ 858 rep.th.dest = th->source; 859 rep.th.source = th->dest; 860 rep.th.doff = arg.iov[0].iov_len / 4; 861 rep.th.seq = htonl(seq); 862 rep.th.ack_seq = htonl(ack); 863 rep.th.ack = 1; 864 rep.th.window = htons(win); 865 866 #ifdef CONFIG_TCP_MD5SIG 867 if (key) { 868 int offset = (tsecr) ? 3 : 0; 869 870 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) | 871 (TCPOPT_NOP << 16) | 872 (TCPOPT_MD5SIG << 8) | 873 TCPOLEN_MD5SIG); 874 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; 875 rep.th.doff = arg.iov[0].iov_len/4; 876 877 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset], 878 key, ip_hdr(skb)->saddr, 879 ip_hdr(skb)->daddr, &rep.th); 880 } 881 #endif 882 arg.flags = reply_flags; 883 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 884 ip_hdr(skb)->saddr, /* XXX */ 885 arg.iov[0].iov_len, IPPROTO_TCP, 0); 886 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 887 if (oif) 888 arg.bound_dev_if = oif; 889 arg.tos = tos; 890 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL); 891 local_bh_disable(); 892 ctl_sk = this_cpu_read(*net->ipv4.tcp_sk); 893 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 894 inet_twsk(sk)->tw_mark : sk->sk_mark; 895 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? 896 inet_twsk(sk)->tw_priority : sk->sk_priority; 897 transmit_time = tcp_transmit_time(sk); 898 ip_send_unicast_reply(ctl_sk, 899 skb, &TCP_SKB_CB(skb)->header.h4.opt, 900 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 901 &arg, arg.iov[0].iov_len, 902 transmit_time); 903 904 ctl_sk->sk_mark = 0; 905 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); 906 local_bh_enable(); 907 } 908 909 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) 910 { 911 struct inet_timewait_sock *tw = inet_twsk(sk); 912 struct tcp_timewait_sock *tcptw = tcp_twsk(sk); 913 914 tcp_v4_send_ack(sk, skb, 915 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, 916 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, 917 tcp_time_stamp_raw() + tcptw->tw_ts_offset, 918 tcptw->tw_ts_recent, 919 tw->tw_bound_dev_if, 920 tcp_twsk_md5_key(tcptw), 921 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0, 922 tw->tw_tos 923 ); 924 925 inet_twsk_put(tw); 926 } 927 928 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, 929 struct request_sock *req) 930 { 931 const union tcp_md5_addr *addr; 932 int l3index; 933 934 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV 935 * sk->sk_state == TCP_SYN_RECV -> for Fast Open. 936 */ 937 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 : 938 tcp_sk(sk)->snd_nxt; 939 940 /* RFC 7323 2.3 941 * The window field (SEG.WND) of every outgoing segment, with the 942 * exception of <SYN> segments, MUST be right-shifted by 943 * Rcv.Wind.Shift bits: 944 */ 945 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 946 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; 947 tcp_v4_send_ack(sk, skb, seq, 948 tcp_rsk(req)->rcv_nxt, 949 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, 950 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off, 951 req->ts_recent, 952 0, 953 tcp_md5_do_lookup(sk, l3index, addr, AF_INET), 954 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0, 955 ip_hdr(skb)->tos); 956 } 957 958 /* 959 * Send a SYN-ACK after having received a SYN. 960 * This still operates on a request_sock only, not on a big 961 * socket. 
962 */ 963 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, 964 struct flowi *fl, 965 struct request_sock *req, 966 struct tcp_fastopen_cookie *foc, 967 enum tcp_synack_type synack_type) 968 { 969 const struct inet_request_sock *ireq = inet_rsk(req); 970 struct flowi4 fl4; 971 int err = -1; 972 struct sk_buff *skb; 973 974 /* First, grab a route. */ 975 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) 976 return -1; 977 978 skb = tcp_make_synack(sk, dst, req, foc, synack_type); 979 980 if (skb) { 981 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); 982 983 rcu_read_lock(); 984 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, 985 ireq->ir_rmt_addr, 986 rcu_dereference(ireq->ireq_opt)); 987 rcu_read_unlock(); 988 err = net_xmit_eval(err); 989 } 990 991 return err; 992 } 993 994 /* 995 * IPv4 request_sock destructor. 996 */ 997 static void tcp_v4_reqsk_destructor(struct request_sock *req) 998 { 999 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1)); 1000 } 1001 1002 #ifdef CONFIG_TCP_MD5SIG 1003 /* 1004 * RFC2385 MD5 checksumming requires a mapping of 1005 * IP address->MD5 Key. 1006 * We need to maintain these in the sk structure. 1007 */ 1008 1009 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed); 1010 EXPORT_SYMBOL(tcp_md5_needed); 1011 1012 /* Find the Key structure for an address. */ 1013 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index, 1014 const union tcp_md5_addr *addr, 1015 int family) 1016 { 1017 const struct tcp_sock *tp = tcp_sk(sk); 1018 struct tcp_md5sig_key *key; 1019 const struct tcp_md5sig_info *md5sig; 1020 __be32 mask; 1021 struct tcp_md5sig_key *best_match = NULL; 1022 bool match; 1023 1024 /* caller either holds rcu_read_lock() or socket lock */ 1025 md5sig = rcu_dereference_check(tp->md5sig_info, 1026 lockdep_sock_is_held(sk)); 1027 if (!md5sig) 1028 return NULL; 1029 1030 hlist_for_each_entry_rcu(key, &md5sig->head, node, 1031 lockdep_sock_is_held(sk)) { 1032 if (key->family != family) 1033 continue; 1034 if (key->l3index && key->l3index != l3index) 1035 continue; 1036 if (family == AF_INET) { 1037 mask = inet_make_mask(key->prefixlen); 1038 match = (key->addr.a4.s_addr & mask) == 1039 (addr->a4.s_addr & mask); 1040 #if IS_ENABLED(CONFIG_IPV6) 1041 } else if (family == AF_INET6) { 1042 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6, 1043 key->prefixlen); 1044 #endif 1045 } else { 1046 match = false; 1047 } 1048 1049 if (match && (!best_match || 1050 key->prefixlen > best_match->prefixlen)) 1051 best_match = key; 1052 } 1053 return best_match; 1054 } 1055 EXPORT_SYMBOL(__tcp_md5_do_lookup); 1056 1057 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk, 1058 const union tcp_md5_addr *addr, 1059 int family, u8 prefixlen, 1060 int l3index) 1061 { 1062 const struct tcp_sock *tp = tcp_sk(sk); 1063 struct tcp_md5sig_key *key; 1064 unsigned int size = sizeof(struct in_addr); 1065 const struct tcp_md5sig_info *md5sig; 1066 1067 /* caller either holds rcu_read_lock() or socket lock */ 1068 md5sig = rcu_dereference_check(tp->md5sig_info, 1069 lockdep_sock_is_held(sk)); 1070 if (!md5sig) 1071 return NULL; 1072 #if IS_ENABLED(CONFIG_IPV6) 1073 if (family == AF_INET6) 1074 size = sizeof(struct in6_addr); 1075 #endif 1076 hlist_for_each_entry_rcu(key, &md5sig->head, node, 1077 lockdep_sock_is_held(sk)) { 1078 if (key->family != family) 1079 continue; 1080 if (key->l3index && key->l3index != l3index) 1081 continue; 1082 if (!memcmp(&key->addr, addr, size) && 1083 
key->prefixlen == prefixlen) 1084 return key; 1085 } 1086 return NULL; 1087 } 1088 1089 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, 1090 const struct sock *addr_sk) 1091 { 1092 const union tcp_md5_addr *addr; 1093 int l3index; 1094 1095 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), 1096 addr_sk->sk_bound_dev_if); 1097 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr; 1098 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1099 } 1100 EXPORT_SYMBOL(tcp_v4_md5_lookup); 1101 1102 /* This can be called on a newly created socket, from other files */ 1103 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, 1104 int family, u8 prefixlen, int l3index, 1105 const u8 *newkey, u8 newkeylen, gfp_t gfp) 1106 { 1107 /* Add Key to the list */ 1108 struct tcp_md5sig_key *key; 1109 struct tcp_sock *tp = tcp_sk(sk); 1110 struct tcp_md5sig_info *md5sig; 1111 1112 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index); 1113 if (key) { 1114 /* Pre-existing entry - just update that one. 1115 * Note that the key might be used concurrently. 1116 * data_race() is telling kcsan that we do not care of 1117 * key mismatches, since changing MD5 key on live flows 1118 * can lead to packet drops. 1119 */ 1120 data_race(memcpy(key->key, newkey, newkeylen)); 1121 1122 /* Pairs with READ_ONCE() in tcp_md5_hash_key(). 1123 * Also note that a reader could catch new key->keylen value 1124 * but old key->key[], this is the reason we use __GFP_ZERO 1125 * at sock_kmalloc() time below these lines. 1126 */ 1127 WRITE_ONCE(key->keylen, newkeylen); 1128 1129 return 0; 1130 } 1131 1132 md5sig = rcu_dereference_protected(tp->md5sig_info, 1133 lockdep_sock_is_held(sk)); 1134 if (!md5sig) { 1135 md5sig = kmalloc(sizeof(*md5sig), gfp); 1136 if (!md5sig) 1137 return -ENOMEM; 1138 1139 sk_nocaps_add(sk, NETIF_F_GSO_MASK); 1140 INIT_HLIST_HEAD(&md5sig->head); 1141 rcu_assign_pointer(tp->md5sig_info, md5sig); 1142 } 1143 1144 key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO); 1145 if (!key) 1146 return -ENOMEM; 1147 if (!tcp_alloc_md5sig_pool()) { 1148 sock_kfree_s(sk, key, sizeof(*key)); 1149 return -ENOMEM; 1150 } 1151 1152 memcpy(key->key, newkey, newkeylen); 1153 key->keylen = newkeylen; 1154 key->family = family; 1155 key->prefixlen = prefixlen; 1156 key->l3index = l3index; 1157 memcpy(&key->addr, addr, 1158 (family == AF_INET6) ? 
sizeof(struct in6_addr) : 1159 sizeof(struct in_addr)); 1160 hlist_add_head_rcu(&key->node, &md5sig->head); 1161 return 0; 1162 } 1163 EXPORT_SYMBOL(tcp_md5_do_add); 1164 1165 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family, 1166 u8 prefixlen, int l3index) 1167 { 1168 struct tcp_md5sig_key *key; 1169 1170 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index); 1171 if (!key) 1172 return -ENOENT; 1173 hlist_del_rcu(&key->node); 1174 atomic_sub(sizeof(*key), &sk->sk_omem_alloc); 1175 kfree_rcu(key, rcu); 1176 return 0; 1177 } 1178 EXPORT_SYMBOL(tcp_md5_do_del); 1179 1180 static void tcp_clear_md5_list(struct sock *sk) 1181 { 1182 struct tcp_sock *tp = tcp_sk(sk); 1183 struct tcp_md5sig_key *key; 1184 struct hlist_node *n; 1185 struct tcp_md5sig_info *md5sig; 1186 1187 md5sig = rcu_dereference_protected(tp->md5sig_info, 1); 1188 1189 hlist_for_each_entry_safe(key, n, &md5sig->head, node) { 1190 hlist_del_rcu(&key->node); 1191 atomic_sub(sizeof(*key), &sk->sk_omem_alloc); 1192 kfree_rcu(key, rcu); 1193 } 1194 } 1195 1196 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, 1197 char __user *optval, int optlen) 1198 { 1199 struct tcp_md5sig cmd; 1200 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr; 1201 const union tcp_md5_addr *addr; 1202 u8 prefixlen = 32; 1203 int l3index = 0; 1204 1205 if (optlen < sizeof(cmd)) 1206 return -EINVAL; 1207 1208 if (copy_from_user(&cmd, optval, sizeof(cmd))) 1209 return -EFAULT; 1210 1211 if (sin->sin_family != AF_INET) 1212 return -EINVAL; 1213 1214 if (optname == TCP_MD5SIG_EXT && 1215 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) { 1216 prefixlen = cmd.tcpm_prefixlen; 1217 if (prefixlen > 32) 1218 return -EINVAL; 1219 } 1220 1221 if (optname == TCP_MD5SIG_EXT && 1222 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) { 1223 struct net_device *dev; 1224 1225 rcu_read_lock(); 1226 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex); 1227 if (dev && netif_is_l3_master(dev)) 1228 l3index = dev->ifindex; 1229 1230 rcu_read_unlock(); 1231 1232 /* ok to reference set/not set outside of rcu; 1233 * right now device MUST be an L3 master 1234 */ 1235 if (!dev || !l3index) 1236 return -EINVAL; 1237 } 1238 1239 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr; 1240 1241 if (!cmd.tcpm_keylen) 1242 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index); 1243 1244 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) 1245 return -EINVAL; 1246 1247 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, 1248 cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL); 1249 } 1250 1251 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp, 1252 __be32 daddr, __be32 saddr, 1253 const struct tcphdr *th, int nbytes) 1254 { 1255 struct tcp4_pseudohdr *bp; 1256 struct scatterlist sg; 1257 struct tcphdr *_th; 1258 1259 bp = hp->scratch; 1260 bp->saddr = saddr; 1261 bp->daddr = daddr; 1262 bp->pad = 0; 1263 bp->protocol = IPPROTO_TCP; 1264 bp->len = cpu_to_be16(nbytes); 1265 1266 _th = (struct tcphdr *)(bp + 1); 1267 memcpy(_th, th, sizeof(*th)); 1268 _th->check = 0; 1269 1270 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th)); 1271 ahash_request_set_crypt(hp->md5_req, &sg, NULL, 1272 sizeof(*bp) + sizeof(*th)); 1273 return crypto_ahash_update(hp->md5_req); 1274 } 1275 1276 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, 1277 __be32 daddr, __be32 saddr, const struct tcphdr *th) 1278 { 1279 struct tcp_md5sig_pool *hp; 1280 struct ahash_request *req; 1281 1282 hp = tcp_get_md5sig_pool(); 1283 if 
(!hp) 1284 goto clear_hash_noput; 1285 req = hp->md5_req; 1286 1287 if (crypto_ahash_init(req)) 1288 goto clear_hash; 1289 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2)) 1290 goto clear_hash; 1291 if (tcp_md5_hash_key(hp, key)) 1292 goto clear_hash; 1293 ahash_request_set_crypt(req, NULL, md5_hash, 0); 1294 if (crypto_ahash_final(req)) 1295 goto clear_hash; 1296 1297 tcp_put_md5sig_pool(); 1298 return 0; 1299 1300 clear_hash: 1301 tcp_put_md5sig_pool(); 1302 clear_hash_noput: 1303 memset(md5_hash, 0, 16); 1304 return 1; 1305 } 1306 1307 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, 1308 const struct sock *sk, 1309 const struct sk_buff *skb) 1310 { 1311 struct tcp_md5sig_pool *hp; 1312 struct ahash_request *req; 1313 const struct tcphdr *th = tcp_hdr(skb); 1314 __be32 saddr, daddr; 1315 1316 if (sk) { /* valid for establish/request sockets */ 1317 saddr = sk->sk_rcv_saddr; 1318 daddr = sk->sk_daddr; 1319 } else { 1320 const struct iphdr *iph = ip_hdr(skb); 1321 saddr = iph->saddr; 1322 daddr = iph->daddr; 1323 } 1324 1325 hp = tcp_get_md5sig_pool(); 1326 if (!hp) 1327 goto clear_hash_noput; 1328 req = hp->md5_req; 1329 1330 if (crypto_ahash_init(req)) 1331 goto clear_hash; 1332 1333 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len)) 1334 goto clear_hash; 1335 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2)) 1336 goto clear_hash; 1337 if (tcp_md5_hash_key(hp, key)) 1338 goto clear_hash; 1339 ahash_request_set_crypt(req, NULL, md5_hash, 0); 1340 if (crypto_ahash_final(req)) 1341 goto clear_hash; 1342 1343 tcp_put_md5sig_pool(); 1344 return 0; 1345 1346 clear_hash: 1347 tcp_put_md5sig_pool(); 1348 clear_hash_noput: 1349 memset(md5_hash, 0, 16); 1350 return 1; 1351 } 1352 EXPORT_SYMBOL(tcp_v4_md5_hash_skb); 1353 1354 #endif 1355 1356 /* Called with rcu_read_lock() */ 1357 static bool tcp_v4_inbound_md5_hash(const struct sock *sk, 1358 const struct sk_buff *skb, 1359 int dif, int sdif) 1360 { 1361 #ifdef CONFIG_TCP_MD5SIG 1362 /* 1363 * This gets called for each TCP segment that arrives 1364 * so we want to be efficient. 1365 * We have 3 drop cases: 1366 * o No MD5 hash and one expected. 1367 * o MD5 hash and we're not expecting one. 1368 * o MD5 hash and its wrong. 1369 */ 1370 const __u8 *hash_location = NULL; 1371 struct tcp_md5sig_key *hash_expected; 1372 const struct iphdr *iph = ip_hdr(skb); 1373 const struct tcphdr *th = tcp_hdr(skb); 1374 const union tcp_md5_addr *addr; 1375 unsigned char newhash[16]; 1376 int genhash, l3index; 1377 1378 /* sdif set, means packet ingressed via a device 1379 * in an L3 domain and dif is set to the l3mdev 1380 */ 1381 l3index = sdif ? dif : 0; 1382 1383 addr = (union tcp_md5_addr *)&iph->saddr; 1384 hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1385 hash_location = tcp_parse_md5sig_option(th); 1386 1387 /* We've parsed the options - do we have a hash? */ 1388 if (!hash_expected && !hash_location) 1389 return false; 1390 1391 if (hash_expected && !hash_location) { 1392 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND); 1393 return true; 1394 } 1395 1396 if (!hash_expected && hash_location) { 1397 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED); 1398 return true; 1399 } 1400 1401 /* Okay, so this is hash_expected and hash_location - 1402 * so we need to calculate the checksum. 
1403 */ 1404 genhash = tcp_v4_md5_hash_skb(newhash, 1405 hash_expected, 1406 NULL, skb); 1407 1408 if (genhash || memcmp(hash_location, newhash, 16) != 0) { 1409 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE); 1410 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n", 1411 &iph->saddr, ntohs(th->source), 1412 &iph->daddr, ntohs(th->dest), 1413 genhash ? " tcp_v4_calc_md5_hash failed" 1414 : "", l3index); 1415 return true; 1416 } 1417 return false; 1418 #endif 1419 return false; 1420 } 1421 1422 static void tcp_v4_init_req(struct request_sock *req, 1423 const struct sock *sk_listener, 1424 struct sk_buff *skb) 1425 { 1426 struct inet_request_sock *ireq = inet_rsk(req); 1427 struct net *net = sock_net(sk_listener); 1428 1429 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); 1430 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); 1431 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb)); 1432 } 1433 1434 static struct dst_entry *tcp_v4_route_req(const struct sock *sk, 1435 struct flowi *fl, 1436 const struct request_sock *req) 1437 { 1438 return inet_csk_route_req(sk, &fl->u.ip4, req); 1439 } 1440 1441 struct request_sock_ops tcp_request_sock_ops __read_mostly = { 1442 .family = PF_INET, 1443 .obj_size = sizeof(struct tcp_request_sock), 1444 .rtx_syn_ack = tcp_rtx_synack, 1445 .send_ack = tcp_v4_reqsk_send_ack, 1446 .destructor = tcp_v4_reqsk_destructor, 1447 .send_reset = tcp_v4_send_reset, 1448 .syn_ack_timeout = tcp_syn_ack_timeout, 1449 }; 1450 1451 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { 1452 .mss_clamp = TCP_MSS_DEFAULT, 1453 #ifdef CONFIG_TCP_MD5SIG 1454 .req_md5_lookup = tcp_v4_md5_lookup, 1455 .calc_md5_hash = tcp_v4_md5_hash_skb, 1456 #endif 1457 .init_req = tcp_v4_init_req, 1458 #ifdef CONFIG_SYN_COOKIES 1459 .cookie_init_seq = cookie_v4_init_sequence, 1460 #endif 1461 .route_req = tcp_v4_route_req, 1462 .init_seq = tcp_v4_init_seq, 1463 .init_ts_off = tcp_v4_init_ts_off, 1464 .send_synack = tcp_v4_send_synack, 1465 }; 1466 1467 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) 1468 { 1469 /* Never answer to SYNs send to broadcast or multicast */ 1470 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) 1471 goto drop; 1472 1473 return tcp_conn_request(&tcp_request_sock_ops, 1474 &tcp_request_sock_ipv4_ops, sk, skb); 1475 1476 drop: 1477 tcp_listendrop(sk); 1478 return 0; 1479 } 1480 EXPORT_SYMBOL(tcp_v4_conn_request); 1481 1482 1483 /* 1484 * The three way handshake has completed - we got a valid synack - 1485 * now create the new socket. 
1486 */ 1487 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, 1488 struct request_sock *req, 1489 struct dst_entry *dst, 1490 struct request_sock *req_unhash, 1491 bool *own_req) 1492 { 1493 struct inet_request_sock *ireq; 1494 struct inet_sock *newinet; 1495 struct tcp_sock *newtp; 1496 struct sock *newsk; 1497 #ifdef CONFIG_TCP_MD5SIG 1498 const union tcp_md5_addr *addr; 1499 struct tcp_md5sig_key *key; 1500 int l3index; 1501 #endif 1502 struct ip_options_rcu *inet_opt; 1503 1504 if (sk_acceptq_is_full(sk)) 1505 goto exit_overflow; 1506 1507 newsk = tcp_create_openreq_child(sk, req, skb); 1508 if (!newsk) 1509 goto exit_nonewsk; 1510 1511 newsk->sk_gso_type = SKB_GSO_TCPV4; 1512 inet_sk_rx_dst_set(newsk, skb); 1513 1514 newtp = tcp_sk(newsk); 1515 newinet = inet_sk(newsk); 1516 ireq = inet_rsk(req); 1517 sk_daddr_set(newsk, ireq->ir_rmt_addr); 1518 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr); 1519 newsk->sk_bound_dev_if = ireq->ir_iif; 1520 newinet->inet_saddr = ireq->ir_loc_addr; 1521 inet_opt = rcu_dereference(ireq->ireq_opt); 1522 RCU_INIT_POINTER(newinet->inet_opt, inet_opt); 1523 newinet->mc_index = inet_iif(skb); 1524 newinet->mc_ttl = ip_hdr(skb)->ttl; 1525 newinet->rcv_tos = ip_hdr(skb)->tos; 1526 inet_csk(newsk)->icsk_ext_hdr_len = 0; 1527 if (inet_opt) 1528 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 1529 newinet->inet_id = prandom_u32(); 1530 1531 if (!dst) { 1532 dst = inet_csk_route_child_sock(sk, newsk, req); 1533 if (!dst) 1534 goto put_and_exit; 1535 } else { 1536 /* syncookie case : see end of cookie_v4_check() */ 1537 } 1538 sk_setup_caps(newsk, dst); 1539 1540 tcp_ca_openreq_child(newsk, dst); 1541 1542 tcp_sync_mss(newsk, dst_mtu(dst)); 1543 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst)); 1544 1545 tcp_initialize_rcv_mss(newsk); 1546 1547 #ifdef CONFIG_TCP_MD5SIG 1548 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif); 1549 /* Copy over the MD5 key from the original socket */ 1550 addr = (union tcp_md5_addr *)&newinet->inet_daddr; 1551 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1552 if (key) { 1553 /* 1554 * We're using one, so create a matching key 1555 * on the newsk structure. If we fail to get 1556 * memory, then we end up not copying the key 1557 * across. Shucks. 
1558 */ 1559 tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, 1560 key->key, key->keylen, GFP_ATOMIC); 1561 sk_nocaps_add(newsk, NETIF_F_GSO_MASK); 1562 } 1563 #endif 1564 1565 if (__inet_inherit_port(sk, newsk) < 0) 1566 goto put_and_exit; 1567 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash)); 1568 if (likely(*own_req)) { 1569 tcp_move_syn(newtp, req); 1570 ireq->ireq_opt = NULL; 1571 } else { 1572 newinet->inet_opt = NULL; 1573 } 1574 return newsk; 1575 1576 exit_overflow: 1577 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); 1578 exit_nonewsk: 1579 dst_release(dst); 1580 exit: 1581 tcp_listendrop(sk); 1582 return NULL; 1583 put_and_exit: 1584 newinet->inet_opt = NULL; 1585 inet_csk_prepare_forced_close(newsk); 1586 tcp_done(newsk); 1587 goto exit; 1588 } 1589 EXPORT_SYMBOL(tcp_v4_syn_recv_sock); 1590 1591 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb) 1592 { 1593 #ifdef CONFIG_SYN_COOKIES 1594 const struct tcphdr *th = tcp_hdr(skb); 1595 1596 if (!th->syn) 1597 sk = cookie_v4_check(sk, skb); 1598 #endif 1599 return sk; 1600 } 1601 1602 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph, 1603 struct tcphdr *th, u32 *cookie) 1604 { 1605 u16 mss = 0; 1606 #ifdef CONFIG_SYN_COOKIES 1607 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops, 1608 &tcp_request_sock_ipv4_ops, sk, th); 1609 if (mss) { 1610 *cookie = __cookie_v4_init_sequence(iph, th, &mss); 1611 tcp_synq_overflow(sk); 1612 } 1613 #endif 1614 return mss; 1615 } 1616 1617 /* The socket must have it's spinlock held when we get 1618 * here, unless it is a TCP_LISTEN socket. 1619 * 1620 * We have a potential double-lock case here, so even when 1621 * doing backlog processing we use the BH locking scheme. 1622 * This is because we cannot sleep with the original spinlock 1623 * held. 1624 */ 1625 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) 1626 { 1627 struct sock *rsk; 1628 1629 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ 1630 struct dst_entry *dst = sk->sk_rx_dst; 1631 1632 sock_rps_save_rxhash(sk, skb); 1633 sk_mark_napi_id(sk, skb); 1634 if (dst) { 1635 if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif || 1636 !dst->ops->check(dst, 0)) { 1637 dst_release(dst); 1638 sk->sk_rx_dst = NULL; 1639 } 1640 } 1641 tcp_rcv_established(sk, skb); 1642 return 0; 1643 } 1644 1645 if (tcp_checksum_complete(skb)) 1646 goto csum_err; 1647 1648 if (sk->sk_state == TCP_LISTEN) { 1649 struct sock *nsk = tcp_v4_cookie_check(sk, skb); 1650 1651 if (!nsk) 1652 goto discard; 1653 if (nsk != sk) { 1654 if (tcp_child_process(sk, nsk, skb)) { 1655 rsk = nsk; 1656 goto reset; 1657 } 1658 return 0; 1659 } 1660 } else 1661 sock_rps_save_rxhash(sk, skb); 1662 1663 if (tcp_rcv_state_process(sk, skb)) { 1664 rsk = sk; 1665 goto reset; 1666 } 1667 return 0; 1668 1669 reset: 1670 tcp_v4_send_reset(rsk, skb); 1671 discard: 1672 kfree_skb(skb); 1673 /* Be careful here. If this function gets more complicated and 1674 * gcc suffers from register pressure on the x86, sk (in %ebx) 1675 * might be destroyed here. This current version compiles correctly, 1676 * but you have been warned. 
1677 */ 1678 return 0; 1679 1680 csum_err: 1681 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 1682 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 1683 goto discard; 1684 } 1685 EXPORT_SYMBOL(tcp_v4_do_rcv); 1686 1687 int tcp_v4_early_demux(struct sk_buff *skb) 1688 { 1689 const struct iphdr *iph; 1690 const struct tcphdr *th; 1691 struct sock *sk; 1692 1693 if (skb->pkt_type != PACKET_HOST) 1694 return 0; 1695 1696 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) 1697 return 0; 1698 1699 iph = ip_hdr(skb); 1700 th = tcp_hdr(skb); 1701 1702 if (th->doff < sizeof(struct tcphdr) / 4) 1703 return 0; 1704 1705 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo, 1706 iph->saddr, th->source, 1707 iph->daddr, ntohs(th->dest), 1708 skb->skb_iif, inet_sdif(skb)); 1709 if (sk) { 1710 skb->sk = sk; 1711 skb->destructor = sock_edemux; 1712 if (sk_fullsock(sk)) { 1713 struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst); 1714 1715 if (dst) 1716 dst = dst_check(dst, 0); 1717 if (dst && 1718 inet_sk(sk)->rx_dst_ifindex == skb->skb_iif) 1719 skb_dst_set_noref(skb, dst); 1720 } 1721 } 1722 return 0; 1723 } 1724 1725 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) 1726 { 1727 u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf); 1728 struct skb_shared_info *shinfo; 1729 const struct tcphdr *th; 1730 struct tcphdr *thtail; 1731 struct sk_buff *tail; 1732 unsigned int hdrlen; 1733 bool fragstolen; 1734 u32 gso_segs; 1735 int delta; 1736 1737 /* In case all data was pulled from skb frags (in __pskb_pull_tail()), 1738 * we can fix skb->truesize to its real value to avoid future drops. 1739 * This is valid because skb is not yet charged to the socket. 1740 * It has been noticed pure SACK packets were sometimes dropped 1741 * (if cooked by drivers without copybreak feature). 1742 */ 1743 skb_condense(skb); 1744 1745 skb_dst_drop(skb); 1746 1747 if (unlikely(tcp_checksum_complete(skb))) { 1748 bh_unlock_sock(sk); 1749 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 1750 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 1751 return true; 1752 } 1753 1754 /* Attempt coalescing to last skb in backlog, even if we are 1755 * above the limits. 1756 * This is okay because skb capacity is limited to MAX_SKB_FRAGS. 
1757 */ 1758 th = (const struct tcphdr *)skb->data; 1759 hdrlen = th->doff * 4; 1760 shinfo = skb_shinfo(skb); 1761 1762 if (!shinfo->gso_size) 1763 shinfo->gso_size = skb->len - hdrlen; 1764 1765 if (!shinfo->gso_segs) 1766 shinfo->gso_segs = 1; 1767 1768 tail = sk->sk_backlog.tail; 1769 if (!tail) 1770 goto no_coalesce; 1771 thtail = (struct tcphdr *)tail->data; 1772 1773 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq || 1774 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield || 1775 ((TCP_SKB_CB(tail)->tcp_flags | 1776 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) || 1777 !((TCP_SKB_CB(tail)->tcp_flags & 1778 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) || 1779 ((TCP_SKB_CB(tail)->tcp_flags ^ 1780 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) || 1781 #ifdef CONFIG_TLS_DEVICE 1782 tail->decrypted != skb->decrypted || 1783 #endif 1784 thtail->doff != th->doff || 1785 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th))) 1786 goto no_coalesce; 1787 1788 __skb_pull(skb, hdrlen); 1789 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) { 1790 thtail->window = th->window; 1791 1792 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq; 1793 1794 if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq)) 1795 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq; 1796 1797 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and 1798 * thtail->fin, so that the fast path in tcp_rcv_established() 1799 * is not entered if we append a packet with a FIN. 1800 * SYN, RST, URG are not present. 1801 * ACK is set on both packets. 1802 * PSH : we do not really care in TCP stack, 1803 * at least for 'GRO' packets. 1804 */ 1805 thtail->fin |= th->fin; 1806 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; 1807 1808 if (TCP_SKB_CB(skb)->has_rxtstamp) { 1809 TCP_SKB_CB(tail)->has_rxtstamp = true; 1810 tail->tstamp = skb->tstamp; 1811 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp; 1812 } 1813 1814 /* Not as strict as GRO. We only need to carry mss max value */ 1815 skb_shinfo(tail)->gso_size = max(shinfo->gso_size, 1816 skb_shinfo(tail)->gso_size); 1817 1818 gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs; 1819 skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF); 1820 1821 sk->sk_backlog.len += delta; 1822 __NET_INC_STATS(sock_net(sk), 1823 LINUX_MIB_TCPBACKLOGCOALESCE); 1824 kfree_skb_partial(skb, fragstolen); 1825 return false; 1826 } 1827 __skb_push(skb, hdrlen); 1828 1829 no_coalesce: 1830 /* Only socket owner can try to collapse/prune rx queues 1831 * to reduce memory overhead, so add a little headroom here. 1832 * Few sockets backlog are possibly concurrently non empty. 
1833 */ 1834 limit += 64*1024; 1835 1836 if (unlikely(sk_add_backlog(sk, skb, limit))) { 1837 bh_unlock_sock(sk); 1838 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP); 1839 return true; 1840 } 1841 return false; 1842 } 1843 EXPORT_SYMBOL(tcp_add_backlog); 1844 1845 int tcp_filter(struct sock *sk, struct sk_buff *skb) 1846 { 1847 struct tcphdr *th = (struct tcphdr *)skb->data; 1848 1849 return sk_filter_trim_cap(sk, skb, th->doff * 4); 1850 } 1851 EXPORT_SYMBOL(tcp_filter); 1852 1853 static void tcp_v4_restore_cb(struct sk_buff *skb) 1854 { 1855 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4, 1856 sizeof(struct inet_skb_parm)); 1857 } 1858 1859 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, 1860 const struct tcphdr *th) 1861 { 1862 /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB() 1863 * barrier() makes sure compiler wont play fool^Waliasing games. 1864 */ 1865 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb), 1866 sizeof(struct inet_skb_parm)); 1867 barrier(); 1868 1869 TCP_SKB_CB(skb)->seq = ntohl(th->seq); 1870 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + 1871 skb->len - th->doff * 4); 1872 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); 1873 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); 1874 TCP_SKB_CB(skb)->tcp_tw_isn = 0; 1875 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); 1876 TCP_SKB_CB(skb)->sacked = 0; 1877 TCP_SKB_CB(skb)->has_rxtstamp = 1878 skb->tstamp || skb_hwtstamps(skb)->hwtstamp; 1879 } 1880 1881 /* 1882 * From tcp_input.c 1883 */ 1884 1885 int tcp_v4_rcv(struct sk_buff *skb) 1886 { 1887 struct net *net = dev_net(skb->dev); 1888 struct sk_buff *skb_to_free; 1889 int sdif = inet_sdif(skb); 1890 int dif = inet_iif(skb); 1891 const struct iphdr *iph; 1892 const struct tcphdr *th; 1893 bool refcounted; 1894 struct sock *sk; 1895 int ret; 1896 1897 if (skb->pkt_type != PACKET_HOST) 1898 goto discard_it; 1899 1900 /* Count it even if it's bad */ 1901 __TCP_INC_STATS(net, TCP_MIB_INSEGS); 1902 1903 if (!pskb_may_pull(skb, sizeof(struct tcphdr))) 1904 goto discard_it; 1905 1906 th = (const struct tcphdr *)skb->data; 1907 1908 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) 1909 goto bad_packet; 1910 if (!pskb_may_pull(skb, th->doff * 4)) 1911 goto discard_it; 1912 1913 /* An explanation is required here, I think. 1914 * Packet length and doff are validated by header prediction, 1915 * provided case of th->doff==0 is eliminated. 1916 * So, we defer the checks. 
*/ 1917 1918 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo)) 1919 goto csum_error; 1920 1921 th = (const struct tcphdr *)skb->data; 1922 iph = ip_hdr(skb); 1923 lookup: 1924 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source, 1925 th->dest, sdif, &refcounted); 1926 if (!sk) 1927 goto no_tcp_socket; 1928 1929 process: 1930 if (sk->sk_state == TCP_TIME_WAIT) 1931 goto do_time_wait; 1932 1933 if (sk->sk_state == TCP_NEW_SYN_RECV) { 1934 struct request_sock *req = inet_reqsk(sk); 1935 bool req_stolen = false; 1936 struct sock *nsk; 1937 1938 sk = req->rsk_listener; 1939 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) { 1940 sk_drops_add(sk, skb); 1941 reqsk_put(req); 1942 goto discard_it; 1943 } 1944 if (tcp_checksum_complete(skb)) { 1945 reqsk_put(req); 1946 goto csum_error; 1947 } 1948 if (unlikely(sk->sk_state != TCP_LISTEN)) { 1949 inet_csk_reqsk_queue_drop_and_put(sk, req); 1950 goto lookup; 1951 } 1952 /* We own a reference on the listener, increase it again 1953 * as we might lose it too soon. 1954 */ 1955 sock_hold(sk); 1956 refcounted = true; 1957 nsk = NULL; 1958 if (!tcp_filter(sk, skb)) { 1959 th = (const struct tcphdr *)skb->data; 1960 iph = ip_hdr(skb); 1961 tcp_v4_fill_cb(skb, iph, th); 1962 nsk = tcp_check_req(sk, skb, req, false, &req_stolen); 1963 } 1964 if (!nsk) { 1965 reqsk_put(req); 1966 if (req_stolen) { 1967 /* Another cpu got exclusive access to req 1968 * and created a full blown socket. 1969 * Try to feed this packet to this socket 1970 * instead of discarding it. 1971 */ 1972 tcp_v4_restore_cb(skb); 1973 sock_put(sk); 1974 goto lookup; 1975 } 1976 goto discard_and_relse; 1977 } 1978 if (nsk == sk) { 1979 reqsk_put(req); 1980 tcp_v4_restore_cb(skb); 1981 } else if (tcp_child_process(sk, nsk, skb)) { 1982 tcp_v4_send_reset(nsk, skb); 1983 goto discard_and_relse; 1984 } else { 1985 sock_put(sk); 1986 return 0; 1987 } 1988 } 1989 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) { 1990 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 1991 goto discard_and_relse; 1992 } 1993 1994 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) 1995 goto discard_and_relse; 1996 1997 if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif)) 1998 goto discard_and_relse; 1999 2000 nf_reset_ct(skb); 2001 2002 if (tcp_filter(sk, skb)) 2003 goto discard_and_relse; 2004 th = (const struct tcphdr *)skb->data; 2005 iph = ip_hdr(skb); 2006 tcp_v4_fill_cb(skb, iph, th); 2007 2008 skb->dev = NULL; 2009 2010 if (sk->sk_state == TCP_LISTEN) { 2011 ret = tcp_v4_do_rcv(sk, skb); 2012 goto put_and_return; 2013 } 2014 2015 sk_incoming_cpu_update(sk); 2016 2017 bh_lock_sock_nested(sk); 2018 tcp_segs_in(tcp_sk(sk), skb); 2019 ret = 0; 2020 if (!sock_owned_by_user(sk)) { 2021 skb_to_free = sk->sk_rx_skb_cache; 2022 sk->sk_rx_skb_cache = NULL; 2023 ret = tcp_v4_do_rcv(sk, skb); 2024 } else { 2025 if (tcp_add_backlog(sk, skb)) 2026 goto discard_and_relse; 2027 skb_to_free = NULL; 2028 } 2029 bh_unlock_sock(sk); 2030 if (skb_to_free) 2031 __kfree_skb(skb_to_free); 2032 2033 put_and_return: 2034 if (refcounted) 2035 sock_put(sk); 2036 2037 return ret; 2038 2039 no_tcp_socket: 2040 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) 2041 goto discard_it; 2042 2043 tcp_v4_fill_cb(skb, iph, th); 2044 2045 if (tcp_checksum_complete(skb)) { 2046 csum_error: 2047 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); 2048 bad_packet: 2049 __TCP_INC_STATS(net, TCP_MIB_INERRS); 2050 } else { 2051 tcp_v4_send_reset(NULL, skb); 2052 } 2053 2054 discard_it: 2055 /* Discard frame. 
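 * kfree_skb() below releases the buffer and 0 is returned: from the
 * caller's point of view the packet has been consumed, whether or not a
 * reset was sent on the way here.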
*/ 2056 kfree_skb(skb); 2057 return 0; 2058 2059 discard_and_relse: 2060 sk_drops_add(sk, skb); 2061 if (refcounted) 2062 sock_put(sk); 2063 goto discard_it; 2064 2065 do_time_wait: 2066 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { 2067 inet_twsk_put(inet_twsk(sk)); 2068 goto discard_it; 2069 } 2070 2071 tcp_v4_fill_cb(skb, iph, th); 2072 2073 if (tcp_checksum_complete(skb)) { 2074 inet_twsk_put(inet_twsk(sk)); 2075 goto csum_error; 2076 } 2077 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) { 2078 case TCP_TW_SYN: { 2079 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev), 2080 &tcp_hashinfo, skb, 2081 __tcp_hdrlen(th), 2082 iph->saddr, th->source, 2083 iph->daddr, th->dest, 2084 inet_iif(skb), 2085 sdif); 2086 if (sk2) { 2087 inet_twsk_deschedule_put(inet_twsk(sk)); 2088 sk = sk2; 2089 tcp_v4_restore_cb(skb); 2090 refcounted = false; 2091 goto process; 2092 } 2093 } 2094 /* to ACK */ 2095 fallthrough; 2096 case TCP_TW_ACK: 2097 tcp_v4_timewait_ack(sk, skb); 2098 break; 2099 case TCP_TW_RST: 2100 tcp_v4_send_reset(sk, skb); 2101 inet_twsk_deschedule_put(inet_twsk(sk)); 2102 goto discard_it; 2103 case TCP_TW_SUCCESS:; 2104 } 2105 goto discard_it; 2106 } 2107 2108 static struct timewait_sock_ops tcp_timewait_sock_ops = { 2109 .twsk_obj_size = sizeof(struct tcp_timewait_sock), 2110 .twsk_unique = tcp_twsk_unique, 2111 .twsk_destructor= tcp_twsk_destructor, 2112 }; 2113 2114 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) 2115 { 2116 struct dst_entry *dst = skb_dst(skb); 2117 2118 if (dst && dst_hold_safe(dst)) { 2119 sk->sk_rx_dst = dst; 2120 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; 2121 } 2122 } 2123 EXPORT_SYMBOL(inet_sk_rx_dst_set); 2124 2125 const struct inet_connection_sock_af_ops ipv4_specific = { 2126 .queue_xmit = ip_queue_xmit, 2127 .send_check = tcp_v4_send_check, 2128 .rebuild_header = inet_sk_rebuild_header, 2129 .sk_rx_dst_set = inet_sk_rx_dst_set, 2130 .conn_request = tcp_v4_conn_request, 2131 .syn_recv_sock = tcp_v4_syn_recv_sock, 2132 .net_header_len = sizeof(struct iphdr), 2133 .setsockopt = ip_setsockopt, 2134 .getsockopt = ip_getsockopt, 2135 .addr2sockaddr = inet_csk_addr2sockaddr, 2136 .sockaddr_len = sizeof(struct sockaddr_in), 2137 #ifdef CONFIG_COMPAT 2138 .compat_setsockopt = compat_ip_setsockopt, 2139 .compat_getsockopt = compat_ip_getsockopt, 2140 #endif 2141 .mtu_reduced = tcp_v4_mtu_reduced, 2142 }; 2143 EXPORT_SYMBOL(ipv4_specific); 2144 2145 #ifdef CONFIG_TCP_MD5SIG 2146 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { 2147 .md5_lookup = tcp_v4_md5_lookup, 2148 .calc_md5_hash = tcp_v4_md5_hash_skb, 2149 .md5_parse = tcp_v4_parse_md5_keys, 2150 }; 2151 #endif 2152 2153 /* NOTE: A lot of things set to zero explicitly by call to 2154 * sk_alloc() so need not be done here. 2155 */ 2156 static int tcp_v4_init_sock(struct sock *sk) 2157 { 2158 struct inet_connection_sock *icsk = inet_csk(sk); 2159 2160 tcp_init_sock(sk); 2161 2162 icsk->icsk_af_ops = &ipv4_specific; 2163 2164 #ifdef CONFIG_TCP_MD5SIG 2165 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific; 2166 #endif 2167 2168 return 0; 2169 } 2170 2171 void tcp_v4_destroy_sock(struct sock *sk) 2172 { 2173 struct tcp_sock *tp = tcp_sk(sk); 2174 2175 trace_tcp_destroy_sock(sk); 2176 2177 tcp_clear_xmit_timers(sk); 2178 2179 tcp_cleanup_congestion_control(sk); 2180 2181 tcp_cleanup_ulp(sk); 2182 2183 /* Cleanup up the write buffer. 
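 * tcp_write_queue_purge() below frees any skbs still queued for
 * transmission; the xmit timers were cleared above, so nothing would
 * ever send them for this dying socket.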
*/ 2184 tcp_write_queue_purge(sk); 2185 2186 /* Check if we want to disable active TFO */ 2187 tcp_fastopen_active_disable_ofo_check(sk); 2188 2189 /* Cleans up our, hopefully empty, out_of_order_queue. */ 2190 skb_rbtree_purge(&tp->out_of_order_queue); 2191 2192 #ifdef CONFIG_TCP_MD5SIG 2193 /* Clean up the MD5 key list, if any */ 2194 if (tp->md5sig_info) { 2195 tcp_clear_md5_list(sk); 2196 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu); 2197 tp->md5sig_info = NULL; 2198 } 2199 #endif 2200 2201 /* Clean up a referenced TCP bind bucket. */ 2202 if (inet_csk(sk)->icsk_bind_hash) 2203 inet_put_port(sk); 2204 2205 BUG_ON(rcu_access_pointer(tp->fastopen_rsk)); 2206 2207 /* If socket is aborted during connect operation */ 2208 tcp_free_fastopen_req(tp); 2209 tcp_fastopen_destroy_cipher(sk); 2210 tcp_saved_syn_free(tp); 2211 2212 sk_sockets_allocated_dec(sk); 2213 } 2214 EXPORT_SYMBOL(tcp_v4_destroy_sock); 2215 2216 #ifdef CONFIG_PROC_FS 2217 /* Proc filesystem TCP sock list dumping. */ 2218 2219 /* 2220 * Get next listener socket follow cur. If cur is NULL, get first socket 2221 * starting from bucket given in st->bucket; when st->bucket is zero the 2222 * very first socket in the hash table is returned. 2223 */ 2224 static void *listening_get_next(struct seq_file *seq, void *cur) 2225 { 2226 struct tcp_seq_afinfo *afinfo; 2227 struct tcp_iter_state *st = seq->private; 2228 struct net *net = seq_file_net(seq); 2229 struct inet_listen_hashbucket *ilb; 2230 struct hlist_nulls_node *node; 2231 struct sock *sk = cur; 2232 2233 if (st->bpf_seq_afinfo) 2234 afinfo = st->bpf_seq_afinfo; 2235 else 2236 afinfo = PDE_DATA(file_inode(seq->file)); 2237 2238 if (!sk) { 2239 get_head: 2240 ilb = &tcp_hashinfo.listening_hash[st->bucket]; 2241 spin_lock(&ilb->lock); 2242 sk = sk_nulls_head(&ilb->nulls_head); 2243 st->offset = 0; 2244 goto get_sk; 2245 } 2246 ilb = &tcp_hashinfo.listening_hash[st->bucket]; 2247 ++st->num; 2248 ++st->offset; 2249 2250 sk = sk_nulls_next(sk); 2251 get_sk: 2252 sk_nulls_for_each_from(sk, node) { 2253 if (!net_eq(sock_net(sk), net)) 2254 continue; 2255 if (afinfo->family == AF_UNSPEC || 2256 sk->sk_family == afinfo->family) 2257 return sk; 2258 } 2259 spin_unlock(&ilb->lock); 2260 st->offset = 0; 2261 if (++st->bucket < INET_LHTABLE_SIZE) 2262 goto get_head; 2263 return NULL; 2264 } 2265 2266 static void *listening_get_idx(struct seq_file *seq, loff_t *pos) 2267 { 2268 struct tcp_iter_state *st = seq->private; 2269 void *rc; 2270 2271 st->bucket = 0; 2272 st->offset = 0; 2273 rc = listening_get_next(seq, NULL); 2274 2275 while (rc && *pos) { 2276 rc = listening_get_next(seq, rc); 2277 --*pos; 2278 } 2279 return rc; 2280 } 2281 2282 static inline bool empty_bucket(const struct tcp_iter_state *st) 2283 { 2284 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain); 2285 } 2286 2287 /* 2288 * Get first established socket starting from bucket given in st->bucket. 2289 * If st->bucket is zero, the very first socket in the hash is returned. 
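 * Note: when a socket is found, the ehash bucket lock is left held; it is
 * released either by established_get_next() when it advances to the next
 * bucket, or by tcp_seq_stop().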
2290 */ 2291 static void *established_get_first(struct seq_file *seq) 2292 { 2293 struct tcp_seq_afinfo *afinfo; 2294 struct tcp_iter_state *st = seq->private; 2295 struct net *net = seq_file_net(seq); 2296 void *rc = NULL; 2297 2298 if (st->bpf_seq_afinfo) 2299 afinfo = st->bpf_seq_afinfo; 2300 else 2301 afinfo = PDE_DATA(file_inode(seq->file)); 2302 2303 st->offset = 0; 2304 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) { 2305 struct sock *sk; 2306 struct hlist_nulls_node *node; 2307 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket); 2308 2309 /* Lockless fast path for the common case of empty buckets */ 2310 if (empty_bucket(st)) 2311 continue; 2312 2313 spin_lock_bh(lock); 2314 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { 2315 if ((afinfo->family != AF_UNSPEC && 2316 sk->sk_family != afinfo->family) || 2317 !net_eq(sock_net(sk), net)) { 2318 continue; 2319 } 2320 rc = sk; 2321 goto out; 2322 } 2323 spin_unlock_bh(lock); 2324 } 2325 out: 2326 return rc; 2327 } 2328 2329 static void *established_get_next(struct seq_file *seq, void *cur) 2330 { 2331 struct tcp_seq_afinfo *afinfo; 2332 struct sock *sk = cur; 2333 struct hlist_nulls_node *node; 2334 struct tcp_iter_state *st = seq->private; 2335 struct net *net = seq_file_net(seq); 2336 2337 if (st->bpf_seq_afinfo) 2338 afinfo = st->bpf_seq_afinfo; 2339 else 2340 afinfo = PDE_DATA(file_inode(seq->file)); 2341 2342 ++st->num; 2343 ++st->offset; 2344 2345 sk = sk_nulls_next(sk); 2346 2347 sk_nulls_for_each_from(sk, node) { 2348 if ((afinfo->family == AF_UNSPEC || 2349 sk->sk_family == afinfo->family) && 2350 net_eq(sock_net(sk), net)) 2351 return sk; 2352 } 2353 2354 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); 2355 ++st->bucket; 2356 return established_get_first(seq); 2357 } 2358 2359 static void *established_get_idx(struct seq_file *seq, loff_t pos) 2360 { 2361 struct tcp_iter_state *st = seq->private; 2362 void *rc; 2363 2364 st->bucket = 0; 2365 rc = established_get_first(seq); 2366 2367 while (rc && pos) { 2368 rc = established_get_next(seq, rc); 2369 --pos; 2370 } 2371 return rc; 2372 } 2373 2374 static void *tcp_get_idx(struct seq_file *seq, loff_t pos) 2375 { 2376 void *rc; 2377 struct tcp_iter_state *st = seq->private; 2378 2379 st->state = TCP_SEQ_STATE_LISTENING; 2380 rc = listening_get_idx(seq, &pos); 2381 2382 if (!rc) { 2383 st->state = TCP_SEQ_STATE_ESTABLISHED; 2384 rc = established_get_idx(seq, pos); 2385 } 2386 2387 return rc; 2388 } 2389 2390 static void *tcp_seek_last_pos(struct seq_file *seq) 2391 { 2392 struct tcp_iter_state *st = seq->private; 2393 int offset = st->offset; 2394 int orig_num = st->num; 2395 void *rc = NULL; 2396 2397 switch (st->state) { 2398 case TCP_SEQ_STATE_LISTENING: 2399 if (st->bucket >= INET_LHTABLE_SIZE) 2400 break; 2401 st->state = TCP_SEQ_STATE_LISTENING; 2402 rc = listening_get_next(seq, NULL); 2403 while (offset-- && rc) 2404 rc = listening_get_next(seq, rc); 2405 if (rc) 2406 break; 2407 st->bucket = 0; 2408 st->state = TCP_SEQ_STATE_ESTABLISHED; 2409 fallthrough; 2410 case TCP_SEQ_STATE_ESTABLISHED: 2411 if (st->bucket > tcp_hashinfo.ehash_mask) 2412 break; 2413 rc = established_get_first(seq); 2414 while (offset-- && rc) 2415 rc = established_get_next(seq, rc); 2416 } 2417 2418 st->num = orig_num; 2419 2420 return rc; 2421 } 2422 2423 void *tcp_seq_start(struct seq_file *seq, loff_t *pos) 2424 { 2425 struct tcp_iter_state *st = seq->private; 2426 void *rc; 2427 2428 if (*pos && *pos == st->last_pos) { 2429 rc = 
tcp_seek_last_pos(seq); 2430 if (rc) 2431 goto out; 2432 } 2433 2434 st->state = TCP_SEQ_STATE_LISTENING; 2435 st->num = 0; 2436 st->bucket = 0; 2437 st->offset = 0; 2438 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; 2439 2440 out: 2441 st->last_pos = *pos; 2442 return rc; 2443 } 2444 EXPORT_SYMBOL(tcp_seq_start); 2445 2446 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2447 { 2448 struct tcp_iter_state *st = seq->private; 2449 void *rc = NULL; 2450 2451 if (v == SEQ_START_TOKEN) { 2452 rc = tcp_get_idx(seq, 0); 2453 goto out; 2454 } 2455 2456 switch (st->state) { 2457 case TCP_SEQ_STATE_LISTENING: 2458 rc = listening_get_next(seq, v); 2459 if (!rc) { 2460 st->state = TCP_SEQ_STATE_ESTABLISHED; 2461 st->bucket = 0; 2462 st->offset = 0; 2463 rc = established_get_first(seq); 2464 } 2465 break; 2466 case TCP_SEQ_STATE_ESTABLISHED: 2467 rc = established_get_next(seq, v); 2468 break; 2469 } 2470 out: 2471 ++*pos; 2472 st->last_pos = *pos; 2473 return rc; 2474 } 2475 EXPORT_SYMBOL(tcp_seq_next); 2476 2477 void tcp_seq_stop(struct seq_file *seq, void *v) 2478 { 2479 struct tcp_iter_state *st = seq->private; 2480 2481 switch (st->state) { 2482 case TCP_SEQ_STATE_LISTENING: 2483 if (v != SEQ_START_TOKEN) 2484 spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock); 2485 break; 2486 case TCP_SEQ_STATE_ESTABLISHED: 2487 if (v) 2488 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); 2489 break; 2490 } 2491 } 2492 EXPORT_SYMBOL(tcp_seq_stop); 2493 2494 static void get_openreq4(const struct request_sock *req, 2495 struct seq_file *f, int i) 2496 { 2497 const struct inet_request_sock *ireq = inet_rsk(req); 2498 long delta = req->rsk_timer.expires - jiffies; 2499 2500 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2501 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK", 2502 i, 2503 ireq->ir_loc_addr, 2504 ireq->ir_num, 2505 ireq->ir_rmt_addr, 2506 ntohs(ireq->ir_rmt_port), 2507 TCP_SYN_RECV, 2508 0, 0, /* could print option size, but that is af dependent. 
*/ 2509 1, /* timers active (only the expire timer) */ 2510 jiffies_delta_to_clock_t(delta), 2511 req->num_timeout, 2512 from_kuid_munged(seq_user_ns(f), 2513 sock_i_uid(req->rsk_listener)), 2514 0, /* non standard timer */ 2515 0, /* open_requests have no inode */ 2516 0, 2517 req); 2518 } 2519 2520 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) 2521 { 2522 int timer_active; 2523 unsigned long timer_expires; 2524 const struct tcp_sock *tp = tcp_sk(sk); 2525 const struct inet_connection_sock *icsk = inet_csk(sk); 2526 const struct inet_sock *inet = inet_sk(sk); 2527 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; 2528 __be32 dest = inet->inet_daddr; 2529 __be32 src = inet->inet_rcv_saddr; 2530 __u16 destp = ntohs(inet->inet_dport); 2531 __u16 srcp = ntohs(inet->inet_sport); 2532 int rx_queue; 2533 int state; 2534 2535 if (icsk->icsk_pending == ICSK_TIME_RETRANS || 2536 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || 2537 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { 2538 timer_active = 1; 2539 timer_expires = icsk->icsk_timeout; 2540 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { 2541 timer_active = 4; 2542 timer_expires = icsk->icsk_timeout; 2543 } else if (timer_pending(&sk->sk_timer)) { 2544 timer_active = 2; 2545 timer_expires = sk->sk_timer.expires; 2546 } else { 2547 timer_active = 0; 2548 timer_expires = jiffies; 2549 } 2550 2551 state = inet_sk_state_load(sk); 2552 if (state == TCP_LISTEN) 2553 rx_queue = READ_ONCE(sk->sk_ack_backlog); 2554 else 2555 /* Because we don't lock the socket, 2556 * we might find a transient negative value. 2557 */ 2558 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) - 2559 READ_ONCE(tp->copied_seq), 0); 2560 2561 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " 2562 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d", 2563 i, src, srcp, dest, destp, state, 2564 READ_ONCE(tp->write_seq) - tp->snd_una, 2565 rx_queue, 2566 timer_active, 2567 jiffies_delta_to_clock_t(timer_expires - jiffies), 2568 icsk->icsk_retransmits, 2569 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)), 2570 icsk->icsk_probes_out, 2571 sock_i_ino(sk), 2572 refcount_read(&sk->sk_refcnt), sk, 2573 jiffies_to_clock_t(icsk->icsk_rto), 2574 jiffies_to_clock_t(icsk->icsk_ack.ato), 2575 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk), 2576 tp->snd_cwnd, 2577 state == TCP_LISTEN ? 2578 fastopenq->max_qlen : 2579 (tcp_in_initial_slowstart(tp) ? 
-1 : tp->snd_ssthresh)); 2580 } 2581 2582 static void get_timewait4_sock(const struct inet_timewait_sock *tw, 2583 struct seq_file *f, int i) 2584 { 2585 long delta = tw->tw_timer.expires - jiffies; 2586 __be32 dest, src; 2587 __u16 destp, srcp; 2588 2589 dest = tw->tw_daddr; 2590 src = tw->tw_rcv_saddr; 2591 destp = ntohs(tw->tw_dport); 2592 srcp = ntohs(tw->tw_sport); 2593 2594 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2595 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK", 2596 i, src, srcp, dest, destp, tw->tw_substate, 0, 0, 2597 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, 2598 refcount_read(&tw->tw_refcnt), tw); 2599 } 2600 2601 #define TMPSZ 150 2602 2603 static int tcp4_seq_show(struct seq_file *seq, void *v) 2604 { 2605 struct tcp_iter_state *st; 2606 struct sock *sk = v; 2607 2608 seq_setwidth(seq, TMPSZ - 1); 2609 if (v == SEQ_START_TOKEN) { 2610 seq_puts(seq, " sl local_address rem_address st tx_queue " 2611 "rx_queue tr tm->when retrnsmt uid timeout " 2612 "inode"); 2613 goto out; 2614 } 2615 st = seq->private; 2616 2617 if (sk->sk_state == TCP_TIME_WAIT) 2618 get_timewait4_sock(v, seq, st->num); 2619 else if (sk->sk_state == TCP_NEW_SYN_RECV) 2620 get_openreq4(v, seq, st->num); 2621 else 2622 get_tcp4_sock(v, seq, st->num); 2623 out: 2624 seq_pad(seq, '\n'); 2625 return 0; 2626 } 2627 2628 #ifdef CONFIG_BPF_SYSCALL 2629 struct bpf_iter__tcp { 2630 __bpf_md_ptr(struct bpf_iter_meta *, meta); 2631 __bpf_md_ptr(struct sock_common *, sk_common); 2632 uid_t uid __aligned(8); 2633 }; 2634 2635 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, 2636 struct sock_common *sk_common, uid_t uid) 2637 { 2638 struct bpf_iter__tcp ctx; 2639 2640 meta->seq_num--; /* skip SEQ_START_TOKEN */ 2641 ctx.meta = meta; 2642 ctx.sk_common = sk_common; 2643 ctx.uid = uid; 2644 return bpf_iter_run_prog(prog, &ctx); 2645 } 2646 2647 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v) 2648 { 2649 struct bpf_iter_meta meta; 2650 struct bpf_prog *prog; 2651 struct sock *sk = v; 2652 uid_t uid; 2653 2654 if (v == SEQ_START_TOKEN) 2655 return 0; 2656 2657 if (sk->sk_state == TCP_TIME_WAIT) { 2658 uid = 0; 2659 } else if (sk->sk_state == TCP_NEW_SYN_RECV) { 2660 const struct request_sock *req = v; 2661 2662 uid = from_kuid_munged(seq_user_ns(seq), 2663 sock_i_uid(req->rsk_listener)); 2664 } else { 2665 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); 2666 } 2667 2668 meta.seq = seq; 2669 prog = bpf_iter_get_info(&meta, false); 2670 return tcp_prog_seq_show(prog, &meta, v, uid); 2671 } 2672 2673 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v) 2674 { 2675 struct bpf_iter_meta meta; 2676 struct bpf_prog *prog; 2677 2678 if (!v) { 2679 meta.seq = seq; 2680 prog = bpf_iter_get_info(&meta, true); 2681 if (prog) 2682 (void)tcp_prog_seq_show(prog, &meta, v, 0); 2683 } 2684 2685 tcp_seq_stop(seq, v); 2686 } 2687 2688 static const struct seq_operations bpf_iter_tcp_seq_ops = { 2689 .show = bpf_iter_tcp_seq_show, 2690 .start = tcp_seq_start, 2691 .next = tcp_seq_next, 2692 .stop = bpf_iter_tcp_seq_stop, 2693 }; 2694 #endif 2695 2696 static const struct seq_operations tcp4_seq_ops = { 2697 .show = tcp4_seq_show, 2698 .start = tcp_seq_start, 2699 .next = tcp_seq_next, 2700 .stop = tcp_seq_stop, 2701 }; 2702 2703 static struct tcp_seq_afinfo tcp4_seq_afinfo = { 2704 .family = AF_INET, 2705 }; 2706 2707 static int __net_init tcp4_proc_init_net(struct net *net) 2708 { 2709 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops, 2710 
sizeof(struct tcp_iter_state), &tcp4_seq_afinfo)) 2711 return -ENOMEM; 2712 return 0; 2713 } 2714 2715 static void __net_exit tcp4_proc_exit_net(struct net *net) 2716 { 2717 remove_proc_entry("tcp", net->proc_net); 2718 } 2719 2720 static struct pernet_operations tcp4_net_ops = { 2721 .init = tcp4_proc_init_net, 2722 .exit = tcp4_proc_exit_net, 2723 }; 2724 2725 int __init tcp4_proc_init(void) 2726 { 2727 return register_pernet_subsys(&tcp4_net_ops); 2728 } 2729 2730 void tcp4_proc_exit(void) 2731 { 2732 unregister_pernet_subsys(&tcp4_net_ops); 2733 } 2734 #endif /* CONFIG_PROC_FS */ 2735 2736 struct proto tcp_prot = { 2737 .name = "TCP", 2738 .owner = THIS_MODULE, 2739 .close = tcp_close, 2740 .pre_connect = tcp_v4_pre_connect, 2741 .connect = tcp_v4_connect, 2742 .disconnect = tcp_disconnect, 2743 .accept = inet_csk_accept, 2744 .ioctl = tcp_ioctl, 2745 .init = tcp_v4_init_sock, 2746 .destroy = tcp_v4_destroy_sock, 2747 .shutdown = tcp_shutdown, 2748 .setsockopt = tcp_setsockopt, 2749 .getsockopt = tcp_getsockopt, 2750 .keepalive = tcp_set_keepalive, 2751 .recvmsg = tcp_recvmsg, 2752 .sendmsg = tcp_sendmsg, 2753 .sendpage = tcp_sendpage, 2754 .backlog_rcv = tcp_v4_do_rcv, 2755 .release_cb = tcp_release_cb, 2756 .hash = inet_hash, 2757 .unhash = inet_unhash, 2758 .get_port = inet_csk_get_port, 2759 .enter_memory_pressure = tcp_enter_memory_pressure, 2760 .leave_memory_pressure = tcp_leave_memory_pressure, 2761 .stream_memory_free = tcp_stream_memory_free, 2762 .sockets_allocated = &tcp_sockets_allocated, 2763 .orphan_count = &tcp_orphan_count, 2764 .memory_allocated = &tcp_memory_allocated, 2765 .memory_pressure = &tcp_memory_pressure, 2766 .sysctl_mem = sysctl_tcp_mem, 2767 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), 2768 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), 2769 .max_header = MAX_TCP_HEADER, 2770 .obj_size = sizeof(struct tcp_sock), 2771 .slab_flags = SLAB_TYPESAFE_BY_RCU, 2772 .twsk_prot = &tcp_timewait_sock_ops, 2773 .rsk_prot = &tcp_request_sock_ops, 2774 .h.hashinfo = &tcp_hashinfo, 2775 .no_autobind = true, 2776 #ifdef CONFIG_COMPAT 2777 .compat_setsockopt = compat_tcp_setsockopt, 2778 .compat_getsockopt = compat_tcp_getsockopt, 2779 #endif 2780 .diag_destroy = tcp_abort, 2781 }; 2782 EXPORT_SYMBOL(tcp_prot); 2783 2784 static void __net_exit tcp_sk_exit(struct net *net) 2785 { 2786 int cpu; 2787 2788 if (net->ipv4.tcp_congestion_control) 2789 bpf_module_put(net->ipv4.tcp_congestion_control, 2790 net->ipv4.tcp_congestion_control->owner); 2791 2792 for_each_possible_cpu(cpu) 2793 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu)); 2794 free_percpu(net->ipv4.tcp_sk); 2795 } 2796 2797 static int __net_init tcp_sk_init(struct net *net) 2798 { 2799 int res, cpu, cnt; 2800 2801 net->ipv4.tcp_sk = alloc_percpu(struct sock *); 2802 if (!net->ipv4.tcp_sk) 2803 return -ENOMEM; 2804 2805 for_each_possible_cpu(cpu) { 2806 struct sock *sk; 2807 2808 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW, 2809 IPPROTO_TCP, net); 2810 if (res) 2811 goto fail; 2812 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); 2813 2814 /* Please enforce IP_DF and IPID==0 for RST and 2815 * ACK sent in SYN-RECV and TIME-WAIT state. 
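 * (Presumably this is why inet_sk(sk)->pmtudisc is set to IP_PMTUDISC_DO
 *  right below: with DF forced on these unconnected per-cpu control
 *  sockets, the IP layer can keep using a zero IP ID for them.)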
2816 */ 2817 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO; 2818 2819 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk; 2820 } 2821 2822 net->ipv4.sysctl_tcp_ecn = 2; 2823 net->ipv4.sysctl_tcp_ecn_fallback = 1; 2824 2825 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; 2826 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS; 2827 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; 2828 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL; 2829 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS; 2830 2831 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME; 2832 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES; 2833 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL; 2834 2835 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES; 2836 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES; 2837 net->ipv4.sysctl_tcp_syncookies = 1; 2838 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH; 2839 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1; 2840 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2; 2841 net->ipv4.sysctl_tcp_orphan_retries = 0; 2842 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; 2843 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; 2844 net->ipv4.sysctl_tcp_tw_reuse = 2; 2845 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1; 2846 2847 cnt = tcp_hashinfo.ehash_mask + 1; 2848 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2; 2849 net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo; 2850 2851 net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128); 2852 net->ipv4.sysctl_tcp_sack = 1; 2853 net->ipv4.sysctl_tcp_window_scaling = 1; 2854 net->ipv4.sysctl_tcp_timestamps = 1; 2855 net->ipv4.sysctl_tcp_early_retrans = 3; 2856 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION; 2857 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */ 2858 net->ipv4.sysctl_tcp_retrans_collapse = 1; 2859 net->ipv4.sysctl_tcp_max_reordering = 300; 2860 net->ipv4.sysctl_tcp_dsack = 1; 2861 net->ipv4.sysctl_tcp_app_win = 31; 2862 net->ipv4.sysctl_tcp_adv_win_scale = 1; 2863 net->ipv4.sysctl_tcp_frto = 2; 2864 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1; 2865 /* This limits the percentage of the congestion window which we 2866 * will allow a single TSO frame to consume. Building TSO frames 2867 * which are too large can cause TCP streams to be bursty. 
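 * With the default of 3 set just below, a single TSO frame is limited to
 * roughly one third of the congestion window (e.g. a cwnd of 60 packets
 * allows bursts of at most about 20 packets).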
2868 */ 2869 net->ipv4.sysctl_tcp_tso_win_divisor = 3; 2870 /* Default TSQ limit of 16 TSO segments */ 2871 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536; 2872 /* rfc5961 challenge ack rate limiting */ 2873 net->ipv4.sysctl_tcp_challenge_ack_limit = 1000; 2874 net->ipv4.sysctl_tcp_min_tso_segs = 2; 2875 net->ipv4.sysctl_tcp_min_rtt_wlen = 300; 2876 net->ipv4.sysctl_tcp_autocorking = 1; 2877 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; 2878 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; 2879 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120; 2880 if (net != &init_net) { 2881 memcpy(net->ipv4.sysctl_tcp_rmem, 2882 init_net.ipv4.sysctl_tcp_rmem, 2883 sizeof(init_net.ipv4.sysctl_tcp_rmem)); 2884 memcpy(net->ipv4.sysctl_tcp_wmem, 2885 init_net.ipv4.sysctl_tcp_wmem, 2886 sizeof(init_net.ipv4.sysctl_tcp_wmem)); 2887 } 2888 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC; 2889 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC; 2890 net->ipv4.sysctl_tcp_comp_sack_nr = 44; 2891 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; 2892 spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); 2893 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60; 2894 atomic_set(&net->ipv4.tfo_active_disable_times, 0); 2895 2896 /* Reno is always built in */ 2897 if (!net_eq(net, &init_net) && 2898 bpf_try_module_get(init_net.ipv4.tcp_congestion_control, 2899 init_net.ipv4.tcp_congestion_control->owner)) 2900 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control; 2901 else 2902 net->ipv4.tcp_congestion_control = &tcp_reno; 2903 2904 return 0; 2905 fail: 2906 tcp_sk_exit(net); 2907 2908 return res; 2909 } 2910 2911 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) 2912 { 2913 struct net *net; 2914 2915 inet_twsk_purge(&tcp_hashinfo, AF_INET); 2916 2917 list_for_each_entry(net, net_exit_list, exit_list) 2918 tcp_fastopen_ctx_destroy(net); 2919 } 2920 2921 static struct pernet_operations __net_initdata tcp_sk_ops = { 2922 .init = tcp_sk_init, 2923 .exit = tcp_sk_exit, 2924 .exit_batch = tcp_sk_exit_batch, 2925 }; 2926 2927 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 2928 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta, 2929 struct sock_common *sk_common, uid_t uid) 2930 2931 static int bpf_iter_init_tcp(void *priv_data) 2932 { 2933 struct tcp_iter_state *st = priv_data; 2934 struct tcp_seq_afinfo *afinfo; 2935 int ret; 2936 2937 afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN); 2938 if (!afinfo) 2939 return -ENOMEM; 2940 2941 afinfo->family = AF_UNSPEC; 2942 st->bpf_seq_afinfo = afinfo; 2943 ret = bpf_iter_init_seq_net(priv_data); 2944 if (ret) 2945 kfree(afinfo); 2946 return ret; 2947 } 2948 2949 static void bpf_iter_fini_tcp(void *priv_data) 2950 { 2951 struct tcp_iter_state *st = priv_data; 2952 2953 kfree(st->bpf_seq_afinfo); 2954 bpf_iter_fini_seq_net(priv_data); 2955 } 2956 2957 static const struct bpf_iter_reg tcp_reg_info = { 2958 .target = "tcp", 2959 .seq_ops = &bpf_iter_tcp_seq_ops, 2960 .init_seq_private = bpf_iter_init_tcp, 2961 .fini_seq_private = bpf_iter_fini_tcp, 2962 .seq_priv_size = sizeof(struct tcp_iter_state), 2963 .ctx_arg_info_size = 1, 2964 .ctx_arg_info = { 2965 { offsetof(struct bpf_iter__tcp, sk_common), 2966 PTR_TO_BTF_ID_OR_NULL }, 2967 }, 2968 }; 2969 2970 static void __init bpf_iter_register(void) 2971 { 2972 if (bpf_iter_reg_target(&tcp_reg_info)) 2973 pr_warn("Warning: could not register bpf iterator tcp\n"); 2974 } 2975 2976 #endif 2977 2978 void __init tcp_v4_init(void) 2979 { 2980 if 
(register_pernet_subsys(&tcp_sk_ops)) 2981 panic("Failed to create the TCP control socket.\n"); 2982 2983 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 2984 bpf_iter_register(); 2985 #endif 2986 } 2987