// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
Miller #include <net/secure_seq.h> 71076bb0c8SEliezer Tamir #include <net/busy_poll.h> 721da177e4SLinus Torvalds 731da177e4SLinus Torvalds #include <linux/inet.h> 741da177e4SLinus Torvalds #include <linux/ipv6.h> 751da177e4SLinus Torvalds #include <linux/stddef.h> 761da177e4SLinus Torvalds #include <linux/proc_fs.h> 771da177e4SLinus Torvalds #include <linux/seq_file.h> 786797318eSIvan Delalande #include <linux/inetdevice.h> 79951cf368SYonghong Song #include <linux/btf_ids.h> 801da177e4SLinus Torvalds 81cf80e0e4SHerbert Xu #include <crypto/hash.h> 82cfb6eeb4SYOSHIFUJI Hideaki #include <linux/scatterlist.h> 83cfb6eeb4SYOSHIFUJI Hideaki 84c24b14c4SSong Liu #include <trace/events/tcp.h> 85c24b14c4SSong Liu 86cfb6eeb4SYOSHIFUJI Hideaki #ifdef CONFIG_TCP_MD5SIG 87a915da9bSEric Dumazet static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, 88318cf7aaSEric Dumazet __be32 daddr, __be32 saddr, const struct tcphdr *th); 89cfb6eeb4SYOSHIFUJI Hideaki #endif 90cfb6eeb4SYOSHIFUJI Hideaki 915caea4eaSEric Dumazet struct inet_hashinfo tcp_hashinfo; 924bc2f18bSEric Dumazet EXPORT_SYMBOL(tcp_hashinfo); 931da177e4SLinus Torvalds 9484b114b9SEric Dumazet static u32 tcp_v4_init_seq(const struct sk_buff *skb) 951da177e4SLinus Torvalds { 9684b114b9SEric Dumazet return secure_tcp_seq(ip_hdr(skb)->daddr, 97eddc9ec5SArnaldo Carvalho de Melo ip_hdr(skb)->saddr, 98aa8223c7SArnaldo Carvalho de Melo tcp_hdr(skb)->dest, 9984b114b9SEric Dumazet tcp_hdr(skb)->source); 10084b114b9SEric Dumazet } 10184b114b9SEric Dumazet 1025d2ed052SEric Dumazet static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb) 10384b114b9SEric Dumazet { 1045d2ed052SEric Dumazet return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr); 1051da177e4SLinus Torvalds } 1061da177e4SLinus Torvalds 1076d6ee43eSArnaldo Carvalho de Melo int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) 1086d6ee43eSArnaldo Carvalho de Melo { 10979e9fed4SMaciej Żenczykowski const 
struct inet_timewait_sock *tw = inet_twsk(sktw); 1106d6ee43eSArnaldo Carvalho de Melo const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw); 1116d6ee43eSArnaldo Carvalho de Melo struct tcp_sock *tp = tcp_sk(sk); 11279e9fed4SMaciej Żenczykowski int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse; 11379e9fed4SMaciej Żenczykowski 11479e9fed4SMaciej Żenczykowski if (reuse == 2) { 11579e9fed4SMaciej Żenczykowski /* Still does not detect *everything* that goes through 11679e9fed4SMaciej Żenczykowski * lo, since we require a loopback src or dst address 11779e9fed4SMaciej Żenczykowski * or direct binding to 'lo' interface. 11879e9fed4SMaciej Żenczykowski */ 11979e9fed4SMaciej Żenczykowski bool loopback = false; 12079e9fed4SMaciej Żenczykowski if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX) 12179e9fed4SMaciej Żenczykowski loopback = true; 12279e9fed4SMaciej Żenczykowski #if IS_ENABLED(CONFIG_IPV6) 12379e9fed4SMaciej Żenczykowski if (tw->tw_family == AF_INET6) { 12479e9fed4SMaciej Żenczykowski if (ipv6_addr_loopback(&tw->tw_v6_daddr) || 125be2644aaSEric Dumazet ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) || 12679e9fed4SMaciej Żenczykowski ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) || 127be2644aaSEric Dumazet ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr)) 12879e9fed4SMaciej Żenczykowski loopback = true; 12979e9fed4SMaciej Żenczykowski } else 13079e9fed4SMaciej Żenczykowski #endif 13179e9fed4SMaciej Żenczykowski { 13279e9fed4SMaciej Żenczykowski if (ipv4_is_loopback(tw->tw_daddr) || 13379e9fed4SMaciej Żenczykowski ipv4_is_loopback(tw->tw_rcv_saddr)) 13479e9fed4SMaciej Żenczykowski loopback = true; 13579e9fed4SMaciej Żenczykowski } 13679e9fed4SMaciej Żenczykowski if (!loopback) 13779e9fed4SMaciej Żenczykowski reuse = 0; 13879e9fed4SMaciej Żenczykowski } 1396d6ee43eSArnaldo Carvalho de Melo 1406d6ee43eSArnaldo Carvalho de Melo /* With PAWS, it is safe from the viewpoint 1416d6ee43eSArnaldo Carvalho de Melo of data integrity. 
Even without PAWS it is safe provided sequence 1426d6ee43eSArnaldo Carvalho de Melo spaces do not overlap i.e. at data rates <= 80Mbit/sec. 1436d6ee43eSArnaldo Carvalho de Melo 1446d6ee43eSArnaldo Carvalho de Melo Actually, the idea is close to VJ's one, only timestamp cache is 1456d6ee43eSArnaldo Carvalho de Melo held not per host, but per port pair and TW bucket is used as state 1466d6ee43eSArnaldo Carvalho de Melo holder. 1476d6ee43eSArnaldo Carvalho de Melo 1486d6ee43eSArnaldo Carvalho de Melo If TW bucket has been already destroyed we fall back to VJ's scheme 1496d6ee43eSArnaldo Carvalho de Melo and use initial timestamp retrieved from peer table. 1506d6ee43eSArnaldo Carvalho de Melo */ 1516d6ee43eSArnaldo Carvalho de Melo if (tcptw->tw_ts_recent_stamp && 152cca9bab1SArnd Bergmann (!twp || (reuse && time_after32(ktime_get_seconds(), 153cca9bab1SArnd Bergmann tcptw->tw_ts_recent_stamp)))) { 15421684dc4SStefan Baranoff /* In case of repair and re-using TIME-WAIT sockets we still 15521684dc4SStefan Baranoff * want to be sure that it is safe as above but honor the 15621684dc4SStefan Baranoff * sequence numbers and time stamps set as part of the repair 15721684dc4SStefan Baranoff * process. 15821684dc4SStefan Baranoff * 15921684dc4SStefan Baranoff * Without this check re-using a TIME-WAIT socket with TCP 16021684dc4SStefan Baranoff * repair would accumulate a -1 on the repair assigned 16121684dc4SStefan Baranoff * sequence number. The first time it is reused the sequence 16221684dc4SStefan Baranoff * is -1, the second time -2, etc. This fixes that issue 16321684dc4SStefan Baranoff * without appearing to create any others. 
16421684dc4SStefan Baranoff */ 16521684dc4SStefan Baranoff if (likely(!tp->repair)) { 1660f317464SEric Dumazet u32 seq = tcptw->tw_snd_nxt + 65535 + 2; 1670f317464SEric Dumazet 1680f317464SEric Dumazet if (!seq) 1690f317464SEric Dumazet seq = 1; 1700f317464SEric Dumazet WRITE_ONCE(tp->write_seq, seq); 1716d6ee43eSArnaldo Carvalho de Melo tp->rx_opt.ts_recent = tcptw->tw_ts_recent; 1726d6ee43eSArnaldo Carvalho de Melo tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; 17321684dc4SStefan Baranoff } 1746d6ee43eSArnaldo Carvalho de Melo sock_hold(sktw); 1756d6ee43eSArnaldo Carvalho de Melo return 1; 1766d6ee43eSArnaldo Carvalho de Melo } 1776d6ee43eSArnaldo Carvalho de Melo 1786d6ee43eSArnaldo Carvalho de Melo return 0; 1796d6ee43eSArnaldo Carvalho de Melo } 1806d6ee43eSArnaldo Carvalho de Melo EXPORT_SYMBOL_GPL(tcp_twsk_unique); 1816d6ee43eSArnaldo Carvalho de Melo 182d74bad4eSAndrey Ignatov static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr, 183d74bad4eSAndrey Ignatov int addr_len) 184d74bad4eSAndrey Ignatov { 185d74bad4eSAndrey Ignatov /* This check is replicated from tcp_v4_connect() and intended to 186d74bad4eSAndrey Ignatov * prevent BPF program called below from accessing bytes that are out 187d74bad4eSAndrey Ignatov * of the bound specified by user in addr_len. 188d74bad4eSAndrey Ignatov */ 189d74bad4eSAndrey Ignatov if (addr_len < sizeof(struct sockaddr_in)) 190d74bad4eSAndrey Ignatov return -EINVAL; 191d74bad4eSAndrey Ignatov 192d74bad4eSAndrey Ignatov sock_owned_by_me(sk); 193d74bad4eSAndrey Ignatov 194d74bad4eSAndrey Ignatov return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr); 195d74bad4eSAndrey Ignatov } 196d74bad4eSAndrey Ignatov 1971da177e4SLinus Torvalds /* This will initiate an outgoing connection. */ 1981da177e4SLinus Torvalds int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) 1991da177e4SLinus Torvalds { 2002d7192d6SDavid S. 
Miller struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; 2011da177e4SLinus Torvalds struct inet_sock *inet = inet_sk(sk); 2021da177e4SLinus Torvalds struct tcp_sock *tp = tcp_sk(sk); 203dca8b089SDavid S. Miller __be16 orig_sport, orig_dport; 204bada8adcSAl Viro __be32 daddr, nexthop; 205da905bd1SDavid S. Miller struct flowi4 *fl4; 2062d7192d6SDavid S. Miller struct rtable *rt; 2071da177e4SLinus Torvalds int err; 208f6d8bd05SEric Dumazet struct ip_options_rcu *inet_opt; 2091946e672SHaishuang Yan struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; 2101da177e4SLinus Torvalds 2111da177e4SLinus Torvalds if (addr_len < sizeof(struct sockaddr_in)) 2121da177e4SLinus Torvalds return -EINVAL; 2131da177e4SLinus Torvalds 2141da177e4SLinus Torvalds if (usin->sin_family != AF_INET) 2151da177e4SLinus Torvalds return -EAFNOSUPPORT; 2161da177e4SLinus Torvalds 2171da177e4SLinus Torvalds nexthop = daddr = usin->sin_addr.s_addr; 218f6d8bd05SEric Dumazet inet_opt = rcu_dereference_protected(inet->inet_opt, 2191e1d04e6SHannes Frederic Sowa lockdep_sock_is_held(sk)); 220f6d8bd05SEric Dumazet if (inet_opt && inet_opt->opt.srr) { 2211da177e4SLinus Torvalds if (!daddr) 2221da177e4SLinus Torvalds return -EINVAL; 223f6d8bd05SEric Dumazet nexthop = inet_opt->opt.faddr; 2241da177e4SLinus Torvalds } 2251da177e4SLinus Torvalds 226dca8b089SDavid S. Miller orig_sport = inet->inet_sport; 227dca8b089SDavid S. Miller orig_dport = usin->sin_port; 228da905bd1SDavid S. Miller fl4 = &inet->cork.fl.u.ip4; 229da905bd1SDavid S. Miller rt = ip_route_connect(fl4, nexthop, inet->inet_saddr, 2301da177e4SLinus Torvalds RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, 2311da177e4SLinus Torvalds IPPROTO_TCP, 2320e0d44abSSteffen Klassert orig_sport, orig_dport, sk); 233b23dd4feSDavid S. Miller if (IS_ERR(rt)) { 234b23dd4feSDavid S. Miller err = PTR_ERR(rt); 235b23dd4feSDavid S. 
Miller if (err == -ENETUNREACH) 236f1d8cba6SEric Dumazet IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); 237b23dd4feSDavid S. Miller return err; 238584bdf8cSWei Dong } 2391da177e4SLinus Torvalds 2401da177e4SLinus Torvalds if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { 2411da177e4SLinus Torvalds ip_rt_put(rt); 2421da177e4SLinus Torvalds return -ENETUNREACH; 2431da177e4SLinus Torvalds } 2441da177e4SLinus Torvalds 245f6d8bd05SEric Dumazet if (!inet_opt || !inet_opt->opt.srr) 246da905bd1SDavid S. Miller daddr = fl4->daddr; 2471da177e4SLinus Torvalds 248c720c7e8SEric Dumazet if (!inet->inet_saddr) 249da905bd1SDavid S. Miller inet->inet_saddr = fl4->saddr; 250d1e559d0SEric Dumazet sk_rcv_saddr_set(sk, inet->inet_saddr); 2511da177e4SLinus Torvalds 252c720c7e8SEric Dumazet if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) { 2531da177e4SLinus Torvalds /* Reset inherited state */ 2541da177e4SLinus Torvalds tp->rx_opt.ts_recent = 0; 2551da177e4SLinus Torvalds tp->rx_opt.ts_recent_stamp = 0; 256ee995283SPavel Emelyanov if (likely(!tp->repair)) 2570f317464SEric Dumazet WRITE_ONCE(tp->write_seq, 0); 2581da177e4SLinus Torvalds } 2591da177e4SLinus Torvalds 260c720c7e8SEric Dumazet inet->inet_dport = usin->sin_port; 261d1e559d0SEric Dumazet sk_daddr_set(sk, daddr); 2621da177e4SLinus Torvalds 263d83d8461SArnaldo Carvalho de Melo inet_csk(sk)->icsk_ext_hdr_len = 0; 264f6d8bd05SEric Dumazet if (inet_opt) 265f6d8bd05SEric Dumazet inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 2661da177e4SLinus Torvalds 267bee7ca9eSWilliam Allen Simpson tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT; 2681da177e4SLinus Torvalds 2691da177e4SLinus Torvalds /* Socket identity is still unknown (sport may be zero). 2701da177e4SLinus Torvalds * However we set state to SYN-SENT and not releasing socket 2711da177e4SLinus Torvalds * lock select source port, enter ourselves into the hash tables and 2721da177e4SLinus Torvalds * complete initialization after this. 
2731da177e4SLinus Torvalds */ 2741da177e4SLinus Torvalds tcp_set_state(sk, TCP_SYN_SENT); 2751946e672SHaishuang Yan err = inet_hash_connect(tcp_death_row, sk); 2761da177e4SLinus Torvalds if (err) 2771da177e4SLinus Torvalds goto failure; 2781da177e4SLinus Torvalds 279877d1f62STom Herbert sk_set_txhash(sk); 2809e7ceb06SSathya Perla 281da905bd1SDavid S. Miller rt = ip_route_newports(fl4, rt, orig_sport, orig_dport, 282c720c7e8SEric Dumazet inet->inet_sport, inet->inet_dport, sk); 283b23dd4feSDavid S. Miller if (IS_ERR(rt)) { 284b23dd4feSDavid S. Miller err = PTR_ERR(rt); 285b23dd4feSDavid S. Miller rt = NULL; 2861da177e4SLinus Torvalds goto failure; 287b23dd4feSDavid S. Miller } 2881da177e4SLinus Torvalds /* OK, now commit destination to socket. */ 289bcd76111SHerbert Xu sk->sk_gso_type = SKB_GSO_TCPV4; 290d8d1f30bSChangli Gao sk_setup_caps(sk, &rt->dst); 29119f6d3f3SWei Wang rt = NULL; 2921da177e4SLinus Torvalds 29300355fa5SAlexey Kodanev if (likely(!tp->repair)) { 29484b114b9SEric Dumazet if (!tp->write_seq) 2950f317464SEric Dumazet WRITE_ONCE(tp->write_seq, 2960f317464SEric Dumazet secure_tcp_seq(inet->inet_saddr, 297c720c7e8SEric Dumazet inet->inet_daddr, 298c720c7e8SEric Dumazet inet->inet_sport, 2990f317464SEric Dumazet usin->sin_port)); 3005d2ed052SEric Dumazet tp->tsoffset = secure_tcp_ts_off(sock_net(sk), 3015d2ed052SEric Dumazet inet->inet_saddr, 30284b114b9SEric Dumazet inet->inet_daddr); 30300355fa5SAlexey Kodanev } 3041da177e4SLinus Torvalds 305a904a069SEric Dumazet inet->inet_id = prandom_u32(); 3061da177e4SLinus Torvalds 30719f6d3f3SWei Wang if (tcp_fastopen_defer_connect(sk, &err)) 30819f6d3f3SWei Wang return err; 30919f6d3f3SWei Wang if (err) 31019f6d3f3SWei Wang goto failure; 31119f6d3f3SWei Wang 3121da177e4SLinus Torvalds err = tcp_connect(sk); 313ee995283SPavel Emelyanov 3141da177e4SLinus Torvalds if (err) 3151da177e4SLinus Torvalds goto failure; 3161da177e4SLinus Torvalds 3171da177e4SLinus Torvalds return 0; 3181da177e4SLinus Torvalds 
3191da177e4SLinus Torvalds failure: 3207174259eSArnaldo Carvalho de Melo /* 3217174259eSArnaldo Carvalho de Melo * This unhashes the socket and releases the local port, 3227174259eSArnaldo Carvalho de Melo * if necessary. 3237174259eSArnaldo Carvalho de Melo */ 3241da177e4SLinus Torvalds tcp_set_state(sk, TCP_CLOSE); 3251da177e4SLinus Torvalds ip_rt_put(rt); 3261da177e4SLinus Torvalds sk->sk_route_caps = 0; 327c720c7e8SEric Dumazet inet->inet_dport = 0; 3281da177e4SLinus Torvalds return err; 3291da177e4SLinus Torvalds } 3304bc2f18bSEric Dumazet EXPORT_SYMBOL(tcp_v4_connect); 3311da177e4SLinus Torvalds 3321da177e4SLinus Torvalds /* 333563d34d0SEric Dumazet * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191. 334563d34d0SEric Dumazet * It can be called through tcp_release_cb() if socket was owned by user 335563d34d0SEric Dumazet * at the time tcp_v4_err() was called to handle ICMP message. 3361da177e4SLinus Torvalds */ 3374fab9071SNeal Cardwell void tcp_v4_mtu_reduced(struct sock *sk) 3381da177e4SLinus Torvalds { 3391da177e4SLinus Torvalds struct inet_sock *inet = inet_sk(sk); 34002b2faafSEric Dumazet struct dst_entry *dst; 34102b2faafSEric Dumazet u32 mtu; 3421da177e4SLinus Torvalds 34302b2faafSEric Dumazet if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) 34402b2faafSEric Dumazet return; 34502b2faafSEric Dumazet mtu = tcp_sk(sk)->mtu_info; 34680d0a69fSDavid S. Miller dst = inet_csk_update_pmtu(sk, mtu); 34780d0a69fSDavid S. Miller if (!dst) 3481da177e4SLinus Torvalds return; 3491da177e4SLinus Torvalds 3501da177e4SLinus Torvalds /* Something is about to be wrong... Remember soft error 3511da177e4SLinus Torvalds * for the case, if this connection will not able to recover. 
3521da177e4SLinus Torvalds */ 3531da177e4SLinus Torvalds if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst)) 3541da177e4SLinus Torvalds sk->sk_err_soft = EMSGSIZE; 3551da177e4SLinus Torvalds 3561da177e4SLinus Torvalds mtu = dst_mtu(dst); 3571da177e4SLinus Torvalds 3581da177e4SLinus Torvalds if (inet->pmtudisc != IP_PMTUDISC_DONT && 359482fc609SHannes Frederic Sowa ip_sk_accept_pmtu(sk) && 360d83d8461SArnaldo Carvalho de Melo inet_csk(sk)->icsk_pmtu_cookie > mtu) { 3611da177e4SLinus Torvalds tcp_sync_mss(sk, mtu); 3621da177e4SLinus Torvalds 3631da177e4SLinus Torvalds /* Resend the TCP packet because it's 3641da177e4SLinus Torvalds * clear that the old packet has been 3651da177e4SLinus Torvalds * dropped. This is the new "fast" path mtu 3661da177e4SLinus Torvalds * discovery. 3671da177e4SLinus Torvalds */ 3681da177e4SLinus Torvalds tcp_simple_retransmit(sk); 3691da177e4SLinus Torvalds } /* else let the usual retransmit timer handle it */ 3701da177e4SLinus Torvalds } 3714fab9071SNeal Cardwell EXPORT_SYMBOL(tcp_v4_mtu_reduced); 3721da177e4SLinus Torvalds 37355be7a9cSDavid S. Miller static void do_redirect(struct sk_buff *skb, struct sock *sk) 37455be7a9cSDavid S. Miller { 37555be7a9cSDavid S. Miller struct dst_entry *dst = __sk_dst_check(sk, 0); 37655be7a9cSDavid S. Miller 3771ed5c48fSDavid S. Miller if (dst) 3786700c270SDavid S. Miller dst->ops->redirect(dst, sk, skb); 37955be7a9cSDavid S. Miller } 38055be7a9cSDavid S. Miller 38126e37360SEric Dumazet 38226e37360SEric Dumazet /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */ 3839cf74903SEric Dumazet void tcp_req_err(struct sock *sk, u32 seq, bool abort) 38426e37360SEric Dumazet { 38526e37360SEric Dumazet struct request_sock *req = inet_reqsk(sk); 38626e37360SEric Dumazet struct net *net = sock_net(sk); 38726e37360SEric Dumazet 38826e37360SEric Dumazet /* ICMPs are not backlogged, hence we cannot get 38926e37360SEric Dumazet * an established socket here. 
39026e37360SEric Dumazet */ 39126e37360SEric Dumazet if (seq != tcp_rsk(req)->snt_isn) { 39202a1d6e7SEric Dumazet __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); 3939cf74903SEric Dumazet } else if (abort) { 39426e37360SEric Dumazet /* 39526e37360SEric Dumazet * Still in SYN_RECV, just remove it silently. 39626e37360SEric Dumazet * There is no good way to pass the error to the newly 39726e37360SEric Dumazet * created socket, and POSIX does not want network 39826e37360SEric Dumazet * errors returned from accept(). 39926e37360SEric Dumazet */ 400c6973669SFan Du inet_csk_reqsk_queue_drop(req->rsk_listener, req); 4019caad864SEric Dumazet tcp_listendrop(req->rsk_listener); 40226e37360SEric Dumazet } 403ef84d8ceSEric Dumazet reqsk_put(req); 40426e37360SEric Dumazet } 40526e37360SEric Dumazet EXPORT_SYMBOL(tcp_req_err); 40626e37360SEric Dumazet 407f7456642SEric Dumazet /* TCP-LD (RFC 6069) logic */ 408d2924569SEric Dumazet void tcp_ld_RTO_revert(struct sock *sk, u32 seq) 409f7456642SEric Dumazet { 410f7456642SEric Dumazet struct inet_connection_sock *icsk = inet_csk(sk); 411f7456642SEric Dumazet struct tcp_sock *tp = tcp_sk(sk); 412f7456642SEric Dumazet struct sk_buff *skb; 413f7456642SEric Dumazet s32 remaining; 414f7456642SEric Dumazet u32 delta_us; 415f7456642SEric Dumazet 416f7456642SEric Dumazet if (sock_owned_by_user(sk)) 417f7456642SEric Dumazet return; 418f7456642SEric Dumazet 419f7456642SEric Dumazet if (seq != tp->snd_una || !icsk->icsk_retransmits || 420f7456642SEric Dumazet !icsk->icsk_backoff) 421f7456642SEric Dumazet return; 422f7456642SEric Dumazet 423f7456642SEric Dumazet skb = tcp_rtx_queue_head(sk); 424f7456642SEric Dumazet if (WARN_ON_ONCE(!skb)) 425f7456642SEric Dumazet return; 426f7456642SEric Dumazet 427f7456642SEric Dumazet icsk->icsk_backoff--; 428f7456642SEric Dumazet icsk->icsk_rto = tp->srtt_us ? 
__tcp_set_rto(tp) : TCP_TIMEOUT_INIT; 429f7456642SEric Dumazet icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); 430f7456642SEric Dumazet 431f7456642SEric Dumazet tcp_mstamp_refresh(tp); 432f7456642SEric Dumazet delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb)); 433f7456642SEric Dumazet remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us); 434f7456642SEric Dumazet 435f7456642SEric Dumazet if (remaining > 0) { 436f7456642SEric Dumazet inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, 437f7456642SEric Dumazet remaining, TCP_RTO_MAX); 438f7456642SEric Dumazet } else { 439f7456642SEric Dumazet /* RTO revert clocked out retransmission. 440f7456642SEric Dumazet * Will retransmit now. 441f7456642SEric Dumazet */ 442f7456642SEric Dumazet tcp_retransmit_timer(sk); 443f7456642SEric Dumazet } 444f7456642SEric Dumazet } 445d2924569SEric Dumazet EXPORT_SYMBOL(tcp_ld_RTO_revert); 446f7456642SEric Dumazet 4471da177e4SLinus Torvalds /* 4481da177e4SLinus Torvalds * This routine is called by the ICMP module when it gets some 4491da177e4SLinus Torvalds * sort of error condition. If err < 0 then the socket should 4501da177e4SLinus Torvalds * be closed and the error returned to the user. If err > 0 4511da177e4SLinus Torvalds * it's just the icmp type << 8 | icmp code. After adjustment 4521da177e4SLinus Torvalds * header points to the first 8 bytes of the tcp header. We need 4531da177e4SLinus Torvalds * to find the appropriate port. 4541da177e4SLinus Torvalds * 4551da177e4SLinus Torvalds * The locking strategy used here is very "optimistic". When 4561da177e4SLinus Torvalds * someone else accesses the socket the ICMP is just dropped 4571da177e4SLinus Torvalds * and for some paths there is no check at all. 4581da177e4SLinus Torvalds * A more general error queue to queue errors for later handling 4591da177e4SLinus Torvalds * is probably better. 
4601da177e4SLinus Torvalds * 4611da177e4SLinus Torvalds */ 4621da177e4SLinus Torvalds 463a12daf13SEric Dumazet int tcp_v4_err(struct sk_buff *skb, u32 info) 4641da177e4SLinus Torvalds { 465a12daf13SEric Dumazet const struct iphdr *iph = (const struct iphdr *)skb->data; 466a12daf13SEric Dumazet struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2)); 4671da177e4SLinus Torvalds struct tcp_sock *tp; 4681da177e4SLinus Torvalds struct inet_sock *inet; 469a12daf13SEric Dumazet const int type = icmp_hdr(skb)->type; 470a12daf13SEric Dumazet const int code = icmp_hdr(skb)->code; 4711da177e4SLinus Torvalds struct sock *sk; 4720a672f74SYuchung Cheng struct request_sock *fastopen; 4739a568de4SEric Dumazet u32 seq, snd_una; 4741da177e4SLinus Torvalds int err; 475a12daf13SEric Dumazet struct net *net = dev_net(skb->dev); 4761da177e4SLinus Torvalds 47726e37360SEric Dumazet sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr, 47826e37360SEric Dumazet th->dest, iph->saddr, ntohs(th->source), 479a12daf13SEric Dumazet inet_iif(skb), 0); 4801da177e4SLinus Torvalds if (!sk) { 4815d3848bcSEric Dumazet __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); 48232bbd879SStefano Brivio return -ENOENT; 4831da177e4SLinus Torvalds } 4841da177e4SLinus Torvalds if (sk->sk_state == TCP_TIME_WAIT) { 4859469c7b4SYOSHIFUJI Hideaki inet_twsk_put(inet_twsk(sk)); 48632bbd879SStefano Brivio return 0; 4871da177e4SLinus Torvalds } 48826e37360SEric Dumazet seq = ntohl(th->seq); 48932bbd879SStefano Brivio if (sk->sk_state == TCP_NEW_SYN_RECV) { 49032bbd879SStefano Brivio tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB || 4919cf74903SEric Dumazet type == ICMP_TIME_EXCEEDED || 4929cf74903SEric Dumazet (type == ICMP_DEST_UNREACH && 4939cf74903SEric Dumazet (code == ICMP_NET_UNREACH || 4949cf74903SEric Dumazet code == ICMP_HOST_UNREACH))); 49532bbd879SStefano Brivio return 0; 49632bbd879SStefano Brivio } 4971da177e4SLinus Torvalds 4981da177e4SLinus Torvalds bh_lock_sock(sk); 4991da177e4SLinus Torvalds 
/* If too many ICMPs get dropped on busy 5001da177e4SLinus Torvalds * servers this needs to be solved differently. 501563d34d0SEric Dumazet * We do take care of PMTU discovery (RFC1191) special case : 502563d34d0SEric Dumazet * we can receive locally generated ICMP messages while socket is held. 5031da177e4SLinus Torvalds */ 504b74aa930SEric Dumazet if (sock_owned_by_user(sk)) { 505b74aa930SEric Dumazet if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)) 50602a1d6e7SEric Dumazet __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS); 507b74aa930SEric Dumazet } 5081da177e4SLinus Torvalds if (sk->sk_state == TCP_CLOSE) 5091da177e4SLinus Torvalds goto out; 5101da177e4SLinus Torvalds 51197e3ecd1Sstephen hemminger if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) { 51202a1d6e7SEric Dumazet __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 51397e3ecd1Sstephen hemminger goto out; 51497e3ecd1Sstephen hemminger } 51597e3ecd1Sstephen hemminger 5161da177e4SLinus Torvalds tp = tcp_sk(sk); 5170a672f74SYuchung Cheng /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ 518d983ea6fSEric Dumazet fastopen = rcu_dereference(tp->fastopen_rsk); 5190a672f74SYuchung Cheng snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una; 5201da177e4SLinus Torvalds if (sk->sk_state != TCP_LISTEN && 5210a672f74SYuchung Cheng !between(seq, snd_una, tp->snd_nxt)) { 52202a1d6e7SEric Dumazet __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); 5231da177e4SLinus Torvalds goto out; 5241da177e4SLinus Torvalds } 5251da177e4SLinus Torvalds 5261da177e4SLinus Torvalds switch (type) { 52755be7a9cSDavid S. Miller case ICMP_REDIRECT: 52845caeaa5SJon Maxwell if (!sock_owned_by_user(sk)) 529a12daf13SEric Dumazet do_redirect(skb, sk); 53055be7a9cSDavid S. Miller goto out; 5311da177e4SLinus Torvalds case ICMP_SOURCE_QUENCH: 5321da177e4SLinus Torvalds /* Just silently ignore these. 
*/ 5331da177e4SLinus Torvalds goto out; 5341da177e4SLinus Torvalds case ICMP_PARAMETERPROB: 5351da177e4SLinus Torvalds err = EPROTO; 5361da177e4SLinus Torvalds break; 5371da177e4SLinus Torvalds case ICMP_DEST_UNREACH: 5381da177e4SLinus Torvalds if (code > NR_ICMP_UNREACH) 5391da177e4SLinus Torvalds goto out; 5401da177e4SLinus Torvalds 5411da177e4SLinus Torvalds if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ 5420d4f0608SEric Dumazet /* We are not interested in TCP_LISTEN and open_requests 5430d4f0608SEric Dumazet * (SYN-ACKs send out by Linux are always <576bytes so 5440d4f0608SEric Dumazet * they should go through unfragmented). 5450d4f0608SEric Dumazet */ 5460d4f0608SEric Dumazet if (sk->sk_state == TCP_LISTEN) 5470d4f0608SEric Dumazet goto out; 5480d4f0608SEric Dumazet 549563d34d0SEric Dumazet tp->mtu_info = info; 550144d56e9SEric Dumazet if (!sock_owned_by_user(sk)) { 551563d34d0SEric Dumazet tcp_v4_mtu_reduced(sk); 552144d56e9SEric Dumazet } else { 5537aa5470cSEric Dumazet if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags)) 554144d56e9SEric Dumazet sock_hold(sk); 555144d56e9SEric Dumazet } 5561da177e4SLinus Torvalds goto out; 5571da177e4SLinus Torvalds } 5581da177e4SLinus Torvalds 5591da177e4SLinus Torvalds err = icmp_err_convert[code].errno; 560f7456642SEric Dumazet /* check if this ICMP message allows revert of backoff. 
561f7456642SEric Dumazet * (see RFC 6069) 562f7456642SEric Dumazet */ 563f7456642SEric Dumazet if (!fastopen && 564f7456642SEric Dumazet (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH)) 565f7456642SEric Dumazet tcp_ld_RTO_revert(sk, seq); 5661da177e4SLinus Torvalds break; 5671da177e4SLinus Torvalds case ICMP_TIME_EXCEEDED: 5681da177e4SLinus Torvalds err = EHOSTUNREACH; 5691da177e4SLinus Torvalds break; 5701da177e4SLinus Torvalds default: 5711da177e4SLinus Torvalds goto out; 5721da177e4SLinus Torvalds } 5731da177e4SLinus Torvalds 5741da177e4SLinus Torvalds switch (sk->sk_state) { 5751da177e4SLinus Torvalds case TCP_SYN_SENT: 5760a672f74SYuchung Cheng case TCP_SYN_RECV: 5770a672f74SYuchung Cheng /* Only in fast or simultaneous open. If a fast open socket is 5780a672f74SYuchung Cheng * is already accepted it is treated as a connected one below. 5791da177e4SLinus Torvalds */ 58051456b29SIan Morris if (fastopen && !fastopen->sk) 5810a672f74SYuchung Cheng break; 5820a672f74SYuchung Cheng 583a12daf13SEric Dumazet ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th); 58445af29caSEric Dumazet 5851da177e4SLinus Torvalds if (!sock_owned_by_user(sk)) { 5861da177e4SLinus Torvalds sk->sk_err = err; 5871da177e4SLinus Torvalds 5881da177e4SLinus Torvalds sk->sk_error_report(sk); 5891da177e4SLinus Torvalds 5901da177e4SLinus Torvalds tcp_done(sk); 5911da177e4SLinus Torvalds } else { 5921da177e4SLinus Torvalds sk->sk_err_soft = err; 5931da177e4SLinus Torvalds } 5941da177e4SLinus Torvalds goto out; 5951da177e4SLinus Torvalds } 5961da177e4SLinus Torvalds 5971da177e4SLinus Torvalds /* If we've already connected we will keep trying 5981da177e4SLinus Torvalds * until we time out, or the user gives up. 5991da177e4SLinus Torvalds * 6001da177e4SLinus Torvalds * rfc1122 4.2.3.9 allows to consider as hard errors 6011da177e4SLinus Torvalds * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too, 6021da177e4SLinus Torvalds * but it is obsoleted by pmtu discovery). 
6031da177e4SLinus Torvalds * 6041da177e4SLinus Torvalds * Note, that in modern internet, where routing is unreliable 6051da177e4SLinus Torvalds * and in each dark corner broken firewalls sit, sending random 6061da177e4SLinus Torvalds * errors ordered by their masters even this two messages finally lose 6071da177e4SLinus Torvalds * their original sense (even Linux sends invalid PORT_UNREACHs) 6081da177e4SLinus Torvalds * 6091da177e4SLinus Torvalds * Now we are in compliance with RFCs. 6101da177e4SLinus Torvalds * --ANK (980905) 6111da177e4SLinus Torvalds */ 6121da177e4SLinus Torvalds 6131da177e4SLinus Torvalds inet = inet_sk(sk); 6141da177e4SLinus Torvalds if (!sock_owned_by_user(sk) && inet->recverr) { 6151da177e4SLinus Torvalds sk->sk_err = err; 6161da177e4SLinus Torvalds sk->sk_error_report(sk); 6171da177e4SLinus Torvalds } else { /* Only an error on timeout */ 6181da177e4SLinus Torvalds sk->sk_err_soft = err; 6191da177e4SLinus Torvalds } 6201da177e4SLinus Torvalds 6211da177e4SLinus Torvalds out: 6221da177e4SLinus Torvalds bh_unlock_sock(sk); 6231da177e4SLinus Torvalds sock_put(sk); 62432bbd879SStefano Brivio return 0; 6251da177e4SLinus Torvalds } 6261da177e4SLinus Torvalds 62728850dc7SDaniel Borkmann void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr) 6281da177e4SLinus Torvalds { 629aa8223c7SArnaldo Carvalho de Melo struct tcphdr *th = tcp_hdr(skb); 6301da177e4SLinus Torvalds 631419f9f89SHerbert Xu th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0); 632663ead3bSHerbert Xu skb->csum_start = skb_transport_header(skb) - skb->head; 633ff1dcadbSAl Viro skb->csum_offset = offsetof(struct tcphdr, check); 6341da177e4SLinus Torvalds } 6351da177e4SLinus Torvalds 636419f9f89SHerbert Xu /* This routine computes an IPv4 TCP checksum. 
*/ 637bb296246SHerbert Xu void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb) 638419f9f89SHerbert Xu { 639cf533ea5SEric Dumazet const struct inet_sock *inet = inet_sk(sk); 640419f9f89SHerbert Xu 641419f9f89SHerbert Xu __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr); 642419f9f89SHerbert Xu } 6434bc2f18bSEric Dumazet EXPORT_SYMBOL(tcp_v4_send_check); 644419f9f89SHerbert Xu 6451da177e4SLinus Torvalds /* 6461da177e4SLinus Torvalds * This routine will send an RST to the other tcp. 6471da177e4SLinus Torvalds * 6481da177e4SLinus Torvalds * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.) 6491da177e4SLinus Torvalds * for reset. 6501da177e4SLinus Torvalds * Answer: if a packet caused RST, it is not for a socket 6511da177e4SLinus Torvalds * existing in our system, if it is matched to a socket, 6521da177e4SLinus Torvalds * it is just duplicate segment or bug in other side's TCP. 6531da177e4SLinus Torvalds * So that we build reply only basing on parameters 6541da177e4SLinus Torvalds * arrived with segment. 6551da177e4SLinus Torvalds * Exception: precedence violation. We do not implement it in any case. 
6561da177e4SLinus Torvalds */ 6571da177e4SLinus Torvalds 658a00e7444SEric Dumazet static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) 6591da177e4SLinus Torvalds { 660cf533ea5SEric Dumazet const struct tcphdr *th = tcp_hdr(skb); 661cfb6eeb4SYOSHIFUJI Hideaki struct { 662cfb6eeb4SYOSHIFUJI Hideaki struct tcphdr th; 663cfb6eeb4SYOSHIFUJI Hideaki #ifdef CONFIG_TCP_MD5SIG 664714e85beSAl Viro __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)]; 665cfb6eeb4SYOSHIFUJI Hideaki #endif 666cfb6eeb4SYOSHIFUJI Hideaki } rep; 6671da177e4SLinus Torvalds struct ip_reply_arg arg; 668cfb6eeb4SYOSHIFUJI Hideaki #ifdef CONFIG_TCP_MD5SIG 669e46787f0SFlorian Westphal struct tcp_md5sig_key *key = NULL; 670658ddaafSShawn Lu const __u8 *hash_location = NULL; 671658ddaafSShawn Lu unsigned char newhash[16]; 672658ddaafSShawn Lu int genhash; 673658ddaafSShawn Lu struct sock *sk1 = NULL; 674cfb6eeb4SYOSHIFUJI Hideaki #endif 675d6fb396cSEric Dumazet u64 transmit_time = 0; 67600483690SJon Maxwell struct sock *ctl_sk; 677d6fb396cSEric Dumazet struct net *net; 6781da177e4SLinus Torvalds 6791da177e4SLinus Torvalds /* Never send a reset in response to a reset. */ 6801da177e4SLinus Torvalds if (th->rst) 6811da177e4SLinus Torvalds return; 6821da177e4SLinus Torvalds 683c3658e8dSEric Dumazet /* If sk not NULL, it means we did a successful lookup and incoming 684c3658e8dSEric Dumazet * route had to be correct. prequeue might have dropped our dst. 685c3658e8dSEric Dumazet */ 686c3658e8dSEric Dumazet if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL) 6871da177e4SLinus Torvalds return; 6881da177e4SLinus Torvalds 6891da177e4SLinus Torvalds /* Swap the send and the receive. 
*/ 690cfb6eeb4SYOSHIFUJI Hideaki memset(&rep, 0, sizeof(rep)); 691cfb6eeb4SYOSHIFUJI Hideaki rep.th.dest = th->source; 692cfb6eeb4SYOSHIFUJI Hideaki rep.th.source = th->dest; 693cfb6eeb4SYOSHIFUJI Hideaki rep.th.doff = sizeof(struct tcphdr) / 4; 694cfb6eeb4SYOSHIFUJI Hideaki rep.th.rst = 1; 6951da177e4SLinus Torvalds 6961da177e4SLinus Torvalds if (th->ack) { 697cfb6eeb4SYOSHIFUJI Hideaki rep.th.seq = th->ack_seq; 6981da177e4SLinus Torvalds } else { 699cfb6eeb4SYOSHIFUJI Hideaki rep.th.ack = 1; 700cfb6eeb4SYOSHIFUJI Hideaki rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin + 7011da177e4SLinus Torvalds skb->len - (th->doff << 2)); 7021da177e4SLinus Torvalds } 7031da177e4SLinus Torvalds 7047174259eSArnaldo Carvalho de Melo memset(&arg, 0, sizeof(arg)); 705cfb6eeb4SYOSHIFUJI Hideaki arg.iov[0].iov_base = (unsigned char *)&rep; 706cfb6eeb4SYOSHIFUJI Hideaki arg.iov[0].iov_len = sizeof(rep.th); 707cfb6eeb4SYOSHIFUJI Hideaki 7080f85feaeSEric Dumazet net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev); 709cfb6eeb4SYOSHIFUJI Hideaki #ifdef CONFIG_TCP_MD5SIG 7103b24d854SEric Dumazet rcu_read_lock(); 711658ddaafSShawn Lu hash_location = tcp_parse_md5sig_option(th); 712271c3b9bSFlorian Westphal if (sk && sk_fullsock(sk)) { 713cea97609SDavid Ahern const union tcp_md5_addr *addr; 714dea53bb8SDavid Ahern int l3index; 715cea97609SDavid Ahern 716dea53bb8SDavid Ahern /* sdif set, means packet ingressed via a device 717dea53bb8SDavid Ahern * in an L3 domain and inet_iif is set to it. 718dea53bb8SDavid Ahern */ 719dea53bb8SDavid Ahern l3index = tcp_v4_sdif(skb) ? 
inet_iif(skb) : 0; 720cea97609SDavid Ahern addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 721dea53bb8SDavid Ahern key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 722e46787f0SFlorian Westphal } else if (hash_location) { 723cea97609SDavid Ahern const union tcp_md5_addr *addr; 724534322caSDavid Ahern int sdif = tcp_v4_sdif(skb); 725534322caSDavid Ahern int dif = inet_iif(skb); 726dea53bb8SDavid Ahern int l3index; 727cea97609SDavid Ahern 728658ddaafSShawn Lu /* 729658ddaafSShawn Lu * active side is lost. Try to find listening socket through 730658ddaafSShawn Lu * source port, and then find md5 key through listening socket. 731658ddaafSShawn Lu * we are not loose security here: 732658ddaafSShawn Lu * Incoming packet is checked with md5 hash with finding key, 733658ddaafSShawn Lu * no RST generated if md5 hash doesn't match. 734658ddaafSShawn Lu */ 735a583636aSCraig Gallek sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0, 736a583636aSCraig Gallek ip_hdr(skb)->saddr, 737da5e3630STom Herbert th->source, ip_hdr(skb)->daddr, 738534322caSDavid Ahern ntohs(th->source), dif, sdif); 739658ddaafSShawn Lu /* don't send rst if it can't find key */ 740658ddaafSShawn Lu if (!sk1) 7413b24d854SEric Dumazet goto out; 7423b24d854SEric Dumazet 743dea53bb8SDavid Ahern /* sdif set, means packet ingressed via a device 744dea53bb8SDavid Ahern * in an L3 domain and dif is set to it. 745dea53bb8SDavid Ahern */ 746dea53bb8SDavid Ahern l3index = sdif ? 
dif : 0; 747cea97609SDavid Ahern addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 748dea53bb8SDavid Ahern key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET); 749658ddaafSShawn Lu if (!key) 7503b24d854SEric Dumazet goto out; 7513b24d854SEric Dumazet 752658ddaafSShawn Lu 75339f8e58eSEric Dumazet genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb); 754658ddaafSShawn Lu if (genhash || memcmp(hash_location, newhash, 16) != 0) 7553b24d854SEric Dumazet goto out; 7563b24d854SEric Dumazet 757658ddaafSShawn Lu } 758658ddaafSShawn Lu 759cfb6eeb4SYOSHIFUJI Hideaki if (key) { 760cfb6eeb4SYOSHIFUJI Hideaki rep.opt[0] = htonl((TCPOPT_NOP << 24) | 761cfb6eeb4SYOSHIFUJI Hideaki (TCPOPT_NOP << 16) | 762cfb6eeb4SYOSHIFUJI Hideaki (TCPOPT_MD5SIG << 8) | 763cfb6eeb4SYOSHIFUJI Hideaki TCPOLEN_MD5SIG); 764cfb6eeb4SYOSHIFUJI Hideaki /* Update length and the length the header thinks exists */ 765cfb6eeb4SYOSHIFUJI Hideaki arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; 766cfb6eeb4SYOSHIFUJI Hideaki rep.th.doff = arg.iov[0].iov_len / 4; 767cfb6eeb4SYOSHIFUJI Hideaki 76849a72dfbSAdam Langley tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1], 76978e645cbSIlpo Järvinen key, ip_hdr(skb)->saddr, 77078e645cbSIlpo Järvinen ip_hdr(skb)->daddr, &rep.th); 771cfb6eeb4SYOSHIFUJI Hideaki } 772cfb6eeb4SYOSHIFUJI Hideaki #endif 773eddc9ec5SArnaldo Carvalho de Melo arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 774eddc9ec5SArnaldo Carvalho de Melo ip_hdr(skb)->saddr, /* XXX */ 77552cd5750SIlpo Järvinen arg.iov[0].iov_len, IPPROTO_TCP, 0); 7761da177e4SLinus Torvalds arg.csumoffset = offsetof(struct tcphdr, check) / 2; 777271c3b9bSFlorian Westphal arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0; 778271c3b9bSFlorian Westphal 779e2446eaaSShawn Lu /* When socket is gone, all binding information is lost. 7804c675258SAlexey Kuznetsov * routing might fail in this case. 
No choice here, if we choose to force 7814c675258SAlexey Kuznetsov * input interface, we will misroute in case of asymmetric route. 782e2446eaaSShawn Lu */ 783c24b14c4SSong Liu if (sk) { 7844c675258SAlexey Kuznetsov arg.bound_dev_if = sk->sk_bound_dev_if; 7855c487bb9SSong Liu if (sk_fullsock(sk)) 786c24b14c4SSong Liu trace_tcp_send_reset(sk, skb); 787c24b14c4SSong Liu } 7881da177e4SLinus Torvalds 789271c3b9bSFlorian Westphal BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) != 790271c3b9bSFlorian Westphal offsetof(struct inet_timewait_sock, tw_bound_dev_if)); 791271c3b9bSFlorian Westphal 79266b13d99SEric Dumazet arg.tos = ip_hdr(skb)->tos; 793e2d118a1SLorenzo Colitti arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); 79447dcc20aSEric Dumazet local_bh_disable(); 7955472c3c6SEric Dumazet ctl_sk = this_cpu_read(*net->ipv4.tcp_sk); 796a842fe14SEric Dumazet if (sk) { 79700483690SJon Maxwell ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 79800483690SJon Maxwell inet_twsk(sk)->tw_mark : sk->sk_mark; 799f6c0f5d2SEric Dumazet ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? 
800f6c0f5d2SEric Dumazet inet_twsk(sk)->tw_priority : sk->sk_priority; 801d6fb396cSEric Dumazet transmit_time = tcp_transmit_time(sk); 802a842fe14SEric Dumazet } 80300483690SJon Maxwell ip_send_unicast_reply(ctl_sk, 804bdbbb852SEric Dumazet skb, &TCP_SKB_CB(skb)->header.h4.opt, 80524a2d43dSEric Dumazet ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 806d6fb396cSEric Dumazet &arg, arg.iov[0].iov_len, 807d6fb396cSEric Dumazet transmit_time); 8081da177e4SLinus Torvalds 80900483690SJon Maxwell ctl_sk->sk_mark = 0; 81090bbcc60SEric Dumazet __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); 81190bbcc60SEric Dumazet __TCP_INC_STATS(net, TCP_MIB_OUTRSTS); 81247dcc20aSEric Dumazet local_bh_enable(); 813658ddaafSShawn Lu 814658ddaafSShawn Lu #ifdef CONFIG_TCP_MD5SIG 8153b24d854SEric Dumazet out: 816658ddaafSShawn Lu rcu_read_unlock(); 817658ddaafSShawn Lu #endif 8181da177e4SLinus Torvalds } 8191da177e4SLinus Torvalds 8201da177e4SLinus Torvalds /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states 8211da177e4SLinus Torvalds outside socket context is ugly, certainly. What can I do? 
8221da177e4SLinus Torvalds */ 8231da177e4SLinus Torvalds 824e2d118a1SLorenzo Colitti static void tcp_v4_send_ack(const struct sock *sk, 825e62a123bSEric Dumazet struct sk_buff *skb, u32 seq, u32 ack, 826ee684b6fSAndrey Vagin u32 win, u32 tsval, u32 tsecr, int oif, 82788ef4a5aSKOVACS Krisztian struct tcp_md5sig_key *key, 82866b13d99SEric Dumazet int reply_flags, u8 tos) 8291da177e4SLinus Torvalds { 830cf533ea5SEric Dumazet const struct tcphdr *th = tcp_hdr(skb); 8311da177e4SLinus Torvalds struct { 8321da177e4SLinus Torvalds struct tcphdr th; 833714e85beSAl Viro __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2) 834cfb6eeb4SYOSHIFUJI Hideaki #ifdef CONFIG_TCP_MD5SIG 835cfb6eeb4SYOSHIFUJI Hideaki + (TCPOLEN_MD5SIG_ALIGNED >> 2) 836cfb6eeb4SYOSHIFUJI Hideaki #endif 837cfb6eeb4SYOSHIFUJI Hideaki ]; 8381da177e4SLinus Torvalds } rep; 839e2d118a1SLorenzo Colitti struct net *net = sock_net(sk); 8401da177e4SLinus Torvalds struct ip_reply_arg arg; 84100483690SJon Maxwell struct sock *ctl_sk; 842d6fb396cSEric Dumazet u64 transmit_time; 8431da177e4SLinus Torvalds 8441da177e4SLinus Torvalds memset(&rep.th, 0, sizeof(struct tcphdr)); 8457174259eSArnaldo Carvalho de Melo memset(&arg, 0, sizeof(arg)); 8461da177e4SLinus Torvalds 8471da177e4SLinus Torvalds arg.iov[0].iov_base = (unsigned char *)&rep; 8481da177e4SLinus Torvalds arg.iov[0].iov_len = sizeof(rep.th); 849ee684b6fSAndrey Vagin if (tsecr) { 850cfb6eeb4SYOSHIFUJI Hideaki rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | 8511da177e4SLinus Torvalds (TCPOPT_TIMESTAMP << 8) | 8521da177e4SLinus Torvalds TCPOLEN_TIMESTAMP); 853ee684b6fSAndrey Vagin rep.opt[1] = htonl(tsval); 854ee684b6fSAndrey Vagin rep.opt[2] = htonl(tsecr); 855cb48cfe8SCraig Schlenter arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED; 8561da177e4SLinus Torvalds } 8571da177e4SLinus Torvalds 8581da177e4SLinus Torvalds /* Swap the send and the receive. 
*/ 8591da177e4SLinus Torvalds rep.th.dest = th->source; 8601da177e4SLinus Torvalds rep.th.source = th->dest; 8611da177e4SLinus Torvalds rep.th.doff = arg.iov[0].iov_len / 4; 8621da177e4SLinus Torvalds rep.th.seq = htonl(seq); 8631da177e4SLinus Torvalds rep.th.ack_seq = htonl(ack); 8641da177e4SLinus Torvalds rep.th.ack = 1; 8651da177e4SLinus Torvalds rep.th.window = htons(win); 8661da177e4SLinus Torvalds 867cfb6eeb4SYOSHIFUJI Hideaki #ifdef CONFIG_TCP_MD5SIG 868cfb6eeb4SYOSHIFUJI Hideaki if (key) { 869ee684b6fSAndrey Vagin int offset = (tsecr) ? 3 : 0; 870cfb6eeb4SYOSHIFUJI Hideaki 871cfb6eeb4SYOSHIFUJI Hideaki rep.opt[offset++] = htonl((TCPOPT_NOP << 24) | 872cfb6eeb4SYOSHIFUJI Hideaki (TCPOPT_NOP << 16) | 873cfb6eeb4SYOSHIFUJI Hideaki (TCPOPT_MD5SIG << 8) | 874cfb6eeb4SYOSHIFUJI Hideaki TCPOLEN_MD5SIG); 875cfb6eeb4SYOSHIFUJI Hideaki arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; 876cfb6eeb4SYOSHIFUJI Hideaki rep.th.doff = arg.iov[0].iov_len/4; 877cfb6eeb4SYOSHIFUJI Hideaki 87849a72dfbSAdam Langley tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset], 87990b7e112SAdam Langley key, ip_hdr(skb)->saddr, 88090b7e112SAdam Langley ip_hdr(skb)->daddr, &rep.th); 881cfb6eeb4SYOSHIFUJI Hideaki } 882cfb6eeb4SYOSHIFUJI Hideaki #endif 88388ef4a5aSKOVACS Krisztian arg.flags = reply_flags; 884eddc9ec5SArnaldo Carvalho de Melo arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 885eddc9ec5SArnaldo Carvalho de Melo ip_hdr(skb)->saddr, /* XXX */ 8861da177e4SLinus Torvalds arg.iov[0].iov_len, IPPROTO_TCP, 0); 8871da177e4SLinus Torvalds arg.csumoffset = offsetof(struct tcphdr, check) / 2; 8889501f972SYOSHIFUJI Hideaki if (oif) 8899501f972SYOSHIFUJI Hideaki arg.bound_dev_if = oif; 89066b13d99SEric Dumazet arg.tos = tos; 891e2d118a1SLorenzo Colitti arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL); 89247dcc20aSEric Dumazet local_bh_disable(); 8935472c3c6SEric Dumazet ctl_sk = this_cpu_read(*net->ipv4.tcp_sk); 89400483690SJon Maxwell ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 
89500483690SJon Maxwell inet_twsk(sk)->tw_mark : sk->sk_mark; 896f6c0f5d2SEric Dumazet ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? 897f6c0f5d2SEric Dumazet inet_twsk(sk)->tw_priority : sk->sk_priority; 898d6fb396cSEric Dumazet transmit_time = tcp_transmit_time(sk); 89900483690SJon Maxwell ip_send_unicast_reply(ctl_sk, 900bdbbb852SEric Dumazet skb, &TCP_SKB_CB(skb)->header.h4.opt, 90124a2d43dSEric Dumazet ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 902d6fb396cSEric Dumazet &arg, arg.iov[0].iov_len, 903d6fb396cSEric Dumazet transmit_time); 9041da177e4SLinus Torvalds 90500483690SJon Maxwell ctl_sk->sk_mark = 0; 90690bbcc60SEric Dumazet __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); 90747dcc20aSEric Dumazet local_bh_enable(); 9081da177e4SLinus Torvalds } 9091da177e4SLinus Torvalds 9101da177e4SLinus Torvalds static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) 9111da177e4SLinus Torvalds { 9128feaf0c0SArnaldo Carvalho de Melo struct inet_timewait_sock *tw = inet_twsk(sk); 913cfb6eeb4SYOSHIFUJI Hideaki struct tcp_timewait_sock *tcptw = tcp_twsk(sk); 9141da177e4SLinus Torvalds 915e2d118a1SLorenzo Colitti tcp_v4_send_ack(sk, skb, 916e62a123bSEric Dumazet tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, 9177174259eSArnaldo Carvalho de Melo tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, 9189a568de4SEric Dumazet tcp_time_stamp_raw() + tcptw->tw_ts_offset, 9199501f972SYOSHIFUJI Hideaki tcptw->tw_ts_recent, 9209501f972SYOSHIFUJI Hideaki tw->tw_bound_dev_if, 92188ef4a5aSKOVACS Krisztian tcp_twsk_md5_key(tcptw), 92266b13d99SEric Dumazet tw->tw_transparent ? 
IP_REPLY_ARG_NOSRCCHECK : 0, 92366b13d99SEric Dumazet tw->tw_tos 9249501f972SYOSHIFUJI Hideaki ); 9251da177e4SLinus Torvalds 9268feaf0c0SArnaldo Carvalho de Melo inet_twsk_put(tw); 9271da177e4SLinus Torvalds } 9281da177e4SLinus Torvalds 929a00e7444SEric Dumazet static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, 9307174259eSArnaldo Carvalho de Melo struct request_sock *req) 9311da177e4SLinus Torvalds { 932cea97609SDavid Ahern const union tcp_md5_addr *addr; 933dea53bb8SDavid Ahern int l3index; 934cea97609SDavid Ahern 935168a8f58SJerry Chu /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV 936168a8f58SJerry Chu * sk->sk_state == TCP_SYN_RECV -> for Fast Open. 937168a8f58SJerry Chu */ 938e62a123bSEric Dumazet u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 : 939e62a123bSEric Dumazet tcp_sk(sk)->snd_nxt; 940e62a123bSEric Dumazet 94120a2b49fSEric Dumazet /* RFC 7323 2.3 94220a2b49fSEric Dumazet * The window field (SEG.WND) of every outgoing segment, with the 94320a2b49fSEric Dumazet * exception of <SYN> segments, MUST be right-shifted by 94420a2b49fSEric Dumazet * Rcv.Wind.Shift bits: 94520a2b49fSEric Dumazet */ 946cea97609SDavid Ahern addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; 947dea53bb8SDavid Ahern l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; 948e2d118a1SLorenzo Colitti tcp_v4_send_ack(sk, skb, seq, 94920a2b49fSEric Dumazet tcp_rsk(req)->rcv_nxt, 95020a2b49fSEric Dumazet req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, 9519a568de4SEric Dumazet tcp_time_stamp_raw() + tcp_rsk(req)->ts_off, 9529501f972SYOSHIFUJI Hideaki req->ts_recent, 9539501f972SYOSHIFUJI Hideaki 0, 954dea53bb8SDavid Ahern tcp_md5_do_lookup(sk, l3index, addr, AF_INET), 95566b13d99SEric Dumazet inet_rsk(req)->no_srccheck ? 
IP_REPLY_ARG_NOSRCCHECK : 0, 95666b13d99SEric Dumazet ip_hdr(skb)->tos); 9571da177e4SLinus Torvalds } 9581da177e4SLinus Torvalds 9591da177e4SLinus Torvalds /* 9609bf1d83eSKris Katterjohn * Send a SYN-ACK after having received a SYN. 96160236fddSArnaldo Carvalho de Melo * This still operates on a request_sock only, not on a big 9621da177e4SLinus Torvalds * socket. 9631da177e4SLinus Torvalds */ 9640f935dbeSEric Dumazet static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, 965d6274bd8SOctavian Purdila struct flowi *fl, 966e6b4d113SWilliam Allen Simpson struct request_sock *req, 967ca6fb065SEric Dumazet struct tcp_fastopen_cookie *foc, 968b3d05147SEric Dumazet enum tcp_synack_type synack_type) 9691da177e4SLinus Torvalds { 9702e6599cbSArnaldo Carvalho de Melo const struct inet_request_sock *ireq = inet_rsk(req); 9716bd023f3SDavid S. Miller struct flowi4 fl4; 9721da177e4SLinus Torvalds int err = -1; 9731da177e4SLinus Torvalds struct sk_buff *skb; 9741da177e4SLinus Torvalds 9751da177e4SLinus Torvalds /* First, grab a route. */ 976ba3f7f04SDavid S. Miller if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) 977fd80eb94SDenis V. 
Lunev return -1; 9781da177e4SLinus Torvalds 979b3d05147SEric Dumazet skb = tcp_make_synack(sk, dst, req, foc, synack_type); 9801da177e4SLinus Torvalds 9811da177e4SLinus Torvalds if (skb) { 982634fb979SEric Dumazet __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); 9831da177e4SLinus Torvalds 9842ab2ddd3SEric Dumazet rcu_read_lock(); 985634fb979SEric Dumazet err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, 986634fb979SEric Dumazet ireq->ir_rmt_addr, 9872ab2ddd3SEric Dumazet rcu_dereference(ireq->ireq_opt)); 9882ab2ddd3SEric Dumazet rcu_read_unlock(); 989b9df3cb8SGerrit Renker err = net_xmit_eval(err); 9901da177e4SLinus Torvalds } 9911da177e4SLinus Torvalds 9921da177e4SLinus Torvalds return err; 9931da177e4SLinus Torvalds } 9941da177e4SLinus Torvalds 9951da177e4SLinus Torvalds /* 99660236fddSArnaldo Carvalho de Melo * IPv4 request_sock destructor. 9971da177e4SLinus Torvalds */ 99860236fddSArnaldo Carvalho de Melo static void tcp_v4_reqsk_destructor(struct request_sock *req) 9991da177e4SLinus Torvalds { 1000c92e8c02SEric Dumazet kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1)); 10011da177e4SLinus Torvalds } 10021da177e4SLinus Torvalds 1003cfb6eeb4SYOSHIFUJI Hideaki #ifdef CONFIG_TCP_MD5SIG 1004cfb6eeb4SYOSHIFUJI Hideaki /* 1005cfb6eeb4SYOSHIFUJI Hideaki * RFC2385 MD5 checksumming requires a mapping of 1006cfb6eeb4SYOSHIFUJI Hideaki * IP address->MD5 Key. 1007cfb6eeb4SYOSHIFUJI Hideaki * We need to maintain these in the sk structure. 1008cfb6eeb4SYOSHIFUJI Hideaki */ 1009cfb6eeb4SYOSHIFUJI Hideaki 1010921f9a0fSEric Dumazet DEFINE_STATIC_KEY_FALSE(tcp_md5_needed); 10116015c71eSEric Dumazet EXPORT_SYMBOL(tcp_md5_needed); 10126015c71eSEric Dumazet 1013cfb6eeb4SYOSHIFUJI Hideaki /* Find the Key structure for an address. 
*/ 1014dea53bb8SDavid Ahern struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index, 1015a915da9bSEric Dumazet const union tcp_md5_addr *addr, 1016a915da9bSEric Dumazet int family) 1017cfb6eeb4SYOSHIFUJI Hideaki { 1018fd3a154aSEric Dumazet const struct tcp_sock *tp = tcp_sk(sk); 1019a915da9bSEric Dumazet struct tcp_md5sig_key *key; 1020fd3a154aSEric Dumazet const struct tcp_md5sig_info *md5sig; 10216797318eSIvan Delalande __be32 mask; 10226797318eSIvan Delalande struct tcp_md5sig_key *best_match = NULL; 10236797318eSIvan Delalande bool match; 1024cfb6eeb4SYOSHIFUJI Hideaki 1025a8afca03SEric Dumazet /* caller either holds rcu_read_lock() or socket lock */ 1026a8afca03SEric Dumazet md5sig = rcu_dereference_check(tp->md5sig_info, 10271e1d04e6SHannes Frederic Sowa lockdep_sock_is_held(sk)); 1028a8afca03SEric Dumazet if (!md5sig) 1029cfb6eeb4SYOSHIFUJI Hideaki return NULL; 1030083a0326SArnd Bergmann 1031c8b91770SAmol Grover hlist_for_each_entry_rcu(key, &md5sig->head, node, 1032c8b91770SAmol Grover lockdep_sock_is_held(sk)) { 1033a915da9bSEric Dumazet if (key->family != family) 1034a915da9bSEric Dumazet continue; 1035dea53bb8SDavid Ahern if (key->l3index && key->l3index != l3index) 1036dea53bb8SDavid Ahern continue; 10376797318eSIvan Delalande if (family == AF_INET) { 10386797318eSIvan Delalande mask = inet_make_mask(key->prefixlen); 10396797318eSIvan Delalande match = (key->addr.a4.s_addr & mask) == 10406797318eSIvan Delalande (addr->a4.s_addr & mask); 10416797318eSIvan Delalande #if IS_ENABLED(CONFIG_IPV6) 10426797318eSIvan Delalande } else if (family == AF_INET6) { 10436797318eSIvan Delalande match = ipv6_prefix_equal(&key->addr.a6, &addr->a6, 10446797318eSIvan Delalande key->prefixlen); 10456797318eSIvan Delalande #endif 10466797318eSIvan Delalande } else { 10476797318eSIvan Delalande match = false; 10486797318eSIvan Delalande } 10496797318eSIvan Delalande 10506797318eSIvan Delalande if (match && (!best_match || 10516797318eSIvan Delalande 
key->prefixlen > best_match->prefixlen)) 10526797318eSIvan Delalande best_match = key; 10536797318eSIvan Delalande } 10546797318eSIvan Delalande return best_match; 10556797318eSIvan Delalande } 10566015c71eSEric Dumazet EXPORT_SYMBOL(__tcp_md5_do_lookup); 10576797318eSIvan Delalande 1058e8f37d57SWu Fengguang static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk, 10596797318eSIvan Delalande const union tcp_md5_addr *addr, 1060dea53bb8SDavid Ahern int family, u8 prefixlen, 1061dea53bb8SDavid Ahern int l3index) 10626797318eSIvan Delalande { 10636797318eSIvan Delalande const struct tcp_sock *tp = tcp_sk(sk); 10646797318eSIvan Delalande struct tcp_md5sig_key *key; 10656797318eSIvan Delalande unsigned int size = sizeof(struct in_addr); 10666797318eSIvan Delalande const struct tcp_md5sig_info *md5sig; 10676797318eSIvan Delalande 10686797318eSIvan Delalande /* caller either holds rcu_read_lock() or socket lock */ 10696797318eSIvan Delalande md5sig = rcu_dereference_check(tp->md5sig_info, 10706797318eSIvan Delalande lockdep_sock_is_held(sk)); 10716797318eSIvan Delalande if (!md5sig) 10726797318eSIvan Delalande return NULL; 10736797318eSIvan Delalande #if IS_ENABLED(CONFIG_IPV6) 10746797318eSIvan Delalande if (family == AF_INET6) 10756797318eSIvan Delalande size = sizeof(struct in6_addr); 10766797318eSIvan Delalande #endif 1077c8b91770SAmol Grover hlist_for_each_entry_rcu(key, &md5sig->head, node, 1078c8b91770SAmol Grover lockdep_sock_is_held(sk)) { 10796797318eSIvan Delalande if (key->family != family) 10806797318eSIvan Delalande continue; 1081dea53bb8SDavid Ahern if (key->l3index && key->l3index != l3index) 1082dea53bb8SDavid Ahern continue; 10836797318eSIvan Delalande if (!memcmp(&key->addr, addr, size) && 10846797318eSIvan Delalande key->prefixlen == prefixlen) 1085a915da9bSEric Dumazet return key; 1086cfb6eeb4SYOSHIFUJI Hideaki } 1087cfb6eeb4SYOSHIFUJI Hideaki return NULL; 1088cfb6eeb4SYOSHIFUJI Hideaki } 1089cfb6eeb4SYOSHIFUJI Hideaki 
1090b83e3debSEric Dumazet struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, 1091fd3a154aSEric Dumazet const struct sock *addr_sk) 1092cfb6eeb4SYOSHIFUJI Hideaki { 1093b52e6921SEric Dumazet const union tcp_md5_addr *addr; 1094dea53bb8SDavid Ahern int l3index; 1095a915da9bSEric Dumazet 1096dea53bb8SDavid Ahern l3index = l3mdev_master_ifindex_by_index(sock_net(sk), 1097dea53bb8SDavid Ahern addr_sk->sk_bound_dev_if); 1098b52e6921SEric Dumazet addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr; 1099dea53bb8SDavid Ahern return tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 1100cfb6eeb4SYOSHIFUJI Hideaki } 1101cfb6eeb4SYOSHIFUJI Hideaki EXPORT_SYMBOL(tcp_v4_md5_lookup); 1102cfb6eeb4SYOSHIFUJI Hideaki 1103cfb6eeb4SYOSHIFUJI Hideaki /* This can be called on a newly created socket, from other files */ 1104a915da9bSEric Dumazet int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, 1105dea53bb8SDavid Ahern int family, u8 prefixlen, int l3index, 1106dea53bb8SDavid Ahern const u8 *newkey, u8 newkeylen, gfp_t gfp) 1107cfb6eeb4SYOSHIFUJI Hideaki { 1108cfb6eeb4SYOSHIFUJI Hideaki /* Add Key to the list */ 1109b0a713e9SMatthias M. Dellweg struct tcp_md5sig_key *key; 1110cfb6eeb4SYOSHIFUJI Hideaki struct tcp_sock *tp = tcp_sk(sk); 1111f6685938SArnaldo Carvalho de Melo struct tcp_md5sig_info *md5sig; 1112f6685938SArnaldo Carvalho de Melo 1113dea53bb8SDavid Ahern key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index); 1114a915da9bSEric Dumazet if (key) { 1115e6ced831SEric Dumazet /* Pre-existing entry - just update that one. 1116e6ced831SEric Dumazet * Note that the key might be used concurrently. 1117e6ced831SEric Dumazet * data_race() is telling kcsan that we do not care of 1118e6ced831SEric Dumazet * key mismatches, since changing MD5 key on live flows 1119e6ced831SEric Dumazet * can lead to packet drops. 
1120e6ced831SEric Dumazet */ 1121e6ced831SEric Dumazet data_race(memcpy(key->key, newkey, newkeylen)); 11226a2febecSEric Dumazet 1123e6ced831SEric Dumazet /* Pairs with READ_ONCE() in tcp_md5_hash_key(). 1124e6ced831SEric Dumazet * Also note that a reader could catch new key->keylen value 1125e6ced831SEric Dumazet * but old key->key[], this is the reason we use __GFP_ZERO 1126e6ced831SEric Dumazet * at sock_kmalloc() time below these lines. 1127e6ced831SEric Dumazet */ 1128e6ced831SEric Dumazet WRITE_ONCE(key->keylen, newkeylen); 11296a2febecSEric Dumazet 1130a915da9bSEric Dumazet return 0; 1131cfb6eeb4SYOSHIFUJI Hideaki } 1132260fcbebSYan, Zheng 1133a8afca03SEric Dumazet md5sig = rcu_dereference_protected(tp->md5sig_info, 11341e1d04e6SHannes Frederic Sowa lockdep_sock_is_held(sk)); 1135a915da9bSEric Dumazet if (!md5sig) { 1136a915da9bSEric Dumazet md5sig = kmalloc(sizeof(*md5sig), gfp); 1137a915da9bSEric Dumazet if (!md5sig) 1138a915da9bSEric Dumazet return -ENOMEM; 1139a915da9bSEric Dumazet 1140a915da9bSEric Dumazet sk_nocaps_add(sk, NETIF_F_GSO_MASK); 1141a915da9bSEric Dumazet INIT_HLIST_HEAD(&md5sig->head); 1142a8afca03SEric Dumazet rcu_assign_pointer(tp->md5sig_info, md5sig); 1143a915da9bSEric Dumazet } 1144a915da9bSEric Dumazet 1145e6ced831SEric Dumazet key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO); 1146a915da9bSEric Dumazet if (!key) 1147a915da9bSEric Dumazet return -ENOMEM; 114871cea17eSEric Dumazet if (!tcp_alloc_md5sig_pool()) { 11495f3d9cb2SEric Dumazet sock_kfree_s(sk, key, sizeof(*key)); 1150cfb6eeb4SYOSHIFUJI Hideaki return -ENOMEM; 1151cfb6eeb4SYOSHIFUJI Hideaki } 1152f6685938SArnaldo Carvalho de Melo 1153a915da9bSEric Dumazet memcpy(key->key, newkey, newkeylen); 1154a915da9bSEric Dumazet key->keylen = newkeylen; 1155a915da9bSEric Dumazet key->family = family; 11566797318eSIvan Delalande key->prefixlen = prefixlen; 1157dea53bb8SDavid Ahern key->l3index = l3index; 1158a915da9bSEric Dumazet memcpy(&key->addr, addr, 1159a915da9bSEric Dumazet 
(family == AF_INET6) ? sizeof(struct in6_addr) : 1160a915da9bSEric Dumazet sizeof(struct in_addr)); 1161a915da9bSEric Dumazet hlist_add_head_rcu(&key->node, &md5sig->head); 1162cfb6eeb4SYOSHIFUJI Hideaki return 0; 1163cfb6eeb4SYOSHIFUJI Hideaki } 1164a915da9bSEric Dumazet EXPORT_SYMBOL(tcp_md5_do_add); 1165cfb6eeb4SYOSHIFUJI Hideaki 11666797318eSIvan Delalande int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family, 1167dea53bb8SDavid Ahern u8 prefixlen, int l3index) 1168cfb6eeb4SYOSHIFUJI Hideaki { 1169a915da9bSEric Dumazet struct tcp_md5sig_key *key; 1170cfb6eeb4SYOSHIFUJI Hideaki 1171dea53bb8SDavid Ahern key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index); 1172a915da9bSEric Dumazet if (!key) 1173cfb6eeb4SYOSHIFUJI Hideaki return -ENOENT; 1174a915da9bSEric Dumazet hlist_del_rcu(&key->node); 11755f3d9cb2SEric Dumazet atomic_sub(sizeof(*key), &sk->sk_omem_alloc); 1176a915da9bSEric Dumazet kfree_rcu(key, rcu); 1177a915da9bSEric Dumazet return 0; 1178cfb6eeb4SYOSHIFUJI Hideaki } 1179a915da9bSEric Dumazet EXPORT_SYMBOL(tcp_md5_do_del); 1180cfb6eeb4SYOSHIFUJI Hideaki 1181e0683e70Sstephen hemminger static void tcp_clear_md5_list(struct sock *sk) 1182cfb6eeb4SYOSHIFUJI Hideaki { 1183cfb6eeb4SYOSHIFUJI Hideaki struct tcp_sock *tp = tcp_sk(sk); 1184a915da9bSEric Dumazet struct tcp_md5sig_key *key; 1185b67bfe0dSSasha Levin struct hlist_node *n; 1186a8afca03SEric Dumazet struct tcp_md5sig_info *md5sig; 1187cfb6eeb4SYOSHIFUJI Hideaki 1188a8afca03SEric Dumazet md5sig = rcu_dereference_protected(tp->md5sig_info, 1); 1189a8afca03SEric Dumazet 1190b67bfe0dSSasha Levin hlist_for_each_entry_safe(key, n, &md5sig->head, node) { 1191a915da9bSEric Dumazet hlist_del_rcu(&key->node); 11925f3d9cb2SEric Dumazet atomic_sub(sizeof(*key), &sk->sk_omem_alloc); 1193a915da9bSEric Dumazet kfree_rcu(key, rcu); 1194cfb6eeb4SYOSHIFUJI Hideaki } 1195cfb6eeb4SYOSHIFUJI Hideaki } 1196cfb6eeb4SYOSHIFUJI Hideaki 11978917a777SIvan Delalande static int 
tcp_v4_parse_md5_keys(struct sock *sk, int optname, 1198d4c19c49SChristoph Hellwig sockptr_t optval, int optlen) 1199cfb6eeb4SYOSHIFUJI Hideaki { 1200cfb6eeb4SYOSHIFUJI Hideaki struct tcp_md5sig cmd; 1201cfb6eeb4SYOSHIFUJI Hideaki struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr; 1202cea97609SDavid Ahern const union tcp_md5_addr *addr; 12038917a777SIvan Delalande u8 prefixlen = 32; 1204dea53bb8SDavid Ahern int l3index = 0; 1205cfb6eeb4SYOSHIFUJI Hideaki 1206cfb6eeb4SYOSHIFUJI Hideaki if (optlen < sizeof(cmd)) 1207cfb6eeb4SYOSHIFUJI Hideaki return -EINVAL; 1208cfb6eeb4SYOSHIFUJI Hideaki 1209d4c19c49SChristoph Hellwig if (copy_from_sockptr(&cmd, optval, sizeof(cmd))) 1210cfb6eeb4SYOSHIFUJI Hideaki return -EFAULT; 1211cfb6eeb4SYOSHIFUJI Hideaki 1212cfb6eeb4SYOSHIFUJI Hideaki if (sin->sin_family != AF_INET) 1213cfb6eeb4SYOSHIFUJI Hideaki return -EINVAL; 1214cfb6eeb4SYOSHIFUJI Hideaki 12158917a777SIvan Delalande if (optname == TCP_MD5SIG_EXT && 12168917a777SIvan Delalande cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) { 12178917a777SIvan Delalande prefixlen = cmd.tcpm_prefixlen; 12188917a777SIvan Delalande if (prefixlen > 32) 12198917a777SIvan Delalande return -EINVAL; 12208917a777SIvan Delalande } 12218917a777SIvan Delalande 12226b102db5SDavid Ahern if (optname == TCP_MD5SIG_EXT && 12236b102db5SDavid Ahern cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) { 12246b102db5SDavid Ahern struct net_device *dev; 12256b102db5SDavid Ahern 12266b102db5SDavid Ahern rcu_read_lock(); 12276b102db5SDavid Ahern dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex); 12286b102db5SDavid Ahern if (dev && netif_is_l3_master(dev)) 12296b102db5SDavid Ahern l3index = dev->ifindex; 12306b102db5SDavid Ahern 12316b102db5SDavid Ahern rcu_read_unlock(); 12326b102db5SDavid Ahern 12336b102db5SDavid Ahern /* ok to reference set/not set outside of rcu; 12346b102db5SDavid Ahern * right now device MUST be an L3 master 12356b102db5SDavid Ahern */ 12366b102db5SDavid Ahern if (!dev || !l3index) 
12376b102db5SDavid Ahern return -EINVAL; 12386b102db5SDavid Ahern } 12396b102db5SDavid Ahern 1240cea97609SDavid Ahern addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr; 1241cea97609SDavid Ahern 124264a124edSDmitry Popov if (!cmd.tcpm_keylen) 1243dea53bb8SDavid Ahern return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index); 1244cfb6eeb4SYOSHIFUJI Hideaki 1245cfb6eeb4SYOSHIFUJI Hideaki if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) 1246cfb6eeb4SYOSHIFUJI Hideaki return -EINVAL; 1247cfb6eeb4SYOSHIFUJI Hideaki 1248dea53bb8SDavid Ahern return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, 1249cea97609SDavid Ahern cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL); 1250cfb6eeb4SYOSHIFUJI Hideaki } 1251cfb6eeb4SYOSHIFUJI Hideaki 125219689e38SEric Dumazet static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp, 125319689e38SEric Dumazet __be32 daddr, __be32 saddr, 125419689e38SEric Dumazet const struct tcphdr *th, int nbytes) 1255cfb6eeb4SYOSHIFUJI Hideaki { 1256cfb6eeb4SYOSHIFUJI Hideaki struct tcp4_pseudohdr *bp; 125749a72dfbSAdam Langley struct scatterlist sg; 125819689e38SEric Dumazet struct tcphdr *_th; 1259cfb6eeb4SYOSHIFUJI Hideaki 126019689e38SEric Dumazet bp = hp->scratch; 1261cfb6eeb4SYOSHIFUJI Hideaki bp->saddr = saddr; 1262cfb6eeb4SYOSHIFUJI Hideaki bp->daddr = daddr; 1263cfb6eeb4SYOSHIFUJI Hideaki bp->pad = 0; 1264076fb722SYOSHIFUJI Hideaki bp->protocol = IPPROTO_TCP; 126549a72dfbSAdam Langley bp->len = cpu_to_be16(nbytes); 1266c7da57a1SDavid S. 
Miller 126719689e38SEric Dumazet _th = (struct tcphdr *)(bp + 1); 126819689e38SEric Dumazet memcpy(_th, th, sizeof(*th)); 126919689e38SEric Dumazet _th->check = 0; 127019689e38SEric Dumazet 127119689e38SEric Dumazet sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th)); 127219689e38SEric Dumazet ahash_request_set_crypt(hp->md5_req, &sg, NULL, 127319689e38SEric Dumazet sizeof(*bp) + sizeof(*th)); 1274cf80e0e4SHerbert Xu return crypto_ahash_update(hp->md5_req); 127549a72dfbSAdam Langley } 127649a72dfbSAdam Langley 1277a915da9bSEric Dumazet static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, 1278318cf7aaSEric Dumazet __be32 daddr, __be32 saddr, const struct tcphdr *th) 127949a72dfbSAdam Langley { 128049a72dfbSAdam Langley struct tcp_md5sig_pool *hp; 1281cf80e0e4SHerbert Xu struct ahash_request *req; 128249a72dfbSAdam Langley 128349a72dfbSAdam Langley hp = tcp_get_md5sig_pool(); 128449a72dfbSAdam Langley if (!hp) 128549a72dfbSAdam Langley goto clear_hash_noput; 1286cf80e0e4SHerbert Xu req = hp->md5_req; 128749a72dfbSAdam Langley 1288cf80e0e4SHerbert Xu if (crypto_ahash_init(req)) 128949a72dfbSAdam Langley goto clear_hash; 129019689e38SEric Dumazet if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2)) 129149a72dfbSAdam Langley goto clear_hash; 129249a72dfbSAdam Langley if (tcp_md5_hash_key(hp, key)) 129349a72dfbSAdam Langley goto clear_hash; 1294cf80e0e4SHerbert Xu ahash_request_set_crypt(req, NULL, md5_hash, 0); 1295cf80e0e4SHerbert Xu if (crypto_ahash_final(req)) 1296cfb6eeb4SYOSHIFUJI Hideaki goto clear_hash; 1297cfb6eeb4SYOSHIFUJI Hideaki 1298cfb6eeb4SYOSHIFUJI Hideaki tcp_put_md5sig_pool(); 1299cfb6eeb4SYOSHIFUJI Hideaki return 0; 130049a72dfbSAdam Langley 1301cfb6eeb4SYOSHIFUJI Hideaki clear_hash: 1302cfb6eeb4SYOSHIFUJI Hideaki tcp_put_md5sig_pool(); 1303cfb6eeb4SYOSHIFUJI Hideaki clear_hash_noput: 1304cfb6eeb4SYOSHIFUJI Hideaki memset(md5_hash, 0, 16); 130549a72dfbSAdam Langley return 1; 1306cfb6eeb4SYOSHIFUJI Hideaki } 
1307cfb6eeb4SYOSHIFUJI Hideaki 130839f8e58eSEric Dumazet int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, 130939f8e58eSEric Dumazet const struct sock *sk, 1310318cf7aaSEric Dumazet const struct sk_buff *skb) 1311cfb6eeb4SYOSHIFUJI Hideaki { 131249a72dfbSAdam Langley struct tcp_md5sig_pool *hp; 1313cf80e0e4SHerbert Xu struct ahash_request *req; 1314318cf7aaSEric Dumazet const struct tcphdr *th = tcp_hdr(skb); 1315cfb6eeb4SYOSHIFUJI Hideaki __be32 saddr, daddr; 1316cfb6eeb4SYOSHIFUJI Hideaki 131739f8e58eSEric Dumazet if (sk) { /* valid for establish/request sockets */ 131839f8e58eSEric Dumazet saddr = sk->sk_rcv_saddr; 131939f8e58eSEric Dumazet daddr = sk->sk_daddr; 1320cfb6eeb4SYOSHIFUJI Hideaki } else { 132149a72dfbSAdam Langley const struct iphdr *iph = ip_hdr(skb); 132249a72dfbSAdam Langley saddr = iph->saddr; 132349a72dfbSAdam Langley daddr = iph->daddr; 1324cfb6eeb4SYOSHIFUJI Hideaki } 1325cfb6eeb4SYOSHIFUJI Hideaki 132649a72dfbSAdam Langley hp = tcp_get_md5sig_pool(); 132749a72dfbSAdam Langley if (!hp) 132849a72dfbSAdam Langley goto clear_hash_noput; 1329cf80e0e4SHerbert Xu req = hp->md5_req; 133049a72dfbSAdam Langley 1331cf80e0e4SHerbert Xu if (crypto_ahash_init(req)) 133249a72dfbSAdam Langley goto clear_hash; 133349a72dfbSAdam Langley 133419689e38SEric Dumazet if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len)) 133549a72dfbSAdam Langley goto clear_hash; 133649a72dfbSAdam Langley if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2)) 133749a72dfbSAdam Langley goto clear_hash; 133849a72dfbSAdam Langley if (tcp_md5_hash_key(hp, key)) 133949a72dfbSAdam Langley goto clear_hash; 1340cf80e0e4SHerbert Xu ahash_request_set_crypt(req, NULL, md5_hash, 0); 1341cf80e0e4SHerbert Xu if (crypto_ahash_final(req)) 134249a72dfbSAdam Langley goto clear_hash; 134349a72dfbSAdam Langley 134449a72dfbSAdam Langley tcp_put_md5sig_pool(); 134549a72dfbSAdam Langley return 0; 134649a72dfbSAdam Langley 134749a72dfbSAdam Langley clear_hash: 134849a72dfbSAdam 
Langley tcp_put_md5sig_pool(); 134949a72dfbSAdam Langley clear_hash_noput: 135049a72dfbSAdam Langley memset(md5_hash, 0, 16); 135149a72dfbSAdam Langley return 1; 135249a72dfbSAdam Langley } 135349a72dfbSAdam Langley EXPORT_SYMBOL(tcp_v4_md5_hash_skb); 1354cfb6eeb4SYOSHIFUJI Hideaki 1355ba8e275aSEric Dumazet #endif 1356ba8e275aSEric Dumazet 1357ff74e23fSEric Dumazet /* Called with rcu_read_lock() */ 1358ba8e275aSEric Dumazet static bool tcp_v4_inbound_md5_hash(const struct sock *sk, 1359534322caSDavid Ahern const struct sk_buff *skb, 1360534322caSDavid Ahern int dif, int sdif) 1361cfb6eeb4SYOSHIFUJI Hideaki { 1362ba8e275aSEric Dumazet #ifdef CONFIG_TCP_MD5SIG 1363cfb6eeb4SYOSHIFUJI Hideaki /* 1364cfb6eeb4SYOSHIFUJI Hideaki * This gets called for each TCP segment that arrives 1365cfb6eeb4SYOSHIFUJI Hideaki * so we want to be efficient. 1366cfb6eeb4SYOSHIFUJI Hideaki * We have 3 drop cases: 1367cfb6eeb4SYOSHIFUJI Hideaki * o No MD5 hash and one expected. 1368cfb6eeb4SYOSHIFUJI Hideaki * o MD5 hash and we're not expecting one. 1369cfb6eeb4SYOSHIFUJI Hideaki * o MD5 hash and its wrong. 1370cfb6eeb4SYOSHIFUJI Hideaki */ 1371cf533ea5SEric Dumazet const __u8 *hash_location = NULL; 1372cfb6eeb4SYOSHIFUJI Hideaki struct tcp_md5sig_key *hash_expected; 1373eddc9ec5SArnaldo Carvalho de Melo const struct iphdr *iph = ip_hdr(skb); 1374cf533ea5SEric Dumazet const struct tcphdr *th = tcp_hdr(skb); 1375cea97609SDavid Ahern const union tcp_md5_addr *addr; 1376cfb6eeb4SYOSHIFUJI Hideaki unsigned char newhash[16]; 1377dea53bb8SDavid Ahern int genhash, l3index; 1378dea53bb8SDavid Ahern 1379dea53bb8SDavid Ahern /* sdif set, means packet ingressed via a device 1380dea53bb8SDavid Ahern * in an L3 domain and dif is set to the l3mdev 1381dea53bb8SDavid Ahern */ 1382dea53bb8SDavid Ahern l3index = sdif ? 
dif : 0; 1383cfb6eeb4SYOSHIFUJI Hideaki 1384cea97609SDavid Ahern addr = (union tcp_md5_addr *)&iph->saddr; 1385dea53bb8SDavid Ahern hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 13867d5d5525SYOSHIFUJI Hideaki hash_location = tcp_parse_md5sig_option(th); 1387cfb6eeb4SYOSHIFUJI Hideaki 1388cfb6eeb4SYOSHIFUJI Hideaki /* We've parsed the options - do we have a hash? */ 1389cfb6eeb4SYOSHIFUJI Hideaki if (!hash_expected && !hash_location) 1390a2a385d6SEric Dumazet return false; 1391cfb6eeb4SYOSHIFUJI Hideaki 1392cfb6eeb4SYOSHIFUJI Hideaki if (hash_expected && !hash_location) { 1393c10d9310SEric Dumazet NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND); 1394a2a385d6SEric Dumazet return true; 1395cfb6eeb4SYOSHIFUJI Hideaki } 1396cfb6eeb4SYOSHIFUJI Hideaki 1397cfb6eeb4SYOSHIFUJI Hideaki if (!hash_expected && hash_location) { 1398c10d9310SEric Dumazet NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED); 1399a2a385d6SEric Dumazet return true; 1400cfb6eeb4SYOSHIFUJI Hideaki } 1401cfb6eeb4SYOSHIFUJI Hideaki 1402cfb6eeb4SYOSHIFUJI Hideaki /* Okay, so this is hash_expected and hash_location - 1403cfb6eeb4SYOSHIFUJI Hideaki * so we need to calculate the checksum. 1404cfb6eeb4SYOSHIFUJI Hideaki */ 140549a72dfbSAdam Langley genhash = tcp_v4_md5_hash_skb(newhash, 1406cfb6eeb4SYOSHIFUJI Hideaki hash_expected, 140739f8e58eSEric Dumazet NULL, skb); 1408cfb6eeb4SYOSHIFUJI Hideaki 1409cfb6eeb4SYOSHIFUJI Hideaki if (genhash || memcmp(hash_location, newhash, 16) != 0) { 141072145a68SEric Dumazet NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE); 1411dea53bb8SDavid Ahern net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n", 1412673d57e7SHarvey Harrison &iph->saddr, ntohs(th->source), 1413673d57e7SHarvey Harrison &iph->daddr, ntohs(th->dest), 1414e87cc472SJoe Perches genhash ? 
" tcp_v4_calc_md5_hash failed" 1415dea53bb8SDavid Ahern : "", l3index); 1416a2a385d6SEric Dumazet return true; 1417cfb6eeb4SYOSHIFUJI Hideaki } 1418a2a385d6SEric Dumazet return false; 1419cfb6eeb4SYOSHIFUJI Hideaki #endif 1420ba8e275aSEric Dumazet return false; 1421ba8e275aSEric Dumazet } 1422cfb6eeb4SYOSHIFUJI Hideaki 1423b40cf18eSEric Dumazet static void tcp_v4_init_req(struct request_sock *req, 1424b40cf18eSEric Dumazet const struct sock *sk_listener, 142516bea70aSOctavian Purdila struct sk_buff *skb) 142616bea70aSOctavian Purdila { 142716bea70aSOctavian Purdila struct inet_request_sock *ireq = inet_rsk(req); 1428c92e8c02SEric Dumazet struct net *net = sock_net(sk_listener); 142916bea70aSOctavian Purdila 143008d2cc3bSEric Dumazet sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); 143108d2cc3bSEric Dumazet sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); 1432c92e8c02SEric Dumazet RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb)); 143316bea70aSOctavian Purdila } 143416bea70aSOctavian Purdila 1435f964629eSEric Dumazet static struct dst_entry *tcp_v4_route_req(const struct sock *sk, 1436f964629eSEric Dumazet struct flowi *fl, 14374396e461SSoheil Hassas Yeganeh const struct request_sock *req) 1438d94e0417SOctavian Purdila { 14394396e461SSoheil Hassas Yeganeh return inet_csk_route_req(sk, &fl->u.ip4, req); 1440d94e0417SOctavian Purdila } 1441d94e0417SOctavian Purdila 144272a3effaSEric Dumazet struct request_sock_ops tcp_request_sock_ops __read_mostly = { 14431da177e4SLinus Torvalds .family = PF_INET, 14442e6599cbSArnaldo Carvalho de Melo .obj_size = sizeof(struct tcp_request_sock), 14455db92c99SOctavian Purdila .rtx_syn_ack = tcp_rtx_synack, 144660236fddSArnaldo Carvalho de Melo .send_ack = tcp_v4_reqsk_send_ack, 144760236fddSArnaldo Carvalho de Melo .destructor = tcp_v4_reqsk_destructor, 14481da177e4SLinus Torvalds .send_reset = tcp_v4_send_reset, 144972659eccSOctavian Purdila .syn_ack_timeout = tcp_syn_ack_timeout, 14501da177e4SLinus Torvalds }; 
14511da177e4SLinus Torvalds 145235b2c321SMat Martineau const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { 14532aec4a29SOctavian Purdila .mss_clamp = TCP_MSS_DEFAULT, 145416bea70aSOctavian Purdila #ifdef CONFIG_TCP_MD5SIG 1455fd3a154aSEric Dumazet .req_md5_lookup = tcp_v4_md5_lookup, 1456e3afe7b7SJohn Dykstra .calc_md5_hash = tcp_v4_md5_hash_skb, 1457b6332e6cSAndrew Morton #endif 145816bea70aSOctavian Purdila .init_req = tcp_v4_init_req, 1459fb7b37a7SOctavian Purdila #ifdef CONFIG_SYN_COOKIES 1460fb7b37a7SOctavian Purdila .cookie_init_seq = cookie_v4_init_sequence, 1461fb7b37a7SOctavian Purdila #endif 1462d94e0417SOctavian Purdila .route_req = tcp_v4_route_req, 146384b114b9SEric Dumazet .init_seq = tcp_v4_init_seq, 146484b114b9SEric Dumazet .init_ts_off = tcp_v4_init_ts_off, 1465d6274bd8SOctavian Purdila .send_synack = tcp_v4_send_synack, 146616bea70aSOctavian Purdila }; 1467cfb6eeb4SYOSHIFUJI Hideaki 14681da177e4SLinus Torvalds int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) 14691da177e4SLinus Torvalds { 14701da177e4SLinus Torvalds /* Never answer to SYNs send to broadcast or multicast */ 1471511c3f92SEric Dumazet if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) 14721da177e4SLinus Torvalds goto drop; 14731da177e4SLinus Torvalds 14741fb6f159SOctavian Purdila return tcp_conn_request(&tcp_request_sock_ops, 14751fb6f159SOctavian Purdila &tcp_request_sock_ipv4_ops, sk, skb); 14761da177e4SLinus Torvalds 14771da177e4SLinus Torvalds drop: 14789caad864SEric Dumazet tcp_listendrop(sk); 14791da177e4SLinus Torvalds return 0; 14801da177e4SLinus Torvalds } 14814bc2f18bSEric Dumazet EXPORT_SYMBOL(tcp_v4_conn_request); 14821da177e4SLinus Torvalds 14831da177e4SLinus Torvalds 14841da177e4SLinus Torvalds /* 14851da177e4SLinus Torvalds * The three way handshake has completed - we got a valid synack - 14861da177e4SLinus Torvalds * now create the new socket. 
14871da177e4SLinus Torvalds */ 14880c27171eSEric Dumazet struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, 148960236fddSArnaldo Carvalho de Melo struct request_sock *req, 14905e0724d0SEric Dumazet struct dst_entry *dst, 14915e0724d0SEric Dumazet struct request_sock *req_unhash, 14925e0724d0SEric Dumazet bool *own_req) 14931da177e4SLinus Torvalds { 14942e6599cbSArnaldo Carvalho de Melo struct inet_request_sock *ireq; 14951da177e4SLinus Torvalds struct inet_sock *newinet; 14961da177e4SLinus Torvalds struct tcp_sock *newtp; 14971da177e4SLinus Torvalds struct sock *newsk; 1498cfb6eeb4SYOSHIFUJI Hideaki #ifdef CONFIG_TCP_MD5SIG 1499cea97609SDavid Ahern const union tcp_md5_addr *addr; 1500cfb6eeb4SYOSHIFUJI Hideaki struct tcp_md5sig_key *key; 1501dea53bb8SDavid Ahern int l3index; 1502cfb6eeb4SYOSHIFUJI Hideaki #endif 1503f6d8bd05SEric Dumazet struct ip_options_rcu *inet_opt; 15041da177e4SLinus Torvalds 15051da177e4SLinus Torvalds if (sk_acceptq_is_full(sk)) 15061da177e4SLinus Torvalds goto exit_overflow; 15071da177e4SLinus Torvalds 15081da177e4SLinus Torvalds newsk = tcp_create_openreq_child(sk, req, skb); 15091da177e4SLinus Torvalds if (!newsk) 1510093d2823SBalazs Scheidler goto exit_nonewsk; 15111da177e4SLinus Torvalds 1512bcd76111SHerbert Xu newsk->sk_gso_type = SKB_GSO_TCPV4; 1513fae6ef87SNeal Cardwell inet_sk_rx_dst_set(newsk, skb); 15141da177e4SLinus Torvalds 15151da177e4SLinus Torvalds newtp = tcp_sk(newsk); 15161da177e4SLinus Torvalds newinet = inet_sk(newsk); 15172e6599cbSArnaldo Carvalho de Melo ireq = inet_rsk(req); 1518d1e559d0SEric Dumazet sk_daddr_set(newsk, ireq->ir_rmt_addr); 1519d1e559d0SEric Dumazet sk_rcv_saddr_set(newsk, ireq->ir_loc_addr); 15206dd9a14eSDavid Ahern newsk->sk_bound_dev_if = ireq->ir_iif; 1521634fb979SEric Dumazet newinet->inet_saddr = ireq->ir_loc_addr; 1522c92e8c02SEric Dumazet inet_opt = rcu_dereference(ireq->ireq_opt); 1523c92e8c02SEric Dumazet RCU_INIT_POINTER(newinet->inet_opt, inet_opt); 
1524463c84b9SArnaldo Carvalho de Melo newinet->mc_index = inet_iif(skb); 1525eddc9ec5SArnaldo Carvalho de Melo newinet->mc_ttl = ip_hdr(skb)->ttl; 15264c507d28SJiri Benc newinet->rcv_tos = ip_hdr(skb)->tos; 1527d83d8461SArnaldo Carvalho de Melo inet_csk(newsk)->icsk_ext_hdr_len = 0; 1528f6d8bd05SEric Dumazet if (inet_opt) 1529f6d8bd05SEric Dumazet inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 1530a904a069SEric Dumazet newinet->inet_id = prandom_u32(); 15311da177e4SLinus Torvalds 1532dfd25fffSEric Dumazet if (!dst) { 1533dfd25fffSEric Dumazet dst = inet_csk_route_child_sock(sk, newsk, req); 1534dfd25fffSEric Dumazet if (!dst) 15350e734419SDavid S. Miller goto put_and_exit; 1536dfd25fffSEric Dumazet } else { 1537dfd25fffSEric Dumazet /* syncookie case : see end of cookie_v4_check() */ 1538dfd25fffSEric Dumazet } 15390e734419SDavid S. Miller sk_setup_caps(newsk, dst); 15400e734419SDavid S. Miller 154181164413SDaniel Borkmann tcp_ca_openreq_child(newsk, dst); 154281164413SDaniel Borkmann 15431da177e4SLinus Torvalds tcp_sync_mss(newsk, dst_mtu(dst)); 15443541f9e8SEric Dumazet newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst)); 1545f5fff5dcSTom Quetchenbach 15461da177e4SLinus Torvalds tcp_initialize_rcv_mss(newsk); 15471da177e4SLinus Torvalds 1548cfb6eeb4SYOSHIFUJI Hideaki #ifdef CONFIG_TCP_MD5SIG 1549dea53bb8SDavid Ahern l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif); 1550cfb6eeb4SYOSHIFUJI Hideaki /* Copy over the MD5 key from the original socket */ 1551cea97609SDavid Ahern addr = (union tcp_md5_addr *)&newinet->inet_daddr; 1552dea53bb8SDavid Ahern key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); 155300db4124SIan Morris if (key) { 1554cfb6eeb4SYOSHIFUJI Hideaki /* 1555cfb6eeb4SYOSHIFUJI Hideaki * We're using one, so create a matching key 1556cfb6eeb4SYOSHIFUJI Hideaki * on the newsk structure. 
If we fail to get 1557cfb6eeb4SYOSHIFUJI Hideaki * memory, then we end up not copying the key 1558cfb6eeb4SYOSHIFUJI Hideaki * across. Shucks. 1559cfb6eeb4SYOSHIFUJI Hideaki */ 1560dea53bb8SDavid Ahern tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, 1561cea97609SDavid Ahern key->key, key->keylen, GFP_ATOMIC); 1562a465419bSEric Dumazet sk_nocaps_add(newsk, NETIF_F_GSO_MASK); 1563cfb6eeb4SYOSHIFUJI Hideaki } 1564cfb6eeb4SYOSHIFUJI Hideaki #endif 1565cfb6eeb4SYOSHIFUJI Hideaki 15660e734419SDavid S. Miller if (__inet_inherit_port(sk, newsk) < 0) 15670e734419SDavid S. Miller goto put_and_exit; 15685e0724d0SEric Dumazet *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash)); 1569c92e8c02SEric Dumazet if (likely(*own_req)) { 157049a496c9SEric Dumazet tcp_move_syn(newtp, req); 1571c92e8c02SEric Dumazet ireq->ireq_opt = NULL; 1572c92e8c02SEric Dumazet } else { 1573c92e8c02SEric Dumazet newinet->inet_opt = NULL; 1574c92e8c02SEric Dumazet } 15751da177e4SLinus Torvalds return newsk; 15761da177e4SLinus Torvalds 15771da177e4SLinus Torvalds exit_overflow: 1578c10d9310SEric Dumazet NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); 1579093d2823SBalazs Scheidler exit_nonewsk: 1580093d2823SBalazs Scheidler dst_release(dst); 15811da177e4SLinus Torvalds exit: 15829caad864SEric Dumazet tcp_listendrop(sk); 15831da177e4SLinus Torvalds return NULL; 15840e734419SDavid S. Miller put_and_exit: 1585c92e8c02SEric Dumazet newinet->inet_opt = NULL; 1586e337e24dSChristoph Paasch inet_csk_prepare_forced_close(newsk); 1587e337e24dSChristoph Paasch tcp_done(newsk); 15880e734419SDavid S. 
Miller goto exit; 15891da177e4SLinus Torvalds } 15904bc2f18bSEric Dumazet EXPORT_SYMBOL(tcp_v4_syn_recv_sock); 15911da177e4SLinus Torvalds 1592079096f1SEric Dumazet static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb) 15931da177e4SLinus Torvalds { 15941da177e4SLinus Torvalds #ifdef CONFIG_SYN_COOKIES 1595079096f1SEric Dumazet const struct tcphdr *th = tcp_hdr(skb); 1596079096f1SEric Dumazet 1597af9b4738SFlorian Westphal if (!th->syn) 1598461b74c3SCong Wang sk = cookie_v4_check(sk, skb); 15991da177e4SLinus Torvalds #endif 16001da177e4SLinus Torvalds return sk; 16011da177e4SLinus Torvalds } 16021da177e4SLinus Torvalds 16039349d600SPetar Penkov u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph, 16049349d600SPetar Penkov struct tcphdr *th, u32 *cookie) 16059349d600SPetar Penkov { 16069349d600SPetar Penkov u16 mss = 0; 16079349d600SPetar Penkov #ifdef CONFIG_SYN_COOKIES 16089349d600SPetar Penkov mss = tcp_get_syncookie_mss(&tcp_request_sock_ops, 16099349d600SPetar Penkov &tcp_request_sock_ipv4_ops, sk, th); 16109349d600SPetar Penkov if (mss) { 16119349d600SPetar Penkov *cookie = __cookie_v4_init_sequence(iph, th, &mss); 16129349d600SPetar Penkov tcp_synq_overflow(sk); 16139349d600SPetar Penkov } 16149349d600SPetar Penkov #endif 16159349d600SPetar Penkov return mss; 16169349d600SPetar Penkov } 16179349d600SPetar Penkov 16181da177e4SLinus Torvalds /* The socket must have it's spinlock held when we get 1619e994b2f0SEric Dumazet * here, unless it is a TCP_LISTEN socket. 16201da177e4SLinus Torvalds * 16211da177e4SLinus Torvalds * We have a potential double-lock case here, so even when 16221da177e4SLinus Torvalds * doing backlog processing we use the BH locking scheme. 16231da177e4SLinus Torvalds * This is because we cannot sleep with the original spinlock 16241da177e4SLinus Torvalds * held. 
16251da177e4SLinus Torvalds */ 16261da177e4SLinus Torvalds int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) 16271da177e4SLinus Torvalds { 1628cfb6eeb4SYOSHIFUJI Hideaki struct sock *rsk; 1629cfb6eeb4SYOSHIFUJI Hideaki 16301da177e4SLinus Torvalds if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ 163192101b3bSDavid S. Miller struct dst_entry *dst = sk->sk_rx_dst; 1632404e0a8bSEric Dumazet 1633404e0a8bSEric Dumazet sock_rps_save_rxhash(sk, skb); 16343d97379aSEric Dumazet sk_mark_napi_id(sk, skb); 1635404e0a8bSEric Dumazet if (dst) { 1636505fbcf0SEric Dumazet if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif || 163751456b29SIan Morris !dst->ops->check(dst, 0)) { 163892101b3bSDavid S. Miller dst_release(dst); 163992101b3bSDavid S. Miller sk->sk_rx_dst = NULL; 164092101b3bSDavid S. Miller } 164192101b3bSDavid S. Miller } 16423d97d88eSYafang Shao tcp_rcv_established(sk, skb); 16431da177e4SLinus Torvalds return 0; 16441da177e4SLinus Torvalds } 16451da177e4SLinus Torvalds 164612e25e10SEric Dumazet if (tcp_checksum_complete(skb)) 16471da177e4SLinus Torvalds goto csum_err; 16481da177e4SLinus Torvalds 16491da177e4SLinus Torvalds if (sk->sk_state == TCP_LISTEN) { 1650079096f1SEric Dumazet struct sock *nsk = tcp_v4_cookie_check(sk, skb); 1651079096f1SEric Dumazet 16521da177e4SLinus Torvalds if (!nsk) 16531da177e4SLinus Torvalds goto discard; 16541da177e4SLinus Torvalds if (nsk != sk) { 1655cfb6eeb4SYOSHIFUJI Hideaki if (tcp_child_process(sk, nsk, skb)) { 1656cfb6eeb4SYOSHIFUJI Hideaki rsk = nsk; 16571da177e4SLinus Torvalds goto reset; 1658cfb6eeb4SYOSHIFUJI Hideaki } 16591da177e4SLinus Torvalds return 0; 16601da177e4SLinus Torvalds } 1661ca55158cSEric Dumazet } else 1662bdeab991STom Herbert sock_rps_save_rxhash(sk, skb); 1663ca55158cSEric Dumazet 166472ab4a86SEric Dumazet if (tcp_rcv_state_process(sk, skb)) { 1665cfb6eeb4SYOSHIFUJI Hideaki rsk = sk; 16661da177e4SLinus Torvalds goto reset; 1667cfb6eeb4SYOSHIFUJI Hideaki } 16681da177e4SLinus Torvalds return 0; 
16691da177e4SLinus Torvalds 16701da177e4SLinus Torvalds reset: 1671cfb6eeb4SYOSHIFUJI Hideaki tcp_v4_send_reset(rsk, skb); 16721da177e4SLinus Torvalds discard: 16731da177e4SLinus Torvalds kfree_skb(skb); 16741da177e4SLinus Torvalds /* Be careful here. If this function gets more complicated and 16751da177e4SLinus Torvalds * gcc suffers from register pressure on the x86, sk (in %ebx) 16761da177e4SLinus Torvalds * might be destroyed here. This current version compiles correctly, 16771da177e4SLinus Torvalds * but you have been warned. 16781da177e4SLinus Torvalds */ 16791da177e4SLinus Torvalds return 0; 16801da177e4SLinus Torvalds 16811da177e4SLinus Torvalds csum_err: 1682c10d9310SEric Dumazet TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 1683c10d9310SEric Dumazet TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 16841da177e4SLinus Torvalds goto discard; 16851da177e4SLinus Torvalds } 16864bc2f18bSEric Dumazet EXPORT_SYMBOL(tcp_v4_do_rcv); 16871da177e4SLinus Torvalds 16887487449cSPaolo Abeni int tcp_v4_early_demux(struct sk_buff *skb) 168941063e9dSDavid S. Miller { 169041063e9dSDavid S. Miller const struct iphdr *iph; 169141063e9dSDavid S. Miller const struct tcphdr *th; 169241063e9dSDavid S. Miller struct sock *sk; 169341063e9dSDavid S. Miller 169441063e9dSDavid S. Miller if (skb->pkt_type != PACKET_HOST) 16957487449cSPaolo Abeni return 0; 169641063e9dSDavid S. Miller 169745f00f99SEric Dumazet if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) 16987487449cSPaolo Abeni return 0; 169941063e9dSDavid S. Miller 170041063e9dSDavid S. Miller iph = ip_hdr(skb); 170145f00f99SEric Dumazet th = tcp_hdr(skb); 170241063e9dSDavid S. Miller 170341063e9dSDavid S. Miller if (th->doff < sizeof(struct tcphdr) / 4) 17047487449cSPaolo Abeni return 0; 170541063e9dSDavid S. Miller 170645f00f99SEric Dumazet sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo, 170741063e9dSDavid S. 
Miller iph->saddr, th->source, 17087011d085SVijay Subramanian iph->daddr, ntohs(th->dest), 17093fa6f616SDavid Ahern skb->skb_iif, inet_sdif(skb)); 171041063e9dSDavid S. Miller if (sk) { 171141063e9dSDavid S. Miller skb->sk = sk; 171241063e9dSDavid S. Miller skb->destructor = sock_edemux; 1713f7e4eb03SEric Dumazet if (sk_fullsock(sk)) { 1714d0c294c5SMichal Kubeček struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst); 1715505fbcf0SEric Dumazet 171641063e9dSDavid S. Miller if (dst) 171741063e9dSDavid S. Miller dst = dst_check(dst, 0); 171892101b3bSDavid S. Miller if (dst && 1719505fbcf0SEric Dumazet inet_sk(sk)->rx_dst_ifindex == skb->skb_iif) 172041063e9dSDavid S. Miller skb_dst_set_noref(skb, dst); 172141063e9dSDavid S. Miller } 172241063e9dSDavid S. Miller } 17237487449cSPaolo Abeni return 0; 172441063e9dSDavid S. Miller } 172541063e9dSDavid S. Miller 1726c9c33212SEric Dumazet bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) 1727c9c33212SEric Dumazet { 17288265792bSEric Dumazet u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf); 17294f693b55SEric Dumazet struct skb_shared_info *shinfo; 17304f693b55SEric Dumazet const struct tcphdr *th; 17314f693b55SEric Dumazet struct tcphdr *thtail; 17324f693b55SEric Dumazet struct sk_buff *tail; 17334f693b55SEric Dumazet unsigned int hdrlen; 17344f693b55SEric Dumazet bool fragstolen; 17354f693b55SEric Dumazet u32 gso_segs; 17364f693b55SEric Dumazet int delta; 1737c9c33212SEric Dumazet 1738c9c33212SEric Dumazet /* In case all data was pulled from skb frags (in __pskb_pull_tail()), 1739c9c33212SEric Dumazet * we can fix skb->truesize to its real value to avoid future drops. 1740c9c33212SEric Dumazet * This is valid because skb is not yet charged to the socket. 1741c9c33212SEric Dumazet * It has been noticed pure SACK packets were sometimes dropped 1742c9c33212SEric Dumazet * (if cooked by drivers without copybreak feature). 
1743c9c33212SEric Dumazet */ 174460b1af33SEric Dumazet skb_condense(skb); 1745c9c33212SEric Dumazet 1746ade9628eSEric Dumazet skb_dst_drop(skb); 1747ade9628eSEric Dumazet 17484f693b55SEric Dumazet if (unlikely(tcp_checksum_complete(skb))) { 17494f693b55SEric Dumazet bh_unlock_sock(sk); 17504f693b55SEric Dumazet __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 17514f693b55SEric Dumazet __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 17524f693b55SEric Dumazet return true; 17534f693b55SEric Dumazet } 17544f693b55SEric Dumazet 17554f693b55SEric Dumazet /* Attempt coalescing to last skb in backlog, even if we are 17564f693b55SEric Dumazet * above the limits. 17574f693b55SEric Dumazet * This is okay because skb capacity is limited to MAX_SKB_FRAGS. 17584f693b55SEric Dumazet */ 17594f693b55SEric Dumazet th = (const struct tcphdr *)skb->data; 17604f693b55SEric Dumazet hdrlen = th->doff * 4; 17614f693b55SEric Dumazet shinfo = skb_shinfo(skb); 17624f693b55SEric Dumazet 17634f693b55SEric Dumazet if (!shinfo->gso_size) 17644f693b55SEric Dumazet shinfo->gso_size = skb->len - hdrlen; 17654f693b55SEric Dumazet 17664f693b55SEric Dumazet if (!shinfo->gso_segs) 17674f693b55SEric Dumazet shinfo->gso_segs = 1; 17684f693b55SEric Dumazet 17694f693b55SEric Dumazet tail = sk->sk_backlog.tail; 17704f693b55SEric Dumazet if (!tail) 17714f693b55SEric Dumazet goto no_coalesce; 17724f693b55SEric Dumazet thtail = (struct tcphdr *)tail->data; 17734f693b55SEric Dumazet 17744f693b55SEric Dumazet if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq || 17754f693b55SEric Dumazet TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield || 17764f693b55SEric Dumazet ((TCP_SKB_CB(tail)->tcp_flags | 1777ca2fe295SEric Dumazet TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) || 1778ca2fe295SEric Dumazet !((TCP_SKB_CB(tail)->tcp_flags & 1779ca2fe295SEric Dumazet TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) || 17804f693b55SEric Dumazet ((TCP_SKB_CB(tail)->tcp_flags ^ 17814f693b55SEric 
Dumazet TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) || 17824f693b55SEric Dumazet #ifdef CONFIG_TLS_DEVICE 17834f693b55SEric Dumazet tail->decrypted != skb->decrypted || 17844f693b55SEric Dumazet #endif 17854f693b55SEric Dumazet thtail->doff != th->doff || 17864f693b55SEric Dumazet memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th))) 17874f693b55SEric Dumazet goto no_coalesce; 17884f693b55SEric Dumazet 17894f693b55SEric Dumazet __skb_pull(skb, hdrlen); 17904f693b55SEric Dumazet if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) { 17914f693b55SEric Dumazet thtail->window = th->window; 17924f693b55SEric Dumazet 17934f693b55SEric Dumazet TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq; 17944f693b55SEric Dumazet 17954f693b55SEric Dumazet if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq)) 17964f693b55SEric Dumazet TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq; 17974f693b55SEric Dumazet 1798ca2fe295SEric Dumazet /* We have to update both TCP_SKB_CB(tail)->tcp_flags and 1799ca2fe295SEric Dumazet * thtail->fin, so that the fast path in tcp_rcv_established() 1800ca2fe295SEric Dumazet * is not entered if we append a packet with a FIN. 1801ca2fe295SEric Dumazet * SYN, RST, URG are not present. 1802ca2fe295SEric Dumazet * ACK is set on both packets. 1803ca2fe295SEric Dumazet * PSH : we do not really care in TCP stack, 1804ca2fe295SEric Dumazet * at least for 'GRO' packets. 1805ca2fe295SEric Dumazet */ 1806ca2fe295SEric Dumazet thtail->fin |= th->fin; 18074f693b55SEric Dumazet TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; 18084f693b55SEric Dumazet 18094f693b55SEric Dumazet if (TCP_SKB_CB(skb)->has_rxtstamp) { 18104f693b55SEric Dumazet TCP_SKB_CB(tail)->has_rxtstamp = true; 18114f693b55SEric Dumazet tail->tstamp = skb->tstamp; 18124f693b55SEric Dumazet skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp; 18134f693b55SEric Dumazet } 18144f693b55SEric Dumazet 18154f693b55SEric Dumazet /* Not as strict as GRO. 
We only need to carry mss max value */ 18164f693b55SEric Dumazet skb_shinfo(tail)->gso_size = max(shinfo->gso_size, 18174f693b55SEric Dumazet skb_shinfo(tail)->gso_size); 18184f693b55SEric Dumazet 18194f693b55SEric Dumazet gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs; 18204f693b55SEric Dumazet skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF); 18214f693b55SEric Dumazet 18224f693b55SEric Dumazet sk->sk_backlog.len += delta; 18234f693b55SEric Dumazet __NET_INC_STATS(sock_net(sk), 18244f693b55SEric Dumazet LINUX_MIB_TCPBACKLOGCOALESCE); 18254f693b55SEric Dumazet kfree_skb_partial(skb, fragstolen); 18264f693b55SEric Dumazet return false; 18274f693b55SEric Dumazet } 18284f693b55SEric Dumazet __skb_push(skb, hdrlen); 18294f693b55SEric Dumazet 18304f693b55SEric Dumazet no_coalesce: 18314f693b55SEric Dumazet /* Only socket owner can try to collapse/prune rx queues 18324f693b55SEric Dumazet * to reduce memory overhead, so add a little headroom here. 18334f693b55SEric Dumazet * Few sockets backlog are possibly concurrently non empty. 
18344f693b55SEric Dumazet */ 18354f693b55SEric Dumazet limit += 64*1024; 18364f693b55SEric Dumazet 1837c9c33212SEric Dumazet if (unlikely(sk_add_backlog(sk, skb, limit))) { 1838c9c33212SEric Dumazet bh_unlock_sock(sk); 1839c9c33212SEric Dumazet __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP); 1840c9c33212SEric Dumazet return true; 1841c9c33212SEric Dumazet } 1842c9c33212SEric Dumazet return false; 1843c9c33212SEric Dumazet } 1844c9c33212SEric Dumazet EXPORT_SYMBOL(tcp_add_backlog); 1845c9c33212SEric Dumazet 1846ac6e7800SEric Dumazet int tcp_filter(struct sock *sk, struct sk_buff *skb) 1847ac6e7800SEric Dumazet { 1848ac6e7800SEric Dumazet struct tcphdr *th = (struct tcphdr *)skb->data; 1849ac6e7800SEric Dumazet 1850f2feaefdSChristoph Paasch return sk_filter_trim_cap(sk, skb, th->doff * 4); 1851ac6e7800SEric Dumazet } 1852ac6e7800SEric Dumazet EXPORT_SYMBOL(tcp_filter); 1853ac6e7800SEric Dumazet 1854eeea10b8SEric Dumazet static void tcp_v4_restore_cb(struct sk_buff *skb) 1855eeea10b8SEric Dumazet { 1856eeea10b8SEric Dumazet memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4, 1857eeea10b8SEric Dumazet sizeof(struct inet_skb_parm)); 1858eeea10b8SEric Dumazet } 1859eeea10b8SEric Dumazet 1860eeea10b8SEric Dumazet static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, 1861eeea10b8SEric Dumazet const struct tcphdr *th) 1862eeea10b8SEric Dumazet { 1863eeea10b8SEric Dumazet /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB() 1864eeea10b8SEric Dumazet * barrier() makes sure compiler wont play fool^Waliasing games. 
1865eeea10b8SEric Dumazet */ 1866eeea10b8SEric Dumazet memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb), 1867eeea10b8SEric Dumazet sizeof(struct inet_skb_parm)); 1868eeea10b8SEric Dumazet barrier(); 1869eeea10b8SEric Dumazet 1870eeea10b8SEric Dumazet TCP_SKB_CB(skb)->seq = ntohl(th->seq); 1871eeea10b8SEric Dumazet TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + 1872eeea10b8SEric Dumazet skb->len - th->doff * 4); 1873eeea10b8SEric Dumazet TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); 1874eeea10b8SEric Dumazet TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); 1875eeea10b8SEric Dumazet TCP_SKB_CB(skb)->tcp_tw_isn = 0; 1876eeea10b8SEric Dumazet TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); 1877eeea10b8SEric Dumazet TCP_SKB_CB(skb)->sacked = 0; 1878eeea10b8SEric Dumazet TCP_SKB_CB(skb)->has_rxtstamp = 1879eeea10b8SEric Dumazet skb->tstamp || skb_hwtstamps(skb)->hwtstamp; 1880eeea10b8SEric Dumazet } 1881eeea10b8SEric Dumazet 18821da177e4SLinus Torvalds /* 18831da177e4SLinus Torvalds * From tcp_input.c 18841da177e4SLinus Torvalds */ 18851da177e4SLinus Torvalds 18861da177e4SLinus Torvalds int tcp_v4_rcv(struct sk_buff *skb) 18871da177e4SLinus Torvalds { 18883b24d854SEric Dumazet struct net *net = dev_net(skb->dev); 18898b27dae5SEric Dumazet struct sk_buff *skb_to_free; 18903fa6f616SDavid Ahern int sdif = inet_sdif(skb); 1891534322caSDavid Ahern int dif = inet_iif(skb); 1892eddc9ec5SArnaldo Carvalho de Melo const struct iphdr *iph; 1893cf533ea5SEric Dumazet const struct tcphdr *th; 18943b24d854SEric Dumazet bool refcounted; 18951da177e4SLinus Torvalds struct sock *sk; 18961da177e4SLinus Torvalds int ret; 18971da177e4SLinus Torvalds 18981da177e4SLinus Torvalds if (skb->pkt_type != PACKET_HOST) 18991da177e4SLinus Torvalds goto discard_it; 19001da177e4SLinus Torvalds 19011da177e4SLinus Torvalds /* Count it even if it's bad */ 190290bbcc60SEric Dumazet __TCP_INC_STATS(net, TCP_MIB_INSEGS); 19031da177e4SLinus Torvalds 19041da177e4SLinus Torvalds if 
(!pskb_may_pull(skb, sizeof(struct tcphdr))) 19051da177e4SLinus Torvalds goto discard_it; 19061da177e4SLinus Torvalds 1907ea1627c2SEric Dumazet th = (const struct tcphdr *)skb->data; 19081da177e4SLinus Torvalds 1909ea1627c2SEric Dumazet if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) 19101da177e4SLinus Torvalds goto bad_packet; 19111da177e4SLinus Torvalds if (!pskb_may_pull(skb, th->doff * 4)) 19121da177e4SLinus Torvalds goto discard_it; 19131da177e4SLinus Torvalds 19141da177e4SLinus Torvalds /* An explanation is required here, I think. 19151da177e4SLinus Torvalds * Packet length and doff are validated by header prediction, 1916caa20d9aSStephen Hemminger * provided case of th->doff==0 is eliminated. 19171da177e4SLinus Torvalds * So, we defer the checks. */ 1918ed70fcfcSTom Herbert 1919ed70fcfcSTom Herbert if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo)) 19206a5dc9e5SEric Dumazet goto csum_error; 19211da177e4SLinus Torvalds 1922ea1627c2SEric Dumazet th = (const struct tcphdr *)skb->data; 1923eddc9ec5SArnaldo Carvalho de Melo iph = ip_hdr(skb); 19244bdc3d66SEric Dumazet lookup: 1925a583636aSCraig Gallek sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source, 19263fa6f616SDavid Ahern th->dest, sdif, &refcounted); 19271da177e4SLinus Torvalds if (!sk) 19281da177e4SLinus Torvalds goto no_tcp_socket; 19291da177e4SLinus Torvalds 1930bb134d5dSEric Dumazet process: 1931bb134d5dSEric Dumazet if (sk->sk_state == TCP_TIME_WAIT) 1932bb134d5dSEric Dumazet goto do_time_wait; 1933bb134d5dSEric Dumazet 1934079096f1SEric Dumazet if (sk->sk_state == TCP_NEW_SYN_RECV) { 1935079096f1SEric Dumazet struct request_sock *req = inet_reqsk(sk); 1936e0f9759fSEric Dumazet bool req_stolen = false; 19377716682cSEric Dumazet struct sock *nsk; 1938079096f1SEric Dumazet 1939079096f1SEric Dumazet sk = req->rsk_listener; 1940534322caSDavid Ahern if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) { 1941e65c332dSEric Dumazet sk_drops_add(sk, skb); 
194272923555SEric Dumazet reqsk_put(req); 194372923555SEric Dumazet goto discard_it; 194472923555SEric Dumazet } 19454fd44a98SFrank van der Linden if (tcp_checksum_complete(skb)) { 19464fd44a98SFrank van der Linden reqsk_put(req); 19474fd44a98SFrank van der Linden goto csum_error; 19484fd44a98SFrank van der Linden } 19497716682cSEric Dumazet if (unlikely(sk->sk_state != TCP_LISTEN)) { 1950f03f2e15SEric Dumazet inet_csk_reqsk_queue_drop_and_put(sk, req); 19514bdc3d66SEric Dumazet goto lookup; 19524bdc3d66SEric Dumazet } 19533b24d854SEric Dumazet /* We own a reference on the listener, increase it again 19543b24d854SEric Dumazet * as we might lose it too soon. 19553b24d854SEric Dumazet */ 19567716682cSEric Dumazet sock_hold(sk); 19573b24d854SEric Dumazet refcounted = true; 19581f3b359fSEric Dumazet nsk = NULL; 1959eeea10b8SEric Dumazet if (!tcp_filter(sk, skb)) { 1960eeea10b8SEric Dumazet th = (const struct tcphdr *)skb->data; 1961eeea10b8SEric Dumazet iph = ip_hdr(skb); 1962eeea10b8SEric Dumazet tcp_v4_fill_cb(skb, iph, th); 1963e0f9759fSEric Dumazet nsk = tcp_check_req(sk, skb, req, false, &req_stolen); 1964eeea10b8SEric Dumazet } 1965079096f1SEric Dumazet if (!nsk) { 1966079096f1SEric Dumazet reqsk_put(req); 1967e0f9759fSEric Dumazet if (req_stolen) { 1968e0f9759fSEric Dumazet /* Another cpu got exclusive access to req 1969e0f9759fSEric Dumazet * and created a full blown socket. 1970e0f9759fSEric Dumazet * Try to feed this packet to this socket 1971e0f9759fSEric Dumazet * instead of discarding it. 
1972e0f9759fSEric Dumazet */ 1973e0f9759fSEric Dumazet tcp_v4_restore_cb(skb); 1974e0f9759fSEric Dumazet sock_put(sk); 1975e0f9759fSEric Dumazet goto lookup; 1976e0f9759fSEric Dumazet } 19777716682cSEric Dumazet goto discard_and_relse; 1978079096f1SEric Dumazet } 1979079096f1SEric Dumazet if (nsk == sk) { 1980079096f1SEric Dumazet reqsk_put(req); 1981eeea10b8SEric Dumazet tcp_v4_restore_cb(skb); 1982079096f1SEric Dumazet } else if (tcp_child_process(sk, nsk, skb)) { 1983079096f1SEric Dumazet tcp_v4_send_reset(nsk, skb); 19847716682cSEric Dumazet goto discard_and_relse; 1985079096f1SEric Dumazet } else { 19867716682cSEric Dumazet sock_put(sk); 1987079096f1SEric Dumazet return 0; 1988079096f1SEric Dumazet } 1989079096f1SEric Dumazet } 19906cce09f8SEric Dumazet if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) { 199102a1d6e7SEric Dumazet __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); 1992d218d111SStephen Hemminger goto discard_and_relse; 19936cce09f8SEric Dumazet } 1994d218d111SStephen Hemminger 19951da177e4SLinus Torvalds if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) 19961da177e4SLinus Torvalds goto discard_and_relse; 19979ea88a15SDmitry Popov 1998534322caSDavid Ahern if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif)) 19999ea88a15SDmitry Popov goto discard_and_relse; 20009ea88a15SDmitry Popov 2001895b5c9fSFlorian Westphal nf_reset_ct(skb); 20021da177e4SLinus Torvalds 2003ac6e7800SEric Dumazet if (tcp_filter(sk, skb)) 20041da177e4SLinus Torvalds goto discard_and_relse; 2005ac6e7800SEric Dumazet th = (const struct tcphdr *)skb->data; 2006ac6e7800SEric Dumazet iph = ip_hdr(skb); 2007eeea10b8SEric Dumazet tcp_v4_fill_cb(skb, iph, th); 20081da177e4SLinus Torvalds 20091da177e4SLinus Torvalds skb->dev = NULL; 20101da177e4SLinus Torvalds 2011e994b2f0SEric Dumazet if (sk->sk_state == TCP_LISTEN) { 2012e994b2f0SEric Dumazet ret = tcp_v4_do_rcv(sk, skb); 2013e994b2f0SEric Dumazet goto put_and_return; 2014e994b2f0SEric Dumazet } 2015e994b2f0SEric Dumazet 2016e994b2f0SEric Dumazet 
sk_incoming_cpu_update(sk); 2017e994b2f0SEric Dumazet 2018c6366184SIngo Molnar bh_lock_sock_nested(sk); 2019a44d6eacSMartin KaFai Lau tcp_segs_in(tcp_sk(sk), skb); 20201da177e4SLinus Torvalds ret = 0; 20211da177e4SLinus Torvalds if (!sock_owned_by_user(sk)) { 20228b27dae5SEric Dumazet skb_to_free = sk->sk_rx_skb_cache; 20238b27dae5SEric Dumazet sk->sk_rx_skb_cache = NULL; 20241da177e4SLinus Torvalds ret = tcp_v4_do_rcv(sk, skb); 20258b27dae5SEric Dumazet } else { 20268b27dae5SEric Dumazet if (tcp_add_backlog(sk, skb)) 20276b03a53aSZhu Yi goto discard_and_relse; 20288b27dae5SEric Dumazet skb_to_free = NULL; 20296b03a53aSZhu Yi } 20301da177e4SLinus Torvalds bh_unlock_sock(sk); 20318b27dae5SEric Dumazet if (skb_to_free) 20328b27dae5SEric Dumazet __kfree_skb(skb_to_free); 20331da177e4SLinus Torvalds 2034e994b2f0SEric Dumazet put_and_return: 20353b24d854SEric Dumazet if (refcounted) 20361da177e4SLinus Torvalds sock_put(sk); 20371da177e4SLinus Torvalds 20381da177e4SLinus Torvalds return ret; 20391da177e4SLinus Torvalds 20401da177e4SLinus Torvalds no_tcp_socket: 20411da177e4SLinus Torvalds if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) 20421da177e4SLinus Torvalds goto discard_it; 20431da177e4SLinus Torvalds 2044eeea10b8SEric Dumazet tcp_v4_fill_cb(skb, iph, th); 2045eeea10b8SEric Dumazet 204612e25e10SEric Dumazet if (tcp_checksum_complete(skb)) { 20476a5dc9e5SEric Dumazet csum_error: 204890bbcc60SEric Dumazet __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); 20491da177e4SLinus Torvalds bad_packet: 205090bbcc60SEric Dumazet __TCP_INC_STATS(net, TCP_MIB_INERRS); 20511da177e4SLinus Torvalds } else { 2052cfb6eeb4SYOSHIFUJI Hideaki tcp_v4_send_reset(NULL, skb); 20531da177e4SLinus Torvalds } 20541da177e4SLinus Torvalds 20551da177e4SLinus Torvalds discard_it: 20561da177e4SLinus Torvalds /* Discard frame. 
*/ 20571da177e4SLinus Torvalds kfree_skb(skb); 20581da177e4SLinus Torvalds return 0; 20591da177e4SLinus Torvalds 20601da177e4SLinus Torvalds discard_and_relse: 2061532182cdSEric Dumazet sk_drops_add(sk, skb); 20623b24d854SEric Dumazet if (refcounted) 20631da177e4SLinus Torvalds sock_put(sk); 20641da177e4SLinus Torvalds goto discard_it; 20651da177e4SLinus Torvalds 20661da177e4SLinus Torvalds do_time_wait: 20671da177e4SLinus Torvalds if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { 20689469c7b4SYOSHIFUJI Hideaki inet_twsk_put(inet_twsk(sk)); 20691da177e4SLinus Torvalds goto discard_it; 20701da177e4SLinus Torvalds } 20711da177e4SLinus Torvalds 2072eeea10b8SEric Dumazet tcp_v4_fill_cb(skb, iph, th); 2073eeea10b8SEric Dumazet 20746a5dc9e5SEric Dumazet if (tcp_checksum_complete(skb)) { 20756a5dc9e5SEric Dumazet inet_twsk_put(inet_twsk(sk)); 20766a5dc9e5SEric Dumazet goto csum_error; 20771da177e4SLinus Torvalds } 20789469c7b4SYOSHIFUJI Hideaki switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) { 20791da177e4SLinus Torvalds case TCP_TW_SYN: { 2080c346dca1SYOSHIFUJI Hideaki struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev), 2081a583636aSCraig Gallek &tcp_hashinfo, skb, 2082a583636aSCraig Gallek __tcp_hdrlen(th), 2083da5e3630STom Herbert iph->saddr, th->source, 2084eddc9ec5SArnaldo Carvalho de Melo iph->daddr, th->dest, 20853fa6f616SDavid Ahern inet_iif(skb), 20863fa6f616SDavid Ahern sdif); 20871da177e4SLinus Torvalds if (sk2) { 2088dbe7faa4SEric Dumazet inet_twsk_deschedule_put(inet_twsk(sk)); 20891da177e4SLinus Torvalds sk = sk2; 2090eeea10b8SEric Dumazet tcp_v4_restore_cb(skb); 20913b24d854SEric Dumazet refcounted = false; 20921da177e4SLinus Torvalds goto process; 20931da177e4SLinus Torvalds } 20941da177e4SLinus Torvalds } 2095fcfd6dfaSGustavo A. R. 
Silva /* to ACK */ 2096a8eceea8SJoe Perches fallthrough; 20971da177e4SLinus Torvalds case TCP_TW_ACK: 20981da177e4SLinus Torvalds tcp_v4_timewait_ack(sk, skb); 20991da177e4SLinus Torvalds break; 21001da177e4SLinus Torvalds case TCP_TW_RST: 2101271c3b9bSFlorian Westphal tcp_v4_send_reset(sk, skb); 2102271c3b9bSFlorian Westphal inet_twsk_deschedule_put(inet_twsk(sk)); 2103271c3b9bSFlorian Westphal goto discard_it; 21041da177e4SLinus Torvalds case TCP_TW_SUCCESS:; 21051da177e4SLinus Torvalds } 21061da177e4SLinus Torvalds goto discard_it; 21071da177e4SLinus Torvalds } 21081da177e4SLinus Torvalds 2109ccb7c410SDavid S. Miller static struct timewait_sock_ops tcp_timewait_sock_ops = { 2110ccb7c410SDavid S. Miller .twsk_obj_size = sizeof(struct tcp_timewait_sock), 2111ccb7c410SDavid S. Miller .twsk_unique = tcp_twsk_unique, 2112ccb7c410SDavid S. Miller .twsk_destructor= tcp_twsk_destructor, 2113ccb7c410SDavid S. Miller }; 21141da177e4SLinus Torvalds 211563d02d15SEric Dumazet void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) 21165d299f3dSEric Dumazet { 21175d299f3dSEric Dumazet struct dst_entry *dst = skb_dst(skb); 21185d299f3dSEric Dumazet 21195037e9efSEric Dumazet if (dst && dst_hold_safe(dst)) { 21205d299f3dSEric Dumazet sk->sk_rx_dst = dst; 21215d299f3dSEric Dumazet inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; 21225d299f3dSEric Dumazet } 2123ca777effSEric Dumazet } 212463d02d15SEric Dumazet EXPORT_SYMBOL(inet_sk_rx_dst_set); 21255d299f3dSEric Dumazet 21263b401a81SStephen Hemminger const struct inet_connection_sock_af_ops ipv4_specific = { 21271da177e4SLinus Torvalds .queue_xmit = ip_queue_xmit, 21281da177e4SLinus Torvalds .send_check = tcp_v4_send_check, 212932519f11SArnaldo Carvalho de Melo .rebuild_header = inet_sk_rebuild_header, 21305d299f3dSEric Dumazet .sk_rx_dst_set = inet_sk_rx_dst_set, 21311da177e4SLinus Torvalds .conn_request = tcp_v4_conn_request, 21321da177e4SLinus Torvalds .syn_recv_sock = tcp_v4_syn_recv_sock, 21331da177e4SLinus Torvalds 
.net_header_len = sizeof(struct iphdr), 21341da177e4SLinus Torvalds .setsockopt = ip_setsockopt, 21351da177e4SLinus Torvalds .getsockopt = ip_getsockopt, 2136543d9cfeSArnaldo Carvalho de Melo .addr2sockaddr = inet_csk_addr2sockaddr, 2137543d9cfeSArnaldo Carvalho de Melo .sockaddr_len = sizeof(struct sockaddr_in), 21384fab9071SNeal Cardwell .mtu_reduced = tcp_v4_mtu_reduced, 21391da177e4SLinus Torvalds }; 21404bc2f18bSEric Dumazet EXPORT_SYMBOL(ipv4_specific); 21411da177e4SLinus Torvalds 2142cfb6eeb4SYOSHIFUJI Hideaki #ifdef CONFIG_TCP_MD5SIG 2143b2e4b3deSStephen Hemminger static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { 2144cfb6eeb4SYOSHIFUJI Hideaki .md5_lookup = tcp_v4_md5_lookup, 214549a72dfbSAdam Langley .calc_md5_hash = tcp_v4_md5_hash_skb, 2146cfb6eeb4SYOSHIFUJI Hideaki .md5_parse = tcp_v4_parse_md5_keys, 2147cfb6eeb4SYOSHIFUJI Hideaki }; 2148b6332e6cSAndrew Morton #endif 2149cfb6eeb4SYOSHIFUJI Hideaki 21501da177e4SLinus Torvalds /* NOTE: A lot of things set to zero explicitly by call to 21511da177e4SLinus Torvalds * sk_alloc() so need not be done here. 21521da177e4SLinus Torvalds */ 21531da177e4SLinus Torvalds static int tcp_v4_init_sock(struct sock *sk) 21541da177e4SLinus Torvalds { 21556687e988SArnaldo Carvalho de Melo struct inet_connection_sock *icsk = inet_csk(sk); 21561da177e4SLinus Torvalds 2157900f65d3SNeal Cardwell tcp_init_sock(sk); 21581da177e4SLinus Torvalds 21598292a17aSArnaldo Carvalho de Melo icsk->icsk_af_ops = &ipv4_specific; 2160900f65d3SNeal Cardwell 2161cfb6eeb4SYOSHIFUJI Hideaki #ifdef CONFIG_TCP_MD5SIG 2162ac807fa8SDavid S. 
Miller tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific; 2163cfb6eeb4SYOSHIFUJI Hideaki #endif 21641da177e4SLinus Torvalds 21651da177e4SLinus Torvalds return 0; 21661da177e4SLinus Torvalds } 21671da177e4SLinus Torvalds 21687d06b2e0SBrian Haley void tcp_v4_destroy_sock(struct sock *sk) 21691da177e4SLinus Torvalds { 21701da177e4SLinus Torvalds struct tcp_sock *tp = tcp_sk(sk); 21711da177e4SLinus Torvalds 2172e1a4aa50SSong Liu trace_tcp_destroy_sock(sk); 2173e1a4aa50SSong Liu 21741da177e4SLinus Torvalds tcp_clear_xmit_timers(sk); 21751da177e4SLinus Torvalds 21766687e988SArnaldo Carvalho de Melo tcp_cleanup_congestion_control(sk); 2177317a76f9SStephen Hemminger 2178734942ccSDave Watson tcp_cleanup_ulp(sk); 2179734942ccSDave Watson 21801da177e4SLinus Torvalds /* Cleanup up the write buffer. */ 2181fe067e8aSDavid S. Miller tcp_write_queue_purge(sk); 21821da177e4SLinus Torvalds 2183cf1ef3f0SWei Wang /* Check if we want to disable active TFO */ 2184cf1ef3f0SWei Wang tcp_fastopen_active_disable_ofo_check(sk); 2185cf1ef3f0SWei Wang 21861da177e4SLinus Torvalds /* Cleans up our, hopefully empty, out_of_order_queue. */ 21879f5afeaeSYaogong Wang skb_rbtree_purge(&tp->out_of_order_queue); 21881da177e4SLinus Torvalds 2189cfb6eeb4SYOSHIFUJI Hideaki #ifdef CONFIG_TCP_MD5SIG 2190cfb6eeb4SYOSHIFUJI Hideaki /* Clean up the MD5 key list, if any */ 2191cfb6eeb4SYOSHIFUJI Hideaki if (tp->md5sig_info) { 2192a915da9bSEric Dumazet tcp_clear_md5_list(sk); 2193fb7df5e4SMat Martineau kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu); 2194cfb6eeb4SYOSHIFUJI Hideaki tp->md5sig_info = NULL; 2195cfb6eeb4SYOSHIFUJI Hideaki } 2196cfb6eeb4SYOSHIFUJI Hideaki #endif 2197cfb6eeb4SYOSHIFUJI Hideaki 21981da177e4SLinus Torvalds /* Clean up a referenced TCP bind bucket. 
*/ 2199463c84b9SArnaldo Carvalho de Melo if (inet_csk(sk)->icsk_bind_hash) 2200ab1e0a13SArnaldo Carvalho de Melo inet_put_port(sk); 22011da177e4SLinus Torvalds 2202d983ea6fSEric Dumazet BUG_ON(rcu_access_pointer(tp->fastopen_rsk)); 2203435cf559SWilliam Allen Simpson 2204cf60af03SYuchung Cheng /* If socket is aborted during connect operation */ 2205cf60af03SYuchung Cheng tcp_free_fastopen_req(tp); 22061fba70e5SYuchung Cheng tcp_fastopen_destroy_cipher(sk); 2207cd8ae852SEric Dumazet tcp_saved_syn_free(tp); 2208cf60af03SYuchung Cheng 2209180d8cd9SGlauber Costa sk_sockets_allocated_dec(sk); 22101da177e4SLinus Torvalds } 22111da177e4SLinus Torvalds EXPORT_SYMBOL(tcp_v4_destroy_sock); 22121da177e4SLinus Torvalds 22131da177e4SLinus Torvalds #ifdef CONFIG_PROC_FS 22141da177e4SLinus Torvalds /* Proc filesystem TCP sock list dumping. */ 22151da177e4SLinus Torvalds 2216a8b690f9STom Herbert /* 2217a8b690f9STom Herbert * Get next listener socket follow cur. If cur is NULL, get first socket 2218a8b690f9STom Herbert * starting from bucket given in st->bucket; when st->bucket is zero the 2219a8b690f9STom Herbert * very first socket in the hash table is returned. 2220a8b690f9STom Herbert */ 22211da177e4SLinus Torvalds static void *listening_get_next(struct seq_file *seq, void *cur) 22221da177e4SLinus Torvalds { 2223b08d4d3bSYonghong Song struct tcp_seq_afinfo *afinfo; 22241da177e4SLinus Torvalds struct tcp_iter_state *st = seq->private; 2225a4146b1bSDenis V. 
Lunev struct net *net = seq_file_net(seq); 22263b24d854SEric Dumazet struct inet_listen_hashbucket *ilb; 22278dbd76e7SEric Dumazet struct hlist_nulls_node *node; 22283b24d854SEric Dumazet struct sock *sk = cur; 22291da177e4SLinus Torvalds 2230b08d4d3bSYonghong Song if (st->bpf_seq_afinfo) 2231b08d4d3bSYonghong Song afinfo = st->bpf_seq_afinfo; 2232b08d4d3bSYonghong Song else 2233b08d4d3bSYonghong Song afinfo = PDE_DATA(file_inode(seq->file)); 2234b08d4d3bSYonghong Song 22351da177e4SLinus Torvalds if (!sk) { 22363b24d854SEric Dumazet get_head: 2237a8b690f9STom Herbert ilb = &tcp_hashinfo.listening_hash[st->bucket]; 22389652dc2eSEric Dumazet spin_lock(&ilb->lock); 22398dbd76e7SEric Dumazet sk = sk_nulls_head(&ilb->nulls_head); 2240a8b690f9STom Herbert st->offset = 0; 22411da177e4SLinus Torvalds goto get_sk; 22421da177e4SLinus Torvalds } 22435caea4eaSEric Dumazet ilb = &tcp_hashinfo.listening_hash[st->bucket]; 22441da177e4SLinus Torvalds ++st->num; 2245a8b690f9STom Herbert ++st->offset; 22461da177e4SLinus Torvalds 22478dbd76e7SEric Dumazet sk = sk_nulls_next(sk); 22481da177e4SLinus Torvalds get_sk: 22498dbd76e7SEric Dumazet sk_nulls_for_each_from(sk, node) { 22508475ef9fSPavel Emelyanov if (!net_eq(sock_net(sk), net)) 22518475ef9fSPavel Emelyanov continue; 2252b08d4d3bSYonghong Song if (afinfo->family == AF_UNSPEC || 2253b08d4d3bSYonghong Song sk->sk_family == afinfo->family) 22543b24d854SEric Dumazet return sk; 22551da177e4SLinus Torvalds } 22569652dc2eSEric Dumazet spin_unlock(&ilb->lock); 2257a8b690f9STom Herbert st->offset = 0; 22583b24d854SEric Dumazet if (++st->bucket < INET_LHTABLE_SIZE) 22593b24d854SEric Dumazet goto get_head; 22603b24d854SEric Dumazet return NULL; 22611da177e4SLinus Torvalds } 22621da177e4SLinus Torvalds 22631da177e4SLinus Torvalds static void *listening_get_idx(struct seq_file *seq, loff_t *pos) 22641da177e4SLinus Torvalds { 2265a8b690f9STom Herbert struct tcp_iter_state *st = seq->private; 2266a8b690f9STom Herbert void *rc; 2267a8b690f9STom 
Herbert 2268a8b690f9STom Herbert st->bucket = 0; 2269a8b690f9STom Herbert st->offset = 0; 2270a8b690f9STom Herbert rc = listening_get_next(seq, NULL); 22711da177e4SLinus Torvalds 22721da177e4SLinus Torvalds while (rc && *pos) { 22731da177e4SLinus Torvalds rc = listening_get_next(seq, rc); 22741da177e4SLinus Torvalds --*pos; 22751da177e4SLinus Torvalds } 22761da177e4SLinus Torvalds return rc; 22771da177e4SLinus Torvalds } 22781da177e4SLinus Torvalds 227905dbc7b5SEric Dumazet static inline bool empty_bucket(const struct tcp_iter_state *st) 22806eac5604SAndi Kleen { 228105dbc7b5SEric Dumazet return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain); 22826eac5604SAndi Kleen } 22836eac5604SAndi Kleen 2284a8b690f9STom Herbert /* 2285a8b690f9STom Herbert * Get first established socket starting from bucket given in st->bucket. 2286a8b690f9STom Herbert * If st->bucket is zero, the very first socket in the hash is returned. 2287a8b690f9STom Herbert */ 22881da177e4SLinus Torvalds static void *established_get_first(struct seq_file *seq) 22891da177e4SLinus Torvalds { 2290b08d4d3bSYonghong Song struct tcp_seq_afinfo *afinfo; 22911da177e4SLinus Torvalds struct tcp_iter_state *st = seq->private; 2292a4146b1bSDenis V. 
Lunev struct net *net = seq_file_net(seq); 22931da177e4SLinus Torvalds void *rc = NULL; 22941da177e4SLinus Torvalds 2295b08d4d3bSYonghong Song if (st->bpf_seq_afinfo) 2296b08d4d3bSYonghong Song afinfo = st->bpf_seq_afinfo; 2297b08d4d3bSYonghong Song else 2298b08d4d3bSYonghong Song afinfo = PDE_DATA(file_inode(seq->file)); 2299b08d4d3bSYonghong Song 2300a8b690f9STom Herbert st->offset = 0; 2301a8b690f9STom Herbert for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) { 23021da177e4SLinus Torvalds struct sock *sk; 23033ab5aee7SEric Dumazet struct hlist_nulls_node *node; 23049db66bdcSEric Dumazet spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket); 23051da177e4SLinus Torvalds 23066eac5604SAndi Kleen /* Lockless fast path for the common case of empty buckets */ 23076eac5604SAndi Kleen if (empty_bucket(st)) 23086eac5604SAndi Kleen continue; 23096eac5604SAndi Kleen 23109db66bdcSEric Dumazet spin_lock_bh(lock); 23113ab5aee7SEric Dumazet sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { 2312b08d4d3bSYonghong Song if ((afinfo->family != AF_UNSPEC && 2313b08d4d3bSYonghong Song sk->sk_family != afinfo->family) || 2314878628fbSYOSHIFUJI Hideaki !net_eq(sock_net(sk), net)) { 23151da177e4SLinus Torvalds continue; 23161da177e4SLinus Torvalds } 23171da177e4SLinus Torvalds rc = sk; 23181da177e4SLinus Torvalds goto out; 23191da177e4SLinus Torvalds } 23209db66bdcSEric Dumazet spin_unlock_bh(lock); 23211da177e4SLinus Torvalds } 23221da177e4SLinus Torvalds out: 23231da177e4SLinus Torvalds return rc; 23241da177e4SLinus Torvalds } 23251da177e4SLinus Torvalds 23261da177e4SLinus Torvalds static void *established_get_next(struct seq_file *seq, void *cur) 23271da177e4SLinus Torvalds { 2328b08d4d3bSYonghong Song struct tcp_seq_afinfo *afinfo; 23291da177e4SLinus Torvalds struct sock *sk = cur; 23303ab5aee7SEric Dumazet struct hlist_nulls_node *node; 23311da177e4SLinus Torvalds struct tcp_iter_state *st = seq->private; 2332a4146b1bSDenis V. 
Lunev struct net *net = seq_file_net(seq); 23331da177e4SLinus Torvalds 2334b08d4d3bSYonghong Song if (st->bpf_seq_afinfo) 2335b08d4d3bSYonghong Song afinfo = st->bpf_seq_afinfo; 2336b08d4d3bSYonghong Song else 2337b08d4d3bSYonghong Song afinfo = PDE_DATA(file_inode(seq->file)); 2338b08d4d3bSYonghong Song 23391da177e4SLinus Torvalds ++st->num; 2340a8b690f9STom Herbert ++st->offset; 23411da177e4SLinus Torvalds 23423ab5aee7SEric Dumazet sk = sk_nulls_next(sk); 23431da177e4SLinus Torvalds 23443ab5aee7SEric Dumazet sk_nulls_for_each_from(sk, node) { 2345b08d4d3bSYonghong Song if ((afinfo->family == AF_UNSPEC || 2346b08d4d3bSYonghong Song sk->sk_family == afinfo->family) && 234737d849bbSChristoph Hellwig net_eq(sock_net(sk), net)) 234805dbc7b5SEric Dumazet return sk; 23491da177e4SLinus Torvalds } 23501da177e4SLinus Torvalds 235105dbc7b5SEric Dumazet spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); 235205dbc7b5SEric Dumazet ++st->bucket; 235305dbc7b5SEric Dumazet return established_get_first(seq); 23541da177e4SLinus Torvalds } 23551da177e4SLinus Torvalds 23561da177e4SLinus Torvalds static void *established_get_idx(struct seq_file *seq, loff_t pos) 23571da177e4SLinus Torvalds { 2358a8b690f9STom Herbert struct tcp_iter_state *st = seq->private; 2359a8b690f9STom Herbert void *rc; 2360a8b690f9STom Herbert 2361a8b690f9STom Herbert st->bucket = 0; 2362a8b690f9STom Herbert rc = established_get_first(seq); 23631da177e4SLinus Torvalds 23641da177e4SLinus Torvalds while (rc && pos) { 23651da177e4SLinus Torvalds rc = established_get_next(seq, rc); 23661da177e4SLinus Torvalds --pos; 23671da177e4SLinus Torvalds } 23681da177e4SLinus Torvalds return rc; 23691da177e4SLinus Torvalds } 23701da177e4SLinus Torvalds 23711da177e4SLinus Torvalds static void *tcp_get_idx(struct seq_file *seq, loff_t pos) 23721da177e4SLinus Torvalds { 23731da177e4SLinus Torvalds void *rc; 23741da177e4SLinus Torvalds struct tcp_iter_state *st = seq->private; 23751da177e4SLinus Torvalds 
23761da177e4SLinus Torvalds st->state = TCP_SEQ_STATE_LISTENING; 23771da177e4SLinus Torvalds rc = listening_get_idx(seq, &pos); 23781da177e4SLinus Torvalds 23791da177e4SLinus Torvalds if (!rc) { 23801da177e4SLinus Torvalds st->state = TCP_SEQ_STATE_ESTABLISHED; 23811da177e4SLinus Torvalds rc = established_get_idx(seq, pos); 23821da177e4SLinus Torvalds } 23831da177e4SLinus Torvalds 23841da177e4SLinus Torvalds return rc; 23851da177e4SLinus Torvalds } 23861da177e4SLinus Torvalds 2387a8b690f9STom Herbert static void *tcp_seek_last_pos(struct seq_file *seq) 2388a8b690f9STom Herbert { 2389a8b690f9STom Herbert struct tcp_iter_state *st = seq->private; 2390a8b690f9STom Herbert int offset = st->offset; 2391a8b690f9STom Herbert int orig_num = st->num; 2392a8b690f9STom Herbert void *rc = NULL; 2393a8b690f9STom Herbert 2394a8b690f9STom Herbert switch (st->state) { 2395a8b690f9STom Herbert case TCP_SEQ_STATE_LISTENING: 2396a8b690f9STom Herbert if (st->bucket >= INET_LHTABLE_SIZE) 2397a8b690f9STom Herbert break; 2398a8b690f9STom Herbert st->state = TCP_SEQ_STATE_LISTENING; 2399a8b690f9STom Herbert rc = listening_get_next(seq, NULL); 2400a8b690f9STom Herbert while (offset-- && rc) 2401a8b690f9STom Herbert rc = listening_get_next(seq, rc); 2402a8b690f9STom Herbert if (rc) 2403a8b690f9STom Herbert break; 2404a8b690f9STom Herbert st->bucket = 0; 240505dbc7b5SEric Dumazet st->state = TCP_SEQ_STATE_ESTABLISHED; 2406a8eceea8SJoe Perches fallthrough; 2407a8b690f9STom Herbert case TCP_SEQ_STATE_ESTABLISHED: 2408a8b690f9STom Herbert if (st->bucket > tcp_hashinfo.ehash_mask) 2409a8b690f9STom Herbert break; 2410a8b690f9STom Herbert rc = established_get_first(seq); 2411a8b690f9STom Herbert while (offset-- && rc) 2412a8b690f9STom Herbert rc = established_get_next(seq, rc); 2413a8b690f9STom Herbert } 2414a8b690f9STom Herbert 2415a8b690f9STom Herbert st->num = orig_num; 2416a8b690f9STom Herbert 2417a8b690f9STom Herbert return rc; 2418a8b690f9STom Herbert } 2419a8b690f9STom Herbert 
242037d849bbSChristoph Hellwig void *tcp_seq_start(struct seq_file *seq, loff_t *pos) 24211da177e4SLinus Torvalds { 24221da177e4SLinus Torvalds struct tcp_iter_state *st = seq->private; 2423a8b690f9STom Herbert void *rc; 2424a8b690f9STom Herbert 2425a8b690f9STom Herbert if (*pos && *pos == st->last_pos) { 2426a8b690f9STom Herbert rc = tcp_seek_last_pos(seq); 2427a8b690f9STom Herbert if (rc) 2428a8b690f9STom Herbert goto out; 2429a8b690f9STom Herbert } 2430a8b690f9STom Herbert 24311da177e4SLinus Torvalds st->state = TCP_SEQ_STATE_LISTENING; 24321da177e4SLinus Torvalds st->num = 0; 2433a8b690f9STom Herbert st->bucket = 0; 2434a8b690f9STom Herbert st->offset = 0; 2435a8b690f9STom Herbert rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; 2436a8b690f9STom Herbert 2437a8b690f9STom Herbert out: 2438a8b690f9STom Herbert st->last_pos = *pos; 2439a8b690f9STom Herbert return rc; 24401da177e4SLinus Torvalds } 244137d849bbSChristoph Hellwig EXPORT_SYMBOL(tcp_seq_start); 24421da177e4SLinus Torvalds 244337d849bbSChristoph Hellwig void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 24441da177e4SLinus Torvalds { 2445a8b690f9STom Herbert struct tcp_iter_state *st = seq->private; 24461da177e4SLinus Torvalds void *rc = NULL; 24471da177e4SLinus Torvalds 24481da177e4SLinus Torvalds if (v == SEQ_START_TOKEN) { 24491da177e4SLinus Torvalds rc = tcp_get_idx(seq, 0); 24501da177e4SLinus Torvalds goto out; 24511da177e4SLinus Torvalds } 24521da177e4SLinus Torvalds 24531da177e4SLinus Torvalds switch (st->state) { 24541da177e4SLinus Torvalds case TCP_SEQ_STATE_LISTENING: 24551da177e4SLinus Torvalds rc = listening_get_next(seq, v); 24561da177e4SLinus Torvalds if (!rc) { 24571da177e4SLinus Torvalds st->state = TCP_SEQ_STATE_ESTABLISHED; 2458a8b690f9STom Herbert st->bucket = 0; 2459a8b690f9STom Herbert st->offset = 0; 24601da177e4SLinus Torvalds rc = established_get_first(seq); 24611da177e4SLinus Torvalds } 24621da177e4SLinus Torvalds break; 24631da177e4SLinus Torvalds case 
TCP_SEQ_STATE_ESTABLISHED: 24641da177e4SLinus Torvalds rc = established_get_next(seq, v); 24651da177e4SLinus Torvalds break; 24661da177e4SLinus Torvalds } 24671da177e4SLinus Torvalds out: 24681da177e4SLinus Torvalds ++*pos; 2469a8b690f9STom Herbert st->last_pos = *pos; 24701da177e4SLinus Torvalds return rc; 24711da177e4SLinus Torvalds } 247237d849bbSChristoph Hellwig EXPORT_SYMBOL(tcp_seq_next); 24731da177e4SLinus Torvalds 247437d849bbSChristoph Hellwig void tcp_seq_stop(struct seq_file *seq, void *v) 24751da177e4SLinus Torvalds { 24761da177e4SLinus Torvalds struct tcp_iter_state *st = seq->private; 24771da177e4SLinus Torvalds 24781da177e4SLinus Torvalds switch (st->state) { 24791da177e4SLinus Torvalds case TCP_SEQ_STATE_LISTENING: 24801da177e4SLinus Torvalds if (v != SEQ_START_TOKEN) 24819652dc2eSEric Dumazet spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock); 24821da177e4SLinus Torvalds break; 24831da177e4SLinus Torvalds case TCP_SEQ_STATE_ESTABLISHED: 24841da177e4SLinus Torvalds if (v) 24859db66bdcSEric Dumazet spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); 24861da177e4SLinus Torvalds break; 24871da177e4SLinus Torvalds } 24881da177e4SLinus Torvalds } 248937d849bbSChristoph Hellwig EXPORT_SYMBOL(tcp_seq_stop); 24901da177e4SLinus Torvalds 2491d4f06873SEric Dumazet static void get_openreq4(const struct request_sock *req, 2492aa3a0c8cSEric Dumazet struct seq_file *f, int i) 24931da177e4SLinus Torvalds { 24942e6599cbSArnaldo Carvalho de Melo const struct inet_request_sock *ireq = inet_rsk(req); 2495fa76ce73SEric Dumazet long delta = req->rsk_timer.expires - jiffies; 24961da177e4SLinus Torvalds 24975e659e4cSPavel Emelyanov seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2498652586dfSTetsuo Handa " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK", 24991da177e4SLinus Torvalds i, 2500634fb979SEric Dumazet ireq->ir_loc_addr, 2501d4f06873SEric Dumazet ireq->ir_num, 2502634fb979SEric Dumazet ireq->ir_rmt_addr, 2503634fb979SEric Dumazet 
ntohs(ireq->ir_rmt_port), 25041da177e4SLinus Torvalds TCP_SYN_RECV, 25051da177e4SLinus Torvalds 0, 0, /* could print option size, but that is af dependent. */ 25061da177e4SLinus Torvalds 1, /* timers active (only the expire timer) */ 2507a399a805SEric Dumazet jiffies_delta_to_clock_t(delta), 2508e6c022a4SEric Dumazet req->num_timeout, 2509aa3a0c8cSEric Dumazet from_kuid_munged(seq_user_ns(f), 2510aa3a0c8cSEric Dumazet sock_i_uid(req->rsk_listener)), 25111da177e4SLinus Torvalds 0, /* non standard timer */ 25121da177e4SLinus Torvalds 0, /* open_requests have no inode */ 2513d4f06873SEric Dumazet 0, 2514652586dfSTetsuo Handa req); 25151da177e4SLinus Torvalds } 25161da177e4SLinus Torvalds 2517652586dfSTetsuo Handa static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) 25181da177e4SLinus Torvalds { 25191da177e4SLinus Torvalds int timer_active; 25201da177e4SLinus Torvalds unsigned long timer_expires; 2521cf533ea5SEric Dumazet const struct tcp_sock *tp = tcp_sk(sk); 2522cf4c6bf8SIlpo Järvinen const struct inet_connection_sock *icsk = inet_csk(sk); 2523cf533ea5SEric Dumazet const struct inet_sock *inet = inet_sk(sk); 25240536fcc0SEric Dumazet const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; 2525c720c7e8SEric Dumazet __be32 dest = inet->inet_daddr; 2526c720c7e8SEric Dumazet __be32 src = inet->inet_rcv_saddr; 2527c720c7e8SEric Dumazet __u16 destp = ntohs(inet->inet_dport); 2528c720c7e8SEric Dumazet __u16 srcp = ntohs(inet->inet_sport); 252949d09007SEric Dumazet int rx_queue; 253000fd38d9SEric Dumazet int state; 25311da177e4SLinus Torvalds 25326ba8a3b1SNandita Dukkipati if (icsk->icsk_pending == ICSK_TIME_RETRANS || 253357dde7f7SYuchung Cheng icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || 25346ba8a3b1SNandita Dukkipati icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { 25351da177e4SLinus Torvalds timer_active = 1; 2536463c84b9SArnaldo Carvalho de Melo timer_expires = icsk->icsk_timeout; 2537463c84b9SArnaldo Carvalho de Melo } else if 
(icsk->icsk_pending == ICSK_TIME_PROBE0) { 25381da177e4SLinus Torvalds timer_active = 4; 2539463c84b9SArnaldo Carvalho de Melo timer_expires = icsk->icsk_timeout; 2540cf4c6bf8SIlpo Järvinen } else if (timer_pending(&sk->sk_timer)) { 25411da177e4SLinus Torvalds timer_active = 2; 2542cf4c6bf8SIlpo Järvinen timer_expires = sk->sk_timer.expires; 25431da177e4SLinus Torvalds } else { 25441da177e4SLinus Torvalds timer_active = 0; 25451da177e4SLinus Torvalds timer_expires = jiffies; 25461da177e4SLinus Torvalds } 25471da177e4SLinus Torvalds 2548986ffdfdSYafang Shao state = inet_sk_state_load(sk); 254900fd38d9SEric Dumazet if (state == TCP_LISTEN) 2550288efe86SEric Dumazet rx_queue = READ_ONCE(sk->sk_ack_backlog); 255149d09007SEric Dumazet else 255200fd38d9SEric Dumazet /* Because we don't lock the socket, 255300fd38d9SEric Dumazet * we might find a transient negative value. 255449d09007SEric Dumazet */ 2555dba7d9b8SEric Dumazet rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) - 25567db48e98SEric Dumazet READ_ONCE(tp->copied_seq), 0); 255749d09007SEric Dumazet 25585e659e4cSPavel Emelyanov seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " 2559652586dfSTetsuo Handa "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d", 256000fd38d9SEric Dumazet i, src, srcp, dest, destp, state, 25610f317464SEric Dumazet READ_ONCE(tp->write_seq) - tp->snd_una, 256249d09007SEric Dumazet rx_queue, 25631da177e4SLinus Torvalds timer_active, 2564a399a805SEric Dumazet jiffies_delta_to_clock_t(timer_expires - jiffies), 2565463c84b9SArnaldo Carvalho de Melo icsk->icsk_retransmits, 2566a7cb5a49SEric W. 
Biederman from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)), 25676687e988SArnaldo Carvalho de Melo icsk->icsk_probes_out, 2568cf4c6bf8SIlpo Järvinen sock_i_ino(sk), 256941c6d650SReshetova, Elena refcount_read(&sk->sk_refcnt), sk, 25707be87351SStephen Hemminger jiffies_to_clock_t(icsk->icsk_rto), 25717be87351SStephen Hemminger jiffies_to_clock_t(icsk->icsk_ack.ato), 257231954cd8SWei Wang (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk), 25731da177e4SLinus Torvalds tp->snd_cwnd, 257400fd38d9SEric Dumazet state == TCP_LISTEN ? 257500fd38d9SEric Dumazet fastopenq->max_qlen : 2576652586dfSTetsuo Handa (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh)); 25771da177e4SLinus Torvalds } 25781da177e4SLinus Torvalds 2579cf533ea5SEric Dumazet static void get_timewait4_sock(const struct inet_timewait_sock *tw, 2580652586dfSTetsuo Handa struct seq_file *f, int i) 25811da177e4SLinus Torvalds { 2582789f558cSEric Dumazet long delta = tw->tw_timer.expires - jiffies; 258323f33c2dSAl Viro __be32 dest, src; 25841da177e4SLinus Torvalds __u16 destp, srcp; 25851da177e4SLinus Torvalds 25861da177e4SLinus Torvalds dest = tw->tw_daddr; 25871da177e4SLinus Torvalds src = tw->tw_rcv_saddr; 25881da177e4SLinus Torvalds destp = ntohs(tw->tw_dport); 25891da177e4SLinus Torvalds srcp = ntohs(tw->tw_sport); 25901da177e4SLinus Torvalds 25915e659e4cSPavel Emelyanov seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2592652586dfSTetsuo Handa " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK", 25931da177e4SLinus Torvalds i, src, srcp, dest, destp, tw->tw_substate, 0, 0, 2594a399a805SEric Dumazet 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, 259541c6d650SReshetova, Elena refcount_read(&tw->tw_refcnt), tw); 25961da177e4SLinus Torvalds } 25971da177e4SLinus Torvalds 25981da177e4SLinus Torvalds #define TMPSZ 150 25991da177e4SLinus Torvalds 26001da177e4SLinus Torvalds static int tcp4_seq_show(struct seq_file *seq, void *v) 26011da177e4SLinus Torvalds { 26021da177e4SLinus Torvalds struct tcp_iter_state 
*st; 260305dbc7b5SEric Dumazet struct sock *sk = v; 26041da177e4SLinus Torvalds 2605652586dfSTetsuo Handa seq_setwidth(seq, TMPSZ - 1); 26061da177e4SLinus Torvalds if (v == SEQ_START_TOKEN) { 2607652586dfSTetsuo Handa seq_puts(seq, " sl local_address rem_address st tx_queue " 26081da177e4SLinus Torvalds "rx_queue tr tm->when retrnsmt uid timeout " 26091da177e4SLinus Torvalds "inode"); 26101da177e4SLinus Torvalds goto out; 26111da177e4SLinus Torvalds } 26121da177e4SLinus Torvalds st = seq->private; 26131da177e4SLinus Torvalds 261405dbc7b5SEric Dumazet if (sk->sk_state == TCP_TIME_WAIT) 2615652586dfSTetsuo Handa get_timewait4_sock(v, seq, st->num); 2616079096f1SEric Dumazet else if (sk->sk_state == TCP_NEW_SYN_RECV) 2617079096f1SEric Dumazet get_openreq4(v, seq, st->num); 261805dbc7b5SEric Dumazet else 2619652586dfSTetsuo Handa get_tcp4_sock(v, seq, st->num); 26201da177e4SLinus Torvalds out: 2621652586dfSTetsuo Handa seq_pad(seq, '\n'); 26221da177e4SLinus Torvalds return 0; 26231da177e4SLinus Torvalds } 26241da177e4SLinus Torvalds 262552d87d5fSYonghong Song #ifdef CONFIG_BPF_SYSCALL 262652d87d5fSYonghong Song struct bpf_iter__tcp { 262752d87d5fSYonghong Song __bpf_md_ptr(struct bpf_iter_meta *, meta); 262852d87d5fSYonghong Song __bpf_md_ptr(struct sock_common *, sk_common); 262952d87d5fSYonghong Song uid_t uid __aligned(8); 263052d87d5fSYonghong Song }; 263152d87d5fSYonghong Song 263252d87d5fSYonghong Song static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, 263352d87d5fSYonghong Song struct sock_common *sk_common, uid_t uid) 263452d87d5fSYonghong Song { 263552d87d5fSYonghong Song struct bpf_iter__tcp ctx; 263652d87d5fSYonghong Song 263752d87d5fSYonghong Song meta->seq_num--; /* skip SEQ_START_TOKEN */ 263852d87d5fSYonghong Song ctx.meta = meta; 263952d87d5fSYonghong Song ctx.sk_common = sk_common; 264052d87d5fSYonghong Song ctx.uid = uid; 264152d87d5fSYonghong Song return bpf_iter_run_prog(prog, &ctx); 264252d87d5fSYonghong Song } 
264352d87d5fSYonghong Song 264452d87d5fSYonghong Song static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v) 264552d87d5fSYonghong Song { 264652d87d5fSYonghong Song struct bpf_iter_meta meta; 264752d87d5fSYonghong Song struct bpf_prog *prog; 264852d87d5fSYonghong Song struct sock *sk = v; 264952d87d5fSYonghong Song uid_t uid; 265052d87d5fSYonghong Song 265152d87d5fSYonghong Song if (v == SEQ_START_TOKEN) 265252d87d5fSYonghong Song return 0; 265352d87d5fSYonghong Song 265452d87d5fSYonghong Song if (sk->sk_state == TCP_TIME_WAIT) { 265552d87d5fSYonghong Song uid = 0; 265652d87d5fSYonghong Song } else if (sk->sk_state == TCP_NEW_SYN_RECV) { 265752d87d5fSYonghong Song const struct request_sock *req = v; 265852d87d5fSYonghong Song 265952d87d5fSYonghong Song uid = from_kuid_munged(seq_user_ns(seq), 266052d87d5fSYonghong Song sock_i_uid(req->rsk_listener)); 266152d87d5fSYonghong Song } else { 266252d87d5fSYonghong Song uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); 266352d87d5fSYonghong Song } 266452d87d5fSYonghong Song 266552d87d5fSYonghong Song meta.seq = seq; 266652d87d5fSYonghong Song prog = bpf_iter_get_info(&meta, false); 266752d87d5fSYonghong Song return tcp_prog_seq_show(prog, &meta, v, uid); 266852d87d5fSYonghong Song } 266952d87d5fSYonghong Song 267052d87d5fSYonghong Song static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v) 267152d87d5fSYonghong Song { 267252d87d5fSYonghong Song struct bpf_iter_meta meta; 267352d87d5fSYonghong Song struct bpf_prog *prog; 267452d87d5fSYonghong Song 267552d87d5fSYonghong Song if (!v) { 267652d87d5fSYonghong Song meta.seq = seq; 267752d87d5fSYonghong Song prog = bpf_iter_get_info(&meta, true); 267852d87d5fSYonghong Song if (prog) 267952d87d5fSYonghong Song (void)tcp_prog_seq_show(prog, &meta, v, 0); 268052d87d5fSYonghong Song } 268152d87d5fSYonghong Song 268252d87d5fSYonghong Song tcp_seq_stop(seq, v); 268352d87d5fSYonghong Song } 268452d87d5fSYonghong Song 268552d87d5fSYonghong Song static const 
struct seq_operations bpf_iter_tcp_seq_ops = { 268652d87d5fSYonghong Song .show = bpf_iter_tcp_seq_show, 268752d87d5fSYonghong Song .start = tcp_seq_start, 268852d87d5fSYonghong Song .next = tcp_seq_next, 268952d87d5fSYonghong Song .stop = bpf_iter_tcp_seq_stop, 269052d87d5fSYonghong Song }; 269152d87d5fSYonghong Song #endif 269252d87d5fSYonghong Song 269337d849bbSChristoph Hellwig static const struct seq_operations tcp4_seq_ops = { 269437d849bbSChristoph Hellwig .show = tcp4_seq_show, 269537d849bbSChristoph Hellwig .start = tcp_seq_start, 269637d849bbSChristoph Hellwig .next = tcp_seq_next, 269737d849bbSChristoph Hellwig .stop = tcp_seq_stop, 269837d849bbSChristoph Hellwig }; 269937d849bbSChristoph Hellwig 27001da177e4SLinus Torvalds static struct tcp_seq_afinfo tcp4_seq_afinfo = { 27011da177e4SLinus Torvalds .family = AF_INET, 27021da177e4SLinus Torvalds }; 27031da177e4SLinus Torvalds 27042c8c1e72SAlexey Dobriyan static int __net_init tcp4_proc_init_net(struct net *net) 2705757764f6SPavel Emelyanov { 2706c3506372SChristoph Hellwig if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops, 2707c3506372SChristoph Hellwig sizeof(struct tcp_iter_state), &tcp4_seq_afinfo)) 270837d849bbSChristoph Hellwig return -ENOMEM; 270937d849bbSChristoph Hellwig return 0; 2710757764f6SPavel Emelyanov } 2711757764f6SPavel Emelyanov 27122c8c1e72SAlexey Dobriyan static void __net_exit tcp4_proc_exit_net(struct net *net) 2713757764f6SPavel Emelyanov { 271437d849bbSChristoph Hellwig remove_proc_entry("tcp", net->proc_net); 2715757764f6SPavel Emelyanov } 2716757764f6SPavel Emelyanov 2717757764f6SPavel Emelyanov static struct pernet_operations tcp4_net_ops = { 2718757764f6SPavel Emelyanov .init = tcp4_proc_init_net, 2719757764f6SPavel Emelyanov .exit = tcp4_proc_exit_net, 2720757764f6SPavel Emelyanov }; 2721757764f6SPavel Emelyanov 27221da177e4SLinus Torvalds int __init tcp4_proc_init(void) 27231da177e4SLinus Torvalds { 2724757764f6SPavel Emelyanov return 
register_pernet_subsys(&tcp4_net_ops); 27251da177e4SLinus Torvalds } 27261da177e4SLinus Torvalds 27271da177e4SLinus Torvalds void tcp4_proc_exit(void) 27281da177e4SLinus Torvalds { 2729757764f6SPavel Emelyanov unregister_pernet_subsys(&tcp4_net_ops); 27301da177e4SLinus Torvalds } 27311da177e4SLinus Torvalds #endif /* CONFIG_PROC_FS */ 27321da177e4SLinus Torvalds 27331da177e4SLinus Torvalds struct proto tcp_prot = { 27341da177e4SLinus Torvalds .name = "TCP", 27351da177e4SLinus Torvalds .owner = THIS_MODULE, 27361da177e4SLinus Torvalds .close = tcp_close, 2737d74bad4eSAndrey Ignatov .pre_connect = tcp_v4_pre_connect, 27381da177e4SLinus Torvalds .connect = tcp_v4_connect, 27391da177e4SLinus Torvalds .disconnect = tcp_disconnect, 2740463c84b9SArnaldo Carvalho de Melo .accept = inet_csk_accept, 27411da177e4SLinus Torvalds .ioctl = tcp_ioctl, 27421da177e4SLinus Torvalds .init = tcp_v4_init_sock, 27431da177e4SLinus Torvalds .destroy = tcp_v4_destroy_sock, 27441da177e4SLinus Torvalds .shutdown = tcp_shutdown, 27451da177e4SLinus Torvalds .setsockopt = tcp_setsockopt, 27461da177e4SLinus Torvalds .getsockopt = tcp_getsockopt, 27474b9d07a4SUrsula Braun .keepalive = tcp_set_keepalive, 27481da177e4SLinus Torvalds .recvmsg = tcp_recvmsg, 27497ba42910SChangli Gao .sendmsg = tcp_sendmsg, 27507ba42910SChangli Gao .sendpage = tcp_sendpage, 27511da177e4SLinus Torvalds .backlog_rcv = tcp_v4_do_rcv, 275246d3ceabSEric Dumazet .release_cb = tcp_release_cb, 2753ab1e0a13SArnaldo Carvalho de Melo .hash = inet_hash, 2754ab1e0a13SArnaldo Carvalho de Melo .unhash = inet_unhash, 2755ab1e0a13SArnaldo Carvalho de Melo .get_port = inet_csk_get_port, 27561da177e4SLinus Torvalds .enter_memory_pressure = tcp_enter_memory_pressure, 275706044751SEric Dumazet .leave_memory_pressure = tcp_leave_memory_pressure, 2758c9bee3b7SEric Dumazet .stream_memory_free = tcp_stream_memory_free, 27591da177e4SLinus Torvalds .sockets_allocated = &tcp_sockets_allocated, 27600a5578cfSArnaldo Carvalho de Melo .orphan_count 
= &tcp_orphan_count, 27611da177e4SLinus Torvalds .memory_allocated = &tcp_memory_allocated, 27621da177e4SLinus Torvalds .memory_pressure = &tcp_memory_pressure, 2763a4fe34bfSEric W. Biederman .sysctl_mem = sysctl_tcp_mem, 2764356d1833SEric Dumazet .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), 2765356d1833SEric Dumazet .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), 27661da177e4SLinus Torvalds .max_header = MAX_TCP_HEADER, 27671da177e4SLinus Torvalds .obj_size = sizeof(struct tcp_sock), 27685f0d5a3aSPaul E. McKenney .slab_flags = SLAB_TYPESAFE_BY_RCU, 27696d6ee43eSArnaldo Carvalho de Melo .twsk_prot = &tcp_timewait_sock_ops, 277060236fddSArnaldo Carvalho de Melo .rsk_prot = &tcp_request_sock_ops, 277139d8cda7SPavel Emelyanov .h.hashinfo = &tcp_hashinfo, 27727ba42910SChangli Gao .no_autobind = true, 2773c1e64e29SLorenzo Colitti .diag_destroy = tcp_abort, 27741da177e4SLinus Torvalds }; 27754bc2f18bSEric Dumazet EXPORT_SYMBOL(tcp_prot); 27761da177e4SLinus Torvalds 2777046ee902SDenis V. Lunev static void __net_exit tcp_sk_exit(struct net *net) 2778046ee902SDenis V. 
Lunev { 2779bdbbb852SEric Dumazet int cpu; 2780bdbbb852SEric Dumazet 2781b506bc97SDust Li if (net->ipv4.tcp_congestion_control) 27820baf26b0SMartin KaFai Lau bpf_module_put(net->ipv4.tcp_congestion_control, 27830baf26b0SMartin KaFai Lau net->ipv4.tcp_congestion_control->owner); 27846670e152SStephen Hemminger 2785bdbbb852SEric Dumazet for_each_possible_cpu(cpu) 2786bdbbb852SEric Dumazet inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu)); 2787bdbbb852SEric Dumazet free_percpu(net->ipv4.tcp_sk); 2788bdbbb852SEric Dumazet } 2789bdbbb852SEric Dumazet 2790bdbbb852SEric Dumazet static int __net_init tcp_sk_init(struct net *net) 2791bdbbb852SEric Dumazet { 2792fee83d09SHaishuang Yan int res, cpu, cnt; 2793bdbbb852SEric Dumazet 2794bdbbb852SEric Dumazet net->ipv4.tcp_sk = alloc_percpu(struct sock *); 2795bdbbb852SEric Dumazet if (!net->ipv4.tcp_sk) 2796bdbbb852SEric Dumazet return -ENOMEM; 2797bdbbb852SEric Dumazet 2798bdbbb852SEric Dumazet for_each_possible_cpu(cpu) { 2799bdbbb852SEric Dumazet struct sock *sk; 2800bdbbb852SEric Dumazet 2801bdbbb852SEric Dumazet res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW, 2802bdbbb852SEric Dumazet IPPROTO_TCP, net); 2803bdbbb852SEric Dumazet if (res) 2804bdbbb852SEric Dumazet goto fail; 2805a9d6532bSEric Dumazet sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); 2806431280eeSEric Dumazet 2807431280eeSEric Dumazet /* Please enforce IP_DF and IPID==0 for RST and 2808431280eeSEric Dumazet * ACK sent in SYN-RECV and TIME-WAIT state. 
2809431280eeSEric Dumazet */ 2810431280eeSEric Dumazet inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO; 2811431280eeSEric Dumazet 2812bdbbb852SEric Dumazet *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk; 2813bdbbb852SEric Dumazet } 281449213555SDaniel Borkmann 2815bdbbb852SEric Dumazet net->ipv4.sysctl_tcp_ecn = 2; 281649213555SDaniel Borkmann net->ipv4.sysctl_tcp_ecn_fallback = 1; 281749213555SDaniel Borkmann 2818b0f9ca53SFan Du net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; 28195f3e2bf0SEric Dumazet net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS; 28206b58e0a5SFan Du net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; 282105cbc0dbSFan Du net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL; 2822c04b79b6SJosh Hunt net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS; 2823bdbbb852SEric Dumazet 282413b287e8SNikolay Borisov net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME; 28259bd6861bSNikolay Borisov net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES; 2826b840d15dSNikolay Borisov net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL; 282713b287e8SNikolay Borisov 28286fa25166SNikolay Borisov net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES; 28297c083ecbSNikolay Borisov net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES; 28300aca737dSDavid S. 
Miller net->ipv4.sysctl_tcp_syncookies = 1; 28311043e25fSNikolay Borisov net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH; 2832ae5c3f40SNikolay Borisov net->ipv4.sysctl_tcp_retries1 = TCP_RETR1; 2833c6214a97SNikolay Borisov net->ipv4.sysctl_tcp_retries2 = TCP_RETR2; 2834c402d9beSNikolay Borisov net->ipv4.sysctl_tcp_orphan_retries = 0; 28351e579caaSNikolay Borisov net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; 28364979f2d9SNikolay Borisov net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; 283779e9fed4SMaciej Żenczykowski net->ipv4.sysctl_tcp_tw_reuse = 2; 283865e6d901SKevin(Yudong) Yang net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1; 283912ed8244SNikolay Borisov 2840fee83d09SHaishuang Yan cnt = tcp_hashinfo.ehash_mask + 1; 2841743e4815SYafang Shao net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2; 28421946e672SHaishuang Yan net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo; 28431946e672SHaishuang Yan 2844623d0c2dSEric Dumazet net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128); 2845f9301034SEric Dumazet net->ipv4.sysctl_tcp_sack = 1; 28469bb37ef0SEric Dumazet net->ipv4.sysctl_tcp_window_scaling = 1; 28475d2ed052SEric Dumazet net->ipv4.sysctl_tcp_timestamps = 1; 28482ae21cf5SEric Dumazet net->ipv4.sysctl_tcp_early_retrans = 3; 2849e20223f1SEric Dumazet net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION; 2850b510f0d2SEric Dumazet net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. 
*/ 2851e0a1e5b5SEric Dumazet net->ipv4.sysctl_tcp_retrans_collapse = 1; 2852c6e21803SEric Dumazet net->ipv4.sysctl_tcp_max_reordering = 300; 28536496f6bdSEric Dumazet net->ipv4.sysctl_tcp_dsack = 1; 28540c12654aSEric Dumazet net->ipv4.sysctl_tcp_app_win = 31; 285594f0893eSEric Dumazet net->ipv4.sysctl_tcp_adv_win_scale = 1; 2856af9b69a7SEric Dumazet net->ipv4.sysctl_tcp_frto = 2; 28574540c0cfSEric Dumazet net->ipv4.sysctl_tcp_moderate_rcvbuf = 1; 2858d06a9904SEric Dumazet /* This limits the percentage of the congestion window which we 2859d06a9904SEric Dumazet * will allow a single TSO frame to consume. Building TSO frames 2860d06a9904SEric Dumazet * which are too large can cause TCP streams to be bursty. 2861d06a9904SEric Dumazet */ 2862d06a9904SEric Dumazet net->ipv4.sysctl_tcp_tso_win_divisor = 3; 2863c73e5807SEric Dumazet /* Default TSQ limit of 16 TSO segments */ 2864c73e5807SEric Dumazet net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536; 2865b530b681SEric Dumazet /* rfc5961 challenge ack rate limiting */ 2866b530b681SEric Dumazet net->ipv4.sysctl_tcp_challenge_ack_limit = 1000; 286726e9596eSEric Dumazet net->ipv4.sysctl_tcp_min_tso_segs = 2; 2868bd239704SEric Dumazet net->ipv4.sysctl_tcp_min_rtt_wlen = 300; 2869790f00e1SEric Dumazet net->ipv4.sysctl_tcp_autocorking = 1; 28704170ba6bSEric Dumazet net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; 287123a7102aSEric Dumazet net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; 2872c26e91f8SEric Dumazet net->ipv4.sysctl_tcp_pacing_ca_ratio = 120; 2873356d1833SEric Dumazet if (net != &init_net) { 2874356d1833SEric Dumazet memcpy(net->ipv4.sysctl_tcp_rmem, 2875356d1833SEric Dumazet init_net.ipv4.sysctl_tcp_rmem, 2876356d1833SEric Dumazet sizeof(init_net.ipv4.sysctl_tcp_rmem)); 2877356d1833SEric Dumazet memcpy(net->ipv4.sysctl_tcp_wmem, 2878356d1833SEric Dumazet init_net.ipv4.sysctl_tcp_wmem, 2879356d1833SEric Dumazet sizeof(init_net.ipv4.sysctl_tcp_wmem)); 2880356d1833SEric Dumazet } 28816d82aa24SEric Dumazet 
net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC; 2882a70437ccSEric Dumazet net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC; 28839c21d2fcSEric Dumazet net->ipv4.sysctl_tcp_comp_sack_nr = 44; 2884e1cfcbe8SHaishuang Yan net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; 288543713848SHaishuang Yan spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); 28863733be14SHaishuang Yan net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60; 28873733be14SHaishuang Yan atomic_set(&net->ipv4.tfo_active_disable_times, 0); 2888e1cfcbe8SHaishuang Yan 28896670e152SStephen Hemminger /* Reno is always built in */ 28906670e152SStephen Hemminger if (!net_eq(net, &init_net) && 28910baf26b0SMartin KaFai Lau bpf_try_module_get(init_net.ipv4.tcp_congestion_control, 28920baf26b0SMartin KaFai Lau init_net.ipv4.tcp_congestion_control->owner)) 28936670e152SStephen Hemminger net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control; 28946670e152SStephen Hemminger else 28956670e152SStephen Hemminger net->ipv4.tcp_congestion_control = &tcp_reno; 28966670e152SStephen Hemminger 289749213555SDaniel Borkmann return 0; 2898bdbbb852SEric Dumazet fail: 2899bdbbb852SEric Dumazet tcp_sk_exit(net); 2900bdbbb852SEric Dumazet 2901bdbbb852SEric Dumazet return res; 2902b099ce26SEric W. Biederman } 2903b099ce26SEric W. Biederman 2904b099ce26SEric W. Biederman static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) 2905b099ce26SEric W. Biederman { 290643713848SHaishuang Yan struct net *net; 290743713848SHaishuang Yan 29081946e672SHaishuang Yan inet_twsk_purge(&tcp_hashinfo, AF_INET); 290943713848SHaishuang Yan 291043713848SHaishuang Yan list_for_each_entry(net, net_exit_list, exit_list) 291143713848SHaishuang Yan tcp_fastopen_ctx_destroy(net); 2912046ee902SDenis V. Lunev } 2913046ee902SDenis V. Lunev 2914046ee902SDenis V. Lunev static struct pernet_operations __net_initdata tcp_sk_ops = { 2915046ee902SDenis V. Lunev .init = tcp_sk_init, 2916046ee902SDenis V. 
Lunev .exit = tcp_sk_exit, 2917b099ce26SEric W. Biederman .exit_batch = tcp_sk_exit_batch, 2918046ee902SDenis V. Lunev }; 2919046ee902SDenis V. Lunev 292052d87d5fSYonghong Song #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 292152d87d5fSYonghong Song DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta, 292252d87d5fSYonghong Song struct sock_common *sk_common, uid_t uid) 292352d87d5fSYonghong Song 292452d87d5fSYonghong Song static int bpf_iter_init_tcp(void *priv_data) 292552d87d5fSYonghong Song { 292652d87d5fSYonghong Song struct tcp_iter_state *st = priv_data; 292752d87d5fSYonghong Song struct tcp_seq_afinfo *afinfo; 292852d87d5fSYonghong Song int ret; 292952d87d5fSYonghong Song 293052d87d5fSYonghong Song afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN); 293152d87d5fSYonghong Song if (!afinfo) 293252d87d5fSYonghong Song return -ENOMEM; 293352d87d5fSYonghong Song 293452d87d5fSYonghong Song afinfo->family = AF_UNSPEC; 293552d87d5fSYonghong Song st->bpf_seq_afinfo = afinfo; 293652d87d5fSYonghong Song ret = bpf_iter_init_seq_net(priv_data); 293752d87d5fSYonghong Song if (ret) 293852d87d5fSYonghong Song kfree(afinfo); 293952d87d5fSYonghong Song return ret; 294052d87d5fSYonghong Song } 294152d87d5fSYonghong Song 294252d87d5fSYonghong Song static void bpf_iter_fini_tcp(void *priv_data) 294352d87d5fSYonghong Song { 294452d87d5fSYonghong Song struct tcp_iter_state *st = priv_data; 294552d87d5fSYonghong Song 294652d87d5fSYonghong Song kfree(st->bpf_seq_afinfo); 294752d87d5fSYonghong Song bpf_iter_fini_seq_net(priv_data); 294852d87d5fSYonghong Song } 294952d87d5fSYonghong Song 2950*14fc6bd6SYonghong Song static const struct bpf_iter_seq_info tcp_seq_info = { 295152d87d5fSYonghong Song .seq_ops = &bpf_iter_tcp_seq_ops, 295252d87d5fSYonghong Song .init_seq_private = bpf_iter_init_tcp, 295352d87d5fSYonghong Song .fini_seq_private = bpf_iter_fini_tcp, 295452d87d5fSYonghong Song .seq_priv_size = sizeof(struct tcp_iter_state), 2955*14fc6bd6SYonghong Song }; 
2956*14fc6bd6SYonghong Song 2957*14fc6bd6SYonghong Song static struct bpf_iter_reg tcp_reg_info = { 2958*14fc6bd6SYonghong Song .target = "tcp", 295952d87d5fSYonghong Song .ctx_arg_info_size = 1, 296052d87d5fSYonghong Song .ctx_arg_info = { 296152d87d5fSYonghong Song { offsetof(struct bpf_iter__tcp, sk_common), 296252d87d5fSYonghong Song PTR_TO_BTF_ID_OR_NULL }, 296352d87d5fSYonghong Song }, 2964*14fc6bd6SYonghong Song .seq_info = &tcp_seq_info, 296552d87d5fSYonghong Song }; 296652d87d5fSYonghong Song 296752d87d5fSYonghong Song static void __init bpf_iter_register(void) 296852d87d5fSYonghong Song { 2969951cf368SYonghong Song tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON]; 297052d87d5fSYonghong Song if (bpf_iter_reg_target(&tcp_reg_info)) 297152d87d5fSYonghong Song pr_warn("Warning: could not register bpf iterator tcp\n"); 297252d87d5fSYonghong Song } 297352d87d5fSYonghong Song 297452d87d5fSYonghong Song #endif 297552d87d5fSYonghong Song 29769b0f976fSDenis V. Lunev void __init tcp_v4_init(void) 29771da177e4SLinus Torvalds { 29786a1b3054SEric W. Biederman if (register_pernet_subsys(&tcp_sk_ops)) 29791da177e4SLinus Torvalds panic("Failed to create the TCP control socket.\n"); 298052d87d5fSYonghong Song 298152d87d5fSYonghong Song #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 298252d87d5fSYonghong Song bpf_iter_register(); 298352d87d5fSYonghong Song #endif 29841da177e4SLinus Torvalds } 2985