12874c5fdSThomas Gleixner // SPDX-License-Identifier: GPL-2.0-or-later
21da177e4SLinus Torvalds /*
31da177e4SLinus Torvalds * INET An implementation of the TCP/IP protocol suite for the LINUX
41da177e4SLinus Torvalds * operating system. INET is implemented using the BSD Socket
51da177e4SLinus Torvalds * interface as the means of communication with the user level.
61da177e4SLinus Torvalds *
71da177e4SLinus Torvalds * Implementation of the Transmission Control Protocol(TCP).
81da177e4SLinus Torvalds *
91da177e4SLinus Torvalds * IPv4 specific functions
101da177e4SLinus Torvalds *
111da177e4SLinus Torvalds * code split from:
121da177e4SLinus Torvalds * linux/ipv4/tcp.c
131da177e4SLinus Torvalds * linux/ipv4/tcp_input.c
141da177e4SLinus Torvalds * linux/ipv4/tcp_output.c
151da177e4SLinus Torvalds *
161da177e4SLinus Torvalds * See tcp.c for author information
171da177e4SLinus Torvalds */
181da177e4SLinus Torvalds
191da177e4SLinus Torvalds /*
201da177e4SLinus Torvalds * Changes:
211da177e4SLinus Torvalds * David S. Miller : New socket lookup architecture.
221da177e4SLinus Torvalds * This code is dedicated to John Dyson.
231da177e4SLinus Torvalds * David S. Miller : Change semantics of established hash,
241da177e4SLinus Torvalds * half is devoted to TIME_WAIT sockets
251da177e4SLinus Torvalds * and the rest go in the other half.
261da177e4SLinus Torvalds * Andi Kleen : Add support for syncookies and fixed
271da177e4SLinus Torvalds * some bugs: ip options weren't passed to
281da177e4SLinus Torvalds * the TCP layer, missed a check for an
291da177e4SLinus Torvalds * ACK bit.
301da177e4SLinus Torvalds * Andi Kleen : Implemented fast path mtu discovery.
311da177e4SLinus Torvalds * Fixed many serious bugs in the
3260236fddSArnaldo Carvalho de Melo * request_sock handling and moved
331da177e4SLinus Torvalds * most of it into the af independent code.
341da177e4SLinus Torvalds * Added tail drop and some other bugfixes.
35caa20d9aSStephen Hemminger * Added new listen semantics.
361da177e4SLinus Torvalds * Mike McLagan : Routing by source
371da177e4SLinus Torvalds * Juan Jose Ciarlante: ip_dynaddr bits
381da177e4SLinus Torvalds * Andi Kleen: various fixes.
391da177e4SLinus Torvalds * Vitaly E. Lavrov : Transparent proxy revived after year
401da177e4SLinus Torvalds * coma.
411da177e4SLinus Torvalds * Andi Kleen : Fix new listen.
421da177e4SLinus Torvalds * Andi Kleen : Fix accept error reporting.
431da177e4SLinus Torvalds * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
441da177e4SLinus Torvalds * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
451da177e4SLinus Torvalds * a single port at the same time.
461da177e4SLinus Torvalds */
471da177e4SLinus Torvalds
48afd46503SJoe Perches #define pr_fmt(fmt) "TCP: " fmt
491da177e4SLinus Torvalds
50eb4dea58SHerbert Xu #include <linux/bottom_half.h>
511da177e4SLinus Torvalds #include <linux/types.h>
521da177e4SLinus Torvalds #include <linux/fcntl.h>
531da177e4SLinus Torvalds #include <linux/module.h>
541da177e4SLinus Torvalds #include <linux/random.h>
551da177e4SLinus Torvalds #include <linux/cache.h>
561da177e4SLinus Torvalds #include <linux/jhash.h>
571da177e4SLinus Torvalds #include <linux/init.h>
581da177e4SLinus Torvalds #include <linux/times.h>
595a0e3ad6STejun Heo #include <linux/slab.h>
609f4a7c93SJian Wen #include <linux/sched.h>
611da177e4SLinus Torvalds
62457c4cbcSEric W. Biederman #include <net/net_namespace.h>
631da177e4SLinus Torvalds #include <net/icmp.h>
64304a1618SArnaldo Carvalho de Melo #include <net/inet_hashtables.h>
651da177e4SLinus Torvalds #include <net/tcp.h>
6620380731SArnaldo Carvalho de Melo #include <net/transp_v6.h>
671da177e4SLinus Torvalds #include <net/ipv6.h>
681da177e4SLinus Torvalds #include <net/inet_common.h>
696d6ee43eSArnaldo Carvalho de Melo #include <net/timewait_sock.h>
701da177e4SLinus Torvalds #include <net/xfrm.h>
716e5714eaSDavid S. Miller #include <net/secure_seq.h>
72076bb0c8SEliezer Tamir #include <net/busy_poll.h>
736be49deaSJason Xing #include <net/rstreason.h>
741da177e4SLinus Torvalds
751da177e4SLinus Torvalds #include <linux/inet.h>
761da177e4SLinus Torvalds #include <linux/ipv6.h>
771da177e4SLinus Torvalds #include <linux/stddef.h>
781da177e4SLinus Torvalds #include <linux/proc_fs.h>
791da177e4SLinus Torvalds #include <linux/seq_file.h>
806797318eSIvan Delalande #include <linux/inetdevice.h>
81951cf368SYonghong Song #include <linux/btf_ids.h>
82*8f0b3cc9SMina Almasry #include <linux/skbuff_ref.h>
831da177e4SLinus Torvalds
84cf80e0e4SHerbert Xu #include <crypto/hash.h>
85cfb6eeb4SYOSHIFUJI Hideaki #include <linux/scatterlist.h>
86cfb6eeb4SYOSHIFUJI Hideaki
87c24b14c4SSong Liu #include <trace/events/tcp.h>
88c24b14c4SSong Liu
89cfb6eeb4SYOSHIFUJI Hideaki #ifdef CONFIG_TCP_MD5SIG
90a915da9bSEric Dumazet static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
91318cf7aaSEric Dumazet __be32 daddr, __be32 saddr, const struct tcphdr *th);
92cfb6eeb4SYOSHIFUJI Hideaki #endif
93cfb6eeb4SYOSHIFUJI Hideaki
945caea4eaSEric Dumazet struct inet_hashinfo tcp_hashinfo;
954bc2f18bSEric Dumazet EXPORT_SYMBOL(tcp_hashinfo);
961da177e4SLinus Torvalds
97ebad6d03SSebastian Andrzej Siewior static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = {
98ebad6d03SSebastian Andrzej Siewior .bh_lock = INIT_LOCAL_LOCK(bh_lock),
99ebad6d03SSebastian Andrzej Siewior };
10037ba017dSEric Dumazet
101565d121bSFlorian Westphal static DEFINE_MUTEX(tcp_exit_batch_mutex);
102565d121bSFlorian Westphal
tcp_v4_init_seq(const struct sk_buff * skb)10384b114b9SEric Dumazet static u32 tcp_v4_init_seq(const struct sk_buff *skb)
1041da177e4SLinus Torvalds {
10584b114b9SEric Dumazet return secure_tcp_seq(ip_hdr(skb)->daddr,
106eddc9ec5SArnaldo Carvalho de Melo ip_hdr(skb)->saddr,
107aa8223c7SArnaldo Carvalho de Melo tcp_hdr(skb)->dest,
10884b114b9SEric Dumazet tcp_hdr(skb)->source);
10984b114b9SEric Dumazet }
11084b114b9SEric Dumazet
tcp_v4_init_ts_off(const struct net * net,const struct sk_buff * skb)1115d2ed052SEric Dumazet static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
11284b114b9SEric Dumazet {
1135d2ed052SEric Dumazet return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
1141da177e4SLinus Torvalds }
1151da177e4SLinus Torvalds
/* Decide whether a new connection attempt may reuse the 4-tuple currently
 * held by TIME-WAIT socket @sktw.
 *
 * @sk:   the connecting socket
 * @sktw: the TIME-WAIT socket occupying the desired 4-tuple
 * @twp:  opaque token from the caller; NULL means the caller already
 *        committed to this tuple, so only PAWS safety is required
 *
 * Returns 1 if the TIME-WAIT socket may be displaced (a reference on it
 * has then been taken for the caller), 0 otherwise.
 */
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	/* sysctl_tcp_tw_reuse: 0 = never, 1 = reuse, 2 = loopback only.
	 * READ_ONCE: the sysctl may change concurrently.
	 */
	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
	const struct inet_timewait_sock *tw = inet_twsk(sktw);
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);
	int ts_recent_stamp;

	/* A FIN_WAIT2 "timewait" socket is not a real TIME-WAIT yet;
	 * never allow reuse in that sub-state.
	 */
	if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2)
		reuse = 0;

	if (reuse == 2) {
		/* Still does not detect *everything* that goes through
		 * lo, since we require a loopback src or dst address
		 * or direct binding to 'lo' interface.
		 */
		bool loopback = false;
		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
			loopback = true;
#if IS_ENABLED(CONFIG_IPV6)
		if (tw->tw_family == AF_INET6) {
			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
				loopback = true;
		} else
#endif
		{
			if (ipv4_is_loopback(tw->tw_daddr) ||
			    ipv4_is_loopback(tw->tw_rcv_saddr))
				loopback = true;
		}
		if (!loopback)
			reuse = 0;
	}

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp);
	if (ts_recent_stamp &&
	    (!twp || (reuse && time_after32(ktime_get_seconds(),
					    ts_recent_stamp)))) {
		/* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk
		 * and releasing the bucket lock.
		 */
		if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
			return 0;

		/* In case of repair and re-using TIME-WAIT sockets we still
		 * want to be sure that it is safe as above but honor the
		 * sequence numbers and time stamps set as part of the repair
		 * process.
		 *
		 * Without this check re-using a TIME-WAIT socket with TCP
		 * repair would accumulate a -1 on the repair assigned
		 * sequence number. The first time it is reused the sequence
		 * is -1, the second time -2, etc. This fixes that issue
		 * without appearing to create any others.
		 */
		if (likely(!tp->repair)) {
			/* +65535+2 keeps the new ISN safely beyond the old
			 * connection's sequence space; 0 is reserved.
			 */
			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;

			if (!seq)
				seq = 1;
			WRITE_ONCE(tp->write_seq, seq);
			/* Inherit the TIME-WAIT bucket's timestamp state so
			 * PAWS keeps rejecting old duplicates.
			 */
			tp->rx_opt.ts_recent = READ_ONCE(tcptw->tw_ts_recent);
			tp->rx_opt.ts_recent_stamp = ts_recent_stamp;
		}

		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);
2016d6ee43eSArnaldo Carvalho de Melo
/* Pre-connect hook: give BPF cgroup programs a chance to inspect or
 * rewrite the destination address before tcp_v4_connect() runs.
 * Returns 0 or a negative errno from the BPF program.
 */
static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
			      int addr_len)
{
	/* This check is replicated from tcp_v4_connect() and intended to
	 * prevent BPF program called below from accessing bytes that are out
	 * of the bound specified by user in addr_len.
	 */
	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	/* Caller must hold the socket lock. */
	sock_owned_by_me(sk);

	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
}
216d74bad4eSAndrey Ignatov
/* This will initiate an outgoing connection.
 *
 * tcp_v4_connect - connect an IPv4 TCP socket to @uaddr
 * @sk:       socket to connect (locked by caller)
 * @uaddr:    destination, must be a struct sockaddr_in
 * @addr_len: length of @uaddr as supplied by userspace
 *
 * Resolves the route, binds a source address/port, hashes the socket,
 * and sends the SYN (or defers it for TCP Fast Open).
 * Returns 0 on success or a negative errno; on failure the socket is
 * returned to TCP_CLOSE with its port released.
 */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_timewait_death_row *tcp_death_row;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct ip_options_rcu *inet_opt;
	struct net *net = sock_net(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	/* Socket lock stands in for RCU read side here. */
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	if (inet_opt && inet_opt->opt.srr) {
		/* Strict/loose source routing: route to the first hop
		 * from the IP options, not to the final destination.
		 */
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
			      orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	/* TCP never connects to multicast/broadcast destinations. */
	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

	if (!inet->inet_saddr) {
		/* Adopt the route-selected source address and fix up the
		 * bind hash (bhash2) entry accordingly.
		 */
		err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET);
		if (err) {
			ip_rt_put(rt);
			return err;
		}
	} else {
		sk_rcv_saddr_set(sk, inet->inet_saddr);
	}

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			WRITE_ONCE(tp->write_seq, 0);
	}

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and not releasing socket
	 * lock select source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	/* Re-check the route now that the (possibly autoselected)
	 * source port is known.
	 */
	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst);
	/* OK, now commit destination to socket. */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	/* sk_setup_caps() took ownership of the dst reference. */
	rt = NULL;

	if (likely(!tp->repair)) {
		if (!tp->write_seq)
			WRITE_ONCE(tp->write_seq,
				   secure_tcp_seq(inet->inet_saddr,
						  inet->inet_daddr,
						  inet->inet_sport,
						  usin->sin_port));
		WRITE_ONCE(tp->tsoffset,
			   secure_tcp_ts_off(net, inet->inet_saddr,
					     inet->inet_daddr));
	}

	atomic_set(&inet->inet_id, get_random_u16());

	/* TFO may defer the actual SYN until sendmsg() supplies data. */
	if (tcp_fastopen_defer_connect(sk, &err))
		return err;
	if (err)
		goto failure;

	err = tcp_connect(sk);

	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	inet_bhash2_reset_saddr(sk);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);
3611da177e4SLinus Torvalds
/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct dst_entry *dst;
	u32 mtu;

	/* LISTEN/CLOSE sockets have no path MTU state to update. */
	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
		return;
	/* mtu_info was stored by tcp_v4_err(); READ_ONCE pairs with its
	 * WRITE_ONCE.
	 */
	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to be wrong... Remember soft error
	 * for the case, if this connection will not able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);
4021da177e4SLinus Torvalds
do_redirect(struct sk_buff * skb,struct sock * sk)40355be7a9cSDavid S. Miller static void do_redirect(struct sk_buff *skb, struct sock *sk)
40455be7a9cSDavid S. Miller {
40555be7a9cSDavid S. Miller struct dst_entry *dst = __sk_dst_check(sk, 0);
40655be7a9cSDavid S. Miller
4071ed5c48fSDavid S. Miller if (dst)
4086700c270SDavid S. Miller dst->ops->redirect(dst, sk, skb);
40955be7a9cSDavid S. Miller }
41055be7a9cSDavid S. Miller
41126e37360SEric Dumazet
/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets
 *
 * @sk:    the request socket (holds a reference we must drop)
 * @seq:   sequence number echoed in the ICMP payload
 * @abort: drop the request if the sequence check passes
 */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);

	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	 */
	if (seq != tcp_rsk(req)->snt_isn) {
		/* Sequence mismatch: ICMP does not refer to our SYN. */
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
	} else if (abort) {
		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		tcp_listendrop(req->rsk_listener);
	}
	/* Release the reference taken by the lookup in the caller. */
	reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);
43626e37360SEric Dumazet
/* TCP-LD (RFC 6069) logic
 *
 * Revert one step of exponential RTO backoff when an ICMP unreachable
 * suggests the loss was due to a temporary link outage rather than
 * congestion. @seq must match snd_una for the revert to apply.
 */
void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	s32 remaining;
	u32 delta_us;

	/* If userspace owns the socket we cannot safely touch timers here. */
	if (sock_owned_by_user(sk))
		return;

	/* Only revert while we are actually retransmitting with backoff
	 * and the ICMP refers to the head of the outstanding data.
	 */
	if (seq != tp->snd_una || !icsk->icsk_retransmits ||
	    !icsk->icsk_backoff)
		return;

	skb = tcp_rtx_queue_head(sk);
	if (WARN_ON_ONCE(!skb))
		return;

	/* Undo one backoff step and recompute the RTO from srtt. */
	icsk->icsk_backoff--;
	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

	/* Re-arm the retransmit timer for whatever part of the reverted
	 * RTO has not already elapsed since the head skb was sent.
	 */
	tcp_mstamp_refresh(tp);
	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);

	if (remaining > 0) {
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
					  remaining, TCP_RTO_MAX);
	} else {
		/* RTO revert clocked out retransmission.
		 * Will retransmit now.
		 */
		tcp_retransmit_timer(sk);
	}
}
EXPORT_SYMBOL(tcp_ld_RTO_revert);
476f7456642SEric Dumazet
4771da177e4SLinus Torvalds /*
4781da177e4SLinus Torvalds * This routine is called by the ICMP module when it gets some
4791da177e4SLinus Torvalds * sort of error condition. If err < 0 then the socket should
4801da177e4SLinus Torvalds * be closed and the error returned to the user. If err > 0
4811da177e4SLinus Torvalds * it's just the icmp type << 8 | icmp code. After adjustment
4821da177e4SLinus Torvalds * header points to the first 8 bytes of the tcp header. We need
4831da177e4SLinus Torvalds * to find the appropriate port.
4841da177e4SLinus Torvalds *
4851da177e4SLinus Torvalds * The locking strategy used here is very "optimistic". When
4861da177e4SLinus Torvalds * someone else accesses the socket the ICMP is just dropped
4871da177e4SLinus Torvalds * and for some paths there is no check at all.
4881da177e4SLinus Torvalds * A more general error queue to queue errors for later handling
4891da177e4SLinus Torvalds * is probably better.
4901da177e4SLinus Torvalds *
4911da177e4SLinus Torvalds */
4921da177e4SLinus Torvalds
tcp_v4_err(struct sk_buff * skb,u32 info)493a12daf13SEric Dumazet int tcp_v4_err(struct sk_buff *skb, u32 info)
4941da177e4SLinus Torvalds {
495a12daf13SEric Dumazet const struct iphdr *iph = (const struct iphdr *)skb->data;
496a12daf13SEric Dumazet struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
4971da177e4SLinus Torvalds struct tcp_sock *tp;
498a12daf13SEric Dumazet const int type = icmp_hdr(skb)->type;
499a12daf13SEric Dumazet const int code = icmp_hdr(skb)->code;
5001da177e4SLinus Torvalds struct sock *sk;
5010a672f74SYuchung Cheng struct request_sock *fastopen;
5029a568de4SEric Dumazet u32 seq, snd_una;
5031da177e4SLinus Torvalds int err;
504a12daf13SEric Dumazet struct net *net = dev_net(skb->dev);
5051da177e4SLinus Torvalds
5064461568aSKuniyuki Iwashima sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
5074461568aSKuniyuki Iwashima iph->daddr, th->dest, iph->saddr,
5084461568aSKuniyuki Iwashima ntohs(th->source), inet_iif(skb), 0);
5091da177e4SLinus Torvalds if (!sk) {
5105d3848bcSEric Dumazet __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
51132bbd879SStefano Brivio return -ENOENT;
5121da177e4SLinus Torvalds }
5131da177e4SLinus Torvalds if (sk->sk_state == TCP_TIME_WAIT) {
514953af8e3SDmitry Safonov /* To increase the counter of ignored icmps for TCP-AO */
515953af8e3SDmitry Safonov tcp_ao_ignore_icmp(sk, AF_INET, type, code);
5169469c7b4SYOSHIFUJI Hideaki inet_twsk_put(inet_twsk(sk));
51732bbd879SStefano Brivio return 0;
5181da177e4SLinus Torvalds }
51926e37360SEric Dumazet seq = ntohl(th->seq);
52032bbd879SStefano Brivio if (sk->sk_state == TCP_NEW_SYN_RECV) {
52132bbd879SStefano Brivio tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
5229cf74903SEric Dumazet type == ICMP_TIME_EXCEEDED ||
5239cf74903SEric Dumazet (type == ICMP_DEST_UNREACH &&
5249cf74903SEric Dumazet (code == ICMP_NET_UNREACH ||
5259cf74903SEric Dumazet code == ICMP_HOST_UNREACH)));
52632bbd879SStefano Brivio return 0;
52732bbd879SStefano Brivio }
5281da177e4SLinus Torvalds
529953af8e3SDmitry Safonov if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) {
530953af8e3SDmitry Safonov sock_put(sk);
531953af8e3SDmitry Safonov return 0;
532953af8e3SDmitry Safonov }
533953af8e3SDmitry Safonov
5341da177e4SLinus Torvalds bh_lock_sock(sk);
5351da177e4SLinus Torvalds /* If too many ICMPs get dropped on busy
5361da177e4SLinus Torvalds * servers this needs to be solved differently.
537563d34d0SEric Dumazet * We do take care of PMTU discovery (RFC1191) special case :
538563d34d0SEric Dumazet * we can receive locally generated ICMP messages while socket is held.
5391da177e4SLinus Torvalds */
540b74aa930SEric Dumazet if (sock_owned_by_user(sk)) {
541b74aa930SEric Dumazet if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
54202a1d6e7SEric Dumazet __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
543b74aa930SEric Dumazet }
5441da177e4SLinus Torvalds if (sk->sk_state == TCP_CLOSE)
5451da177e4SLinus Torvalds goto out;
5461da177e4SLinus Torvalds
547020e71a3SEric Dumazet if (static_branch_unlikely(&ip4_min_ttl)) {
54814834c4fSEric Dumazet /* min_ttl can be changed concurrently from do_ip_setsockopt() */
54914834c4fSEric Dumazet if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
55002a1d6e7SEric Dumazet __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
55197e3ecd1Sstephen hemminger goto out;
55297e3ecd1Sstephen hemminger }
553020e71a3SEric Dumazet }
55497e3ecd1Sstephen hemminger
5551da177e4SLinus Torvalds tp = tcp_sk(sk);
5560a672f74SYuchung Cheng /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
557d983ea6fSEric Dumazet fastopen = rcu_dereference(tp->fastopen_rsk);
5580a672f74SYuchung Cheng snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
5591da177e4SLinus Torvalds if (sk->sk_state != TCP_LISTEN &&
5600a672f74SYuchung Cheng !between(seq, snd_una, tp->snd_nxt)) {
56102a1d6e7SEric Dumazet __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
5621da177e4SLinus Torvalds goto out;
5631da177e4SLinus Torvalds }
5641da177e4SLinus Torvalds
5651da177e4SLinus Torvalds switch (type) {
56655be7a9cSDavid S. Miller case ICMP_REDIRECT:
56745caeaa5SJon Maxwell if (!sock_owned_by_user(sk))
568a12daf13SEric Dumazet do_redirect(skb, sk);
56955be7a9cSDavid S. Miller goto out;
5701da177e4SLinus Torvalds case ICMP_SOURCE_QUENCH:
5711da177e4SLinus Torvalds /* Just silently ignore these. */
5721da177e4SLinus Torvalds goto out;
5731da177e4SLinus Torvalds case ICMP_PARAMETERPROB:
5741da177e4SLinus Torvalds err = EPROTO;
5751da177e4SLinus Torvalds break;
5761da177e4SLinus Torvalds case ICMP_DEST_UNREACH:
5771da177e4SLinus Torvalds if (code > NR_ICMP_UNREACH)
5781da177e4SLinus Torvalds goto out;
5791da177e4SLinus Torvalds
5801da177e4SLinus Torvalds if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
5810d4f0608SEric Dumazet /* We are not interested in TCP_LISTEN and open_requests
5820d4f0608SEric Dumazet * (SYN-ACKs send out by Linux are always <576bytes so
5830d4f0608SEric Dumazet * they should go through unfragmented).
5840d4f0608SEric Dumazet */
5850d4f0608SEric Dumazet if (sk->sk_state == TCP_LISTEN)
5860d4f0608SEric Dumazet goto out;
5870d4f0608SEric Dumazet
588561022acSEric Dumazet WRITE_ONCE(tp->mtu_info, info);
589144d56e9SEric Dumazet if (!sock_owned_by_user(sk)) {
590563d34d0SEric Dumazet tcp_v4_mtu_reduced(sk);
591144d56e9SEric Dumazet } else {
5927aa5470cSEric Dumazet if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
593144d56e9SEric Dumazet sock_hold(sk);
594144d56e9SEric Dumazet }
5951da177e4SLinus Torvalds goto out;
5961da177e4SLinus Torvalds }
5971da177e4SLinus Torvalds
5981da177e4SLinus Torvalds err = icmp_err_convert[code].errno;
599f7456642SEric Dumazet /* check if this ICMP message allows revert of backoff.
600f7456642SEric Dumazet * (see RFC 6069)
601f7456642SEric Dumazet */
602f7456642SEric Dumazet if (!fastopen &&
603f7456642SEric Dumazet (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
604f7456642SEric Dumazet tcp_ld_RTO_revert(sk, seq);
6051da177e4SLinus Torvalds break;
6061da177e4SLinus Torvalds case ICMP_TIME_EXCEEDED:
6071da177e4SLinus Torvalds err = EHOSTUNREACH;
6081da177e4SLinus Torvalds break;
6091da177e4SLinus Torvalds default:
6101da177e4SLinus Torvalds goto out;
6111da177e4SLinus Torvalds }
6121da177e4SLinus Torvalds
6131da177e4SLinus Torvalds switch (sk->sk_state) {
6141da177e4SLinus Torvalds case TCP_SYN_SENT:
6150a672f74SYuchung Cheng case TCP_SYN_RECV:
6160a672f74SYuchung Cheng /* Only in fast or simultaneous open. If a fast open socket is
6172bdcc73cSRandy Dunlap * already accepted it is treated as a connected one below.
6181da177e4SLinus Torvalds */
61951456b29SIan Morris if (fastopen && !fastopen->sk)
6200a672f74SYuchung Cheng break;
6210a672f74SYuchung Cheng
622a12daf13SEric Dumazet ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
62345af29caSEric Dumazet
624fde6f897SEric Dumazet if (!sock_owned_by_user(sk))
625fde6f897SEric Dumazet tcp_done_with_error(sk, err);
626fde6f897SEric Dumazet else
627cee1af82SEric Dumazet WRITE_ONCE(sk->sk_err_soft, err);
6281da177e4SLinus Torvalds goto out;
6291da177e4SLinus Torvalds }
6301da177e4SLinus Torvalds
6311da177e4SLinus Torvalds /* If we've already connected we will keep trying
6321da177e4SLinus Torvalds * until we time out, or the user gives up.
6331da177e4SLinus Torvalds *
6341da177e4SLinus Torvalds * rfc1122 4.2.3.9 allows to consider as hard errors
6351da177e4SLinus Torvalds * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
6361da177e4SLinus Torvalds * but it is obsoleted by pmtu discovery).
6371da177e4SLinus Torvalds *
6381da177e4SLinus Torvalds * Note, that in modern internet, where routing is unreliable
6391da177e4SLinus Torvalds * and in each dark corner broken firewalls sit, sending random
6401da177e4SLinus Torvalds * errors ordered by their masters even this two messages finally lose
6411da177e4SLinus Torvalds * their original sense (even Linux sends invalid PORT_UNREACHs)
6421da177e4SLinus Torvalds *
6431da177e4SLinus Torvalds * Now we are in compliance with RFCs.
6441da177e4SLinus Torvalds * --ANK (980905)
6451da177e4SLinus Torvalds */
6461da177e4SLinus Torvalds
6476b5f43eaSEric Dumazet if (!sock_owned_by_user(sk) &&
6486b5f43eaSEric Dumazet inet_test_bit(RECVERR, sk)) {
649e13ec3daSEric Dumazet WRITE_ONCE(sk->sk_err, err);
650e3ae2365SAlexander Aring sk_error_report(sk);
6511da177e4SLinus Torvalds } else { /* Only an error on timeout */
652cee1af82SEric Dumazet WRITE_ONCE(sk->sk_err_soft, err);
6531da177e4SLinus Torvalds }
6541da177e4SLinus Torvalds
6551da177e4SLinus Torvalds out:
6561da177e4SLinus Torvalds bh_unlock_sock(sk);
6571da177e4SLinus Torvalds sock_put(sk);
65832bbd879SStefano Brivio return 0;
6591da177e4SLinus Torvalds }
6601da177e4SLinus Torvalds
/* Prepare @skb for TCP checksum completion: seed th->check with the
 * (inverted) partial sum over the IPv4 pseudo header computed by
 * tcp_v4_check(), and record via csum_start/csum_offset where the
 * remaining checksum work must be finalized.
 */
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *tcph = tcp_hdr(skb);

	skb->csum_start = skb_transport_header(skb) - skb->head;
	skb->csum_offset = offsetof(struct tcphdr, check);
	tcph->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
}
6691da177e4SLinus Torvalds
670419f9f89SHerbert Xu /* This routine computes an IPv4 TCP checksum. */
tcp_v4_send_check(struct sock * sk,struct sk_buff * skb)671bb296246SHerbert Xu void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
672419f9f89SHerbert Xu {
673cf533ea5SEric Dumazet const struct inet_sock *inet = inet_sk(sk);
674419f9f89SHerbert Xu
675419f9f89SHerbert Xu __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
676419f9f89SHerbert Xu }
6774bc2f18bSEric Dumazet EXPORT_SYMBOL(tcp_v4_send_check);
678419f9f89SHerbert Xu
679ba7783adSDmitry Safonov #define REPLY_OPTIONS_LEN (MAX_TCP_OPTION_SPACE / sizeof(__be32))
680ba7783adSDmitry Safonov
tcp_v4_ao_sign_reset(const struct sock * sk,struct sk_buff * skb,const struct tcp_ao_hdr * aoh,struct ip_reply_arg * arg,struct tcphdr * reply,__be32 reply_options[REPLY_OPTIONS_LEN])681ba7783adSDmitry Safonov static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb,
682ba7783adSDmitry Safonov const struct tcp_ao_hdr *aoh,
683ba7783adSDmitry Safonov struct ip_reply_arg *arg, struct tcphdr *reply,
684ba7783adSDmitry Safonov __be32 reply_options[REPLY_OPTIONS_LEN])
685ba7783adSDmitry Safonov {
686ba7783adSDmitry Safonov #ifdef CONFIG_TCP_AO
687ba7783adSDmitry Safonov int sdif = tcp_v4_sdif(skb);
688ba7783adSDmitry Safonov int dif = inet_iif(skb);
689ba7783adSDmitry Safonov int l3index = sdif ? dif : 0;
690ba7783adSDmitry Safonov bool allocated_traffic_key;
691ba7783adSDmitry Safonov struct tcp_ao_key *key;
692ba7783adSDmitry Safonov char *traffic_key;
693ba7783adSDmitry Safonov bool drop = true;
694ba7783adSDmitry Safonov u32 ao_sne = 0;
695ba7783adSDmitry Safonov u8 keyid;
696ba7783adSDmitry Safonov
697ba7783adSDmitry Safonov rcu_read_lock();
69864382c71SDmitry Safonov if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq),
699ba7783adSDmitry Safonov &key, &traffic_key, &allocated_traffic_key,
700ba7783adSDmitry Safonov &keyid, &ao_sne))
701ba7783adSDmitry Safonov goto out;
702ba7783adSDmitry Safonov
703ba7783adSDmitry Safonov reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) |
704ba7783adSDmitry Safonov (aoh->rnext_keyid << 8) | keyid);
705da7dfaa6SDmitry Safonov arg->iov[0].iov_len += tcp_ao_len_aligned(key);
706ba7783adSDmitry Safonov reply->doff = arg->iov[0].iov_len / 4;
707ba7783adSDmitry Safonov
708ba7783adSDmitry Safonov if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1],
709ba7783adSDmitry Safonov key, traffic_key,
710ba7783adSDmitry Safonov (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
711ba7783adSDmitry Safonov (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
712ba7783adSDmitry Safonov reply, ao_sne))
713ba7783adSDmitry Safonov goto out;
714ba7783adSDmitry Safonov drop = false;
715ba7783adSDmitry Safonov out:
716ba7783adSDmitry Safonov rcu_read_unlock();
717ba7783adSDmitry Safonov if (allocated_traffic_key)
718ba7783adSDmitry Safonov kfree(traffic_key);
719ba7783adSDmitry Safonov return drop;
720ba7783adSDmitry Safonov #else
721ba7783adSDmitry Safonov return true;
722ba7783adSDmitry Safonov #endif
723ba7783adSDmitry Safonov }
724ba7783adSDmitry Safonov
7251da177e4SLinus Torvalds /*
7261da177e4SLinus Torvalds * This routine will send an RST to the other tcp.
7271da177e4SLinus Torvalds *
7281da177e4SLinus Torvalds * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
7291da177e4SLinus Torvalds * for reset.
7301da177e4SLinus Torvalds * Answer: if a packet caused RST, it is not for a socket
7311da177e4SLinus Torvalds * existing in our system, if it is matched to a socket,
7321da177e4SLinus Torvalds * it is just duplicate segment or bug in other side's TCP.
 * So we build the reply based only on the parameters
 * that arrived with the segment.
7351da177e4SLinus Torvalds * Exception: precedence violation. We do not implement it in any case.
7361da177e4SLinus Torvalds */
7371da177e4SLinus Torvalds
static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
			      enum sk_rst_reason reason)
{
	const struct tcphdr *th = tcp_hdr(skb);
	/* On-stack reply: a bare TCP header plus room for the reply options
	 * (MD5, AO or MPTCP reset — at most one of them is emitted).
	 */
	struct {
		struct tcphdr th;
		__be32 opt[REPLY_OPTIONS_LEN];
	} rep;
	const __u8 *md5_hash_location = NULL;
	const struct tcp_ao_hdr *aoh;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	unsigned char newhash[16];
	struct sock *sk1 = NULL;
	int genhash;
#endif
	u64 transmit_time = 0;
	struct sock *ctl_sk;
	struct net *net;
	u32 txhash = 0;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest = th->source;
	rep.th.source = th->dest;
	rep.th.doff = sizeof(struct tcphdr) / 4;
	rep.th.rst = 1;

	/* If the offending segment carried an ACK, reset with its ack_seq;
	 * otherwise ACK everything the segment occupied (SYN/FIN count as
	 * one sequence number each).
	 */
	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);

	/* Invalid TCP option size or twice included auth */
	if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh))
		return;

	/* If the segment was AO-signed, the reset must be too (or dropped). */
	if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt))
		return;

#ifdef CONFIG_TCP_MD5SIG
	rcu_read_lock();
	if (sk && sk_fullsock(sk)) {
		const union tcp_md5_addr *addr;
		int l3index;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and inet_iif is set to it.
		 */
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	} else if (md5_hash_location) {
		const union tcp_md5_addr *addr;
		int sdif = tcp_v4_sdif(skb);
		int dif = inet_iif(skb);
		int l3index;

		/*
		 * active side is lost. Try to find listening socket through
		 * source port, and then find md5 key through listening socket.
		 * we do not loosen security here:
		 * Incoming packet is checked with md5 hash with finding key,
		 * no RST generated if md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
					     NULL, 0, ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), dif, sdif);
		/* don't send rst if it can't find key */
		if (!sk1)
			goto out;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and dif is set to it.
		 */
		l3index = sdif ? dif : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
		if (!key)
			goto out;

		/* Recompute the segment's MD5 hash; bail silently on any
		 * mismatch so unauthenticated peers cannot solicit RSTs.
		 */
		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (genhash || memcmp(md5_hash_location, newhash, 16) != 0)
			goto out;

	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
	if (rep.opt[0] == 0) {
		__be32 mrst = mptcp_reset_option(skb);

		if (mrst) {
			rep.opt[0] = mrst;
			arg.iov[0].iov_len += sizeof(mrst);
			rep.th.doff = arg.iov[0].iov_len / 4;
		}
	}

	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
	if (sk)
		arg.bound_dev_if = sk->sk_bound_dev_if;

	trace_tcp_send_reset(sk, skb, reason);

	/* The cast of sk to inet_timewait_sock below relies on this layout. */
	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

	arg.tos = ip_hdr(skb)->tos;
	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
	/* Borrow this CPU's control socket; BH disabled plus the nested-BH
	 * local lock serialize access to it.
	 */
	local_bh_disable();
	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);

	sock_net_set(ctl_sk, net);
	if (sk) {
		/* Inherit mark/priority/txhash from the originating socket,
		 * taking the timewait variants when sk is a timewait sock.
		 */
		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_mark : sk->sk_mark;
		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
		transmit_time = tcp_transmit_time(sk);
		xfrm_sk_clone_policy(ctl_sk, sk);
		txhash = (sk->sk_state == TCP_TIME_WAIT) ?
			 inet_twsk(sk)->tw_txhash : sk->sk_txhash;
	} else {
		ctl_sk->sk_mark = 0;
		ctl_sk->sk_priority = 0;
	}
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time, txhash);

	xfrm_sk_free_policy(ctl_sk);
	/* Park the shared ctl socket back in init_net before releasing it. */
	sock_net_set(ctl_sk, &init_net);
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
	local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
	rcu_read_unlock();
#endif
}
9281da177e4SLinus Torvalds
9291da177e4SLinus Torvalds /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
9301da177e4SLinus Torvalds outside socket context is ugly, certainly. What can I do?
9311da177e4SLinus Torvalds */
9321da177e4SLinus Torvalds
static void tcp_v4_send_ack(const struct sock *sk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_key *key,
			    int reply_flags, u8 tos, u32 txhash)
{
	const struct tcphdr *th = tcp_hdr(skb);
	/* On-stack reply: TCP header plus the full option space (timestamps
	 * and, optionally, an MD5 or AO signature).
	 */
	struct {
		struct tcphdr th;
		__be32 opt[(MAX_TCP_OPTION_SPACE >> 2)];
	} rep;
	struct net *net = sock_net(sk);
	struct ip_reply_arg arg;
	struct sock *ctl_sk;
	u64 transmit_time;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	/* Echo a timestamp option only when the caller supplied a tsecr. */
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest = th->source;
	rep.th.source = th->dest;
	rep.th.doff = arg.iov[0].iov_len / 4;
	rep.th.seq = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack = 1;
	rep.th.window = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (tcp_key_is_md5(key)) {
		/* Signature option goes after the timestamps, if present. */
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key->md5_key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
#ifdef CONFIG_TCP_AO
	if (tcp_key_is_ao(key)) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_AO << 24) |
					  (tcp_ao_len(key->ao_key) << 16) |
					  (key->ao_key->sndid << 8) |
					  key->rcv_next);
		arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key);
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset],
				key->ao_key, key->traffic_key,
				(union tcp_ao_addr *)&ip_hdr(skb)->saddr,
				(union tcp_ao_addr *)&ip_hdr(skb)->daddr,
				&rep.th, key->sne);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
	/* Borrow this CPU's control socket; BH disabled plus the nested-BH
	 * local lock serialize access to it.
	 */
	local_bh_disable();
	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
	sock_net_set(ctl_sk, net);
	/* Inherit mark/priority from sk, taking the timewait variants when
	 * sk is a timewait sock.
	 */
	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
	transmit_time = tcp_transmit_time(sk);
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time, txhash);

	/* Park the shared ctl socket back in init_net before releasing it. */
	sock_net_set(ctl_sk, &init_net);
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
	local_bh_enable();
}
10351da177e4SLinus Torvalds
/* ACK a segment that hit a TIME_WAIT socket, picking an AO or MD5
 * signing key when one applies.  Consumes the timewait reference.
 */
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
	struct tcp_key key = {};
#ifdef CONFIG_TCP_AO
	struct tcp_ao_info *ao_info;

	if (static_branch_unlikely(&tcp_ao_needed.key)) {
		/* FIXME: the segment to-be-acked is not verified yet */
		ao_info = rcu_dereference(tcptw->ao_info);
		if (ao_info) {
			const struct tcp_ao_hdr *aoh;

			/* Malformed auth options: drop without ACKing. */
			if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) {
				inet_twsk_put(tw);
				return;
			}

			if (aoh)
				key.ao_key = tcp_ao_established_key(ao_info, aoh->rnext_keyid, -1);
		}
	}
	if (key.ao_key) {
		struct tcp_ao_key *rnext_key;

		key.traffic_key = snd_other_key(key.ao_key);
		key.sne = READ_ONCE(ao_info->snd_sne);
		rnext_key = READ_ONCE(ao_info->rnext_key);
		key.rcv_next = rnext_key->rcvid;
		key.type = TCP_KEY_AO;
#else
	/* Preprocessor trick: without CONFIG_TCP_AO this dead branch keeps
	 * the "} else if" below syntactically valid.
	 */
	if (0) {
#endif
	} else if (static_branch_tcp_md5()) {
		key.md5_key = tcp_twsk_md5_key(tcptw);
		if (key.md5_key)
			key.type = TCP_KEY_MD5;
	}

	tcp_v4_send_ack(sk, skb,
			tcptw->tw_snd_nxt, READ_ONCE(tcptw->tw_rcv_nxt),
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_tw_tsval(tcptw),
			READ_ONCE(tcptw->tw_ts_recent),
			tw->tw_bound_dev_if, &key,
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos,
			tw->tw_txhash);

	inet_twsk_put(tw);
}
10881da177e4SLinus Torvalds
/* ACK a segment on behalf of a request socket (SYN-RECV / Fast Open),
 * signing with AO or MD5 when the request used one.
 */
static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	struct tcp_key key = {};

	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
					     tcp_sk(sk)->snd_nxt;

#ifdef CONFIG_TCP_AO
	if (static_branch_unlikely(&tcp_ao_needed.key) &&
	    tcp_rsk_used_ao(req)) {
		const union tcp_md5_addr *addr;
		const struct tcp_ao_hdr *aoh;
		int l3index;

		/* Invalid TCP option size or twice included auth */
		if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
			return;
		if (!aoh)
			return;

		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		/* Prefer the key the peer asked us to switch to (rnext). */
		key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET,
					      aoh->rnext_keyid, -1);
		if (unlikely(!key.ao_key)) {
			/* Send ACK with any matching MKT for the peer */
			key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1);
			/* Matching key disappeared (user removed the key?)
			 * let the handshake timeout.
			 */
			if (!key.ao_key) {
				net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n",
						     addr,
						     ntohs(tcp_hdr(skb)->source),
						     &ip_hdr(skb)->daddr,
						     ntohs(tcp_hdr(skb)->dest));
				return;
			}
		}
		/* Atomic context: the traffic key buffer is freed after the
		 * ACK is built, see the kfree() at the bottom.
		 */
		key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC);
		if (!key.traffic_key)
			return;

		key.type = TCP_KEY_AO;
		key.rcv_next = aoh->keyid;
		tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req);
#else
	/* Preprocessor trick: without CONFIG_TCP_AO this dead branch keeps
	 * the "} else if" below syntactically valid.
	 */
	if (0) {
#endif
	} else if (static_branch_tcp_md5()) {
		const union tcp_md5_addr *addr;
		int l3index;

		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
		if (key.md5_key)
			key.type = TCP_KEY_MD5;
	}

	tcp_v4_send_ack(sk, skb, seq,
			tcp_rsk(req)->rcv_nxt,
			tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale,
			tcp_rsk_tsval(tcp_rsk(req)),
			READ_ONCE(req->ts_recent),
			0, &key,
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos,
			READ_ONCE(tcp_rsk(req)->txhash));
	if (tcp_key_is_ao(&key))
		kfree(key.traffic_key);
}
11651da177e4SLinus Torvalds
11661da177e4SLinus Torvalds /*
11679bf1d83eSKris Katterjohn * Send a SYN-ACK after having received a SYN.
116860236fddSArnaldo Carvalho de Melo * This still operates on a request_sock only, not on a big
11691da177e4SLinus Torvalds * socket.
11701da177e4SLinus Torvalds */
11710f935dbeSEric Dumazet static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
1172d6274bd8SOctavian Purdila struct flowi *fl,
1173e6b4d113SWilliam Allen Simpson struct request_sock *req,
1174ca6fb065SEric Dumazet struct tcp_fastopen_cookie *foc,
1175331fca43SMartin KaFai Lau enum tcp_synack_type synack_type,
1176331fca43SMartin KaFai Lau struct sk_buff *syn_skb)
11771da177e4SLinus Torvalds {
11782e6599cbSArnaldo Carvalho de Melo const struct inet_request_sock *ireq = inet_rsk(req);
11796bd023f3SDavid S. Miller struct flowi4 fl4;
11801da177e4SLinus Torvalds int err = -1;
11811da177e4SLinus Torvalds struct sk_buff *skb;
1182ac8f1710SWei Wang u8 tos;
11831da177e4SLinus Torvalds
11841da177e4SLinus Torvalds /* First, grab a route. */
1185ba3f7f04SDavid S. Miller if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1186fd80eb94SDenis V. Lunev return -1;
11871da177e4SLinus Torvalds
1188331fca43SMartin KaFai Lau skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
11891da177e4SLinus Torvalds
11901da177e4SLinus Torvalds if (skb) {
1191634fb979SEric Dumazet __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
11921da177e4SLinus Torvalds
1193e08d0b3dSEric Dumazet tos = READ_ONCE(inet_sk(sk)->tos);
1194e08d0b3dSEric Dumazet
1195e08d0b3dSEric Dumazet if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1196e08d0b3dSEric Dumazet tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1197e08d0b3dSEric Dumazet (tos & INET_ECN_MASK);
11981da177e4SLinus Torvalds
1199407c85c7SAlexander Duyck if (!INET_ECN_is_capable(tos) &&
1200407c85c7SAlexander Duyck tcp_bpf_ca_needs_ecn((struct sock *)req))
1201407c85c7SAlexander Duyck tos |= INET_ECN_ECT_0;
12021da177e4SLinus Torvalds
12032ab2ddd3SEric Dumazet rcu_read_lock();
1204634fb979SEric Dumazet err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1205634fb979SEric Dumazet ireq->ir_rmt_addr,
1206de033b7dSWei Wang rcu_dereference(ireq->ireq_opt),
1207861602b5SAlexander Duyck tos);
12082ab2ddd3SEric Dumazet rcu_read_unlock();
1209b9df3cb8SGerrit Renker err = net_xmit_eval(err);
12101da177e4SLinus Torvalds }
12111da177e4SLinus Torvalds
12121da177e4SLinus Torvalds return err;
12131da177e4SLinus Torvalds }
12141da177e4SLinus Torvalds
12151da177e4SLinus Torvalds /*
121660236fddSArnaldo Carvalho de Melo * IPv4 request_sock destructor.
12171da177e4SLinus Torvalds */
121860236fddSArnaldo Carvalho de Melo static void tcp_v4_reqsk_destructor(struct request_sock *req)
12191da177e4SLinus Torvalds {
1220c92e8c02SEric Dumazet kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
12211da177e4SLinus Torvalds }
12221da177e4SLinus Torvalds
1223cfb6eeb4SYOSHIFUJI Hideaki #ifdef CONFIG_TCP_MD5SIG
1224cfb6eeb4SYOSHIFUJI Hideaki /*
1225cfb6eeb4SYOSHIFUJI Hideaki * RFC2385 MD5 checksumming requires a mapping of
1226cfb6eeb4SYOSHIFUJI Hideaki * IP address->MD5 Key.
1227cfb6eeb4SYOSHIFUJI Hideaki * We need to maintain these in the sk structure.
1228cfb6eeb4SYOSHIFUJI Hideaki */
1229cfb6eeb4SYOSHIFUJI Hideaki
1230459837b5SDmitry Safonov DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
12316015c71eSEric Dumazet EXPORT_SYMBOL(tcp_md5_needed);
12326015c71eSEric Dumazet
123386f1e3a8SLeonard Crestez static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
123486f1e3a8SLeonard Crestez {
123586f1e3a8SLeonard Crestez if (!old)
123686f1e3a8SLeonard Crestez return true;
123786f1e3a8SLeonard Crestez
123886f1e3a8SLeonard Crestez /* l3index always overrides non-l3index */
123986f1e3a8SLeonard Crestez if (old->l3index && new->l3index == 0)
124086f1e3a8SLeonard Crestez return false;
124186f1e3a8SLeonard Crestez if (old->l3index == 0 && new->l3index)
124286f1e3a8SLeonard Crestez return true;
124386f1e3a8SLeonard Crestez
124486f1e3a8SLeonard Crestez return old->prefixlen < new->prefixlen;
124586f1e3a8SLeonard Crestez }
124686f1e3a8SLeonard Crestez
1247cfb6eeb4SYOSHIFUJI Hideaki /* Find the Key structure for an address. */
1248dea53bb8SDavid Ahern struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1249a915da9bSEric Dumazet const union tcp_md5_addr *addr,
12500aadc739SDmitry Safonov int family, bool any_l3index)
1251cfb6eeb4SYOSHIFUJI Hideaki {
1252fd3a154aSEric Dumazet const struct tcp_sock *tp = tcp_sk(sk);
1253a915da9bSEric Dumazet struct tcp_md5sig_key *key;
1254fd3a154aSEric Dumazet const struct tcp_md5sig_info *md5sig;
12556797318eSIvan Delalande __be32 mask;
12566797318eSIvan Delalande struct tcp_md5sig_key *best_match = NULL;
12576797318eSIvan Delalande bool match;
1258cfb6eeb4SYOSHIFUJI Hideaki
1259a8afca03SEric Dumazet /* caller either holds rcu_read_lock() or socket lock */
1260a8afca03SEric Dumazet md5sig = rcu_dereference_check(tp->md5sig_info,
12611e1d04e6SHannes Frederic Sowa lockdep_sock_is_held(sk));
1262a8afca03SEric Dumazet if (!md5sig)
1263cfb6eeb4SYOSHIFUJI Hideaki return NULL;
1264083a0326SArnd Bergmann
1265c8b91770SAmol Grover hlist_for_each_entry_rcu(key, &md5sig->head, node,
1266c8b91770SAmol Grover lockdep_sock_is_held(sk)) {
1267a915da9bSEric Dumazet if (key->family != family)
1268a915da9bSEric Dumazet continue;
12690aadc739SDmitry Safonov if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX &&
12700aadc739SDmitry Safonov key->l3index != l3index)
1271dea53bb8SDavid Ahern continue;
12726797318eSIvan Delalande if (family == AF_INET) {
12736797318eSIvan Delalande mask = inet_make_mask(key->prefixlen);
12746797318eSIvan Delalande match = (key->addr.a4.s_addr & mask) ==
12756797318eSIvan Delalande (addr->a4.s_addr & mask);
12766797318eSIvan Delalande #if IS_ENABLED(CONFIG_IPV6)
12776797318eSIvan Delalande } else if (family == AF_INET6) {
12786797318eSIvan Delalande match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
12796797318eSIvan Delalande key->prefixlen);
12806797318eSIvan Delalande #endif
12816797318eSIvan Delalande } else {
12826797318eSIvan Delalande match = false;
12836797318eSIvan Delalande }
12846797318eSIvan Delalande
128586f1e3a8SLeonard Crestez if (match && better_md5_match(best_match, key))
12866797318eSIvan Delalande best_match = key;
12876797318eSIvan Delalande }
12886797318eSIvan Delalande return best_match;
12896797318eSIvan Delalande }
12906015c71eSEric Dumazet EXPORT_SYMBOL(__tcp_md5_do_lookup);
12916797318eSIvan Delalande
1292e8f37d57SWu Fengguang static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
12936797318eSIvan Delalande const union tcp_md5_addr *addr,
1294dea53bb8SDavid Ahern int family, u8 prefixlen,
1295a76c2315SLeonard Crestez int l3index, u8 flags)
12966797318eSIvan Delalande {
12976797318eSIvan Delalande const struct tcp_sock *tp = tcp_sk(sk);
12986797318eSIvan Delalande struct tcp_md5sig_key *key;
12996797318eSIvan Delalande unsigned int size = sizeof(struct in_addr);
13006797318eSIvan Delalande const struct tcp_md5sig_info *md5sig;
13016797318eSIvan Delalande
13026797318eSIvan Delalande /* caller either holds rcu_read_lock() or socket lock */
13036797318eSIvan Delalande md5sig = rcu_dereference_check(tp->md5sig_info,
13046797318eSIvan Delalande lockdep_sock_is_held(sk));
13056797318eSIvan Delalande if (!md5sig)
13066797318eSIvan Delalande return NULL;
13076797318eSIvan Delalande #if IS_ENABLED(CONFIG_IPV6)
13086797318eSIvan Delalande if (family == AF_INET6)
13096797318eSIvan Delalande size = sizeof(struct in6_addr);
13106797318eSIvan Delalande #endif
1311c8b91770SAmol Grover hlist_for_each_entry_rcu(key, &md5sig->head, node,
1312c8b91770SAmol Grover lockdep_sock_is_held(sk)) {
13136797318eSIvan Delalande if (key->family != family)
13146797318eSIvan Delalande continue;
1315a76c2315SLeonard Crestez if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1316a76c2315SLeonard Crestez continue;
131786f1e3a8SLeonard Crestez if (key->l3index != l3index)
1318dea53bb8SDavid Ahern continue;
13196797318eSIvan Delalande if (!memcmp(&key->addr, addr, size) &&
13206797318eSIvan Delalande key->prefixlen == prefixlen)
1321a915da9bSEric Dumazet return key;
1322cfb6eeb4SYOSHIFUJI Hideaki }
1323cfb6eeb4SYOSHIFUJI Hideaki return NULL;
1324cfb6eeb4SYOSHIFUJI Hideaki }
1325cfb6eeb4SYOSHIFUJI Hideaki
1326b83e3debSEric Dumazet struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1327fd3a154aSEric Dumazet const struct sock *addr_sk)
1328cfb6eeb4SYOSHIFUJI Hideaki {
1329b52e6921SEric Dumazet const union tcp_md5_addr *addr;
1330dea53bb8SDavid Ahern int l3index;
1331a915da9bSEric Dumazet
1332dea53bb8SDavid Ahern l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1333dea53bb8SDavid Ahern addr_sk->sk_bound_dev_if);
1334b52e6921SEric Dumazet addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1335dea53bb8SDavid Ahern return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1336cfb6eeb4SYOSHIFUJI Hideaki }
1337cfb6eeb4SYOSHIFUJI Hideaki EXPORT_SYMBOL(tcp_v4_md5_lookup);
1338cfb6eeb4SYOSHIFUJI Hideaki
1339f62c7517SDmitry Safonov static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
1340f62c7517SDmitry Safonov {
1341f62c7517SDmitry Safonov struct tcp_sock *tp = tcp_sk(sk);
1342f62c7517SDmitry Safonov struct tcp_md5sig_info *md5sig;
1343f62c7517SDmitry Safonov
1344f62c7517SDmitry Safonov md5sig = kmalloc(sizeof(*md5sig), gfp);
1345f62c7517SDmitry Safonov if (!md5sig)
1346f62c7517SDmitry Safonov return -ENOMEM;
1347f62c7517SDmitry Safonov
1348f62c7517SDmitry Safonov sk_gso_disable(sk);
1349f62c7517SDmitry Safonov INIT_HLIST_HEAD(&md5sig->head);
1350f62c7517SDmitry Safonov rcu_assign_pointer(tp->md5sig_info, md5sig);
1351f62c7517SDmitry Safonov return 0;
1352f62c7517SDmitry Safonov }
1353f62c7517SDmitry Safonov
1354cfb6eeb4SYOSHIFUJI Hideaki /* This can be called on a newly created socket, from other files */
1355459837b5SDmitry Safonov static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1356a76c2315SLeonard Crestez int family, u8 prefixlen, int l3index, u8 flags,
1357dea53bb8SDavid Ahern const u8 *newkey, u8 newkeylen, gfp_t gfp)
1358cfb6eeb4SYOSHIFUJI Hideaki {
1359cfb6eeb4SYOSHIFUJI Hideaki /* Add Key to the list */
1360b0a713e9SMatthias M. Dellweg struct tcp_md5sig_key *key;
1361cfb6eeb4SYOSHIFUJI Hideaki struct tcp_sock *tp = tcp_sk(sk);
1362f6685938SArnaldo Carvalho de Melo struct tcp_md5sig_info *md5sig;
1363f6685938SArnaldo Carvalho de Melo
1364a76c2315SLeonard Crestez key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1365a915da9bSEric Dumazet if (key) {
1366e6ced831SEric Dumazet /* Pre-existing entry - just update that one.
1367e6ced831SEric Dumazet * Note that the key might be used concurrently.
1368e6ced831SEric Dumazet * data_race() is telling kcsan that we do not care of
1369e6ced831SEric Dumazet * key mismatches, since changing MD5 key on live flows
1370e6ced831SEric Dumazet * can lead to packet drops.
1371e6ced831SEric Dumazet */
1372e6ced831SEric Dumazet data_race(memcpy(key->key, newkey, newkeylen));
13736a2febecSEric Dumazet
1374e6ced831SEric Dumazet /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1375e6ced831SEric Dumazet * Also note that a reader could catch new key->keylen value
1376e6ced831SEric Dumazet * but old key->key[], this is the reason we use __GFP_ZERO
1377e6ced831SEric Dumazet * at sock_kmalloc() time below these lines.
1378e6ced831SEric Dumazet */
1379e6ced831SEric Dumazet WRITE_ONCE(key->keylen, newkeylen);
13806a2febecSEric Dumazet
1381a915da9bSEric Dumazet return 0;
1382cfb6eeb4SYOSHIFUJI Hideaki }
1383260fcbebSYan, Zheng
1384f62c7517SDmitry Safonov md5sig = rcu_dereference_protected(tp->md5sig_info,
1385f62c7517SDmitry Safonov lockdep_sock_is_held(sk));
1386a915da9bSEric Dumazet
1387e6ced831SEric Dumazet key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1388a915da9bSEric Dumazet if (!key)
1389a915da9bSEric Dumazet return -ENOMEM;
1390f6685938SArnaldo Carvalho de Melo
1391a915da9bSEric Dumazet memcpy(key->key, newkey, newkeylen);
1392a915da9bSEric Dumazet key->keylen = newkeylen;
1393a915da9bSEric Dumazet key->family = family;
13946797318eSIvan Delalande key->prefixlen = prefixlen;
1395dea53bb8SDavid Ahern key->l3index = l3index;
1396a76c2315SLeonard Crestez key->flags = flags;
1397a915da9bSEric Dumazet memcpy(&key->addr, addr,
13983a2cd89bShuhai (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1399a915da9bSEric Dumazet sizeof(struct in_addr));
1400a915da9bSEric Dumazet hlist_add_head_rcu(&key->node, &md5sig->head);
1401cfb6eeb4SYOSHIFUJI Hideaki return 0;
1402cfb6eeb4SYOSHIFUJI Hideaki }
1403459837b5SDmitry Safonov
1404459837b5SDmitry Safonov int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1405459837b5SDmitry Safonov int family, u8 prefixlen, int l3index, u8 flags,
1406459837b5SDmitry Safonov const u8 *newkey, u8 newkeylen)
1407459837b5SDmitry Safonov {
1408459837b5SDmitry Safonov struct tcp_sock *tp = tcp_sk(sk);
1409459837b5SDmitry Safonov
1410459837b5SDmitry Safonov if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
14118c73b263SDmitry Safonov if (tcp_md5_alloc_sigpool())
1412459837b5SDmitry Safonov return -ENOMEM;
1413459837b5SDmitry Safonov
14148c73b263SDmitry Safonov if (tcp_md5sig_info_add(sk, GFP_KERNEL)) {
14158c73b263SDmitry Safonov tcp_md5_release_sigpool();
14168c73b263SDmitry Safonov return -ENOMEM;
14178c73b263SDmitry Safonov }
14188c73b263SDmitry Safonov
1419459837b5SDmitry Safonov if (!static_branch_inc(&tcp_md5_needed.key)) {
1420459837b5SDmitry Safonov struct tcp_md5sig_info *md5sig;
1421459837b5SDmitry Safonov
1422459837b5SDmitry Safonov md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1423459837b5SDmitry Safonov rcu_assign_pointer(tp->md5sig_info, NULL);
142455fb80d5SEric Dumazet kfree_rcu(md5sig, rcu);
14258c73b263SDmitry Safonov tcp_md5_release_sigpool();
1426459837b5SDmitry Safonov return -EUSERS;
1427459837b5SDmitry Safonov }
1428459837b5SDmitry Safonov }
1429459837b5SDmitry Safonov
1430459837b5SDmitry Safonov return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
1431459837b5SDmitry Safonov newkey, newkeylen, GFP_KERNEL);
1432459837b5SDmitry Safonov }
1433a915da9bSEric Dumazet EXPORT_SYMBOL(tcp_md5_do_add);
1434cfb6eeb4SYOSHIFUJI Hideaki
1435459837b5SDmitry Safonov int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
1436459837b5SDmitry Safonov int family, u8 prefixlen, int l3index,
1437459837b5SDmitry Safonov struct tcp_md5sig_key *key)
1438459837b5SDmitry Safonov {
1439459837b5SDmitry Safonov struct tcp_sock *tp = tcp_sk(sk);
1440459837b5SDmitry Safonov
1441459837b5SDmitry Safonov if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
14428c73b263SDmitry Safonov tcp_md5_add_sigpool();
14438c73b263SDmitry Safonov
14448c73b263SDmitry Safonov if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC))) {
14458c73b263SDmitry Safonov tcp_md5_release_sigpool();
1446459837b5SDmitry Safonov return -ENOMEM;
14478c73b263SDmitry Safonov }
1448459837b5SDmitry Safonov
1449459837b5SDmitry Safonov if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
1450459837b5SDmitry Safonov struct tcp_md5sig_info *md5sig;
1451459837b5SDmitry Safonov
1452459837b5SDmitry Safonov md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1453459837b5SDmitry Safonov net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
1454459837b5SDmitry Safonov rcu_assign_pointer(tp->md5sig_info, NULL);
145555fb80d5SEric Dumazet kfree_rcu(md5sig, rcu);
14568c73b263SDmitry Safonov tcp_md5_release_sigpool();
1457459837b5SDmitry Safonov return -EUSERS;
1458459837b5SDmitry Safonov }
1459459837b5SDmitry Safonov }
1460459837b5SDmitry Safonov
1461459837b5SDmitry Safonov return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
1462459837b5SDmitry Safonov key->flags, key->key, key->keylen,
1463459837b5SDmitry Safonov sk_gfp_mask(sk, GFP_ATOMIC));
1464459837b5SDmitry Safonov }
1465459837b5SDmitry Safonov EXPORT_SYMBOL(tcp_md5_key_copy);
1466459837b5SDmitry Safonov
14676797318eSIvan Delalande int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1468a76c2315SLeonard Crestez u8 prefixlen, int l3index, u8 flags)
1469cfb6eeb4SYOSHIFUJI Hideaki {
1470a915da9bSEric Dumazet struct tcp_md5sig_key *key;
1471cfb6eeb4SYOSHIFUJI Hideaki
1472a76c2315SLeonard Crestez key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1473a915da9bSEric Dumazet if (!key)
1474cfb6eeb4SYOSHIFUJI Hideaki return -ENOENT;
1475a915da9bSEric Dumazet hlist_del_rcu(&key->node);
14765f3d9cb2SEric Dumazet atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1477a915da9bSEric Dumazet kfree_rcu(key, rcu);
1478a915da9bSEric Dumazet return 0;
1479cfb6eeb4SYOSHIFUJI Hideaki }
1480a915da9bSEric Dumazet EXPORT_SYMBOL(tcp_md5_do_del);
1481cfb6eeb4SYOSHIFUJI Hideaki
14820aadc739SDmitry Safonov void tcp_clear_md5_list(struct sock *sk)
1483cfb6eeb4SYOSHIFUJI Hideaki {
1484cfb6eeb4SYOSHIFUJI Hideaki struct tcp_sock *tp = tcp_sk(sk);
1485a915da9bSEric Dumazet struct tcp_md5sig_key *key;
1486b67bfe0dSSasha Levin struct hlist_node *n;
1487a8afca03SEric Dumazet struct tcp_md5sig_info *md5sig;
1488cfb6eeb4SYOSHIFUJI Hideaki
1489a8afca03SEric Dumazet md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1490a8afca03SEric Dumazet
1491b67bfe0dSSasha Levin hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1492a915da9bSEric Dumazet hlist_del_rcu(&key->node);
14935f3d9cb2SEric Dumazet atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1494a915da9bSEric Dumazet kfree_rcu(key, rcu);
1495cfb6eeb4SYOSHIFUJI Hideaki }
1496cfb6eeb4SYOSHIFUJI Hideaki }
1497cfb6eeb4SYOSHIFUJI Hideaki
14988917a777SIvan Delalande static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1499d4c19c49SChristoph Hellwig sockptr_t optval, int optlen)
1500cfb6eeb4SYOSHIFUJI Hideaki {
1501cfb6eeb4SYOSHIFUJI Hideaki struct tcp_md5sig cmd;
1502cfb6eeb4SYOSHIFUJI Hideaki struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1503cea97609SDavid Ahern const union tcp_md5_addr *addr;
15048917a777SIvan Delalande u8 prefixlen = 32;
1505dea53bb8SDavid Ahern int l3index = 0;
1506248411b8SDmitry Safonov bool l3flag;
1507a76c2315SLeonard Crestez u8 flags;
1508cfb6eeb4SYOSHIFUJI Hideaki
1509cfb6eeb4SYOSHIFUJI Hideaki if (optlen < sizeof(cmd))
1510cfb6eeb4SYOSHIFUJI Hideaki return -EINVAL;
1511cfb6eeb4SYOSHIFUJI Hideaki
1512d4c19c49SChristoph Hellwig if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1513cfb6eeb4SYOSHIFUJI Hideaki return -EFAULT;
1514cfb6eeb4SYOSHIFUJI Hideaki
1515cfb6eeb4SYOSHIFUJI Hideaki if (sin->sin_family != AF_INET)
1516cfb6eeb4SYOSHIFUJI Hideaki return -EINVAL;
1517cfb6eeb4SYOSHIFUJI Hideaki
1518a76c2315SLeonard Crestez flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1519248411b8SDmitry Safonov l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1520a76c2315SLeonard Crestez
15218917a777SIvan Delalande if (optname == TCP_MD5SIG_EXT &&
15228917a777SIvan Delalande cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
15238917a777SIvan Delalande prefixlen = cmd.tcpm_prefixlen;
15248917a777SIvan Delalande if (prefixlen > 32)
15258917a777SIvan Delalande return -EINVAL;
15268917a777SIvan Delalande }
15278917a777SIvan Delalande
1528a76c2315SLeonard Crestez if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
15296b102db5SDavid Ahern cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
15306b102db5SDavid Ahern struct net_device *dev;
15316b102db5SDavid Ahern
15326b102db5SDavid Ahern rcu_read_lock();
15336b102db5SDavid Ahern dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
15346b102db5SDavid Ahern if (dev && netif_is_l3_master(dev))
15356b102db5SDavid Ahern l3index = dev->ifindex;
15366b102db5SDavid Ahern
15376b102db5SDavid Ahern rcu_read_unlock();
15386b102db5SDavid Ahern
15396b102db5SDavid Ahern /* ok to reference set/not set outside of rcu;
15406b102db5SDavid Ahern * right now device MUST be an L3 master
15416b102db5SDavid Ahern */
15426b102db5SDavid Ahern if (!dev || !l3index)
15436b102db5SDavid Ahern return -EINVAL;
15446b102db5SDavid Ahern }
15456b102db5SDavid Ahern
1546cea97609SDavid Ahern addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1547cea97609SDavid Ahern
154864a124edSDmitry Popov if (!cmd.tcpm_keylen)
1549a76c2315SLeonard Crestez return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1550cfb6eeb4SYOSHIFUJI Hideaki
1551cfb6eeb4SYOSHIFUJI Hideaki if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1552cfb6eeb4SYOSHIFUJI Hideaki return -EINVAL;
1553cfb6eeb4SYOSHIFUJI Hideaki
15540aadc739SDmitry Safonov /* Don't allow keys for peers that have a matching TCP-AO key.
15550aadc739SDmitry Safonov * See the comment in tcp_ao_add_cmd()
15560aadc739SDmitry Safonov */
1557248411b8SDmitry Safonov if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false))
15580aadc739SDmitry Safonov return -EKEYREJECTED;
15590aadc739SDmitry Safonov
1560a76c2315SLeonard Crestez return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1561459837b5SDmitry Safonov cmd.tcpm_key, cmd.tcpm_keylen);
1562cfb6eeb4SYOSHIFUJI Hideaki }
1563cfb6eeb4SYOSHIFUJI Hideaki
15648c73b263SDmitry Safonov static int tcp_v4_md5_hash_headers(struct tcp_sigpool *hp,
156519689e38SEric Dumazet __be32 daddr, __be32 saddr,
156619689e38SEric Dumazet const struct tcphdr *th, int nbytes)
1567cfb6eeb4SYOSHIFUJI Hideaki {
1568cfb6eeb4SYOSHIFUJI Hideaki struct tcp4_pseudohdr *bp;
156949a72dfbSAdam Langley struct scatterlist sg;
157019689e38SEric Dumazet struct tcphdr *_th;
1571cfb6eeb4SYOSHIFUJI Hideaki
157219689e38SEric Dumazet bp = hp->scratch;
1573cfb6eeb4SYOSHIFUJI Hideaki bp->saddr = saddr;
1574cfb6eeb4SYOSHIFUJI Hideaki bp->daddr = daddr;
1575cfb6eeb4SYOSHIFUJI Hideaki bp->pad = 0;
1576076fb722SYOSHIFUJI Hideaki bp->protocol = IPPROTO_TCP;
157749a72dfbSAdam Langley bp->len = cpu_to_be16(nbytes);
1578c7da57a1SDavid S. Miller
157919689e38SEric Dumazet _th = (struct tcphdr *)(bp + 1);
158019689e38SEric Dumazet memcpy(_th, th, sizeof(*th));
158119689e38SEric Dumazet _th->check = 0;
158219689e38SEric Dumazet
158319689e38SEric Dumazet sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
15848c73b263SDmitry Safonov ahash_request_set_crypt(hp->req, &sg, NULL,
158519689e38SEric Dumazet sizeof(*bp) + sizeof(*th));
15868c73b263SDmitry Safonov return crypto_ahash_update(hp->req);
158749a72dfbSAdam Langley }
158849a72dfbSAdam Langley
1589a915da9bSEric Dumazet static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1590318cf7aaSEric Dumazet __be32 daddr, __be32 saddr, const struct tcphdr *th)
159149a72dfbSAdam Langley {
15928c73b263SDmitry Safonov struct tcp_sigpool hp;
159349a72dfbSAdam Langley
15948c73b263SDmitry Safonov if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
15958c73b263SDmitry Safonov goto clear_hash_nostart;
159649a72dfbSAdam Langley
15978c73b263SDmitry Safonov if (crypto_ahash_init(hp.req))
159849a72dfbSAdam Langley goto clear_hash;
15998c73b263SDmitry Safonov if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, th->doff << 2))
160049a72dfbSAdam Langley goto clear_hash;
16018c73b263SDmitry Safonov if (tcp_md5_hash_key(&hp, key))
160249a72dfbSAdam Langley goto clear_hash;
16038c73b263SDmitry Safonov ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
16048c73b263SDmitry Safonov if (crypto_ahash_final(hp.req))
1605cfb6eeb4SYOSHIFUJI Hideaki goto clear_hash;
1606cfb6eeb4SYOSHIFUJI Hideaki
16078c73b263SDmitry Safonov tcp_sigpool_end(&hp);
1608cfb6eeb4SYOSHIFUJI Hideaki return 0;
160949a72dfbSAdam Langley
1610cfb6eeb4SYOSHIFUJI Hideaki clear_hash:
16118c73b263SDmitry Safonov tcp_sigpool_end(&hp);
16128c73b263SDmitry Safonov clear_hash_nostart:
1613cfb6eeb4SYOSHIFUJI Hideaki memset(md5_hash, 0, 16);
161449a72dfbSAdam Langley return 1;
1615cfb6eeb4SYOSHIFUJI Hideaki }
1616cfb6eeb4SYOSHIFUJI Hideaki
161739f8e58eSEric Dumazet int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
161839f8e58eSEric Dumazet const struct sock *sk,
1619318cf7aaSEric Dumazet const struct sk_buff *skb)
1620cfb6eeb4SYOSHIFUJI Hideaki {
1621318cf7aaSEric Dumazet const struct tcphdr *th = tcp_hdr(skb);
16228c73b263SDmitry Safonov struct tcp_sigpool hp;
1623cfb6eeb4SYOSHIFUJI Hideaki __be32 saddr, daddr;
1624cfb6eeb4SYOSHIFUJI Hideaki
162539f8e58eSEric Dumazet if (sk) { /* valid for establish/request sockets */
162639f8e58eSEric Dumazet saddr = sk->sk_rcv_saddr;
162739f8e58eSEric Dumazet daddr = sk->sk_daddr;
1628cfb6eeb4SYOSHIFUJI Hideaki } else {
162949a72dfbSAdam Langley const struct iphdr *iph = ip_hdr(skb);
163049a72dfbSAdam Langley saddr = iph->saddr;
163149a72dfbSAdam Langley daddr = iph->daddr;
1632cfb6eeb4SYOSHIFUJI Hideaki }
1633cfb6eeb4SYOSHIFUJI Hideaki
16348c73b263SDmitry Safonov if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
16358c73b263SDmitry Safonov goto clear_hash_nostart;
163649a72dfbSAdam Langley
16378c73b263SDmitry Safonov if (crypto_ahash_init(hp.req))
163849a72dfbSAdam Langley goto clear_hash;
163949a72dfbSAdam Langley
16408c73b263SDmitry Safonov if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, skb->len))
164149a72dfbSAdam Langley goto clear_hash;
16428c73b263SDmitry Safonov if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2))
164349a72dfbSAdam Langley goto clear_hash;
16448c73b263SDmitry Safonov if (tcp_md5_hash_key(&hp, key))
164549a72dfbSAdam Langley goto clear_hash;
16468c73b263SDmitry Safonov ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
16478c73b263SDmitry Safonov if (crypto_ahash_final(hp.req))
164849a72dfbSAdam Langley goto clear_hash;
164949a72dfbSAdam Langley
16508c73b263SDmitry Safonov tcp_sigpool_end(&hp);
165149a72dfbSAdam Langley return 0;
165249a72dfbSAdam Langley
165349a72dfbSAdam Langley clear_hash:
16548c73b263SDmitry Safonov tcp_sigpool_end(&hp);
16558c73b263SDmitry Safonov clear_hash_nostart:
165649a72dfbSAdam Langley memset(md5_hash, 0, 16);
165749a72dfbSAdam Langley return 1;
165849a72dfbSAdam Langley }
165949a72dfbSAdam Langley EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1660cfb6eeb4SYOSHIFUJI Hideaki
1661ba8e275aSEric Dumazet #endif
1662ba8e275aSEric Dumazet
1663b40cf18eSEric Dumazet static void tcp_v4_init_req(struct request_sock *req,
1664b40cf18eSEric Dumazet const struct sock *sk_listener,
166516bea70aSOctavian Purdila struct sk_buff *skb)
166616bea70aSOctavian Purdila {
166716bea70aSOctavian Purdila struct inet_request_sock *ireq = inet_rsk(req);
1668c92e8c02SEric Dumazet struct net *net = sock_net(sk_listener);
166916bea70aSOctavian Purdila
167008d2cc3bSEric Dumazet sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
167108d2cc3bSEric Dumazet sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1672c92e8c02SEric Dumazet RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
167316bea70aSOctavian Purdila }
167416bea70aSOctavian Purdila
1675f964629eSEric Dumazet static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
16767ea851d1SFlorian Westphal struct sk_buff *skb,
1677f964629eSEric Dumazet struct flowi *fl,
1678b9e81040SEric Dumazet struct request_sock *req,
1679b9e81040SEric Dumazet u32 tw_isn)
1680d94e0417SOctavian Purdila {
16817ea851d1SFlorian Westphal tcp_v4_init_req(req, sk, skb);
16827ea851d1SFlorian Westphal
16837ea851d1SFlorian Westphal if (security_inet_conn_request(sk, skb, req))
16847ea851d1SFlorian Westphal return NULL;
16857ea851d1SFlorian Westphal
16864396e461SSoheil Hassas Yeganeh return inet_csk_route_req(sk, &fl->u.ip4, req);
1687d94e0417SOctavian Purdila }
1688d94e0417SOctavian Purdila
168972a3effaSEric Dumazet struct request_sock_ops tcp_request_sock_ops __read_mostly = {
16901da177e4SLinus Torvalds .family = PF_INET,
16912e6599cbSArnaldo Carvalho de Melo .obj_size = sizeof(struct tcp_request_sock),
16925db92c99SOctavian Purdila .rtx_syn_ack = tcp_rtx_synack,
169360236fddSArnaldo Carvalho de Melo .send_ack = tcp_v4_reqsk_send_ack,
169460236fddSArnaldo Carvalho de Melo .destructor = tcp_v4_reqsk_destructor,
16951da177e4SLinus Torvalds .send_reset = tcp_v4_send_reset,
169672659eccSOctavian Purdila .syn_ack_timeout = tcp_syn_ack_timeout,
16971da177e4SLinus Torvalds };
16981da177e4SLinus Torvalds
169935b2c321SMat Martineau const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
17002aec4a29SOctavian Purdila .mss_clamp = TCP_MSS_DEFAULT,
170116bea70aSOctavian Purdila #ifdef CONFIG_TCP_MD5SIG
1702fd3a154aSEric Dumazet .req_md5_lookup = tcp_v4_md5_lookup,
1703e3afe7b7SJohn Dykstra .calc_md5_hash = tcp_v4_md5_hash_skb,
1704b6332e6cSAndrew Morton #endif
170506b22ef2SDmitry Safonov #ifdef CONFIG_TCP_AO
170606b22ef2SDmitry Safonov .ao_lookup = tcp_v4_ao_lookup_rsk,
170706b22ef2SDmitry Safonov .ao_calc_key = tcp_v4_ao_calc_key_rsk,
17089427c6aaSDmitry Safonov .ao_synack_hash = tcp_v4_ao_synack_hash,
170906b22ef2SDmitry Safonov #endif
1710fb7b37a7SOctavian Purdila #ifdef CONFIG_SYN_COOKIES
1711fb7b37a7SOctavian Purdila .cookie_init_seq = cookie_v4_init_sequence,
1712fb7b37a7SOctavian Purdila #endif
1713d94e0417SOctavian Purdila .route_req = tcp_v4_route_req,
171484b114b9SEric Dumazet .init_seq = tcp_v4_init_seq,
171584b114b9SEric Dumazet .init_ts_off = tcp_v4_init_ts_off,
1716d6274bd8SOctavian Purdila .send_synack = tcp_v4_send_synack,
171716bea70aSOctavian Purdila };
1718cfb6eeb4SYOSHIFUJI Hideaki
17191da177e4SLinus Torvalds int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
17201da177e4SLinus Torvalds {
17211da177e4SLinus Torvalds /* Never answer to SYNs send to broadcast or multicast */
1722511c3f92SEric Dumazet if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
17231da177e4SLinus Torvalds goto drop;
17241da177e4SLinus Torvalds
17251fb6f159SOctavian Purdila return tcp_conn_request(&tcp_request_sock_ops,
17261fb6f159SOctavian Purdila &tcp_request_sock_ipv4_ops, sk, skb);
17271da177e4SLinus Torvalds
17281da177e4SLinus Torvalds drop:
17299caad864SEric Dumazet tcp_listendrop(sk);
17301da177e4SLinus Torvalds return 0;
17311da177e4SLinus Torvalds }
17324bc2f18bSEric Dumazet EXPORT_SYMBOL(tcp_v4_conn_request);
17331da177e4SLinus Torvalds
17341da177e4SLinus Torvalds
17351da177e4SLinus Torvalds /*
17361da177e4SLinus Torvalds * The three way handshake has completed - we got a valid synack -
17371da177e4SLinus Torvalds * now create the new socket.
17381da177e4SLinus Torvalds */
17390c27171eSEric Dumazet struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
174060236fddSArnaldo Carvalho de Melo struct request_sock *req,
17415e0724d0SEric Dumazet struct dst_entry *dst,
17425e0724d0SEric Dumazet struct request_sock *req_unhash,
17435e0724d0SEric Dumazet bool *own_req)
17441da177e4SLinus Torvalds {
17452e6599cbSArnaldo Carvalho de Melo struct inet_request_sock *ireq;
174601770a16SRicardo Dias bool found_dup_sk = false;
17471da177e4SLinus Torvalds struct inet_sock *newinet;
17481da177e4SLinus Torvalds struct tcp_sock *newtp;
17491da177e4SLinus Torvalds struct sock *newsk;
1750cfb6eeb4SYOSHIFUJI Hideaki #ifdef CONFIG_TCP_MD5SIG
1751cea97609SDavid Ahern const union tcp_md5_addr *addr;
1752cfb6eeb4SYOSHIFUJI Hideaki struct tcp_md5sig_key *key;
1753dea53bb8SDavid Ahern int l3index;
1754cfb6eeb4SYOSHIFUJI Hideaki #endif
1755f6d8bd05SEric Dumazet struct ip_options_rcu *inet_opt;
17561da177e4SLinus Torvalds
17571da177e4SLinus Torvalds if (sk_acceptq_is_full(sk))
17581da177e4SLinus Torvalds goto exit_overflow;
17591da177e4SLinus Torvalds
17601da177e4SLinus Torvalds newsk = tcp_create_openreq_child(sk, req, skb);
17611da177e4SLinus Torvalds if (!newsk)
1762093d2823SBalazs Scheidler goto exit_nonewsk;
17631da177e4SLinus Torvalds
1764bcd76111SHerbert Xu newsk->sk_gso_type = SKB_GSO_TCPV4;
1765fae6ef87SNeal Cardwell inet_sk_rx_dst_set(newsk, skb);
17661da177e4SLinus Torvalds
17671da177e4SLinus Torvalds newtp = tcp_sk(newsk);
17681da177e4SLinus Torvalds newinet = inet_sk(newsk);
17692e6599cbSArnaldo Carvalho de Melo ireq = inet_rsk(req);
1770d1e559d0SEric Dumazet sk_daddr_set(newsk, ireq->ir_rmt_addr);
1771d1e559d0SEric Dumazet sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
17726dd9a14eSDavid Ahern newsk->sk_bound_dev_if = ireq->ir_iif;
1773634fb979SEric Dumazet newinet->inet_saddr = ireq->ir_loc_addr;
1774c92e8c02SEric Dumazet inet_opt = rcu_dereference(ireq->ireq_opt);
1775c92e8c02SEric Dumazet RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1776463c84b9SArnaldo Carvalho de Melo newinet->mc_index = inet_iif(skb);
1777eddc9ec5SArnaldo Carvalho de Melo newinet->mc_ttl = ip_hdr(skb)->ttl;
17784c507d28SJiri Benc newinet->rcv_tos = ip_hdr(skb)->tos;
1779d83d8461SArnaldo Carvalho de Melo inet_csk(newsk)->icsk_ext_hdr_len = 0;
1780f6d8bd05SEric Dumazet if (inet_opt)
1781f6d8bd05SEric Dumazet inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1782f866fbc8SEric Dumazet atomic_set(&newinet->inet_id, get_random_u16());
17831da177e4SLinus Torvalds
17848ef44b6fSWei Wang /* Set ToS of the new socket based upon the value of incoming SYN.
17858ef44b6fSWei Wang * ECT bits are set later in tcp_init_transfer().
17868ef44b6fSWei Wang */
1787870e3a63SKuniyuki Iwashima if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1788ac8f1710SWei Wang newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1789ac8f1710SWei Wang
1790dfd25fffSEric Dumazet if (!dst) {
1791dfd25fffSEric Dumazet dst = inet_csk_route_child_sock(sk, newsk, req);
1792dfd25fffSEric Dumazet if (!dst)
17930e734419SDavid S. Miller goto put_and_exit;
1794dfd25fffSEric Dumazet } else {
1795dfd25fffSEric Dumazet /* syncookie case : see end of cookie_v4_check() */
1796dfd25fffSEric Dumazet }
17970e734419SDavid S. Miller sk_setup_caps(newsk, dst);
17980e734419SDavid S. Miller
179981164413SDaniel Borkmann tcp_ca_openreq_child(newsk, dst);
180081164413SDaniel Borkmann
18011da177e4SLinus Torvalds tcp_sync_mss(newsk, dst_mtu(dst));
18023541f9e8SEric Dumazet newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1803f5fff5dcSTom Quetchenbach
18041da177e4SLinus Torvalds tcp_initialize_rcv_mss(newsk);
18051da177e4SLinus Torvalds
1806cfb6eeb4SYOSHIFUJI Hideaki #ifdef CONFIG_TCP_MD5SIG
1807dea53bb8SDavid Ahern l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1808cfb6eeb4SYOSHIFUJI Hideaki /* Copy over the MD5 key from the original socket */
1809cea97609SDavid Ahern addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1810dea53bb8SDavid Ahern key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
181106b22ef2SDmitry Safonov if (key && !tcp_rsk_used_ao(req)) {
1812b389d1afSDmitry Safonov if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
1813b389d1afSDmitry Safonov goto put_and_exit;
1814aba54656SEric Dumazet sk_gso_disable(newsk);
1815cfb6eeb4SYOSHIFUJI Hideaki }
1816cfb6eeb4SYOSHIFUJI Hideaki #endif
181706b22ef2SDmitry Safonov #ifdef CONFIG_TCP_AO
181806b22ef2SDmitry Safonov if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET))
181906b22ef2SDmitry Safonov goto put_and_exit; /* OOM, release back memory */
182006b22ef2SDmitry Safonov #endif
1821cfb6eeb4SYOSHIFUJI Hideaki
18220e734419SDavid S. Miller if (__inet_inherit_port(sk, newsk) < 0)
18230e734419SDavid S. Miller goto put_and_exit;
182401770a16SRicardo Dias *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
182501770a16SRicardo Dias &found_dup_sk);
1826c92e8c02SEric Dumazet if (likely(*own_req)) {
182749a496c9SEric Dumazet tcp_move_syn(newtp, req);
1828c92e8c02SEric Dumazet ireq->ireq_opt = NULL;
1829c92e8c02SEric Dumazet } else {
1830c89dffc7SKuniyuki Iwashima newinet->inet_opt = NULL;
1831c89dffc7SKuniyuki Iwashima
183201770a16SRicardo Dias if (!req_unhash && found_dup_sk) {
183301770a16SRicardo Dias /* This code path should only be executed in the
183401770a16SRicardo Dias * syncookie case only
183501770a16SRicardo Dias */
183601770a16SRicardo Dias bh_unlock_sock(newsk);
183701770a16SRicardo Dias sock_put(newsk);
183801770a16SRicardo Dias newsk = NULL;
1839c92e8c02SEric Dumazet }
184001770a16SRicardo Dias }
18411da177e4SLinus Torvalds return newsk;
18421da177e4SLinus Torvalds
18431da177e4SLinus Torvalds exit_overflow:
1844c10d9310SEric Dumazet NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1845093d2823SBalazs Scheidler exit_nonewsk:
1846093d2823SBalazs Scheidler dst_release(dst);
18471da177e4SLinus Torvalds exit:
18489caad864SEric Dumazet tcp_listendrop(sk);
18491da177e4SLinus Torvalds return NULL;
18500e734419SDavid S. Miller put_and_exit:
1851c92e8c02SEric Dumazet newinet->inet_opt = NULL;
1852e337e24dSChristoph Paasch inet_csk_prepare_forced_close(newsk);
1853e337e24dSChristoph Paasch tcp_done(newsk);
18540e734419SDavid S. Miller goto exit;
18551da177e4SLinus Torvalds }
18564bc2f18bSEric Dumazet EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
18571da177e4SLinus Torvalds
/* Possibly revive a half-open connection from a SYN cookie.
 *
 * Only non-SYN segments are candidates: a bare ACK hitting a listener
 * may carry a valid cookie.  cookie_v4_check() then hands back either a
 * freshly-created child socket, the unchanged listener, or NULL; the
 * caller (tcp_v4_do_rcv) treats NULL as "drop" and a different socket
 * as a new child to process.  Without CONFIG_SYN_COOKIES this is a
 * pass-through.
 */
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	if (!tcp_hdr(skb)->syn)
		sk = cookie_v4_check(sk, skb);
#endif
	return sk;
}
18681da177e4SLinus Torvalds
18699349d600SPetar Penkov u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
18709349d600SPetar Penkov struct tcphdr *th, u32 *cookie)
18719349d600SPetar Penkov {
18729349d600SPetar Penkov u16 mss = 0;
18739349d600SPetar Penkov #ifdef CONFIG_SYN_COOKIES
18749349d600SPetar Penkov mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
18759349d600SPetar Penkov &tcp_request_sock_ipv4_ops, sk, th);
18769349d600SPetar Penkov if (mss) {
18779349d600SPetar Penkov *cookie = __cookie_v4_init_sequence(iph, th, &mss);
18789349d600SPetar Penkov tcp_synq_overflow(sk);
18799349d600SPetar Penkov }
18809349d600SPetar Penkov #endif
18819349d600SPetar Penkov return mss;
18829349d600SPetar Penkov }
18839349d600SPetar Penkov
1884bbd807dfSBrian Vazquez INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1885bbd807dfSBrian Vazquez u32));
18861da177e4SLinus Torvalds /* The socket must have it's spinlock held when we get
1887e994b2f0SEric Dumazet * here, unless it is a TCP_LISTEN socket.
18881da177e4SLinus Torvalds *
18891da177e4SLinus Torvalds * We have a potential double-lock case here, so even when
18901da177e4SLinus Torvalds * doing backlog processing we use the BH locking scheme.
18911da177e4SLinus Torvalds * This is because we cannot sleep with the original spinlock
18921da177e4SLinus Torvalds * held.
18931da177e4SLinus Torvalds */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	enum skb_drop_reason reason;
	struct sock *rsk;	/* socket a RST is sent on behalf of */

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst;

		dst = rcu_dereference_protected(sk->sk_rx_dst,
						lockdep_sock_is_held(sk));

		sock_rps_save_rxhash(sk, skb);
		sk_mark_napi_id(sk, skb);
		if (dst) {
			/* Invalidate the cached input route if the packet
			 * arrived on a different device or the route went
			 * stale per dst->ops->check().
			 */
			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
					     dst, 0)) {
				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
				dst_release(dst);
			}
		}
		tcp_rcv_established(sk, skb);
		return 0;
	}

	if (tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		/* May return a child socket revived from a SYN cookie,
		 * the listener itself, or NULL (drop).
		 */
		struct sock *nsk = tcp_v4_cookie_check(sk, skb);

		if (!nsk)
			return 0;
		if (nsk != sk) {
			reason = tcp_child_process(sk, nsk, skb);
			if (reason) {
				/* Reset goes out on the child, not the listener */
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	/* Slow path: full state machine for all non-ESTABLISHED states */
	reason = tcp_rcv_state_process(sk, skb);
	if (reason) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason));
discard:
	/* reason is always set before reaching here (reset or csum_err path) */
	sk_skb_reason_drop(sk, skb, reason);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	reason = SKB_DROP_REASON_TCP_CSUM;
	trace_tcp_bad_csum(skb);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
19634bc2f18bSEric Dumazet EXPORT_SYMBOL(tcp_v4_do_rcv);
19641da177e4SLinus Torvalds
/* Early demux: before routing, try to match the skb to an established
 * TCP socket so its cached input route can be reused.  Always returns 0;
 * on a miss the packet simply continues through the normal input path.
 */
int tcp_v4_early_demux(struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;

	if (skb->pkt_type != PACKET_HOST)
		return 0;

	/* Need at least a minimal TCP header in linear data */
	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
		return 0;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	if (th->doff < sizeof(struct tcphdr) / 4)
		return 0;

	/* Established-table lookup only: listeners are not early-demuxed here */
	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
				       iph->saddr, th->source,
				       iph->daddr, ntohs(th->dest),
				       skb->skb_iif, inet_sdif(skb));
	if (sk) {
		skb->sk = sk;
		skb->destructor = sock_edemux;	/* drops the sk ref with the skb */
		if (sk_fullsock(sk)) {
			struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);

			if (dst)
				dst = dst_check(dst, 0);
			/* Only reuse the cached route if it matches the
			 * interface this packet arrived on.
			 */
			if (dst &&
			    sk->sk_rx_dst_ifindex == skb->skb_iif)
				skb_dst_set_noref(skb, dst);
		}
	}
	return 0;
}
200341063e9dSDavid S. Miller
/* Queue @skb on @sk's backlog while the socket is owned by user context.
 *
 * Called with the socket spinlock held.  Returns true when the packet was
 * dropped (bad checksum or backlog limit exceeded) — in that case the
 * lock has already been released via bh_unlock_sock() and *reason is set.
 * Returns false when the skb was queued or coalesced into the backlog
 * tail; the caller keeps the lock.
 */
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
		     enum skb_drop_reason *reason)
{
	u32 tail_gso_size, tail_gso_segs;
	struct skb_shared_info *shinfo;
	const struct tcphdr *th;
	struct tcphdr *thtail;
	struct sk_buff *tail;
	unsigned int hdrlen;
	bool fragstolen;
	u32 gso_segs;
	u32 gso_size;
	u64 limit;
	int delta;

	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
	 * we can fix skb->truesize to its real value to avoid future drops.
	 * This is valid because skb is not yet charged to the socket.
	 * It has been noticed pure SACK packets were sometimes dropped
	 * (if cooked by drivers without copybreak feature).
	 */
	skb_condense(skb);

	skb_dst_drop(skb);

	if (unlikely(tcp_checksum_complete(skb))) {
		bh_unlock_sock(sk);
		trace_tcp_bad_csum(skb);
		*reason = SKB_DROP_REASON_TCP_CSUM;
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
		return true;
	}

	/* Attempt coalescing to last skb in backlog, even if we are
	 * above the limits.
	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
	 */
	th = (const struct tcphdr *)skb->data;
	hdrlen = th->doff * 4;

	tail = sk->sk_backlog.tail;
	if (!tail)
		goto no_coalesce;
	thtail = (struct tcphdr *)tail->data;

	/* Coalesce only if: sequence numbers are contiguous, DSCP/ECN field
	 * matches, neither packet has SYN/RST/URG, both have ACK, ECE/CWR
	 * agree, and the TCP options (bytes after the fixed header) are
	 * byte-identical.
	 */
	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
	    ((TCP_SKB_CB(tail)->tcp_flags |
	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
	    !((TCP_SKB_CB(tail)->tcp_flags &
	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
	    ((TCP_SKB_CB(tail)->tcp_flags ^
	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
	    !tcp_skb_can_collapse_rx(tail, skb) ||
	    thtail->doff != th->doff ||
	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
		goto no_coalesce;

	__skb_pull(skb, hdrlen);

	/* Snapshot GSO accounting before the merge; ?: supplies defaults
	 * for non-GSO packets.
	 */
	shinfo = skb_shinfo(skb);
	gso_size = shinfo->gso_size ?: skb->len;
	gso_segs = shinfo->gso_segs ?: 1;

	shinfo = skb_shinfo(tail);
	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
	tail_gso_segs = shinfo->gso_segs ?: 1;

	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;

		/* Take the newer ack_seq/window only if it does not move
		 * backwards.
		 */
		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
			thtail->window = th->window;
		}

		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
		 * thtail->fin, so that the fast path in tcp_rcv_established()
		 * is not entered if we append a packet with a FIN.
		 * SYN, RST, URG are not present.
		 * ACK is set on both packets.
		 * PSH : we do not really care in TCP stack,
		 *	 at least for 'GRO' packets.
		 */
		thtail->fin |= th->fin;
		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;

		/* Propagate RX timestamps onto the surviving (tail) skb */
		if (TCP_SKB_CB(skb)->has_rxtstamp) {
			TCP_SKB_CB(tail)->has_rxtstamp = true;
			tail->tstamp = skb->tstamp;
			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
		}

		/* Not as strict as GRO. We only need to carry mss max value */
		shinfo->gso_size = max(gso_size, tail_gso_size);
		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);

		sk->sk_backlog.len += delta;
		__NET_INC_STATS(sock_net(sk),
				LINUX_MIB_TCPBACKLOGCOALESCE);
		kfree_skb_partial(skb, fragstolen);
		return false;
	}
	/* Coalescing failed: restore the header we pulled and fall through */
	__skb_push(skb, hdrlen);

no_coalesce:
	/* sk->sk_backlog.len is reset only at the end of __release_sock().
	 * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach
	 * sk_rcvbuf in normal conditions.
	 */
	limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1;

	limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1;

	/* Only socket owner can try to collapse/prune rx queues
	 * to reduce memory overhead, so add a little headroom here.
	 * Few sockets backlog are possibly concurrently non empty.
	 */
	limit += 64 * 1024;

	/* limit is computed in u64 to avoid overflow, then clamped */
	limit = min_t(u64, limit, UINT_MAX);

	if (unlikely(sk_add_backlog(sk, skb, limit))) {
		bh_unlock_sock(sk);
		*reason = SKB_DROP_REASON_SOCKET_BACKLOG;
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
		return true;
	}
	return false;
}
2135c9c33212SEric Dumazet EXPORT_SYMBOL(tcp_add_backlog);
2136c9c33212SEric Dumazet
2137ac6e7800SEric Dumazet int tcp_filter(struct sock *sk, struct sk_buff *skb)
2138ac6e7800SEric Dumazet {
2139ac6e7800SEric Dumazet struct tcphdr *th = (struct tcphdr *)skb->data;
2140ac6e7800SEric Dumazet
2141f2feaefdSChristoph Paasch return sk_filter_trim_cap(sk, skb, th->doff * 4);
2142ac6e7800SEric Dumazet }
2143ac6e7800SEric Dumazet EXPORT_SYMBOL(tcp_filter);
2144ac6e7800SEric Dumazet
/* Undo tcp_v4_fill_cb(): move the saved IPv4 control block back to the
 * front of skb->cb so IP-layer code (IPCB) sees valid data again.
 */
static void tcp_v4_restore_cb(struct sk_buff *skb)
{
	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
		sizeof(struct inet_skb_parm));
}
2150eeea10b8SEric Dumazet
/* Populate TCP_SKB_CB(skb) from the TCP header, preserving the IPv4
 * control block inside it so tcp_v4_restore_cb() can undo the move.
 */
static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
			   const struct tcphdr *th)
{
	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
	 * barrier() makes sure compiler wont play fool^Waliasing games.
	 */
	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
		sizeof(struct inet_skb_parm));
	barrier();

	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	/* end_seq counts SYN and FIN as one sequence unit each */
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
	TCP_SKB_CB(skb)->sacked = 0;
	/* true if either a software or hardware RX timestamp is present */
	TCP_SKB_CB(skb)->has_rxtstamp =
			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
}
2171eeea10b8SEric Dumazet
21721da177e4SLinus Torvalds /*
21731da177e4SLinus Torvalds * From tcp_input.c
21741da177e4SLinus Torvalds */
21751da177e4SLinus Torvalds
/* Main IPv4 TCP receive entry point (called from the IP layer).
 *
 * Looks up the owning socket, validates checksum and policy, then either
 * processes the segment directly (tcp_v4_do_rcv) or queues it on the
 * socket backlog if the socket is owned by user context.  Handles the
 * NEW_SYN_RECV (request sock) and TIME_WAIT special cases inline.
 * Returns 0 in almost all cases; ret from tcp_v4_do_rcv otherwise.
 */
int tcp_v4_rcv(struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);
	enum skb_drop_reason drop_reason;
	int sdif = inet_sdif(skb);
	int dif = inet_iif(skb);
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk = NULL;
	bool refcounted;	/* did the lookup take a reference on sk? */
	int ret;
	u32 isn;

	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	__TCP_INC_STATS(net, TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = (const struct tcphdr *)skb->data;

	if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
		drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
		goto bad_packet;
	}
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */

	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
		goto csum_error;

	/* Re-read header pointers: pskb_may_pull() may have relocated data */
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
lookup:
	sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo,
			       skb, __tcp_hdrlen(th), th->source,
			       th->dest, sdif, &refcounted);
	if (!sk)
		goto no_tcp_socket;

	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		/* Segment matched a pending request sock (3WHS in progress) */
		struct request_sock *req = inet_reqsk(sk);
		bool req_stolen = false;
		struct sock *nsk;

		sk = req->rsk_listener;
		if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
			drop_reason = SKB_DROP_REASON_XFRM_POLICY;
		else
			drop_reason = tcp_inbound_hash(sk, req, skb,
						       &iph->saddr, &iph->daddr,
						       AF_INET, dif, sdif);
		if (unlikely(drop_reason)) {
			sk_drops_add(sk, skb);
			reqsk_put(req);
			goto discard_it;
		}
		if (tcp_checksum_complete(skb)) {
			reqsk_put(req);
			goto csum_error;
		}
		if (unlikely(sk->sk_state != TCP_LISTEN)) {
			/* Listener closed meanwhile; try migrating the req
			 * to another reuseport listener, else drop and redo
			 * the lookup.
			 */
			nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
			if (!nsk) {
				inet_csk_reqsk_queue_drop_and_put(sk, req);
				goto lookup;
			}
			sk = nsk;
			/* reuseport_migrate_sock() has already held one sk_refcnt
			 * before returning.
			 */
		} else {
			/* We own a reference on the listener, increase it again
			 * as we might lose it too soon.
			 */
			sock_hold(sk);
		}
		refcounted = true;
		nsk = NULL;
		if (!tcp_filter(sk, skb)) {
			/* tcp_filter() may trim/realloc: refresh pointers */
			th = (const struct tcphdr *)skb->data;
			iph = ip_hdr(skb);
			tcp_v4_fill_cb(skb, iph, th);
			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
		} else {
			drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
		}
		if (!nsk) {
			reqsk_put(req);
			if (req_stolen) {
				/* Another cpu got exclusive access to req
				 * and created a full blown socket.
				 * Try to feed this packet to this socket
				 * instead of discarding it.
				 */
				tcp_v4_restore_cb(skb);
				sock_put(sk);
				goto lookup;
			}
			goto discard_and_relse;
		}
		nf_reset_ct(skb);
		if (nsk == sk) {
			/* Still the listener: fall through to normal path */
			reqsk_put(req);
			tcp_v4_restore_cb(skb);
		} else {
			/* tcp_check_req() created a child socket */
			drop_reason = tcp_child_process(sk, nsk, skb);
			if (drop_reason) {
				enum sk_rst_reason rst_reason;

				rst_reason = sk_rst_convert_drop_reason(drop_reason);
				tcp_v4_send_reset(nsk, skb, rst_reason);
				goto discard_and_relse;
			}
			sock_put(sk);
			return 0;
		}
	}

process:
	if (static_branch_unlikely(&ip4_min_ttl)) {
		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
			drop_reason = SKB_DROP_REASON_TCP_MINTTL;
			goto discard_and_relse;
		}
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
		goto discard_and_relse;
	}

	/* MD5/AO signature verification for established sockets */
	drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr,
				       AF_INET, dif, sdif);
	if (drop_reason)
		goto discard_and_relse;

	nf_reset_ct(skb);

	if (tcp_filter(sk, skb)) {
		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
		goto discard_and_relse;
	}
	/* Refresh header pointers after possible filter trim */
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
	tcp_v4_fill_cb(skb, iph, th);

	skb->dev = NULL;

	/* Listeners are lockless: process without the bh lock */
	if (sk->sk_state == TCP_LISTEN) {
		ret = tcp_v4_do_rcv(sk, skb);
		goto put_and_return;
	}

	sk_incoming_cpu_update(sk);

	bh_lock_sock_nested(sk);
	tcp_segs_in(tcp_sk(sk), skb);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
		ret = tcp_v4_do_rcv(sk, skb);
	} else {
		/* Socket owned by user: defer to backlog.  On true return
		 * the bh lock has already been released.
		 */
		if (tcp_add_backlog(sk, skb, &drop_reason))
			goto discard_and_relse;
	}
	bh_unlock_sock(sk);

put_and_return:
	if (refcounted)
		sock_put(sk);

	return ret;

no_tcp_socket:
	drop_reason = SKB_DROP_REASON_NO_SOCKET;
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	tcp_v4_fill_cb(skb, iph, th);

	if (tcp_checksum_complete(skb)) {
csum_error:
		drop_reason = SKB_DROP_REASON_TCP_CSUM;
		trace_tcp_bad_csum(skb);
		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
bad_packet:
		__TCP_INC_STATS(net, TCP_MIB_INERRS);
	} else {
		/* No matching socket: answer with a RST */
		tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason));
	}

discard_it:
	SKB_DR_OR(drop_reason, NOT_SPECIFIED);
	/* Discard frame. */
	sk_skb_reason_drop(sk, skb, drop_reason);
	return 0;

discard_and_relse:
	sk_drops_add(sk, skb);
	if (refcounted)
		sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	tcp_v4_fill_cb(skb, iph, th);

	if (tcp_checksum_complete(skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto csum_error;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn)) {
	case TCP_TW_SYN: {
		/* Acceptable new SYN: hand the segment to a listener and
		 * retire the timewait socket.
		 */
		struct sock *sk2 = inet_lookup_listener(net,
							net->ipv4.tcp_death_row.hashinfo,
							skb, __tcp_hdrlen(th),
							iph->saddr, th->source,
							iph->daddr, th->dest,
							inet_iif(skb),
							sdif);
		if (sk2) {
			inet_twsk_deschedule_put(inet_twsk(sk));
			sk = sk2;
			tcp_v4_restore_cb(skb);
			refcounted = false;
			__this_cpu_write(tcp_tw_isn, isn);
			goto process;
		}
	}
		/* to ACK */
		fallthrough;
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET);
		inet_twsk_deschedule_put(inet_twsk(sk));
		goto discard_it;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}
24371da177e4SLinus Torvalds
/* Per-protocol timewait glue: how large a TIME_WAIT socket object is for
 * TCP, and the destructor run when one is freed.
 */
static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
	.twsk_destructor= tcp_twsk_destructor,
};
24421da177e4SLinus Torvalds
244363d02d15SEric Dumazet void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
24445d299f3dSEric Dumazet {
24455d299f3dSEric Dumazet struct dst_entry *dst = skb_dst(skb);
24465d299f3dSEric Dumazet
24475037e9efSEric Dumazet if (dst && dst_hold_safe(dst)) {
24488f905c0eSEric Dumazet rcu_assign_pointer(sk->sk_rx_dst, dst);
24490c0a5ef8SEric Dumazet sk->sk_rx_dst_ifindex = skb->skb_iif;
24505d299f3dSEric Dumazet }
2451ca777effSEric Dumazet }
245263d02d15SEric Dumazet EXPORT_SYMBOL(inet_sk_rx_dst_set);
24535d299f3dSEric Dumazet
/* IPv4 instantiation of the AF-independent connection-socket operations:
 * transmit, checksum, header rebuild, connection request / child socket
 * creation, sockopt plumbing and path-MTU reaction for TCP over IPv4.
 */
const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.addr2sockaddr	   = inet_csk_addr2sockaddr,
	.sockaddr_len	   = sizeof(struct sockaddr_in),
	.mtu_reduced	   = tcp_v4_mtu_reduced,
};
EXPORT_SYMBOL(ipv4_specific);
24691da177e4SLinus Torvalds
#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
/* IPv4 hooks for TCP segment signing: key lookup, hash computation over
 * an skb, and setsockopt option parsing. The MD5 and AO groups are each
 * compiled only when the corresponding config option is enabled.
 */
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
#ifdef CONFIG_TCP_MD5SIG
	.md5_lookup	= tcp_v4_md5_lookup,
	.calc_md5_hash	= tcp_v4_md5_hash_skb,
	.md5_parse	= tcp_v4_parse_md5_keys,
#endif
#ifdef CONFIG_TCP_AO
	.ao_lookup	= tcp_v4_ao_lookup,
	.calc_ao_hash	= tcp_v4_ao_hash_skb,
	.ao_parse	= tcp_v4_parse_ao,
	.ao_calc_key_sk	= tcp_v4_ao_calc_key_sk,
#endif
};
#endif
2485cfb6eeb4SYOSHIFUJI Hideaki
24861da177e4SLinus Torvalds /* NOTE: A lot of things set to zero explicitly by call to
24871da177e4SLinus Torvalds * sk_alloc() so need not be done here.
24881da177e4SLinus Torvalds */
24891da177e4SLinus Torvalds static int tcp_v4_init_sock(struct sock *sk)
24901da177e4SLinus Torvalds {
24916687e988SArnaldo Carvalho de Melo struct inet_connection_sock *icsk = inet_csk(sk);
24921da177e4SLinus Torvalds
2493900f65d3SNeal Cardwell tcp_init_sock(sk);
24941da177e4SLinus Torvalds
24958292a17aSArnaldo Carvalho de Melo icsk->icsk_af_ops = &ipv4_specific;
2496900f65d3SNeal Cardwell
24974954f17dSDmitry Safonov #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2498ac807fa8SDavid S. Miller tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2499cfb6eeb4SYOSHIFUJI Hideaki #endif
25001da177e4SLinus Torvalds
25011da177e4SLinus Torvalds return 0;
25021da177e4SLinus Torvalds }
25031da177e4SLinus Torvalds
#ifdef CONFIG_TCP_MD5SIG
/* RCU callback: free the tcp_md5sig_info container once readers are done,
 * then drop the tcp_md5_needed static branch and the MD5 sigpool
 * reference — presumably balancing references taken when MD5 state was
 * installed (the matching increments live outside this file region).
 */
static void tcp_md5sig_info_free_rcu(struct rcu_head *head)
{
	struct tcp_md5sig_info *md5sig;

	md5sig = container_of(head, struct tcp_md5sig_info, rcu);
	kfree(md5sig);
	static_branch_slow_dec_deferred(&tcp_md5_needed);
	tcp_md5_release_sigpool();
}
#endif
25158c73b263SDmitry Safonov
/* Return every page-pool fragment tracked in sk->sk_user_frags back to
 * its pool. A failed napi_pp_put_page() would indicate a refcount
 * imbalance, hence the WARN. No-op when CONFIG_PAGE_POOL is disabled.
 */
static void tcp_release_user_frags(struct sock *sk)
{
#ifdef CONFIG_PAGE_POOL
	unsigned long index;
	void *netmem;

	xa_for_each(&sk->sk_user_frags, index, netmem)
		WARN_ON_ONCE(!napi_pp_put_page((__force netmem_ref)netmem));
#endif
}
2526*8f0b3cc9SMina Almasry
/* Final per-socket teardown for TCP: release queues, timers, congestion
 * control, ULP, signing state and the bind bucket. Shared with IPv6
 * (exported below); ordering of the steps matters.
 */
void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* Give back any page-pool frags pinned for this socket, then tear
	 * down the xarray that tracked them.
	 */
	tcp_release_user_frags(sk);

	xa_destroy(&sk->sk_user_frags);

	trace_tcp_destroy_sock(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	tcp_cleanup_ulp(sk);

	/* Cleanup up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Check if we want to disable active TFO */
	tcp_fastopen_active_disable_ofo_check(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	skb_rbtree_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
	/* Clean up the MD5 key list, if any */
	if (tp->md5sig_info) {
		struct tcp_md5sig_info *md5sig;

		md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
		tcp_clear_md5_list(sk);
		/* Free the container only after a grace period; concurrent
		 * readers may still be traversing it.
		 */
		call_rcu(&md5sig->rcu, tcp_md5sig_info_free_rcu);
		rcu_assign_pointer(tp->md5sig_info, NULL);
	}
#endif
	tcp_ao_destroy_sock(sk, false);

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	/* No Fast Open request socket may still be attached here. */
	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));

	/* If socket is aborted during connect operation */
	tcp_free_fastopen_req(tp);
	tcp_fastopen_destroy_cipher(sk);
	tcp_saved_syn_free(tp);

	sk_sockets_allocated_dec(sk);
}
EXPORT_SYMBOL(tcp_v4_destroy_sock);
25791da177e4SLinus Torvalds
25801da177e4SLinus Torvalds #ifdef CONFIG_PROC_FS
25811da177e4SLinus Torvalds /* Proc filesystem TCP sock list dumping. */
25821da177e4SLinus Torvalds
2583ad2d6137SMartin KaFai Lau static unsigned short seq_file_family(const struct seq_file *seq);
2584ad2d6137SMartin KaFai Lau
2585ad2d6137SMartin KaFai Lau static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2586ad2d6137SMartin KaFai Lau {
2587ad2d6137SMartin KaFai Lau unsigned short family = seq_file_family(seq);
2588ad2d6137SMartin KaFai Lau
2589ad2d6137SMartin KaFai Lau /* AF_UNSPEC is used as a match all */
2590ad2d6137SMartin KaFai Lau return ((family == AF_UNSPEC || family == sk->sk_family) &&
2591ad2d6137SMartin KaFai Lau net_eq(sock_net(sk), seq_file_net(seq)));
2592ad2d6137SMartin KaFai Lau }
2593ad2d6137SMartin KaFai Lau
/* Find a non empty bucket (starting from st->bucket)
 * and return the first sk from it.
 *
 * On success the bucket's spinlock is left held; it is released later by
 * listening_get_next() or tcp_seq_stop().
 */
static void *listening_get_first(struct seq_file *seq)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct tcp_iter_state *st = seq->private;

	st->offset = 0;
	for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
		struct inet_listen_hashbucket *ilb2;
		struct hlist_nulls_node *node;
		struct sock *sk;

		ilb2 = &hinfo->lhash2[st->bucket];
		/* Lockless peek to skip obviously empty buckets. */
		if (hlist_nulls_empty(&ilb2->nulls_head))
			continue;

		spin_lock(&ilb2->lock);
		sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
			if (seq_sk_match(seq, sk))
				return sk;	/* bucket lock intentionally held */
		}
		spin_unlock(&ilb2->lock);
	}

	return NULL;
}
2622b72acf45SMartin KaFai Lau
/* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
 * If "cur" is the last one in the st->bucket,
 * call listening_get_first() to return the first sk of the next
 * non empty bucket.
 *
 * Called with the current bucket's lock held (taken by
 * listening_get_first()); the lock is dropped before moving on.
 */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_iter_state *st = seq->private;
	struct inet_listen_hashbucket *ilb2;
	struct hlist_nulls_node *node;
	struct inet_hashinfo *hinfo;
	struct sock *sk = cur;

	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);
	sk_nulls_for_each_from(sk, node) {
		if (seq_sk_match(seq, sk))
			return sk;
	}

	/* End of this bucket: unlock it and continue with the next one. */
	hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	ilb2 = &hinfo->lhash2[st->bucket];
	spin_unlock(&ilb2->lock);
	++st->bucket;
	return listening_get_first(seq);
}
26511da177e4SLinus Torvalds
26521da177e4SLinus Torvalds static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
26531da177e4SLinus Torvalds {
2654a8b690f9STom Herbert struct tcp_iter_state *st = seq->private;
2655a8b690f9STom Herbert void *rc;
2656a8b690f9STom Herbert
2657a8b690f9STom Herbert st->bucket = 0;
2658a8b690f9STom Herbert st->offset = 0;
2659b72acf45SMartin KaFai Lau rc = listening_get_first(seq);
26601da177e4SLinus Torvalds
26611da177e4SLinus Torvalds while (rc && *pos) {
26621da177e4SLinus Torvalds rc = listening_get_next(seq, rc);
26631da177e4SLinus Torvalds --*pos;
26641da177e4SLinus Torvalds }
26651da177e4SLinus Torvalds return rc;
26661da177e4SLinus Torvalds }
26671da177e4SLinus Torvalds
/* True when the ehash bucket selected by st->bucket holds no sockets. */
static inline bool empty_bucket(struct inet_hashinfo *hinfo,
				const struct tcp_iter_state *st)
{
	return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
}
26736eac5604SAndi Kleen
/*
 * Get first established socket starting from bucket given in st->bucket.
 * If st->bucket is zero, the very first socket in the hash is returned.
 *
 * On success the ehash bucket lock is left held (BH disabled); it is
 * released later by established_get_next() or tcp_seq_stop().
 */
static void *established_get_first(struct seq_file *seq)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct tcp_iter_state *st = seq->private;

	st->offset = 0;
	for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
		struct sock *sk;
		struct hlist_nulls_node *node;
		spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);

		/* The ehash can be huge; don't hog the CPU between buckets. */
		cond_resched();

		/* Lockless fast path for the common case of empty buckets */
		if (empty_bucket(hinfo, st))
			continue;

		spin_lock_bh(lock);
		sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
			if (seq_sk_match(seq, sk))
				return sk;	/* bucket lock intentionally held */
		}
		spin_unlock_bh(lock);
	}

	return NULL;
}
27051da177e4SLinus Torvalds
/* Advance to the next matching socket in the current ehash bucket; when
 * the bucket is exhausted, drop its lock and continue with
 * established_get_first() from the following bucket.
 * Called with the current bucket's lock held (BH disabled).
 */
static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct tcp_iter_state *st = seq->private;
	struct hlist_nulls_node *node;
	struct sock *sk = cur;

	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);

	sk_nulls_for_each_from(sk, node) {
		if (seq_sk_match(seq, sk))
			return sk;
	}

	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
	++st->bucket;
	return established_get_first(seq);
}
27271da177e4SLinus Torvalds
27281da177e4SLinus Torvalds static void *established_get_idx(struct seq_file *seq, loff_t pos)
27291da177e4SLinus Torvalds {
2730a8b690f9STom Herbert struct tcp_iter_state *st = seq->private;
2731a8b690f9STom Herbert void *rc;
2732a8b690f9STom Herbert
2733a8b690f9STom Herbert st->bucket = 0;
2734a8b690f9STom Herbert rc = established_get_first(seq);
27351da177e4SLinus Torvalds
27361da177e4SLinus Torvalds while (rc && pos) {
27371da177e4SLinus Torvalds rc = established_get_next(seq, rc);
27381da177e4SLinus Torvalds --pos;
27391da177e4SLinus Torvalds }
27401da177e4SLinus Torvalds return rc;
27411da177e4SLinus Torvalds }
27421da177e4SLinus Torvalds
27431da177e4SLinus Torvalds static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
27441da177e4SLinus Torvalds {
27451da177e4SLinus Torvalds void *rc;
27461da177e4SLinus Torvalds struct tcp_iter_state *st = seq->private;
27471da177e4SLinus Torvalds
27481da177e4SLinus Torvalds st->state = TCP_SEQ_STATE_LISTENING;
27491da177e4SLinus Torvalds rc = listening_get_idx(seq, &pos);
27501da177e4SLinus Torvalds
27511da177e4SLinus Torvalds if (!rc) {
27521da177e4SLinus Torvalds st->state = TCP_SEQ_STATE_ESTABLISHED;
27531da177e4SLinus Torvalds rc = established_get_idx(seq, pos);
27541da177e4SLinus Torvalds }
27551da177e4SLinus Torvalds
27561da177e4SLinus Torvalds return rc;
27571da177e4SLinus Torvalds }
27581da177e4SLinus Torvalds
/* Try to resume iteration at the position saved in st (state, bucket,
 * offset) instead of rescanning from the start. Returns NULL when the
 * saved position is out of range, in which case the caller falls back to
 * a full walk. st->num is preserved across the probe.
 */
static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct tcp_iter_state *st = seq->private;
	int bucket = st->bucket;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket > hinfo->lhash2_mask)
			break;
		rc = listening_get_first(seq);
		/* Re-apply the saved in-bucket offset, but stop early if the
		 * walk drifted into a different bucket.
		 */
		while (offset-- && rc && bucket == st->bucket)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		fallthrough;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (st->bucket > hinfo->ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc && bucket == st->bucket)
			rc = established_get_next(seq, rc);
	}

	st->num = orig_num;

	return rc;
}
2792a8b690f9STom Herbert
279337d849bbSChristoph Hellwig void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
27941da177e4SLinus Torvalds {
27951da177e4SLinus Torvalds struct tcp_iter_state *st = seq->private;
2796a8b690f9STom Herbert void *rc;
2797a8b690f9STom Herbert
2798a8b690f9STom Herbert if (*pos && *pos == st->last_pos) {
2799a8b690f9STom Herbert rc = tcp_seek_last_pos(seq);
2800a8b690f9STom Herbert if (rc)
2801a8b690f9STom Herbert goto out;
2802a8b690f9STom Herbert }
2803a8b690f9STom Herbert
28041da177e4SLinus Torvalds st->state = TCP_SEQ_STATE_LISTENING;
28051da177e4SLinus Torvalds st->num = 0;
2806a8b690f9STom Herbert st->bucket = 0;
2807a8b690f9STom Herbert st->offset = 0;
2808a8b690f9STom Herbert rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2809a8b690f9STom Herbert
2810a8b690f9STom Herbert out:
2811a8b690f9STom Herbert st->last_pos = *pos;
2812a8b690f9STom Herbert return rc;
28131da177e4SLinus Torvalds }
281437d849bbSChristoph Hellwig EXPORT_SYMBOL(tcp_seq_start);
28151da177e4SLinus Torvalds
281637d849bbSChristoph Hellwig void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
28171da177e4SLinus Torvalds {
2818a8b690f9STom Herbert struct tcp_iter_state *st = seq->private;
28191da177e4SLinus Torvalds void *rc = NULL;
28201da177e4SLinus Torvalds
28211da177e4SLinus Torvalds if (v == SEQ_START_TOKEN) {
28221da177e4SLinus Torvalds rc = tcp_get_idx(seq, 0);
28231da177e4SLinus Torvalds goto out;
28241da177e4SLinus Torvalds }
28251da177e4SLinus Torvalds
28261da177e4SLinus Torvalds switch (st->state) {
28271da177e4SLinus Torvalds case TCP_SEQ_STATE_LISTENING:
28281da177e4SLinus Torvalds rc = listening_get_next(seq, v);
28291da177e4SLinus Torvalds if (!rc) {
28301da177e4SLinus Torvalds st->state = TCP_SEQ_STATE_ESTABLISHED;
2831a8b690f9STom Herbert st->bucket = 0;
2832a8b690f9STom Herbert st->offset = 0;
28331da177e4SLinus Torvalds rc = established_get_first(seq);
28341da177e4SLinus Torvalds }
28351da177e4SLinus Torvalds break;
28361da177e4SLinus Torvalds case TCP_SEQ_STATE_ESTABLISHED:
28371da177e4SLinus Torvalds rc = established_get_next(seq, v);
28381da177e4SLinus Torvalds break;
28391da177e4SLinus Torvalds }
28401da177e4SLinus Torvalds out:
28411da177e4SLinus Torvalds ++*pos;
2842a8b690f9STom Herbert st->last_pos = *pos;
28431da177e4SLinus Torvalds return rc;
28441da177e4SLinus Torvalds }
284537d849bbSChristoph Hellwig EXPORT_SYMBOL(tcp_seq_next);
28461da177e4SLinus Torvalds
284737d849bbSChristoph Hellwig void tcp_seq_stop(struct seq_file *seq, void *v)
28481da177e4SLinus Torvalds {
28494461568aSKuniyuki Iwashima struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
28501da177e4SLinus Torvalds struct tcp_iter_state *st = seq->private;
28511da177e4SLinus Torvalds
28521da177e4SLinus Torvalds switch (st->state) {
28531da177e4SLinus Torvalds case TCP_SEQ_STATE_LISTENING:
28541da177e4SLinus Torvalds if (v != SEQ_START_TOKEN)
28554461568aSKuniyuki Iwashima spin_unlock(&hinfo->lhash2[st->bucket].lock);
28561da177e4SLinus Torvalds break;
28571da177e4SLinus Torvalds case TCP_SEQ_STATE_ESTABLISHED:
28581da177e4SLinus Torvalds if (v)
28594461568aSKuniyuki Iwashima spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
28601da177e4SLinus Torvalds break;
28611da177e4SLinus Torvalds }
28621da177e4SLinus Torvalds }
286337d849bbSChristoph Hellwig EXPORT_SYMBOL(tcp_seq_stop);
28641da177e4SLinus Torvalds
/* Format one SYN_RECV request socket as a /proc/net/tcp entry. Several
 * columns (queues, inode, refcnt) are printed as constants because a
 * request_sock has no such state.
 */
static void get_openreq4(const struct request_sock *req,
			 struct seq_file *f, int i)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	long delta = req->rsk_timer.expires - jiffies;	/* time left on rsk timer */

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
		i,
		ireq->ir_loc_addr,
		ireq->ir_num,
		ireq->ir_rmt_addr,
		ntohs(ireq->ir_rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_delta_to_clock_t(delta),
		req->num_timeout,
		from_kuid_munged(seq_user_ns(f),
				 sock_i_uid(req->rsk_listener)),
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		0,
		req);
}
28901da177e4SLinus Torvalds
/* Format one full TCP socket (listening or established) as a
 * /proc/net/tcp entry. Runs without the socket lock, so all fields are
 * read with READ_ONCE()-style tolerance for transient inconsistency.
 */
static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
{
	int timer_active;
	unsigned long timer_expires;
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct inet_sock *inet = inet_sk(sk);
	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	int rx_queue;
	int state;

	/* Encode the pending timer as the "tr" column: 1 = retransmit /
	 * reo / loss-probe, 4 = zero-window probe, 2 = sk_timer, 0 = none.
	 */
	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
		timer_active	= 1;
		timer_expires	= icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= icsk->icsk_timeout;
	} else if (timer_pending(&sk->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sk->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires = jiffies;
	}

	state = inet_sk_state_load(sk);
	if (state == TCP_LISTEN)
		rx_queue = READ_ONCE(sk->sk_ack_backlog);
	else
		/* Because we don't lock the socket,
		 * we might find a transient negative value.
		 */
		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
				      READ_ONCE(tp->copied_seq), 0);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
		i, src, srcp, dest, destp, state,
		READ_ONCE(tp->write_seq) - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_delta_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
		icsk->icsk_probes_out,
		sock_i_ino(sk),
		refcount_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
		tcp_snd_cwnd(tp),
		/* Last column: listen backlog limit for listeners, else
		 * slow-start marker (-1) or ssthresh.
		 */
		state == TCP_LISTEN ?
		    fastopenq->max_qlen :
		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}
29521da177e4SLinus Torvalds
/* Format one TIME_WAIT socket as a /proc/net/tcp entry; queue, uid and
 * retransmit fields do not exist for tw sockets and print as zero.
 */
static void get_timewait4_sock(const struct inet_timewait_sock *tw,
			       struct seq_file *f, int i)
{
	long delta = tw->tw_timer.expires - jiffies;	/* time left in TIME_WAIT */
	__be32 dest, src;
	__u16 destp, srcp;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
		i, src, srcp, dest, destp, READ_ONCE(tw->tw_substate), 0, 0,
		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
		refcount_read(&tw->tw_refcnt), tw);
}
29711da177e4SLinus Torvalds
/* Width each formatted entry is padded to (seq_setwidth uses TMPSZ - 1). */
#define TMPSZ 150

/* seq_file ->show(): emit the header for SEQ_START_TOKEN, otherwise
 * dispatch on socket flavor — timewait and request sockets have their
 * own minimal layouts.
 */
static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	struct sock *sk = v;

	seq_setwidth(seq, TMPSZ - 1);
	if (v == SEQ_START_TOKEN) {
		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	if (sk->sk_state == TCP_TIME_WAIT)
		get_timewait4_sock(v, seq, st->num);
	else if (sk->sk_state == TCP_NEW_SYN_RECV)
		get_openreq4(v, seq, st->num);
	else
		get_tcp4_sock(v, seq, st->num);
out:
	seq_pad(seq, '\n');
	return 0;
}
29981da177e4SLinus Torvalds
299952d87d5fSYonghong Song #ifdef CONFIG_BPF_SYSCALL
/* Iterator state for the BPF TCP socket iterator: extends the plain seq
 * state with a batch of held socket pointers (released via
 * bpf_iter_tcp_put_batch()).
 */
struct bpf_tcp_iter_state {
	struct tcp_iter_state state;	/* embedded seq iteration state */
	unsigned int cur_sk;		/* next batch index to hand out */
	unsigned int end_sk;		/* number of sockets in the batch */
	unsigned int max_sk;		/* allocated capacity of batch */
	struct sock **batch;		/* sockets with a reference held */
	bool st_bucket_done;		/* current bucket fully batched? */
};
300804c7820bSMartin KaFai Lau
/* Context handed to a BPF iterator program for each TCP socket
 * (see tcp_prog_seq_show()).
 */
struct bpf_iter__tcp {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct sock_common *, sk_common);
	uid_t uid __aligned(8);
};
301452d87d5fSYonghong Song
301552d87d5fSYonghong Song static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
301652d87d5fSYonghong Song struct sock_common *sk_common, uid_t uid)
301752d87d5fSYonghong Song {
301852d87d5fSYonghong Song struct bpf_iter__tcp ctx;
301952d87d5fSYonghong Song
302052d87d5fSYonghong Song meta->seq_num--; /* skip SEQ_START_TOKEN */
302152d87d5fSYonghong Song ctx.meta = meta;
302252d87d5fSYonghong Song ctx.sk_common = sk_common;
302352d87d5fSYonghong Song ctx.uid = uid;
302452d87d5fSYonghong Song return bpf_iter_run_prog(prog, &ctx);
302552d87d5fSYonghong Song }
302652d87d5fSYonghong Song
302704c7820bSMartin KaFai Lau static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
302804c7820bSMartin KaFai Lau {
302904c7820bSMartin KaFai Lau while (iter->cur_sk < iter->end_sk)
3030580031ffSMartin KaFai Lau sock_gen_put(iter->batch[iter->cur_sk++]);
303104c7820bSMartin KaFai Lau }
303204c7820bSMartin KaFai Lau
303304c7820bSMartin KaFai Lau static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
303404c7820bSMartin KaFai Lau unsigned int new_batch_sz)
303504c7820bSMartin KaFai Lau {
303604c7820bSMartin KaFai Lau struct sock **new_batch;
303704c7820bSMartin KaFai Lau
303804c7820bSMartin KaFai Lau new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
303904c7820bSMartin KaFai Lau GFP_USER | __GFP_NOWARN);
304004c7820bSMartin KaFai Lau if (!new_batch)
304104c7820bSMartin KaFai Lau return -ENOMEM;
304204c7820bSMartin KaFai Lau
304304c7820bSMartin KaFai Lau bpf_iter_tcp_put_batch(iter);
304404c7820bSMartin KaFai Lau kvfree(iter->batch);
304504c7820bSMartin KaFai Lau iter->batch = new_batch;
304604c7820bSMartin KaFai Lau iter->max_sk = new_batch_sz;
304704c7820bSMartin KaFai Lau
304804c7820bSMartin KaFai Lau return 0;
304904c7820bSMartin KaFai Lau }
305004c7820bSMartin KaFai Lau
/* Batch start_sk plus every later socket in the same listening bucket
 * that matches this iterator (seq_sk_match()), taking a reference on
 * each one stored.  The lhash2 bucket spinlock is assumed held on
 * entry (taken during the caller's seek — TODO confirm against
 * tcp_seek_last_pos()) and is released before returning.
 *
 * Returns the total number of matching sockets seen; a value larger
 * than iter->end_sk means batch[] was too small and the caller must
 * grow it and retry.
 */
static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
						 struct sock *start_sk)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	struct hlist_nulls_node *node;
	unsigned int expected = 1;
	struct sock *sk;

	sock_hold(start_sk);
	iter->batch[iter->end_sk++] = start_sk;

	sk = sk_nulls_next(start_sk);
	sk_nulls_for_each_from(sk, node) {
		if (seq_sk_match(seq, sk)) {
			if (iter->end_sk < iter->max_sk) {
				sock_hold(sk);
				iter->batch[iter->end_sk++] = sk;
			}
			/* Count matches even when out of room so the caller
			 * knows how big the batch needs to be.
			 */
			expected++;
		}
	}
	spin_unlock(&hinfo->lhash2[st->bucket].lock);

	return expected;
}
307804c7820bSMartin KaFai Lau
/* Batch start_sk plus every later socket in the same established-hash
 * bucket that matches this iterator, taking a reference on each one
 * stored.  The ehash bucket lock is assumed held (BH-disabled) on
 * entry — TODO confirm against tcp_seek_last_pos() — and is released
 * with spin_unlock_bh() before returning.
 *
 * Returns the total number of matching sockets seen; a value larger
 * than iter->end_sk means batch[] was too small and the caller must
 * grow it and retry.
 */
static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
						   struct sock *start_sk)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	struct hlist_nulls_node *node;
	unsigned int expected = 1;
	struct sock *sk;

	sock_hold(start_sk);
	iter->batch[iter->end_sk++] = start_sk;

	sk = sk_nulls_next(start_sk);
	sk_nulls_for_each_from(sk, node) {
		if (seq_sk_match(seq, sk)) {
			if (iter->end_sk < iter->max_sk) {
				sock_hold(sk);
				iter->batch[iter->end_sk++] = sk;
			}
			/* Keep counting past max_sk so the caller can size
			 * the retry batch correctly.
			 */
			expected++;
		}
	}
	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));

	return expected;
}
310604c7820bSMartin KaFai Lau
/* Fill iter->batch with all matching sockets of the next unprocessed
 * bucket (each with a reference held) and return the first of them, or
 * NULL when iteration is complete.  If batch[] turns out too small for
 * the bucket, it is grown to 1.5x the required count and the bucket is
 * read again once; if even that fails, a truncated batch is returned
 * with st_bucket_done left false so the bucket is revisited.
 */
static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	unsigned int expected;
	bool resized = false;
	struct sock *sk;

	/* The st->bucket is done. Directly advance to the next
	 * bucket instead of having the tcp_seek_last_pos() to skip
	 * one by one in the current bucket and eventually find out
	 * it has to advance to the next bucket.
	 */
	if (iter->st_bucket_done) {
		st->offset = 0;
		st->bucket++;
		if (st->state == TCP_SEQ_STATE_LISTENING &&
		    st->bucket > hinfo->lhash2_mask) {
			/* Ran off the end of the listening table; move on
			 * to the established hash.
			 */
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
		}
	}

again:
	/* Get a new batch */
	iter->cur_sk = 0;
	iter->end_sk = 0;
	iter->st_bucket_done = false;

	sk = tcp_seek_last_pos(seq);
	if (!sk)
		return NULL; /* Done */

	if (st->state == TCP_SEQ_STATE_LISTENING)
		expected = bpf_iter_tcp_listening_batch(seq, sk);
	else
		expected = bpf_iter_tcp_established_batch(seq, sk);

	if (iter->end_sk == expected) {
		/* Everything in the bucket fit into batch[]. */
		iter->st_bucket_done = true;
		return sk;
	}

	if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
		resized = true;
		goto again;
	}

	return sk;
}
315804c7820bSMartin KaFai Lau
315904c7820bSMartin KaFai Lau static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
316004c7820bSMartin KaFai Lau {
316104c7820bSMartin KaFai Lau /* bpf iter does not support lseek, so it always
316204c7820bSMartin KaFai Lau * continue from where it was stop()-ped.
316304c7820bSMartin KaFai Lau */
316404c7820bSMartin KaFai Lau if (*pos)
316504c7820bSMartin KaFai Lau return bpf_iter_tcp_batch(seq);
316604c7820bSMartin KaFai Lau
316704c7820bSMartin KaFai Lau return SEQ_START_TOKEN;
316804c7820bSMartin KaFai Lau }
316904c7820bSMartin KaFai Lau
/* seq_file next(): release the socket just shown, then return the next
 * batched socket, refilling the batch from the next bucket when the
 * current batch is drained.
 */
static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	struct sock *sk;

	/* Whenever seq_next() is called, the iter->cur_sk is
	 * done with seq_show(), so advance to the next sk in
	 * the batch.
	 */
	if (iter->cur_sk < iter->end_sk) {
		/* Keeping st->num consistent in tcp_iter_state.
		 * bpf_iter_tcp does not use st->num.
		 * meta.seq_num is used instead.
		 */
		st->num++;
		/* Move st->offset to the next sk in the bucket such that
		 * the future start() will resume at st->offset in
		 * st->bucket. See tcp_seek_last_pos().
		 */
		st->offset++;
		sock_gen_put(iter->batch[iter->cur_sk++]);
	}

	if (iter->cur_sk < iter->end_sk)
		sk = iter->batch[iter->cur_sk];
	else
		sk = bpf_iter_tcp_batch(seq);

	++*pos;
	/* Keeping st->last_pos consistent in tcp_iter_state.
	 * bpf iter does not do lseek, so st->last_pos always equals to *pos.
	 */
	st->last_pos = *pos;
	return sk;
}
320604c7820bSMartin KaFai Lau
/* seq_file show(): run the attached bpf prog on one batched entry.
 * Depending on sk->sk_state, @v is a full socket, a timewait socket or
 * a request socket (TCP_NEW_SYN_RECV); only full sockets can be
 * lock_sock()ed.
 */
static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
{
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;
	struct sock *sk = v;
	uid_t uid;
	int ret;

	if (v == SEQ_START_TOKEN)
		return 0;

	if (sk_fullsock(sk))
		lock_sock(sk);

	/* The socket may have been closed/unhashed since it was batched;
	 * skip stale entries instead of showing them.
	 */
	if (unlikely(sk_unhashed(sk))) {
		ret = SEQ_SKIP;
		goto unlock;
	}

	if (sk->sk_state == TCP_TIME_WAIT) {
		uid = 0;
	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
		/* Request sockets have no owner; report the listener's uid. */
		const struct request_sock *req = v;

		uid = from_kuid_munged(seq_user_ns(seq),
				       sock_i_uid(req->rsk_listener));
	} else {
		uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
	}

	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, false);
	ret = tcp_prog_seq_show(prog, &meta, v, uid);

unlock:
	if (sk_fullsock(sk))
		release_sock(sk);
	return ret;

}
324752d87d5fSYonghong Song
/* seq_file stop(): on a clean end of iteration (v == NULL) the prog is
 * run one last time with a NULL sk_common so it can emit a footer.
 * Any sockets still batched but not shown are released so a later
 * start() re-batches the bucket.
 */
static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;

	if (!v) {
		meta.seq = seq;
		prog = bpf_iter_get_info(&meta, true);
		if (prog)
			(void)tcp_prog_seq_show(prog, &meta, v, 0);
	}

	if (iter->cur_sk < iter->end_sk) {
		/* Interrupted mid-bucket: drop the held refs and force the
		 * next start() to re-read this bucket.
		 */
		bpf_iter_tcp_put_batch(iter);
		iter->st_bucket_done = false;
	}
}
326652d87d5fSYonghong Song
/* seq_file operations used when iterating tcp sockets from bpf. */
static const struct seq_operations bpf_iter_tcp_seq_ops = {
	.show		= bpf_iter_tcp_seq_show,
	.start		= bpf_iter_tcp_seq_start,
	.next		= bpf_iter_tcp_seq_next,
	.stop		= bpf_iter_tcp_seq_stop,
};
327352d87d5fSYonghong Song #endif
/* Which address family this seq_file filters on: the family stored in
 * the proc entry's data for /proc readers, or AF_UNSPEC for the bpf
 * iterator (which matches all families and lets the prog filter).
 */
static unsigned short seq_file_family(const struct seq_file *seq)
{
	const struct tcp_seq_afinfo *afinfo;

#ifdef CONFIG_BPF_SYSCALL
	/* Iterated from bpf_iter. Let the bpf prog to filter instead. */
	if (seq->op == &bpf_iter_tcp_seq_ops)
		return AF_UNSPEC;
#endif

	/* Iterated from proc fs */
	afinfo = pde_data(file_inode(seq->file));
	return afinfo->family;
}
328852d87d5fSYonghong Song
/* seq_file operations behind /proc/net/tcp (see tcp4_proc_init_net()). */
static const struct seq_operations tcp4_seq_ops = {
	.show		= tcp4_seq_show,
	.start		= tcp_seq_start,
	.next		= tcp_seq_next,
	.stop		= tcp_seq_stop,
};
329537d849bbSChristoph Hellwig
/* /proc/net/tcp shows AF_INET sockets only (see seq_file_family()). */
static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.family		= AF_INET,
};
32991da177e4SLinus Torvalds
33002c8c1e72SAlexey Dobriyan static int __net_init tcp4_proc_init_net(struct net *net)
3301757764f6SPavel Emelyanov {
3302c3506372SChristoph Hellwig if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3303c3506372SChristoph Hellwig sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
330437d849bbSChristoph Hellwig return -ENOMEM;
330537d849bbSChristoph Hellwig return 0;
3306757764f6SPavel Emelyanov }
3307757764f6SPavel Emelyanov
/* Per-netns teardown: remove this namespace's /proc/net/tcp entry. */
static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	remove_proc_entry("tcp", net->proc_net);
}
3312757764f6SPavel Emelyanov
/* Creates/removes /proc/net/tcp for every network namespace. */
static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};
3317757764f6SPavel Emelyanov
/* Boot-time registration of the per-netns /proc/net/tcp handlers. */
int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}
33221da177e4SLinus Torvalds
/* Unregister the per-netns /proc/net/tcp handlers. */
void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
33271da177e4SLinus Torvalds #endif /* CONFIG_PROC_FS */
33281da177e4SLinus Torvalds
3329d3cd4924SEric Dumazet /* @wake is one when sk_stream_write_space() calls us.
3330d3cd4924SEric Dumazet * This sends EPOLLOUT only if notsent_bytes is half the limit.
3331d3cd4924SEric Dumazet * This mimics the strategy used in sock_def_write_space().
3332d3cd4924SEric Dumazet */
3333d3cd4924SEric Dumazet bool tcp_stream_memory_free(const struct sock *sk, int wake)
3334d3cd4924SEric Dumazet {
3335d3cd4924SEric Dumazet const struct tcp_sock *tp = tcp_sk(sk);
3336d3cd4924SEric Dumazet u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3337d3cd4924SEric Dumazet READ_ONCE(tp->snd_nxt);
3338d3cd4924SEric Dumazet
3339d3cd4924SEric Dumazet return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3340d3cd4924SEric Dumazet }
3341d3cd4924SEric Dumazet EXPORT_SYMBOL(tcp_stream_memory_free);
3342d3cd4924SEric Dumazet
/* The IPv4 TCP struct proto: per-socket operation hooks plus global
 * memory/orphan accounting state shared by all TCP sockets.
 */
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.pre_connect		= tcp_v4_pre_connect,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
	.keepalive		= tcp_set_keepalive,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.splice_eof		= tcp_splice_eof,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.put_port		= inet_put_port,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot	= tcp_bpf_update_proto,
#endif
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.leave_memory_pressure	= tcp_leave_memory_pressure,
	.stream_memory_free	= tcp_stream_memory_free,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,

	.memory_allocated	= &tcp_memory_allocated,
	.per_cpu_fw_alloc	= &tcp_memory_per_cpu_fw_alloc,

	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	/* NULL here: the hash table is chosen per-netns and stored in
	 * net->ipv4.tcp_death_row.hashinfo — see tcp_set_hashinfo().
	 */
	.h.hashinfo		= NULL,
	.no_autobind		= true,
	.diag_destroy		= tcp_abort,
};
EXPORT_SYMBOL(tcp_prot);
33941da177e4SLinus Torvalds
3395046ee902SDenis V. Lunev static void __net_exit tcp_sk_exit(struct net *net)
3396046ee902SDenis V. Lunev {
3397b506bc97SDust Li if (net->ipv4.tcp_congestion_control)
33980baf26b0SMartin KaFai Lau bpf_module_put(net->ipv4.tcp_congestion_control,
33990baf26b0SMartin KaFai Lau net->ipv4.tcp_congestion_control->owner);
3400bdbbb852SEric Dumazet }
3401bdbbb852SEric Dumazet
/* Choose the established hash table for a freshly created netns: a
 * private table sized by the *parent* netns' (current process' netns)
 * sysctl_tcp_child_ehash_entries, or the global tcp_hashinfo when that
 * sysctl is 0, when this is init_net, or when the private allocation
 * fails.  max_tw_buckets and max_syn_backlog are then scaled to the
 * chosen table size.
 */
static void __net_init tcp_set_hashinfo(struct net *net)
{
	struct inet_hashinfo *hinfo;
	unsigned int ehash_entries;
	struct net *old_net;

	if (net_eq(net, &init_net))
		goto fallback;

	/* The netns of the process creating this netns carries the sysctl. */
	old_net = current->nsproxy->net_ns;
	ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
	if (!ehash_entries)
		goto fallback;

	ehash_entries = roundup_pow_of_two(ehash_entries);
	hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
	if (!hinfo) {
		pr_warn("Failed to allocate TCP ehash (entries: %u) "
			"for a netns, fallback to the global one\n",
			ehash_entries);
		/* The label deliberately sits inside this if () so the warning
		 * is only printed on the allocation-failure path.
		 */
fallback:
		hinfo = &tcp_hashinfo;
		ehash_entries = tcp_hashinfo.ehash_mask + 1;
	}

	net->ipv4.tcp_death_row.hashinfo = hinfo;
	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
	net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
}
3431d1e5e640SKuniyuki Iwashima
/* Per-netns TCP initialisation: boot-time sysctl defaults, the ehash
 * table choice, fastopen/PLB defaults and the congestion control
 * module reference.  Always returns 0 — the only allocation inside
 * (tcp_set_hashinfo()) falls back to the global hash on failure
 * instead of erroring out.
 */
static int __net_init tcp_sk_init(struct net *net)
{
	net->ipv4.sysctl_tcp_ecn = 2;
	net->ipv4.sysctl_tcp_ecn_fallback = 1;

	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;

	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;

	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
	net->ipv4.sysctl_tcp_syncookies = 1;
	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
	net->ipv4.sysctl_tcp_orphan_retries = 0;
	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
	net->ipv4.sysctl_tcp_tw_reuse = 2;
	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;

	/* Initial tw_refcount; dropped in tcp_sk_exit_batch(). */
	refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
	tcp_set_hashinfo(net);

	net->ipv4.sysctl_tcp_sack = 1;
	net->ipv4.sysctl_tcp_window_scaling = 1;
	net->ipv4.sysctl_tcp_timestamps = 1;
	net->ipv4.sysctl_tcp_early_retrans = 3;
	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
	net->ipv4.sysctl_tcp_retrans_collapse = 1;
	net->ipv4.sysctl_tcp_max_reordering = 300;
	net->ipv4.sysctl_tcp_dsack = 1;
	net->ipv4.sysctl_tcp_app_win = 31;
	net->ipv4.sysctl_tcp_adv_win_scale = 1;
	net->ipv4.sysctl_tcp_frto = 2;
	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
	/* This limits the percentage of the congestion window which we
	 * will allow a single TSO frame to consume. Building TSO frames
	 * which are too large can cause TCP streams to be bursty.
	 */
	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
	/* Default TSQ limit of 16 TSO segments */
	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;

	/* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
	net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;

	net->ipv4.sysctl_tcp_min_tso_segs = 2;
	net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */
	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
	net->ipv4.sysctl_tcp_autocorking = 1;
	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
	/* Child netns inherit init_net's current rmem/wmem defaults. */
	if (net != &init_net) {
		memcpy(net->ipv4.sysctl_tcp_rmem,
		       init_net.ipv4.sysctl_tcp_rmem,
		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
		memcpy(net->ipv4.sysctl_tcp_wmem,
		       init_net.ipv4.sysctl_tcp_wmem,
		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
	}
	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
	net->ipv4.sysctl_tcp_backlog_ack_defer = 1;
	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
	atomic_set(&net->ipv4.tfo_active_disable_times, 0);

	/* Set default values for PLB */
	net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
	net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
	net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
	net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
	/* Default congestion threshold for PLB to mark a round is 50% */
	net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;

	/* Reno is always built in */
	if (!net_eq(net, &init_net) &&
	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
			       init_net.ipv4.tcp_congestion_control->owner))
		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
	else
		net->ipv4.tcp_congestion_control = &tcp_reno;

	net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
	net->ipv4.sysctl_tcp_shrink_window = 0;

	net->ipv4.sysctl_tcp_pingpong_thresh = 1;
	net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN);

	return 0;
}
3533b099ce26SEric W. Biederman
/* Batched per-netns teardown: purge timewait sockets, free any
 * per-netns ehash table and destroy the fastopen context for every
 * netns on the exit list.
 */
static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
	struct net *net;

	/* make sure concurrent calls to tcp_sk_exit_batch from net_cleanup_work
	 * and failed setup_net error unwinding path are serialized.
	 *
	 * tcp_twsk_purge() handles twsk in any dead netns, not just those in
	 * net_exit_list, the thread that dismantles a particular twsk must
	 * do so without other thread progressing to refcount_dec_and_test() of
	 * tcp_death_row.tw_refcount.
	 */
	mutex_lock(&tcp_exit_batch_mutex);

	tcp_twsk_purge(net_exit_list);

	list_for_each_entry(net, net_exit_list, exit_list) {
		inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
		/* The initial reference from tcp_sk_init() must be the last one. */
		WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
		tcp_fastopen_ctx_destroy(net);
	}

	mutex_unlock(&tcp_exit_batch_mutex);
}
3558046ee902SDenis V. Lunev
/* Per-netns TCP lifecycle hooks; heavy teardown is batched. */
static struct pernet_operations __net_initdata tcp_sk_ops = {
	.init	   = tcp_sk_init,
	.exit	   = tcp_sk_exit,
	.exit_batch = tcp_sk_exit_batch,
};
3564046ee902SDenis V. Lunev
356552d87d5fSYonghong Song #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
/* Declares the bpf_iter__tcp ctx layout for the "tcp" iterator target. */
DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
		     struct sock_common *sk_common, uid_t uid)

/* Initial batch capacity; grown on demand by bpf_iter_tcp_batch(). */
#define INIT_BATCH_SZ 16
357004c7820bSMartin KaFai Lau
3571f9c79272SYonghong Song static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
357252d87d5fSYonghong Song {
357304c7820bSMartin KaFai Lau struct bpf_tcp_iter_state *iter = priv_data;
357404c7820bSMartin KaFai Lau int err;
357552d87d5fSYonghong Song
357604c7820bSMartin KaFai Lau err = bpf_iter_init_seq_net(priv_data, aux);
357704c7820bSMartin KaFai Lau if (err)
357804c7820bSMartin KaFai Lau return err;
357952d87d5fSYonghong Song
358004c7820bSMartin KaFai Lau err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
358104c7820bSMartin KaFai Lau if (err) {
358204c7820bSMartin KaFai Lau bpf_iter_fini_seq_net(priv_data);
358304c7820bSMartin KaFai Lau return err;
358404c7820bSMartin KaFai Lau }
358504c7820bSMartin KaFai Lau
358604c7820bSMartin KaFai Lau return 0;
358752d87d5fSYonghong Song }
358852d87d5fSYonghong Song
/* seq_file private teardown: undo bpf_iter_init_tcp(). */
static void bpf_iter_fini_tcp(void *priv_data)
{
	struct bpf_tcp_iter_state *iter = priv_data;

	bpf_iter_fini_seq_net(priv_data);
	/* Batched sockets are presumably already put by stop() at this
	 * point — TODO confirm; only the array itself is freed here.
	 */
	kvfree(iter->batch);
}
359652d87d5fSYonghong Song
/* Tells the bpf_iter core how to build the seq_file for this target. */
static const struct bpf_iter_seq_info tcp_seq_info = {
	.seq_ops		= &bpf_iter_tcp_seq_ops,
	.init_seq_private	= bpf_iter_init_tcp,
	.fini_seq_private	= bpf_iter_fini_tcp,
	.seq_priv_size		= sizeof(struct bpf_tcp_iter_state),
};
360314fc6bd6SYonghong Song
36043cee6fb8SMartin KaFai Lau static const struct bpf_func_proto *
36053cee6fb8SMartin KaFai Lau bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
36063cee6fb8SMartin KaFai Lau const struct bpf_prog *prog)
36073cee6fb8SMartin KaFai Lau {
36083cee6fb8SMartin KaFai Lau switch (func_id) {
36093cee6fb8SMartin KaFai Lau case BPF_FUNC_setsockopt:
36103cee6fb8SMartin KaFai Lau return &bpf_sk_setsockopt_proto;
36113cee6fb8SMartin KaFai Lau case BPF_FUNC_getsockopt:
36123cee6fb8SMartin KaFai Lau return &bpf_sk_getsockopt_proto;
36133cee6fb8SMartin KaFai Lau default:
36143cee6fb8SMartin KaFai Lau return NULL;
36153cee6fb8SMartin KaFai Lau }
36163cee6fb8SMartin KaFai Lau }
36173cee6fb8SMartin KaFai Lau
/* Registration descriptor for the "tcp" bpf_iter target.  The single
 * context argument (bpf_iter__tcp.sk_common) is allowed to be NULL, as
 * encoded by PTR_TO_BTF_ID_OR_NULL; its BTF id is filled in at
 * registration time by bpf_iter_register().
 */
static struct bpf_iter_reg tcp_reg_info = {
	.target			= "tcp",
	.ctx_arg_info_size	= 1,
	.ctx_arg_info		= {
		{ offsetof(struct bpf_iter__tcp, sk_common),
		  PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
	},
	.get_func_proto		= bpf_iter_tcp_get_func_proto,
	.seq_info		= &tcp_seq_info,
};
362852d87d5fSYonghong Song
362952d87d5fSYonghong Song static void __init bpf_iter_register(void)
363052d87d5fSYonghong Song {
3631951cf368SYonghong Song tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
363252d87d5fSYonghong Song if (bpf_iter_reg_target(&tcp_reg_info))
363352d87d5fSYonghong Song pr_warn("Warning: could not register bpf iterator tcp\n");
363452d87d5fSYonghong Song }
363552d87d5fSYonghong Song
363652d87d5fSYonghong Song #endif
363752d87d5fSYonghong Song
/* Boot-time initialization for TCP over IPv4: creates one kernel control
 * socket per possible CPU, registers the per-netns init/exit hooks
 * (tcp_sk_ops) and, when configured, the BPF "tcp" iterator target.
 * Any failure here is fatal (panic).
 */
void __init tcp_v4_init(void)
{
	int cpu, res;

	for_each_possible_cpu(cpu) {
		struct sock *sk;

		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
					   IPPROTO_TCP, &init_net);
		if (res)
			panic("Failed to create the TCP control socket.\n");
		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

		/* Please enforce IP_DF and IPID==0 for RST and
		 * ACK sent in SYN-RECV and TIME-WAIT state.
		 */
		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;

		/* NOTE(review): presumably so timestamps derived from
		 * sk_clockid on this socket use the monotonic clock —
		 * confirm against the transmit path.
		 */
		sk->sk_clockid = CLOCK_MONOTONIC;

		/* Publish the fully initialized socket for this CPU. */
		per_cpu(ipv4_tcp_sk.sock, cpu) = sk;
	}
	if (register_pernet_subsys(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");

#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
	bpf_iter_register();
#endif
}
3667