xref: /linux/net/ipv4/tcp_ipv4.c (revision 9410645520e9b820069761f3450ef6661418e279)
12874c5fdSThomas Gleixner // SPDX-License-Identifier: GPL-2.0-or-later
21da177e4SLinus Torvalds /*
31da177e4SLinus Torvalds  * INET		An implementation of the TCP/IP protocol suite for the LINUX
41da177e4SLinus Torvalds  *		operating system.  INET is implemented using the  BSD Socket
51da177e4SLinus Torvalds  *		interface as the means of communication with the user level.
61da177e4SLinus Torvalds  *
71da177e4SLinus Torvalds  *		Implementation of the Transmission Control Protocol(TCP).
81da177e4SLinus Torvalds  *
91da177e4SLinus Torvalds  *		IPv4 specific functions
101da177e4SLinus Torvalds  *
111da177e4SLinus Torvalds  *		code split from:
121da177e4SLinus Torvalds  *		linux/ipv4/tcp.c
131da177e4SLinus Torvalds  *		linux/ipv4/tcp_input.c
141da177e4SLinus Torvalds  *		linux/ipv4/tcp_output.c
151da177e4SLinus Torvalds  *
161da177e4SLinus Torvalds  *		See tcp.c for author information
171da177e4SLinus Torvalds  */
181da177e4SLinus Torvalds 
191da177e4SLinus Torvalds /*
201da177e4SLinus Torvalds  * Changes:
211da177e4SLinus Torvalds  *		David S. Miller	:	New socket lookup architecture.
221da177e4SLinus Torvalds  *					This code is dedicated to John Dyson.
231da177e4SLinus Torvalds  *		David S. Miller :	Change semantics of established hash,
241da177e4SLinus Torvalds  *					half is devoted to TIME_WAIT sockets
251da177e4SLinus Torvalds  *					and the rest go in the other half.
261da177e4SLinus Torvalds  *		Andi Kleen :		Add support for syncookies and fixed
271da177e4SLinus Torvalds  *					some bugs: ip options weren't passed to
281da177e4SLinus Torvalds  *					the TCP layer, missed a check for an
291da177e4SLinus Torvalds  *					ACK bit.
301da177e4SLinus Torvalds  *		Andi Kleen :		Implemented fast path mtu discovery.
311da177e4SLinus Torvalds  *	     				Fixed many serious bugs in the
3260236fddSArnaldo Carvalho de Melo  *					request_sock handling and moved
331da177e4SLinus Torvalds  *					most of it into the af independent code.
341da177e4SLinus Torvalds  *					Added tail drop and some other bugfixes.
35caa20d9aSStephen Hemminger  *					Added new listen semantics.
361da177e4SLinus Torvalds  *		Mike McLagan	:	Routing by source
371da177e4SLinus Torvalds  *	Juan Jose Ciarlante:		ip_dynaddr bits
381da177e4SLinus Torvalds  *		Andi Kleen:		various fixes.
391da177e4SLinus Torvalds  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
401da177e4SLinus Torvalds  *					coma.
411da177e4SLinus Torvalds  *	Andi Kleen		:	Fix new listen.
421da177e4SLinus Torvalds  *	Andi Kleen		:	Fix accept error reporting.
431da177e4SLinus Torvalds  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
441da177e4SLinus Torvalds  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
451da177e4SLinus Torvalds  *					a single port at the same time.
461da177e4SLinus Torvalds  */
471da177e4SLinus Torvalds 
48afd46503SJoe Perches #define pr_fmt(fmt) "TCP: " fmt
491da177e4SLinus Torvalds 
50eb4dea58SHerbert Xu #include <linux/bottom_half.h>
511da177e4SLinus Torvalds #include <linux/types.h>
521da177e4SLinus Torvalds #include <linux/fcntl.h>
531da177e4SLinus Torvalds #include <linux/module.h>
541da177e4SLinus Torvalds #include <linux/random.h>
551da177e4SLinus Torvalds #include <linux/cache.h>
561da177e4SLinus Torvalds #include <linux/jhash.h>
571da177e4SLinus Torvalds #include <linux/init.h>
581da177e4SLinus Torvalds #include <linux/times.h>
595a0e3ad6STejun Heo #include <linux/slab.h>
609f4a7c93SJian Wen #include <linux/sched.h>
611da177e4SLinus Torvalds 
62457c4cbcSEric W. Biederman #include <net/net_namespace.h>
631da177e4SLinus Torvalds #include <net/icmp.h>
64304a1618SArnaldo Carvalho de Melo #include <net/inet_hashtables.h>
651da177e4SLinus Torvalds #include <net/tcp.h>
6620380731SArnaldo Carvalho de Melo #include <net/transp_v6.h>
671da177e4SLinus Torvalds #include <net/ipv6.h>
681da177e4SLinus Torvalds #include <net/inet_common.h>
696d6ee43eSArnaldo Carvalho de Melo #include <net/timewait_sock.h>
701da177e4SLinus Torvalds #include <net/xfrm.h>
716e5714eaSDavid S. Miller #include <net/secure_seq.h>
72076bb0c8SEliezer Tamir #include <net/busy_poll.h>
736be49deaSJason Xing #include <net/rstreason.h>
741da177e4SLinus Torvalds 
751da177e4SLinus Torvalds #include <linux/inet.h>
761da177e4SLinus Torvalds #include <linux/ipv6.h>
771da177e4SLinus Torvalds #include <linux/stddef.h>
781da177e4SLinus Torvalds #include <linux/proc_fs.h>
791da177e4SLinus Torvalds #include <linux/seq_file.h>
806797318eSIvan Delalande #include <linux/inetdevice.h>
81951cf368SYonghong Song #include <linux/btf_ids.h>
82*8f0b3cc9SMina Almasry #include <linux/skbuff_ref.h>
831da177e4SLinus Torvalds 
84cf80e0e4SHerbert Xu #include <crypto/hash.h>
85cfb6eeb4SYOSHIFUJI Hideaki #include <linux/scatterlist.h>
86cfb6eeb4SYOSHIFUJI Hideaki 
87c24b14c4SSong Liu #include <trace/events/tcp.h>
88c24b14c4SSong Liu 
89cfb6eeb4SYOSHIFUJI Hideaki #ifdef CONFIG_TCP_MD5SIG
90a915da9bSEric Dumazet static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
91318cf7aaSEric Dumazet 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
92cfb6eeb4SYOSHIFUJI Hideaki #endif
93cfb6eeb4SYOSHIFUJI Hideaki 
945caea4eaSEric Dumazet struct inet_hashinfo tcp_hashinfo;
954bc2f18bSEric Dumazet EXPORT_SYMBOL(tcp_hashinfo);
961da177e4SLinus Torvalds 
97ebad6d03SSebastian Andrzej Siewior static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = {
98ebad6d03SSebastian Andrzej Siewior 	.bh_lock = INIT_LOCAL_LOCK(bh_lock),
99ebad6d03SSebastian Andrzej Siewior };
10037ba017dSEric Dumazet 
101565d121bSFlorian Westphal static DEFINE_MUTEX(tcp_exit_batch_mutex);
102565d121bSFlorian Westphal 
tcp_v4_init_seq(const struct sk_buff * skb)10384b114b9SEric Dumazet static u32 tcp_v4_init_seq(const struct sk_buff *skb)
1041da177e4SLinus Torvalds {
10584b114b9SEric Dumazet 	return secure_tcp_seq(ip_hdr(skb)->daddr,
106eddc9ec5SArnaldo Carvalho de Melo 			      ip_hdr(skb)->saddr,
107aa8223c7SArnaldo Carvalho de Melo 			      tcp_hdr(skb)->dest,
10884b114b9SEric Dumazet 			      tcp_hdr(skb)->source);
10984b114b9SEric Dumazet }
11084b114b9SEric Dumazet 
tcp_v4_init_ts_off(const struct net * net,const struct sk_buff * skb)1115d2ed052SEric Dumazet static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
11284b114b9SEric Dumazet {
1135d2ed052SEric Dumazet 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
1141da177e4SLinus Torvalds }
1151da177e4SLinus Torvalds 
/* tcp_twsk_unique - decide whether a TIME-WAIT socket may be displaced/reused
 * @sk:   the new socket attempting to bind the same 4-tuple
 * @sktw: the existing TIME-WAIT socket occupying that 4-tuple
 * @twp:  opaque cookie from the caller; when NULL, reuse is allowed purely
 *        on the strength of a recorded timestamp (see condition below)
 *
 * Returns 1 if reuse is permitted - in that case a reference on @sktw has
 * been taken via refcount_inc_not_zero() and, unless the socket is under
 * TCP repair, @sk's write_seq and timestamp state have been seeded from
 * the TIME-WAIT socket.  Returns 0 otherwise.
 */
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
	const struct inet_timewait_sock *tw = inet_twsk(sktw);
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);
	int ts_recent_stamp;

	/* A FIN_WAIT2 occupant is not a real TIME-WAIT yet; never reuse it. */
	if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2)
		reuse = 0;

	if (reuse == 2) {
		/* reuse == 2 means "loopback only".
		 * Still does not detect *everything* that goes through
		 * lo, since we require a loopback src or dst address
		 * or direct binding to 'lo' interface.
		 */
		bool loopback = false;
		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
			loopback = true;
#if IS_ENABLED(CONFIG_IPV6)
		if (tw->tw_family == AF_INET6) {
			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
				loopback = true;
		} else
#endif
		{
			if (ipv4_is_loopback(tw->tw_daddr) ||
			    ipv4_is_loopback(tw->tw_rcv_saddr))
				loopback = true;
		}
		if (!loopback)
			reuse = 0;
	}

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp);
	if (ts_recent_stamp &&
	    (!twp || (reuse && time_after32(ktime_get_seconds(),
					    ts_recent_stamp)))) {
		/* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk
		 * and releasing the bucket lock.
		 */
		if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
			return 0;

		/* In case of repair and re-using TIME-WAIT sockets we still
		 * want to be sure that it is safe as above but honor the
		 * sequence numbers and time stamps set as part of the repair
		 * process.
		 *
		 * Without this check re-using a TIME-WAIT socket with TCP
		 * repair would accumulate a -1 on the repair assigned
		 * sequence number. The first time it is reused the sequence
		 * is -1, the second time -2, etc. This fixes that issue
		 * without appearing to create any others.
		 */
		if (likely(!tp->repair)) {
			/* +65535+2 keeps the new ISN clear of the old send
			 * window; 0 is reserved, so bump to 1 on wrap.
			 */
			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;

			if (!seq)
				seq = 1;
			WRITE_ONCE(tp->write_seq, seq);
			tp->rx_opt.ts_recent	   = READ_ONCE(tcptw->tw_ts_recent);
			tp->rx_opt.ts_recent_stamp = ts_recent_stamp;
		}

		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);
2016d6ee43eSArnaldo Carvalho de Melo 
/* Pre-connect hook: validate the user address length, then give the
 * attached cgroup BPF INET4_CONNECT program a chance to inspect or
 * rewrite the destination before tcp_v4_connect() proper runs.
 */
static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
			      int addr_len)
{
	/* This check is replicated from tcp_v4_connect() and intended to
	 * prevent BPF program called below from accessing bytes that are out
	 * of the bound specified by user in addr_len.
	 */
	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	/* Lockdep-style assertion: caller must hold the socket lock. */
	sock_owned_by_me(sk);

	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
}
216d74bad4eSAndrey Ignatov 
/* tcp_v4_connect - initiate an outgoing IPv4 TCP connection
 * @sk:       the socket (lock held by caller, see lockdep_sock_is_held())
 * @uaddr:    destination address, must be a struct sockaddr_in
 * @addr_len: length of @uaddr as supplied by the user
 *
 * Routes to the destination, binds a source address/port, moves the
 * socket to SYN-SENT, hashes it, then sends the SYN via tcp_connect()
 * (or defers it for TCP Fast Open).  On any failure after the socket
 * was hashed, the failure path unwinds state back to TCP_CLOSE.
 * Returns 0 on success or a negative errno.
 */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_timewait_death_row *tcp_death_row;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct ip_options_rcu *inet_opt;
	struct net *net = sock_net(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	if (inet_opt && inet_opt->opt.srr) {
		/* Strict source routing: route via the first hop, not the
		 * final destination.
		 */
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
			      orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	/* TCP never connects to multicast/broadcast destinations. */
	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

	if (!inet->inet_saddr) {
		/* Not bound to a source address yet: adopt the one the
		 * route chose and move the socket in the bhash2 table.
		 */
		err = inet_bhash2_update_saddr(sk,  &fl4->saddr, AF_INET);
		if (err) {
			ip_rt_put(rt);
			return err;
		}
	} else {
		sk_rcv_saddr_set(sk, inet->inet_saddr);
	}

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			WRITE_ONCE(tp->write_seq, 0);
	}

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and not releasing socket
	 * lock select source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	/* Source port may have been assigned above: re-derive the route
	 * with the final port pair.
	 */
	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst);
	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	rt = NULL;	/* ownership transferred to the socket's dst cache */

	if (likely(!tp->repair)) {
		if (!tp->write_seq)
			WRITE_ONCE(tp->write_seq,
				   secure_tcp_seq(inet->inet_saddr,
						  inet->inet_daddr,
						  inet->inet_sport,
						  usin->sin_port));
		WRITE_ONCE(tp->tsoffset,
			   secure_tcp_ts_off(net, inet->inet_saddr,
					     inet->inet_daddr));
	}

	atomic_set(&inet->inet_id, get_random_u16());

	/* TCP Fast Open may defer the actual SYN until sendmsg(). */
	if (tcp_fastopen_defer_connect(sk, &err))
		return err;
	if (err)
		goto failure;

	err = tcp_connect(sk);

	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	inet_bhash2_reset_saddr(sk);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);
3611da177e4SLinus Torvalds 
3621da177e4SLinus Torvalds /*
363563d34d0SEric Dumazet  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
364563d34d0SEric Dumazet  * It can be called through tcp_release_cb() if socket was owned by user
365563d34d0SEric Dumazet  * at the time tcp_v4_err() was called to handle ICMP message.
3661da177e4SLinus Torvalds  */
/* React to an ICMP_FRAG_NEEDED indication (RFC 1191 PMTU discovery).
 * Reads the new MTU from tp->mtu_info (stored by tcp_v4_err(), hence
 * the READ_ONCE pairing), updates the cached route, and retransmits
 * immediately if our MSS had to shrink.  May run from tcp_release_cb()
 * when the ICMP arrived while the socket was owned by the user.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct dst_entry *dst;
	u32 mtu;

	/* Nothing to do for listeners or closed sockets. */
	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
		return;
	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to be wrong... Remember soft error
	 * for the case, if this connection will not able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);
4021da177e4SLinus Torvalds 
do_redirect(struct sk_buff * skb,struct sock * sk)40355be7a9cSDavid S. Miller static void do_redirect(struct sk_buff *skb, struct sock *sk)
40455be7a9cSDavid S. Miller {
40555be7a9cSDavid S. Miller 	struct dst_entry *dst = __sk_dst_check(sk, 0);
40655be7a9cSDavid S. Miller 
4071ed5c48fSDavid S. Miller 	if (dst)
4086700c270SDavid S. Miller 		dst->ops->redirect(dst, sk, skb);
40955be7a9cSDavid S. Miller }
41055be7a9cSDavid S. Miller 
41126e37360SEric Dumazet 
/* Handle ICMP errors against TCP_NEW_SYN_RECV request sockets.
 * @seq must match the SYN-ACK's initial sequence number or the ICMP is
 * counted as out-of-window and ignored; @abort selects whether matching
 * errors silently drop the request.  Consumes the caller's reference on
 * @sk (reqsk_put() on every path).
 */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);

	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	 */
	if (seq != tcp_rsk(req)->snt_isn) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
	} else if (abort) {
		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		tcp_listendrop(req->rsk_listener);
	}
	reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);
43626e37360SEric Dumazet 
/* TCP-LD (RFC 6069) logic: an ICMP unreachable hinting at a transient
 * link outage lets us undo one RTO backoff step for the oldest
 * unacked segment, so recovery after the outage is faster.
 * @seq must equal snd_una and a backed-off retransmission must be
 * pending; no-op if the socket is owned by user context.
 */
void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	s32 remaining;
	u32 delta_us;

	if (sock_owned_by_user(sk))
		return;

	if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
	    !icsk->icsk_backoff)
		return;

	skb = tcp_rtx_queue_head(sk);
	if (WARN_ON_ONCE(!skb))
		return;

	/* Undo one exponential-backoff step and recompute the RTO. */
	icsk->icsk_backoff--;
	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

	/* Re-arm the timer with whatever part of the reverted RTO is left,
	 * measured from the head skb's original transmit timestamp.
	 */
	tcp_mstamp_refresh(tp);
	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);

	if (remaining > 0) {
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
					  remaining, TCP_RTO_MAX);
	} else {
		/* RTO revert clocked out retransmission.
		 * Will retransmit now.
		 */
		tcp_retransmit_timer(sk);
	}
}
EXPORT_SYMBOL(tcp_ld_RTO_revert);
476f7456642SEric Dumazet 
4771da177e4SLinus Torvalds /*
4781da177e4SLinus Torvalds  * This routine is called by the ICMP module when it gets some
4791da177e4SLinus Torvalds  * sort of error condition.  If err < 0 then the socket should
4801da177e4SLinus Torvalds  * be closed and the error returned to the user.  If err > 0
4811da177e4SLinus Torvalds  * it's just the icmp type << 8 | icmp code.  After adjustment
4821da177e4SLinus Torvalds  * header points to the first 8 bytes of the tcp header.  We need
4831da177e4SLinus Torvalds  * to find the appropriate port.
4841da177e4SLinus Torvalds  *
4851da177e4SLinus Torvalds  * The locking strategy used here is very "optimistic". When
4861da177e4SLinus Torvalds  * someone else accesses the socket the ICMP is just dropped
4871da177e4SLinus Torvalds  * and for some paths there is no check at all.
4881da177e4SLinus Torvalds  * A more general error queue to queue errors for later handling
4891da177e4SLinus Torvalds  * is probably better.
4901da177e4SLinus Torvalds  *
4911da177e4SLinus Torvalds  */
4921da177e4SLinus Torvalds 
tcp_v4_err(struct sk_buff * skb,u32 info)493a12daf13SEric Dumazet int tcp_v4_err(struct sk_buff *skb, u32 info)
4941da177e4SLinus Torvalds {
495a12daf13SEric Dumazet 	const struct iphdr *iph = (const struct iphdr *)skb->data;
496a12daf13SEric Dumazet 	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
4971da177e4SLinus Torvalds 	struct tcp_sock *tp;
498a12daf13SEric Dumazet 	const int type = icmp_hdr(skb)->type;
499a12daf13SEric Dumazet 	const int code = icmp_hdr(skb)->code;
5001da177e4SLinus Torvalds 	struct sock *sk;
5010a672f74SYuchung Cheng 	struct request_sock *fastopen;
5029a568de4SEric Dumazet 	u32 seq, snd_una;
5031da177e4SLinus Torvalds 	int err;
504a12daf13SEric Dumazet 	struct net *net = dev_net(skb->dev);
5051da177e4SLinus Torvalds 
5064461568aSKuniyuki Iwashima 	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
5074461568aSKuniyuki Iwashima 				       iph->daddr, th->dest, iph->saddr,
5084461568aSKuniyuki Iwashima 				       ntohs(th->source), inet_iif(skb), 0);
5091da177e4SLinus Torvalds 	if (!sk) {
5105d3848bcSEric Dumazet 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
51132bbd879SStefano Brivio 		return -ENOENT;
5121da177e4SLinus Torvalds 	}
5131da177e4SLinus Torvalds 	if (sk->sk_state == TCP_TIME_WAIT) {
514953af8e3SDmitry Safonov 		/* To increase the counter of ignored icmps for TCP-AO */
515953af8e3SDmitry Safonov 		tcp_ao_ignore_icmp(sk, AF_INET, type, code);
5169469c7b4SYOSHIFUJI Hideaki 		inet_twsk_put(inet_twsk(sk));
51732bbd879SStefano Brivio 		return 0;
5181da177e4SLinus Torvalds 	}
51926e37360SEric Dumazet 	seq = ntohl(th->seq);
52032bbd879SStefano Brivio 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
52132bbd879SStefano Brivio 		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
5229cf74903SEric Dumazet 				     type == ICMP_TIME_EXCEEDED ||
5239cf74903SEric Dumazet 				     (type == ICMP_DEST_UNREACH &&
5249cf74903SEric Dumazet 				      (code == ICMP_NET_UNREACH ||
5259cf74903SEric Dumazet 				       code == ICMP_HOST_UNREACH)));
52632bbd879SStefano Brivio 		return 0;
52732bbd879SStefano Brivio 	}
5281da177e4SLinus Torvalds 
529953af8e3SDmitry Safonov 	if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) {
530953af8e3SDmitry Safonov 		sock_put(sk);
531953af8e3SDmitry Safonov 		return 0;
532953af8e3SDmitry Safonov 	}
533953af8e3SDmitry Safonov 
5341da177e4SLinus Torvalds 	bh_lock_sock(sk);
5351da177e4SLinus Torvalds 	/* If too many ICMPs get dropped on busy
5361da177e4SLinus Torvalds 	 * servers this needs to be solved differently.
537563d34d0SEric Dumazet 	 * We do take care of PMTU discovery (RFC1191) special case :
538563d34d0SEric Dumazet 	 * we can receive locally generated ICMP messages while socket is held.
5391da177e4SLinus Torvalds 	 */
540b74aa930SEric Dumazet 	if (sock_owned_by_user(sk)) {
541b74aa930SEric Dumazet 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
54202a1d6e7SEric Dumazet 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
543b74aa930SEric Dumazet 	}
5441da177e4SLinus Torvalds 	if (sk->sk_state == TCP_CLOSE)
5451da177e4SLinus Torvalds 		goto out;
5461da177e4SLinus Torvalds 
547020e71a3SEric Dumazet 	if (static_branch_unlikely(&ip4_min_ttl)) {
54814834c4fSEric Dumazet 		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
54914834c4fSEric Dumazet 		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
55002a1d6e7SEric Dumazet 			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
55197e3ecd1Sstephen hemminger 			goto out;
55297e3ecd1Sstephen hemminger 		}
553020e71a3SEric Dumazet 	}
55497e3ecd1Sstephen hemminger 
5551da177e4SLinus Torvalds 	tp = tcp_sk(sk);
5560a672f74SYuchung Cheng 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
557d983ea6fSEric Dumazet 	fastopen = rcu_dereference(tp->fastopen_rsk);
5580a672f74SYuchung Cheng 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
5591da177e4SLinus Torvalds 	if (sk->sk_state != TCP_LISTEN &&
5600a672f74SYuchung Cheng 	    !between(seq, snd_una, tp->snd_nxt)) {
56102a1d6e7SEric Dumazet 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
5621da177e4SLinus Torvalds 		goto out;
5631da177e4SLinus Torvalds 	}
5641da177e4SLinus Torvalds 
5651da177e4SLinus Torvalds 	switch (type) {
56655be7a9cSDavid S. Miller 	case ICMP_REDIRECT:
56745caeaa5SJon Maxwell 		if (!sock_owned_by_user(sk))
568a12daf13SEric Dumazet 			do_redirect(skb, sk);
56955be7a9cSDavid S. Miller 		goto out;
5701da177e4SLinus Torvalds 	case ICMP_SOURCE_QUENCH:
5711da177e4SLinus Torvalds 		/* Just silently ignore these. */
5721da177e4SLinus Torvalds 		goto out;
5731da177e4SLinus Torvalds 	case ICMP_PARAMETERPROB:
5741da177e4SLinus Torvalds 		err = EPROTO;
5751da177e4SLinus Torvalds 		break;
5761da177e4SLinus Torvalds 	case ICMP_DEST_UNREACH:
5771da177e4SLinus Torvalds 		if (code > NR_ICMP_UNREACH)
5781da177e4SLinus Torvalds 			goto out;
5791da177e4SLinus Torvalds 
5801da177e4SLinus Torvalds 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
5810d4f0608SEric Dumazet 			/* We are not interested in TCP_LISTEN and open_requests
5820d4f0608SEric Dumazet 			 * (SYN-ACKs send out by Linux are always <576bytes so
5830d4f0608SEric Dumazet 			 * they should go through unfragmented).
5840d4f0608SEric Dumazet 			 */
5850d4f0608SEric Dumazet 			if (sk->sk_state == TCP_LISTEN)
5860d4f0608SEric Dumazet 				goto out;
5870d4f0608SEric Dumazet 
588561022acSEric Dumazet 			WRITE_ONCE(tp->mtu_info, info);
589144d56e9SEric Dumazet 			if (!sock_owned_by_user(sk)) {
590563d34d0SEric Dumazet 				tcp_v4_mtu_reduced(sk);
591144d56e9SEric Dumazet 			} else {
5927aa5470cSEric Dumazet 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
593144d56e9SEric Dumazet 					sock_hold(sk);
594144d56e9SEric Dumazet 			}
5951da177e4SLinus Torvalds 			goto out;
5961da177e4SLinus Torvalds 		}
5971da177e4SLinus Torvalds 
5981da177e4SLinus Torvalds 		err = icmp_err_convert[code].errno;
599f7456642SEric Dumazet 		/* check if this ICMP message allows revert of backoff.
600f7456642SEric Dumazet 		 * (see RFC 6069)
601f7456642SEric Dumazet 		 */
602f7456642SEric Dumazet 		if (!fastopen &&
603f7456642SEric Dumazet 		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
604f7456642SEric Dumazet 			tcp_ld_RTO_revert(sk, seq);
6051da177e4SLinus Torvalds 		break;
6061da177e4SLinus Torvalds 	case ICMP_TIME_EXCEEDED:
6071da177e4SLinus Torvalds 		err = EHOSTUNREACH;
6081da177e4SLinus Torvalds 		break;
6091da177e4SLinus Torvalds 	default:
6101da177e4SLinus Torvalds 		goto out;
6111da177e4SLinus Torvalds 	}
6121da177e4SLinus Torvalds 
6131da177e4SLinus Torvalds 	switch (sk->sk_state) {
6141da177e4SLinus Torvalds 	case TCP_SYN_SENT:
6150a672f74SYuchung Cheng 	case TCP_SYN_RECV:
6160a672f74SYuchung Cheng 		/* Only in fast or simultaneous open. If a fast open socket is
6172bdcc73cSRandy Dunlap 		 * already accepted it is treated as a connected one below.
6181da177e4SLinus Torvalds 		 */
61951456b29SIan Morris 		if (fastopen && !fastopen->sk)
6200a672f74SYuchung Cheng 			break;
6210a672f74SYuchung Cheng 
622a12daf13SEric Dumazet 		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
62345af29caSEric Dumazet 
624fde6f897SEric Dumazet 		if (!sock_owned_by_user(sk))
625fde6f897SEric Dumazet 			tcp_done_with_error(sk, err);
626fde6f897SEric Dumazet 		else
627cee1af82SEric Dumazet 			WRITE_ONCE(sk->sk_err_soft, err);
6281da177e4SLinus Torvalds 		goto out;
6291da177e4SLinus Torvalds 	}
6301da177e4SLinus Torvalds 
6311da177e4SLinus Torvalds 	/* If we've already connected we will keep trying
6321da177e4SLinus Torvalds 	 * until we time out, or the user gives up.
6331da177e4SLinus Torvalds 	 *
6341da177e4SLinus Torvalds 	 * rfc1122 4.2.3.9 allows to consider as hard errors
6351da177e4SLinus Torvalds 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
6361da177e4SLinus Torvalds 	 * but it is obsoleted by pmtu discovery).
6371da177e4SLinus Torvalds 	 *
6381da177e4SLinus Torvalds 	 * Note, that in modern internet, where routing is unreliable
6391da177e4SLinus Torvalds 	 * and in each dark corner broken firewalls sit, sending random
6401da177e4SLinus Torvalds 	 * errors ordered by their masters even this two messages finally lose
6411da177e4SLinus Torvalds 	 * their original sense (even Linux sends invalid PORT_UNREACHs)
6421da177e4SLinus Torvalds 	 *
6431da177e4SLinus Torvalds 	 * Now we are in compliance with RFCs.
6441da177e4SLinus Torvalds 	 *							--ANK (980905)
6451da177e4SLinus Torvalds 	 */
6461da177e4SLinus Torvalds 
6476b5f43eaSEric Dumazet 	if (!sock_owned_by_user(sk) &&
6486b5f43eaSEric Dumazet 	    inet_test_bit(RECVERR, sk)) {
649e13ec3daSEric Dumazet 		WRITE_ONCE(sk->sk_err, err);
650e3ae2365SAlexander Aring 		sk_error_report(sk);
6511da177e4SLinus Torvalds 	} else	{ /* Only an error on timeout */
652cee1af82SEric Dumazet 		WRITE_ONCE(sk->sk_err_soft, err);
6531da177e4SLinus Torvalds 	}
6541da177e4SLinus Torvalds 
6551da177e4SLinus Torvalds out:
6561da177e4SLinus Torvalds 	bh_unlock_sock(sk);
6571da177e4SLinus Torvalds 	sock_put(sk);
65832bbd879SStefano Brivio 	return 0;
6591da177e4SLinus Torvalds }
6601da177e4SLinus Torvalds 
/* Record the IPv4 TCP pseudo-header checksum for @skb and point the
 * skb's csum_start/csum_offset pair at the TCP checksum field, using
 * the explicit @saddr/@daddr addresses.
 */
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *tcph = tcp_hdr(skb);

	/* Independent of the pseudo-header sum: describe where the
	 * final checksum has to be folded in.
	 */
	skb->csum_start = skb_transport_header(skb) - skb->head;
	skb->csum_offset = offsetof(struct tcphdr, check);
	tcph->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
}
6691da177e4SLinus Torvalds 
670419f9f89SHerbert Xu /* This routine computes an IPv4 TCP checksum. */
tcp_v4_send_check(struct sock * sk,struct sk_buff * skb)671bb296246SHerbert Xu void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
672419f9f89SHerbert Xu {
673cf533ea5SEric Dumazet 	const struct inet_sock *inet = inet_sk(sk);
674419f9f89SHerbert Xu 
675419f9f89SHerbert Xu 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
676419f9f89SHerbert Xu }
6774bc2f18bSEric Dumazet EXPORT_SYMBOL(tcp_v4_send_check);
678419f9f89SHerbert Xu 
679ba7783adSDmitry Safonov #define REPLY_OPTIONS_LEN      (MAX_TCP_OPTION_SPACE / sizeof(__be32))
680ba7783adSDmitry Safonov 
tcp_v4_ao_sign_reset(const struct sock * sk,struct sk_buff * skb,const struct tcp_ao_hdr * aoh,struct ip_reply_arg * arg,struct tcphdr * reply,__be32 reply_options[REPLY_OPTIONS_LEN])681ba7783adSDmitry Safonov static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb,
682ba7783adSDmitry Safonov 				 const struct tcp_ao_hdr *aoh,
683ba7783adSDmitry Safonov 				 struct ip_reply_arg *arg, struct tcphdr *reply,
684ba7783adSDmitry Safonov 				 __be32 reply_options[REPLY_OPTIONS_LEN])
685ba7783adSDmitry Safonov {
686ba7783adSDmitry Safonov #ifdef CONFIG_TCP_AO
687ba7783adSDmitry Safonov 	int sdif = tcp_v4_sdif(skb);
688ba7783adSDmitry Safonov 	int dif = inet_iif(skb);
689ba7783adSDmitry Safonov 	int l3index = sdif ? dif : 0;
690ba7783adSDmitry Safonov 	bool allocated_traffic_key;
691ba7783adSDmitry Safonov 	struct tcp_ao_key *key;
692ba7783adSDmitry Safonov 	char *traffic_key;
693ba7783adSDmitry Safonov 	bool drop = true;
694ba7783adSDmitry Safonov 	u32 ao_sne = 0;
695ba7783adSDmitry Safonov 	u8 keyid;
696ba7783adSDmitry Safonov 
697ba7783adSDmitry Safonov 	rcu_read_lock();
69864382c71SDmitry Safonov 	if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq),
699ba7783adSDmitry Safonov 				 &key, &traffic_key, &allocated_traffic_key,
700ba7783adSDmitry Safonov 				 &keyid, &ao_sne))
701ba7783adSDmitry Safonov 		goto out;
702ba7783adSDmitry Safonov 
703ba7783adSDmitry Safonov 	reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) |
704ba7783adSDmitry Safonov 				 (aoh->rnext_keyid << 8) | keyid);
705da7dfaa6SDmitry Safonov 	arg->iov[0].iov_len += tcp_ao_len_aligned(key);
706ba7783adSDmitry Safonov 	reply->doff = arg->iov[0].iov_len / 4;
707ba7783adSDmitry Safonov 
708ba7783adSDmitry Safonov 	if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1],
709ba7783adSDmitry Safonov 			    key, traffic_key,
710ba7783adSDmitry Safonov 			    (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
711ba7783adSDmitry Safonov 			    (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
712ba7783adSDmitry Safonov 			    reply, ao_sne))
713ba7783adSDmitry Safonov 		goto out;
714ba7783adSDmitry Safonov 	drop = false;
715ba7783adSDmitry Safonov out:
716ba7783adSDmitry Safonov 	rcu_read_unlock();
717ba7783adSDmitry Safonov 	if (allocated_traffic_key)
718ba7783adSDmitry Safonov 		kfree(traffic_key);
719ba7783adSDmitry Safonov 	return drop;
720ba7783adSDmitry Safonov #else
721ba7783adSDmitry Safonov 	return true;
722ba7783adSDmitry Safonov #endif
723ba7783adSDmitry Safonov }
724ba7783adSDmitry Safonov 
7251da177e4SLinus Torvalds /*
7261da177e4SLinus Torvalds  *	This routine will send an RST to the other tcp.
7271da177e4SLinus Torvalds  *
7281da177e4SLinus Torvalds  *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
7291da177e4SLinus Torvalds  *		      for reset.
7301da177e4SLinus Torvalds  *	Answer: if a packet caused RST, it is not for a socket
7311da177e4SLinus Torvalds  *		existing in our system, if it is matched to a socket,
7321da177e4SLinus Torvalds  *		it is just duplicate segment or bug in other side's TCP.
 *		So we build the reply based only on the parameters
 *		that arrived with the segment.
7351da177e4SLinus Torvalds  *	Exception: precedence violation. We do not implement it in any case.
7361da177e4SLinus Torvalds  */
7371da177e4SLinus Torvalds 
/* Build and send a RST in reply to @skb.
 *
 * @sk may be NULL (no socket matched the segment); when non-NULL it may
 * be a full, timewait or request socket.  Per the comment above, the
 * reply is built purely from the fields of the incoming segment.  If the
 * segment carried a TCP-AO or MD5 option, the RST is signed with a
 * matching key or suppressed when none is found.
 */
static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
			      enum sk_rst_reason reason)
{
	const struct tcphdr *th = tcp_hdr(skb);
	/* On-stack reply: bare TCP header plus room for MD5/AO/MPTCP options */
	struct {
		struct tcphdr th;
		__be32 opt[REPLY_OPTIONS_LEN];
	} rep;
	const __u8 *md5_hash_location = NULL;
	const struct tcp_ao_hdr *aoh;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	unsigned char newhash[16];
	struct sock *sk1 = NULL;
	int genhash;
#endif
	u64 transmit_time = 0;
	struct sock *ctl_sk;
	struct net *net;
	u32 txhash = 0;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	/* If the offending segment had an ACK, echo it as our SEQ;
	 * otherwise SEQ stays 0 (from the memset above) and we ACK
	 * everything the segment occupied (SYN/FIN count as one each).
	 */
	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);

	/* Invalid TCP option size or twice included auth */
	if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh))
		return;

	/* AO-bearing segment: sign the RST, or give up when we can't */
	if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt))
		return;

#ifdef CONFIG_TCP_MD5SIG
	rcu_read_lock();
	if (sk && sk_fullsock(sk)) {
		const union tcp_md5_addr *addr;
		int l3index;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and inet_iif is set to it.
		 */
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	} else if (md5_hash_location) {
		const union tcp_md5_addr *addr;
		int sdif = tcp_v4_sdif(skb);
		int dif = inet_iif(skb);
		int l3index;

		/*
		 * active side is lost. Try to find listening socket through
		 * source port, and then find md5 key through listening socket.
		 * we do not weaken security here:
		 * Incoming packet is checked with md5 hash with finding key,
		 * no RST generated if md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
					     NULL, 0, ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), dif, sdif);
		/* don't send rst if it can't find key */
		if (!sk1)
			goto out;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and dif is set to it.
		 */
		l3index = sdif ? dif : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
		if (!key)
			goto out;


		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (genhash || memcmp(md5_hash_location, newhash, 16) != 0)
			goto out;

	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				     key, ip_hdr(skb)->saddr,
				     ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
	if (rep.opt[0] == 0) {
		__be32 mrst = mptcp_reset_option(skb);

		if (mrst) {
			rep.opt[0] = mrst;
			arg.iov[0].iov_len += sizeof(mrst);
			rep.th.doff = arg.iov[0].iov_len / 4;
		}
	}

	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
	if (sk)
		arg.bound_dev_if = sk->sk_bound_dev_if;

	trace_tcp_send_reset(sk, skb, reason);

	/* Allows reading tw_bound_dev_if through sk_bound_dev_if below */
	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

	arg.tos = ip_hdr(skb)->tos;
	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
	/* Transmit via the per-CPU control socket, serialized by its
	 * per-CPU bh lock with BHs disabled.
	 */
	local_bh_disable();
	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);

	sock_net_set(ctl_sk, net);
	if (sk) {
		/* Inherit mark/priority/txhash from the (possibly
		 * timewait) socket that triggered the reset.
		 */
		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_mark : sk->sk_mark;
		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
		transmit_time = tcp_transmit_time(sk);
		xfrm_sk_clone_policy(ctl_sk, sk);
		txhash = (sk->sk_state == TCP_TIME_WAIT) ?
			 inet_twsk(sk)->tw_txhash : sk->sk_txhash;
	} else {
		ctl_sk->sk_mark = 0;
		ctl_sk->sk_priority = 0;
	}
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time, txhash);

	xfrm_sk_free_policy(ctl_sk);
	sock_net_set(ctl_sk, &init_net);
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
	local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
	rcu_read_unlock();
#endif
}
9281da177e4SLinus Torvalds 
/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
   outside of socket context, is certainly ugly. What can I do?
9311da177e4SLinus Torvalds  */
9321da177e4SLinus Torvalds 
/* Build and send a bare ACK in reply to @skb, without a full socket
 * context: @sk is a timewait socket (tcp_v4_timewait_ack) or a listener /
 * fast-open socket (tcp_v4_reqsk_send_ack).  SEQ/ACK/window and the
 * optional timestamp values are supplied by the caller; @key selects
 * MD5 or TCP-AO signing (or none).
 */
static void tcp_v4_send_ack(const struct sock *sk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_key *key,
			    int reply_flags, u8 tos, u32 txhash)
{
	const struct tcphdr *th = tcp_hdr(skb);
	/* On-stack reply: TCP header plus full option space */
	struct {
		struct tcphdr th;
		__be32 opt[(MAX_TCP_OPTION_SPACE  >> 2)];
	} rep;
	struct net *net = sock_net(sk);
	struct ip_reply_arg arg;
	struct sock *ctl_sk;
	u64 transmit_time;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	/* Only echo a timestamp option when we have a tsecr to send */
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (tcp_key_is_md5(key)) {
		/* Place the MD5 option after the timestamp, if present */
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key->md5_key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
#ifdef CONFIG_TCP_AO
	if (tcp_key_is_ao(key)) {
		/* Place the AO option after the timestamp, if present */
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_AO << 24) |
					  (tcp_ao_len(key->ao_key) << 16) |
					  (key->ao_key->sndid << 8) |
					  key->rcv_next);
		arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key);
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset],
				key->ao_key, key->traffic_key,
				(union tcp_ao_addr *)&ip_hdr(skb)->saddr,
				(union tcp_ao_addr *)&ip_hdr(skb)->daddr,
				&rep.th, key->sne);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
	/* Transmit via the per-CPU control socket, serialized by its
	 * per-CPU bh lock with BHs disabled.
	 */
	local_bh_disable();
	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
	sock_net_set(ctl_sk, net);
	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
	transmit_time = tcp_transmit_time(sk);
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time, txhash);

	sock_net_set(ctl_sk, &init_net);
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
	local_bh_enable();
}
10351da177e4SLinus Torvalds 
/* ACK @skb on behalf of a TIME_WAIT socket, using the SEQ/ACK/window and
 * timestamp values preserved in the timewait sock, then drop the caller's
 * timewait reference.  Chooses TCP-AO or MD5 signing to match the
 * incoming segment's options.
 */
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
	struct tcp_key key = {};
#ifdef CONFIG_TCP_AO
	struct tcp_ao_info *ao_info;

	if (static_branch_unlikely(&tcp_ao_needed.key)) {
		/* FIXME: the segment to-be-acked is not verified yet */
		ao_info = rcu_dereference(tcptw->ao_info);
		if (ao_info) {
			const struct tcp_ao_hdr *aoh;

			/* Malformed auth options: don't reply at all */
			if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) {
				inet_twsk_put(tw);
				return;
			}

			if (aoh)
				key.ao_key = tcp_ao_established_key(ao_info, aoh->rnext_keyid, -1);
		}
	}
	if (key.ao_key) {
		struct tcp_ao_key *rnext_key;

		key.traffic_key = snd_other_key(key.ao_key);
		key.sne = READ_ONCE(ao_info->snd_sne);
		rnext_key = READ_ONCE(ao_info->rnext_key);
		key.rcv_next = rnext_key->rcvid;
		key.type = TCP_KEY_AO;
#else
	/* Without CONFIG_TCP_AO this dead arm keeps the else-if chain
	 * below syntactically intact.
	 */
	if (0) {
#endif
	} else if (static_branch_tcp_md5()) {
		key.md5_key = tcp_twsk_md5_key(tcptw);
		if (key.md5_key)
			key.type = TCP_KEY_MD5;
	}

	tcp_v4_send_ack(sk, skb,
			tcptw->tw_snd_nxt, READ_ONCE(tcptw->tw_rcv_nxt),
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_tw_tsval(tcptw),
			READ_ONCE(tcptw->tw_ts_recent),
			tw->tw_bound_dev_if, &key,
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos,
			tw->tw_txhash);

	inet_twsk_put(tw);
}
10881da177e4SLinus Torvalds 
/* ACK @skb on behalf of a request socket @req (connection still in
 * SYN-RECV, no child socket yet).  Chooses TCP-AO or MD5 signing to
 * match what the handshake used.
 */
static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	struct tcp_key key = {};

	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
					     tcp_sk(sk)->snd_nxt;

#ifdef CONFIG_TCP_AO
	if (static_branch_unlikely(&tcp_ao_needed.key) &&
	    tcp_rsk_used_ao(req)) {
		const union tcp_md5_addr *addr;
		const struct tcp_ao_hdr *aoh;
		int l3index;

		/* Invalid TCP option size or twice included auth */
		if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
			return;
		/* Handshake used AO but this segment carries no AO option:
		 * don't reply.
		 */
		if (!aoh)
			return;

		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET,
					      aoh->rnext_keyid, -1);
		if (unlikely(!key.ao_key)) {
			/* Send ACK with any matching MKT for the peer */
			key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1);
			/* Matching key disappeared (user removed the key?)
			 * let the handshake timeout.
			 */
			if (!key.ao_key) {
				net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n",
						     addr,
						     ntohs(tcp_hdr(skb)->source),
						     &ip_hdr(skb)->daddr,
						     ntohs(tcp_hdr(skb)->dest));
				return;
			}
		}
		/* Freed after tcp_v4_send_ack() below */
		key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC);
		if (!key.traffic_key)
			return;

		key.type = TCP_KEY_AO;
		key.rcv_next = aoh->keyid;
		tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req);
#else
	/* Without CONFIG_TCP_AO this dead arm keeps the else-if chain
	 * below syntactically intact.
	 */
	if (0) {
#endif
	} else if (static_branch_tcp_md5()) {
		const union tcp_md5_addr *addr;
		int l3index;

		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
		if (key.md5_key)
			key.type = TCP_KEY_MD5;
	}

	tcp_v4_send_ack(sk, skb, seq,
			tcp_rsk(req)->rcv_nxt,
			tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale,
			tcp_rsk_tsval(tcp_rsk(req)),
			READ_ONCE(req->ts_recent),
			0, &key,
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos,
			READ_ONCE(tcp_rsk(req)->txhash));
	if (tcp_key_is_ao(&key))
		kfree(key.traffic_key);
}
11651da177e4SLinus Torvalds 
11661da177e4SLinus Torvalds /*
11679bf1d83eSKris Katterjohn  *	Send a SYN-ACK after having received a SYN.
116860236fddSArnaldo Carvalho de Melo  *	This still operates on a request_sock only, not on a big
11691da177e4SLinus Torvalds  *	socket.
11701da177e4SLinus Torvalds  */
11710f935dbeSEric Dumazet static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
1172d6274bd8SOctavian Purdila 			      struct flowi *fl,
1173e6b4d113SWilliam Allen Simpson 			      struct request_sock *req,
1174ca6fb065SEric Dumazet 			      struct tcp_fastopen_cookie *foc,
1175331fca43SMartin KaFai Lau 			      enum tcp_synack_type synack_type,
1176331fca43SMartin KaFai Lau 			      struct sk_buff *syn_skb)
11771da177e4SLinus Torvalds {
11782e6599cbSArnaldo Carvalho de Melo 	const struct inet_request_sock *ireq = inet_rsk(req);
11796bd023f3SDavid S. Miller 	struct flowi4 fl4;
11801da177e4SLinus Torvalds 	int err = -1;
11811da177e4SLinus Torvalds 	struct sk_buff *skb;
1182ac8f1710SWei Wang 	u8 tos;
11831da177e4SLinus Torvalds 
11841da177e4SLinus Torvalds 	/* First, grab a route. */
1185ba3f7f04SDavid S. Miller 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1186fd80eb94SDenis V. Lunev 		return -1;
11871da177e4SLinus Torvalds 
1188331fca43SMartin KaFai Lau 	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
11891da177e4SLinus Torvalds 
11901da177e4SLinus Torvalds 	if (skb) {
1191634fb979SEric Dumazet 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
11921da177e4SLinus Torvalds 
1193e08d0b3dSEric Dumazet 		tos = READ_ONCE(inet_sk(sk)->tos);
1194e08d0b3dSEric Dumazet 
1195e08d0b3dSEric Dumazet 		if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1196e08d0b3dSEric Dumazet 			tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1197e08d0b3dSEric Dumazet 			      (tos & INET_ECN_MASK);
11981da177e4SLinus Torvalds 
1199407c85c7SAlexander Duyck 		if (!INET_ECN_is_capable(tos) &&
1200407c85c7SAlexander Duyck 		    tcp_bpf_ca_needs_ecn((struct sock *)req))
1201407c85c7SAlexander Duyck 			tos |= INET_ECN_ECT_0;
12021da177e4SLinus Torvalds 
12032ab2ddd3SEric Dumazet 		rcu_read_lock();
1204634fb979SEric Dumazet 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1205634fb979SEric Dumazet 					    ireq->ir_rmt_addr,
1206de033b7dSWei Wang 					    rcu_dereference(ireq->ireq_opt),
1207861602b5SAlexander Duyck 					    tos);
12082ab2ddd3SEric Dumazet 		rcu_read_unlock();
1209b9df3cb8SGerrit Renker 		err = net_xmit_eval(err);
12101da177e4SLinus Torvalds 	}
12111da177e4SLinus Torvalds 
12121da177e4SLinus Torvalds 	return err;
12131da177e4SLinus Torvalds }
12141da177e4SLinus Torvalds 
12151da177e4SLinus Torvalds /*
121660236fddSArnaldo Carvalho de Melo  *	IPv4 request_sock destructor.
12171da177e4SLinus Torvalds  */
121860236fddSArnaldo Carvalho de Melo static void tcp_v4_reqsk_destructor(struct request_sock *req)
12191da177e4SLinus Torvalds {
1220c92e8c02SEric Dumazet 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
12211da177e4SLinus Torvalds }
12221da177e4SLinus Torvalds 
1223cfb6eeb4SYOSHIFUJI Hideaki #ifdef CONFIG_TCP_MD5SIG
1224cfb6eeb4SYOSHIFUJI Hideaki /*
1225cfb6eeb4SYOSHIFUJI Hideaki  * RFC2385 MD5 checksumming requires a mapping of
1226cfb6eeb4SYOSHIFUJI Hideaki  * IP address->MD5 Key.
1227cfb6eeb4SYOSHIFUJI Hideaki  * We need to maintain these in the sk structure.
1228cfb6eeb4SYOSHIFUJI Hideaki  */
1229cfb6eeb4SYOSHIFUJI Hideaki 
1230459837b5SDmitry Safonov DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
12316015c71eSEric Dumazet EXPORT_SYMBOL(tcp_md5_needed);
12326015c71eSEric Dumazet 
123386f1e3a8SLeonard Crestez static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
123486f1e3a8SLeonard Crestez {
123586f1e3a8SLeonard Crestez 	if (!old)
123686f1e3a8SLeonard Crestez 		return true;
123786f1e3a8SLeonard Crestez 
123886f1e3a8SLeonard Crestez 	/* l3index always overrides non-l3index */
123986f1e3a8SLeonard Crestez 	if (old->l3index && new->l3index == 0)
124086f1e3a8SLeonard Crestez 		return false;
124186f1e3a8SLeonard Crestez 	if (old->l3index == 0 && new->l3index)
124286f1e3a8SLeonard Crestez 		return true;
124386f1e3a8SLeonard Crestez 
124486f1e3a8SLeonard Crestez 	return old->prefixlen < new->prefixlen;
124586f1e3a8SLeonard Crestez }
124686f1e3a8SLeonard Crestez 
1247cfb6eeb4SYOSHIFUJI Hideaki /* Find the Key structure for an address.  */
1248dea53bb8SDavid Ahern struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1249a915da9bSEric Dumazet 					   const union tcp_md5_addr *addr,
12500aadc739SDmitry Safonov 					   int family, bool any_l3index)
1251cfb6eeb4SYOSHIFUJI Hideaki {
1252fd3a154aSEric Dumazet 	const struct tcp_sock *tp = tcp_sk(sk);
1253a915da9bSEric Dumazet 	struct tcp_md5sig_key *key;
1254fd3a154aSEric Dumazet 	const struct tcp_md5sig_info *md5sig;
12556797318eSIvan Delalande 	__be32 mask;
12566797318eSIvan Delalande 	struct tcp_md5sig_key *best_match = NULL;
12576797318eSIvan Delalande 	bool match;
1258cfb6eeb4SYOSHIFUJI Hideaki 
1259a8afca03SEric Dumazet 	/* caller either holds rcu_read_lock() or socket lock */
1260a8afca03SEric Dumazet 	md5sig = rcu_dereference_check(tp->md5sig_info,
12611e1d04e6SHannes Frederic Sowa 				       lockdep_sock_is_held(sk));
1262a8afca03SEric Dumazet 	if (!md5sig)
1263cfb6eeb4SYOSHIFUJI Hideaki 		return NULL;
1264083a0326SArnd Bergmann 
1265c8b91770SAmol Grover 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1266c8b91770SAmol Grover 				 lockdep_sock_is_held(sk)) {
1267a915da9bSEric Dumazet 		if (key->family != family)
1268a915da9bSEric Dumazet 			continue;
12690aadc739SDmitry Safonov 		if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX &&
12700aadc739SDmitry Safonov 		    key->l3index != l3index)
1271dea53bb8SDavid Ahern 			continue;
12726797318eSIvan Delalande 		if (family == AF_INET) {
12736797318eSIvan Delalande 			mask = inet_make_mask(key->prefixlen);
12746797318eSIvan Delalande 			match = (key->addr.a4.s_addr & mask) ==
12756797318eSIvan Delalande 				(addr->a4.s_addr & mask);
12766797318eSIvan Delalande #if IS_ENABLED(CONFIG_IPV6)
12776797318eSIvan Delalande 		} else if (family == AF_INET6) {
12786797318eSIvan Delalande 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
12796797318eSIvan Delalande 						  key->prefixlen);
12806797318eSIvan Delalande #endif
12816797318eSIvan Delalande 		} else {
12826797318eSIvan Delalande 			match = false;
12836797318eSIvan Delalande 		}
12846797318eSIvan Delalande 
128586f1e3a8SLeonard Crestez 		if (match && better_md5_match(best_match, key))
12866797318eSIvan Delalande 			best_match = key;
12876797318eSIvan Delalande 	}
12886797318eSIvan Delalande 	return best_match;
12896797318eSIvan Delalande }
12906015c71eSEric Dumazet EXPORT_SYMBOL(__tcp_md5_do_lookup);
12916797318eSIvan Delalande 
1292e8f37d57SWu Fengguang static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
12936797318eSIvan Delalande 						      const union tcp_md5_addr *addr,
1294dea53bb8SDavid Ahern 						      int family, u8 prefixlen,
1295a76c2315SLeonard Crestez 						      int l3index, u8 flags)
12966797318eSIvan Delalande {
12976797318eSIvan Delalande 	const struct tcp_sock *tp = tcp_sk(sk);
12986797318eSIvan Delalande 	struct tcp_md5sig_key *key;
12996797318eSIvan Delalande 	unsigned int size = sizeof(struct in_addr);
13006797318eSIvan Delalande 	const struct tcp_md5sig_info *md5sig;
13016797318eSIvan Delalande 
13026797318eSIvan Delalande 	/* caller either holds rcu_read_lock() or socket lock */
13036797318eSIvan Delalande 	md5sig = rcu_dereference_check(tp->md5sig_info,
13046797318eSIvan Delalande 				       lockdep_sock_is_held(sk));
13056797318eSIvan Delalande 	if (!md5sig)
13066797318eSIvan Delalande 		return NULL;
13076797318eSIvan Delalande #if IS_ENABLED(CONFIG_IPV6)
13086797318eSIvan Delalande 	if (family == AF_INET6)
13096797318eSIvan Delalande 		size = sizeof(struct in6_addr);
13106797318eSIvan Delalande #endif
1311c8b91770SAmol Grover 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1312c8b91770SAmol Grover 				 lockdep_sock_is_held(sk)) {
13136797318eSIvan Delalande 		if (key->family != family)
13146797318eSIvan Delalande 			continue;
1315a76c2315SLeonard Crestez 		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1316a76c2315SLeonard Crestez 			continue;
131786f1e3a8SLeonard Crestez 		if (key->l3index != l3index)
1318dea53bb8SDavid Ahern 			continue;
13196797318eSIvan Delalande 		if (!memcmp(&key->addr, addr, size) &&
13206797318eSIvan Delalande 		    key->prefixlen == prefixlen)
1321a915da9bSEric Dumazet 			return key;
1322cfb6eeb4SYOSHIFUJI Hideaki 	}
1323cfb6eeb4SYOSHIFUJI Hideaki 	return NULL;
1324cfb6eeb4SYOSHIFUJI Hideaki }
1325cfb6eeb4SYOSHIFUJI Hideaki 
1326b83e3debSEric Dumazet struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1327fd3a154aSEric Dumazet 					 const struct sock *addr_sk)
1328cfb6eeb4SYOSHIFUJI Hideaki {
1329b52e6921SEric Dumazet 	const union tcp_md5_addr *addr;
1330dea53bb8SDavid Ahern 	int l3index;
1331a915da9bSEric Dumazet 
1332dea53bb8SDavid Ahern 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1333dea53bb8SDavid Ahern 						 addr_sk->sk_bound_dev_if);
1334b52e6921SEric Dumazet 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1335dea53bb8SDavid Ahern 	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1336cfb6eeb4SYOSHIFUJI Hideaki }
1337cfb6eeb4SYOSHIFUJI Hideaki EXPORT_SYMBOL(tcp_v4_md5_lookup);
1338cfb6eeb4SYOSHIFUJI Hideaki 
1339f62c7517SDmitry Safonov static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
1340f62c7517SDmitry Safonov {
1341f62c7517SDmitry Safonov 	struct tcp_sock *tp = tcp_sk(sk);
1342f62c7517SDmitry Safonov 	struct tcp_md5sig_info *md5sig;
1343f62c7517SDmitry Safonov 
1344f62c7517SDmitry Safonov 	md5sig = kmalloc(sizeof(*md5sig), gfp);
1345f62c7517SDmitry Safonov 	if (!md5sig)
1346f62c7517SDmitry Safonov 		return -ENOMEM;
1347f62c7517SDmitry Safonov 
1348f62c7517SDmitry Safonov 	sk_gso_disable(sk);
1349f62c7517SDmitry Safonov 	INIT_HLIST_HEAD(&md5sig->head);
1350f62c7517SDmitry Safonov 	rcu_assign_pointer(tp->md5sig_info, md5sig);
1351f62c7517SDmitry Safonov 	return 0;
1352f62c7517SDmitry Safonov }
1353f62c7517SDmitry Safonov 
1354cfb6eeb4SYOSHIFUJI Hideaki /* This can be called on a newly created socket, from other files */
1355459837b5SDmitry Safonov static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1356a76c2315SLeonard Crestez 			    int family, u8 prefixlen, int l3index, u8 flags,
1357dea53bb8SDavid Ahern 			    const u8 *newkey, u8 newkeylen, gfp_t gfp)
1358cfb6eeb4SYOSHIFUJI Hideaki {
1359cfb6eeb4SYOSHIFUJI Hideaki 	/* Add Key to the list */
1360b0a713e9SMatthias M. Dellweg 	struct tcp_md5sig_key *key;
1361cfb6eeb4SYOSHIFUJI Hideaki 	struct tcp_sock *tp = tcp_sk(sk);
1362f6685938SArnaldo Carvalho de Melo 	struct tcp_md5sig_info *md5sig;
1363f6685938SArnaldo Carvalho de Melo 
1364a76c2315SLeonard Crestez 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1365a915da9bSEric Dumazet 	if (key) {
1366e6ced831SEric Dumazet 		/* Pre-existing entry - just update that one.
1367e6ced831SEric Dumazet 		 * Note that the key might be used concurrently.
1368e6ced831SEric Dumazet 		 * data_race() is telling kcsan that we do not care of
1369e6ced831SEric Dumazet 		 * key mismatches, since changing MD5 key on live flows
1370e6ced831SEric Dumazet 		 * can lead to packet drops.
1371e6ced831SEric Dumazet 		 */
1372e6ced831SEric Dumazet 		data_race(memcpy(key->key, newkey, newkeylen));
13736a2febecSEric Dumazet 
1374e6ced831SEric Dumazet 		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
1375e6ced831SEric Dumazet 		 * Also note that a reader could catch new key->keylen value
1376e6ced831SEric Dumazet 		 * but old key->key[], this is the reason we use __GFP_ZERO
1377e6ced831SEric Dumazet 		 * at sock_kmalloc() time below these lines.
1378e6ced831SEric Dumazet 		 */
1379e6ced831SEric Dumazet 		WRITE_ONCE(key->keylen, newkeylen);
13806a2febecSEric Dumazet 
1381a915da9bSEric Dumazet 		return 0;
1382cfb6eeb4SYOSHIFUJI Hideaki 	}
1383260fcbebSYan, Zheng 
1384f62c7517SDmitry Safonov 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1385f62c7517SDmitry Safonov 					   lockdep_sock_is_held(sk));
1386a915da9bSEric Dumazet 
1387e6ced831SEric Dumazet 	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1388a915da9bSEric Dumazet 	if (!key)
1389a915da9bSEric Dumazet 		return -ENOMEM;
1390f6685938SArnaldo Carvalho de Melo 
1391a915da9bSEric Dumazet 	memcpy(key->key, newkey, newkeylen);
1392a915da9bSEric Dumazet 	key->keylen = newkeylen;
1393a915da9bSEric Dumazet 	key->family = family;
13946797318eSIvan Delalande 	key->prefixlen = prefixlen;
1395dea53bb8SDavid Ahern 	key->l3index = l3index;
1396a76c2315SLeonard Crestez 	key->flags = flags;
1397a915da9bSEric Dumazet 	memcpy(&key->addr, addr,
13983a2cd89bShuhai 	       (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1399a915da9bSEric Dumazet 								 sizeof(struct in_addr));
1400a915da9bSEric Dumazet 	hlist_add_head_rcu(&key->node, &md5sig->head);
1401cfb6eeb4SYOSHIFUJI Hideaki 	return 0;
1402cfb6eeb4SYOSHIFUJI Hideaki }
1403459837b5SDmitry Safonov 
1404459837b5SDmitry Safonov int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1405459837b5SDmitry Safonov 		   int family, u8 prefixlen, int l3index, u8 flags,
1406459837b5SDmitry Safonov 		   const u8 *newkey, u8 newkeylen)
1407459837b5SDmitry Safonov {
1408459837b5SDmitry Safonov 	struct tcp_sock *tp = tcp_sk(sk);
1409459837b5SDmitry Safonov 
1410459837b5SDmitry Safonov 	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
14118c73b263SDmitry Safonov 		if (tcp_md5_alloc_sigpool())
1412459837b5SDmitry Safonov 			return -ENOMEM;
1413459837b5SDmitry Safonov 
14148c73b263SDmitry Safonov 		if (tcp_md5sig_info_add(sk, GFP_KERNEL)) {
14158c73b263SDmitry Safonov 			tcp_md5_release_sigpool();
14168c73b263SDmitry Safonov 			return -ENOMEM;
14178c73b263SDmitry Safonov 		}
14188c73b263SDmitry Safonov 
1419459837b5SDmitry Safonov 		if (!static_branch_inc(&tcp_md5_needed.key)) {
1420459837b5SDmitry Safonov 			struct tcp_md5sig_info *md5sig;
1421459837b5SDmitry Safonov 
1422459837b5SDmitry Safonov 			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1423459837b5SDmitry Safonov 			rcu_assign_pointer(tp->md5sig_info, NULL);
142455fb80d5SEric Dumazet 			kfree_rcu(md5sig, rcu);
14258c73b263SDmitry Safonov 			tcp_md5_release_sigpool();
1426459837b5SDmitry Safonov 			return -EUSERS;
1427459837b5SDmitry Safonov 		}
1428459837b5SDmitry Safonov 	}
1429459837b5SDmitry Safonov 
1430459837b5SDmitry Safonov 	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
1431459837b5SDmitry Safonov 				newkey, newkeylen, GFP_KERNEL);
1432459837b5SDmitry Safonov }
1433a915da9bSEric Dumazet EXPORT_SYMBOL(tcp_md5_do_add);
1434cfb6eeb4SYOSHIFUJI Hideaki 
1435459837b5SDmitry Safonov int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
1436459837b5SDmitry Safonov 		     int family, u8 prefixlen, int l3index,
1437459837b5SDmitry Safonov 		     struct tcp_md5sig_key *key)
1438459837b5SDmitry Safonov {
1439459837b5SDmitry Safonov 	struct tcp_sock *tp = tcp_sk(sk);
1440459837b5SDmitry Safonov 
1441459837b5SDmitry Safonov 	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
14428c73b263SDmitry Safonov 		tcp_md5_add_sigpool();
14438c73b263SDmitry Safonov 
14448c73b263SDmitry Safonov 		if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC))) {
14458c73b263SDmitry Safonov 			tcp_md5_release_sigpool();
1446459837b5SDmitry Safonov 			return -ENOMEM;
14478c73b263SDmitry Safonov 		}
1448459837b5SDmitry Safonov 
1449459837b5SDmitry Safonov 		if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
1450459837b5SDmitry Safonov 			struct tcp_md5sig_info *md5sig;
1451459837b5SDmitry Safonov 
1452459837b5SDmitry Safonov 			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1453459837b5SDmitry Safonov 			net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
1454459837b5SDmitry Safonov 			rcu_assign_pointer(tp->md5sig_info, NULL);
145555fb80d5SEric Dumazet 			kfree_rcu(md5sig, rcu);
14568c73b263SDmitry Safonov 			tcp_md5_release_sigpool();
1457459837b5SDmitry Safonov 			return -EUSERS;
1458459837b5SDmitry Safonov 		}
1459459837b5SDmitry Safonov 	}
1460459837b5SDmitry Safonov 
1461459837b5SDmitry Safonov 	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
1462459837b5SDmitry Safonov 				key->flags, key->key, key->keylen,
1463459837b5SDmitry Safonov 				sk_gfp_mask(sk, GFP_ATOMIC));
1464459837b5SDmitry Safonov }
1465459837b5SDmitry Safonov EXPORT_SYMBOL(tcp_md5_key_copy);
1466459837b5SDmitry Safonov 
14676797318eSIvan Delalande int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1468a76c2315SLeonard Crestez 		   u8 prefixlen, int l3index, u8 flags)
1469cfb6eeb4SYOSHIFUJI Hideaki {
1470a915da9bSEric Dumazet 	struct tcp_md5sig_key *key;
1471cfb6eeb4SYOSHIFUJI Hideaki 
1472a76c2315SLeonard Crestez 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1473a915da9bSEric Dumazet 	if (!key)
1474cfb6eeb4SYOSHIFUJI Hideaki 		return -ENOENT;
1475a915da9bSEric Dumazet 	hlist_del_rcu(&key->node);
14765f3d9cb2SEric Dumazet 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1477a915da9bSEric Dumazet 	kfree_rcu(key, rcu);
1478a915da9bSEric Dumazet 	return 0;
1479cfb6eeb4SYOSHIFUJI Hideaki }
1480a915da9bSEric Dumazet EXPORT_SYMBOL(tcp_md5_do_del);
1481cfb6eeb4SYOSHIFUJI Hideaki 
14820aadc739SDmitry Safonov void tcp_clear_md5_list(struct sock *sk)
1483cfb6eeb4SYOSHIFUJI Hideaki {
1484cfb6eeb4SYOSHIFUJI Hideaki 	struct tcp_sock *tp = tcp_sk(sk);
1485a915da9bSEric Dumazet 	struct tcp_md5sig_key *key;
1486b67bfe0dSSasha Levin 	struct hlist_node *n;
1487a8afca03SEric Dumazet 	struct tcp_md5sig_info *md5sig;
1488cfb6eeb4SYOSHIFUJI Hideaki 
1489a8afca03SEric Dumazet 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1490a8afca03SEric Dumazet 
1491b67bfe0dSSasha Levin 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1492a915da9bSEric Dumazet 		hlist_del_rcu(&key->node);
14935f3d9cb2SEric Dumazet 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1494a915da9bSEric Dumazet 		kfree_rcu(key, rcu);
1495cfb6eeb4SYOSHIFUJI Hideaki 	}
1496cfb6eeb4SYOSHIFUJI Hideaki }
1497cfb6eeb4SYOSHIFUJI Hideaki 
14988917a777SIvan Delalande static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1499d4c19c49SChristoph Hellwig 				 sockptr_t optval, int optlen)
1500cfb6eeb4SYOSHIFUJI Hideaki {
1501cfb6eeb4SYOSHIFUJI Hideaki 	struct tcp_md5sig cmd;
1502cfb6eeb4SYOSHIFUJI Hideaki 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1503cea97609SDavid Ahern 	const union tcp_md5_addr *addr;
15048917a777SIvan Delalande 	u8 prefixlen = 32;
1505dea53bb8SDavid Ahern 	int l3index = 0;
1506248411b8SDmitry Safonov 	bool l3flag;
1507a76c2315SLeonard Crestez 	u8 flags;
1508cfb6eeb4SYOSHIFUJI Hideaki 
1509cfb6eeb4SYOSHIFUJI Hideaki 	if (optlen < sizeof(cmd))
1510cfb6eeb4SYOSHIFUJI Hideaki 		return -EINVAL;
1511cfb6eeb4SYOSHIFUJI Hideaki 
1512d4c19c49SChristoph Hellwig 	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1513cfb6eeb4SYOSHIFUJI Hideaki 		return -EFAULT;
1514cfb6eeb4SYOSHIFUJI Hideaki 
1515cfb6eeb4SYOSHIFUJI Hideaki 	if (sin->sin_family != AF_INET)
1516cfb6eeb4SYOSHIFUJI Hideaki 		return -EINVAL;
1517cfb6eeb4SYOSHIFUJI Hideaki 
1518a76c2315SLeonard Crestez 	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1519248411b8SDmitry Safonov 	l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1520a76c2315SLeonard Crestez 
15218917a777SIvan Delalande 	if (optname == TCP_MD5SIG_EXT &&
15228917a777SIvan Delalande 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
15238917a777SIvan Delalande 		prefixlen = cmd.tcpm_prefixlen;
15248917a777SIvan Delalande 		if (prefixlen > 32)
15258917a777SIvan Delalande 			return -EINVAL;
15268917a777SIvan Delalande 	}
15278917a777SIvan Delalande 
1528a76c2315SLeonard Crestez 	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
15296b102db5SDavid Ahern 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
15306b102db5SDavid Ahern 		struct net_device *dev;
15316b102db5SDavid Ahern 
15326b102db5SDavid Ahern 		rcu_read_lock();
15336b102db5SDavid Ahern 		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
15346b102db5SDavid Ahern 		if (dev && netif_is_l3_master(dev))
15356b102db5SDavid Ahern 			l3index = dev->ifindex;
15366b102db5SDavid Ahern 
15376b102db5SDavid Ahern 		rcu_read_unlock();
15386b102db5SDavid Ahern 
15396b102db5SDavid Ahern 		/* ok to reference set/not set outside of rcu;
15406b102db5SDavid Ahern 		 * right now device MUST be an L3 master
15416b102db5SDavid Ahern 		 */
15426b102db5SDavid Ahern 		if (!dev || !l3index)
15436b102db5SDavid Ahern 			return -EINVAL;
15446b102db5SDavid Ahern 	}
15456b102db5SDavid Ahern 
1546cea97609SDavid Ahern 	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1547cea97609SDavid Ahern 
154864a124edSDmitry Popov 	if (!cmd.tcpm_keylen)
1549a76c2315SLeonard Crestez 		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1550cfb6eeb4SYOSHIFUJI Hideaki 
1551cfb6eeb4SYOSHIFUJI Hideaki 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1552cfb6eeb4SYOSHIFUJI Hideaki 		return -EINVAL;
1553cfb6eeb4SYOSHIFUJI Hideaki 
15540aadc739SDmitry Safonov 	/* Don't allow keys for peers that have a matching TCP-AO key.
15550aadc739SDmitry Safonov 	 * See the comment in tcp_ao_add_cmd()
15560aadc739SDmitry Safonov 	 */
1557248411b8SDmitry Safonov 	if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false))
15580aadc739SDmitry Safonov 		return -EKEYREJECTED;
15590aadc739SDmitry Safonov 
1560a76c2315SLeonard Crestez 	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1561459837b5SDmitry Safonov 			      cmd.tcpm_key, cmd.tcpm_keylen);
1562cfb6eeb4SYOSHIFUJI Hideaki }
1563cfb6eeb4SYOSHIFUJI Hideaki 
15648c73b263SDmitry Safonov static int tcp_v4_md5_hash_headers(struct tcp_sigpool *hp,
156519689e38SEric Dumazet 				   __be32 daddr, __be32 saddr,
156619689e38SEric Dumazet 				   const struct tcphdr *th, int nbytes)
1567cfb6eeb4SYOSHIFUJI Hideaki {
1568cfb6eeb4SYOSHIFUJI Hideaki 	struct tcp4_pseudohdr *bp;
156949a72dfbSAdam Langley 	struct scatterlist sg;
157019689e38SEric Dumazet 	struct tcphdr *_th;
1571cfb6eeb4SYOSHIFUJI Hideaki 
157219689e38SEric Dumazet 	bp = hp->scratch;
1573cfb6eeb4SYOSHIFUJI Hideaki 	bp->saddr = saddr;
1574cfb6eeb4SYOSHIFUJI Hideaki 	bp->daddr = daddr;
1575cfb6eeb4SYOSHIFUJI Hideaki 	bp->pad = 0;
1576076fb722SYOSHIFUJI Hideaki 	bp->protocol = IPPROTO_TCP;
157749a72dfbSAdam Langley 	bp->len = cpu_to_be16(nbytes);
1578c7da57a1SDavid S. Miller 
157919689e38SEric Dumazet 	_th = (struct tcphdr *)(bp + 1);
158019689e38SEric Dumazet 	memcpy(_th, th, sizeof(*th));
158119689e38SEric Dumazet 	_th->check = 0;
158219689e38SEric Dumazet 
158319689e38SEric Dumazet 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
15848c73b263SDmitry Safonov 	ahash_request_set_crypt(hp->req, &sg, NULL,
158519689e38SEric Dumazet 				sizeof(*bp) + sizeof(*th));
15868c73b263SDmitry Safonov 	return crypto_ahash_update(hp->req);
158749a72dfbSAdam Langley }
158849a72dfbSAdam Langley 
1589a915da9bSEric Dumazet static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1590318cf7aaSEric Dumazet 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
159149a72dfbSAdam Langley {
15928c73b263SDmitry Safonov 	struct tcp_sigpool hp;
159349a72dfbSAdam Langley 
15948c73b263SDmitry Safonov 	if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
15958c73b263SDmitry Safonov 		goto clear_hash_nostart;
159649a72dfbSAdam Langley 
15978c73b263SDmitry Safonov 	if (crypto_ahash_init(hp.req))
159849a72dfbSAdam Langley 		goto clear_hash;
15998c73b263SDmitry Safonov 	if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, th->doff << 2))
160049a72dfbSAdam Langley 		goto clear_hash;
16018c73b263SDmitry Safonov 	if (tcp_md5_hash_key(&hp, key))
160249a72dfbSAdam Langley 		goto clear_hash;
16038c73b263SDmitry Safonov 	ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
16048c73b263SDmitry Safonov 	if (crypto_ahash_final(hp.req))
1605cfb6eeb4SYOSHIFUJI Hideaki 		goto clear_hash;
1606cfb6eeb4SYOSHIFUJI Hideaki 
16078c73b263SDmitry Safonov 	tcp_sigpool_end(&hp);
1608cfb6eeb4SYOSHIFUJI Hideaki 	return 0;
160949a72dfbSAdam Langley 
1610cfb6eeb4SYOSHIFUJI Hideaki clear_hash:
16118c73b263SDmitry Safonov 	tcp_sigpool_end(&hp);
16128c73b263SDmitry Safonov clear_hash_nostart:
1613cfb6eeb4SYOSHIFUJI Hideaki 	memset(md5_hash, 0, 16);
161449a72dfbSAdam Langley 	return 1;
1615cfb6eeb4SYOSHIFUJI Hideaki }
1616cfb6eeb4SYOSHIFUJI Hideaki 
161739f8e58eSEric Dumazet int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
161839f8e58eSEric Dumazet 			const struct sock *sk,
1619318cf7aaSEric Dumazet 			const struct sk_buff *skb)
1620cfb6eeb4SYOSHIFUJI Hideaki {
1621318cf7aaSEric Dumazet 	const struct tcphdr *th = tcp_hdr(skb);
16228c73b263SDmitry Safonov 	struct tcp_sigpool hp;
1623cfb6eeb4SYOSHIFUJI Hideaki 	__be32 saddr, daddr;
1624cfb6eeb4SYOSHIFUJI Hideaki 
162539f8e58eSEric Dumazet 	if (sk) { /* valid for establish/request sockets */
162639f8e58eSEric Dumazet 		saddr = sk->sk_rcv_saddr;
162739f8e58eSEric Dumazet 		daddr = sk->sk_daddr;
1628cfb6eeb4SYOSHIFUJI Hideaki 	} else {
162949a72dfbSAdam Langley 		const struct iphdr *iph = ip_hdr(skb);
163049a72dfbSAdam Langley 		saddr = iph->saddr;
163149a72dfbSAdam Langley 		daddr = iph->daddr;
1632cfb6eeb4SYOSHIFUJI Hideaki 	}
1633cfb6eeb4SYOSHIFUJI Hideaki 
16348c73b263SDmitry Safonov 	if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
16358c73b263SDmitry Safonov 		goto clear_hash_nostart;
163649a72dfbSAdam Langley 
16378c73b263SDmitry Safonov 	if (crypto_ahash_init(hp.req))
163849a72dfbSAdam Langley 		goto clear_hash;
163949a72dfbSAdam Langley 
16408c73b263SDmitry Safonov 	if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, skb->len))
164149a72dfbSAdam Langley 		goto clear_hash;
16428c73b263SDmitry Safonov 	if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2))
164349a72dfbSAdam Langley 		goto clear_hash;
16448c73b263SDmitry Safonov 	if (tcp_md5_hash_key(&hp, key))
164549a72dfbSAdam Langley 		goto clear_hash;
16468c73b263SDmitry Safonov 	ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
16478c73b263SDmitry Safonov 	if (crypto_ahash_final(hp.req))
164849a72dfbSAdam Langley 		goto clear_hash;
164949a72dfbSAdam Langley 
16508c73b263SDmitry Safonov 	tcp_sigpool_end(&hp);
165149a72dfbSAdam Langley 	return 0;
165249a72dfbSAdam Langley 
165349a72dfbSAdam Langley clear_hash:
16548c73b263SDmitry Safonov 	tcp_sigpool_end(&hp);
16558c73b263SDmitry Safonov clear_hash_nostart:
165649a72dfbSAdam Langley 	memset(md5_hash, 0, 16);
165749a72dfbSAdam Langley 	return 1;
165849a72dfbSAdam Langley }
165949a72dfbSAdam Langley EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1660cfb6eeb4SYOSHIFUJI Hideaki 
1661ba8e275aSEric Dumazet #endif
1662ba8e275aSEric Dumazet 
1663b40cf18eSEric Dumazet static void tcp_v4_init_req(struct request_sock *req,
1664b40cf18eSEric Dumazet 			    const struct sock *sk_listener,
166516bea70aSOctavian Purdila 			    struct sk_buff *skb)
166616bea70aSOctavian Purdila {
166716bea70aSOctavian Purdila 	struct inet_request_sock *ireq = inet_rsk(req);
1668c92e8c02SEric Dumazet 	struct net *net = sock_net(sk_listener);
166916bea70aSOctavian Purdila 
167008d2cc3bSEric Dumazet 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
167108d2cc3bSEric Dumazet 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1672c92e8c02SEric Dumazet 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
167316bea70aSOctavian Purdila }
167416bea70aSOctavian Purdila 
1675f964629eSEric Dumazet static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
16767ea851d1SFlorian Westphal 					  struct sk_buff *skb,
1677f964629eSEric Dumazet 					  struct flowi *fl,
1678b9e81040SEric Dumazet 					  struct request_sock *req,
1679b9e81040SEric Dumazet 					  u32 tw_isn)
1680d94e0417SOctavian Purdila {
16817ea851d1SFlorian Westphal 	tcp_v4_init_req(req, sk, skb);
16827ea851d1SFlorian Westphal 
16837ea851d1SFlorian Westphal 	if (security_inet_conn_request(sk, skb, req))
16847ea851d1SFlorian Westphal 		return NULL;
16857ea851d1SFlorian Westphal 
16864396e461SSoheil Hassas Yeganeh 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1687d94e0417SOctavian Purdila }
1688d94e0417SOctavian Purdila 
168972a3effaSEric Dumazet struct request_sock_ops tcp_request_sock_ops __read_mostly = {
16901da177e4SLinus Torvalds 	.family		=	PF_INET,
16912e6599cbSArnaldo Carvalho de Melo 	.obj_size	=	sizeof(struct tcp_request_sock),
16925db92c99SOctavian Purdila 	.rtx_syn_ack	=	tcp_rtx_synack,
169360236fddSArnaldo Carvalho de Melo 	.send_ack	=	tcp_v4_reqsk_send_ack,
169460236fddSArnaldo Carvalho de Melo 	.destructor	=	tcp_v4_reqsk_destructor,
16951da177e4SLinus Torvalds 	.send_reset	=	tcp_v4_send_reset,
169672659eccSOctavian Purdila 	.syn_ack_timeout =	tcp_syn_ack_timeout,
16971da177e4SLinus Torvalds };
16981da177e4SLinus Torvalds 
169935b2c321SMat Martineau const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
17002aec4a29SOctavian Purdila 	.mss_clamp	=	TCP_MSS_DEFAULT,
170116bea70aSOctavian Purdila #ifdef CONFIG_TCP_MD5SIG
1702fd3a154aSEric Dumazet 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1703e3afe7b7SJohn Dykstra 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1704b6332e6cSAndrew Morton #endif
170506b22ef2SDmitry Safonov #ifdef CONFIG_TCP_AO
170606b22ef2SDmitry Safonov 	.ao_lookup	=	tcp_v4_ao_lookup_rsk,
170706b22ef2SDmitry Safonov 	.ao_calc_key	=	tcp_v4_ao_calc_key_rsk,
17089427c6aaSDmitry Safonov 	.ao_synack_hash	=	tcp_v4_ao_synack_hash,
170906b22ef2SDmitry Safonov #endif
1710fb7b37a7SOctavian Purdila #ifdef CONFIG_SYN_COOKIES
1711fb7b37a7SOctavian Purdila 	.cookie_init_seq =	cookie_v4_init_sequence,
1712fb7b37a7SOctavian Purdila #endif
1713d94e0417SOctavian Purdila 	.route_req	=	tcp_v4_route_req,
171484b114b9SEric Dumazet 	.init_seq	=	tcp_v4_init_seq,
171584b114b9SEric Dumazet 	.init_ts_off	=	tcp_v4_init_ts_off,
1716d6274bd8SOctavian Purdila 	.send_synack	=	tcp_v4_send_synack,
171716bea70aSOctavian Purdila };
1718cfb6eeb4SYOSHIFUJI Hideaki 
17191da177e4SLinus Torvalds int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
17201da177e4SLinus Torvalds {
17211da177e4SLinus Torvalds 	/* Never answer to SYNs send to broadcast or multicast */
1722511c3f92SEric Dumazet 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
17231da177e4SLinus Torvalds 		goto drop;
17241da177e4SLinus Torvalds 
17251fb6f159SOctavian Purdila 	return tcp_conn_request(&tcp_request_sock_ops,
17261fb6f159SOctavian Purdila 				&tcp_request_sock_ipv4_ops, sk, skb);
17271da177e4SLinus Torvalds 
17281da177e4SLinus Torvalds drop:
17299caad864SEric Dumazet 	tcp_listendrop(sk);
17301da177e4SLinus Torvalds 	return 0;
17311da177e4SLinus Torvalds }
17324bc2f18bSEric Dumazet EXPORT_SYMBOL(tcp_v4_conn_request);
17331da177e4SLinus Torvalds 
17341da177e4SLinus Torvalds 
/*
 * The three way handshake has completed - we got a valid ACK from the
 * peer completing the handshake - now create the new socket.
 */
/* tcp_v4_syn_recv_sock() - create the child socket for a completed handshake.
 * @sk: the listener socket
 * @skb: the skb that completed the handshake
 * @req: the request sock (mini socket) being promoted to a full socket
 * @dst: cached route, or NULL to look one up here (non-NULL in the
 *	 syncookie case, see end of cookie_v4_check())
 * @req_unhash: request sock to replace in the established hash
 * @own_req: set true when we inserted the child ourselves
 *
 * Returns the new socket, or NULL on failure (accept queue full, memory
 * shortage, routing failure, MD5/AO key copy failure).
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req)
{
	struct inet_request_sock *ireq;
	bool found_dup_sk = false;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	const union tcp_md5_addr *addr;
	struct tcp_md5sig_key *key;
	int l3index;
#endif
	struct ip_options_rcu *inet_opt;

	/* Accept queue is full: account a listen overflow and drop. */
	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	/* Copy addressing and IP-level options from the request sock /
	 * incoming skb into the child.
	 */
	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	sk_daddr_set(newsk, ireq->ir_rmt_addr);
	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
	newsk->sk_bound_dev_if = ireq->ir_iif;
	newinet->inet_saddr   = ireq->ir_loc_addr;
	inet_opt	      = rcu_dereference(ireq->ireq_opt);
	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	newinet->rcv_tos      = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	atomic_set(&newinet->inet_id, get_random_u16());

	/* Set ToS of the new socket based upon the value of incoming SYN.
	 * ECT bits are set later in tcp_init_transfer().
	 */
	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
	/* Copy over the MD5 key from the original socket */
	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	/* MD5 and AO are mutually exclusive: skip MD5 if AO was used on req */
	if (key && !tcp_rsk_used_ao(req)) {
		if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
			goto put_and_exit;
		sk_gso_disable(newsk);
	}
#endif
#ifdef CONFIG_TCP_AO
	if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET))
		goto put_and_exit; /* OOM, release back memory */
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	/* Insert child in the established hash (replacing req_unhash);
	 * *own_req tells whether we did the insertion ourselves.
	 */
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
				       &found_dup_sk);
	if (likely(*own_req)) {
		tcp_move_syn(newtp, req);
		/* child now owns the options; prevent a double free */
		ireq->ireq_opt = NULL;
	} else {
		newinet->inet_opt = NULL;

		if (!req_unhash && found_dup_sk) {
			/* This code path should only be executed in the
			 * syncookie case
			 */
			bh_unlock_sock(newsk);
			sock_put(newsk);
			newsk = NULL;
		}
	}
	return newsk;

exit_overflow:
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	tcp_listendrop(sk);
	return NULL;
put_and_exit:
	/* Tear down a partially initialized child. */
	newinet->inet_opt = NULL;
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
18571da177e4SLinus Torvalds 
/* Syncookie hook for packets hitting a listener: a non-SYN segment
 * (i.e. the ACK of a handshake whose request sock was never stored)
 * may carry an encoded cookie, so hand it to cookie_v4_check(), which
 * may replace sk with a newly created child socket (or NULL; the
 * caller handles that).  Without CONFIG_SYN_COOKIES, sk is returned
 * unchanged.
 */
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	const struct tcphdr *th = tcp_hdr(skb);

	if (!th->syn)
		sk = cookie_v4_check(sk, skb);
#endif
	return sk;
}
18681da177e4SLinus Torvalds 
/* Generate a SYN cookie for sk without going through the regular
 * listener path.  Computes the clamped MSS for the incoming SYN via
 * tcp_get_syncookie_mss(); on success stores the cookie sequence
 * number in *cookie and records the overflow event on sk.
 * Returns the MSS, or 0 when no cookie can be generated (also when
 * CONFIG_SYN_COOKIES is not set).
 */
u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
			 struct tcphdr *th, u32 *cookie)
{
	u16 mss = 0;
#ifdef CONFIG_SYN_COOKIES
	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
				    &tcp_request_sock_ipv4_ops, sk, th);
	if (mss) {
		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
		tcp_synq_overflow(sk);
	}
#endif
	return mss;
}
1884bbd807dfSBrian Vazquez INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1885bbd807dfSBrian Vazquez 							   u32));
/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
18881da177e4SLinus Torvalds  *
18891da177e4SLinus Torvalds  * We have a potential double-lock case here, so even when
18901da177e4SLinus Torvalds  * doing backlog processing we use the BH locking scheme.
18911da177e4SLinus Torvalds  * This is because we cannot sleep with the original spinlock
18921da177e4SLinus Torvalds  * held.
18931da177e4SLinus Torvalds  */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	enum skb_drop_reason reason;
	struct sock *rsk;

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst;

		dst = rcu_dereference_protected(sk->sk_rx_dst,
						lockdep_sock_is_held(sk));

		sock_rps_save_rxhash(sk, skb);
		sk_mark_napi_id(sk, skb);
		/* Drop the cached rx route if the packet arrived on a
		 * different interface or the route no longer checks out.
		 */
		if (dst) {
			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
					     dst, 0)) {
				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
				dst_release(dst);
			}
		}
		tcp_rcv_established(sk, skb);
		return 0;
	}

	if (tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		/* May return a freshly created child (syncookie case),
		 * the listener itself, or NULL.
		 */
		struct sock *nsk = tcp_v4_cookie_check(sk, skb);

		if (!nsk)
			return 0;
		if (nsk != sk) {
			reason = tcp_child_process(sk, nsk, skb);
			if (reason) {
				/* Reset goes out on the child socket */
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	reason = tcp_rcv_state_process(sk, skb);
	if (reason) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason));
discard:
	sk_skb_reason_drop(sk, skb, reason);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	reason = SKB_DROP_REASON_TCP_CSUM;
	trace_tcp_bad_csum(skb);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);
19641da177e4SLinus Torvalds 
/* Early demultiplex: try to find the established socket for an incoming
 * TCP packet before the regular IP receive path runs, attach it to the
 * skb, and reuse the socket's validated rx dst when it matches the
 * arrival interface.  Always returns 0 (processing continues normally).
 */
int tcp_v4_early_demux(struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;

	if (skb->pkt_type != PACKET_HOST)
		return 0;

	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
		return 0;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	/* Malformed header: data offset smaller than the minimal header */
	if (th->doff < sizeof(struct tcphdr) / 4)
		return 0;

	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
				       iph->saddr, th->source,
				       iph->daddr, ntohs(th->dest),
				       skb->skb_iif, inet_sdif(skb));
	if (sk) {
		skb->sk = sk;
		skb->destructor = sock_edemux;
		/* Only full sockets carry a usable cached rx route */
		if (sk_fullsock(sk)) {
			struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);

			if (dst)
				dst = dst_check(dst, 0);
			if (dst &&
			    sk->sk_rx_dst_ifindex == skb->skb_iif)
				skb_dst_set_noref(skb, dst);
		}
	}
	return 0;
}
200341063e9dSDavid S. Miller 
/* Queue an skb onto the backlog of a user-owned socket, attempting first
 * to coalesce it with the current backlog tail to save memory.
 * Returns true when the skb was dropped - in that case the socket has
 * been unlocked and *reason set; returns false when the skb was
 * coalesced or queued.
 */
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
		     enum skb_drop_reason *reason)
{
	u32 tail_gso_size, tail_gso_segs;
	struct skb_shared_info *shinfo;
	const struct tcphdr *th;
	struct tcphdr *thtail;
	struct sk_buff *tail;
	unsigned int hdrlen;
	bool fragstolen;
	u32 gso_segs;
	u32 gso_size;
	u64 limit;
	int delta;

	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
	 * we can fix skb->truesize to its real value to avoid future drops.
	 * This is valid because skb is not yet charged to the socket.
	 * It has been noticed pure SACK packets were sometimes dropped
	 * (if cooked by drivers without copybreak feature).
	 */
	skb_condense(skb);

	skb_dst_drop(skb);

	if (unlikely(tcp_checksum_complete(skb))) {
		bh_unlock_sock(sk);
		trace_tcp_bad_csum(skb);
		*reason = SKB_DROP_REASON_TCP_CSUM;
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
		return true;
	}

	/* Attempt coalescing to last skb in backlog, even if we are
	 * above the limits.
	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
	 */
	th = (const struct tcphdr *)skb->data;
	hdrlen = th->doff * 4;

	tail = sk->sk_backlog.tail;
	if (!tail)
		goto no_coalesce;
	thtail = (struct tcphdr *)tail->data;

	/* Coalescing requires: contiguous sequence space, identical DSCP,
	 * no SYN/RST/URG on either segment, ACK set on both, matching
	 * ECE/CWR bits, compatible skbs, and byte-identical TCP options.
	 */
	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
	    ((TCP_SKB_CB(tail)->tcp_flags |
	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
	    !((TCP_SKB_CB(tail)->tcp_flags &
	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
	    ((TCP_SKB_CB(tail)->tcp_flags ^
	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
	    !tcp_skb_can_collapse_rx(tail, skb) ||
	    thtail->doff != th->doff ||
	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
		goto no_coalesce;

	__skb_pull(skb, hdrlen);

	shinfo = skb_shinfo(skb);
	gso_size = shinfo->gso_size ?: skb->len;
	gso_segs = shinfo->gso_segs ?: 1;

	shinfo = skb_shinfo(tail);
	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
	tail_gso_segs = shinfo->gso_segs ?: 1;

	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;

		/* Only advance ack_seq/window, never move them backwards */
		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
			thtail->window = th->window;
		}

		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
		 * thtail->fin, so that the fast path in tcp_rcv_established()
		 * is not entered if we append a packet with a FIN.
		 * SYN, RST, URG are not present.
		 * ACK is set on both packets.
		 * PSH : we do not really care in TCP stack,
		 *       at least for 'GRO' packets.
		 */
		thtail->fin |= th->fin;
		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;

		if (TCP_SKB_CB(skb)->has_rxtstamp) {
			TCP_SKB_CB(tail)->has_rxtstamp = true;
			tail->tstamp = skb->tstamp;
			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
		}

		/* Not as strict as GRO. We only need to carry mss max value */
		shinfo->gso_size = max(gso_size, tail_gso_size);
		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);

		sk->sk_backlog.len += delta;
		__NET_INC_STATS(sock_net(sk),
				LINUX_MIB_TCPBACKLOGCOALESCE);
		kfree_skb_partial(skb, fragstolen);
		return false;
	}
	__skb_push(skb, hdrlen);

no_coalesce:
	/* sk->sk_backlog.len is reset only at the end of __release_sock().
	 * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach
	 * sk_rcvbuf in normal conditions.
	 */
	limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1;

	limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1;

	/* Only socket owner can try to collapse/prune rx queues
	 * to reduce memory overhead, so add a little headroom here.
	 * Few sockets backlog are possibly concurrently non empty.
	 */
	limit += 64 * 1024;

	/* sk_add_backlog() takes a u32 limit */
	limit = min_t(u64, limit, UINT_MAX);

	if (unlikely(sk_add_backlog(sk, skb, limit))) {
		bh_unlock_sock(sk);
		*reason = SKB_DROP_REASON_SOCKET_BACKLOG;
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
		return true;
	}
	return false;
}
EXPORT_SYMBOL(tcp_add_backlog);
2136c9c33212SEric Dumazet 
2137ac6e7800SEric Dumazet int tcp_filter(struct sock *sk, struct sk_buff *skb)
2138ac6e7800SEric Dumazet {
2139ac6e7800SEric Dumazet 	struct tcphdr *th = (struct tcphdr *)skb->data;
2140ac6e7800SEric Dumazet 
2141f2feaefdSChristoph Paasch 	return sk_filter_trim_cap(sk, skb, th->doff * 4);
2142ac6e7800SEric Dumazet }
2143ac6e7800SEric Dumazet EXPORT_SYMBOL(tcp_filter);
2144ac6e7800SEric Dumazet 
/* Inverse of tcp_v4_fill_cb(): move the saved inet control block from
 * TCP_SKB_CB(skb)->header.h4 back into IPCB(skb), so the skb can be
 * re-processed (e.g. fed back through socket lookup).
 */
static void tcp_v4_restore_cb(struct sk_buff *skb)
{
	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
		sizeof(struct inet_skb_parm));
}
2150eeea10b8SEric Dumazet 
/* Populate TCP_SKB_CB() from the IP and TCP headers before the skb is
 * handed to TCP input processing.  The inet control block is first
 * preserved inside header.h4 (tcp_v4_restore_cb() is the inverse).
 */
static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
			   const struct tcphdr *th)
{
	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
	 * barrier() makes sure compiler wont play fool^Waliasing games.
	 */
	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
		sizeof(struct inet_skb_parm));
	barrier();

	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	/* SYN and FIN each consume one unit of sequence space */
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
	TCP_SKB_CB(skb)->sacked	 = 0;
	TCP_SKB_CB(skb)->has_rxtstamp =
			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
}
2171eeea10b8SEric Dumazet 
21721da177e4SLinus Torvalds /*
21731da177e4SLinus Torvalds  *	From tcp_input.c
21741da177e4SLinus Torvalds  */
21751da177e4SLinus Torvalds 
21761da177e4SLinus Torvalds int tcp_v4_rcv(struct sk_buff *skb)
21771da177e4SLinus Torvalds {
21783b24d854SEric Dumazet 	struct net *net = dev_net(skb->dev);
2179643b622bSMenglong Dong 	enum skb_drop_reason drop_reason;
21803fa6f616SDavid Ahern 	int sdif = inet_sdif(skb);
2181534322caSDavid Ahern 	int dif = inet_iif(skb);
2182eddc9ec5SArnaldo Carvalho de Melo 	const struct iphdr *iph;
2183cf533ea5SEric Dumazet 	const struct tcphdr *th;
218446a02aa3SYan Zhai 	struct sock *sk = NULL;
21853b24d854SEric Dumazet 	bool refcounted;
21861da177e4SLinus Torvalds 	int ret;
218741eecbd7SEric Dumazet 	u32 isn;
21881da177e4SLinus Torvalds 
218985125597SMenglong Dong 	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
21901da177e4SLinus Torvalds 	if (skb->pkt_type != PACKET_HOST)
21911da177e4SLinus Torvalds 		goto discard_it;
21921da177e4SLinus Torvalds 
21931da177e4SLinus Torvalds 	/* Count it even if it's bad */
219490bbcc60SEric Dumazet 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
21951da177e4SLinus Torvalds 
21961da177e4SLinus Torvalds 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
21971da177e4SLinus Torvalds 		goto discard_it;
21981da177e4SLinus Torvalds 
2199ea1627c2SEric Dumazet 	th = (const struct tcphdr *)skb->data;
22001da177e4SLinus Torvalds 
220185125597SMenglong Dong 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
220285125597SMenglong Dong 		drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
22031da177e4SLinus Torvalds 		goto bad_packet;
220485125597SMenglong Dong 	}
22051da177e4SLinus Torvalds 	if (!pskb_may_pull(skb, th->doff * 4))
22061da177e4SLinus Torvalds 		goto discard_it;
22071da177e4SLinus Torvalds 
22081da177e4SLinus Torvalds 	/* An explanation is required here, I think.
22091da177e4SLinus Torvalds 	 * Packet length and doff are validated by header prediction,
2210caa20d9aSStephen Hemminger 	 * provided case of th->doff==0 is eliminated.
22111da177e4SLinus Torvalds 	 * So, we defer the checks. */
2212ed70fcfcSTom Herbert 
2213ed70fcfcSTom Herbert 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
22146a5dc9e5SEric Dumazet 		goto csum_error;
22151da177e4SLinus Torvalds 
2216ea1627c2SEric Dumazet 	th = (const struct tcphdr *)skb->data;
2217eddc9ec5SArnaldo Carvalho de Melo 	iph = ip_hdr(skb);
22184bdc3d66SEric Dumazet lookup:
22194461568aSKuniyuki Iwashima 	sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo,
22204461568aSKuniyuki Iwashima 			       skb, __tcp_hdrlen(th), th->source,
22213fa6f616SDavid Ahern 			       th->dest, sdif, &refcounted);
22221da177e4SLinus Torvalds 	if (!sk)
22231da177e4SLinus Torvalds 		goto no_tcp_socket;
22241da177e4SLinus Torvalds 
2225bb134d5dSEric Dumazet 	if (sk->sk_state == TCP_TIME_WAIT)
2226bb134d5dSEric Dumazet 		goto do_time_wait;
2227bb134d5dSEric Dumazet 
2228079096f1SEric Dumazet 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
2229079096f1SEric Dumazet 		struct request_sock *req = inet_reqsk(sk);
2230e0f9759fSEric Dumazet 		bool req_stolen = false;
22317716682cSEric Dumazet 		struct sock *nsk;
2232079096f1SEric Dumazet 
2233079096f1SEric Dumazet 		sk = req->rsk_listener;
22346f0012e3SEric Dumazet 		if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
22356f0012e3SEric Dumazet 			drop_reason = SKB_DROP_REASON_XFRM_POLICY;
22366f0012e3SEric Dumazet 		else
22370a3a8090SDmitry Safonov 			drop_reason = tcp_inbound_hash(sk, req, skb,
22387bbb765bSDmitry Safonov 						       &iph->saddr, &iph->daddr,
22391330b6efSJakub Kicinski 						       AF_INET, dif, sdif);
22401330b6efSJakub Kicinski 		if (unlikely(drop_reason)) {
2241e65c332dSEric Dumazet 			sk_drops_add(sk, skb);
224272923555SEric Dumazet 			reqsk_put(req);
224372923555SEric Dumazet 			goto discard_it;
224472923555SEric Dumazet 		}
22454fd44a98SFrank van der Linden 		if (tcp_checksum_complete(skb)) {
22464fd44a98SFrank van der Linden 			reqsk_put(req);
22474fd44a98SFrank van der Linden 			goto csum_error;
22484fd44a98SFrank van der Linden 		}
22497716682cSEric Dumazet 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
2250d4f2c86bSKuniyuki Iwashima 			nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2251d4f2c86bSKuniyuki Iwashima 			if (!nsk) {
2252f03f2e15SEric Dumazet 				inet_csk_reqsk_queue_drop_and_put(sk, req);
22534bdc3d66SEric Dumazet 				goto lookup;
22544bdc3d66SEric Dumazet 			}
2255d4f2c86bSKuniyuki Iwashima 			sk = nsk;
2256d4f2c86bSKuniyuki Iwashima 			/* reuseport_migrate_sock() has already held one sk_refcnt
2257d4f2c86bSKuniyuki Iwashima 			 * before returning.
2258d4f2c86bSKuniyuki Iwashima 			 */
2259d4f2c86bSKuniyuki Iwashima 		} else {
22603b24d854SEric Dumazet 			/* We own a reference on the listener, increase it again
22613b24d854SEric Dumazet 			 * as we might lose it too soon.
22623b24d854SEric Dumazet 			 */
22637716682cSEric Dumazet 			sock_hold(sk);
2264d4f2c86bSKuniyuki Iwashima 		}
22653b24d854SEric Dumazet 		refcounted = true;
22661f3b359fSEric Dumazet 		nsk = NULL;
2267eeea10b8SEric Dumazet 		if (!tcp_filter(sk, skb)) {
2268eeea10b8SEric Dumazet 			th = (const struct tcphdr *)skb->data;
2269eeea10b8SEric Dumazet 			iph = ip_hdr(skb);
2270eeea10b8SEric Dumazet 			tcp_v4_fill_cb(skb, iph, th);
2271e0f9759fSEric Dumazet 			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2272255f9034SMenglong Dong 		} else {
2273255f9034SMenglong Dong 			drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2274eeea10b8SEric Dumazet 		}
2275079096f1SEric Dumazet 		if (!nsk) {
2276079096f1SEric Dumazet 			reqsk_put(req);
2277e0f9759fSEric Dumazet 			if (req_stolen) {
2278e0f9759fSEric Dumazet 				/* Another cpu got exclusive access to req
2279e0f9759fSEric Dumazet 				 * and created a full blown socket.
2280e0f9759fSEric Dumazet 				 * Try to feed this packet to this socket
2281e0f9759fSEric Dumazet 				 * instead of discarding it.
2282e0f9759fSEric Dumazet 				 */
2283e0f9759fSEric Dumazet 				tcp_v4_restore_cb(skb);
2284e0f9759fSEric Dumazet 				sock_put(sk);
2285e0f9759fSEric Dumazet 				goto lookup;
2286e0f9759fSEric Dumazet 			}
22877716682cSEric Dumazet 			goto discard_and_relse;
2288079096f1SEric Dumazet 		}
22896f0012e3SEric Dumazet 		nf_reset_ct(skb);
2290079096f1SEric Dumazet 		if (nsk == sk) {
2291079096f1SEric Dumazet 			reqsk_put(req);
2292eeea10b8SEric Dumazet 			tcp_v4_restore_cb(skb);
2293ee01defeSJason Xing 		} else {
2294ee01defeSJason Xing 			drop_reason = tcp_child_process(sk, nsk, skb);
2295ee01defeSJason Xing 			if (drop_reason) {
2296120391efSJason Xing 				enum sk_rst_reason rst_reason;
2297120391efSJason Xing 
2298120391efSJason Xing 				rst_reason = sk_rst_convert_drop_reason(drop_reason);
2299120391efSJason Xing 				tcp_v4_send_reset(nsk, skb, rst_reason);
23007716682cSEric Dumazet 				goto discard_and_relse;
2301ee01defeSJason Xing 			}
23027716682cSEric Dumazet 			sock_put(sk);
2303079096f1SEric Dumazet 			return 0;
2304079096f1SEric Dumazet 		}
2305079096f1SEric Dumazet 	}
230614834c4fSEric Dumazet 
2307d13b0596SEric Dumazet process:
2308020e71a3SEric Dumazet 	if (static_branch_unlikely(&ip4_min_ttl)) {
230914834c4fSEric Dumazet 		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
231014834c4fSEric Dumazet 		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
231102a1d6e7SEric Dumazet 			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
23122798e36dSEric Dumazet 			drop_reason = SKB_DROP_REASON_TCP_MINTTL;
2313d218d111SStephen Hemminger 			goto discard_and_relse;
23146cce09f8SEric Dumazet 		}
2315020e71a3SEric Dumazet 	}
2316d218d111SStephen Hemminger 
2317255f9034SMenglong Dong 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2318255f9034SMenglong Dong 		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
23191da177e4SLinus Torvalds 		goto discard_and_relse;
2320255f9034SMenglong Dong 	}
23219ea88a15SDmitry Popov 
23220a3a8090SDmitry Safonov 	drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr,
23230a3a8090SDmitry Safonov 				       AF_INET, dif, sdif);
23241330b6efSJakub Kicinski 	if (drop_reason)
23259ea88a15SDmitry Popov 		goto discard_and_relse;
23269ea88a15SDmitry Popov 
2327895b5c9fSFlorian Westphal 	nf_reset_ct(skb);
23281da177e4SLinus Torvalds 
232985125597SMenglong Dong 	if (tcp_filter(sk, skb)) {
2330364df53cSMenglong Dong 		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
23311da177e4SLinus Torvalds 		goto discard_and_relse;
233285125597SMenglong Dong 	}
2333ac6e7800SEric Dumazet 	th = (const struct tcphdr *)skb->data;
2334ac6e7800SEric Dumazet 	iph = ip_hdr(skb);
2335eeea10b8SEric Dumazet 	tcp_v4_fill_cb(skb, iph, th);
23361da177e4SLinus Torvalds 
23371da177e4SLinus Torvalds 	skb->dev = NULL;
23381da177e4SLinus Torvalds 
2339e994b2f0SEric Dumazet 	if (sk->sk_state == TCP_LISTEN) {
2340e994b2f0SEric Dumazet 		ret = tcp_v4_do_rcv(sk, skb);
2341e994b2f0SEric Dumazet 		goto put_and_return;
2342e994b2f0SEric Dumazet 	}
2343e994b2f0SEric Dumazet 
2344e994b2f0SEric Dumazet 	sk_incoming_cpu_update(sk);
2345e994b2f0SEric Dumazet 
2346c6366184SIngo Molnar 	bh_lock_sock_nested(sk);
2347a44d6eacSMartin KaFai Lau 	tcp_segs_in(tcp_sk(sk), skb);
23481da177e4SLinus Torvalds 	ret = 0;
23491da177e4SLinus Torvalds 	if (!sock_owned_by_user(sk)) {
23501da177e4SLinus Torvalds 		ret = tcp_v4_do_rcv(sk, skb);
23518b27dae5SEric Dumazet 	} else {
23527a26dc9eSMenglong Dong 		if (tcp_add_backlog(sk, skb, &drop_reason))
23536b03a53aSZhu Yi 			goto discard_and_relse;
23546b03a53aSZhu Yi 	}
23551da177e4SLinus Torvalds 	bh_unlock_sock(sk);
23561da177e4SLinus Torvalds 
2357e994b2f0SEric Dumazet put_and_return:
23583b24d854SEric Dumazet 	if (refcounted)
23591da177e4SLinus Torvalds 		sock_put(sk);
23601da177e4SLinus Torvalds 
23611da177e4SLinus Torvalds 	return ret;
23621da177e4SLinus Torvalds 
23631da177e4SLinus Torvalds no_tcp_socket:
236485125597SMenglong Dong 	drop_reason = SKB_DROP_REASON_NO_SOCKET;
23651da177e4SLinus Torvalds 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
23661da177e4SLinus Torvalds 		goto discard_it;
23671da177e4SLinus Torvalds 
2368eeea10b8SEric Dumazet 	tcp_v4_fill_cb(skb, iph, th);
2369eeea10b8SEric Dumazet 
237012e25e10SEric Dumazet 	if (tcp_checksum_complete(skb)) {
23716a5dc9e5SEric Dumazet csum_error:
237285125597SMenglong Dong 		drop_reason = SKB_DROP_REASON_TCP_CSUM;
2373709c0314SJakub Kicinski 		trace_tcp_bad_csum(skb);
237490bbcc60SEric Dumazet 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
23751da177e4SLinus Torvalds bad_packet:
237690bbcc60SEric Dumazet 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
23771da177e4SLinus Torvalds 	} else {
2378120391efSJason Xing 		tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason));
23791da177e4SLinus Torvalds 	}
23801da177e4SLinus Torvalds 
23811da177e4SLinus Torvalds discard_it:
2382f8319dfdSMenglong Dong 	SKB_DR_OR(drop_reason, NOT_SPECIFIED);
23831da177e4SLinus Torvalds 	/* Discard frame. */
238446a02aa3SYan Zhai 	sk_skb_reason_drop(sk, skb, drop_reason);
23851da177e4SLinus Torvalds 	return 0;
23861da177e4SLinus Torvalds 
23871da177e4SLinus Torvalds discard_and_relse:
2388532182cdSEric Dumazet 	sk_drops_add(sk, skb);
23893b24d854SEric Dumazet 	if (refcounted)
23901da177e4SLinus Torvalds 		sock_put(sk);
23911da177e4SLinus Torvalds 	goto discard_it;
23921da177e4SLinus Torvalds 
23931da177e4SLinus Torvalds do_time_wait:
23941da177e4SLinus Torvalds 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2395255f9034SMenglong Dong 		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
23969469c7b4SYOSHIFUJI Hideaki 		inet_twsk_put(inet_twsk(sk));
23971da177e4SLinus Torvalds 		goto discard_it;
23981da177e4SLinus Torvalds 	}
23991da177e4SLinus Torvalds 
2400eeea10b8SEric Dumazet 	tcp_v4_fill_cb(skb, iph, th);
2401eeea10b8SEric Dumazet 
24026a5dc9e5SEric Dumazet 	if (tcp_checksum_complete(skb)) {
24036a5dc9e5SEric Dumazet 		inet_twsk_put(inet_twsk(sk));
24046a5dc9e5SEric Dumazet 		goto csum_error;
24051da177e4SLinus Torvalds 	}
240641eecbd7SEric Dumazet 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn)) {
24071da177e4SLinus Torvalds 	case TCP_TW_SYN: {
24084461568aSKuniyuki Iwashima 		struct sock *sk2 = inet_lookup_listener(net,
24094461568aSKuniyuki Iwashima 							net->ipv4.tcp_death_row.hashinfo,
24104461568aSKuniyuki Iwashima 							skb, __tcp_hdrlen(th),
2411da5e3630STom Herbert 							iph->saddr, th->source,
2412eddc9ec5SArnaldo Carvalho de Melo 							iph->daddr, th->dest,
24133fa6f616SDavid Ahern 							inet_iif(skb),
24143fa6f616SDavid Ahern 							sdif);
24151da177e4SLinus Torvalds 		if (sk2) {
2416dbe7faa4SEric Dumazet 			inet_twsk_deschedule_put(inet_twsk(sk));
24171da177e4SLinus Torvalds 			sk = sk2;
2418eeea10b8SEric Dumazet 			tcp_v4_restore_cb(skb);
24193b24d854SEric Dumazet 			refcounted = false;
242041eecbd7SEric Dumazet 			__this_cpu_write(tcp_tw_isn, isn);
24211da177e4SLinus Torvalds 			goto process;
24221da177e4SLinus Torvalds 		}
24231da177e4SLinus Torvalds 	}
2424fcfd6dfaSGustavo A. R. Silva 		/* to ACK */
2425a8eceea8SJoe Perches 		fallthrough;
24261da177e4SLinus Torvalds 	case TCP_TW_ACK:
24271da177e4SLinus Torvalds 		tcp_v4_timewait_ack(sk, skb);
24281da177e4SLinus Torvalds 		break;
24291da177e4SLinus Torvalds 	case TCP_TW_RST:
243022a32557SJason Xing 		tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET);
2431271c3b9bSFlorian Westphal 		inet_twsk_deschedule_put(inet_twsk(sk));
2432271c3b9bSFlorian Westphal 		goto discard_it;
24331da177e4SLinus Torvalds 	case TCP_TW_SUCCESS:;
24341da177e4SLinus Torvalds 	}
24351da177e4SLinus Torvalds 	goto discard_it;
24361da177e4SLinus Torvalds }
24371da177e4SLinus Torvalds 
/* Per-family TIME_WAIT hooks: object size used when allocating a
 * tcp_timewait_sock and the destructor run when one is freed.
 */
static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
	.twsk_destructor= tcp_twsk_destructor,
};
24421da177e4SLinus Torvalds 
/* Cache the skb's dst entry and incoming interface index on the socket,
 * so subsequent packets for this flow can skip a route lookup.  The dst
 * is only cached when a reference can still be taken (dst_hold_safe()).
 */
void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	if (dst && dst_hold_safe(dst)) {
		/* Publish via rcu_assign_pointer(): sk_rx_dst is read
		 * locklessly by the receive path.
		 */
		rcu_assign_pointer(sk->sk_rx_dst, dst);
		sk->sk_rx_dst_ifindex = skb->skb_iif;
	}
}
EXPORT_SYMBOL(inet_sk_rx_dst_set);
24535d299f3dSEric Dumazet 
/* Address-family dependent operations for TCP over IPv4: transmit,
 * checksum, header rebuild, connection setup and IP-level sockopts.
 * Installed on each socket by tcp_v4_init_sock().
 */
const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.addr2sockaddr	   = inet_csk_addr2sockaddr,
	.sockaddr_len	   = sizeof(struct sockaddr_in),
	.mtu_reduced	   = tcp_v4_mtu_reduced,
};
EXPORT_SYMBOL(ipv4_specific);
24691da177e4SLinus Torvalds 
#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
/* IPv4 signing hooks for TCP-MD5 and/or TCP-AO; installed as
 * tp->af_specific by tcp_v4_init_sock().  Each group of callbacks is
 * compiled in only when the corresponding option is enabled.
 */
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
#ifdef CONFIG_TCP_MD5SIG
	.md5_lookup		= tcp_v4_md5_lookup,
	.calc_md5_hash		= tcp_v4_md5_hash_skb,
	.md5_parse		= tcp_v4_parse_md5_keys,
#endif
#ifdef CONFIG_TCP_AO
	.ao_lookup		= tcp_v4_ao_lookup,
	.calc_ao_hash		= tcp_v4_ao_hash_skb,
	.ao_parse		= tcp_v4_parse_ao,
	.ao_calc_key_sk		= tcp_v4_ao_calc_key_sk,
#endif
};
#endif
2485cfb6eeb4SYOSHIFUJI Hideaki 
/* NOTE: A lot of things set to zero explicitly by call to
 *       sk_alloc() so need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	/* Generic, af-independent TCP socket initialization. */
	tcp_init_sock(sk);

	/* Install the IPv4 flavour of the af-dependent operations. */
	icsk->icsk_af_ops = &ipv4_specific;

#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
#endif

	return 0;
}
25031da177e4SLinus Torvalds 
#ifdef CONFIG_TCP_MD5SIG
/* RCU callback that frees a socket's MD5 signing info once all lockless
 * readers are done, then drops the static-branch and signing-pool
 * references that were taken when the info was installed.
 */
static void tcp_md5sig_info_free_rcu(struct rcu_head *head)
{
	struct tcp_md5sig_info *md5sig;

	md5sig = container_of(head, struct tcp_md5sig_info, rcu);
	kfree(md5sig);
	static_branch_slow_dec_deferred(&tcp_md5_needed);
	tcp_md5_release_sigpool();
}
#endif
25158c73b263SDmitry Safonov 
/* Release the page-pool references for every fragment still tracked in
 * sk->sk_user_frags.  A failed napi_pp_put_page() indicates a refcount
 * imbalance, hence the WARN.  No-op without CONFIG_PAGE_POOL.
 */
static void tcp_release_user_frags(struct sock *sk)
{
#ifdef CONFIG_PAGE_POOL
	unsigned long index;
	void *netmem;

	xa_for_each(&sk->sk_user_frags, index, netmem)
		WARN_ON_ONCE(!napi_pp_put_page((__force netmem_ref)netmem));
#endif
}
2526*8f0b3cc9SMina Almasry 
/* Final teardown of a TCP socket: release every per-socket resource
 * (user frags, timers, queues, crypto state, bind bucket) before the
 * sock itself is freed.
 */
void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tcp_release_user_frags(sk);

	xa_destroy(&sk->sk_user_frags);

	trace_tcp_destroy_sock(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	tcp_cleanup_ulp(sk);

	/* Cleanup up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Check if we want to disable active TFO */
	tcp_fastopen_active_disable_ofo_check(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	skb_rbtree_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
	/* Clean up the MD5 key list, if any */
	if (tp->md5sig_info) {
		struct tcp_md5sig_info *md5sig;

		md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
		tcp_clear_md5_list(sk);
		/* Defer freeing the info struct past an RCU grace period
		 * so concurrent lockless readers stay safe.
		 */
		call_rcu(&md5sig->rcu, tcp_md5sig_info_free_rcu);
		rcu_assign_pointer(tp->md5sig_info, NULL);
	}
#endif
	tcp_ao_destroy_sock(sk, false);

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));

	/* If socket is aborted during connect operation */
	tcp_free_fastopen_req(tp);
	tcp_fastopen_destroy_cipher(sk);
	tcp_saved_syn_free(tp);

	sk_sockets_allocated_dec(sk);
}
EXPORT_SYMBOL(tcp_v4_destroy_sock);
25791da177e4SLinus Torvalds 
25801da177e4SLinus Torvalds #ifdef CONFIG_PROC_FS
25811da177e4SLinus Torvalds /* Proc filesystem TCP sock list dumping. */
25821da177e4SLinus Torvalds 
2583ad2d6137SMartin KaFai Lau static unsigned short seq_file_family(const struct seq_file *seq);
2584ad2d6137SMartin KaFai Lau 
2585ad2d6137SMartin KaFai Lau static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2586ad2d6137SMartin KaFai Lau {
2587ad2d6137SMartin KaFai Lau 	unsigned short family = seq_file_family(seq);
2588ad2d6137SMartin KaFai Lau 
2589ad2d6137SMartin KaFai Lau 	/* AF_UNSPEC is used as a match all */
2590ad2d6137SMartin KaFai Lau 	return ((family == AF_UNSPEC || family == sk->sk_family) &&
2591ad2d6137SMartin KaFai Lau 		net_eq(sock_net(sk), seq_file_net(seq)));
2592ad2d6137SMartin KaFai Lau }
2593ad2d6137SMartin KaFai Lau 
/* Find a non empty bucket (starting from st->bucket)
 * and return the first sk from it.
 * On success the bucket's spinlock (ilb2->lock) is left HELD; it is
 * released later by listening_get_next() or tcp_seq_stop().
 */
static void *listening_get_first(struct seq_file *seq)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct tcp_iter_state *st = seq->private;

	st->offset = 0;
	for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
		struct inet_listen_hashbucket *ilb2;
		struct hlist_nulls_node *node;
		struct sock *sk;

		ilb2 = &hinfo->lhash2[st->bucket];
		/* Skip empty buckets without taking the lock. */
		if (hlist_nulls_empty(&ilb2->nulls_head))
			continue;

		spin_lock(&ilb2->lock);
		sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
			if (seq_sk_match(seq, sk))
				return sk;
		}
		spin_unlock(&ilb2->lock);
	}

	return NULL;
}
2622b72acf45SMartin KaFai Lau 
/* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
 * If "cur" is the last one in the st->bucket,
 * call listening_get_first() to return the first sk of the next
 * non empty bucket.
 */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_iter_state *st = seq->private;
	struct inet_listen_hashbucket *ilb2;
	struct hlist_nulls_node *node;
	struct inet_hashinfo *hinfo;
	struct sock *sk = cur;

	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);
	sk_nulls_for_each_from(sk, node) {
		if (seq_sk_match(seq, sk))
			return sk;
	}

	/* Bucket exhausted: drop the lock taken by listening_get_first()
	 * and continue with the next non-empty bucket.
	 */
	hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	ilb2 = &hinfo->lhash2[st->bucket];
	spin_unlock(&ilb2->lock);
	++st->bucket;
	return listening_get_first(seq);
}
26511da177e4SLinus Torvalds 
26521da177e4SLinus Torvalds static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
26531da177e4SLinus Torvalds {
2654a8b690f9STom Herbert 	struct tcp_iter_state *st = seq->private;
2655a8b690f9STom Herbert 	void *rc;
2656a8b690f9STom Herbert 
2657a8b690f9STom Herbert 	st->bucket = 0;
2658a8b690f9STom Herbert 	st->offset = 0;
2659b72acf45SMartin KaFai Lau 	rc = listening_get_first(seq);
26601da177e4SLinus Torvalds 
26611da177e4SLinus Torvalds 	while (rc && *pos) {
26621da177e4SLinus Torvalds 		rc = listening_get_next(seq, rc);
26631da177e4SLinus Torvalds 		--*pos;
26641da177e4SLinus Torvalds 	}
26651da177e4SLinus Torvalds 	return rc;
26661da177e4SLinus Torvalds }
26671da177e4SLinus Torvalds 
26684461568aSKuniyuki Iwashima static inline bool empty_bucket(struct inet_hashinfo *hinfo,
26694461568aSKuniyuki Iwashima 				const struct tcp_iter_state *st)
26706eac5604SAndi Kleen {
26714461568aSKuniyuki Iwashima 	return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
26726eac5604SAndi Kleen }
26736eac5604SAndi Kleen 
/*
 * Get first established socket starting from bucket given in st->bucket.
 * If st->bucket is zero, the very first socket in the hash is returned.
 * On success the bucket's ehash lock is left HELD (released later by
 * established_get_next() or tcp_seq_stop()).
 */
static void *established_get_first(struct seq_file *seq)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct tcp_iter_state *st = seq->private;

	st->offset = 0;
	for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
		struct sock *sk;
		struct hlist_nulls_node *node;
		spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);

		/* The table can be huge; yield between buckets. */
		cond_resched();

		/* Lockless fast path for the common case of empty buckets */
		if (empty_bucket(hinfo, st))
			continue;

		spin_lock_bh(lock);
		sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
			if (seq_sk_match(seq, sk))
				return sk;
		}
		spin_unlock_bh(lock);
	}

	return NULL;
}
27051da177e4SLinus Torvalds 
/* Advance to the sock after "cur" in the established hash.  When the
 * current bucket is exhausted, drop its lock (taken by
 * established_get_first()) and continue with the next bucket.
 */
static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct tcp_iter_state *st = seq->private;
	struct hlist_nulls_node *node;
	struct sock *sk = cur;

	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);

	sk_nulls_for_each_from(sk, node) {
		if (seq_sk_match(seq, sk))
			return sk;
	}

	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
	++st->bucket;
	return established_get_first(seq);
}
27271da177e4SLinus Torvalds 
27281da177e4SLinus Torvalds static void *established_get_idx(struct seq_file *seq, loff_t pos)
27291da177e4SLinus Torvalds {
2730a8b690f9STom Herbert 	struct tcp_iter_state *st = seq->private;
2731a8b690f9STom Herbert 	void *rc;
2732a8b690f9STom Herbert 
2733a8b690f9STom Herbert 	st->bucket = 0;
2734a8b690f9STom Herbert 	rc = established_get_first(seq);
27351da177e4SLinus Torvalds 
27361da177e4SLinus Torvalds 	while (rc && pos) {
27371da177e4SLinus Torvalds 		rc = established_get_next(seq, rc);
27381da177e4SLinus Torvalds 		--pos;
27391da177e4SLinus Torvalds 	}
27401da177e4SLinus Torvalds 	return rc;
27411da177e4SLinus Torvalds }
27421da177e4SLinus Torvalds 
27431da177e4SLinus Torvalds static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
27441da177e4SLinus Torvalds {
27451da177e4SLinus Torvalds 	void *rc;
27461da177e4SLinus Torvalds 	struct tcp_iter_state *st = seq->private;
27471da177e4SLinus Torvalds 
27481da177e4SLinus Torvalds 	st->state = TCP_SEQ_STATE_LISTENING;
27491da177e4SLinus Torvalds 	rc	  = listening_get_idx(seq, &pos);
27501da177e4SLinus Torvalds 
27511da177e4SLinus Torvalds 	if (!rc) {
27521da177e4SLinus Torvalds 		st->state = TCP_SEQ_STATE_ESTABLISHED;
27531da177e4SLinus Torvalds 		rc	  = established_get_idx(seq, pos);
27541da177e4SLinus Torvalds 	}
27551da177e4SLinus Torvalds 
27561da177e4SLinus Torvalds 	return rc;
27571da177e4SLinus Torvalds }
27581da177e4SLinus Torvalds 
/* Fast-resume iteration at the position cached in st (state/bucket/
 * offset) instead of rescanning both hash tables from the start.  Used
 * by tcp_seq_start() when *pos equals the last position handed out.
 */
static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct tcp_iter_state *st = seq->private;
	int bucket = st->bucket;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket > hinfo->lhash2_mask)
			break;
		rc = listening_get_first(seq);
		/* Re-walk the saved offset, but only while we stay in the
		 * same bucket: entries may have vanished meanwhile.
		 */
		while (offset-- && rc && bucket == st->bucket)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		fallthrough;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (st->bucket > hinfo->ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc && bucket == st->bucket)
			rc = established_get_next(seq, rc);
	}

	/* get_next() bumped st->num while re-walking; restore it. */
	st->num = orig_num;

	return rc;
}
2792a8b690f9STom Herbert 
/* seq_file ->start() for the TCP /proc interfaces: try the cached
 * fast-resume path first, otherwise restart iteration from scratch and
 * skip forward to *pos.  Returns SEQ_START_TOKEN for the header line.
 */
void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	if (*pos && *pos == st->last_pos) {
		rc = tcp_seek_last_pos(seq);
		if (rc)
			goto out;
	}

	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	st->bucket = 0;
	st->offset = 0;
	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;

out:
	st->last_pos = *pos;
	return rc;
}
EXPORT_SYMBOL(tcp_seq_start);
28151da177e4SLinus Torvalds 
/* seq_file ->next(): advance to the following socket, switching from
 * the listening table to the established table when the former runs
 * out, and record the new position for tcp_seq_start()'s fast resume.
 */
void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			rc	  = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	st->last_pos = *pos;
	return rc;
}
EXPORT_SYMBOL(tcp_seq_next);
28461da177e4SLinus Torvalds 
/* seq_file ->stop(): release whichever bucket lock the get_first()/
 * get_next() helpers left held for the current iteration state.
 * Nothing is held when iteration never yielded a socket (v == NULL or
 * only the SEQ_START_TOKEN header was emitted).
 */
void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			spin_unlock(&hinfo->lhash2[st->bucket].lock);
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
		break;
	}
}
EXPORT_SYMBOL(tcp_seq_stop);
28641da177e4SLinus Torvalds 
/* Emit one line for a SYN_RECV request sock in /proc/net/tcp format.
 * @i is the row index printed in the first column.
 */
static void get_openreq4(const struct request_sock *req,
			 struct seq_file *f, int i)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	/* Time until the SYN-ACK retransmit timer fires. */
	long delta = req->rsk_timer.expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
		i,
		ireq->ir_loc_addr,
		ireq->ir_num,
		ireq->ir_rmt_addr,
		ntohs(ireq->ir_rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_delta_to_clock_t(delta),
		req->num_timeout,
		from_kuid_munged(seq_user_ns(f),
				 sock_i_uid(req->rsk_listener)),
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		0,
		req);
}
28901da177e4SLinus Torvalds 
/* Emit one line for a full socket in /proc/net/tcp format.  Runs
 * without the socket lock, so several fields are sampled with
 * READ_ONCE() and may be transiently inconsistent.
 */
static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
{
	int timer_active;
	unsigned long timer_expires;
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct inet_sock *inet = inet_sk(sk);
	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	int rx_queue;
	int state;

	/* Encode the pending timer: 1 = retransmit-class (RTO, RACK reo,
	 * loss probe), 4 = zero-window probe, 2 = sk_timer (presumably
	 * keepalive here -- NOTE(review): confirm), 0 = none.
	 */
	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
		timer_active	= 1;
		timer_expires	= icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= icsk->icsk_timeout;
	} else if (timer_pending(&sk->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sk->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires = jiffies;
	}

	state = inet_sk_state_load(sk);
	if (state == TCP_LISTEN)
		/* For listeners the "rx queue" column is the accept backlog. */
		rx_queue = READ_ONCE(sk->sk_ack_backlog);
	else
		/* Because we don't lock the socket,
		 * we might find a transient negative value.
		 */
		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
				      READ_ONCE(tp->copied_seq), 0);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
		i, src, srcp, dest, destp, state,
		READ_ONCE(tp->write_seq) - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_delta_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
		icsk->icsk_probes_out,
		sock_i_ino(sk),
		refcount_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
		tcp_snd_cwnd(tp),
		state == TCP_LISTEN ?
		    fastopenq->max_qlen :
		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}
29521da177e4SLinus Torvalds 
/* Format one TIME_WAIT socket as a /proc/net/tcp row.  Columns that have
 * no meaning for a timewait mini-socket (tx/rx queue, retransmits, uid,
 * inode, ...) are emitted as literal zeros; the timer column is always
 * type 3 (the timewait timer) with its remaining lifetime.
 */
static void get_timewait4_sock(const struct inet_timewait_sock *tw,
			       struct seq_file *f, int i)
{
	/* Remaining timewait lifetime in jiffies (may go transiently
	 * negative; converted via jiffies_delta_to_clock_t() below).
	 */
	long delta = tw->tw_timer.expires - jiffies;
	__be32 dest, src;
	__u16 destp, srcp;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
		i, src, srcp, dest, destp, READ_ONCE(tw->tw_substate), 0, 0,
		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
		refcount_read(&tw->tw_refcnt), tw);
}
29711da177e4SLinus Torvalds 
/* Fixed record width for /proc/net/tcp rows (including the trailing '\n'). */
#define TMPSZ 150

/* seq_file show() for /proc/net/tcp: print the header line, or dispatch
 * the current socket to the formatter matching its flavour (full sock,
 * timewait mini-sock, or SYN_RECV request sock).
 */
static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	struct sock *sk = v;

	/* Pad every record to TMPSZ - 1 so rows are fixed width
	 * (seq_pad() appends the '\n' at the end).
	 */
	seq_setwidth(seq, TMPSZ - 1);
	if (v == SEQ_START_TOKEN) {
		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	/* Timewait and request socks are "mini" sockets with their own
	 * layouts; everything else is a full tcp socket.
	 */
	if (sk->sk_state == TCP_TIME_WAIT)
		get_timewait4_sock(v, seq, st->num);
	else if (sk->sk_state == TCP_NEW_SYN_RECV)
		get_openreq4(v, seq, st->num);
	else
		get_tcp4_sock(v, seq, st->num);
out:
	seq_pad(seq, '\n');
	return 0;
}
29981da177e4SLinus Torvalds 
299952d87d5fSYonghong Song #ifdef CONFIG_BPF_SYSCALL
/* Private state of the bpf tcp iterator: extends tcp_iter_state with a
 * batch of referenced sockets so the bpf program can be run on each
 * socket without the hash bucket lock held.
 */
struct bpf_tcp_iter_state {
	struct tcp_iter_state state;	/* base seq_file iterator state */
	unsigned int cur_sk;		/* next batch index to show */
	unsigned int end_sk;		/* number of sockets in the batch */
	unsigned int max_sk;		/* capacity of @batch */
	struct sock **batch;		/* sockets with a reference held */
	bool st_bucket_done;		/* whole bucket fit into @batch */
};
300804c7820bSMartin KaFai Lau 
/* Context handed to the attached bpf program for each iterated socket. */
struct bpf_iter__tcp {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct sock_common *, sk_common);
	uid_t uid __aligned(8);
};
301452d87d5fSYonghong Song 
301552d87d5fSYonghong Song static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
301652d87d5fSYonghong Song 			     struct sock_common *sk_common, uid_t uid)
301752d87d5fSYonghong Song {
301852d87d5fSYonghong Song 	struct bpf_iter__tcp ctx;
301952d87d5fSYonghong Song 
302052d87d5fSYonghong Song 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
302152d87d5fSYonghong Song 	ctx.meta = meta;
302252d87d5fSYonghong Song 	ctx.sk_common = sk_common;
302352d87d5fSYonghong Song 	ctx.uid = uid;
302452d87d5fSYonghong Song 	return bpf_iter_run_prog(prog, &ctx);
302552d87d5fSYonghong Song }
302652d87d5fSYonghong Song 
302704c7820bSMartin KaFai Lau static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
302804c7820bSMartin KaFai Lau {
302904c7820bSMartin KaFai Lau 	while (iter->cur_sk < iter->end_sk)
3030580031ffSMartin KaFai Lau 		sock_gen_put(iter->batch[iter->cur_sk++]);
303104c7820bSMartin KaFai Lau }
303204c7820bSMartin KaFai Lau 
303304c7820bSMartin KaFai Lau static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
303404c7820bSMartin KaFai Lau 				      unsigned int new_batch_sz)
303504c7820bSMartin KaFai Lau {
303604c7820bSMartin KaFai Lau 	struct sock **new_batch;
303704c7820bSMartin KaFai Lau 
303804c7820bSMartin KaFai Lau 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
303904c7820bSMartin KaFai Lau 			     GFP_USER | __GFP_NOWARN);
304004c7820bSMartin KaFai Lau 	if (!new_batch)
304104c7820bSMartin KaFai Lau 		return -ENOMEM;
304204c7820bSMartin KaFai Lau 
304304c7820bSMartin KaFai Lau 	bpf_iter_tcp_put_batch(iter);
304404c7820bSMartin KaFai Lau 	kvfree(iter->batch);
304504c7820bSMartin KaFai Lau 	iter->batch = new_batch;
304604c7820bSMartin KaFai Lau 	iter->max_sk = new_batch_sz;
304704c7820bSMartin KaFai Lau 
304804c7820bSMartin KaFai Lau 	return 0;
304904c7820bSMartin KaFai Lau }
305004c7820bSMartin KaFai Lau 
/* Take references on all sockets of the current listening (lhash2)
 * bucket that match this iterator, starting from @start_sk.  Returns the
 * total number of matching sockets in the bucket, which may exceed what
 * actually fit into iter->batch; the caller compares the return value
 * against iter->end_sk and reallocates the batch if they differ.
 *
 * NOTE(review): this releases the lhash2 bucket spinlock that was taken
 * earlier on the batching path (via tcp_seek_last_pos()) — it must only
 * be called with that lock held.
 */
static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
						 struct sock *start_sk)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	struct hlist_nulls_node *node;
	unsigned int expected = 1;
	struct sock *sk;

	/* start_sk already matched; always batch it. */
	sock_hold(start_sk);
	iter->batch[iter->end_sk++] = start_sk;

	sk = sk_nulls_next(start_sk);
	sk_nulls_for_each_from(sk, node) {
		if (seq_sk_match(seq, sk)) {
			if (iter->end_sk < iter->max_sk) {
				sock_hold(sk);
				iter->batch[iter->end_sk++] = sk;
			}
			/* Count the socket even when the batch is full. */
			expected++;
		}
	}
	spin_unlock(&hinfo->lhash2[st->bucket].lock);

	return expected;
}
307804c7820bSMartin KaFai Lau 
/* Established-hash counterpart of bpf_iter_tcp_listening_batch(): batch
 * all matching sockets of the current ehash bucket starting at @start_sk
 * and return the total match count (which may exceed iter->max_sk).
 *
 * NOTE(review): releases the ehash bucket lock (BH-disabled variant)
 * taken earlier on the batching path — must be called with it held.
 */
static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
						   struct sock *start_sk)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	struct hlist_nulls_node *node;
	unsigned int expected = 1;
	struct sock *sk;

	/* start_sk already matched; always batch it. */
	sock_hold(start_sk);
	iter->batch[iter->end_sk++] = start_sk;

	sk = sk_nulls_next(start_sk);
	sk_nulls_for_each_from(sk, node) {
		if (seq_sk_match(seq, sk)) {
			if (iter->end_sk < iter->max_sk) {
				sock_hold(sk);
				iter->batch[iter->end_sk++] = sk;
			}
			/* Count the socket even when the batch is full. */
			expected++;
		}
	}
	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));

	return expected;
}
310604c7820bSMartin KaFai Lau 
/* Refill iter->batch with the sockets of the next hash bucket to show.
 * Returns the first socket of the new batch, or NULL when both hash
 * tables have been fully walked.  If the bucket holds more matching
 * sockets than the batch can carry, the batch is grown once (with 50%
 * slack) and the bucket is re-read.
 */
static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	unsigned int expected;
	bool resized = false;
	struct sock *sk;

	/* The st->bucket is done.  Directly advance to the next
	 * bucket instead of having the tcp_seek_last_pos() to skip
	 * one by one in the current bucket and eventually find out
	 * it has to advance to the next bucket.
	 */
	if (iter->st_bucket_done) {
		st->offset = 0;
		st->bucket++;
		if (st->state == TCP_SEQ_STATE_LISTENING &&
		    st->bucket > hinfo->lhash2_mask) {
			/* Listening hash exhausted: move on to the ehash. */
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
		}
	}

again:
	/* Get a new batch */
	iter->cur_sk = 0;
	iter->end_sk = 0;
	iter->st_bucket_done = false;

	sk = tcp_seek_last_pos(seq);
	if (!sk)
		return NULL; /* Done */

	/* The batch helpers return the bucket's true match count, which can
	 * exceed what fit into iter->batch (iter->end_sk).
	 */
	if (st->state == TCP_SEQ_STATE_LISTENING)
		expected = bpf_iter_tcp_listening_batch(seq, sk);
	else
		expected = bpf_iter_tcp_established_batch(seq, sk);

	if (iter->end_sk == expected) {
		/* Whole bucket captured. */
		iter->st_bucket_done = true;
		return sk;
	}

	/* Batch was too small: grow it once and retry this bucket. */
	if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
		resized = true;
		goto again;
	}

	return sk;
}
315804c7820bSMartin KaFai Lau 
315904c7820bSMartin KaFai Lau static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
316004c7820bSMartin KaFai Lau {
316104c7820bSMartin KaFai Lau 	/* bpf iter does not support lseek, so it always
316204c7820bSMartin KaFai Lau 	 * continue from where it was stop()-ped.
316304c7820bSMartin KaFai Lau 	 */
316404c7820bSMartin KaFai Lau 	if (*pos)
316504c7820bSMartin KaFai Lau 		return bpf_iter_tcp_batch(seq);
316604c7820bSMartin KaFai Lau 
316704c7820bSMartin KaFai Lau 	return SEQ_START_TOKEN;
316804c7820bSMartin KaFai Lau }
316904c7820bSMartin KaFai Lau 
/* seq_file next(): release the just-shown socket, then return the next
 * batched socket, refilling the batch from the hash tables when the
 * current batch is exhausted.
 */
static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	struct sock *sk;

	/* Whenever seq_next() is called, the iter->cur_sk is
	 * done with seq_show(), so advance to the next sk in
	 * the batch.
	 */
	if (iter->cur_sk < iter->end_sk) {
		/* Keeping st->num consistent in tcp_iter_state.
		 * bpf_iter_tcp does not use st->num.
		 * meta.seq_num is used instead.
		 */
		st->num++;
		/* Move st->offset to the next sk in the bucket such that
		 * the future start() will resume at st->offset in
		 * st->bucket.  See tcp_seek_last_pos().
		 */
		st->offset++;
		sock_gen_put(iter->batch[iter->cur_sk++]);
	}

	if (iter->cur_sk < iter->end_sk)
		sk = iter->batch[iter->cur_sk];
	else
		sk = bpf_iter_tcp_batch(seq);

	++*pos;
	/* Keeping st->last_pos consistent in tcp_iter_state.
	 * bpf iter does not do lseek, so st->last_pos always equals to *pos.
	 */
	st->last_pos = *pos;
	return sk;
}
320604c7820bSMartin KaFai Lau 
/* seq_file show() for the bpf tcp iterator: derive the socket's uid and
 * run the attached bpf program on it, holding the socket lock for full
 * sockets (timewait/request mini-socks cannot be locked).
 */
static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
{
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;
	struct sock *sk = v;
	uid_t uid;
	int ret;

	if (v == SEQ_START_TOKEN)
		return 0;

	if (sk_fullsock(sk))
		lock_sock(sk);

	/* The socket may have been unhashed between batching and show();
	 * skip it in that case.
	 */
	if (unlikely(sk_unhashed(sk))) {
		ret = SEQ_SKIP;
		goto unlock;
	}

	if (sk->sk_state == TCP_TIME_WAIT) {
		/* Timewait socks are reported with uid 0. */
		uid = 0;
	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
		const struct request_sock *req = v;

		/* Request socks report their listener's uid. */
		uid = from_kuid_munged(seq_user_ns(seq),
				       sock_i_uid(req->rsk_listener));
	} else {
		uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
	}

	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, false);
	ret = tcp_prog_seq_show(prog, &meta, v, uid);

unlock:
	if (sk_fullsock(sk))
		release_sock(sk);
	return ret;

}
324752d87d5fSYonghong Song 
/* seq_file stop(): when the walk finished (v == NULL), give the bpf
 * program one final callback with a NULL socket, then drop the
 * references of any batched-but-unshown sockets so they are not leaked
 * across a stop()/start() cycle.
 */
static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;

	if (!v) {
		meta.seq = seq;
		prog = bpf_iter_get_info(&meta, true);
		if (prog)
			(void)tcp_prog_seq_show(prog, &meta, v, 0);
	}

	if (iter->cur_sk < iter->end_sk) {
		bpf_iter_tcp_put_batch(iter);
		iter->st_bucket_done = false;
	}
}
326652d87d5fSYonghong Song 
326752d87d5fSYonghong Song static const struct seq_operations bpf_iter_tcp_seq_ops = {
326852d87d5fSYonghong Song 	.show		= bpf_iter_tcp_seq_show,
326904c7820bSMartin KaFai Lau 	.start		= bpf_iter_tcp_seq_start,
327004c7820bSMartin KaFai Lau 	.next		= bpf_iter_tcp_seq_next,
327152d87d5fSYonghong Song 	.stop		= bpf_iter_tcp_seq_stop,
327252d87d5fSYonghong Song };
327352d87d5fSYonghong Song #endif
/* Address family this seq_file session should report: the family stored
 * in the proc entry for /proc readers, or AF_UNSPEC for bpf_iter
 * sessions (which have no afinfo and filter in the bpf program).
 */
static unsigned short seq_file_family(const struct seq_file *seq)
{
	const struct tcp_seq_afinfo *afinfo;

#ifdef CONFIG_BPF_SYSCALL
	/* Iterated from bpf_iter.  Let the bpf prog to filter instead. */
	if (seq->op == &bpf_iter_tcp_seq_ops)
		return AF_UNSPEC;
#endif

	/* Iterated from proc fs */
	afinfo = pde_data(file_inode(seq->file));
	return afinfo->family;
}
328852d87d5fSYonghong Song 
/* seq_file operations behind /proc/net/tcp. */
static const struct seq_operations tcp4_seq_ops = {
	.show		= tcp4_seq_show,
	.start		= tcp_seq_start,
	.next		= tcp_seq_next,
	.stop		= tcp_seq_stop,
};
329537d849bbSChristoph Hellwig 
/* Restricts the shared tcp seq_file iterator to IPv4 sockets. */
static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.family		= AF_INET,
};
32991da177e4SLinus Torvalds 
33002c8c1e72SAlexey Dobriyan static int __net_init tcp4_proc_init_net(struct net *net)
3301757764f6SPavel Emelyanov {
3302c3506372SChristoph Hellwig 	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3303c3506372SChristoph Hellwig 			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
330437d849bbSChristoph Hellwig 		return -ENOMEM;
330537d849bbSChristoph Hellwig 	return 0;
3306757764f6SPavel Emelyanov }
3307757764f6SPavel Emelyanov 
/* Remove the per-netns /proc/net/tcp entry. */
static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	remove_proc_entry("tcp", net->proc_net);
}
3312757764f6SPavel Emelyanov 
/* Per-network-namespace setup/teardown of /proc/net/tcp. */
static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};
3317757764f6SPavel Emelyanov 
/* Boot-time registration of the /proc/net/tcp pernet operations. */
int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}
33221da177e4SLinus Torvalds 
/* Counterpart of tcp4_proc_init(). */
void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
33271da177e4SLinus Torvalds #endif /* CONFIG_PROC_FS */
33281da177e4SLinus Torvalds 
3329d3cd4924SEric Dumazet /* @wake is one when sk_stream_write_space() calls us.
3330d3cd4924SEric Dumazet  * This sends EPOLLOUT only if notsent_bytes is half the limit.
3331d3cd4924SEric Dumazet  * This mimics the strategy used in sock_def_write_space().
3332d3cd4924SEric Dumazet  */
3333d3cd4924SEric Dumazet bool tcp_stream_memory_free(const struct sock *sk, int wake)
3334d3cd4924SEric Dumazet {
3335d3cd4924SEric Dumazet 	const struct tcp_sock *tp = tcp_sk(sk);
3336d3cd4924SEric Dumazet 	u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3337d3cd4924SEric Dumazet 			    READ_ONCE(tp->snd_nxt);
3338d3cd4924SEric Dumazet 
3339d3cd4924SEric Dumazet 	return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3340d3cd4924SEric Dumazet }
3341d3cd4924SEric Dumazet EXPORT_SYMBOL(tcp_stream_memory_free);
3342d3cd4924SEric Dumazet 
/* The IPv4 TCP protocol descriptor: wires the generic socket layer to
 * the TCP implementation (connection management, socket options, data
 * path, hashing and memory accounting).
 */
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	/* Connection lifecycle. */
	.close			= tcp_close,
	.pre_connect		= tcp_v4_pre_connect,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	/* Socket options. */
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
	.keepalive		= tcp_set_keepalive,
	/* Data path. */
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.splice_eof		= tcp_splice_eof,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	/* Hash table / port management. */
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.put_port		= inet_put_port,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot	= tcp_bpf_update_proto,
#endif
	/* Memory accounting and pressure handling. */
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.leave_memory_pressure	= tcp_leave_memory_pressure,
	.stream_memory_free	= tcp_stream_memory_free,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,

	.memory_allocated	= &tcp_memory_allocated,
	.per_cpu_fw_alloc	= &tcp_memory_per_cpu_fw_alloc,

	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	/* Per-netns ehash is selected at netns setup (see tcp_set_hashinfo()). */
	.h.hashinfo		= NULL,
	.no_autobind		= true,
	.diag_destroy		= tcp_abort,
};
EXPORT_SYMBOL(tcp_prot);
33941da177e4SLinus Torvalds 
/* Per-netns teardown: drop the module/bpf reference taken on the
 * congestion control in tcp_sk_init(); the pointer may be NULL if netns
 * setup failed before it was assigned.
 */
static void __net_exit tcp_sk_exit(struct net *net)
{
	if (net->ipv4.tcp_congestion_control)
		bpf_module_put(net->ipv4.tcp_congestion_control,
			       net->ipv4.tcp_congestion_control->owner);
}
3401bdbbb852SEric Dumazet 
/* Pick the established-hash table for a new netns: a private table sized
 * by the creating netns' sysctl_tcp_child_ehash_entries, or the global
 * tcp_hashinfo when that sysctl is 0 (or the allocation fails).  Also
 * derives max_tw_buckets and max_syn_backlog from the table size.
 *
 * Note the unusual control flow: the "fallback" label sits inside the
 * allocation-failure branch so both early gotos and the pr_warn() path
 * share the global-table assignment.
 */
static void __net_init tcp_set_hashinfo(struct net *net)
{
	struct inet_hashinfo *hinfo;
	unsigned int ehash_entries;
	struct net *old_net;

	if (net_eq(net, &init_net))
		goto fallback;

	/* The sysctl is read from the netns that creates this one. */
	old_net = current->nsproxy->net_ns;
	ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
	if (!ehash_entries)
		goto fallback;

	ehash_entries = roundup_pow_of_two(ehash_entries);
	hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
	if (!hinfo) {
		pr_warn("Failed to allocate TCP ehash (entries: %u) "
			"for a netns, fallback to the global one\n",
			ehash_entries);
fallback:
		hinfo = &tcp_hashinfo;
		ehash_entries = tcp_hashinfo.ehash_mask + 1;
	}

	net->ipv4.tcp_death_row.hashinfo = hinfo;
	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
	net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
}
3431d1e5e640SKuniyuki Iwashima 
3432bdbbb852SEric Dumazet static int __net_init tcp_sk_init(struct net *net)
3433bdbbb852SEric Dumazet {
3434bdbbb852SEric Dumazet 	net->ipv4.sysctl_tcp_ecn = 2;
343549213555SDaniel Borkmann 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
343649213555SDaniel Borkmann 
3437b0f9ca53SFan Du 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
34385f3e2bf0SEric Dumazet 	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
34396b58e0a5SFan Du 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
344005cbc0dbSFan Du 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3441c04b79b6SJosh Hunt 	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3442bdbbb852SEric Dumazet 
344313b287e8SNikolay Borisov 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
34449bd6861bSNikolay Borisov 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3445b840d15dSNikolay Borisov 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
344613b287e8SNikolay Borisov 
34476fa25166SNikolay Borisov 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
34487c083ecbSNikolay Borisov 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
34490aca737dSDavid S. Miller 	net->ipv4.sysctl_tcp_syncookies = 1;
34501043e25fSNikolay Borisov 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3451ae5c3f40SNikolay Borisov 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3452c6214a97SNikolay Borisov 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3453c402d9beSNikolay Borisov 	net->ipv4.sysctl_tcp_orphan_retries = 0;
34541e579caaSNikolay Borisov 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
34554979f2d9SNikolay Borisov 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
345679e9fed4SMaciej Żenczykowski 	net->ipv4.sysctl_tcp_tw_reuse = 2;
345765e6d901SKevin(Yudong) Yang 	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
345812ed8244SNikolay Borisov 
3459e9bd0ccaSKuniyuki Iwashima 	refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
3460d1e5e640SKuniyuki Iwashima 	tcp_set_hashinfo(net);
34611946e672SHaishuang Yan 
3462f9301034SEric Dumazet 	net->ipv4.sysctl_tcp_sack = 1;
34639bb37ef0SEric Dumazet 	net->ipv4.sysctl_tcp_window_scaling = 1;
34645d2ed052SEric Dumazet 	net->ipv4.sysctl_tcp_timestamps = 1;
34652ae21cf5SEric Dumazet 	net->ipv4.sysctl_tcp_early_retrans = 3;
3466e20223f1SEric Dumazet 	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3467b510f0d2SEric Dumazet 	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
3468e0a1e5b5SEric Dumazet 	net->ipv4.sysctl_tcp_retrans_collapse = 1;
3469c6e21803SEric Dumazet 	net->ipv4.sysctl_tcp_max_reordering = 300;
34706496f6bdSEric Dumazet 	net->ipv4.sysctl_tcp_dsack = 1;
34710c12654aSEric Dumazet 	net->ipv4.sysctl_tcp_app_win = 31;
347294f0893eSEric Dumazet 	net->ipv4.sysctl_tcp_adv_win_scale = 1;
3473af9b69a7SEric Dumazet 	net->ipv4.sysctl_tcp_frto = 2;
34744540c0cfSEric Dumazet 	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3475d06a9904SEric Dumazet 	/* This limits the percentage of the congestion window which we
3476d06a9904SEric Dumazet 	 * will allow a single TSO frame to consume.  Building TSO frames
3477d06a9904SEric Dumazet 	 * which are too large can cause TCP streams to be bursty.
3478d06a9904SEric Dumazet 	 */
3479d06a9904SEric Dumazet 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3480c73e5807SEric Dumazet 	/* Default TSQ limit of 16 TSO segments */
3481c73e5807SEric Dumazet 	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
348279e3602cSEric Dumazet 
348379e3602cSEric Dumazet 	/* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
348479e3602cSEric Dumazet 	net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
348579e3602cSEric Dumazet 
348626e9596eSEric Dumazet 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
348765466904SEric Dumazet 	net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
3488bd239704SEric Dumazet 	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3489790f00e1SEric Dumazet 	net->ipv4.sysctl_tcp_autocorking = 1;
34904170ba6bSEric Dumazet 	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
349123a7102aSEric Dumazet 	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3492c26e91f8SEric Dumazet 	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3493356d1833SEric Dumazet 	if (net != &init_net) {
3494356d1833SEric Dumazet 		memcpy(net->ipv4.sysctl_tcp_rmem,
3495356d1833SEric Dumazet 		       init_net.ipv4.sysctl_tcp_rmem,
3496356d1833SEric Dumazet 		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
3497356d1833SEric Dumazet 		memcpy(net->ipv4.sysctl_tcp_wmem,
3498356d1833SEric Dumazet 		       init_net.ipv4.sysctl_tcp_wmem,
3499356d1833SEric Dumazet 		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
3500356d1833SEric Dumazet 	}
35016d82aa24SEric Dumazet 	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3502a70437ccSEric Dumazet 	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
35039c21d2fcSEric Dumazet 	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3504133c4c0dSEric Dumazet 	net->ipv4.sysctl_tcp_backlog_ack_defer = 1;
3505e1cfcbe8SHaishuang Yan 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3506213ad73dSWei Wang 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
35073733be14SHaishuang Yan 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3508e1cfcbe8SHaishuang Yan 
3509bd456f28SMubashir Adnan Qureshi 	/* Set default values for PLB */
3510bd456f28SMubashir Adnan Qureshi 	net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
3511bd456f28SMubashir Adnan Qureshi 	net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
3512bd456f28SMubashir Adnan Qureshi 	net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
3513bd456f28SMubashir Adnan Qureshi 	net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
3514bd456f28SMubashir Adnan Qureshi 	/* Default congestion threshold for PLB to mark a round is 50% */
35151a91bb7cSMubashir Adnan Qureshi 	net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;
3516bd456f28SMubashir Adnan Qureshi 
35176670e152SStephen Hemminger 	/* Reno is always built in */
35186670e152SStephen Hemminger 	if (!net_eq(net, &init_net) &&
35190baf26b0SMartin KaFai Lau 	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
35200baf26b0SMartin KaFai Lau 			       init_net.ipv4.tcp_congestion_control->owner))
35216670e152SStephen Hemminger 		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
35226670e152SStephen Hemminger 	else
35236670e152SStephen Hemminger 		net->ipv4.tcp_congestion_control = &tcp_reno;
35246670e152SStephen Hemminger 
3525ccce324dSDavid Morley 	net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
3526b650d953Smfreemon@cloudflare.com 	net->ipv4.sysctl_tcp_shrink_window = 0;
3527b650d953Smfreemon@cloudflare.com 
3528562b1fdfSHaiyang Zhang 	net->ipv4.sysctl_tcp_pingpong_thresh = 1;
3529f086edefSKevin Yang 	net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN);
3530562b1fdfSHaiyang Zhang 
353149213555SDaniel Borkmann 	return 0;
3532b099ce26SEric W. Biederman }
3533b099ce26SEric W. Biederman 
/* Batched pernet exit hook: tear down TCP per-netns state for every netns
 * on @net_exit_list.  Purges TIME_WAIT sockets first, then releases each
 * namespace's ehash table and TCP Fast Open context.  The whole sequence
 * runs under tcp_exit_batch_mutex (see comment below for why).
 */
3534b099ce26SEric W. Biederman static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3535b099ce26SEric W. Biederman {
353643713848SHaishuang Yan 	struct net *net;
353743713848SHaishuang Yan 
3538565d121bSFlorian Westphal 	/* make sure concurrent calls to tcp_sk_exit_batch from net_cleanup_work
3539565d121bSFlorian Westphal 	 * and failed setup_net error unwinding path are serialized.
3540565d121bSFlorian Westphal 	 *
3541565d121bSFlorian Westphal 	 * tcp_twsk_purge() handles twsk in any dead netns, not just those in
3542565d121bSFlorian Westphal 	 * net_exit_list, the thread that dismantles a particular twsk must
3543565d121bSFlorian Westphal 	 * do so without other thread progressing to refcount_dec_and_test() of
3544565d121bSFlorian Westphal 	 * tcp_death_row.tw_refcount.
3545565d121bSFlorian Westphal 	 */
3546565d121bSFlorian Westphal 	mutex_lock(&tcp_exit_batch_mutex);
3547565d121bSFlorian Westphal 
35481eeb5043SEric Dumazet 	tcp_twsk_purge(net_exit_list);
354904c494e6SEric Dumazet 
	/* Per-netns teardown.  After the purge above, tw_refcount must drop
	 * to zero here; a surviving reference would indicate a leaked
	 * timewait socket, hence the WARN_ON_ONCE.
	 */
3550e9bd0ccaSKuniyuki Iwashima 	list_for_each_entry(net, net_exit_list, exit_list) {
3551d1e5e640SKuniyuki Iwashima 		inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
3552e9bd0ccaSKuniyuki Iwashima 		WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
355343713848SHaishuang Yan 		tcp_fastopen_ctx_destroy(net);
3554046ee902SDenis V. Lunev 	}
3555565d121bSFlorian Westphal 
3556565d121bSFlorian Westphal 	mutex_unlock(&tcp_exit_batch_mutex);
3557e9bd0ccaSKuniyuki Iwashima }
3558046ee902SDenis V. Lunev 
/* Per-network-namespace hooks for TCP: .init/.exit run once per netns,
 * .exit_batch once per batch of dying namespaces (tcp_sk_exit_batch).
 */
3559046ee902SDenis V. Lunev static struct pernet_operations __net_initdata tcp_sk_ops = {
3560046ee902SDenis V. Lunev        .init	   = tcp_sk_init,
3561046ee902SDenis V. Lunev        .exit	   = tcp_sk_exit,
3562b099ce26SEric W. Biederman        .exit_batch = tcp_sk_exit_batch,
3563046ee902SDenis V. Lunev };
3564046ee902SDenis V. Lunev 
356552d87d5fSYonghong Song #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
/* Declares the bpf_iter__tcp context type and the attach target for BPF
 * "tcp" iterators: each visited socket is handed to the program as
 * @sk_common along with the owning @uid.
 */
356652d87d5fSYonghong Song DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
356752d87d5fSYonghong Song 		     struct sock_common *sk_common, uid_t uid)
356852d87d5fSYonghong Song 
/* Initial capacity of the iterator's socket batch buffer; it can be
 * grown later via bpf_iter_tcp_realloc_batch().
 */
356904c7820bSMartin KaFai Lau #define INIT_BATCH_SZ 16
357004c7820bSMartin KaFai Lau 
/* Set up per-seq-file state for a BPF tcp iterator: netns-aware seq
 * state plus an initial batch buffer of INIT_BATCH_SZ entries.
 * Returns 0 on success or a negative errno; on failure no state is
 * left allocated.
 */
3571f9c79272SYonghong Song static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
357252d87d5fSYonghong Song {
357304c7820bSMartin KaFai Lau 	struct bpf_tcp_iter_state *iter = priv_data;
357404c7820bSMartin KaFai Lau 	int err;
357552d87d5fSYonghong Song 
357604c7820bSMartin KaFai Lau 	err = bpf_iter_init_seq_net(priv_data, aux);
357704c7820bSMartin KaFai Lau 	if (err)
357804c7820bSMartin KaFai Lau 		return err;
357952d87d5fSYonghong Song 
358004c7820bSMartin KaFai Lau 	err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
358104c7820bSMartin KaFai Lau 	if (err) {
		/* Undo the seq-net setup so nothing leaks on this error path. */
358204c7820bSMartin KaFai Lau 		bpf_iter_fini_seq_net(priv_data);
358304c7820bSMartin KaFai Lau 		return err;
358404c7820bSMartin KaFai Lau 	}
358504c7820bSMartin KaFai Lau 
358604c7820bSMartin KaFai Lau 	return 0;
358752d87d5fSYonghong Song }
358852d87d5fSYonghong Song 
/* Counterpart of bpf_iter_init_tcp(): release the netns seq state and
 * free the socket batch buffer.
 */
358952d87d5fSYonghong Song static void bpf_iter_fini_tcp(void *priv_data)
359052d87d5fSYonghong Song {
359104c7820bSMartin KaFai Lau 	struct bpf_tcp_iter_state *iter = priv_data;
359252d87d5fSYonghong Song 
359352d87d5fSYonghong Song 	bpf_iter_fini_seq_net(priv_data);
359404c7820bSMartin KaFai Lau 	kvfree(iter->batch);
359552d87d5fSYonghong Song }
359652d87d5fSYonghong Song 
/* seq_file glue for the BPF tcp iterator: ops plus per-instance private
 * state sized for struct bpf_tcp_iter_state.
 */
359714fc6bd6SYonghong Song static const struct bpf_iter_seq_info tcp_seq_info = {
359852d87d5fSYonghong Song 	.seq_ops		= &bpf_iter_tcp_seq_ops,
359952d87d5fSYonghong Song 	.init_seq_private	= bpf_iter_init_tcp,
360052d87d5fSYonghong Song 	.fini_seq_private	= bpf_iter_fini_tcp,
360104c7820bSMartin KaFai Lau 	.seq_priv_size		= sizeof(struct bpf_tcp_iter_state),
360214fc6bd6SYonghong Song };
360314fc6bd6SYonghong Song 
/* Grant tcp iterator programs access to bpf_sk_setsockopt()/
 * bpf_sk_getsockopt() on the sockets they visit.  Returning NULL for
 * other helper ids defers to the default helper set for the program
 * type.
 */
36043cee6fb8SMartin KaFai Lau static const struct bpf_func_proto *
36053cee6fb8SMartin KaFai Lau bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
36063cee6fb8SMartin KaFai Lau 			    const struct bpf_prog *prog)
36073cee6fb8SMartin KaFai Lau {
36083cee6fb8SMartin KaFai Lau 	switch (func_id) {
36093cee6fb8SMartin KaFai Lau 	case BPF_FUNC_setsockopt:
36103cee6fb8SMartin KaFai Lau 		return &bpf_sk_setsockopt_proto;
36113cee6fb8SMartin KaFai Lau 	case BPF_FUNC_getsockopt:
36123cee6fb8SMartin KaFai Lau 		return &bpf_sk_getsockopt_proto;
36133cee6fb8SMartin KaFai Lau 	default:
36143cee6fb8SMartin KaFai Lau 		return NULL;
36153cee6fb8SMartin KaFai Lau 	}
36163cee6fb8SMartin KaFai Lau }
36173cee6fb8SMartin KaFai Lau 
/* Registration descriptor for the "tcp" bpf_iter target.  The single
 * ctx arg (sk_common) is a trusted, possibly-NULL BTF pointer; its
 * btf_id is filled in at boot by bpf_iter_register() below.
 */
361814fc6bd6SYonghong Song static struct bpf_iter_reg tcp_reg_info = {
361914fc6bd6SYonghong Song 	.target			= "tcp",
362052d87d5fSYonghong Song 	.ctx_arg_info_size	= 1,
362152d87d5fSYonghong Song 	.ctx_arg_info		= {
362252d87d5fSYonghong Song 		{ offsetof(struct bpf_iter__tcp, sk_common),
36234ddbcb88SAditi Ghag 		  PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
362452d87d5fSYonghong Song 	},
36253cee6fb8SMartin KaFai Lau 	.get_func_proto		= bpf_iter_tcp_get_func_proto,
362614fc6bd6SYonghong Song 	.seq_info		= &tcp_seq_info,
362752d87d5fSYonghong Song };
362852d87d5fSYonghong Song 
/* Boot-time registration of the "tcp" bpf_iter target.  Resolves the
 * BTF id for sock_common first, since it is not known at compile time.
 * Failure only logs a warning; TCP itself keeps working without the
 * iterator.
 */
362952d87d5fSYonghong Song static void __init bpf_iter_register(void)
363052d87d5fSYonghong Song {
3631951cf368SYonghong Song 	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
363252d87d5fSYonghong Song 	if (bpf_iter_reg_target(&tcp_reg_info))
363352d87d5fSYonghong Song 		pr_warn("Warning: could not register bpf iterator tcp\n");
363452d87d5fSYonghong Song }
363552d87d5fSYonghong Song 
363652d87d5fSYonghong Song #endif
363752d87d5fSYonghong Song 
/* Boot-time TCP/IPv4 initialization: create one kernel control socket
 * per possible CPU (used by the stack to emit replies such as RSTs and
 * ACKs, per the comment below), register the per-netns init/exit hooks,
 * and (when configured) the BPF tcp iterator target.  Any failure here
 * is fatal to boot.
 */
36389b0f976fSDenis V. Lunev void __init tcp_v4_init(void)
36391da177e4SLinus Torvalds {
364037ba017dSEric Dumazet 	int cpu, res;
364137ba017dSEric Dumazet 
364237ba017dSEric Dumazet 	for_each_possible_cpu(cpu) {
364337ba017dSEric Dumazet 		struct sock *sk;
364437ba017dSEric Dumazet 
364537ba017dSEric Dumazet 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
364637ba017dSEric Dumazet 					   IPPROTO_TCP, &init_net);
364737ba017dSEric Dumazet 		if (res)
364837ba017dSEric Dumazet 			panic("Failed to create the TCP control socket.\n");
364937ba017dSEric Dumazet 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
365037ba017dSEric Dumazet 
365137ba017dSEric Dumazet 		/* Please enforce IP_DF and IPID==0 for RST and
365237ba017dSEric Dumazet 		 * ACK sent in SYN-RECV and TIME-WAIT state.
365337ba017dSEric Dumazet 		 */
365437ba017dSEric Dumazet 		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
365537ba017dSEric Dumazet 
		/* Timestamps taken through this socket use the monotonic
		 * clock rather than wall time.
		 */
36561693c5dbSAbhishek Chauhan 		sk->sk_clockid = CLOCK_MONOTONIC;
36571693c5dbSAbhishek Chauhan 
3658ebad6d03SSebastian Andrzej Siewior 		per_cpu(ipv4_tcp_sk.sock, cpu) = sk;
365937ba017dSEric Dumazet 	}
	/* NOTE(review): this panic message is misleading — the failure
	 * here is register_pernet_subsys(), not control-socket creation.
	 */
36596a1b3054SEric W. Biederman 	if (register_pernet_subsys(&tcp_sk_ops))
36601da177e4SLinus Torvalds 		panic("Failed to create the TCP control socket.\n");
366152d87d5fSYonghong Song 
366252d87d5fSYonghong Song #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
366352d87d5fSYonghong Song 	bpf_iter_register();
366452d87d5fSYonghong Song #endif
36651da177e4SLinus Torvalds }
3667