xref: /linux/net/ipv4/tcp_ipv4.c (revision 14fc6bd6b79c430f615500d0fe6cea4722110db8)
12874c5fdSThomas Gleixner // SPDX-License-Identifier: GPL-2.0-or-later
21da177e4SLinus Torvalds /*
31da177e4SLinus Torvalds  * INET		An implementation of the TCP/IP protocol suite for the LINUX
41da177e4SLinus Torvalds  *		operating system.  INET is implemented using the  BSD Socket
51da177e4SLinus Torvalds  *		interface as the means of communication with the user level.
61da177e4SLinus Torvalds  *
71da177e4SLinus Torvalds  *		Implementation of the Transmission Control Protocol(TCP).
81da177e4SLinus Torvalds  *
91da177e4SLinus Torvalds  *		IPv4 specific functions
101da177e4SLinus Torvalds  *
111da177e4SLinus Torvalds  *		code split from:
121da177e4SLinus Torvalds  *		linux/ipv4/tcp.c
131da177e4SLinus Torvalds  *		linux/ipv4/tcp_input.c
141da177e4SLinus Torvalds  *		linux/ipv4/tcp_output.c
151da177e4SLinus Torvalds  *
161da177e4SLinus Torvalds  *		See tcp.c for author information
171da177e4SLinus Torvalds  */
181da177e4SLinus Torvalds 
191da177e4SLinus Torvalds /*
201da177e4SLinus Torvalds  * Changes:
211da177e4SLinus Torvalds  *		David S. Miller	:	New socket lookup architecture.
221da177e4SLinus Torvalds  *					This code is dedicated to John Dyson.
231da177e4SLinus Torvalds  *		David S. Miller :	Change semantics of established hash,
241da177e4SLinus Torvalds  *					half is devoted to TIME_WAIT sockets
251da177e4SLinus Torvalds  *					and the rest go in the other half.
261da177e4SLinus Torvalds  *		Andi Kleen :		Add support for syncookies and fixed
271da177e4SLinus Torvalds  *					some bugs: ip options weren't passed to
281da177e4SLinus Torvalds  *					the TCP layer, missed a check for an
291da177e4SLinus Torvalds  *					ACK bit.
301da177e4SLinus Torvalds  *		Andi Kleen :		Implemented fast path mtu discovery.
311da177e4SLinus Torvalds  *	     				Fixed many serious bugs in the
3260236fddSArnaldo Carvalho de Melo  *					request_sock handling and moved
331da177e4SLinus Torvalds  *					most of it into the af independent code.
341da177e4SLinus Torvalds  *					Added tail drop and some other bugfixes.
35caa20d9aSStephen Hemminger  *					Added new listen semantics.
361da177e4SLinus Torvalds  *		Mike McLagan	:	Routing by source
371da177e4SLinus Torvalds  *	Juan Jose Ciarlante:		ip_dynaddr bits
381da177e4SLinus Torvalds  *		Andi Kleen:		various fixes.
391da177e4SLinus Torvalds  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
401da177e4SLinus Torvalds  *					coma.
411da177e4SLinus Torvalds  *	Andi Kleen		:	Fix new listen.
421da177e4SLinus Torvalds  *	Andi Kleen		:	Fix accept error reporting.
431da177e4SLinus Torvalds  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
441da177e4SLinus Torvalds  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
451da177e4SLinus Torvalds  *					a single port at the same time.
461da177e4SLinus Torvalds  */
471da177e4SLinus Torvalds 
48afd46503SJoe Perches #define pr_fmt(fmt) "TCP: " fmt
491da177e4SLinus Torvalds 
50eb4dea58SHerbert Xu #include <linux/bottom_half.h>
511da177e4SLinus Torvalds #include <linux/types.h>
521da177e4SLinus Torvalds #include <linux/fcntl.h>
531da177e4SLinus Torvalds #include <linux/module.h>
541da177e4SLinus Torvalds #include <linux/random.h>
551da177e4SLinus Torvalds #include <linux/cache.h>
561da177e4SLinus Torvalds #include <linux/jhash.h>
571da177e4SLinus Torvalds #include <linux/init.h>
581da177e4SLinus Torvalds #include <linux/times.h>
595a0e3ad6STejun Heo #include <linux/slab.h>
601da177e4SLinus Torvalds 
61457c4cbcSEric W. Biederman #include <net/net_namespace.h>
621da177e4SLinus Torvalds #include <net/icmp.h>
63304a1618SArnaldo Carvalho de Melo #include <net/inet_hashtables.h>
641da177e4SLinus Torvalds #include <net/tcp.h>
6520380731SArnaldo Carvalho de Melo #include <net/transp_v6.h>
661da177e4SLinus Torvalds #include <net/ipv6.h>
671da177e4SLinus Torvalds #include <net/inet_common.h>
686d6ee43eSArnaldo Carvalho de Melo #include <net/timewait_sock.h>
691da177e4SLinus Torvalds #include <net/xfrm.h>
706e5714eaSDavid S. Miller #include <net/secure_seq.h>
71076bb0c8SEliezer Tamir #include <net/busy_poll.h>
721da177e4SLinus Torvalds 
731da177e4SLinus Torvalds #include <linux/inet.h>
741da177e4SLinus Torvalds #include <linux/ipv6.h>
751da177e4SLinus Torvalds #include <linux/stddef.h>
761da177e4SLinus Torvalds #include <linux/proc_fs.h>
771da177e4SLinus Torvalds #include <linux/seq_file.h>
786797318eSIvan Delalande #include <linux/inetdevice.h>
79951cf368SYonghong Song #include <linux/btf_ids.h>
801da177e4SLinus Torvalds 
81cf80e0e4SHerbert Xu #include <crypto/hash.h>
82cfb6eeb4SYOSHIFUJI Hideaki #include <linux/scatterlist.h>
83cfb6eeb4SYOSHIFUJI Hideaki 
84c24b14c4SSong Liu #include <trace/events/tcp.h>
85c24b14c4SSong Liu 
86cfb6eeb4SYOSHIFUJI Hideaki #ifdef CONFIG_TCP_MD5SIG
87a915da9bSEric Dumazet static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88318cf7aaSEric Dumazet 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
89cfb6eeb4SYOSHIFUJI Hideaki #endif
90cfb6eeb4SYOSHIFUJI Hideaki 
/* Hash table state used for TCP socket lookup; tcp_v4_err() in this file
 * passes it to __inet_lookup_established(). Exported for use by modules.
 */
915caea4eaSEric Dumazet struct inet_hashinfo tcp_hashinfo;
924bc2f18bSEric Dumazet EXPORT_SYMBOL(tcp_hashinfo);
931da177e4SLinus Torvalds 
9484b114b9SEric Dumazet static u32 tcp_v4_init_seq(const struct sk_buff *skb)
951da177e4SLinus Torvalds {
9684b114b9SEric Dumazet 	return secure_tcp_seq(ip_hdr(skb)->daddr,
97eddc9ec5SArnaldo Carvalho de Melo 			      ip_hdr(skb)->saddr,
98aa8223c7SArnaldo Carvalho de Melo 			      tcp_hdr(skb)->dest,
9984b114b9SEric Dumazet 			      tcp_hdr(skb)->source);
10084b114b9SEric Dumazet }
10184b114b9SEric Dumazet 
1025d2ed052SEric Dumazet static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
10384b114b9SEric Dumazet {
1045d2ed052SEric Dumazet 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
1051da177e4SLinus Torvalds }
1061da177e4SLinus Torvalds 
1076d6ee43eSArnaldo Carvalho de Melo int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
1086d6ee43eSArnaldo Carvalho de Melo {
10979e9fed4SMaciej Żenczykowski 	const struct inet_timewait_sock *tw = inet_twsk(sktw);
1106d6ee43eSArnaldo Carvalho de Melo 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
1116d6ee43eSArnaldo Carvalho de Melo 	struct tcp_sock *tp = tcp_sk(sk);
11279e9fed4SMaciej Żenczykowski 	int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
11379e9fed4SMaciej Żenczykowski 
11479e9fed4SMaciej Żenczykowski 	if (reuse == 2) {
11579e9fed4SMaciej Żenczykowski 		/* Still does not detect *everything* that goes through
11679e9fed4SMaciej Żenczykowski 		 * lo, since we require a loopback src or dst address
11779e9fed4SMaciej Żenczykowski 		 * or direct binding to 'lo' interface.
11879e9fed4SMaciej Żenczykowski 		 */
11979e9fed4SMaciej Żenczykowski 		bool loopback = false;
12079e9fed4SMaciej Żenczykowski 		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
12179e9fed4SMaciej Żenczykowski 			loopback = true;
12279e9fed4SMaciej Żenczykowski #if IS_ENABLED(CONFIG_IPV6)
12379e9fed4SMaciej Żenczykowski 		if (tw->tw_family == AF_INET6) {
12479e9fed4SMaciej Żenczykowski 			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
125be2644aaSEric Dumazet 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
12679e9fed4SMaciej Żenczykowski 			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
127be2644aaSEric Dumazet 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
12879e9fed4SMaciej Żenczykowski 				loopback = true;
12979e9fed4SMaciej Żenczykowski 		} else
13079e9fed4SMaciej Żenczykowski #endif
13179e9fed4SMaciej Żenczykowski 		{
13279e9fed4SMaciej Żenczykowski 			if (ipv4_is_loopback(tw->tw_daddr) ||
13379e9fed4SMaciej Żenczykowski 			    ipv4_is_loopback(tw->tw_rcv_saddr))
13479e9fed4SMaciej Żenczykowski 				loopback = true;
13579e9fed4SMaciej Żenczykowski 		}
13679e9fed4SMaciej Żenczykowski 		if (!loopback)
13779e9fed4SMaciej Żenczykowski 			reuse = 0;
13879e9fed4SMaciej Żenczykowski 	}
1396d6ee43eSArnaldo Carvalho de Melo 
1406d6ee43eSArnaldo Carvalho de Melo 	/* With PAWS, it is safe from the viewpoint
1416d6ee43eSArnaldo Carvalho de Melo 	   of data integrity. Even without PAWS it is safe provided sequence
1426d6ee43eSArnaldo Carvalho de Melo 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
1436d6ee43eSArnaldo Carvalho de Melo 
1446d6ee43eSArnaldo Carvalho de Melo 	   Actually, the idea is close to VJ's one, only timestamp cache is
1456d6ee43eSArnaldo Carvalho de Melo 	   held not per host, but per port pair and TW bucket is used as state
1466d6ee43eSArnaldo Carvalho de Melo 	   holder.
1476d6ee43eSArnaldo Carvalho de Melo 
1486d6ee43eSArnaldo Carvalho de Melo 	   If TW bucket has been already destroyed we fall back to VJ's scheme
1496d6ee43eSArnaldo Carvalho de Melo 	   and use initial timestamp retrieved from peer table.
1506d6ee43eSArnaldo Carvalho de Melo 	 */
1516d6ee43eSArnaldo Carvalho de Melo 	if (tcptw->tw_ts_recent_stamp &&
152cca9bab1SArnd Bergmann 	    (!twp || (reuse && time_after32(ktime_get_seconds(),
153cca9bab1SArnd Bergmann 					    tcptw->tw_ts_recent_stamp)))) {
15421684dc4SStefan Baranoff 		/* In case of repair and re-using TIME-WAIT sockets we still
15521684dc4SStefan Baranoff 		 * want to be sure that it is safe as above but honor the
15621684dc4SStefan Baranoff 		 * sequence numbers and time stamps set as part of the repair
15721684dc4SStefan Baranoff 		 * process.
15821684dc4SStefan Baranoff 		 *
15921684dc4SStefan Baranoff 		 * Without this check re-using a TIME-WAIT socket with TCP
16021684dc4SStefan Baranoff 		 * repair would accumulate a -1 on the repair assigned
16121684dc4SStefan Baranoff 		 * sequence number. The first time it is reused the sequence
16221684dc4SStefan Baranoff 		 * is -1, the second time -2, etc. This fixes that issue
16321684dc4SStefan Baranoff 		 * without appearing to create any others.
16421684dc4SStefan Baranoff 		 */
16521684dc4SStefan Baranoff 		if (likely(!tp->repair)) {
1660f317464SEric Dumazet 			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
1670f317464SEric Dumazet 
1680f317464SEric Dumazet 			if (!seq)
1690f317464SEric Dumazet 				seq = 1;
1700f317464SEric Dumazet 			WRITE_ONCE(tp->write_seq, seq);
1716d6ee43eSArnaldo Carvalho de Melo 			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
1726d6ee43eSArnaldo Carvalho de Melo 			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
17321684dc4SStefan Baranoff 		}
1746d6ee43eSArnaldo Carvalho de Melo 		sock_hold(sktw);
1756d6ee43eSArnaldo Carvalho de Melo 		return 1;
1766d6ee43eSArnaldo Carvalho de Melo 	}
1776d6ee43eSArnaldo Carvalho de Melo 
1786d6ee43eSArnaldo Carvalho de Melo 	return 0;
1796d6ee43eSArnaldo Carvalho de Melo }
1806d6ee43eSArnaldo Carvalho de Melo EXPORT_SYMBOL_GPL(tcp_twsk_unique);
1816d6ee43eSArnaldo Carvalho de Melo 
182d74bad4eSAndrey Ignatov static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
183d74bad4eSAndrey Ignatov 			      int addr_len)
184d74bad4eSAndrey Ignatov {
185d74bad4eSAndrey Ignatov 	/* This check is replicated from tcp_v4_connect() and intended to
186d74bad4eSAndrey Ignatov 	 * prevent BPF program called below from accessing bytes that are out
187d74bad4eSAndrey Ignatov 	 * of the bound specified by user in addr_len.
188d74bad4eSAndrey Ignatov 	 */
189d74bad4eSAndrey Ignatov 	if (addr_len < sizeof(struct sockaddr_in))
190d74bad4eSAndrey Ignatov 		return -EINVAL;
191d74bad4eSAndrey Ignatov 
192d74bad4eSAndrey Ignatov 	sock_owned_by_me(sk);
193d74bad4eSAndrey Ignatov 
194d74bad4eSAndrey Ignatov 	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
195d74bad4eSAndrey Ignatov }
196d74bad4eSAndrey Ignatov 
1971da177e4SLinus Torvalds /* This will initiate an outgoing connection. */
1981da177e4SLinus Torvalds int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
1991da177e4SLinus Torvalds {
2002d7192d6SDavid S. Miller 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
2011da177e4SLinus Torvalds 	struct inet_sock *inet = inet_sk(sk);
2021da177e4SLinus Torvalds 	struct tcp_sock *tp = tcp_sk(sk);
203dca8b089SDavid S. Miller 	__be16 orig_sport, orig_dport;
204bada8adcSAl Viro 	__be32 daddr, nexthop;
205da905bd1SDavid S. Miller 	struct flowi4 *fl4;
2062d7192d6SDavid S. Miller 	struct rtable *rt;
2071da177e4SLinus Torvalds 	int err;
208f6d8bd05SEric Dumazet 	struct ip_options_rcu *inet_opt;
2091946e672SHaishuang Yan 	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
2101da177e4SLinus Torvalds 
2111da177e4SLinus Torvalds 	if (addr_len < sizeof(struct sockaddr_in))
2121da177e4SLinus Torvalds 		return -EINVAL;
2131da177e4SLinus Torvalds 
2141da177e4SLinus Torvalds 	if (usin->sin_family != AF_INET)
2151da177e4SLinus Torvalds 		return -EAFNOSUPPORT;
2161da177e4SLinus Torvalds 
2171da177e4SLinus Torvalds 	nexthop = daddr = usin->sin_addr.s_addr;
218f6d8bd05SEric Dumazet 	inet_opt = rcu_dereference_protected(inet->inet_opt,
2191e1d04e6SHannes Frederic Sowa 					     lockdep_sock_is_held(sk));
220f6d8bd05SEric Dumazet 	if (inet_opt && inet_opt->opt.srr) {
2211da177e4SLinus Torvalds 		if (!daddr)
2221da177e4SLinus Torvalds 			return -EINVAL;
223f6d8bd05SEric Dumazet 		nexthop = inet_opt->opt.faddr;
2241da177e4SLinus Torvalds 	}
2251da177e4SLinus Torvalds 
226dca8b089SDavid S. Miller 	orig_sport = inet->inet_sport;
227dca8b089SDavid S. Miller 	orig_dport = usin->sin_port;
228da905bd1SDavid S. Miller 	fl4 = &inet->cork.fl.u.ip4;
229da905bd1SDavid S. Miller 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
2301da177e4SLinus Torvalds 			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
2311da177e4SLinus Torvalds 			      IPPROTO_TCP,
2320e0d44abSSteffen Klassert 			      orig_sport, orig_dport, sk);
233b23dd4feSDavid S. Miller 	if (IS_ERR(rt)) {
234b23dd4feSDavid S. Miller 		err = PTR_ERR(rt);
235b23dd4feSDavid S. Miller 		if (err == -ENETUNREACH)
236f1d8cba6SEric Dumazet 			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
237b23dd4feSDavid S. Miller 		return err;
238584bdf8cSWei Dong 	}
2391da177e4SLinus Torvalds 
2401da177e4SLinus Torvalds 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
2411da177e4SLinus Torvalds 		ip_rt_put(rt);
2421da177e4SLinus Torvalds 		return -ENETUNREACH;
2431da177e4SLinus Torvalds 	}
2441da177e4SLinus Torvalds 
245f6d8bd05SEric Dumazet 	if (!inet_opt || !inet_opt->opt.srr)
246da905bd1SDavid S. Miller 		daddr = fl4->daddr;
2471da177e4SLinus Torvalds 
248c720c7e8SEric Dumazet 	if (!inet->inet_saddr)
249da905bd1SDavid S. Miller 		inet->inet_saddr = fl4->saddr;
250d1e559d0SEric Dumazet 	sk_rcv_saddr_set(sk, inet->inet_saddr);
2511da177e4SLinus Torvalds 
252c720c7e8SEric Dumazet 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
2531da177e4SLinus Torvalds 		/* Reset inherited state */
2541da177e4SLinus Torvalds 		tp->rx_opt.ts_recent	   = 0;
2551da177e4SLinus Torvalds 		tp->rx_opt.ts_recent_stamp = 0;
256ee995283SPavel Emelyanov 		if (likely(!tp->repair))
2570f317464SEric Dumazet 			WRITE_ONCE(tp->write_seq, 0);
2581da177e4SLinus Torvalds 	}
2591da177e4SLinus Torvalds 
260c720c7e8SEric Dumazet 	inet->inet_dport = usin->sin_port;
261d1e559d0SEric Dumazet 	sk_daddr_set(sk, daddr);
2621da177e4SLinus Torvalds 
263d83d8461SArnaldo Carvalho de Melo 	inet_csk(sk)->icsk_ext_hdr_len = 0;
264f6d8bd05SEric Dumazet 	if (inet_opt)
265f6d8bd05SEric Dumazet 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
2661da177e4SLinus Torvalds 
267bee7ca9eSWilliam Allen Simpson 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
2681da177e4SLinus Torvalds 
2691da177e4SLinus Torvalds 	/* Socket identity is still unknown (sport may be zero).
2701da177e4SLinus Torvalds 	 * However we set state to SYN-SENT and not releasing socket
2711da177e4SLinus Torvalds 	 * lock select source port, enter ourselves into the hash tables and
2721da177e4SLinus Torvalds 	 * complete initialization after this.
2731da177e4SLinus Torvalds 	 */
2741da177e4SLinus Torvalds 	tcp_set_state(sk, TCP_SYN_SENT);
2751946e672SHaishuang Yan 	err = inet_hash_connect(tcp_death_row, sk);
2761da177e4SLinus Torvalds 	if (err)
2771da177e4SLinus Torvalds 		goto failure;
2781da177e4SLinus Torvalds 
279877d1f62STom Herbert 	sk_set_txhash(sk);
2809e7ceb06SSathya Perla 
281da905bd1SDavid S. Miller 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
282c720c7e8SEric Dumazet 			       inet->inet_sport, inet->inet_dport, sk);
283b23dd4feSDavid S. Miller 	if (IS_ERR(rt)) {
284b23dd4feSDavid S. Miller 		err = PTR_ERR(rt);
285b23dd4feSDavid S. Miller 		rt = NULL;
2861da177e4SLinus Torvalds 		goto failure;
287b23dd4feSDavid S. Miller 	}
2881da177e4SLinus Torvalds 	/* OK, now commit destination to socket.  */
289bcd76111SHerbert Xu 	sk->sk_gso_type = SKB_GSO_TCPV4;
290d8d1f30bSChangli Gao 	sk_setup_caps(sk, &rt->dst);
29119f6d3f3SWei Wang 	rt = NULL;
2921da177e4SLinus Torvalds 
29300355fa5SAlexey Kodanev 	if (likely(!tp->repair)) {
29484b114b9SEric Dumazet 		if (!tp->write_seq)
2950f317464SEric Dumazet 			WRITE_ONCE(tp->write_seq,
2960f317464SEric Dumazet 				   secure_tcp_seq(inet->inet_saddr,
297c720c7e8SEric Dumazet 						  inet->inet_daddr,
298c720c7e8SEric Dumazet 						  inet->inet_sport,
2990f317464SEric Dumazet 						  usin->sin_port));
3005d2ed052SEric Dumazet 		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
3015d2ed052SEric Dumazet 						 inet->inet_saddr,
30284b114b9SEric Dumazet 						 inet->inet_daddr);
30300355fa5SAlexey Kodanev 	}
3041da177e4SLinus Torvalds 
305a904a069SEric Dumazet 	inet->inet_id = prandom_u32();
3061da177e4SLinus Torvalds 
30719f6d3f3SWei Wang 	if (tcp_fastopen_defer_connect(sk, &err))
30819f6d3f3SWei Wang 		return err;
30919f6d3f3SWei Wang 	if (err)
31019f6d3f3SWei Wang 		goto failure;
31119f6d3f3SWei Wang 
3121da177e4SLinus Torvalds 	err = tcp_connect(sk);
313ee995283SPavel Emelyanov 
3141da177e4SLinus Torvalds 	if (err)
3151da177e4SLinus Torvalds 		goto failure;
3161da177e4SLinus Torvalds 
3171da177e4SLinus Torvalds 	return 0;
3181da177e4SLinus Torvalds 
3191da177e4SLinus Torvalds failure:
3207174259eSArnaldo Carvalho de Melo 	/*
3217174259eSArnaldo Carvalho de Melo 	 * This unhashes the socket and releases the local port,
3227174259eSArnaldo Carvalho de Melo 	 * if necessary.
3237174259eSArnaldo Carvalho de Melo 	 */
3241da177e4SLinus Torvalds 	tcp_set_state(sk, TCP_CLOSE);
3251da177e4SLinus Torvalds 	ip_rt_put(rt);
3261da177e4SLinus Torvalds 	sk->sk_route_caps = 0;
327c720c7e8SEric Dumazet 	inet->inet_dport = 0;
3281da177e4SLinus Torvalds 	return err;
3291da177e4SLinus Torvalds }
3304bc2f18bSEric Dumazet EXPORT_SYMBOL(tcp_v4_connect);
3311da177e4SLinus Torvalds 
3321da177e4SLinus Torvalds /*
333563d34d0SEric Dumazet  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
334563d34d0SEric Dumazet  * It can be called through tcp_release_cb() if socket was owned by user
335563d34d0SEric Dumazet  * at the time tcp_v4_err() was called to handle ICMP message.
3361da177e4SLinus Torvalds  */
3374fab9071SNeal Cardwell void tcp_v4_mtu_reduced(struct sock *sk)
3381da177e4SLinus Torvalds {
3391da177e4SLinus Torvalds 	struct inet_sock *inet = inet_sk(sk);
34002b2faafSEric Dumazet 	struct dst_entry *dst;
34102b2faafSEric Dumazet 	u32 mtu;
3421da177e4SLinus Torvalds 
34302b2faafSEric Dumazet 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
34402b2faafSEric Dumazet 		return;
34502b2faafSEric Dumazet 	mtu = tcp_sk(sk)->mtu_info;
34680d0a69fSDavid S. Miller 	dst = inet_csk_update_pmtu(sk, mtu);
34780d0a69fSDavid S. Miller 	if (!dst)
3481da177e4SLinus Torvalds 		return;
3491da177e4SLinus Torvalds 
3501da177e4SLinus Torvalds 	/* Something is about to be wrong... Remember soft error
3511da177e4SLinus Torvalds 	 * for the case, if this connection will not able to recover.
3521da177e4SLinus Torvalds 	 */
3531da177e4SLinus Torvalds 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
3541da177e4SLinus Torvalds 		sk->sk_err_soft = EMSGSIZE;
3551da177e4SLinus Torvalds 
3561da177e4SLinus Torvalds 	mtu = dst_mtu(dst);
3571da177e4SLinus Torvalds 
3581da177e4SLinus Torvalds 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
359482fc609SHannes Frederic Sowa 	    ip_sk_accept_pmtu(sk) &&
360d83d8461SArnaldo Carvalho de Melo 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
3611da177e4SLinus Torvalds 		tcp_sync_mss(sk, mtu);
3621da177e4SLinus Torvalds 
3631da177e4SLinus Torvalds 		/* Resend the TCP packet because it's
3641da177e4SLinus Torvalds 		 * clear that the old packet has been
3651da177e4SLinus Torvalds 		 * dropped. This is the new "fast" path mtu
3661da177e4SLinus Torvalds 		 * discovery.
3671da177e4SLinus Torvalds 		 */
3681da177e4SLinus Torvalds 		tcp_simple_retransmit(sk);
3691da177e4SLinus Torvalds 	} /* else let the usual retransmit timer handle it */
3701da177e4SLinus Torvalds }
3714fab9071SNeal Cardwell EXPORT_SYMBOL(tcp_v4_mtu_reduced);
3721da177e4SLinus Torvalds 
37355be7a9cSDavid S. Miller static void do_redirect(struct sk_buff *skb, struct sock *sk)
37455be7a9cSDavid S. Miller {
37555be7a9cSDavid S. Miller 	struct dst_entry *dst = __sk_dst_check(sk, 0);
37655be7a9cSDavid S. Miller 
3771ed5c48fSDavid S. Miller 	if (dst)
3786700c270SDavid S. Miller 		dst->ops->redirect(dst, sk, skb);
37955be7a9cSDavid S. Miller }
38055be7a9cSDavid S. Miller 
38126e37360SEric Dumazet 
38226e37360SEric Dumazet /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
3839cf74903SEric Dumazet void tcp_req_err(struct sock *sk, u32 seq, bool abort)
38426e37360SEric Dumazet {
38526e37360SEric Dumazet 	struct request_sock *req = inet_reqsk(sk);
38626e37360SEric Dumazet 	struct net *net = sock_net(sk);
38726e37360SEric Dumazet 
38826e37360SEric Dumazet 	/* ICMPs are not backlogged, hence we cannot get
38926e37360SEric Dumazet 	 * an established socket here.
39026e37360SEric Dumazet 	 */
39126e37360SEric Dumazet 	if (seq != tcp_rsk(req)->snt_isn) {
39202a1d6e7SEric Dumazet 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
3939cf74903SEric Dumazet 	} else if (abort) {
39426e37360SEric Dumazet 		/*
39526e37360SEric Dumazet 		 * Still in SYN_RECV, just remove it silently.
39626e37360SEric Dumazet 		 * There is no good way to pass the error to the newly
39726e37360SEric Dumazet 		 * created socket, and POSIX does not want network
39826e37360SEric Dumazet 		 * errors returned from accept().
39926e37360SEric Dumazet 		 */
400c6973669SFan Du 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
4019caad864SEric Dumazet 		tcp_listendrop(req->rsk_listener);
40226e37360SEric Dumazet 	}
403ef84d8ceSEric Dumazet 	reqsk_put(req);
40426e37360SEric Dumazet }
40526e37360SEric Dumazet EXPORT_SYMBOL(tcp_req_err);
40626e37360SEric Dumazet 
407f7456642SEric Dumazet /* TCP-LD (RFC 6069) logic */
408d2924569SEric Dumazet void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
409f7456642SEric Dumazet {
410f7456642SEric Dumazet 	struct inet_connection_sock *icsk = inet_csk(sk);
411f7456642SEric Dumazet 	struct tcp_sock *tp = tcp_sk(sk);
412f7456642SEric Dumazet 	struct sk_buff *skb;
413f7456642SEric Dumazet 	s32 remaining;
414f7456642SEric Dumazet 	u32 delta_us;
415f7456642SEric Dumazet 
416f7456642SEric Dumazet 	if (sock_owned_by_user(sk))
417f7456642SEric Dumazet 		return;
418f7456642SEric Dumazet 
419f7456642SEric Dumazet 	if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
420f7456642SEric Dumazet 	    !icsk->icsk_backoff)
421f7456642SEric Dumazet 		return;
422f7456642SEric Dumazet 
423f7456642SEric Dumazet 	skb = tcp_rtx_queue_head(sk);
424f7456642SEric Dumazet 	if (WARN_ON_ONCE(!skb))
425f7456642SEric Dumazet 		return;
426f7456642SEric Dumazet 
427f7456642SEric Dumazet 	icsk->icsk_backoff--;
428f7456642SEric Dumazet 	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
429f7456642SEric Dumazet 	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
430f7456642SEric Dumazet 
431f7456642SEric Dumazet 	tcp_mstamp_refresh(tp);
432f7456642SEric Dumazet 	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
433f7456642SEric Dumazet 	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
434f7456642SEric Dumazet 
435f7456642SEric Dumazet 	if (remaining > 0) {
436f7456642SEric Dumazet 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
437f7456642SEric Dumazet 					  remaining, TCP_RTO_MAX);
438f7456642SEric Dumazet 	} else {
439f7456642SEric Dumazet 		/* RTO revert clocked out retransmission.
440f7456642SEric Dumazet 		 * Will retransmit now.
441f7456642SEric Dumazet 		 */
442f7456642SEric Dumazet 		tcp_retransmit_timer(sk);
443f7456642SEric Dumazet 	}
444f7456642SEric Dumazet }
445d2924569SEric Dumazet EXPORT_SYMBOL(tcp_ld_RTO_revert);
446f7456642SEric Dumazet 
4471da177e4SLinus Torvalds /*
4481da177e4SLinus Torvalds  * This routine is called by the ICMP module when it gets some
4491da177e4SLinus Torvalds  * sort of error condition.  If err < 0 then the socket should
4501da177e4SLinus Torvalds  * be closed and the error returned to the user.  If err > 0
4511da177e4SLinus Torvalds  * it's just the icmp type << 8 | icmp code.  After adjustment
4521da177e4SLinus Torvalds  * header points to the first 8 bytes of the tcp header.  We need
4531da177e4SLinus Torvalds  * to find the appropriate port.
4541da177e4SLinus Torvalds  *
4551da177e4SLinus Torvalds  * The locking strategy used here is very "optimistic". When
4561da177e4SLinus Torvalds  * someone else accesses the socket the ICMP is just dropped
4571da177e4SLinus Torvalds  * and for some paths there is no check at all.
4581da177e4SLinus Torvalds  * A more general error queue to queue errors for later handling
4591da177e4SLinus Torvalds  * is probably better.
4601da177e4SLinus Torvalds  *
4611da177e4SLinus Torvalds  */
4621da177e4SLinus Torvalds 
463a12daf13SEric Dumazet int tcp_v4_err(struct sk_buff *skb, u32 info)
4641da177e4SLinus Torvalds {
465a12daf13SEric Dumazet 	const struct iphdr *iph = (const struct iphdr *)skb->data;
466a12daf13SEric Dumazet 	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
4671da177e4SLinus Torvalds 	struct tcp_sock *tp;
4681da177e4SLinus Torvalds 	struct inet_sock *inet;
469a12daf13SEric Dumazet 	const int type = icmp_hdr(skb)->type;
470a12daf13SEric Dumazet 	const int code = icmp_hdr(skb)->code;
4711da177e4SLinus Torvalds 	struct sock *sk;
4720a672f74SYuchung Cheng 	struct request_sock *fastopen;
4739a568de4SEric Dumazet 	u32 seq, snd_una;
4741da177e4SLinus Torvalds 	int err;
475a12daf13SEric Dumazet 	struct net *net = dev_net(skb->dev);
4761da177e4SLinus Torvalds 
47726e37360SEric Dumazet 	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
47826e37360SEric Dumazet 				       th->dest, iph->saddr, ntohs(th->source),
479a12daf13SEric Dumazet 				       inet_iif(skb), 0);
4801da177e4SLinus Torvalds 	if (!sk) {
4815d3848bcSEric Dumazet 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
48232bbd879SStefano Brivio 		return -ENOENT;
4831da177e4SLinus Torvalds 	}
4841da177e4SLinus Torvalds 	if (sk->sk_state == TCP_TIME_WAIT) {
4859469c7b4SYOSHIFUJI Hideaki 		inet_twsk_put(inet_twsk(sk));
48632bbd879SStefano Brivio 		return 0;
4871da177e4SLinus Torvalds 	}
48826e37360SEric Dumazet 	seq = ntohl(th->seq);
48932bbd879SStefano Brivio 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
49032bbd879SStefano Brivio 		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
4919cf74903SEric Dumazet 				     type == ICMP_TIME_EXCEEDED ||
4929cf74903SEric Dumazet 				     (type == ICMP_DEST_UNREACH &&
4939cf74903SEric Dumazet 				      (code == ICMP_NET_UNREACH ||
4949cf74903SEric Dumazet 				       code == ICMP_HOST_UNREACH)));
49532bbd879SStefano Brivio 		return 0;
49632bbd879SStefano Brivio 	}
4971da177e4SLinus Torvalds 
4981da177e4SLinus Torvalds 	bh_lock_sock(sk);
4991da177e4SLinus Torvalds 	/* If too many ICMPs get dropped on busy
5001da177e4SLinus Torvalds 	 * servers this needs to be solved differently.
501563d34d0SEric Dumazet 	 * We do take care of PMTU discovery (RFC1191) special case :
502563d34d0SEric Dumazet 	 * we can receive locally generated ICMP messages while socket is held.
5031da177e4SLinus Torvalds 	 */
504b74aa930SEric Dumazet 	if (sock_owned_by_user(sk)) {
505b74aa930SEric Dumazet 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
50602a1d6e7SEric Dumazet 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
507b74aa930SEric Dumazet 	}
5081da177e4SLinus Torvalds 	if (sk->sk_state == TCP_CLOSE)
5091da177e4SLinus Torvalds 		goto out;
5101da177e4SLinus Torvalds 
51197e3ecd1Sstephen hemminger 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
51202a1d6e7SEric Dumazet 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
51397e3ecd1Sstephen hemminger 		goto out;
51497e3ecd1Sstephen hemminger 	}
51597e3ecd1Sstephen hemminger 
5161da177e4SLinus Torvalds 	tp = tcp_sk(sk);
5170a672f74SYuchung Cheng 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
518d983ea6fSEric Dumazet 	fastopen = rcu_dereference(tp->fastopen_rsk);
5190a672f74SYuchung Cheng 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
5201da177e4SLinus Torvalds 	if (sk->sk_state != TCP_LISTEN &&
5210a672f74SYuchung Cheng 	    !between(seq, snd_una, tp->snd_nxt)) {
52202a1d6e7SEric Dumazet 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
5231da177e4SLinus Torvalds 		goto out;
5241da177e4SLinus Torvalds 	}
5251da177e4SLinus Torvalds 
5261da177e4SLinus Torvalds 	switch (type) {
52755be7a9cSDavid S. Miller 	case ICMP_REDIRECT:
52845caeaa5SJon Maxwell 		if (!sock_owned_by_user(sk))
529a12daf13SEric Dumazet 			do_redirect(skb, sk);
53055be7a9cSDavid S. Miller 		goto out;
5311da177e4SLinus Torvalds 	case ICMP_SOURCE_QUENCH:
5321da177e4SLinus Torvalds 		/* Just silently ignore these. */
5331da177e4SLinus Torvalds 		goto out;
5341da177e4SLinus Torvalds 	case ICMP_PARAMETERPROB:
5351da177e4SLinus Torvalds 		err = EPROTO;
5361da177e4SLinus Torvalds 		break;
5371da177e4SLinus Torvalds 	case ICMP_DEST_UNREACH:
5381da177e4SLinus Torvalds 		if (code > NR_ICMP_UNREACH)
5391da177e4SLinus Torvalds 			goto out;
5401da177e4SLinus Torvalds 
5411da177e4SLinus Torvalds 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
5420d4f0608SEric Dumazet 			/* We are not interested in TCP_LISTEN and open_requests
5430d4f0608SEric Dumazet 			 * (SYN-ACKs send out by Linux are always <576bytes so
5440d4f0608SEric Dumazet 			 * they should go through unfragmented).
5450d4f0608SEric Dumazet 			 */
5460d4f0608SEric Dumazet 			if (sk->sk_state == TCP_LISTEN)
5470d4f0608SEric Dumazet 				goto out;
5480d4f0608SEric Dumazet 
549563d34d0SEric Dumazet 			tp->mtu_info = info;
550144d56e9SEric Dumazet 			if (!sock_owned_by_user(sk)) {
551563d34d0SEric Dumazet 				tcp_v4_mtu_reduced(sk);
552144d56e9SEric Dumazet 			} else {
5537aa5470cSEric Dumazet 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
554144d56e9SEric Dumazet 					sock_hold(sk);
555144d56e9SEric Dumazet 			}
5561da177e4SLinus Torvalds 			goto out;
5571da177e4SLinus Torvalds 		}
5581da177e4SLinus Torvalds 
5591da177e4SLinus Torvalds 		err = icmp_err_convert[code].errno;
560f7456642SEric Dumazet 		/* check if this ICMP message allows revert of backoff.
561f7456642SEric Dumazet 		 * (see RFC 6069)
562f7456642SEric Dumazet 		 */
563f7456642SEric Dumazet 		if (!fastopen &&
564f7456642SEric Dumazet 		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
565f7456642SEric Dumazet 			tcp_ld_RTO_revert(sk, seq);
5661da177e4SLinus Torvalds 		break;
5671da177e4SLinus Torvalds 	case ICMP_TIME_EXCEEDED:
5681da177e4SLinus Torvalds 		err = EHOSTUNREACH;
5691da177e4SLinus Torvalds 		break;
5701da177e4SLinus Torvalds 	default:
5711da177e4SLinus Torvalds 		goto out;
5721da177e4SLinus Torvalds 	}
5731da177e4SLinus Torvalds 
5741da177e4SLinus Torvalds 	switch (sk->sk_state) {
5751da177e4SLinus Torvalds 	case TCP_SYN_SENT:
5760a672f74SYuchung Cheng 	case TCP_SYN_RECV:
5770a672f74SYuchung Cheng 		/* Only in fast or simultaneous open. If a fast open socket is
5780a672f74SYuchung Cheng 		 * already accepted it is treated as a connected one below.
5791da177e4SLinus Torvalds 		 */
58051456b29SIan Morris 		if (fastopen && !fastopen->sk)
5810a672f74SYuchung Cheng 			break;
5820a672f74SYuchung Cheng 
583a12daf13SEric Dumazet 		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
58445af29caSEric Dumazet 
5851da177e4SLinus Torvalds 		if (!sock_owned_by_user(sk)) {
5861da177e4SLinus Torvalds 			sk->sk_err = err;
5871da177e4SLinus Torvalds 
5881da177e4SLinus Torvalds 			sk->sk_error_report(sk);
5891da177e4SLinus Torvalds 
5901da177e4SLinus Torvalds 			tcp_done(sk);
5911da177e4SLinus Torvalds 		} else {
5921da177e4SLinus Torvalds 			sk->sk_err_soft = err;
5931da177e4SLinus Torvalds 		}
5941da177e4SLinus Torvalds 		goto out;
5951da177e4SLinus Torvalds 	}
5961da177e4SLinus Torvalds 
5971da177e4SLinus Torvalds 	/* If we've already connected we will keep trying
5981da177e4SLinus Torvalds 	 * until we time out, or the user gives up.
5991da177e4SLinus Torvalds 	 *
6001da177e4SLinus Torvalds 	 * rfc1122 4.2.3.9 allows to consider as hard errors
6011da177e4SLinus Torvalds 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
6021da177e4SLinus Torvalds 	 * but it is obsoleted by pmtu discovery).
6031da177e4SLinus Torvalds 	 *
6041da177e4SLinus Torvalds 	 * Note, that in modern internet, where routing is unreliable
6051da177e4SLinus Torvalds 	 * and in each dark corner broken firewalls sit, sending random
6061da177e4SLinus Torvalds 	 * errors ordered by their masters even this two messages finally lose
6071da177e4SLinus Torvalds 	 * their original sense (even Linux sends invalid PORT_UNREACHs)
6081da177e4SLinus Torvalds 	 *
6091da177e4SLinus Torvalds 	 * Now we are in compliance with RFCs.
6101da177e4SLinus Torvalds 	 *							--ANK (980905)
6111da177e4SLinus Torvalds 	 */
6121da177e4SLinus Torvalds 
6131da177e4SLinus Torvalds 	inet = inet_sk(sk);
6141da177e4SLinus Torvalds 	if (!sock_owned_by_user(sk) && inet->recverr) {
6151da177e4SLinus Torvalds 		sk->sk_err = err;
6161da177e4SLinus Torvalds 		sk->sk_error_report(sk);
6171da177e4SLinus Torvalds 	} else	{ /* Only an error on timeout */
6181da177e4SLinus Torvalds 		sk->sk_err_soft = err;
6191da177e4SLinus Torvalds 	}
6201da177e4SLinus Torvalds 
6211da177e4SLinus Torvalds out:
6221da177e4SLinus Torvalds 	bh_unlock_sock(sk);
6231da177e4SLinus Torvalds 	sock_put(sk);
62432bbd879SStefano Brivio 	return 0;
6251da177e4SLinus Torvalds }
6261da177e4SLinus Torvalds 
62728850dc7SDaniel Borkmann void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
6281da177e4SLinus Torvalds {
629aa8223c7SArnaldo Carvalho de Melo 	struct tcphdr *th = tcp_hdr(skb);
6301da177e4SLinus Torvalds 
631419f9f89SHerbert Xu 	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
632663ead3bSHerbert Xu 	skb->csum_start = skb_transport_header(skb) - skb->head;
633ff1dcadbSAl Viro 	skb->csum_offset = offsetof(struct tcphdr, check);
6341da177e4SLinus Torvalds }
6351da177e4SLinus Torvalds 
636419f9f89SHerbert Xu /* This routine computes an IPv4 TCP checksum. */
637bb296246SHerbert Xu void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
638419f9f89SHerbert Xu {
639cf533ea5SEric Dumazet 	const struct inet_sock *inet = inet_sk(sk);
640419f9f89SHerbert Xu 
641419f9f89SHerbert Xu 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
642419f9f89SHerbert Xu }
6434bc2f18bSEric Dumazet EXPORT_SYMBOL(tcp_v4_send_check);
644419f9f89SHerbert Xu 
6451da177e4SLinus Torvalds /*
6461da177e4SLinus Torvalds  *	This routine will send an RST to the other tcp.
6471da177e4SLinus Torvalds  *
6481da177e4SLinus Torvalds  *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
6491da177e4SLinus Torvalds  *		      for reset.
6501da177e4SLinus Torvalds  *	Answer: if a packet caused RST, it is not for a socket
6511da177e4SLinus Torvalds  *		existing in our system, if it is matched to a socket,
6521da177e4SLinus Torvalds  *		it is just duplicate segment or bug in other side's TCP.
6531da177e4SLinus Torvalds  *		So that we build reply only basing on parameters
6541da177e4SLinus Torvalds  *		arrived with segment.
6551da177e4SLinus Torvalds  *	Exception: precedence violation. We do not implement it in any case.
6561da177e4SLinus Torvalds  */
6571da177e4SLinus Torvalds 
658a00e7444SEric Dumazet static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
6591da177e4SLinus Torvalds {
660cf533ea5SEric Dumazet 	const struct tcphdr *th = tcp_hdr(skb);
661cfb6eeb4SYOSHIFUJI Hideaki 	struct {
662cfb6eeb4SYOSHIFUJI Hideaki 		struct tcphdr th;
663cfb6eeb4SYOSHIFUJI Hideaki #ifdef CONFIG_TCP_MD5SIG
664714e85beSAl Viro 		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
665cfb6eeb4SYOSHIFUJI Hideaki #endif
666cfb6eeb4SYOSHIFUJI Hideaki 	} rep;
6671da177e4SLinus Torvalds 	struct ip_reply_arg arg;
668cfb6eeb4SYOSHIFUJI Hideaki #ifdef CONFIG_TCP_MD5SIG
669e46787f0SFlorian Westphal 	struct tcp_md5sig_key *key = NULL;
670658ddaafSShawn Lu 	const __u8 *hash_location = NULL;
671658ddaafSShawn Lu 	unsigned char newhash[16];
672658ddaafSShawn Lu 	int genhash;
673658ddaafSShawn Lu 	struct sock *sk1 = NULL;
674cfb6eeb4SYOSHIFUJI Hideaki #endif
675d6fb396cSEric Dumazet 	u64 transmit_time = 0;
67600483690SJon Maxwell 	struct sock *ctl_sk;
677d6fb396cSEric Dumazet 	struct net *net;
6781da177e4SLinus Torvalds 
6791da177e4SLinus Torvalds 	/* Never send a reset in response to a reset. */
6801da177e4SLinus Torvalds 	if (th->rst)
6811da177e4SLinus Torvalds 		return;
6821da177e4SLinus Torvalds 
683c3658e8dSEric Dumazet 	/* If sk not NULL, it means we did a successful lookup and incoming
684c3658e8dSEric Dumazet 	 * route had to be correct. prequeue might have dropped our dst.
685c3658e8dSEric Dumazet 	 */
686c3658e8dSEric Dumazet 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
6871da177e4SLinus Torvalds 		return;
6881da177e4SLinus Torvalds 
6891da177e4SLinus Torvalds 	/* Swap the send and the receive. */
690cfb6eeb4SYOSHIFUJI Hideaki 	memset(&rep, 0, sizeof(rep));
691cfb6eeb4SYOSHIFUJI Hideaki 	rep.th.dest   = th->source;
692cfb6eeb4SYOSHIFUJI Hideaki 	rep.th.source = th->dest;
693cfb6eeb4SYOSHIFUJI Hideaki 	rep.th.doff   = sizeof(struct tcphdr) / 4;
694cfb6eeb4SYOSHIFUJI Hideaki 	rep.th.rst    = 1;
6951da177e4SLinus Torvalds 
6961da177e4SLinus Torvalds 	if (th->ack) {
697cfb6eeb4SYOSHIFUJI Hideaki 		rep.th.seq = th->ack_seq;
6981da177e4SLinus Torvalds 	} else {
699cfb6eeb4SYOSHIFUJI Hideaki 		rep.th.ack = 1;
700cfb6eeb4SYOSHIFUJI Hideaki 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
7011da177e4SLinus Torvalds 				       skb->len - (th->doff << 2));
7021da177e4SLinus Torvalds 	}
7031da177e4SLinus Torvalds 
7047174259eSArnaldo Carvalho de Melo 	memset(&arg, 0, sizeof(arg));
705cfb6eeb4SYOSHIFUJI Hideaki 	arg.iov[0].iov_base = (unsigned char *)&rep;
706cfb6eeb4SYOSHIFUJI Hideaki 	arg.iov[0].iov_len  = sizeof(rep.th);
707cfb6eeb4SYOSHIFUJI Hideaki 
7080f85feaeSEric Dumazet 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
709cfb6eeb4SYOSHIFUJI Hideaki #ifdef CONFIG_TCP_MD5SIG
7103b24d854SEric Dumazet 	rcu_read_lock();
711658ddaafSShawn Lu 	hash_location = tcp_parse_md5sig_option(th);
712271c3b9bSFlorian Westphal 	if (sk && sk_fullsock(sk)) {
713cea97609SDavid Ahern 		const union tcp_md5_addr *addr;
714dea53bb8SDavid Ahern 		int l3index;
715cea97609SDavid Ahern 
716dea53bb8SDavid Ahern 		/* sdif set, means packet ingressed via a device
717dea53bb8SDavid Ahern 		 * in an L3 domain and inet_iif is set to it.
718dea53bb8SDavid Ahern 		 */
719dea53bb8SDavid Ahern 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
720cea97609SDavid Ahern 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
721dea53bb8SDavid Ahern 		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
722e46787f0SFlorian Westphal 	} else if (hash_location) {
723cea97609SDavid Ahern 		const union tcp_md5_addr *addr;
724534322caSDavid Ahern 		int sdif = tcp_v4_sdif(skb);
725534322caSDavid Ahern 		int dif = inet_iif(skb);
726dea53bb8SDavid Ahern 		int l3index;
727cea97609SDavid Ahern 
728658ddaafSShawn Lu 		/*
729658ddaafSShawn Lu 		 * active side is lost. Try to find listening socket through
730658ddaafSShawn Lu 		 * source port, and then find md5 key through listening socket.
731658ddaafSShawn Lu 		 * we are not loose security here:
732658ddaafSShawn Lu 		 * Incoming packet is checked with md5 hash with finding key,
733658ddaafSShawn Lu 		 * no RST generated if md5 hash doesn't match.
734658ddaafSShawn Lu 		 */
735a583636aSCraig Gallek 		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
736a583636aSCraig Gallek 					     ip_hdr(skb)->saddr,
737da5e3630STom Herbert 					     th->source, ip_hdr(skb)->daddr,
738534322caSDavid Ahern 					     ntohs(th->source), dif, sdif);
739658ddaafSShawn Lu 		/* don't send rst if it can't find key */
740658ddaafSShawn Lu 		if (!sk1)
7413b24d854SEric Dumazet 			goto out;
7423b24d854SEric Dumazet 
743dea53bb8SDavid Ahern 		/* sdif set, means packet ingressed via a device
744dea53bb8SDavid Ahern 		 * in an L3 domain and dif is set to it.
745dea53bb8SDavid Ahern 		 */
746dea53bb8SDavid Ahern 		l3index = sdif ? dif : 0;
747cea97609SDavid Ahern 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
748dea53bb8SDavid Ahern 		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
749658ddaafSShawn Lu 		if (!key)
7503b24d854SEric Dumazet 			goto out;
7513b24d854SEric Dumazet 
752658ddaafSShawn Lu 
75339f8e58eSEric Dumazet 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
754658ddaafSShawn Lu 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
7553b24d854SEric Dumazet 			goto out;
7563b24d854SEric Dumazet 
757658ddaafSShawn Lu 	}
758658ddaafSShawn Lu 
759cfb6eeb4SYOSHIFUJI Hideaki 	if (key) {
760cfb6eeb4SYOSHIFUJI Hideaki 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
761cfb6eeb4SYOSHIFUJI Hideaki 				   (TCPOPT_NOP << 16) |
762cfb6eeb4SYOSHIFUJI Hideaki 				   (TCPOPT_MD5SIG << 8) |
763cfb6eeb4SYOSHIFUJI Hideaki 				   TCPOLEN_MD5SIG);
764cfb6eeb4SYOSHIFUJI Hideaki 		/* Update length and the length the header thinks exists */
765cfb6eeb4SYOSHIFUJI Hideaki 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
766cfb6eeb4SYOSHIFUJI Hideaki 		rep.th.doff = arg.iov[0].iov_len / 4;
767cfb6eeb4SYOSHIFUJI Hideaki 
76849a72dfbSAdam Langley 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
76978e645cbSIlpo Järvinen 				     key, ip_hdr(skb)->saddr,
77078e645cbSIlpo Järvinen 				     ip_hdr(skb)->daddr, &rep.th);
771cfb6eeb4SYOSHIFUJI Hideaki 	}
772cfb6eeb4SYOSHIFUJI Hideaki #endif
773eddc9ec5SArnaldo Carvalho de Melo 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
774eddc9ec5SArnaldo Carvalho de Melo 				      ip_hdr(skb)->saddr, /* XXX */
77552cd5750SIlpo Järvinen 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
7761da177e4SLinus Torvalds 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
777271c3b9bSFlorian Westphal 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
778271c3b9bSFlorian Westphal 
779e2446eaaSShawn Lu 	/* When socket is gone, all binding information is lost.
7804c675258SAlexey Kuznetsov 	 * routing might fail in this case. No choice here, if we choose to force
7814c675258SAlexey Kuznetsov 	 * input interface, we will misroute in case of asymmetric route.
782e2446eaaSShawn Lu 	 */
783c24b14c4SSong Liu 	if (sk) {
7844c675258SAlexey Kuznetsov 		arg.bound_dev_if = sk->sk_bound_dev_if;
7855c487bb9SSong Liu 		if (sk_fullsock(sk))
786c24b14c4SSong Liu 			trace_tcp_send_reset(sk, skb);
787c24b14c4SSong Liu 	}
7881da177e4SLinus Torvalds 
789271c3b9bSFlorian Westphal 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
790271c3b9bSFlorian Westphal 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
791271c3b9bSFlorian Westphal 
79266b13d99SEric Dumazet 	arg.tos = ip_hdr(skb)->tos;
793e2d118a1SLorenzo Colitti 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
79447dcc20aSEric Dumazet 	local_bh_disable();
7955472c3c6SEric Dumazet 	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
796a842fe14SEric Dumazet 	if (sk) {
79700483690SJon Maxwell 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
79800483690SJon Maxwell 				   inet_twsk(sk)->tw_mark : sk->sk_mark;
799f6c0f5d2SEric Dumazet 		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
800f6c0f5d2SEric Dumazet 				   inet_twsk(sk)->tw_priority : sk->sk_priority;
801d6fb396cSEric Dumazet 		transmit_time = tcp_transmit_time(sk);
802a842fe14SEric Dumazet 	}
80300483690SJon Maxwell 	ip_send_unicast_reply(ctl_sk,
804bdbbb852SEric Dumazet 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
80524a2d43dSEric Dumazet 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
806d6fb396cSEric Dumazet 			      &arg, arg.iov[0].iov_len,
807d6fb396cSEric Dumazet 			      transmit_time);
8081da177e4SLinus Torvalds 
80900483690SJon Maxwell 	ctl_sk->sk_mark = 0;
81090bbcc60SEric Dumazet 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
81190bbcc60SEric Dumazet 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
81247dcc20aSEric Dumazet 	local_bh_enable();
813658ddaafSShawn Lu 
814658ddaafSShawn Lu #ifdef CONFIG_TCP_MD5SIG
8153b24d854SEric Dumazet out:
816658ddaafSShawn Lu 	rcu_read_unlock();
817658ddaafSShawn Lu #endif
8181da177e4SLinus Torvalds }
8191da177e4SLinus Torvalds 
8201da177e4SLinus Torvalds /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
8211da177e4SLinus Torvalds    outside socket context is ugly, certainly. What can I do?
8221da177e4SLinus Torvalds  */
8231da177e4SLinus Torvalds 
824e2d118a1SLorenzo Colitti static void tcp_v4_send_ack(const struct sock *sk,
825e62a123bSEric Dumazet 			    struct sk_buff *skb, u32 seq, u32 ack,
826ee684b6fSAndrey Vagin 			    u32 win, u32 tsval, u32 tsecr, int oif,
82788ef4a5aSKOVACS Krisztian 			    struct tcp_md5sig_key *key,
82866b13d99SEric Dumazet 			    int reply_flags, u8 tos)
8291da177e4SLinus Torvalds {
830cf533ea5SEric Dumazet 	const struct tcphdr *th = tcp_hdr(skb);
8311da177e4SLinus Torvalds 	struct {
8321da177e4SLinus Torvalds 		struct tcphdr th;
833714e85beSAl Viro 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
834cfb6eeb4SYOSHIFUJI Hideaki #ifdef CONFIG_TCP_MD5SIG
835cfb6eeb4SYOSHIFUJI Hideaki 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
836cfb6eeb4SYOSHIFUJI Hideaki #endif
837cfb6eeb4SYOSHIFUJI Hideaki 			];
8381da177e4SLinus Torvalds 	} rep;
839e2d118a1SLorenzo Colitti 	struct net *net = sock_net(sk);
8401da177e4SLinus Torvalds 	struct ip_reply_arg arg;
84100483690SJon Maxwell 	struct sock *ctl_sk;
842d6fb396cSEric Dumazet 	u64 transmit_time;
8431da177e4SLinus Torvalds 
8441da177e4SLinus Torvalds 	memset(&rep.th, 0, sizeof(struct tcphdr));
8457174259eSArnaldo Carvalho de Melo 	memset(&arg, 0, sizeof(arg));
8461da177e4SLinus Torvalds 
8471da177e4SLinus Torvalds 	arg.iov[0].iov_base = (unsigned char *)&rep;
8481da177e4SLinus Torvalds 	arg.iov[0].iov_len  = sizeof(rep.th);
849ee684b6fSAndrey Vagin 	if (tsecr) {
850cfb6eeb4SYOSHIFUJI Hideaki 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
8511da177e4SLinus Torvalds 				   (TCPOPT_TIMESTAMP << 8) |
8521da177e4SLinus Torvalds 				   TCPOLEN_TIMESTAMP);
853ee684b6fSAndrey Vagin 		rep.opt[1] = htonl(tsval);
854ee684b6fSAndrey Vagin 		rep.opt[2] = htonl(tsecr);
855cb48cfe8SCraig Schlenter 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
8561da177e4SLinus Torvalds 	}
8571da177e4SLinus Torvalds 
8581da177e4SLinus Torvalds 	/* Swap the send and the receive. */
8591da177e4SLinus Torvalds 	rep.th.dest    = th->source;
8601da177e4SLinus Torvalds 	rep.th.source  = th->dest;
8611da177e4SLinus Torvalds 	rep.th.doff    = arg.iov[0].iov_len / 4;
8621da177e4SLinus Torvalds 	rep.th.seq     = htonl(seq);
8631da177e4SLinus Torvalds 	rep.th.ack_seq = htonl(ack);
8641da177e4SLinus Torvalds 	rep.th.ack     = 1;
8651da177e4SLinus Torvalds 	rep.th.window  = htons(win);
8661da177e4SLinus Torvalds 
867cfb6eeb4SYOSHIFUJI Hideaki #ifdef CONFIG_TCP_MD5SIG
868cfb6eeb4SYOSHIFUJI Hideaki 	if (key) {
869ee684b6fSAndrey Vagin 		int offset = (tsecr) ? 3 : 0;
870cfb6eeb4SYOSHIFUJI Hideaki 
871cfb6eeb4SYOSHIFUJI Hideaki 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
872cfb6eeb4SYOSHIFUJI Hideaki 					  (TCPOPT_NOP << 16) |
873cfb6eeb4SYOSHIFUJI Hideaki 					  (TCPOPT_MD5SIG << 8) |
874cfb6eeb4SYOSHIFUJI Hideaki 					  TCPOLEN_MD5SIG);
875cfb6eeb4SYOSHIFUJI Hideaki 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
876cfb6eeb4SYOSHIFUJI Hideaki 		rep.th.doff = arg.iov[0].iov_len/4;
877cfb6eeb4SYOSHIFUJI Hideaki 
87849a72dfbSAdam Langley 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
87990b7e112SAdam Langley 				    key, ip_hdr(skb)->saddr,
88090b7e112SAdam Langley 				    ip_hdr(skb)->daddr, &rep.th);
881cfb6eeb4SYOSHIFUJI Hideaki 	}
882cfb6eeb4SYOSHIFUJI Hideaki #endif
88388ef4a5aSKOVACS Krisztian 	arg.flags = reply_flags;
884eddc9ec5SArnaldo Carvalho de Melo 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
885eddc9ec5SArnaldo Carvalho de Melo 				      ip_hdr(skb)->saddr, /* XXX */
8861da177e4SLinus Torvalds 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
8871da177e4SLinus Torvalds 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
8889501f972SYOSHIFUJI Hideaki 	if (oif)
8899501f972SYOSHIFUJI Hideaki 		arg.bound_dev_if = oif;
89066b13d99SEric Dumazet 	arg.tos = tos;
891e2d118a1SLorenzo Colitti 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
89247dcc20aSEric Dumazet 	local_bh_disable();
8935472c3c6SEric Dumazet 	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
89400483690SJon Maxwell 	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
89500483690SJon Maxwell 			   inet_twsk(sk)->tw_mark : sk->sk_mark;
896f6c0f5d2SEric Dumazet 	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
897f6c0f5d2SEric Dumazet 			   inet_twsk(sk)->tw_priority : sk->sk_priority;
898d6fb396cSEric Dumazet 	transmit_time = tcp_transmit_time(sk);
89900483690SJon Maxwell 	ip_send_unicast_reply(ctl_sk,
900bdbbb852SEric Dumazet 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
90124a2d43dSEric Dumazet 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
902d6fb396cSEric Dumazet 			      &arg, arg.iov[0].iov_len,
903d6fb396cSEric Dumazet 			      transmit_time);
9041da177e4SLinus Torvalds 
90500483690SJon Maxwell 	ctl_sk->sk_mark = 0;
90690bbcc60SEric Dumazet 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
90747dcc20aSEric Dumazet 	local_bh_enable();
9081da177e4SLinus Torvalds }
9091da177e4SLinus Torvalds 
9101da177e4SLinus Torvalds static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
9111da177e4SLinus Torvalds {
9128feaf0c0SArnaldo Carvalho de Melo 	struct inet_timewait_sock *tw = inet_twsk(sk);
913cfb6eeb4SYOSHIFUJI Hideaki 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
9141da177e4SLinus Torvalds 
915e2d118a1SLorenzo Colitti 	tcp_v4_send_ack(sk, skb,
916e62a123bSEric Dumazet 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
9177174259eSArnaldo Carvalho de Melo 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
9189a568de4SEric Dumazet 			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
9199501f972SYOSHIFUJI Hideaki 			tcptw->tw_ts_recent,
9209501f972SYOSHIFUJI Hideaki 			tw->tw_bound_dev_if,
92188ef4a5aSKOVACS Krisztian 			tcp_twsk_md5_key(tcptw),
92266b13d99SEric Dumazet 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
92366b13d99SEric Dumazet 			tw->tw_tos
9249501f972SYOSHIFUJI Hideaki 			);
9251da177e4SLinus Torvalds 
9268feaf0c0SArnaldo Carvalho de Melo 	inet_twsk_put(tw);
9271da177e4SLinus Torvalds }
9281da177e4SLinus Torvalds 
929a00e7444SEric Dumazet static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
9307174259eSArnaldo Carvalho de Melo 				  struct request_sock *req)
9311da177e4SLinus Torvalds {
932cea97609SDavid Ahern 	const union tcp_md5_addr *addr;
933dea53bb8SDavid Ahern 	int l3index;
934cea97609SDavid Ahern 
935168a8f58SJerry Chu 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
936168a8f58SJerry Chu 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
937168a8f58SJerry Chu 	 */
938e62a123bSEric Dumazet 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
939e62a123bSEric Dumazet 					     tcp_sk(sk)->snd_nxt;
940e62a123bSEric Dumazet 
94120a2b49fSEric Dumazet 	/* RFC 7323 2.3
94220a2b49fSEric Dumazet 	 * The window field (SEG.WND) of every outgoing segment, with the
94320a2b49fSEric Dumazet 	 * exception of <SYN> segments, MUST be right-shifted by
94420a2b49fSEric Dumazet 	 * Rcv.Wind.Shift bits:
94520a2b49fSEric Dumazet 	 */
946cea97609SDavid Ahern 	addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
947dea53bb8SDavid Ahern 	l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
948e2d118a1SLorenzo Colitti 	tcp_v4_send_ack(sk, skb, seq,
94920a2b49fSEric Dumazet 			tcp_rsk(req)->rcv_nxt,
95020a2b49fSEric Dumazet 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
9519a568de4SEric Dumazet 			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
9529501f972SYOSHIFUJI Hideaki 			req->ts_recent,
9539501f972SYOSHIFUJI Hideaki 			0,
954dea53bb8SDavid Ahern 			tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
95566b13d99SEric Dumazet 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
95666b13d99SEric Dumazet 			ip_hdr(skb)->tos);
9571da177e4SLinus Torvalds }
9581da177e4SLinus Torvalds 
9591da177e4SLinus Torvalds /*
9609bf1d83eSKris Katterjohn  *	Send a SYN-ACK after having received a SYN.
96160236fddSArnaldo Carvalho de Melo  *	This still operates on a request_sock only, not on a big
9621da177e4SLinus Torvalds  *	socket.
9631da177e4SLinus Torvalds  */
9640f935dbeSEric Dumazet static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
965d6274bd8SOctavian Purdila 			      struct flowi *fl,
966e6b4d113SWilliam Allen Simpson 			      struct request_sock *req,
967ca6fb065SEric Dumazet 			      struct tcp_fastopen_cookie *foc,
968b3d05147SEric Dumazet 			      enum tcp_synack_type synack_type)
9691da177e4SLinus Torvalds {
9702e6599cbSArnaldo Carvalho de Melo 	const struct inet_request_sock *ireq = inet_rsk(req);
9716bd023f3SDavid S. Miller 	struct flowi4 fl4;
9721da177e4SLinus Torvalds 	int err = -1;
9731da177e4SLinus Torvalds 	struct sk_buff *skb;
9741da177e4SLinus Torvalds 
9751da177e4SLinus Torvalds 	/* First, grab a route. */
976ba3f7f04SDavid S. Miller 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
977fd80eb94SDenis V. Lunev 		return -1;
9781da177e4SLinus Torvalds 
979b3d05147SEric Dumazet 	skb = tcp_make_synack(sk, dst, req, foc, synack_type);
9801da177e4SLinus Torvalds 
9811da177e4SLinus Torvalds 	if (skb) {
982634fb979SEric Dumazet 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
9831da177e4SLinus Torvalds 
9842ab2ddd3SEric Dumazet 		rcu_read_lock();
985634fb979SEric Dumazet 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
986634fb979SEric Dumazet 					    ireq->ir_rmt_addr,
9872ab2ddd3SEric Dumazet 					    rcu_dereference(ireq->ireq_opt));
9882ab2ddd3SEric Dumazet 		rcu_read_unlock();
989b9df3cb8SGerrit Renker 		err = net_xmit_eval(err);
9901da177e4SLinus Torvalds 	}
9911da177e4SLinus Torvalds 
9921da177e4SLinus Torvalds 	return err;
9931da177e4SLinus Torvalds }
9941da177e4SLinus Torvalds 
9951da177e4SLinus Torvalds /*
99660236fddSArnaldo Carvalho de Melo  *	IPv4 request_sock destructor.
9971da177e4SLinus Torvalds  */
99860236fddSArnaldo Carvalho de Melo static void tcp_v4_reqsk_destructor(struct request_sock *req)
9991da177e4SLinus Torvalds {
1000c92e8c02SEric Dumazet 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
10011da177e4SLinus Torvalds }
10021da177e4SLinus Torvalds 
1003cfb6eeb4SYOSHIFUJI Hideaki #ifdef CONFIG_TCP_MD5SIG
1004cfb6eeb4SYOSHIFUJI Hideaki /*
1005cfb6eeb4SYOSHIFUJI Hideaki  * RFC2385 MD5 checksumming requires a mapping of
1006cfb6eeb4SYOSHIFUJI Hideaki  * IP address->MD5 Key.
1007cfb6eeb4SYOSHIFUJI Hideaki  * We need to maintain these in the sk structure.
1008cfb6eeb4SYOSHIFUJI Hideaki  */
1009cfb6eeb4SYOSHIFUJI Hideaki 
/* Static branch key, defined in the disabled ("false") state and exported.
 * NOTE(review): presumably flipped on once MD5 signing is first configured;
 * the enabling site is not in this section of the file -- confirm.
 */
DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
EXPORT_SYMBOL(tcp_md5_needed);
10126015c71eSEric Dumazet 
1013cfb6eeb4SYOSHIFUJI Hideaki /* Find the Key structure for an address.  */
1014dea53bb8SDavid Ahern struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1015a915da9bSEric Dumazet 					   const union tcp_md5_addr *addr,
1016a915da9bSEric Dumazet 					   int family)
1017cfb6eeb4SYOSHIFUJI Hideaki {
1018fd3a154aSEric Dumazet 	const struct tcp_sock *tp = tcp_sk(sk);
1019a915da9bSEric Dumazet 	struct tcp_md5sig_key *key;
1020fd3a154aSEric Dumazet 	const struct tcp_md5sig_info *md5sig;
10216797318eSIvan Delalande 	__be32 mask;
10226797318eSIvan Delalande 	struct tcp_md5sig_key *best_match = NULL;
10236797318eSIvan Delalande 	bool match;
1024cfb6eeb4SYOSHIFUJI Hideaki 
1025a8afca03SEric Dumazet 	/* caller either holds rcu_read_lock() or socket lock */
1026a8afca03SEric Dumazet 	md5sig = rcu_dereference_check(tp->md5sig_info,
10271e1d04e6SHannes Frederic Sowa 				       lockdep_sock_is_held(sk));
1028a8afca03SEric Dumazet 	if (!md5sig)
1029cfb6eeb4SYOSHIFUJI Hideaki 		return NULL;
1030083a0326SArnd Bergmann 
1031c8b91770SAmol Grover 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1032c8b91770SAmol Grover 				 lockdep_sock_is_held(sk)) {
1033a915da9bSEric Dumazet 		if (key->family != family)
1034a915da9bSEric Dumazet 			continue;
1035dea53bb8SDavid Ahern 		if (key->l3index && key->l3index != l3index)
1036dea53bb8SDavid Ahern 			continue;
10376797318eSIvan Delalande 		if (family == AF_INET) {
10386797318eSIvan Delalande 			mask = inet_make_mask(key->prefixlen);
10396797318eSIvan Delalande 			match = (key->addr.a4.s_addr & mask) ==
10406797318eSIvan Delalande 				(addr->a4.s_addr & mask);
10416797318eSIvan Delalande #if IS_ENABLED(CONFIG_IPV6)
10426797318eSIvan Delalande 		} else if (family == AF_INET6) {
10436797318eSIvan Delalande 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
10446797318eSIvan Delalande 						  key->prefixlen);
10456797318eSIvan Delalande #endif
10466797318eSIvan Delalande 		} else {
10476797318eSIvan Delalande 			match = false;
10486797318eSIvan Delalande 		}
10496797318eSIvan Delalande 
10506797318eSIvan Delalande 		if (match && (!best_match ||
10516797318eSIvan Delalande 			      key->prefixlen > best_match->prefixlen))
10526797318eSIvan Delalande 			best_match = key;
10536797318eSIvan Delalande 	}
10546797318eSIvan Delalande 	return best_match;
10556797318eSIvan Delalande }
10566015c71eSEric Dumazet EXPORT_SYMBOL(__tcp_md5_do_lookup);
10576797318eSIvan Delalande 
1058e8f37d57SWu Fengguang static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
10596797318eSIvan Delalande 						      const union tcp_md5_addr *addr,
1060dea53bb8SDavid Ahern 						      int family, u8 prefixlen,
1061dea53bb8SDavid Ahern 						      int l3index)
10626797318eSIvan Delalande {
10636797318eSIvan Delalande 	const struct tcp_sock *tp = tcp_sk(sk);
10646797318eSIvan Delalande 	struct tcp_md5sig_key *key;
10656797318eSIvan Delalande 	unsigned int size = sizeof(struct in_addr);
10666797318eSIvan Delalande 	const struct tcp_md5sig_info *md5sig;
10676797318eSIvan Delalande 
10686797318eSIvan Delalande 	/* caller either holds rcu_read_lock() or socket lock */
10696797318eSIvan Delalande 	md5sig = rcu_dereference_check(tp->md5sig_info,
10706797318eSIvan Delalande 				       lockdep_sock_is_held(sk));
10716797318eSIvan Delalande 	if (!md5sig)
10726797318eSIvan Delalande 		return NULL;
10736797318eSIvan Delalande #if IS_ENABLED(CONFIG_IPV6)
10746797318eSIvan Delalande 	if (family == AF_INET6)
10756797318eSIvan Delalande 		size = sizeof(struct in6_addr);
10766797318eSIvan Delalande #endif
1077c8b91770SAmol Grover 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1078c8b91770SAmol Grover 				 lockdep_sock_is_held(sk)) {
10796797318eSIvan Delalande 		if (key->family != family)
10806797318eSIvan Delalande 			continue;
1081dea53bb8SDavid Ahern 		if (key->l3index && key->l3index != l3index)
1082dea53bb8SDavid Ahern 			continue;
10836797318eSIvan Delalande 		if (!memcmp(&key->addr, addr, size) &&
10846797318eSIvan Delalande 		    key->prefixlen == prefixlen)
1085a915da9bSEric Dumazet 			return key;
1086cfb6eeb4SYOSHIFUJI Hideaki 	}
1087cfb6eeb4SYOSHIFUJI Hideaki 	return NULL;
1088cfb6eeb4SYOSHIFUJI Hideaki }
1089cfb6eeb4SYOSHIFUJI Hideaki 
1090b83e3debSEric Dumazet struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1091fd3a154aSEric Dumazet 					 const struct sock *addr_sk)
1092cfb6eeb4SYOSHIFUJI Hideaki {
1093b52e6921SEric Dumazet 	const union tcp_md5_addr *addr;
1094dea53bb8SDavid Ahern 	int l3index;
1095a915da9bSEric Dumazet 
1096dea53bb8SDavid Ahern 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1097dea53bb8SDavid Ahern 						 addr_sk->sk_bound_dev_if);
1098b52e6921SEric Dumazet 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1099dea53bb8SDavid Ahern 	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1100cfb6eeb4SYOSHIFUJI Hideaki }
1101cfb6eeb4SYOSHIFUJI Hideaki EXPORT_SYMBOL(tcp_v4_md5_lookup);
1102cfb6eeb4SYOSHIFUJI Hideaki 
1103cfb6eeb4SYOSHIFUJI Hideaki /* This can be called on a newly created socket, from other files */
1104a915da9bSEric Dumazet int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1105dea53bb8SDavid Ahern 		   int family, u8 prefixlen, int l3index,
1106dea53bb8SDavid Ahern 		   const u8 *newkey, u8 newkeylen, gfp_t gfp)
1107cfb6eeb4SYOSHIFUJI Hideaki {
1108cfb6eeb4SYOSHIFUJI Hideaki 	/* Add Key to the list */
1109b0a713e9SMatthias M. Dellweg 	struct tcp_md5sig_key *key;
1110cfb6eeb4SYOSHIFUJI Hideaki 	struct tcp_sock *tp = tcp_sk(sk);
1111f6685938SArnaldo Carvalho de Melo 	struct tcp_md5sig_info *md5sig;
1112f6685938SArnaldo Carvalho de Melo 
1113dea53bb8SDavid Ahern 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1114a915da9bSEric Dumazet 	if (key) {
1115e6ced831SEric Dumazet 		/* Pre-existing entry - just update that one.
1116e6ced831SEric Dumazet 		 * Note that the key might be used concurrently.
1117e6ced831SEric Dumazet 		 * data_race() is telling kcsan that we do not care of
1118e6ced831SEric Dumazet 		 * key mismatches, since changing MD5 key on live flows
1119e6ced831SEric Dumazet 		 * can lead to packet drops.
1120e6ced831SEric Dumazet 		 */
1121e6ced831SEric Dumazet 		data_race(memcpy(key->key, newkey, newkeylen));
11226a2febecSEric Dumazet 
1123e6ced831SEric Dumazet 		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
1124e6ced831SEric Dumazet 		 * Also note that a reader could catch new key->keylen value
1125e6ced831SEric Dumazet 		 * but old key->key[], this is the reason we use __GFP_ZERO
1126e6ced831SEric Dumazet 		 * at sock_kmalloc() time below these lines.
1127e6ced831SEric Dumazet 		 */
1128e6ced831SEric Dumazet 		WRITE_ONCE(key->keylen, newkeylen);
11296a2febecSEric Dumazet 
1130a915da9bSEric Dumazet 		return 0;
1131cfb6eeb4SYOSHIFUJI Hideaki 	}
1132260fcbebSYan, Zheng 
1133a8afca03SEric Dumazet 	md5sig = rcu_dereference_protected(tp->md5sig_info,
11341e1d04e6SHannes Frederic Sowa 					   lockdep_sock_is_held(sk));
1135a915da9bSEric Dumazet 	if (!md5sig) {
1136a915da9bSEric Dumazet 		md5sig = kmalloc(sizeof(*md5sig), gfp);
1137a915da9bSEric Dumazet 		if (!md5sig)
1138a915da9bSEric Dumazet 			return -ENOMEM;
1139a915da9bSEric Dumazet 
1140a915da9bSEric Dumazet 		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1141a915da9bSEric Dumazet 		INIT_HLIST_HEAD(&md5sig->head);
1142a8afca03SEric Dumazet 		rcu_assign_pointer(tp->md5sig_info, md5sig);
1143a915da9bSEric Dumazet 	}
1144a915da9bSEric Dumazet 
1145e6ced831SEric Dumazet 	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1146a915da9bSEric Dumazet 	if (!key)
1147a915da9bSEric Dumazet 		return -ENOMEM;
114871cea17eSEric Dumazet 	if (!tcp_alloc_md5sig_pool()) {
11495f3d9cb2SEric Dumazet 		sock_kfree_s(sk, key, sizeof(*key));
1150cfb6eeb4SYOSHIFUJI Hideaki 		return -ENOMEM;
1151cfb6eeb4SYOSHIFUJI Hideaki 	}
1152f6685938SArnaldo Carvalho de Melo 
1153a915da9bSEric Dumazet 	memcpy(key->key, newkey, newkeylen);
1154a915da9bSEric Dumazet 	key->keylen = newkeylen;
1155a915da9bSEric Dumazet 	key->family = family;
11566797318eSIvan Delalande 	key->prefixlen = prefixlen;
1157dea53bb8SDavid Ahern 	key->l3index = l3index;
1158a915da9bSEric Dumazet 	memcpy(&key->addr, addr,
1159a915da9bSEric Dumazet 	       (family == AF_INET6) ? sizeof(struct in6_addr) :
1160a915da9bSEric Dumazet 				      sizeof(struct in_addr));
1161a915da9bSEric Dumazet 	hlist_add_head_rcu(&key->node, &md5sig->head);
1162cfb6eeb4SYOSHIFUJI Hideaki 	return 0;
1163cfb6eeb4SYOSHIFUJI Hideaki }
1164a915da9bSEric Dumazet EXPORT_SYMBOL(tcp_md5_do_add);
1165cfb6eeb4SYOSHIFUJI Hideaki 
11666797318eSIvan Delalande int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1167dea53bb8SDavid Ahern 		   u8 prefixlen, int l3index)
1168cfb6eeb4SYOSHIFUJI Hideaki {
1169a915da9bSEric Dumazet 	struct tcp_md5sig_key *key;
1170cfb6eeb4SYOSHIFUJI Hideaki 
1171dea53bb8SDavid Ahern 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1172a915da9bSEric Dumazet 	if (!key)
1173cfb6eeb4SYOSHIFUJI Hideaki 		return -ENOENT;
1174a915da9bSEric Dumazet 	hlist_del_rcu(&key->node);
11755f3d9cb2SEric Dumazet 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1176a915da9bSEric Dumazet 	kfree_rcu(key, rcu);
1177a915da9bSEric Dumazet 	return 0;
1178cfb6eeb4SYOSHIFUJI Hideaki }
1179a915da9bSEric Dumazet EXPORT_SYMBOL(tcp_md5_do_del);
1180cfb6eeb4SYOSHIFUJI Hideaki 
1181e0683e70Sstephen hemminger static void tcp_clear_md5_list(struct sock *sk)
1182cfb6eeb4SYOSHIFUJI Hideaki {
1183cfb6eeb4SYOSHIFUJI Hideaki 	struct tcp_sock *tp = tcp_sk(sk);
1184a915da9bSEric Dumazet 	struct tcp_md5sig_key *key;
1185b67bfe0dSSasha Levin 	struct hlist_node *n;
1186a8afca03SEric Dumazet 	struct tcp_md5sig_info *md5sig;
1187cfb6eeb4SYOSHIFUJI Hideaki 
1188a8afca03SEric Dumazet 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1189a8afca03SEric Dumazet 
1190b67bfe0dSSasha Levin 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1191a915da9bSEric Dumazet 		hlist_del_rcu(&key->node);
11925f3d9cb2SEric Dumazet 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1193a915da9bSEric Dumazet 		kfree_rcu(key, rcu);
1194cfb6eeb4SYOSHIFUJI Hideaki 	}
1195cfb6eeb4SYOSHIFUJI Hideaki }
1196cfb6eeb4SYOSHIFUJI Hideaki 
11978917a777SIvan Delalande static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1198d4c19c49SChristoph Hellwig 				 sockptr_t optval, int optlen)
1199cfb6eeb4SYOSHIFUJI Hideaki {
1200cfb6eeb4SYOSHIFUJI Hideaki 	struct tcp_md5sig cmd;
1201cfb6eeb4SYOSHIFUJI Hideaki 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1202cea97609SDavid Ahern 	const union tcp_md5_addr *addr;
12038917a777SIvan Delalande 	u8 prefixlen = 32;
1204dea53bb8SDavid Ahern 	int l3index = 0;
1205cfb6eeb4SYOSHIFUJI Hideaki 
1206cfb6eeb4SYOSHIFUJI Hideaki 	if (optlen < sizeof(cmd))
1207cfb6eeb4SYOSHIFUJI Hideaki 		return -EINVAL;
1208cfb6eeb4SYOSHIFUJI Hideaki 
1209d4c19c49SChristoph Hellwig 	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1210cfb6eeb4SYOSHIFUJI Hideaki 		return -EFAULT;
1211cfb6eeb4SYOSHIFUJI Hideaki 
1212cfb6eeb4SYOSHIFUJI Hideaki 	if (sin->sin_family != AF_INET)
1213cfb6eeb4SYOSHIFUJI Hideaki 		return -EINVAL;
1214cfb6eeb4SYOSHIFUJI Hideaki 
12158917a777SIvan Delalande 	if (optname == TCP_MD5SIG_EXT &&
12168917a777SIvan Delalande 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
12178917a777SIvan Delalande 		prefixlen = cmd.tcpm_prefixlen;
12188917a777SIvan Delalande 		if (prefixlen > 32)
12198917a777SIvan Delalande 			return -EINVAL;
12208917a777SIvan Delalande 	}
12218917a777SIvan Delalande 
12226b102db5SDavid Ahern 	if (optname == TCP_MD5SIG_EXT &&
12236b102db5SDavid Ahern 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
12246b102db5SDavid Ahern 		struct net_device *dev;
12256b102db5SDavid Ahern 
12266b102db5SDavid Ahern 		rcu_read_lock();
12276b102db5SDavid Ahern 		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
12286b102db5SDavid Ahern 		if (dev && netif_is_l3_master(dev))
12296b102db5SDavid Ahern 			l3index = dev->ifindex;
12306b102db5SDavid Ahern 
12316b102db5SDavid Ahern 		rcu_read_unlock();
12326b102db5SDavid Ahern 
12336b102db5SDavid Ahern 		/* ok to reference set/not set outside of rcu;
12346b102db5SDavid Ahern 		 * right now device MUST be an L3 master
12356b102db5SDavid Ahern 		 */
12366b102db5SDavid Ahern 		if (!dev || !l3index)
12376b102db5SDavid Ahern 			return -EINVAL;
12386b102db5SDavid Ahern 	}
12396b102db5SDavid Ahern 
1240cea97609SDavid Ahern 	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1241cea97609SDavid Ahern 
124264a124edSDmitry Popov 	if (!cmd.tcpm_keylen)
1243dea53bb8SDavid Ahern 		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index);
1244cfb6eeb4SYOSHIFUJI Hideaki 
1245cfb6eeb4SYOSHIFUJI Hideaki 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1246cfb6eeb4SYOSHIFUJI Hideaki 		return -EINVAL;
1247cfb6eeb4SYOSHIFUJI Hideaki 
1248dea53bb8SDavid Ahern 	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index,
1249cea97609SDavid Ahern 			      cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1250cfb6eeb4SYOSHIFUJI Hideaki }
1251cfb6eeb4SYOSHIFUJI Hideaki 
125219689e38SEric Dumazet static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
125319689e38SEric Dumazet 				   __be32 daddr, __be32 saddr,
125419689e38SEric Dumazet 				   const struct tcphdr *th, int nbytes)
1255cfb6eeb4SYOSHIFUJI Hideaki {
1256cfb6eeb4SYOSHIFUJI Hideaki 	struct tcp4_pseudohdr *bp;
125749a72dfbSAdam Langley 	struct scatterlist sg;
125819689e38SEric Dumazet 	struct tcphdr *_th;
1259cfb6eeb4SYOSHIFUJI Hideaki 
126019689e38SEric Dumazet 	bp = hp->scratch;
1261cfb6eeb4SYOSHIFUJI Hideaki 	bp->saddr = saddr;
1262cfb6eeb4SYOSHIFUJI Hideaki 	bp->daddr = daddr;
1263cfb6eeb4SYOSHIFUJI Hideaki 	bp->pad = 0;
1264076fb722SYOSHIFUJI Hideaki 	bp->protocol = IPPROTO_TCP;
126549a72dfbSAdam Langley 	bp->len = cpu_to_be16(nbytes);
1266c7da57a1SDavid S. Miller 
126719689e38SEric Dumazet 	_th = (struct tcphdr *)(bp + 1);
126819689e38SEric Dumazet 	memcpy(_th, th, sizeof(*th));
126919689e38SEric Dumazet 	_th->check = 0;
127019689e38SEric Dumazet 
127119689e38SEric Dumazet 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
127219689e38SEric Dumazet 	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
127319689e38SEric Dumazet 				sizeof(*bp) + sizeof(*th));
1274cf80e0e4SHerbert Xu 	return crypto_ahash_update(hp->md5_req);
127549a72dfbSAdam Langley }
127649a72dfbSAdam Langley 
1277a915da9bSEric Dumazet static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1278318cf7aaSEric Dumazet 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
127949a72dfbSAdam Langley {
128049a72dfbSAdam Langley 	struct tcp_md5sig_pool *hp;
1281cf80e0e4SHerbert Xu 	struct ahash_request *req;
128249a72dfbSAdam Langley 
128349a72dfbSAdam Langley 	hp = tcp_get_md5sig_pool();
128449a72dfbSAdam Langley 	if (!hp)
128549a72dfbSAdam Langley 		goto clear_hash_noput;
1286cf80e0e4SHerbert Xu 	req = hp->md5_req;
128749a72dfbSAdam Langley 
1288cf80e0e4SHerbert Xu 	if (crypto_ahash_init(req))
128949a72dfbSAdam Langley 		goto clear_hash;
129019689e38SEric Dumazet 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
129149a72dfbSAdam Langley 		goto clear_hash;
129249a72dfbSAdam Langley 	if (tcp_md5_hash_key(hp, key))
129349a72dfbSAdam Langley 		goto clear_hash;
1294cf80e0e4SHerbert Xu 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1295cf80e0e4SHerbert Xu 	if (crypto_ahash_final(req))
1296cfb6eeb4SYOSHIFUJI Hideaki 		goto clear_hash;
1297cfb6eeb4SYOSHIFUJI Hideaki 
1298cfb6eeb4SYOSHIFUJI Hideaki 	tcp_put_md5sig_pool();
1299cfb6eeb4SYOSHIFUJI Hideaki 	return 0;
130049a72dfbSAdam Langley 
1301cfb6eeb4SYOSHIFUJI Hideaki clear_hash:
1302cfb6eeb4SYOSHIFUJI Hideaki 	tcp_put_md5sig_pool();
1303cfb6eeb4SYOSHIFUJI Hideaki clear_hash_noput:
1304cfb6eeb4SYOSHIFUJI Hideaki 	memset(md5_hash, 0, 16);
130549a72dfbSAdam Langley 	return 1;
1306cfb6eeb4SYOSHIFUJI Hideaki }
1307cfb6eeb4SYOSHIFUJI Hideaki 
130839f8e58eSEric Dumazet int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
130939f8e58eSEric Dumazet 			const struct sock *sk,
1310318cf7aaSEric Dumazet 			const struct sk_buff *skb)
1311cfb6eeb4SYOSHIFUJI Hideaki {
131249a72dfbSAdam Langley 	struct tcp_md5sig_pool *hp;
1313cf80e0e4SHerbert Xu 	struct ahash_request *req;
1314318cf7aaSEric Dumazet 	const struct tcphdr *th = tcp_hdr(skb);
1315cfb6eeb4SYOSHIFUJI Hideaki 	__be32 saddr, daddr;
1316cfb6eeb4SYOSHIFUJI Hideaki 
131739f8e58eSEric Dumazet 	if (sk) { /* valid for establish/request sockets */
131839f8e58eSEric Dumazet 		saddr = sk->sk_rcv_saddr;
131939f8e58eSEric Dumazet 		daddr = sk->sk_daddr;
1320cfb6eeb4SYOSHIFUJI Hideaki 	} else {
132149a72dfbSAdam Langley 		const struct iphdr *iph = ip_hdr(skb);
132249a72dfbSAdam Langley 		saddr = iph->saddr;
132349a72dfbSAdam Langley 		daddr = iph->daddr;
1324cfb6eeb4SYOSHIFUJI Hideaki 	}
1325cfb6eeb4SYOSHIFUJI Hideaki 
132649a72dfbSAdam Langley 	hp = tcp_get_md5sig_pool();
132749a72dfbSAdam Langley 	if (!hp)
132849a72dfbSAdam Langley 		goto clear_hash_noput;
1329cf80e0e4SHerbert Xu 	req = hp->md5_req;
133049a72dfbSAdam Langley 
1331cf80e0e4SHerbert Xu 	if (crypto_ahash_init(req))
133249a72dfbSAdam Langley 		goto clear_hash;
133349a72dfbSAdam Langley 
133419689e38SEric Dumazet 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
133549a72dfbSAdam Langley 		goto clear_hash;
133649a72dfbSAdam Langley 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
133749a72dfbSAdam Langley 		goto clear_hash;
133849a72dfbSAdam Langley 	if (tcp_md5_hash_key(hp, key))
133949a72dfbSAdam Langley 		goto clear_hash;
1340cf80e0e4SHerbert Xu 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1341cf80e0e4SHerbert Xu 	if (crypto_ahash_final(req))
134249a72dfbSAdam Langley 		goto clear_hash;
134349a72dfbSAdam Langley 
134449a72dfbSAdam Langley 	tcp_put_md5sig_pool();
134549a72dfbSAdam Langley 	return 0;
134649a72dfbSAdam Langley 
134749a72dfbSAdam Langley clear_hash:
134849a72dfbSAdam Langley 	tcp_put_md5sig_pool();
134949a72dfbSAdam Langley clear_hash_noput:
135049a72dfbSAdam Langley 	memset(md5_hash, 0, 16);
135149a72dfbSAdam Langley 	return 1;
135249a72dfbSAdam Langley }
135349a72dfbSAdam Langley EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1354cfb6eeb4SYOSHIFUJI Hideaki 
1355ba8e275aSEric Dumazet #endif
1356ba8e275aSEric Dumazet 
1357ff74e23fSEric Dumazet /* Called with rcu_read_lock() */
1358ba8e275aSEric Dumazet static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1359534322caSDavid Ahern 				    const struct sk_buff *skb,
1360534322caSDavid Ahern 				    int dif, int sdif)
1361cfb6eeb4SYOSHIFUJI Hideaki {
1362ba8e275aSEric Dumazet #ifdef CONFIG_TCP_MD5SIG
1363cfb6eeb4SYOSHIFUJI Hideaki 	/*
1364cfb6eeb4SYOSHIFUJI Hideaki 	 * This gets called for each TCP segment that arrives
1365cfb6eeb4SYOSHIFUJI Hideaki 	 * so we want to be efficient.
1366cfb6eeb4SYOSHIFUJI Hideaki 	 * We have 3 drop cases:
1367cfb6eeb4SYOSHIFUJI Hideaki 	 * o No MD5 hash and one expected.
1368cfb6eeb4SYOSHIFUJI Hideaki 	 * o MD5 hash and we're not expecting one.
1369cfb6eeb4SYOSHIFUJI Hideaki 	 * o MD5 hash and its wrong.
1370cfb6eeb4SYOSHIFUJI Hideaki 	 */
1371cf533ea5SEric Dumazet 	const __u8 *hash_location = NULL;
1372cfb6eeb4SYOSHIFUJI Hideaki 	struct tcp_md5sig_key *hash_expected;
1373eddc9ec5SArnaldo Carvalho de Melo 	const struct iphdr *iph = ip_hdr(skb);
1374cf533ea5SEric Dumazet 	const struct tcphdr *th = tcp_hdr(skb);
1375cea97609SDavid Ahern 	const union tcp_md5_addr *addr;
1376cfb6eeb4SYOSHIFUJI Hideaki 	unsigned char newhash[16];
1377dea53bb8SDavid Ahern 	int genhash, l3index;
1378dea53bb8SDavid Ahern 
1379dea53bb8SDavid Ahern 	/* sdif set, means packet ingressed via a device
1380dea53bb8SDavid Ahern 	 * in an L3 domain and dif is set to the l3mdev
1381dea53bb8SDavid Ahern 	 */
1382dea53bb8SDavid Ahern 	l3index = sdif ? dif : 0;
1383cfb6eeb4SYOSHIFUJI Hideaki 
1384cea97609SDavid Ahern 	addr = (union tcp_md5_addr *)&iph->saddr;
1385dea53bb8SDavid Ahern 	hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
13867d5d5525SYOSHIFUJI Hideaki 	hash_location = tcp_parse_md5sig_option(th);
1387cfb6eeb4SYOSHIFUJI Hideaki 
1388cfb6eeb4SYOSHIFUJI Hideaki 	/* We've parsed the options - do we have a hash? */
1389cfb6eeb4SYOSHIFUJI Hideaki 	if (!hash_expected && !hash_location)
1390a2a385d6SEric Dumazet 		return false;
1391cfb6eeb4SYOSHIFUJI Hideaki 
1392cfb6eeb4SYOSHIFUJI Hideaki 	if (hash_expected && !hash_location) {
1393c10d9310SEric Dumazet 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1394a2a385d6SEric Dumazet 		return true;
1395cfb6eeb4SYOSHIFUJI Hideaki 	}
1396cfb6eeb4SYOSHIFUJI Hideaki 
1397cfb6eeb4SYOSHIFUJI Hideaki 	if (!hash_expected && hash_location) {
1398c10d9310SEric Dumazet 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1399a2a385d6SEric Dumazet 		return true;
1400cfb6eeb4SYOSHIFUJI Hideaki 	}
1401cfb6eeb4SYOSHIFUJI Hideaki 
1402cfb6eeb4SYOSHIFUJI Hideaki 	/* Okay, so this is hash_expected and hash_location -
1403cfb6eeb4SYOSHIFUJI Hideaki 	 * so we need to calculate the checksum.
1404cfb6eeb4SYOSHIFUJI Hideaki 	 */
140549a72dfbSAdam Langley 	genhash = tcp_v4_md5_hash_skb(newhash,
1406cfb6eeb4SYOSHIFUJI Hideaki 				      hash_expected,
140739f8e58eSEric Dumazet 				      NULL, skb);
1408cfb6eeb4SYOSHIFUJI Hideaki 
1409cfb6eeb4SYOSHIFUJI Hideaki 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
141072145a68SEric Dumazet 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1411dea53bb8SDavid Ahern 		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
1412673d57e7SHarvey Harrison 				     &iph->saddr, ntohs(th->source),
1413673d57e7SHarvey Harrison 				     &iph->daddr, ntohs(th->dest),
1414e87cc472SJoe Perches 				     genhash ? " tcp_v4_calc_md5_hash failed"
1415dea53bb8SDavid Ahern 				     : "", l3index);
1416a2a385d6SEric Dumazet 		return true;
1417cfb6eeb4SYOSHIFUJI Hideaki 	}
1418a2a385d6SEric Dumazet 	return false;
1419cfb6eeb4SYOSHIFUJI Hideaki #endif
1420ba8e275aSEric Dumazet 	return false;
1421ba8e275aSEric Dumazet }
1422cfb6eeb4SYOSHIFUJI Hideaki 
1423b40cf18eSEric Dumazet static void tcp_v4_init_req(struct request_sock *req,
1424b40cf18eSEric Dumazet 			    const struct sock *sk_listener,
142516bea70aSOctavian Purdila 			    struct sk_buff *skb)
142616bea70aSOctavian Purdila {
142716bea70aSOctavian Purdila 	struct inet_request_sock *ireq = inet_rsk(req);
1428c92e8c02SEric Dumazet 	struct net *net = sock_net(sk_listener);
142916bea70aSOctavian Purdila 
143008d2cc3bSEric Dumazet 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
143108d2cc3bSEric Dumazet 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1432c92e8c02SEric Dumazet 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
143316bea70aSOctavian Purdila }
143416bea70aSOctavian Purdila 
1435f964629eSEric Dumazet static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1436f964629eSEric Dumazet 					  struct flowi *fl,
14374396e461SSoheil Hassas Yeganeh 					  const struct request_sock *req)
1438d94e0417SOctavian Purdila {
14394396e461SSoheil Hassas Yeganeh 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1440d94e0417SOctavian Purdila }
1441d94e0417SOctavian Purdila 
144272a3effaSEric Dumazet struct request_sock_ops tcp_request_sock_ops __read_mostly = {
14431da177e4SLinus Torvalds 	.family		=	PF_INET,
14442e6599cbSArnaldo Carvalho de Melo 	.obj_size	=	sizeof(struct tcp_request_sock),
14455db92c99SOctavian Purdila 	.rtx_syn_ack	=	tcp_rtx_synack,
144660236fddSArnaldo Carvalho de Melo 	.send_ack	=	tcp_v4_reqsk_send_ack,
144760236fddSArnaldo Carvalho de Melo 	.destructor	=	tcp_v4_reqsk_destructor,
14481da177e4SLinus Torvalds 	.send_reset	=	tcp_v4_send_reset,
144972659eccSOctavian Purdila 	.syn_ack_timeout =	tcp_syn_ack_timeout,
14501da177e4SLinus Torvalds };
14511da177e4SLinus Torvalds 
145235b2c321SMat Martineau const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
14532aec4a29SOctavian Purdila 	.mss_clamp	=	TCP_MSS_DEFAULT,
145416bea70aSOctavian Purdila #ifdef CONFIG_TCP_MD5SIG
1455fd3a154aSEric Dumazet 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1456e3afe7b7SJohn Dykstra 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1457b6332e6cSAndrew Morton #endif
145816bea70aSOctavian Purdila 	.init_req	=	tcp_v4_init_req,
1459fb7b37a7SOctavian Purdila #ifdef CONFIG_SYN_COOKIES
1460fb7b37a7SOctavian Purdila 	.cookie_init_seq =	cookie_v4_init_sequence,
1461fb7b37a7SOctavian Purdila #endif
1462d94e0417SOctavian Purdila 	.route_req	=	tcp_v4_route_req,
146384b114b9SEric Dumazet 	.init_seq	=	tcp_v4_init_seq,
146484b114b9SEric Dumazet 	.init_ts_off	=	tcp_v4_init_ts_off,
1465d6274bd8SOctavian Purdila 	.send_synack	=	tcp_v4_send_synack,
146616bea70aSOctavian Purdila };
1467cfb6eeb4SYOSHIFUJI Hideaki 
14681da177e4SLinus Torvalds int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
14691da177e4SLinus Torvalds {
14701da177e4SLinus Torvalds 	/* Never answer to SYNs send to broadcast or multicast */
1471511c3f92SEric Dumazet 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
14721da177e4SLinus Torvalds 		goto drop;
14731da177e4SLinus Torvalds 
14741fb6f159SOctavian Purdila 	return tcp_conn_request(&tcp_request_sock_ops,
14751fb6f159SOctavian Purdila 				&tcp_request_sock_ipv4_ops, sk, skb);
14761da177e4SLinus Torvalds 
14771da177e4SLinus Torvalds drop:
14789caad864SEric Dumazet 	tcp_listendrop(sk);
14791da177e4SLinus Torvalds 	return 0;
14801da177e4SLinus Torvalds }
14814bc2f18bSEric Dumazet EXPORT_SYMBOL(tcp_v4_conn_request);
14821da177e4SLinus Torvalds 
14831da177e4SLinus Torvalds 
14841da177e4SLinus Torvalds /*
14851da177e4SLinus Torvalds  * The three way handshake has completed - we got a valid synack -
14861da177e4SLinus Torvalds  * now create the new socket.
14871da177e4SLinus Torvalds  */
14880c27171eSEric Dumazet struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
148960236fddSArnaldo Carvalho de Melo 				  struct request_sock *req,
14905e0724d0SEric Dumazet 				  struct dst_entry *dst,
14915e0724d0SEric Dumazet 				  struct request_sock *req_unhash,
14925e0724d0SEric Dumazet 				  bool *own_req)
14931da177e4SLinus Torvalds {
14942e6599cbSArnaldo Carvalho de Melo 	struct inet_request_sock *ireq;
14951da177e4SLinus Torvalds 	struct inet_sock *newinet;
14961da177e4SLinus Torvalds 	struct tcp_sock *newtp;
14971da177e4SLinus Torvalds 	struct sock *newsk;
1498cfb6eeb4SYOSHIFUJI Hideaki #ifdef CONFIG_TCP_MD5SIG
1499cea97609SDavid Ahern 	const union tcp_md5_addr *addr;
1500cfb6eeb4SYOSHIFUJI Hideaki 	struct tcp_md5sig_key *key;
1501dea53bb8SDavid Ahern 	int l3index;
1502cfb6eeb4SYOSHIFUJI Hideaki #endif
1503f6d8bd05SEric Dumazet 	struct ip_options_rcu *inet_opt;
15041da177e4SLinus Torvalds 
15051da177e4SLinus Torvalds 	if (sk_acceptq_is_full(sk))
15061da177e4SLinus Torvalds 		goto exit_overflow;
15071da177e4SLinus Torvalds 
15081da177e4SLinus Torvalds 	newsk = tcp_create_openreq_child(sk, req, skb);
15091da177e4SLinus Torvalds 	if (!newsk)
1510093d2823SBalazs Scheidler 		goto exit_nonewsk;
15111da177e4SLinus Torvalds 
1512bcd76111SHerbert Xu 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1513fae6ef87SNeal Cardwell 	inet_sk_rx_dst_set(newsk, skb);
15141da177e4SLinus Torvalds 
15151da177e4SLinus Torvalds 	newtp		      = tcp_sk(newsk);
15161da177e4SLinus Torvalds 	newinet		      = inet_sk(newsk);
15172e6599cbSArnaldo Carvalho de Melo 	ireq		      = inet_rsk(req);
1518d1e559d0SEric Dumazet 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1519d1e559d0SEric Dumazet 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
15206dd9a14eSDavid Ahern 	newsk->sk_bound_dev_if = ireq->ir_iif;
1521634fb979SEric Dumazet 	newinet->inet_saddr   = ireq->ir_loc_addr;
1522c92e8c02SEric Dumazet 	inet_opt	      = rcu_dereference(ireq->ireq_opt);
1523c92e8c02SEric Dumazet 	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1524463c84b9SArnaldo Carvalho de Melo 	newinet->mc_index     = inet_iif(skb);
1525eddc9ec5SArnaldo Carvalho de Melo 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
15264c507d28SJiri Benc 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1527d83d8461SArnaldo Carvalho de Melo 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1528f6d8bd05SEric Dumazet 	if (inet_opt)
1529f6d8bd05SEric Dumazet 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1530a904a069SEric Dumazet 	newinet->inet_id = prandom_u32();
15311da177e4SLinus Torvalds 
1532dfd25fffSEric Dumazet 	if (!dst) {
1533dfd25fffSEric Dumazet 		dst = inet_csk_route_child_sock(sk, newsk, req);
1534dfd25fffSEric Dumazet 		if (!dst)
15350e734419SDavid S. Miller 			goto put_and_exit;
1536dfd25fffSEric Dumazet 	} else {
1537dfd25fffSEric Dumazet 		/* syncookie case : see end of cookie_v4_check() */
1538dfd25fffSEric Dumazet 	}
15390e734419SDavid S. Miller 	sk_setup_caps(newsk, dst);
15400e734419SDavid S. Miller 
154181164413SDaniel Borkmann 	tcp_ca_openreq_child(newsk, dst);
154281164413SDaniel Borkmann 
15431da177e4SLinus Torvalds 	tcp_sync_mss(newsk, dst_mtu(dst));
15443541f9e8SEric Dumazet 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1545f5fff5dcSTom Quetchenbach 
15461da177e4SLinus Torvalds 	tcp_initialize_rcv_mss(newsk);
15471da177e4SLinus Torvalds 
1548cfb6eeb4SYOSHIFUJI Hideaki #ifdef CONFIG_TCP_MD5SIG
1549dea53bb8SDavid Ahern 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1550cfb6eeb4SYOSHIFUJI Hideaki 	/* Copy over the MD5 key from the original socket */
1551cea97609SDavid Ahern 	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1552dea53bb8SDavid Ahern 	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
155300db4124SIan Morris 	if (key) {
1554cfb6eeb4SYOSHIFUJI Hideaki 		/*
1555cfb6eeb4SYOSHIFUJI Hideaki 		 * We're using one, so create a matching key
1556cfb6eeb4SYOSHIFUJI Hideaki 		 * on the newsk structure. If we fail to get
1557cfb6eeb4SYOSHIFUJI Hideaki 		 * memory, then we end up not copying the key
1558cfb6eeb4SYOSHIFUJI Hideaki 		 * across. Shucks.
1559cfb6eeb4SYOSHIFUJI Hideaki 		 */
1560dea53bb8SDavid Ahern 		tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index,
1561cea97609SDavid Ahern 			       key->key, key->keylen, GFP_ATOMIC);
1562a465419bSEric Dumazet 		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1563cfb6eeb4SYOSHIFUJI Hideaki 	}
1564cfb6eeb4SYOSHIFUJI Hideaki #endif
1565cfb6eeb4SYOSHIFUJI Hideaki 
15660e734419SDavid S. Miller 	if (__inet_inherit_port(sk, newsk) < 0)
15670e734419SDavid S. Miller 		goto put_and_exit;
15685e0724d0SEric Dumazet 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1569c92e8c02SEric Dumazet 	if (likely(*own_req)) {
157049a496c9SEric Dumazet 		tcp_move_syn(newtp, req);
1571c92e8c02SEric Dumazet 		ireq->ireq_opt = NULL;
1572c92e8c02SEric Dumazet 	} else {
1573c92e8c02SEric Dumazet 		newinet->inet_opt = NULL;
1574c92e8c02SEric Dumazet 	}
15751da177e4SLinus Torvalds 	return newsk;
15761da177e4SLinus Torvalds 
15771da177e4SLinus Torvalds exit_overflow:
1578c10d9310SEric Dumazet 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1579093d2823SBalazs Scheidler exit_nonewsk:
1580093d2823SBalazs Scheidler 	dst_release(dst);
15811da177e4SLinus Torvalds exit:
15829caad864SEric Dumazet 	tcp_listendrop(sk);
15831da177e4SLinus Torvalds 	return NULL;
15840e734419SDavid S. Miller put_and_exit:
1585c92e8c02SEric Dumazet 	newinet->inet_opt = NULL;
1586e337e24dSChristoph Paasch 	inet_csk_prepare_forced_close(newsk);
1587e337e24dSChristoph Paasch 	tcp_done(newsk);
15880e734419SDavid S. Miller 	goto exit;
15891da177e4SLinus Torvalds }
15904bc2f18bSEric Dumazet EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
15911da177e4SLinus Torvalds 
/* With CONFIG_SYN_COOKIES, a segment without the SYN flag is passed to
 * cookie_v4_check() and its result is returned.  In every other case @sk
 * is returned unchanged.
 */
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	if (!tcp_hdr(skb)->syn)
		return cookie_v4_check(sk, skb);
#endif
	return sk;
}
16021da177e4SLinus Torvalds 
16039349d600SPetar Penkov u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
16049349d600SPetar Penkov 			 struct tcphdr *th, u32 *cookie)
16059349d600SPetar Penkov {
16069349d600SPetar Penkov 	u16 mss = 0;
16079349d600SPetar Penkov #ifdef CONFIG_SYN_COOKIES
16089349d600SPetar Penkov 	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
16099349d600SPetar Penkov 				    &tcp_request_sock_ipv4_ops, sk, th);
16109349d600SPetar Penkov 	if (mss) {
16119349d600SPetar Penkov 		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
16129349d600SPetar Penkov 		tcp_synq_overflow(sk);
16139349d600SPetar Penkov 	}
16149349d600SPetar Penkov #endif
16159349d600SPetar Penkov 	return mss;
16169349d600SPetar Penkov }
16179349d600SPetar Penkov 
16181da177e4SLinus Torvalds /* The socket must have it's spinlock held when we get
1619e994b2f0SEric Dumazet  * here, unless it is a TCP_LISTEN socket.
16201da177e4SLinus Torvalds  *
16211da177e4SLinus Torvalds  * We have a potential double-lock case here, so even when
16221da177e4SLinus Torvalds  * doing backlog processing we use the BH locking scheme.
16231da177e4SLinus Torvalds  * This is because we cannot sleep with the original spinlock
16241da177e4SLinus Torvalds  * held.
16251da177e4SLinus Torvalds  */
16261da177e4SLinus Torvalds int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
16271da177e4SLinus Torvalds {
1628cfb6eeb4SYOSHIFUJI Hideaki 	struct sock *rsk;
1629cfb6eeb4SYOSHIFUJI Hideaki 
16301da177e4SLinus Torvalds 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
163192101b3bSDavid S. Miller 		struct dst_entry *dst = sk->sk_rx_dst;
1632404e0a8bSEric Dumazet 
1633404e0a8bSEric Dumazet 		sock_rps_save_rxhash(sk, skb);
16343d97379aSEric Dumazet 		sk_mark_napi_id(sk, skb);
1635404e0a8bSEric Dumazet 		if (dst) {
1636505fbcf0SEric Dumazet 			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
163751456b29SIan Morris 			    !dst->ops->check(dst, 0)) {
163892101b3bSDavid S. Miller 				dst_release(dst);
163992101b3bSDavid S. Miller 				sk->sk_rx_dst = NULL;
164092101b3bSDavid S. Miller 			}
164192101b3bSDavid S. Miller 		}
16423d97d88eSYafang Shao 		tcp_rcv_established(sk, skb);
16431da177e4SLinus Torvalds 		return 0;
16441da177e4SLinus Torvalds 	}
16451da177e4SLinus Torvalds 
164612e25e10SEric Dumazet 	if (tcp_checksum_complete(skb))
16471da177e4SLinus Torvalds 		goto csum_err;
16481da177e4SLinus Torvalds 
16491da177e4SLinus Torvalds 	if (sk->sk_state == TCP_LISTEN) {
1650079096f1SEric Dumazet 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1651079096f1SEric Dumazet 
16521da177e4SLinus Torvalds 		if (!nsk)
16531da177e4SLinus Torvalds 			goto discard;
16541da177e4SLinus Torvalds 		if (nsk != sk) {
1655cfb6eeb4SYOSHIFUJI Hideaki 			if (tcp_child_process(sk, nsk, skb)) {
1656cfb6eeb4SYOSHIFUJI Hideaki 				rsk = nsk;
16571da177e4SLinus Torvalds 				goto reset;
1658cfb6eeb4SYOSHIFUJI Hideaki 			}
16591da177e4SLinus Torvalds 			return 0;
16601da177e4SLinus Torvalds 		}
1661ca55158cSEric Dumazet 	} else
1662bdeab991STom Herbert 		sock_rps_save_rxhash(sk, skb);
1663ca55158cSEric Dumazet 
166472ab4a86SEric Dumazet 	if (tcp_rcv_state_process(sk, skb)) {
1665cfb6eeb4SYOSHIFUJI Hideaki 		rsk = sk;
16661da177e4SLinus Torvalds 		goto reset;
1667cfb6eeb4SYOSHIFUJI Hideaki 	}
16681da177e4SLinus Torvalds 	return 0;
16691da177e4SLinus Torvalds 
16701da177e4SLinus Torvalds reset:
1671cfb6eeb4SYOSHIFUJI Hideaki 	tcp_v4_send_reset(rsk, skb);
16721da177e4SLinus Torvalds discard:
16731da177e4SLinus Torvalds 	kfree_skb(skb);
16741da177e4SLinus Torvalds 	/* Be careful here. If this function gets more complicated and
16751da177e4SLinus Torvalds 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
16761da177e4SLinus Torvalds 	 * might be destroyed here. This current version compiles correctly,
16771da177e4SLinus Torvalds 	 * but you have been warned.
16781da177e4SLinus Torvalds 	 */
16791da177e4SLinus Torvalds 	return 0;
16801da177e4SLinus Torvalds 
16811da177e4SLinus Torvalds csum_err:
1682c10d9310SEric Dumazet 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1683c10d9310SEric Dumazet 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
16841da177e4SLinus Torvalds 	goto discard;
16851da177e4SLinus Torvalds }
16864bc2f18bSEric Dumazet EXPORT_SYMBOL(tcp_v4_do_rcv);
16871da177e4SLinus Torvalds 
16887487449cSPaolo Abeni int tcp_v4_early_demux(struct sk_buff *skb)
168941063e9dSDavid S. Miller {
169041063e9dSDavid S. Miller 	const struct iphdr *iph;
169141063e9dSDavid S. Miller 	const struct tcphdr *th;
169241063e9dSDavid S. Miller 	struct sock *sk;
169341063e9dSDavid S. Miller 
169441063e9dSDavid S. Miller 	if (skb->pkt_type != PACKET_HOST)
16957487449cSPaolo Abeni 		return 0;
169641063e9dSDavid S. Miller 
169745f00f99SEric Dumazet 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
16987487449cSPaolo Abeni 		return 0;
169941063e9dSDavid S. Miller 
170041063e9dSDavid S. Miller 	iph = ip_hdr(skb);
170145f00f99SEric Dumazet 	th = tcp_hdr(skb);
170241063e9dSDavid S. Miller 
170341063e9dSDavid S. Miller 	if (th->doff < sizeof(struct tcphdr) / 4)
17047487449cSPaolo Abeni 		return 0;
170541063e9dSDavid S. Miller 
170645f00f99SEric Dumazet 	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
170741063e9dSDavid S. Miller 				       iph->saddr, th->source,
17087011d085SVijay Subramanian 				       iph->daddr, ntohs(th->dest),
17093fa6f616SDavid Ahern 				       skb->skb_iif, inet_sdif(skb));
171041063e9dSDavid S. Miller 	if (sk) {
171141063e9dSDavid S. Miller 		skb->sk = sk;
171241063e9dSDavid S. Miller 		skb->destructor = sock_edemux;
1713f7e4eb03SEric Dumazet 		if (sk_fullsock(sk)) {
1714d0c294c5SMichal Kubeček 			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1715505fbcf0SEric Dumazet 
171641063e9dSDavid S. Miller 			if (dst)
171741063e9dSDavid S. Miller 				dst = dst_check(dst, 0);
171892101b3bSDavid S. Miller 			if (dst &&
1719505fbcf0SEric Dumazet 			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
172041063e9dSDavid S. Miller 				skb_dst_set_noref(skb, dst);
172141063e9dSDavid S. Miller 		}
172241063e9dSDavid S. Miller 	}
17237487449cSPaolo Abeni 	return 0;
172441063e9dSDavid S. Miller }
172541063e9dSDavid S. Miller 
1726c9c33212SEric Dumazet bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1727c9c33212SEric Dumazet {
17288265792bSEric Dumazet 	u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf);
17294f693b55SEric Dumazet 	struct skb_shared_info *shinfo;
17304f693b55SEric Dumazet 	const struct tcphdr *th;
17314f693b55SEric Dumazet 	struct tcphdr *thtail;
17324f693b55SEric Dumazet 	struct sk_buff *tail;
17334f693b55SEric Dumazet 	unsigned int hdrlen;
17344f693b55SEric Dumazet 	bool fragstolen;
17354f693b55SEric Dumazet 	u32 gso_segs;
17364f693b55SEric Dumazet 	int delta;
1737c9c33212SEric Dumazet 
1738c9c33212SEric Dumazet 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1739c9c33212SEric Dumazet 	 * we can fix skb->truesize to its real value to avoid future drops.
1740c9c33212SEric Dumazet 	 * This is valid because skb is not yet charged to the socket.
1741c9c33212SEric Dumazet 	 * It has been noticed pure SACK packets were sometimes dropped
1742c9c33212SEric Dumazet 	 * (if cooked by drivers without copybreak feature).
1743c9c33212SEric Dumazet 	 */
174460b1af33SEric Dumazet 	skb_condense(skb);
1745c9c33212SEric Dumazet 
1746ade9628eSEric Dumazet 	skb_dst_drop(skb);
1747ade9628eSEric Dumazet 
17484f693b55SEric Dumazet 	if (unlikely(tcp_checksum_complete(skb))) {
17494f693b55SEric Dumazet 		bh_unlock_sock(sk);
17504f693b55SEric Dumazet 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
17514f693b55SEric Dumazet 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
17524f693b55SEric Dumazet 		return true;
17534f693b55SEric Dumazet 	}
17544f693b55SEric Dumazet 
17554f693b55SEric Dumazet 	/* Attempt coalescing to last skb in backlog, even if we are
17564f693b55SEric Dumazet 	 * above the limits.
17574f693b55SEric Dumazet 	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
17584f693b55SEric Dumazet 	 */
17594f693b55SEric Dumazet 	th = (const struct tcphdr *)skb->data;
17604f693b55SEric Dumazet 	hdrlen = th->doff * 4;
17614f693b55SEric Dumazet 	shinfo = skb_shinfo(skb);
17624f693b55SEric Dumazet 
17634f693b55SEric Dumazet 	if (!shinfo->gso_size)
17644f693b55SEric Dumazet 		shinfo->gso_size = skb->len - hdrlen;
17654f693b55SEric Dumazet 
17664f693b55SEric Dumazet 	if (!shinfo->gso_segs)
17674f693b55SEric Dumazet 		shinfo->gso_segs = 1;
17684f693b55SEric Dumazet 
17694f693b55SEric Dumazet 	tail = sk->sk_backlog.tail;
17704f693b55SEric Dumazet 	if (!tail)
17714f693b55SEric Dumazet 		goto no_coalesce;
17724f693b55SEric Dumazet 	thtail = (struct tcphdr *)tail->data;
17734f693b55SEric Dumazet 
17744f693b55SEric Dumazet 	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
17754f693b55SEric Dumazet 	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
17764f693b55SEric Dumazet 	    ((TCP_SKB_CB(tail)->tcp_flags |
1777ca2fe295SEric Dumazet 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1778ca2fe295SEric Dumazet 	    !((TCP_SKB_CB(tail)->tcp_flags &
1779ca2fe295SEric Dumazet 	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
17804f693b55SEric Dumazet 	    ((TCP_SKB_CB(tail)->tcp_flags ^
17814f693b55SEric Dumazet 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
17824f693b55SEric Dumazet #ifdef CONFIG_TLS_DEVICE
17834f693b55SEric Dumazet 	    tail->decrypted != skb->decrypted ||
17844f693b55SEric Dumazet #endif
17854f693b55SEric Dumazet 	    thtail->doff != th->doff ||
17864f693b55SEric Dumazet 	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
17874f693b55SEric Dumazet 		goto no_coalesce;
17884f693b55SEric Dumazet 
17894f693b55SEric Dumazet 	__skb_pull(skb, hdrlen);
17904f693b55SEric Dumazet 	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
17914f693b55SEric Dumazet 		thtail->window = th->window;
17924f693b55SEric Dumazet 
17934f693b55SEric Dumazet 		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
17944f693b55SEric Dumazet 
17954f693b55SEric Dumazet 		if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))
17964f693b55SEric Dumazet 			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
17974f693b55SEric Dumazet 
1798ca2fe295SEric Dumazet 		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1799ca2fe295SEric Dumazet 		 * thtail->fin, so that the fast path in tcp_rcv_established()
1800ca2fe295SEric Dumazet 		 * is not entered if we append a packet with a FIN.
1801ca2fe295SEric Dumazet 		 * SYN, RST, URG are not present.
1802ca2fe295SEric Dumazet 		 * ACK is set on both packets.
1803ca2fe295SEric Dumazet 		 * PSH : we do not really care in TCP stack,
1804ca2fe295SEric Dumazet 		 *       at least for 'GRO' packets.
1805ca2fe295SEric Dumazet 		 */
1806ca2fe295SEric Dumazet 		thtail->fin |= th->fin;
18074f693b55SEric Dumazet 		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
18084f693b55SEric Dumazet 
18094f693b55SEric Dumazet 		if (TCP_SKB_CB(skb)->has_rxtstamp) {
18104f693b55SEric Dumazet 			TCP_SKB_CB(tail)->has_rxtstamp = true;
18114f693b55SEric Dumazet 			tail->tstamp = skb->tstamp;
18124f693b55SEric Dumazet 			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
18134f693b55SEric Dumazet 		}
18144f693b55SEric Dumazet 
18154f693b55SEric Dumazet 		/* Not as strict as GRO. We only need to carry mss max value */
18164f693b55SEric Dumazet 		skb_shinfo(tail)->gso_size = max(shinfo->gso_size,
18174f693b55SEric Dumazet 						 skb_shinfo(tail)->gso_size);
18184f693b55SEric Dumazet 
18194f693b55SEric Dumazet 		gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs;
18204f693b55SEric Dumazet 		skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
18214f693b55SEric Dumazet 
18224f693b55SEric Dumazet 		sk->sk_backlog.len += delta;
18234f693b55SEric Dumazet 		__NET_INC_STATS(sock_net(sk),
18244f693b55SEric Dumazet 				LINUX_MIB_TCPBACKLOGCOALESCE);
18254f693b55SEric Dumazet 		kfree_skb_partial(skb, fragstolen);
18264f693b55SEric Dumazet 		return false;
18274f693b55SEric Dumazet 	}
18284f693b55SEric Dumazet 	__skb_push(skb, hdrlen);
18294f693b55SEric Dumazet 
18304f693b55SEric Dumazet no_coalesce:
18314f693b55SEric Dumazet 	/* Only socket owner can try to collapse/prune rx queues
18324f693b55SEric Dumazet 	 * to reduce memory overhead, so add a little headroom here.
18334f693b55SEric Dumazet 	 * Few sockets backlog are possibly concurrently non empty.
18344f693b55SEric Dumazet 	 */
18354f693b55SEric Dumazet 	limit += 64*1024;
18364f693b55SEric Dumazet 
1837c9c33212SEric Dumazet 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
1838c9c33212SEric Dumazet 		bh_unlock_sock(sk);
1839c9c33212SEric Dumazet 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1840c9c33212SEric Dumazet 		return true;
1841c9c33212SEric Dumazet 	}
1842c9c33212SEric Dumazet 	return false;
1843c9c33212SEric Dumazet }
1844c9c33212SEric Dumazet EXPORT_SYMBOL(tcp_add_backlog);
1845c9c33212SEric Dumazet 
1846ac6e7800SEric Dumazet int tcp_filter(struct sock *sk, struct sk_buff *skb)
1847ac6e7800SEric Dumazet {
1848ac6e7800SEric Dumazet 	struct tcphdr *th = (struct tcphdr *)skb->data;
1849ac6e7800SEric Dumazet 
1850f2feaefdSChristoph Paasch 	return sk_filter_trim_cap(sk, skb, th->doff * 4);
1851ac6e7800SEric Dumazet }
1852ac6e7800SEric Dumazet EXPORT_SYMBOL(tcp_filter);
1853ac6e7800SEric Dumazet 
1854eeea10b8SEric Dumazet static void tcp_v4_restore_cb(struct sk_buff *skb)
1855eeea10b8SEric Dumazet {
1856eeea10b8SEric Dumazet 	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1857eeea10b8SEric Dumazet 		sizeof(struct inet_skb_parm));
1858eeea10b8SEric Dumazet }
1859eeea10b8SEric Dumazet 
1860eeea10b8SEric Dumazet static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1861eeea10b8SEric Dumazet 			   const struct tcphdr *th)
1862eeea10b8SEric Dumazet {
1863eeea10b8SEric Dumazet 	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
1864eeea10b8SEric Dumazet 	 * barrier() makes sure compiler wont play fool^Waliasing games.
1865eeea10b8SEric Dumazet 	 */
1866eeea10b8SEric Dumazet 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1867eeea10b8SEric Dumazet 		sizeof(struct inet_skb_parm));
1868eeea10b8SEric Dumazet 	barrier();
1869eeea10b8SEric Dumazet 
1870eeea10b8SEric Dumazet 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1871eeea10b8SEric Dumazet 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1872eeea10b8SEric Dumazet 				    skb->len - th->doff * 4);
1873eeea10b8SEric Dumazet 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1874eeea10b8SEric Dumazet 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1875eeea10b8SEric Dumazet 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1876eeea10b8SEric Dumazet 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1877eeea10b8SEric Dumazet 	TCP_SKB_CB(skb)->sacked	 = 0;
1878eeea10b8SEric Dumazet 	TCP_SKB_CB(skb)->has_rxtstamp =
1879eeea10b8SEric Dumazet 			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1880eeea10b8SEric Dumazet }
1881eeea10b8SEric Dumazet 
18821da177e4SLinus Torvalds /*
18831da177e4SLinus Torvalds  *	From tcp_input.c
18841da177e4SLinus Torvalds  */
18851da177e4SLinus Torvalds 
/*
 * Receive entry point for an IPv4 TCP segment.
 *
 * Validates the TCP header and checksum, looks up the socket the segment
 * belongs to and then, depending on the socket state, either
 *  - hands the segment to tcp_v4_do_rcv() (listener, or socket not owned
 *    by user context),
 *  - queues it with tcp_add_backlog() (socket owned by user context),
 *  - runs the request-socket path (TCP_NEW_SYN_RECV), or
 *  - runs the time-wait path (TCP_TIME_WAIT).
 *
 * Returns the value of tcp_v4_do_rcv() when the segment was processed in
 * place and 0 on every other path. On the discard paths the skb is freed
 * here with kfree_skb().
 */
18861da177e4SLinus Torvalds int tcp_v4_rcv(struct sk_buff *skb)
18871da177e4SLinus Torvalds {
18883b24d854SEric Dumazet 	struct net *net = dev_net(skb->dev);
18898b27dae5SEric Dumazet 	struct sk_buff *skb_to_free;
18903fa6f616SDavid Ahern 	int sdif = inet_sdif(skb);
1891534322caSDavid Ahern 	int dif = inet_iif(skb);
1892eddc9ec5SArnaldo Carvalho de Melo 	const struct iphdr *iph;
1893cf533ea5SEric Dumazet 	const struct tcphdr *th;
18943b24d854SEric Dumazet 	bool refcounted;
18951da177e4SLinus Torvalds 	struct sock *sk;
18961da177e4SLinus Torvalds 	int ret;
18971da177e4SLinus Torvalds 
18981da177e4SLinus Torvalds 	if (skb->pkt_type != PACKET_HOST)
18991da177e4SLinus Torvalds 		goto discard_it;
19001da177e4SLinus Torvalds 
19011da177e4SLinus Torvalds 	/* Count it even if it's bad */
190290bbcc60SEric Dumazet 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
19031da177e4SLinus Torvalds 
19041da177e4SLinus Torvalds 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
19051da177e4SLinus Torvalds 		goto discard_it;
19061da177e4SLinus Torvalds 
1907ea1627c2SEric Dumazet 	th = (const struct tcphdr *)skb->data;
19081da177e4SLinus Torvalds 
	/* doff counts 32-bit words; reject headers shorter than the base header. */
1909ea1627c2SEric Dumazet 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
19101da177e4SLinus Torvalds 		goto bad_packet;
19111da177e4SLinus Torvalds 	if (!pskb_may_pull(skb, th->doff * 4))
19121da177e4SLinus Torvalds 		goto discard_it;
19131da177e4SLinus Torvalds 
19141da177e4SLinus Torvalds 	/* An explanation is required here, I think.
19151da177e4SLinus Torvalds 	 * Packet length and doff are validated by header prediction,
1916caa20d9aSStephen Hemminger 	 * provided case of th->doff==0 is eliminated.
19171da177e4SLinus Torvalds 	 * So, we defer the checks. */
1918ed70fcfcSTom Herbert 
1919ed70fcfcSTom Herbert 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
19206a5dc9e5SEric Dumazet 		goto csum_error;
19211da177e4SLinus Torvalds 
	/* Re-read the header pointers after the pull/checksum calls above. */
1922ea1627c2SEric Dumazet 	th = (const struct tcphdr *)skb->data;
1923eddc9ec5SArnaldo Carvalho de Melo 	iph = ip_hdr(skb);
19244bdc3d66SEric Dumazet lookup:
1925a583636aSCraig Gallek 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
19263fa6f616SDavid Ahern 			       th->dest, sdif, &refcounted);
19271da177e4SLinus Torvalds 	if (!sk)
19281da177e4SLinus Torvalds 		goto no_tcp_socket;
19291da177e4SLinus Torvalds 
1930bb134d5dSEric Dumazet process:
1931bb134d5dSEric Dumazet 	if (sk->sk_state == TCP_TIME_WAIT)
1932bb134d5dSEric Dumazet 		goto do_time_wait;
1933bb134d5dSEric Dumazet 
	/* The lookup returned a request socket: continue with its listener. */
1934079096f1SEric Dumazet 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
1935079096f1SEric Dumazet 		struct request_sock *req = inet_reqsk(sk);
1936e0f9759fSEric Dumazet 		bool req_stolen = false;
19377716682cSEric Dumazet 		struct sock *nsk;
1938079096f1SEric Dumazet 
1939079096f1SEric Dumazet 		sk = req->rsk_listener;
1940534322caSDavid Ahern 		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
1941e65c332dSEric Dumazet 			sk_drops_add(sk, skb);
194272923555SEric Dumazet 			reqsk_put(req);
194372923555SEric Dumazet 			goto discard_it;
194472923555SEric Dumazet 		}
19454fd44a98SFrank van der Linden 		if (tcp_checksum_complete(skb)) {
19464fd44a98SFrank van der Linden 			reqsk_put(req);
19474fd44a98SFrank van der Linden 			goto csum_error;
19484fd44a98SFrank van der Linden 		}
		/* Listener went away: drop the request and redo the lookup. */
19497716682cSEric Dumazet 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
1950f03f2e15SEric Dumazet 			inet_csk_reqsk_queue_drop_and_put(sk, req);
19514bdc3d66SEric Dumazet 			goto lookup;
19524bdc3d66SEric Dumazet 		}
19533b24d854SEric Dumazet 		/* We own a reference on the listener, increase it again
19543b24d854SEric Dumazet 		 * as we might lose it too soon.
19553b24d854SEric Dumazet 		 */
19567716682cSEric Dumazet 		sock_hold(sk);
19573b24d854SEric Dumazet 		refcounted = true;
19581f3b359fSEric Dumazet 		nsk = NULL;
1959eeea10b8SEric Dumazet 		if (!tcp_filter(sk, skb)) {
1960eeea10b8SEric Dumazet 			th = (const struct tcphdr *)skb->data;
1961eeea10b8SEric Dumazet 			iph = ip_hdr(skb);
1962eeea10b8SEric Dumazet 			tcp_v4_fill_cb(skb, iph, th);
1963e0f9759fSEric Dumazet 			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
1964eeea10b8SEric Dumazet 		}
1965079096f1SEric Dumazet 		if (!nsk) {
1966079096f1SEric Dumazet 			reqsk_put(req);
1967e0f9759fSEric Dumazet 			if (req_stolen) {
1968e0f9759fSEric Dumazet 				/* Another cpu got exclusive access to req
1969e0f9759fSEric Dumazet 				 * and created a full blown socket.
1970e0f9759fSEric Dumazet 				 * Try to feed this packet to this socket
1971e0f9759fSEric Dumazet 				 * instead of discarding it.
1972e0f9759fSEric Dumazet 				 */
1973e0f9759fSEric Dumazet 				tcp_v4_restore_cb(skb);
1974e0f9759fSEric Dumazet 				sock_put(sk);
1975e0f9759fSEric Dumazet 				goto lookup;
1976e0f9759fSEric Dumazet 			}
19777716682cSEric Dumazet 			goto discard_and_relse;
1978079096f1SEric Dumazet 		}
		/* nsk == sk: tcp_check_req() handed back the listener itself;
		 * undo the cb fill and continue with the listener below.
		 */
1979079096f1SEric Dumazet 		if (nsk == sk) {
1980079096f1SEric Dumazet 			reqsk_put(req);
1981eeea10b8SEric Dumazet 			tcp_v4_restore_cb(skb);
1982079096f1SEric Dumazet 		} else if (tcp_child_process(sk, nsk, skb)) {
1983079096f1SEric Dumazet 			tcp_v4_send_reset(nsk, skb);
19847716682cSEric Dumazet 			goto discard_and_relse;
1985079096f1SEric Dumazet 		} else {
19867716682cSEric Dumazet 			sock_put(sk);
1987079096f1SEric Dumazet 			return 0;
1988079096f1SEric Dumazet 		}
1989079096f1SEric Dumazet 	}
19906cce09f8SEric Dumazet 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
199102a1d6e7SEric Dumazet 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1992d218d111SStephen Hemminger 		goto discard_and_relse;
19936cce09f8SEric Dumazet 	}
1994d218d111SStephen Hemminger 
19951da177e4SLinus Torvalds 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
19961da177e4SLinus Torvalds 		goto discard_and_relse;
19979ea88a15SDmitry Popov 
1998534322caSDavid Ahern 	if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
19999ea88a15SDmitry Popov 		goto discard_and_relse;
20009ea88a15SDmitry Popov 
2001895b5c9fSFlorian Westphal 	nf_reset_ct(skb);
20021da177e4SLinus Torvalds 
2003ac6e7800SEric Dumazet 	if (tcp_filter(sk, skb))
20041da177e4SLinus Torvalds 		goto discard_and_relse;
	/* Header pointers are re-read after tcp_filter() before filling the cb. */
2005ac6e7800SEric Dumazet 	th = (const struct tcphdr *)skb->data;
2006ac6e7800SEric Dumazet 	iph = ip_hdr(skb);
2007eeea10b8SEric Dumazet 	tcp_v4_fill_cb(skb, iph, th);
20081da177e4SLinus Torvalds 
20091da177e4SLinus Torvalds 	skb->dev = NULL;
20101da177e4SLinus Torvalds 
	/* Listeners are processed without taking the socket lock here. */
2011e994b2f0SEric Dumazet 	if (sk->sk_state == TCP_LISTEN) {
2012e994b2f0SEric Dumazet 		ret = tcp_v4_do_rcv(sk, skb);
2013e994b2f0SEric Dumazet 		goto put_and_return;
2014e994b2f0SEric Dumazet 	}
2015e994b2f0SEric Dumazet 
2016e994b2f0SEric Dumazet 	sk_incoming_cpu_update(sk);
2017e994b2f0SEric Dumazet 
2018c6366184SIngo Molnar 	bh_lock_sock_nested(sk);
2019a44d6eacSMartin KaFai Lau 	tcp_segs_in(tcp_sk(sk), skb);
20201da177e4SLinus Torvalds 	ret = 0;
20211da177e4SLinus Torvalds 	if (!sock_owned_by_user(sk)) {
		/* Detach the cached rx skb now; it is freed after unlocking. */
20228b27dae5SEric Dumazet 		skb_to_free = sk->sk_rx_skb_cache;
20238b27dae5SEric Dumazet 		sk->sk_rx_skb_cache = NULL;
20241da177e4SLinus Torvalds 		ret = tcp_v4_do_rcv(sk, skb);
20258b27dae5SEric Dumazet 	} else {
		/* Socket owned by user context: queue on the backlog. When
		 * tcp_add_backlog() returns true it has already released
		 * the socket lock, so jump straight to the drop path.
		 */
20268b27dae5SEric Dumazet 		if (tcp_add_backlog(sk, skb))
20276b03a53aSZhu Yi 			goto discard_and_relse;
20288b27dae5SEric Dumazet 		skb_to_free = NULL;
20296b03a53aSZhu Yi 	}
20301da177e4SLinus Torvalds 	bh_unlock_sock(sk);
20318b27dae5SEric Dumazet 	if (skb_to_free)
20328b27dae5SEric Dumazet 		__kfree_skb(skb_to_free);
20331da177e4SLinus Torvalds 
2034e994b2f0SEric Dumazet put_and_return:
20353b24d854SEric Dumazet 	if (refcounted)
20361da177e4SLinus Torvalds 		sock_put(sk);
20371da177e4SLinus Torvalds 
20381da177e4SLinus Torvalds 	return ret;
20391da177e4SLinus Torvalds 
20401da177e4SLinus Torvalds no_tcp_socket:
20411da177e4SLinus Torvalds 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
20421da177e4SLinus Torvalds 		goto discard_it;
20431da177e4SLinus Torvalds 
2044eeea10b8SEric Dumazet 	tcp_v4_fill_cb(skb, iph, th);
2045eeea10b8SEric Dumazet 
	/* No socket: count checksum/header errors, otherwise answer with a
	 * reset. The csum_error and bad_packet labels are entered from the
	 * checks earlier in this function.
	 */
204612e25e10SEric Dumazet 	if (tcp_checksum_complete(skb)) {
20476a5dc9e5SEric Dumazet csum_error:
204890bbcc60SEric Dumazet 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
20491da177e4SLinus Torvalds bad_packet:
205090bbcc60SEric Dumazet 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
20511da177e4SLinus Torvalds 	} else {
2052cfb6eeb4SYOSHIFUJI Hideaki 		tcp_v4_send_reset(NULL, skb);
20531da177e4SLinus Torvalds 	}
20541da177e4SLinus Torvalds 
20551da177e4SLinus Torvalds discard_it:
20561da177e4SLinus Torvalds 	/* Discard frame. */
20571da177e4SLinus Torvalds 	kfree_skb(skb);
20581da177e4SLinus Torvalds 	return 0;
20591da177e4SLinus Torvalds 
20601da177e4SLinus Torvalds discard_and_relse:
2061532182cdSEric Dumazet 	sk_drops_add(sk, skb);
20623b24d854SEric Dumazet 	if (refcounted)
20631da177e4SLinus Torvalds 		sock_put(sk);
20641da177e4SLinus Torvalds 	goto discard_it;
20651da177e4SLinus Torvalds 
20661da177e4SLinus Torvalds do_time_wait:
20671da177e4SLinus Torvalds 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
20689469c7b4SYOSHIFUJI Hideaki 		inet_twsk_put(inet_twsk(sk));
20691da177e4SLinus Torvalds 		goto discard_it;
20701da177e4SLinus Torvalds 	}
20711da177e4SLinus Torvalds 
2072eeea10b8SEric Dumazet 	tcp_v4_fill_cb(skb, iph, th);
2073eeea10b8SEric Dumazet 
20746a5dc9e5SEric Dumazet 	if (tcp_checksum_complete(skb)) {
20756a5dc9e5SEric Dumazet 		inet_twsk_put(inet_twsk(sk));
20766a5dc9e5SEric Dumazet 		goto csum_error;
20771da177e4SLinus Torvalds 	}
20789469c7b4SYOSHIFUJI Hideaki 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
20791da177e4SLinus Torvalds 	case TCP_TW_SYN: {
		/* Look for a listener that can take this SYN over from the
		 * time-wait socket.
		 */
2080c346dca1SYOSHIFUJI Hideaki 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2081a583636aSCraig Gallek 							&tcp_hashinfo, skb,
2082a583636aSCraig Gallek 							__tcp_hdrlen(th),
2083da5e3630STom Herbert 							iph->saddr, th->source,
2084eddc9ec5SArnaldo Carvalho de Melo 							iph->daddr, th->dest,
20853fa6f616SDavid Ahern 							inet_iif(skb),
20863fa6f616SDavid Ahern 							sdif);
20871da177e4SLinus Torvalds 		if (sk2) {
2088dbe7faa4SEric Dumazet 			inet_twsk_deschedule_put(inet_twsk(sk));
20891da177e4SLinus Torvalds 			sk = sk2;
2090eeea10b8SEric Dumazet 			tcp_v4_restore_cb(skb);
			/* Exit paths must not sock_put() this listener. */
20913b24d854SEric Dumazet 			refcounted = false;
20921da177e4SLinus Torvalds 			goto process;
20931da177e4SLinus Torvalds 		}
20941da177e4SLinus Torvalds 	}
2095fcfd6dfaSGustavo A. R. Silva 		/* to ACK */
2096a8eceea8SJoe Perches 		fallthrough;
20971da177e4SLinus Torvalds 	case TCP_TW_ACK:
20981da177e4SLinus Torvalds 		tcp_v4_timewait_ack(sk, skb);
20991da177e4SLinus Torvalds 		break;
21001da177e4SLinus Torvalds 	case TCP_TW_RST:
2101271c3b9bSFlorian Westphal 		tcp_v4_send_reset(sk, skb);
2102271c3b9bSFlorian Westphal 		inet_twsk_deschedule_put(inet_twsk(sk));
2103271c3b9bSFlorian Westphal 		goto discard_it;
21041da177e4SLinus Torvalds 	case TCP_TW_SUCCESS:;
21051da177e4SLinus Torvalds 	}
21061da177e4SLinus Torvalds 	goto discard_it;
21071da177e4SLinus Torvalds }
21081da177e4SLinus Torvalds 
/* Time-wait socket operations for TCP: object size plus the uniqueness
 * and destructor callbacks.
 */
2109ccb7c410SDavid S. Miller static struct timewait_sock_ops tcp_timewait_sock_ops = {
2110ccb7c410SDavid S. Miller 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
2111ccb7c410SDavid S. Miller 	.twsk_unique	= tcp_twsk_unique,
2112ccb7c410SDavid S. Miller 	.twsk_destructor= tcp_twsk_destructor,
2113ccb7c410SDavid S. Miller };
21141da177e4SLinus Torvalds 
211563d02d15SEric Dumazet void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
21165d299f3dSEric Dumazet {
21175d299f3dSEric Dumazet 	struct dst_entry *dst = skb_dst(skb);
21185d299f3dSEric Dumazet 
21195037e9efSEric Dumazet 	if (dst && dst_hold_safe(dst)) {
21205d299f3dSEric Dumazet 		sk->sk_rx_dst = dst;
21215d299f3dSEric Dumazet 		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
21225d299f3dSEric Dumazet 	}
2123ca777effSEric Dumazet }
212463d02d15SEric Dumazet EXPORT_SYMBOL(inet_sk_rx_dst_set);
21255d299f3dSEric Dumazet 
/* Address-family specific connection-socket operations for TCP over IPv4.
 * Installed as icsk_af_ops by tcp_v4_init_sock().
 */
21263b401a81SStephen Hemminger const struct inet_connection_sock_af_ops ipv4_specific = {
21271da177e4SLinus Torvalds 	.queue_xmit	   = ip_queue_xmit,
21281da177e4SLinus Torvalds 	.send_check	   = tcp_v4_send_check,
212932519f11SArnaldo Carvalho de Melo 	.rebuild_header	   = inet_sk_rebuild_header,
21305d299f3dSEric Dumazet 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
21311da177e4SLinus Torvalds 	.conn_request	   = tcp_v4_conn_request,
21321da177e4SLinus Torvalds 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
21331da177e4SLinus Torvalds 	.net_header_len	   = sizeof(struct iphdr),
21341da177e4SLinus Torvalds 	.setsockopt	   = ip_setsockopt,
21351da177e4SLinus Torvalds 	.getsockopt	   = ip_getsockopt,
2136543d9cfeSArnaldo Carvalho de Melo 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
2137543d9cfeSArnaldo Carvalho de Melo 	.sockaddr_len	   = sizeof(struct sockaddr_in),
21384fab9071SNeal Cardwell 	.mtu_reduced	   = tcp_v4_mtu_reduced,
21391da177e4SLinus Torvalds };
21404bc2f18bSEric Dumazet EXPORT_SYMBOL(ipv4_specific);
21411da177e4SLinus Torvalds 
2142cfb6eeb4SYOSHIFUJI Hideaki #ifdef CONFIG_TCP_MD5SIG
/* IPv4 TCP MD5 signature callbacks (key lookup, hash calculation, key
 * parsing). Installed as af_specific by tcp_v4_init_sock().
 */
2143b2e4b3deSStephen Hemminger static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2144cfb6eeb4SYOSHIFUJI Hideaki 	.md5_lookup		= tcp_v4_md5_lookup,
214549a72dfbSAdam Langley 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
2146cfb6eeb4SYOSHIFUJI Hideaki 	.md5_parse		= tcp_v4_parse_md5_keys,
2147cfb6eeb4SYOSHIFUJI Hideaki };
2148b6332e6cSAndrew Morton #endif
2149cfb6eeb4SYOSHIFUJI Hideaki 
21501da177e4SLinus Torvalds /* NOTE: A lot of things set to zero explicitly by call to
21511da177e4SLinus Torvalds  *       sk_alloc() so need not be done here.
21521da177e4SLinus Torvalds  */
21531da177e4SLinus Torvalds static int tcp_v4_init_sock(struct sock *sk)
21541da177e4SLinus Torvalds {
21556687e988SArnaldo Carvalho de Melo 	struct inet_connection_sock *icsk = inet_csk(sk);
21561da177e4SLinus Torvalds 
2157900f65d3SNeal Cardwell 	tcp_init_sock(sk);
21581da177e4SLinus Torvalds 
21598292a17aSArnaldo Carvalho de Melo 	icsk->icsk_af_ops = &ipv4_specific;
2160900f65d3SNeal Cardwell 
2161cfb6eeb4SYOSHIFUJI Hideaki #ifdef CONFIG_TCP_MD5SIG
2162ac807fa8SDavid S. Miller 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2163cfb6eeb4SYOSHIFUJI Hideaki #endif
21641da177e4SLinus Torvalds 
21651da177e4SLinus Torvalds 	return 0;
21661da177e4SLinus Torvalds }
21671da177e4SLinus Torvalds 
/* Tear down the TCP state hanging off @sk: timers, congestion control
 * and ULP state, the write and out-of-order queues, MD5 keys (when
 * configured), the bind bucket and the Fast Open bookkeeping. Finally
 * drops the allocated-sockets count.
 */
21687d06b2e0SBrian Haley void tcp_v4_destroy_sock(struct sock *sk)
21691da177e4SLinus Torvalds {
21701da177e4SLinus Torvalds 	struct tcp_sock *tp = tcp_sk(sk);
21711da177e4SLinus Torvalds 
2172e1a4aa50SSong Liu 	trace_tcp_destroy_sock(sk);
2173e1a4aa50SSong Liu 
21741da177e4SLinus Torvalds 	tcp_clear_xmit_timers(sk);
21751da177e4SLinus Torvalds 
21766687e988SArnaldo Carvalho de Melo 	tcp_cleanup_congestion_control(sk);
2177317a76f9SStephen Hemminger 
2178734942ccSDave Watson 	tcp_cleanup_ulp(sk);
2179734942ccSDave Watson 
21801da177e4SLinus Torvalds 	/* Clean up the write buffer. */
2181fe067e8aSDavid S. Miller 	tcp_write_queue_purge(sk);
21821da177e4SLinus Torvalds 
2183cf1ef3f0SWei Wang 	/* Check if we want to disable active TFO */
2184cf1ef3f0SWei Wang 	tcp_fastopen_active_disable_ofo_check(sk);
2185cf1ef3f0SWei Wang 
21861da177e4SLinus Torvalds 	/* Cleans up our, hopefully empty, out_of_order_queue. */
21879f5afeaeSYaogong Wang 	skb_rbtree_purge(&tp->out_of_order_queue);
21881da177e4SLinus Torvalds 
2189cfb6eeb4SYOSHIFUJI Hideaki #ifdef CONFIG_TCP_MD5SIG
2190cfb6eeb4SYOSHIFUJI Hideaki 	/* Clean up the MD5 key list, if any */
2191cfb6eeb4SYOSHIFUJI Hideaki 	if (tp->md5sig_info) {
2192a915da9bSEric Dumazet 		tcp_clear_md5_list(sk);
		/* The info block itself is freed through kfree_rcu(). */
2193fb7df5e4SMat Martineau 		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2194cfb6eeb4SYOSHIFUJI Hideaki 		tp->md5sig_info = NULL;
2195cfb6eeb4SYOSHIFUJI Hideaki 	}
2196cfb6eeb4SYOSHIFUJI Hideaki #endif
2197cfb6eeb4SYOSHIFUJI Hideaki 
21981da177e4SLinus Torvalds 	/* Clean up a referenced TCP bind bucket. */
2199463c84b9SArnaldo Carvalho de Melo 	if (inet_csk(sk)->icsk_bind_hash)
2200ab1e0a13SArnaldo Carvalho de Melo 		inet_put_port(sk);
22011da177e4SLinus Torvalds 
	/* A Fast Open request sock must no longer be attached at this point. */
2202d983ea6fSEric Dumazet 	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2203435cf559SWilliam Allen Simpson 
2204cf60af03SYuchung Cheng 	/* If socket is aborted during connect operation */
2205cf60af03SYuchung Cheng 	tcp_free_fastopen_req(tp);
22061fba70e5SYuchung Cheng 	tcp_fastopen_destroy_cipher(sk);
2207cd8ae852SEric Dumazet 	tcp_saved_syn_free(tp);
2208cf60af03SYuchung Cheng 
2209180d8cd9SGlauber Costa 	sk_sockets_allocated_dec(sk);
22101da177e4SLinus Torvalds }
22111da177e4SLinus Torvalds EXPORT_SYMBOL(tcp_v4_destroy_sock);
22121da177e4SLinus Torvalds 
22131da177e4SLinus Torvalds #ifdef CONFIG_PROC_FS
22141da177e4SLinus Torvalds /* Proc filesystem TCP sock list dumping. */
22151da177e4SLinus Torvalds 
2216a8b690f9STom Herbert /*
2217a8b690f9STom Herbert  * Get the next listener socket following cur.  If cur is NULL, get the
2218a8b690f9STom Herbert  * first socket starting from the bucket given in st->bucket; when
2219a8b690f9STom Herbert  * st->bucket is zero the very first socket in the hash table is returned.
 *
 * Only sockets in the seq_file's network namespace whose family matches
 * afinfo->family (or any family for AF_UNSPEC) are returned.
 *
 * Locking: when a socket is returned, the lock of the listening bucket
 * st->bucket is left held; it is released here only once that bucket has
 * been exhausted. With a non-NULL cur no lock is taken on entry, so the
 * bucket lock is presumably still held from the call that returned cur.
 * Returns NULL, with no bucket lock held, after the last bucket.
2220a8b690f9STom Herbert  */
22211da177e4SLinus Torvalds static void *listening_get_next(struct seq_file *seq, void *cur)
22221da177e4SLinus Torvalds {
2223b08d4d3bSYonghong Song 	struct tcp_seq_afinfo *afinfo;
22241da177e4SLinus Torvalds 	struct tcp_iter_state *st = seq->private;
2225a4146b1bSDenis V. Lunev 	struct net *net = seq_file_net(seq);
22263b24d854SEric Dumazet 	struct inet_listen_hashbucket *ilb;
22278dbd76e7SEric Dumazet 	struct hlist_nulls_node *node;
22283b24d854SEric Dumazet 	struct sock *sk = cur;
22291da177e4SLinus Torvalds 
	/* The address-family filter comes from the bpf iterator state when
	 * set, otherwise from the proc entry data.
	 */
2230b08d4d3bSYonghong Song 	if (st->bpf_seq_afinfo)
2231b08d4d3bSYonghong Song 		afinfo = st->bpf_seq_afinfo;
2232b08d4d3bSYonghong Song 	else
2233b08d4d3bSYonghong Song 		afinfo = PDE_DATA(file_inode(seq->file));
2234b08d4d3bSYonghong Song 
22351da177e4SLinus Torvalds 	if (!sk) {
22363b24d854SEric Dumazet get_head:
		/* Start a new bucket: take its lock and begin at its head. */
2237a8b690f9STom Herbert 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
22389652dc2eSEric Dumazet 		spin_lock(&ilb->lock);
22398dbd76e7SEric Dumazet 		sk = sk_nulls_head(&ilb->nulls_head);
2240a8b690f9STom Herbert 		st->offset = 0;
22411da177e4SLinus Torvalds 		goto get_sk;
22421da177e4SLinus Torvalds 	}
	/* Continue in the current bucket, right after cur. */
22435caea4eaSEric Dumazet 	ilb = &tcp_hashinfo.listening_hash[st->bucket];
22441da177e4SLinus Torvalds 	++st->num;
2245a8b690f9STom Herbert 	++st->offset;
22461da177e4SLinus Torvalds 
22478dbd76e7SEric Dumazet 	sk = sk_nulls_next(sk);
22481da177e4SLinus Torvalds get_sk:
22498dbd76e7SEric Dumazet 	sk_nulls_for_each_from(sk, node) {
22508475ef9fSPavel Emelyanov 		if (!net_eq(sock_net(sk), net))
22518475ef9fSPavel Emelyanov 			continue;
2252b08d4d3bSYonghong Song 		if (afinfo->family == AF_UNSPEC ||
2253b08d4d3bSYonghong Song 		    sk->sk_family == afinfo->family)
22543b24d854SEric Dumazet 			return sk;
22551da177e4SLinus Torvalds 	}
	/* Bucket exhausted: drop its lock and move on to the next one. */
22569652dc2eSEric Dumazet 	spin_unlock(&ilb->lock);
2257a8b690f9STom Herbert 	st->offset = 0;
22583b24d854SEric Dumazet 	if (++st->bucket < INET_LHTABLE_SIZE)
22593b24d854SEric Dumazet 		goto get_head;
22603b24d854SEric Dumazet 	return NULL;
22611da177e4SLinus Torvalds }
22621da177e4SLinus Torvalds 
22631da177e4SLinus Torvalds static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
22641da177e4SLinus Torvalds {
2265a8b690f9STom Herbert 	struct tcp_iter_state *st = seq->private;
2266a8b690f9STom Herbert 	void *rc;
2267a8b690f9STom Herbert 
2268a8b690f9STom Herbert 	st->bucket = 0;
2269a8b690f9STom Herbert 	st->offset = 0;
2270a8b690f9STom Herbert 	rc = listening_get_next(seq, NULL);
22711da177e4SLinus Torvalds 
22721da177e4SLinus Torvalds 	while (rc && *pos) {
22731da177e4SLinus Torvalds 		rc = listening_get_next(seq, rc);
22741da177e4SLinus Torvalds 		--*pos;
22751da177e4SLinus Torvalds 	}
22761da177e4SLinus Torvalds 	return rc;
22771da177e4SLinus Torvalds }
22781da177e4SLinus Torvalds 
227905dbc7b5SEric Dumazet static inline bool empty_bucket(const struct tcp_iter_state *st)
22806eac5604SAndi Kleen {
228105dbc7b5SEric Dumazet 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
22826eac5604SAndi Kleen }
22836eac5604SAndi Kleen 
2284a8b690f9STom Herbert /*
2285a8b690f9STom Herbert  * Get first established socket starting from bucket given in st->bucket.
2286a8b690f9STom Herbert  * If st->bucket is zero, the very first socket in the hash is returned.
 *
 * Only sockets in the seq_file's network namespace whose family matches
 * afinfo->family (or any family for AF_UNSPEC) are returned.
 *
 * Locking: when a socket is returned, the ehash lock of bucket st->bucket
 * is left held (taken with spin_lock_bh() and not released on the
 * "goto out" path). Returns NULL with no lock held when no bucket from
 * st->bucket onwards contains a matching socket.
2287a8b690f9STom Herbert  */
22881da177e4SLinus Torvalds static void *established_get_first(struct seq_file *seq)
22891da177e4SLinus Torvalds {
2290b08d4d3bSYonghong Song 	struct tcp_seq_afinfo *afinfo;
22911da177e4SLinus Torvalds 	struct tcp_iter_state *st = seq->private;
2292a4146b1bSDenis V. Lunev 	struct net *net = seq_file_net(seq);
22931da177e4SLinus Torvalds 	void *rc = NULL;
22941da177e4SLinus Torvalds 
	/* The address-family filter comes from the bpf iterator state when
	 * set, otherwise from the proc entry data.
	 */
2295b08d4d3bSYonghong Song 	if (st->bpf_seq_afinfo)
2296b08d4d3bSYonghong Song 		afinfo = st->bpf_seq_afinfo;
2297b08d4d3bSYonghong Song 	else
2298b08d4d3bSYonghong Song 		afinfo = PDE_DATA(file_inode(seq->file));
2299b08d4d3bSYonghong Song 
2300a8b690f9STom Herbert 	st->offset = 0;
2301a8b690f9STom Herbert 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
23021da177e4SLinus Torvalds 		struct sock *sk;
23033ab5aee7SEric Dumazet 		struct hlist_nulls_node *node;
23049db66bdcSEric Dumazet 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
23051da177e4SLinus Torvalds 
23066eac5604SAndi Kleen 		/* Lockless fast path for the common case of empty buckets */
23076eac5604SAndi Kleen 		if (empty_bucket(st))
23086eac5604SAndi Kleen 			continue;
23096eac5604SAndi Kleen 
23109db66bdcSEric Dumazet 		spin_lock_bh(lock);
23113ab5aee7SEric Dumazet 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2312b08d4d3bSYonghong Song 			if ((afinfo->family != AF_UNSPEC &&
2313b08d4d3bSYonghong Song 			     sk->sk_family != afinfo->family) ||
2314878628fbSYOSHIFUJI Hideaki 			    !net_eq(sock_net(sk), net)) {
23151da177e4SLinus Torvalds 				continue;
23161da177e4SLinus Torvalds 			}
			/* Match found: return it with the bucket lock held. */
23171da177e4SLinus Torvalds 			rc = sk;
23181da177e4SLinus Torvalds 			goto out;
23191da177e4SLinus Torvalds 		}
23209db66bdcSEric Dumazet 		spin_unlock_bh(lock);
23211da177e4SLinus Torvalds 	}
23221da177e4SLinus Torvalds out:
23231da177e4SLinus Torvalds 	return rc;
23241da177e4SLinus Torvalds }
23251da177e4SLinus Torvalds 
/*
 * Get the established socket following cur that matches the seq_file's
 * network namespace and address-family filter.
 *
 * The remainder of the current bucket's chain is searched first. When it
 * is exhausted, the ehash lock of st->bucket is released (so it is
 * expected to be held on entry) and the search continues from the next
 * bucket via established_get_first(). st->num and st->offset are each
 * incremented once per call.
 */
23261da177e4SLinus Torvalds static void *established_get_next(struct seq_file *seq, void *cur)
23271da177e4SLinus Torvalds {
2328b08d4d3bSYonghong Song 	struct tcp_seq_afinfo *afinfo;
23291da177e4SLinus Torvalds 	struct sock *sk = cur;
23303ab5aee7SEric Dumazet 	struct hlist_nulls_node *node;
23311da177e4SLinus Torvalds 	struct tcp_iter_state *st = seq->private;
2332a4146b1bSDenis V. Lunev 	struct net *net = seq_file_net(seq);
23331da177e4SLinus Torvalds 
2334b08d4d3bSYonghong Song 	if (st->bpf_seq_afinfo)
2335b08d4d3bSYonghong Song 		afinfo = st->bpf_seq_afinfo;
2336b08d4d3bSYonghong Song 	else
2337b08d4d3bSYonghong Song 		afinfo = PDE_DATA(file_inode(seq->file));
2338b08d4d3bSYonghong Song 
23391da177e4SLinus Torvalds 	++st->num;
2340a8b690f9STom Herbert 	++st->offset;
23411da177e4SLinus Torvalds 
23423ab5aee7SEric Dumazet 	sk = sk_nulls_next(sk);
23431da177e4SLinus Torvalds 
23443ab5aee7SEric Dumazet 	sk_nulls_for_each_from(sk, node) {
2345b08d4d3bSYonghong Song 		if ((afinfo->family == AF_UNSPEC ||
2346b08d4d3bSYonghong Song 		     sk->sk_family == afinfo->family) &&
234737d849bbSChristoph Hellwig 		    net_eq(sock_net(sk), net))
234805dbc7b5SEric Dumazet 			return sk;
23491da177e4SLinus Torvalds 	}
23501da177e4SLinus Torvalds 
	/* Current bucket exhausted: unlock it and scan the following ones. */
235105dbc7b5SEric Dumazet 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
235205dbc7b5SEric Dumazet 	++st->bucket;
235305dbc7b5SEric Dumazet 	return established_get_first(seq);
23541da177e4SLinus Torvalds }
23551da177e4SLinus Torvalds 
23561da177e4SLinus Torvalds static void *established_get_idx(struct seq_file *seq, loff_t pos)
23571da177e4SLinus Torvalds {
2358a8b690f9STom Herbert 	struct tcp_iter_state *st = seq->private;
2359a8b690f9STom Herbert 	void *rc;
2360a8b690f9STom Herbert 
2361a8b690f9STom Herbert 	st->bucket = 0;
2362a8b690f9STom Herbert 	rc = established_get_first(seq);
23631da177e4SLinus Torvalds 
23641da177e4SLinus Torvalds 	while (rc && pos) {
23651da177e4SLinus Torvalds 		rc = established_get_next(seq, rc);
23661da177e4SLinus Torvalds 		--pos;
23671da177e4SLinus Torvalds 	}
23681da177e4SLinus Torvalds 	return rc;
23691da177e4SLinus Torvalds }
23701da177e4SLinus Torvalds 
23711da177e4SLinus Torvalds static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
23721da177e4SLinus Torvalds {
23731da177e4SLinus Torvalds 	void *rc;
23741da177e4SLinus Torvalds 	struct tcp_iter_state *st = seq->private;
23751da177e4SLinus Torvalds 
23761da177e4SLinus Torvalds 	st->state = TCP_SEQ_STATE_LISTENING;
23771da177e4SLinus Torvalds 	rc	  = listening_get_idx(seq, &pos);
23781da177e4SLinus Torvalds 
23791da177e4SLinus Torvalds 	if (!rc) {
23801da177e4SLinus Torvalds 		st->state = TCP_SEQ_STATE_ESTABLISHED;
23811da177e4SLinus Torvalds 		rc	  = established_get_idx(seq, pos);
23821da177e4SLinus Torvalds 	}
23831da177e4SLinus Torvalds 
23841da177e4SLinus Torvalds 	return rc;
23851da177e4SLinus Torvalds }
23861da177e4SLinus Torvalds 
2387a8b690f9STom Herbert static void *tcp_seek_last_pos(struct seq_file *seq)
2388a8b690f9STom Herbert {
2389a8b690f9STom Herbert 	struct tcp_iter_state *st = seq->private;
2390a8b690f9STom Herbert 	int offset = st->offset;
2391a8b690f9STom Herbert 	int orig_num = st->num;
2392a8b690f9STom Herbert 	void *rc = NULL;
2393a8b690f9STom Herbert 
2394a8b690f9STom Herbert 	switch (st->state) {
2395a8b690f9STom Herbert 	case TCP_SEQ_STATE_LISTENING:
2396a8b690f9STom Herbert 		if (st->bucket >= INET_LHTABLE_SIZE)
2397a8b690f9STom Herbert 			break;
2398a8b690f9STom Herbert 		st->state = TCP_SEQ_STATE_LISTENING;
2399a8b690f9STom Herbert 		rc = listening_get_next(seq, NULL);
2400a8b690f9STom Herbert 		while (offset-- && rc)
2401a8b690f9STom Herbert 			rc = listening_get_next(seq, rc);
2402a8b690f9STom Herbert 		if (rc)
2403a8b690f9STom Herbert 			break;
2404a8b690f9STom Herbert 		st->bucket = 0;
240505dbc7b5SEric Dumazet 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2406a8eceea8SJoe Perches 		fallthrough;
2407a8b690f9STom Herbert 	case TCP_SEQ_STATE_ESTABLISHED:
2408a8b690f9STom Herbert 		if (st->bucket > tcp_hashinfo.ehash_mask)
2409a8b690f9STom Herbert 			break;
2410a8b690f9STom Herbert 		rc = established_get_first(seq);
2411a8b690f9STom Herbert 		while (offset-- && rc)
2412a8b690f9STom Herbert 			rc = established_get_next(seq, rc);
2413a8b690f9STom Herbert 	}
2414a8b690f9STom Herbert 
2415a8b690f9STom Herbert 	st->num = orig_num;
2416a8b690f9STom Herbert 
2417a8b690f9STom Herbert 	return rc;
2418a8b690f9STom Herbert }
2419a8b690f9STom Herbert 
242037d849bbSChristoph Hellwig void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
24211da177e4SLinus Torvalds {
24221da177e4SLinus Torvalds 	struct tcp_iter_state *st = seq->private;
2423a8b690f9STom Herbert 	void *rc;
2424a8b690f9STom Herbert 
2425a8b690f9STom Herbert 	if (*pos && *pos == st->last_pos) {
2426a8b690f9STom Herbert 		rc = tcp_seek_last_pos(seq);
2427a8b690f9STom Herbert 		if (rc)
2428a8b690f9STom Herbert 			goto out;
2429a8b690f9STom Herbert 	}
2430a8b690f9STom Herbert 
24311da177e4SLinus Torvalds 	st->state = TCP_SEQ_STATE_LISTENING;
24321da177e4SLinus Torvalds 	st->num = 0;
2433a8b690f9STom Herbert 	st->bucket = 0;
2434a8b690f9STom Herbert 	st->offset = 0;
2435a8b690f9STom Herbert 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2436a8b690f9STom Herbert 
2437a8b690f9STom Herbert out:
2438a8b690f9STom Herbert 	st->last_pos = *pos;
2439a8b690f9STom Herbert 	return rc;
24401da177e4SLinus Torvalds }
244137d849bbSChristoph Hellwig EXPORT_SYMBOL(tcp_seq_start);
24421da177e4SLinus Torvalds 
244337d849bbSChristoph Hellwig void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
24441da177e4SLinus Torvalds {
2445a8b690f9STom Herbert 	struct tcp_iter_state *st = seq->private;
24461da177e4SLinus Torvalds 	void *rc = NULL;
24471da177e4SLinus Torvalds 
24481da177e4SLinus Torvalds 	if (v == SEQ_START_TOKEN) {
24491da177e4SLinus Torvalds 		rc = tcp_get_idx(seq, 0);
24501da177e4SLinus Torvalds 		goto out;
24511da177e4SLinus Torvalds 	}
24521da177e4SLinus Torvalds 
24531da177e4SLinus Torvalds 	switch (st->state) {
24541da177e4SLinus Torvalds 	case TCP_SEQ_STATE_LISTENING:
24551da177e4SLinus Torvalds 		rc = listening_get_next(seq, v);
24561da177e4SLinus Torvalds 		if (!rc) {
24571da177e4SLinus Torvalds 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2458a8b690f9STom Herbert 			st->bucket = 0;
2459a8b690f9STom Herbert 			st->offset = 0;
24601da177e4SLinus Torvalds 			rc	  = established_get_first(seq);
24611da177e4SLinus Torvalds 		}
24621da177e4SLinus Torvalds 		break;
24631da177e4SLinus Torvalds 	case TCP_SEQ_STATE_ESTABLISHED:
24641da177e4SLinus Torvalds 		rc = established_get_next(seq, v);
24651da177e4SLinus Torvalds 		break;
24661da177e4SLinus Torvalds 	}
24671da177e4SLinus Torvalds out:
24681da177e4SLinus Torvalds 	++*pos;
2469a8b690f9STom Herbert 	st->last_pos = *pos;
24701da177e4SLinus Torvalds 	return rc;
24711da177e4SLinus Torvalds }
247237d849bbSChristoph Hellwig EXPORT_SYMBOL(tcp_seq_next);
24731da177e4SLinus Torvalds 
247437d849bbSChristoph Hellwig void tcp_seq_stop(struct seq_file *seq, void *v)
24751da177e4SLinus Torvalds {
24761da177e4SLinus Torvalds 	struct tcp_iter_state *st = seq->private;
24771da177e4SLinus Torvalds 
24781da177e4SLinus Torvalds 	switch (st->state) {
24791da177e4SLinus Torvalds 	case TCP_SEQ_STATE_LISTENING:
24801da177e4SLinus Torvalds 		if (v != SEQ_START_TOKEN)
24819652dc2eSEric Dumazet 			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
24821da177e4SLinus Torvalds 		break;
24831da177e4SLinus Torvalds 	case TCP_SEQ_STATE_ESTABLISHED:
24841da177e4SLinus Torvalds 		if (v)
24859db66bdcSEric Dumazet 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
24861da177e4SLinus Torvalds 		break;
24871da177e4SLinus Torvalds 	}
24881da177e4SLinus Torvalds }
248937d849bbSChristoph Hellwig EXPORT_SYMBOL(tcp_seq_stop);
24901da177e4SLinus Torvalds 
2491d4f06873SEric Dumazet static void get_openreq4(const struct request_sock *req,
2492aa3a0c8cSEric Dumazet 			 struct seq_file *f, int i)
24931da177e4SLinus Torvalds {
24942e6599cbSArnaldo Carvalho de Melo 	const struct inet_request_sock *ireq = inet_rsk(req);
2495fa76ce73SEric Dumazet 	long delta = req->rsk_timer.expires - jiffies;
24961da177e4SLinus Torvalds 
24975e659e4cSPavel Emelyanov 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2498652586dfSTetsuo Handa 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
24991da177e4SLinus Torvalds 		i,
2500634fb979SEric Dumazet 		ireq->ir_loc_addr,
2501d4f06873SEric Dumazet 		ireq->ir_num,
2502634fb979SEric Dumazet 		ireq->ir_rmt_addr,
2503634fb979SEric Dumazet 		ntohs(ireq->ir_rmt_port),
25041da177e4SLinus Torvalds 		TCP_SYN_RECV,
25051da177e4SLinus Torvalds 		0, 0, /* could print option size, but that is af dependent. */
25061da177e4SLinus Torvalds 		1,    /* timers active (only the expire timer) */
2507a399a805SEric Dumazet 		jiffies_delta_to_clock_t(delta),
2508e6c022a4SEric Dumazet 		req->num_timeout,
2509aa3a0c8cSEric Dumazet 		from_kuid_munged(seq_user_ns(f),
2510aa3a0c8cSEric Dumazet 				 sock_i_uid(req->rsk_listener)),
25111da177e4SLinus Torvalds 		0,  /* non standard timer */
25121da177e4SLinus Torvalds 		0, /* open_requests have no inode */
2513d4f06873SEric Dumazet 		0,
2514652586dfSTetsuo Handa 		req);
25151da177e4SLinus Torvalds }
25161da177e4SLinus Torvalds 
2517652586dfSTetsuo Handa static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
25181da177e4SLinus Torvalds {
25191da177e4SLinus Torvalds 	int timer_active;
25201da177e4SLinus Torvalds 	unsigned long timer_expires;
2521cf533ea5SEric Dumazet 	const struct tcp_sock *tp = tcp_sk(sk);
2522cf4c6bf8SIlpo Järvinen 	const struct inet_connection_sock *icsk = inet_csk(sk);
2523cf533ea5SEric Dumazet 	const struct inet_sock *inet = inet_sk(sk);
25240536fcc0SEric Dumazet 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2525c720c7e8SEric Dumazet 	__be32 dest = inet->inet_daddr;
2526c720c7e8SEric Dumazet 	__be32 src = inet->inet_rcv_saddr;
2527c720c7e8SEric Dumazet 	__u16 destp = ntohs(inet->inet_dport);
2528c720c7e8SEric Dumazet 	__u16 srcp = ntohs(inet->inet_sport);
252949d09007SEric Dumazet 	int rx_queue;
253000fd38d9SEric Dumazet 	int state;
25311da177e4SLinus Torvalds 
25326ba8a3b1SNandita Dukkipati 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
253357dde7f7SYuchung Cheng 	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
25346ba8a3b1SNandita Dukkipati 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
25351da177e4SLinus Torvalds 		timer_active	= 1;
2536463c84b9SArnaldo Carvalho de Melo 		timer_expires	= icsk->icsk_timeout;
2537463c84b9SArnaldo Carvalho de Melo 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
25381da177e4SLinus Torvalds 		timer_active	= 4;
2539463c84b9SArnaldo Carvalho de Melo 		timer_expires	= icsk->icsk_timeout;
2540cf4c6bf8SIlpo Järvinen 	} else if (timer_pending(&sk->sk_timer)) {
25411da177e4SLinus Torvalds 		timer_active	= 2;
2542cf4c6bf8SIlpo Järvinen 		timer_expires	= sk->sk_timer.expires;
25431da177e4SLinus Torvalds 	} else {
25441da177e4SLinus Torvalds 		timer_active	= 0;
25451da177e4SLinus Torvalds 		timer_expires = jiffies;
25461da177e4SLinus Torvalds 	}
25471da177e4SLinus Torvalds 
2548986ffdfdSYafang Shao 	state = inet_sk_state_load(sk);
254900fd38d9SEric Dumazet 	if (state == TCP_LISTEN)
2550288efe86SEric Dumazet 		rx_queue = READ_ONCE(sk->sk_ack_backlog);
255149d09007SEric Dumazet 	else
255200fd38d9SEric Dumazet 		/* Because we don't lock the socket,
255300fd38d9SEric Dumazet 		 * we might find a transient negative value.
255449d09007SEric Dumazet 		 */
2555dba7d9b8SEric Dumazet 		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
25567db48e98SEric Dumazet 				      READ_ONCE(tp->copied_seq), 0);
255749d09007SEric Dumazet 
25585e659e4cSPavel Emelyanov 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2559652586dfSTetsuo Handa 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
256000fd38d9SEric Dumazet 		i, src, srcp, dest, destp, state,
25610f317464SEric Dumazet 		READ_ONCE(tp->write_seq) - tp->snd_una,
256249d09007SEric Dumazet 		rx_queue,
25631da177e4SLinus Torvalds 		timer_active,
2564a399a805SEric Dumazet 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2565463c84b9SArnaldo Carvalho de Melo 		icsk->icsk_retransmits,
2566a7cb5a49SEric W. Biederman 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
25676687e988SArnaldo Carvalho de Melo 		icsk->icsk_probes_out,
2568cf4c6bf8SIlpo Järvinen 		sock_i_ino(sk),
256941c6d650SReshetova, Elena 		refcount_read(&sk->sk_refcnt), sk,
25707be87351SStephen Hemminger 		jiffies_to_clock_t(icsk->icsk_rto),
25717be87351SStephen Hemminger 		jiffies_to_clock_t(icsk->icsk_ack.ato),
257231954cd8SWei Wang 		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
25731da177e4SLinus Torvalds 		tp->snd_cwnd,
257400fd38d9SEric Dumazet 		state == TCP_LISTEN ?
257500fd38d9SEric Dumazet 		    fastopenq->max_qlen :
2576652586dfSTetsuo Handa 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
25771da177e4SLinus Torvalds }
25781da177e4SLinus Torvalds 
2579cf533ea5SEric Dumazet static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2580652586dfSTetsuo Handa 			       struct seq_file *f, int i)
25811da177e4SLinus Torvalds {
2582789f558cSEric Dumazet 	long delta = tw->tw_timer.expires - jiffies;
258323f33c2dSAl Viro 	__be32 dest, src;
25841da177e4SLinus Torvalds 	__u16 destp, srcp;
25851da177e4SLinus Torvalds 
25861da177e4SLinus Torvalds 	dest  = tw->tw_daddr;
25871da177e4SLinus Torvalds 	src   = tw->tw_rcv_saddr;
25881da177e4SLinus Torvalds 	destp = ntohs(tw->tw_dport);
25891da177e4SLinus Torvalds 	srcp  = ntohs(tw->tw_sport);
25901da177e4SLinus Torvalds 
25915e659e4cSPavel Emelyanov 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2592652586dfSTetsuo Handa 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
25931da177e4SLinus Torvalds 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2594a399a805SEric Dumazet 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
259541c6d650SReshetova, Elena 		refcount_read(&tw->tw_refcnt), tw);
25961da177e4SLinus Torvalds }
25971da177e4SLinus Torvalds 
25981da177e4SLinus Torvalds #define TMPSZ 150
25991da177e4SLinus Torvalds 
26001da177e4SLinus Torvalds static int tcp4_seq_show(struct seq_file *seq, void *v)
26011da177e4SLinus Torvalds {
26021da177e4SLinus Torvalds 	struct tcp_iter_state *st;
260305dbc7b5SEric Dumazet 	struct sock *sk = v;
26041da177e4SLinus Torvalds 
2605652586dfSTetsuo Handa 	seq_setwidth(seq, TMPSZ - 1);
26061da177e4SLinus Torvalds 	if (v == SEQ_START_TOKEN) {
2607652586dfSTetsuo Handa 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
26081da177e4SLinus Torvalds 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
26091da177e4SLinus Torvalds 			   "inode");
26101da177e4SLinus Torvalds 		goto out;
26111da177e4SLinus Torvalds 	}
26121da177e4SLinus Torvalds 	st = seq->private;
26131da177e4SLinus Torvalds 
261405dbc7b5SEric Dumazet 	if (sk->sk_state == TCP_TIME_WAIT)
2615652586dfSTetsuo Handa 		get_timewait4_sock(v, seq, st->num);
2616079096f1SEric Dumazet 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2617079096f1SEric Dumazet 		get_openreq4(v, seq, st->num);
261805dbc7b5SEric Dumazet 	else
2619652586dfSTetsuo Handa 		get_tcp4_sock(v, seq, st->num);
26201da177e4SLinus Torvalds out:
2621652586dfSTetsuo Handa 	seq_pad(seq, '\n');
26221da177e4SLinus Torvalds 	return 0;
26231da177e4SLinus Torvalds }
26241da177e4SLinus Torvalds 
262552d87d5fSYonghong Song #ifdef CONFIG_BPF_SYSCALL
262652d87d5fSYonghong Song struct bpf_iter__tcp {
262752d87d5fSYonghong Song 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
262852d87d5fSYonghong Song 	__bpf_md_ptr(struct sock_common *, sk_common);
262952d87d5fSYonghong Song 	uid_t uid __aligned(8);
263052d87d5fSYonghong Song };
263152d87d5fSYonghong Song 
263252d87d5fSYonghong Song static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
263352d87d5fSYonghong Song 			     struct sock_common *sk_common, uid_t uid)
263452d87d5fSYonghong Song {
263552d87d5fSYonghong Song 	struct bpf_iter__tcp ctx;
263652d87d5fSYonghong Song 
263752d87d5fSYonghong Song 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
263852d87d5fSYonghong Song 	ctx.meta = meta;
263952d87d5fSYonghong Song 	ctx.sk_common = sk_common;
264052d87d5fSYonghong Song 	ctx.uid = uid;
264152d87d5fSYonghong Song 	return bpf_iter_run_prog(prog, &ctx);
264252d87d5fSYonghong Song }
264352d87d5fSYonghong Song 
264452d87d5fSYonghong Song static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
264552d87d5fSYonghong Song {
264652d87d5fSYonghong Song 	struct bpf_iter_meta meta;
264752d87d5fSYonghong Song 	struct bpf_prog *prog;
264852d87d5fSYonghong Song 	struct sock *sk = v;
264952d87d5fSYonghong Song 	uid_t uid;
265052d87d5fSYonghong Song 
265152d87d5fSYonghong Song 	if (v == SEQ_START_TOKEN)
265252d87d5fSYonghong Song 		return 0;
265352d87d5fSYonghong Song 
265452d87d5fSYonghong Song 	if (sk->sk_state == TCP_TIME_WAIT) {
265552d87d5fSYonghong Song 		uid = 0;
265652d87d5fSYonghong Song 	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
265752d87d5fSYonghong Song 		const struct request_sock *req = v;
265852d87d5fSYonghong Song 
265952d87d5fSYonghong Song 		uid = from_kuid_munged(seq_user_ns(seq),
266052d87d5fSYonghong Song 				       sock_i_uid(req->rsk_listener));
266152d87d5fSYonghong Song 	} else {
266252d87d5fSYonghong Song 		uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
266352d87d5fSYonghong Song 	}
266452d87d5fSYonghong Song 
266552d87d5fSYonghong Song 	meta.seq = seq;
266652d87d5fSYonghong Song 	prog = bpf_iter_get_info(&meta, false);
266752d87d5fSYonghong Song 	return tcp_prog_seq_show(prog, &meta, v, uid);
266852d87d5fSYonghong Song }
266952d87d5fSYonghong Song 
267052d87d5fSYonghong Song static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
267152d87d5fSYonghong Song {
267252d87d5fSYonghong Song 	struct bpf_iter_meta meta;
267352d87d5fSYonghong Song 	struct bpf_prog *prog;
267452d87d5fSYonghong Song 
267552d87d5fSYonghong Song 	if (!v) {
267652d87d5fSYonghong Song 		meta.seq = seq;
267752d87d5fSYonghong Song 		prog = bpf_iter_get_info(&meta, true);
267852d87d5fSYonghong Song 		if (prog)
267952d87d5fSYonghong Song 			(void)tcp_prog_seq_show(prog, &meta, v, 0);
268052d87d5fSYonghong Song 	}
268152d87d5fSYonghong Song 
268252d87d5fSYonghong Song 	tcp_seq_stop(seq, v);
268352d87d5fSYonghong Song }
268452d87d5fSYonghong Song 
268552d87d5fSYonghong Song static const struct seq_operations bpf_iter_tcp_seq_ops = {
268652d87d5fSYonghong Song 	.show		= bpf_iter_tcp_seq_show,
268752d87d5fSYonghong Song 	.start		= tcp_seq_start,
268852d87d5fSYonghong Song 	.next		= tcp_seq_next,
268952d87d5fSYonghong Song 	.stop		= bpf_iter_tcp_seq_stop,
269052d87d5fSYonghong Song };
269152d87d5fSYonghong Song #endif
269252d87d5fSYonghong Song 
269337d849bbSChristoph Hellwig static const struct seq_operations tcp4_seq_ops = {
269437d849bbSChristoph Hellwig 	.show		= tcp4_seq_show,
269537d849bbSChristoph Hellwig 	.start		= tcp_seq_start,
269637d849bbSChristoph Hellwig 	.next		= tcp_seq_next,
269737d849bbSChristoph Hellwig 	.stop		= tcp_seq_stop,
269837d849bbSChristoph Hellwig };
269937d849bbSChristoph Hellwig 
27001da177e4SLinus Torvalds static struct tcp_seq_afinfo tcp4_seq_afinfo = {
27011da177e4SLinus Torvalds 	.family		= AF_INET,
27021da177e4SLinus Torvalds };
27031da177e4SLinus Torvalds 
27042c8c1e72SAlexey Dobriyan static int __net_init tcp4_proc_init_net(struct net *net)
2705757764f6SPavel Emelyanov {
2706c3506372SChristoph Hellwig 	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2707c3506372SChristoph Hellwig 			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
270837d849bbSChristoph Hellwig 		return -ENOMEM;
270937d849bbSChristoph Hellwig 	return 0;
2710757764f6SPavel Emelyanov }
2711757764f6SPavel Emelyanov 
27122c8c1e72SAlexey Dobriyan static void __net_exit tcp4_proc_exit_net(struct net *net)
2713757764f6SPavel Emelyanov {
271437d849bbSChristoph Hellwig 	remove_proc_entry("tcp", net->proc_net);
2715757764f6SPavel Emelyanov }
2716757764f6SPavel Emelyanov 
2717757764f6SPavel Emelyanov static struct pernet_operations tcp4_net_ops = {
2718757764f6SPavel Emelyanov 	.init = tcp4_proc_init_net,
2719757764f6SPavel Emelyanov 	.exit = tcp4_proc_exit_net,
2720757764f6SPavel Emelyanov };
2721757764f6SPavel Emelyanov 
27221da177e4SLinus Torvalds int __init tcp4_proc_init(void)
27231da177e4SLinus Torvalds {
2724757764f6SPavel Emelyanov 	return register_pernet_subsys(&tcp4_net_ops);
27251da177e4SLinus Torvalds }
27261da177e4SLinus Torvalds 
27271da177e4SLinus Torvalds void tcp4_proc_exit(void)
27281da177e4SLinus Torvalds {
2729757764f6SPavel Emelyanov 	unregister_pernet_subsys(&tcp4_net_ops);
27301da177e4SLinus Torvalds }
27311da177e4SLinus Torvalds #endif /* CONFIG_PROC_FS */
27321da177e4SLinus Torvalds 
27331da177e4SLinus Torvalds struct proto tcp_prot = {
27341da177e4SLinus Torvalds 	.name			= "TCP",
27351da177e4SLinus Torvalds 	.owner			= THIS_MODULE,
27361da177e4SLinus Torvalds 	.close			= tcp_close,
2737d74bad4eSAndrey Ignatov 	.pre_connect		= tcp_v4_pre_connect,
27381da177e4SLinus Torvalds 	.connect		= tcp_v4_connect,
27391da177e4SLinus Torvalds 	.disconnect		= tcp_disconnect,
2740463c84b9SArnaldo Carvalho de Melo 	.accept			= inet_csk_accept,
27411da177e4SLinus Torvalds 	.ioctl			= tcp_ioctl,
27421da177e4SLinus Torvalds 	.init			= tcp_v4_init_sock,
27431da177e4SLinus Torvalds 	.destroy		= tcp_v4_destroy_sock,
27441da177e4SLinus Torvalds 	.shutdown		= tcp_shutdown,
27451da177e4SLinus Torvalds 	.setsockopt		= tcp_setsockopt,
27461da177e4SLinus Torvalds 	.getsockopt		= tcp_getsockopt,
27474b9d07a4SUrsula Braun 	.keepalive		= tcp_set_keepalive,
27481da177e4SLinus Torvalds 	.recvmsg		= tcp_recvmsg,
27497ba42910SChangli Gao 	.sendmsg		= tcp_sendmsg,
27507ba42910SChangli Gao 	.sendpage		= tcp_sendpage,
27511da177e4SLinus Torvalds 	.backlog_rcv		= tcp_v4_do_rcv,
275246d3ceabSEric Dumazet 	.release_cb		= tcp_release_cb,
2753ab1e0a13SArnaldo Carvalho de Melo 	.hash			= inet_hash,
2754ab1e0a13SArnaldo Carvalho de Melo 	.unhash			= inet_unhash,
2755ab1e0a13SArnaldo Carvalho de Melo 	.get_port		= inet_csk_get_port,
27561da177e4SLinus Torvalds 	.enter_memory_pressure	= tcp_enter_memory_pressure,
275706044751SEric Dumazet 	.leave_memory_pressure	= tcp_leave_memory_pressure,
2758c9bee3b7SEric Dumazet 	.stream_memory_free	= tcp_stream_memory_free,
27591da177e4SLinus Torvalds 	.sockets_allocated	= &tcp_sockets_allocated,
27600a5578cfSArnaldo Carvalho de Melo 	.orphan_count		= &tcp_orphan_count,
27611da177e4SLinus Torvalds 	.memory_allocated	= &tcp_memory_allocated,
27621da177e4SLinus Torvalds 	.memory_pressure	= &tcp_memory_pressure,
2763a4fe34bfSEric W. Biederman 	.sysctl_mem		= sysctl_tcp_mem,
2764356d1833SEric Dumazet 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
2765356d1833SEric Dumazet 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
27661da177e4SLinus Torvalds 	.max_header		= MAX_TCP_HEADER,
27671da177e4SLinus Torvalds 	.obj_size		= sizeof(struct tcp_sock),
27685f0d5a3aSPaul E. McKenney 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
27696d6ee43eSArnaldo Carvalho de Melo 	.twsk_prot		= &tcp_timewait_sock_ops,
277060236fddSArnaldo Carvalho de Melo 	.rsk_prot		= &tcp_request_sock_ops,
277139d8cda7SPavel Emelyanov 	.h.hashinfo		= &tcp_hashinfo,
27727ba42910SChangli Gao 	.no_autobind		= true,
2773c1e64e29SLorenzo Colitti 	.diag_destroy		= tcp_abort,
27741da177e4SLinus Torvalds };
27754bc2f18bSEric Dumazet EXPORT_SYMBOL(tcp_prot);
27761da177e4SLinus Torvalds 
2777046ee902SDenis V. Lunev static void __net_exit tcp_sk_exit(struct net *net)
2778046ee902SDenis V. Lunev {
2779bdbbb852SEric Dumazet 	int cpu;
2780bdbbb852SEric Dumazet 
2781b506bc97SDust Li 	if (net->ipv4.tcp_congestion_control)
27820baf26b0SMartin KaFai Lau 		bpf_module_put(net->ipv4.tcp_congestion_control,
27830baf26b0SMartin KaFai Lau 			       net->ipv4.tcp_congestion_control->owner);
27846670e152SStephen Hemminger 
2785bdbbb852SEric Dumazet 	for_each_possible_cpu(cpu)
2786bdbbb852SEric Dumazet 		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2787bdbbb852SEric Dumazet 	free_percpu(net->ipv4.tcp_sk);
2788bdbbb852SEric Dumazet }
2789bdbbb852SEric Dumazet 
2790bdbbb852SEric Dumazet static int __net_init tcp_sk_init(struct net *net)
2791bdbbb852SEric Dumazet {
2792fee83d09SHaishuang Yan 	int res, cpu, cnt;
2793bdbbb852SEric Dumazet 
2794bdbbb852SEric Dumazet 	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2795bdbbb852SEric Dumazet 	if (!net->ipv4.tcp_sk)
2796bdbbb852SEric Dumazet 		return -ENOMEM;
2797bdbbb852SEric Dumazet 
2798bdbbb852SEric Dumazet 	for_each_possible_cpu(cpu) {
2799bdbbb852SEric Dumazet 		struct sock *sk;
2800bdbbb852SEric Dumazet 
2801bdbbb852SEric Dumazet 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2802bdbbb852SEric Dumazet 					   IPPROTO_TCP, net);
2803bdbbb852SEric Dumazet 		if (res)
2804bdbbb852SEric Dumazet 			goto fail;
2805a9d6532bSEric Dumazet 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2806431280eeSEric Dumazet 
2807431280eeSEric Dumazet 		/* Please enforce IP_DF and IPID==0 for RST and
2808431280eeSEric Dumazet 		 * ACK sent in SYN-RECV and TIME-WAIT state.
2809431280eeSEric Dumazet 		 */
2810431280eeSEric Dumazet 		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
2811431280eeSEric Dumazet 
2812bdbbb852SEric Dumazet 		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2813bdbbb852SEric Dumazet 	}
281449213555SDaniel Borkmann 
2815bdbbb852SEric Dumazet 	net->ipv4.sysctl_tcp_ecn = 2;
281649213555SDaniel Borkmann 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
281749213555SDaniel Borkmann 
2818b0f9ca53SFan Du 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
28195f3e2bf0SEric Dumazet 	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
28206b58e0a5SFan Du 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
282105cbc0dbSFan Du 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2822c04b79b6SJosh Hunt 	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
2823bdbbb852SEric Dumazet 
282413b287e8SNikolay Borisov 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
28259bd6861bSNikolay Borisov 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2826b840d15dSNikolay Borisov 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
282713b287e8SNikolay Borisov 
28286fa25166SNikolay Borisov 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
28297c083ecbSNikolay Borisov 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
28300aca737dSDavid S. Miller 	net->ipv4.sysctl_tcp_syncookies = 1;
28311043e25fSNikolay Borisov 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2832ae5c3f40SNikolay Borisov 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2833c6214a97SNikolay Borisov 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2834c402d9beSNikolay Borisov 	net->ipv4.sysctl_tcp_orphan_retries = 0;
28351e579caaSNikolay Borisov 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
28364979f2d9SNikolay Borisov 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
283779e9fed4SMaciej Żenczykowski 	net->ipv4.sysctl_tcp_tw_reuse = 2;
283865e6d901SKevin(Yudong) Yang 	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
283912ed8244SNikolay Borisov 
2840fee83d09SHaishuang Yan 	cnt = tcp_hashinfo.ehash_mask + 1;
2841743e4815SYafang Shao 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
28421946e672SHaishuang Yan 	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
28431946e672SHaishuang Yan 
2844623d0c2dSEric Dumazet 	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
2845f9301034SEric Dumazet 	net->ipv4.sysctl_tcp_sack = 1;
28469bb37ef0SEric Dumazet 	net->ipv4.sysctl_tcp_window_scaling = 1;
28475d2ed052SEric Dumazet 	net->ipv4.sysctl_tcp_timestamps = 1;
28482ae21cf5SEric Dumazet 	net->ipv4.sysctl_tcp_early_retrans = 3;
2849e20223f1SEric Dumazet 	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2850b510f0d2SEric Dumazet 	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
2851e0a1e5b5SEric Dumazet 	net->ipv4.sysctl_tcp_retrans_collapse = 1;
2852c6e21803SEric Dumazet 	net->ipv4.sysctl_tcp_max_reordering = 300;
28536496f6bdSEric Dumazet 	net->ipv4.sysctl_tcp_dsack = 1;
28540c12654aSEric Dumazet 	net->ipv4.sysctl_tcp_app_win = 31;
285594f0893eSEric Dumazet 	net->ipv4.sysctl_tcp_adv_win_scale = 1;
2856af9b69a7SEric Dumazet 	net->ipv4.sysctl_tcp_frto = 2;
28574540c0cfSEric Dumazet 	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2858d06a9904SEric Dumazet 	/* This limits the percentage of the congestion window which we
2859d06a9904SEric Dumazet 	 * will allow a single TSO frame to consume.  Building TSO frames
2860d06a9904SEric Dumazet 	 * which are too large can cause TCP streams to be bursty.
2861d06a9904SEric Dumazet 	 */
2862d06a9904SEric Dumazet 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2863c73e5807SEric Dumazet 	/* Default TSQ limit of 16 TSO segments */
2864c73e5807SEric Dumazet 	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
2865b530b681SEric Dumazet 	/* rfc5961 challenge ack rate limiting */
2866b530b681SEric Dumazet 	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
286726e9596eSEric Dumazet 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
2868bd239704SEric Dumazet 	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2869790f00e1SEric Dumazet 	net->ipv4.sysctl_tcp_autocorking = 1;
28704170ba6bSEric Dumazet 	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
287123a7102aSEric Dumazet 	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2872c26e91f8SEric Dumazet 	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2873356d1833SEric Dumazet 	if (net != &init_net) {
2874356d1833SEric Dumazet 		memcpy(net->ipv4.sysctl_tcp_rmem,
2875356d1833SEric Dumazet 		       init_net.ipv4.sysctl_tcp_rmem,
2876356d1833SEric Dumazet 		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
2877356d1833SEric Dumazet 		memcpy(net->ipv4.sysctl_tcp_wmem,
2878356d1833SEric Dumazet 		       init_net.ipv4.sysctl_tcp_wmem,
2879356d1833SEric Dumazet 		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
2880356d1833SEric Dumazet 	}
28816d82aa24SEric Dumazet 	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2882a70437ccSEric Dumazet 	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
28839c21d2fcSEric Dumazet 	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
2884e1cfcbe8SHaishuang Yan 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
288543713848SHaishuang Yan 	spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
28863733be14SHaishuang Yan 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
28873733be14SHaishuang Yan 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2888e1cfcbe8SHaishuang Yan 
28896670e152SStephen Hemminger 	/* Reno is always built in */
28906670e152SStephen Hemminger 	if (!net_eq(net, &init_net) &&
28910baf26b0SMartin KaFai Lau 	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
28920baf26b0SMartin KaFai Lau 			       init_net.ipv4.tcp_congestion_control->owner))
28936670e152SStephen Hemminger 		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
28946670e152SStephen Hemminger 	else
28956670e152SStephen Hemminger 		net->ipv4.tcp_congestion_control = &tcp_reno;
28966670e152SStephen Hemminger 
289749213555SDaniel Borkmann 	return 0;
2898bdbbb852SEric Dumazet fail:
2899bdbbb852SEric Dumazet 	tcp_sk_exit(net);
2900bdbbb852SEric Dumazet 
2901bdbbb852SEric Dumazet 	return res;
2902b099ce26SEric W. Biederman }
2903b099ce26SEric W. Biederman 
2904b099ce26SEric W. Biederman static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2905b099ce26SEric W. Biederman {
290643713848SHaishuang Yan 	struct net *net;
290743713848SHaishuang Yan 
29081946e672SHaishuang Yan 	inet_twsk_purge(&tcp_hashinfo, AF_INET);
290943713848SHaishuang Yan 
291043713848SHaishuang Yan 	list_for_each_entry(net, net_exit_list, exit_list)
291143713848SHaishuang Yan 		tcp_fastopen_ctx_destroy(net);
2912046ee902SDenis V. Lunev }
2913046ee902SDenis V. Lunev 
2914046ee902SDenis V. Lunev static struct pernet_operations __net_initdata tcp_sk_ops = {
2915046ee902SDenis V. Lunev        .init	   = tcp_sk_init,
2916046ee902SDenis V. Lunev        .exit	   = tcp_sk_exit,
2917b099ce26SEric W. Biederman        .exit_batch = tcp_sk_exit_batch,
2918046ee902SDenis V. Lunev };
2919046ee902SDenis V. Lunev 
292052d87d5fSYonghong Song #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
292152d87d5fSYonghong Song DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
292252d87d5fSYonghong Song 		     struct sock_common *sk_common, uid_t uid)
292352d87d5fSYonghong Song 
292452d87d5fSYonghong Song static int bpf_iter_init_tcp(void *priv_data)
292552d87d5fSYonghong Song {
292652d87d5fSYonghong Song 	struct tcp_iter_state *st = priv_data;
292752d87d5fSYonghong Song 	struct tcp_seq_afinfo *afinfo;
292852d87d5fSYonghong Song 	int ret;
292952d87d5fSYonghong Song 
293052d87d5fSYonghong Song 	afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN);
293152d87d5fSYonghong Song 	if (!afinfo)
293252d87d5fSYonghong Song 		return -ENOMEM;
293352d87d5fSYonghong Song 
293452d87d5fSYonghong Song 	afinfo->family = AF_UNSPEC;
293552d87d5fSYonghong Song 	st->bpf_seq_afinfo = afinfo;
293652d87d5fSYonghong Song 	ret = bpf_iter_init_seq_net(priv_data);
293752d87d5fSYonghong Song 	if (ret)
293852d87d5fSYonghong Song 		kfree(afinfo);
293952d87d5fSYonghong Song 	return ret;
294052d87d5fSYonghong Song }
294152d87d5fSYonghong Song 
294252d87d5fSYonghong Song static void bpf_iter_fini_tcp(void *priv_data)
294352d87d5fSYonghong Song {
294452d87d5fSYonghong Song 	struct tcp_iter_state *st = priv_data;
294552d87d5fSYonghong Song 
294652d87d5fSYonghong Song 	kfree(st->bpf_seq_afinfo);
294752d87d5fSYonghong Song 	bpf_iter_fini_seq_net(priv_data);
294852d87d5fSYonghong Song }
294952d87d5fSYonghong Song 
2950*14fc6bd6SYonghong Song static const struct bpf_iter_seq_info tcp_seq_info = {
295152d87d5fSYonghong Song 	.seq_ops		= &bpf_iter_tcp_seq_ops,
295252d87d5fSYonghong Song 	.init_seq_private	= bpf_iter_init_tcp,
295352d87d5fSYonghong Song 	.fini_seq_private	= bpf_iter_fini_tcp,
295452d87d5fSYonghong Song 	.seq_priv_size		= sizeof(struct tcp_iter_state),
2955*14fc6bd6SYonghong Song };
2956*14fc6bd6SYonghong Song 
2957*14fc6bd6SYonghong Song static struct bpf_iter_reg tcp_reg_info = {
2958*14fc6bd6SYonghong Song 	.target			= "tcp",
295952d87d5fSYonghong Song 	.ctx_arg_info_size	= 1,
296052d87d5fSYonghong Song 	.ctx_arg_info		= {
296152d87d5fSYonghong Song 		{ offsetof(struct bpf_iter__tcp, sk_common),
296252d87d5fSYonghong Song 		  PTR_TO_BTF_ID_OR_NULL },
296352d87d5fSYonghong Song 	},
2964*14fc6bd6SYonghong Song 	.seq_info		= &tcp_seq_info,
296552d87d5fSYonghong Song };
296652d87d5fSYonghong Song 
296752d87d5fSYonghong Song static void __init bpf_iter_register(void)
296852d87d5fSYonghong Song {
2969951cf368SYonghong Song 	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
297052d87d5fSYonghong Song 	if (bpf_iter_reg_target(&tcp_reg_info))
297152d87d5fSYonghong Song 		pr_warn("Warning: could not register bpf iterator tcp\n");
297252d87d5fSYonghong Song }
297352d87d5fSYonghong Song 
297452d87d5fSYonghong Song #endif
297552d87d5fSYonghong Song 
29769b0f976fSDenis V. Lunev void __init tcp_v4_init(void)
29771da177e4SLinus Torvalds {
29786a1b3054SEric W. Biederman 	if (register_pernet_subsys(&tcp_sk_ops))
29791da177e4SLinus Torvalds 		panic("Failed to create the TCP control socket.\n");
298052d87d5fSYonghong Song 
298152d87d5fSYonghong Song #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
298252d87d5fSYonghong Song 	bpf_iter_register();
298352d87d5fSYonghong Song #endif
29841da177e4SLinus Torvalds }
2985