xref: /linux/net/ipv4/tcp_ipv4.c (revision a430a43d087545c96542ee64573237919109d370)
11da177e4SLinus Torvalds /*
21da177e4SLinus Torvalds  * INET		An implementation of the TCP/IP protocol suite for the LINUX
31da177e4SLinus Torvalds  *		operating system.  INET is implemented using the  BSD Socket
41da177e4SLinus Torvalds  *		interface as the means of communication with the user level.
51da177e4SLinus Torvalds  *
61da177e4SLinus Torvalds  *		Implementation of the Transmission Control Protocol(TCP).
71da177e4SLinus Torvalds  *
81da177e4SLinus Torvalds  * Version:	$Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
91da177e4SLinus Torvalds  *
101da177e4SLinus Torvalds  *		IPv4 specific functions
111da177e4SLinus Torvalds  *
121da177e4SLinus Torvalds  *
131da177e4SLinus Torvalds  *		code split from:
141da177e4SLinus Torvalds  *		linux/ipv4/tcp.c
151da177e4SLinus Torvalds  *		linux/ipv4/tcp_input.c
161da177e4SLinus Torvalds  *		linux/ipv4/tcp_output.c
171da177e4SLinus Torvalds  *
181da177e4SLinus Torvalds  *		See tcp.c for author information
191da177e4SLinus Torvalds  *
201da177e4SLinus Torvalds  *	This program is free software; you can redistribute it and/or
211da177e4SLinus Torvalds  *      modify it under the terms of the GNU General Public License
221da177e4SLinus Torvalds  *      as published by the Free Software Foundation; either version
231da177e4SLinus Torvalds  *      2 of the License, or (at your option) any later version.
241da177e4SLinus Torvalds  */
251da177e4SLinus Torvalds 
261da177e4SLinus Torvalds /*
271da177e4SLinus Torvalds  * Changes:
281da177e4SLinus Torvalds  *		David S. Miller	:	New socket lookup architecture.
291da177e4SLinus Torvalds  *					This code is dedicated to John Dyson.
301da177e4SLinus Torvalds  *		David S. Miller :	Change semantics of established hash,
311da177e4SLinus Torvalds  *					half is devoted to TIME_WAIT sockets
321da177e4SLinus Torvalds  *					and the rest go in the other half.
331da177e4SLinus Torvalds  *		Andi Kleen :		Add support for syncookies and fixed
341da177e4SLinus Torvalds  *					some bugs: ip options weren't passed to
351da177e4SLinus Torvalds  *					the TCP layer, missed a check for an
361da177e4SLinus Torvalds  *					ACK bit.
371da177e4SLinus Torvalds  *		Andi Kleen :		Implemented fast path mtu discovery.
381da177e4SLinus Torvalds  *	     				Fixed many serious bugs in the
3960236fddSArnaldo Carvalho de Melo  *					request_sock handling and moved
401da177e4SLinus Torvalds  *					most of it into the af independent code.
411da177e4SLinus Torvalds  *					Added tail drop and some other bugfixes.
42caa20d9aSStephen Hemminger  *					Added new listen semantics.
431da177e4SLinus Torvalds  *		Mike McLagan	:	Routing by source
441da177e4SLinus Torvalds  *	Juan Jose Ciarlante:		ip_dynaddr bits
451da177e4SLinus Torvalds  *		Andi Kleen:		various fixes.
461da177e4SLinus Torvalds  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
471da177e4SLinus Torvalds  *					coma.
481da177e4SLinus Torvalds  *	Andi Kleen		:	Fix new listen.
491da177e4SLinus Torvalds  *	Andi Kleen		:	Fix accept error reporting.
501da177e4SLinus Torvalds  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
511da177e4SLinus Torvalds  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
521da177e4SLinus Torvalds  *					a single port at the same time.
531da177e4SLinus Torvalds  */
541da177e4SLinus Torvalds 
551da177e4SLinus Torvalds 
561da177e4SLinus Torvalds #include <linux/types.h>
571da177e4SLinus Torvalds #include <linux/fcntl.h>
581da177e4SLinus Torvalds #include <linux/module.h>
591da177e4SLinus Torvalds #include <linux/random.h>
601da177e4SLinus Torvalds #include <linux/cache.h>
611da177e4SLinus Torvalds #include <linux/jhash.h>
621da177e4SLinus Torvalds #include <linux/init.h>
631da177e4SLinus Torvalds #include <linux/times.h>
641da177e4SLinus Torvalds 
651da177e4SLinus Torvalds #include <net/icmp.h>
66304a1618SArnaldo Carvalho de Melo #include <net/inet_hashtables.h>
671da177e4SLinus Torvalds #include <net/tcp.h>
6820380731SArnaldo Carvalho de Melo #include <net/transp_v6.h>
691da177e4SLinus Torvalds #include <net/ipv6.h>
701da177e4SLinus Torvalds #include <net/inet_common.h>
716d6ee43eSArnaldo Carvalho de Melo #include <net/timewait_sock.h>
721da177e4SLinus Torvalds #include <net/xfrm.h>
731a2449a8SChris Leech #include <net/netdma.h>
741da177e4SLinus Torvalds 
751da177e4SLinus Torvalds #include <linux/inet.h>
761da177e4SLinus Torvalds #include <linux/ipv6.h>
771da177e4SLinus Torvalds #include <linux/stddef.h>
781da177e4SLinus Torvalds #include <linux/proc_fs.h>
791da177e4SLinus Torvalds #include <linux/seq_file.h>
801da177e4SLinus Torvalds 
811da177e4SLinus Torvalds int sysctl_tcp_tw_reuse;
821da177e4SLinus Torvalds int sysctl_tcp_low_latency;
831da177e4SLinus Torvalds 
841da177e4SLinus Torvalds /* Check TCP sequence numbers in ICMP packets. */
851da177e4SLinus Torvalds #define ICMP_MIN_LENGTH 8
861da177e4SLinus Torvalds 
871da177e4SLinus Torvalds /* Socket used for sending RSTs */
881da177e4SLinus Torvalds static struct socket *tcp_socket;
891da177e4SLinus Torvalds 
908292a17aSArnaldo Carvalho de Melo void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb);
911da177e4SLinus Torvalds 
920f7ff927SArnaldo Carvalho de Melo struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
93e4d91918SIngo Molnar 	.lhash_lock	= __RW_LOCK_UNLOCKED(tcp_hashinfo.lhash_lock),
940f7ff927SArnaldo Carvalho de Melo 	.lhash_users	= ATOMIC_INIT(0),
950f7ff927SArnaldo Carvalho de Melo 	.lhash_wait	= __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
961da177e4SLinus Torvalds };
971da177e4SLinus Torvalds 
98463c84b9SArnaldo Carvalho de Melo static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
99463c84b9SArnaldo Carvalho de Melo {
100971af18bSArnaldo Carvalho de Melo 	return inet_csk_get_port(&tcp_hashinfo, sk, snum,
101971af18bSArnaldo Carvalho de Melo 				 inet_csk_bind_conflict);
102463c84b9SArnaldo Carvalho de Melo }
103463c84b9SArnaldo Carvalho de Melo 
1041da177e4SLinus Torvalds static void tcp_v4_hash(struct sock *sk)
1051da177e4SLinus Torvalds {
10681849d10SArnaldo Carvalho de Melo 	inet_hash(&tcp_hashinfo, sk);
1071da177e4SLinus Torvalds }
1081da177e4SLinus Torvalds 
1091da177e4SLinus Torvalds void tcp_unhash(struct sock *sk)
1101da177e4SLinus Torvalds {
11181849d10SArnaldo Carvalho de Melo 	inet_unhash(&tcp_hashinfo, sk);
1121da177e4SLinus Torvalds }
1131da177e4SLinus Torvalds 
1141da177e4SLinus Torvalds static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
1151da177e4SLinus Torvalds {
1161da177e4SLinus Torvalds 	return secure_tcp_sequence_number(skb->nh.iph->daddr,
1171da177e4SLinus Torvalds 					  skb->nh.iph->saddr,
1181da177e4SLinus Torvalds 					  skb->h.th->dest,
1191da177e4SLinus Torvalds 					  skb->h.th->source);
1201da177e4SLinus Torvalds }
1211da177e4SLinus Torvalds 
1226d6ee43eSArnaldo Carvalho de Melo int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
1236d6ee43eSArnaldo Carvalho de Melo {
1246d6ee43eSArnaldo Carvalho de Melo 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
1256d6ee43eSArnaldo Carvalho de Melo 	struct tcp_sock *tp = tcp_sk(sk);
1266d6ee43eSArnaldo Carvalho de Melo 
1276d6ee43eSArnaldo Carvalho de Melo 	/* With PAWS, it is safe from the viewpoint
1286d6ee43eSArnaldo Carvalho de Melo 	   of data integrity. Even without PAWS it is safe provided sequence
1296d6ee43eSArnaldo Carvalho de Melo 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
1306d6ee43eSArnaldo Carvalho de Melo 
1316d6ee43eSArnaldo Carvalho de Melo 	   Actually, the idea is close to VJ's one, only timestamp cache is
1326d6ee43eSArnaldo Carvalho de Melo 	   held not per host, but per port pair and TW bucket is used as state
1336d6ee43eSArnaldo Carvalho de Melo 	   holder.
1346d6ee43eSArnaldo Carvalho de Melo 
1356d6ee43eSArnaldo Carvalho de Melo 	   If TW bucket has been already destroyed we fall back to VJ's scheme
1366d6ee43eSArnaldo Carvalho de Melo 	   and use initial timestamp retrieved from peer table.
1376d6ee43eSArnaldo Carvalho de Melo 	 */
1386d6ee43eSArnaldo Carvalho de Melo 	if (tcptw->tw_ts_recent_stamp &&
1396d6ee43eSArnaldo Carvalho de Melo 	    (twp == NULL || (sysctl_tcp_tw_reuse &&
1406d6ee43eSArnaldo Carvalho de Melo 			     xtime.tv_sec - tcptw->tw_ts_recent_stamp > 1))) {
1416d6ee43eSArnaldo Carvalho de Melo 		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
1426d6ee43eSArnaldo Carvalho de Melo 		if (tp->write_seq == 0)
1436d6ee43eSArnaldo Carvalho de Melo 			tp->write_seq = 1;
1446d6ee43eSArnaldo Carvalho de Melo 		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
1456d6ee43eSArnaldo Carvalho de Melo 		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
1466d6ee43eSArnaldo Carvalho de Melo 		sock_hold(sktw);
1476d6ee43eSArnaldo Carvalho de Melo 		return 1;
1486d6ee43eSArnaldo Carvalho de Melo 	}
1496d6ee43eSArnaldo Carvalho de Melo 
1506d6ee43eSArnaldo Carvalho de Melo 	return 0;
1516d6ee43eSArnaldo Carvalho de Melo }
1526d6ee43eSArnaldo Carvalho de Melo 
1536d6ee43eSArnaldo Carvalho de Melo EXPORT_SYMBOL_GPL(tcp_twsk_unique);
1546d6ee43eSArnaldo Carvalho de Melo 
1551da177e4SLinus Torvalds /* This will initiate an outgoing connection. */
1561da177e4SLinus Torvalds int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
1571da177e4SLinus Torvalds {
1581da177e4SLinus Torvalds 	struct inet_sock *inet = inet_sk(sk);
1591da177e4SLinus Torvalds 	struct tcp_sock *tp = tcp_sk(sk);
1601da177e4SLinus Torvalds 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
1611da177e4SLinus Torvalds 	struct rtable *rt;
1621da177e4SLinus Torvalds 	u32 daddr, nexthop;
1631da177e4SLinus Torvalds 	int tmp;
1641da177e4SLinus Torvalds 	int err;
1651da177e4SLinus Torvalds 
1661da177e4SLinus Torvalds 	if (addr_len < sizeof(struct sockaddr_in))
1671da177e4SLinus Torvalds 		return -EINVAL;
1681da177e4SLinus Torvalds 
1691da177e4SLinus Torvalds 	if (usin->sin_family != AF_INET)
1701da177e4SLinus Torvalds 		return -EAFNOSUPPORT;
1711da177e4SLinus Torvalds 
1721da177e4SLinus Torvalds 	nexthop = daddr = usin->sin_addr.s_addr;
1731da177e4SLinus Torvalds 	if (inet->opt && inet->opt->srr) {
1741da177e4SLinus Torvalds 		if (!daddr)
1751da177e4SLinus Torvalds 			return -EINVAL;
1761da177e4SLinus Torvalds 		nexthop = inet->opt->faddr;
1771da177e4SLinus Torvalds 	}
1781da177e4SLinus Torvalds 
1791da177e4SLinus Torvalds 	tmp = ip_route_connect(&rt, nexthop, inet->saddr,
1801da177e4SLinus Torvalds 			       RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
1811da177e4SLinus Torvalds 			       IPPROTO_TCP,
1821da177e4SLinus Torvalds 			       inet->sport, usin->sin_port, sk);
1831da177e4SLinus Torvalds 	if (tmp < 0)
1841da177e4SLinus Torvalds 		return tmp;
1851da177e4SLinus Torvalds 
1861da177e4SLinus Torvalds 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
1871da177e4SLinus Torvalds 		ip_rt_put(rt);
1881da177e4SLinus Torvalds 		return -ENETUNREACH;
1891da177e4SLinus Torvalds 	}
1901da177e4SLinus Torvalds 
1911da177e4SLinus Torvalds 	if (!inet->opt || !inet->opt->srr)
1921da177e4SLinus Torvalds 		daddr = rt->rt_dst;
1931da177e4SLinus Torvalds 
1941da177e4SLinus Torvalds 	if (!inet->saddr)
1951da177e4SLinus Torvalds 		inet->saddr = rt->rt_src;
1961da177e4SLinus Torvalds 	inet->rcv_saddr = inet->saddr;
1971da177e4SLinus Torvalds 
1981da177e4SLinus Torvalds 	if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
1991da177e4SLinus Torvalds 		/* Reset inherited state */
2001da177e4SLinus Torvalds 		tp->rx_opt.ts_recent	   = 0;
2011da177e4SLinus Torvalds 		tp->rx_opt.ts_recent_stamp = 0;
2021da177e4SLinus Torvalds 		tp->write_seq		   = 0;
2031da177e4SLinus Torvalds 	}
2041da177e4SLinus Torvalds 
205295ff7edSArnaldo Carvalho de Melo 	if (tcp_death_row.sysctl_tw_recycle &&
2061da177e4SLinus Torvalds 	    !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
2071da177e4SLinus Torvalds 		struct inet_peer *peer = rt_get_peer(rt);
2081da177e4SLinus Torvalds 
2091da177e4SLinus Torvalds 		/* VJ's idea. We save last timestamp seen from
2101da177e4SLinus Torvalds 		 * the destination in peer table, when entering state TIME-WAIT
2111da177e4SLinus Torvalds 		 * and initialize rx_opt.ts_recent from it, when trying new connection.
2121da177e4SLinus Torvalds 		 */
2131da177e4SLinus Torvalds 
2141da177e4SLinus Torvalds 		if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
2151da177e4SLinus Torvalds 			tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
2161da177e4SLinus Torvalds 			tp->rx_opt.ts_recent = peer->tcp_ts;
2171da177e4SLinus Torvalds 		}
2181da177e4SLinus Torvalds 	}
2191da177e4SLinus Torvalds 
2201da177e4SLinus Torvalds 	inet->dport = usin->sin_port;
2211da177e4SLinus Torvalds 	inet->daddr = daddr;
2221da177e4SLinus Torvalds 
223d83d8461SArnaldo Carvalho de Melo 	inet_csk(sk)->icsk_ext_hdr_len = 0;
2241da177e4SLinus Torvalds 	if (inet->opt)
225d83d8461SArnaldo Carvalho de Melo 		inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
2261da177e4SLinus Torvalds 
2271da177e4SLinus Torvalds 	tp->rx_opt.mss_clamp = 536;
2281da177e4SLinus Torvalds 
2291da177e4SLinus Torvalds 	/* Socket identity is still unknown (sport may be zero).
2301da177e4SLinus Torvalds 	 * However we set state to SYN-SENT and not releasing socket
2311da177e4SLinus Torvalds 	 * lock select source port, enter ourselves into the hash tables and
2321da177e4SLinus Torvalds 	 * complete initialization after this.
2331da177e4SLinus Torvalds 	 */
2341da177e4SLinus Torvalds 	tcp_set_state(sk, TCP_SYN_SENT);
235a7f5e7f1SArnaldo Carvalho de Melo 	err = inet_hash_connect(&tcp_death_row, sk);
2361da177e4SLinus Torvalds 	if (err)
2371da177e4SLinus Torvalds 		goto failure;
2381da177e4SLinus Torvalds 
2395d39a795SPatrick McHardy 	err = ip_route_newports(&rt, IPPROTO_TCP, inet->sport, inet->dport, sk);
2401da177e4SLinus Torvalds 	if (err)
2411da177e4SLinus Torvalds 		goto failure;
2421da177e4SLinus Torvalds 
2431da177e4SLinus Torvalds 	/* OK, now commit destination to socket.  */
244bcd76111SHerbert Xu 	sk->sk_gso_type = SKB_GSO_TCPV4;
2456cbb0df7SArnaldo Carvalho de Melo 	sk_setup_caps(sk, &rt->u.dst);
2461da177e4SLinus Torvalds 
2471da177e4SLinus Torvalds 	if (!tp->write_seq)
2481da177e4SLinus Torvalds 		tp->write_seq = secure_tcp_sequence_number(inet->saddr,
2491da177e4SLinus Torvalds 							   inet->daddr,
2501da177e4SLinus Torvalds 							   inet->sport,
2511da177e4SLinus Torvalds 							   usin->sin_port);
2521da177e4SLinus Torvalds 
2531da177e4SLinus Torvalds 	inet->id = tp->write_seq ^ jiffies;
2541da177e4SLinus Torvalds 
2551da177e4SLinus Torvalds 	err = tcp_connect(sk);
2561da177e4SLinus Torvalds 	rt = NULL;
2571da177e4SLinus Torvalds 	if (err)
2581da177e4SLinus Torvalds 		goto failure;
2591da177e4SLinus Torvalds 
2601da177e4SLinus Torvalds 	return 0;
2611da177e4SLinus Torvalds 
2621da177e4SLinus Torvalds failure:
2631da177e4SLinus Torvalds 	/* This unhashes the socket and releases the local port, if necessary. */
2641da177e4SLinus Torvalds 	tcp_set_state(sk, TCP_CLOSE);
2651da177e4SLinus Torvalds 	ip_rt_put(rt);
2661da177e4SLinus Torvalds 	sk->sk_route_caps = 0;
2671da177e4SLinus Torvalds 	inet->dport = 0;
2681da177e4SLinus Torvalds 	return err;
2691da177e4SLinus Torvalds }
2701da177e4SLinus Torvalds 
2711da177e4SLinus Torvalds /*
2721da177e4SLinus Torvalds  * This routine does path mtu discovery as defined in RFC1191.
2731da177e4SLinus Torvalds  */
27440efc6faSStephen Hemminger static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
2751da177e4SLinus Torvalds {
2761da177e4SLinus Torvalds 	struct dst_entry *dst;
2771da177e4SLinus Torvalds 	struct inet_sock *inet = inet_sk(sk);
2781da177e4SLinus Torvalds 
2791da177e4SLinus Torvalds 	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
2801da177e4SLinus Torvalds 	 * send out by Linux are always <576bytes so they should go through
2811da177e4SLinus Torvalds 	 * unfragmented).
2821da177e4SLinus Torvalds 	 */
2831da177e4SLinus Torvalds 	if (sk->sk_state == TCP_LISTEN)
2841da177e4SLinus Torvalds 		return;
2851da177e4SLinus Torvalds 
2861da177e4SLinus Torvalds 	/* We don't check in the destentry if pmtu discovery is forbidden
2871da177e4SLinus Torvalds 	 * on this route. We just assume that no packet_to_big packets
2881da177e4SLinus Torvalds 	 * are send back when pmtu discovery is not active.
2891da177e4SLinus Torvalds      	 * There is a small race when the user changes this flag in the
2901da177e4SLinus Torvalds 	 * route, but I think that's acceptable.
2911da177e4SLinus Torvalds 	 */
2921da177e4SLinus Torvalds 	if ((dst = __sk_dst_check(sk, 0)) == NULL)
2931da177e4SLinus Torvalds 		return;
2941da177e4SLinus Torvalds 
2951da177e4SLinus Torvalds 	dst->ops->update_pmtu(dst, mtu);
2961da177e4SLinus Torvalds 
2971da177e4SLinus Torvalds 	/* Something is about to be wrong... Remember soft error
2981da177e4SLinus Torvalds 	 * for the case, if this connection will not able to recover.
2991da177e4SLinus Torvalds 	 */
3001da177e4SLinus Torvalds 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
3011da177e4SLinus Torvalds 		sk->sk_err_soft = EMSGSIZE;
3021da177e4SLinus Torvalds 
3031da177e4SLinus Torvalds 	mtu = dst_mtu(dst);
3041da177e4SLinus Torvalds 
3051da177e4SLinus Torvalds 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
306d83d8461SArnaldo Carvalho de Melo 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
3071da177e4SLinus Torvalds 		tcp_sync_mss(sk, mtu);
3081da177e4SLinus Torvalds 
3091da177e4SLinus Torvalds 		/* Resend the TCP packet because it's
3101da177e4SLinus Torvalds 		 * clear that the old packet has been
3111da177e4SLinus Torvalds 		 * dropped. This is the new "fast" path mtu
3121da177e4SLinus Torvalds 		 * discovery.
3131da177e4SLinus Torvalds 		 */
3141da177e4SLinus Torvalds 		tcp_simple_retransmit(sk);
3151da177e4SLinus Torvalds 	} /* else let the usual retransmit timer handle it */
3161da177e4SLinus Torvalds }
3171da177e4SLinus Torvalds 
3181da177e4SLinus Torvalds /*
3191da177e4SLinus Torvalds  * This routine is called by the ICMP module when it gets some
3201da177e4SLinus Torvalds  * sort of error condition.  If err < 0 then the socket should
3211da177e4SLinus Torvalds  * be closed and the error returned to the user.  If err > 0
3221da177e4SLinus Torvalds  * it's just the icmp type << 8 | icmp code.  After adjustment
3231da177e4SLinus Torvalds  * header points to the first 8 bytes of the tcp header.  We need
3241da177e4SLinus Torvalds  * to find the appropriate port.
3251da177e4SLinus Torvalds  *
3261da177e4SLinus Torvalds  * The locking strategy used here is very "optimistic". When
3271da177e4SLinus Torvalds  * someone else accesses the socket the ICMP is just dropped
3281da177e4SLinus Torvalds  * and for some paths there is no check at all.
3291da177e4SLinus Torvalds  * A more general error queue to queue errors for later handling
3301da177e4SLinus Torvalds  * is probably better.
3311da177e4SLinus Torvalds  *
3321da177e4SLinus Torvalds  */
3331da177e4SLinus Torvalds 
3341da177e4SLinus Torvalds void tcp_v4_err(struct sk_buff *skb, u32 info)
3351da177e4SLinus Torvalds {
3361da177e4SLinus Torvalds 	struct iphdr *iph = (struct iphdr *)skb->data;
3371da177e4SLinus Torvalds 	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
3381da177e4SLinus Torvalds 	struct tcp_sock *tp;
3391da177e4SLinus Torvalds 	struct inet_sock *inet;
3401da177e4SLinus Torvalds 	int type = skb->h.icmph->type;
3411da177e4SLinus Torvalds 	int code = skb->h.icmph->code;
3421da177e4SLinus Torvalds 	struct sock *sk;
3431da177e4SLinus Torvalds 	__u32 seq;
3441da177e4SLinus Torvalds 	int err;
3451da177e4SLinus Torvalds 
3461da177e4SLinus Torvalds 	if (skb->len < (iph->ihl << 2) + 8) {
3471da177e4SLinus Torvalds 		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
3481da177e4SLinus Torvalds 		return;
3491da177e4SLinus Torvalds 	}
3501da177e4SLinus Torvalds 
351e48c414eSArnaldo Carvalho de Melo 	sk = inet_lookup(&tcp_hashinfo, iph->daddr, th->dest, iph->saddr,
352463c84b9SArnaldo Carvalho de Melo 			 th->source, inet_iif(skb));
3531da177e4SLinus Torvalds 	if (!sk) {
3541da177e4SLinus Torvalds 		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
3551da177e4SLinus Torvalds 		return;
3561da177e4SLinus Torvalds 	}
3571da177e4SLinus Torvalds 	if (sk->sk_state == TCP_TIME_WAIT) {
3588feaf0c0SArnaldo Carvalho de Melo 		inet_twsk_put((struct inet_timewait_sock *)sk);
3591da177e4SLinus Torvalds 		return;
3601da177e4SLinus Torvalds 	}
3611da177e4SLinus Torvalds 
3621da177e4SLinus Torvalds 	bh_lock_sock(sk);
3631da177e4SLinus Torvalds 	/* If too many ICMPs get dropped on busy
3641da177e4SLinus Torvalds 	 * servers this needs to be solved differently.
3651da177e4SLinus Torvalds 	 */
3661da177e4SLinus Torvalds 	if (sock_owned_by_user(sk))
3671da177e4SLinus Torvalds 		NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
3681da177e4SLinus Torvalds 
3691da177e4SLinus Torvalds 	if (sk->sk_state == TCP_CLOSE)
3701da177e4SLinus Torvalds 		goto out;
3711da177e4SLinus Torvalds 
3721da177e4SLinus Torvalds 	tp = tcp_sk(sk);
3731da177e4SLinus Torvalds 	seq = ntohl(th->seq);
3741da177e4SLinus Torvalds 	if (sk->sk_state != TCP_LISTEN &&
3751da177e4SLinus Torvalds 	    !between(seq, tp->snd_una, tp->snd_nxt)) {
3761da177e4SLinus Torvalds 		NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
3771da177e4SLinus Torvalds 		goto out;
3781da177e4SLinus Torvalds 	}
3791da177e4SLinus Torvalds 
3801da177e4SLinus Torvalds 	switch (type) {
3811da177e4SLinus Torvalds 	case ICMP_SOURCE_QUENCH:
3821da177e4SLinus Torvalds 		/* Just silently ignore these. */
3831da177e4SLinus Torvalds 		goto out;
3841da177e4SLinus Torvalds 	case ICMP_PARAMETERPROB:
3851da177e4SLinus Torvalds 		err = EPROTO;
3861da177e4SLinus Torvalds 		break;
3871da177e4SLinus Torvalds 	case ICMP_DEST_UNREACH:
3881da177e4SLinus Torvalds 		if (code > NR_ICMP_UNREACH)
3891da177e4SLinus Torvalds 			goto out;
3901da177e4SLinus Torvalds 
3911da177e4SLinus Torvalds 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
3921da177e4SLinus Torvalds 			if (!sock_owned_by_user(sk))
3931da177e4SLinus Torvalds 				do_pmtu_discovery(sk, iph, info);
3941da177e4SLinus Torvalds 			goto out;
3951da177e4SLinus Torvalds 		}
3961da177e4SLinus Torvalds 
3971da177e4SLinus Torvalds 		err = icmp_err_convert[code].errno;
3981da177e4SLinus Torvalds 		break;
3991da177e4SLinus Torvalds 	case ICMP_TIME_EXCEEDED:
4001da177e4SLinus Torvalds 		err = EHOSTUNREACH;
4011da177e4SLinus Torvalds 		break;
4021da177e4SLinus Torvalds 	default:
4031da177e4SLinus Torvalds 		goto out;
4041da177e4SLinus Torvalds 	}
4051da177e4SLinus Torvalds 
4061da177e4SLinus Torvalds 	switch (sk->sk_state) {
40760236fddSArnaldo Carvalho de Melo 		struct request_sock *req, **prev;
4081da177e4SLinus Torvalds 	case TCP_LISTEN:
4091da177e4SLinus Torvalds 		if (sock_owned_by_user(sk))
4101da177e4SLinus Torvalds 			goto out;
4111da177e4SLinus Torvalds 
412463c84b9SArnaldo Carvalho de Melo 		req = inet_csk_search_req(sk, &prev, th->dest,
4131da177e4SLinus Torvalds 					  iph->daddr, iph->saddr);
4141da177e4SLinus Torvalds 		if (!req)
4151da177e4SLinus Torvalds 			goto out;
4161da177e4SLinus Torvalds 
4171da177e4SLinus Torvalds 		/* ICMPs are not backlogged, hence we cannot get
4181da177e4SLinus Torvalds 		   an established socket here.
4191da177e4SLinus Torvalds 		 */
4201da177e4SLinus Torvalds 		BUG_TRAP(!req->sk);
4211da177e4SLinus Torvalds 
4222e6599cbSArnaldo Carvalho de Melo 		if (seq != tcp_rsk(req)->snt_isn) {
4231da177e4SLinus Torvalds 			NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
4241da177e4SLinus Torvalds 			goto out;
4251da177e4SLinus Torvalds 		}
4261da177e4SLinus Torvalds 
4271da177e4SLinus Torvalds 		/*
4281da177e4SLinus Torvalds 		 * Still in SYN_RECV, just remove it silently.
4291da177e4SLinus Torvalds 		 * There is no good way to pass the error to the newly
4301da177e4SLinus Torvalds 		 * created socket, and POSIX does not want network
4311da177e4SLinus Torvalds 		 * errors returned from accept().
4321da177e4SLinus Torvalds 		 */
433463c84b9SArnaldo Carvalho de Melo 		inet_csk_reqsk_queue_drop(sk, req, prev);
4341da177e4SLinus Torvalds 		goto out;
4351da177e4SLinus Torvalds 
4361da177e4SLinus Torvalds 	case TCP_SYN_SENT:
4371da177e4SLinus Torvalds 	case TCP_SYN_RECV:  /* Cannot happen.
4381da177e4SLinus Torvalds 			       It can f.e. if SYNs crossed.
4391da177e4SLinus Torvalds 			     */
4401da177e4SLinus Torvalds 		if (!sock_owned_by_user(sk)) {
4411da177e4SLinus Torvalds 			TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
4421da177e4SLinus Torvalds 			sk->sk_err = err;
4431da177e4SLinus Torvalds 
4441da177e4SLinus Torvalds 			sk->sk_error_report(sk);
4451da177e4SLinus Torvalds 
4461da177e4SLinus Torvalds 			tcp_done(sk);
4471da177e4SLinus Torvalds 		} else {
4481da177e4SLinus Torvalds 			sk->sk_err_soft = err;
4491da177e4SLinus Torvalds 		}
4501da177e4SLinus Torvalds 		goto out;
4511da177e4SLinus Torvalds 	}
4521da177e4SLinus Torvalds 
4531da177e4SLinus Torvalds 	/* If we've already connected we will keep trying
4541da177e4SLinus Torvalds 	 * until we time out, or the user gives up.
4551da177e4SLinus Torvalds 	 *
4561da177e4SLinus Torvalds 	 * rfc1122 4.2.3.9 allows to consider as hard errors
4571da177e4SLinus Torvalds 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
4581da177e4SLinus Torvalds 	 * but it is obsoleted by pmtu discovery).
4591da177e4SLinus Torvalds 	 *
4601da177e4SLinus Torvalds 	 * Note, that in modern internet, where routing is unreliable
4611da177e4SLinus Torvalds 	 * and in each dark corner broken firewalls sit, sending random
4621da177e4SLinus Torvalds 	 * errors ordered by their masters even this two messages finally lose
4631da177e4SLinus Torvalds 	 * their original sense (even Linux sends invalid PORT_UNREACHs)
4641da177e4SLinus Torvalds 	 *
4651da177e4SLinus Torvalds 	 * Now we are in compliance with RFCs.
4661da177e4SLinus Torvalds 	 *							--ANK (980905)
4671da177e4SLinus Torvalds 	 */
4681da177e4SLinus Torvalds 
4691da177e4SLinus Torvalds 	inet = inet_sk(sk);
4701da177e4SLinus Torvalds 	if (!sock_owned_by_user(sk) && inet->recverr) {
4711da177e4SLinus Torvalds 		sk->sk_err = err;
4721da177e4SLinus Torvalds 		sk->sk_error_report(sk);
4731da177e4SLinus Torvalds 	} else	{ /* Only an error on timeout */
4741da177e4SLinus Torvalds 		sk->sk_err_soft = err;
4751da177e4SLinus Torvalds 	}
4761da177e4SLinus Torvalds 
4771da177e4SLinus Torvalds out:
4781da177e4SLinus Torvalds 	bh_unlock_sock(sk);
4791da177e4SLinus Torvalds 	sock_put(sk);
4801da177e4SLinus Torvalds }
4811da177e4SLinus Torvalds 
4821da177e4SLinus Torvalds /* This routine computes an IPv4 TCP checksum. */
4838292a17aSArnaldo Carvalho de Melo void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
4841da177e4SLinus Torvalds {
4851da177e4SLinus Torvalds 	struct inet_sock *inet = inet_sk(sk);
4868292a17aSArnaldo Carvalho de Melo 	struct tcphdr *th = skb->h.th;
4871da177e4SLinus Torvalds 
4881da177e4SLinus Torvalds 	if (skb->ip_summed == CHECKSUM_HW) {
4891da177e4SLinus Torvalds 		th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
4901da177e4SLinus Torvalds 		skb->csum = offsetof(struct tcphdr, check);
4911da177e4SLinus Torvalds 	} else {
4921da177e4SLinus Torvalds 		th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
4931da177e4SLinus Torvalds 					 csum_partial((char *)th,
4941da177e4SLinus Torvalds 						      th->doff << 2,
4951da177e4SLinus Torvalds 						      skb->csum));
4961da177e4SLinus Torvalds 	}
4971da177e4SLinus Torvalds }
4981da177e4SLinus Torvalds 
499*a430a43dSHerbert Xu int tcp_v4_gso_send_check(struct sk_buff *skb)
500*a430a43dSHerbert Xu {
501*a430a43dSHerbert Xu 	struct iphdr *iph;
502*a430a43dSHerbert Xu 	struct tcphdr *th;
503*a430a43dSHerbert Xu 
504*a430a43dSHerbert Xu 	if (!pskb_may_pull(skb, sizeof(*th)))
505*a430a43dSHerbert Xu 		return -EINVAL;
506*a430a43dSHerbert Xu 
507*a430a43dSHerbert Xu 	iph = skb->nh.iph;
508*a430a43dSHerbert Xu 	th = skb->h.th;
509*a430a43dSHerbert Xu 
510*a430a43dSHerbert Xu 	th->check = 0;
511*a430a43dSHerbert Xu 	th->check = ~tcp_v4_check(th, skb->len, iph->saddr, iph->daddr, 0);
512*a430a43dSHerbert Xu 	skb->csum = offsetof(struct tcphdr, check);
513*a430a43dSHerbert Xu 	skb->ip_summed = CHECKSUM_HW;
514*a430a43dSHerbert Xu 	return 0;
515*a430a43dSHerbert Xu }
516*a430a43dSHerbert Xu 
5171da177e4SLinus Torvalds /*
5181da177e4SLinus Torvalds  *	This routine will send an RST to the other tcp.
5191da177e4SLinus Torvalds  *
5201da177e4SLinus Torvalds  *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
5211da177e4SLinus Torvalds  *		      for reset.
5221da177e4SLinus Torvalds  *	Answer: if a packet caused RST, it is not for a socket
5231da177e4SLinus Torvalds  *		existing in our system, if it is matched to a socket,
5241da177e4SLinus Torvalds  *		it is just duplicate segment or bug in other side's TCP.
5251da177e4SLinus Torvalds  *		So that we build reply only basing on parameters
5261da177e4SLinus Torvalds  *		arrived with segment.
5271da177e4SLinus Torvalds  *	Exception: precedence violation. We do not implement it in any case.
5281da177e4SLinus Torvalds  */
5291da177e4SLinus Torvalds 
5301da177e4SLinus Torvalds static void tcp_v4_send_reset(struct sk_buff *skb)
5311da177e4SLinus Torvalds {
5321da177e4SLinus Torvalds 	struct tcphdr *th = skb->h.th;
5331da177e4SLinus Torvalds 	struct tcphdr rth;
5341da177e4SLinus Torvalds 	struct ip_reply_arg arg;
5351da177e4SLinus Torvalds 
5361da177e4SLinus Torvalds 	/* Never send a reset in response to a reset. */
5371da177e4SLinus Torvalds 	if (th->rst)
5381da177e4SLinus Torvalds 		return;
5391da177e4SLinus Torvalds 
5401da177e4SLinus Torvalds 	if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
5411da177e4SLinus Torvalds 		return;
5421da177e4SLinus Torvalds 
5431da177e4SLinus Torvalds 	/* Swap the send and the receive. */
5441da177e4SLinus Torvalds 	memset(&rth, 0, sizeof(struct tcphdr));
5451da177e4SLinus Torvalds 	rth.dest   = th->source;
5461da177e4SLinus Torvalds 	rth.source = th->dest;
5471da177e4SLinus Torvalds 	rth.doff   = sizeof(struct tcphdr) / 4;
5481da177e4SLinus Torvalds 	rth.rst    = 1;
5491da177e4SLinus Torvalds 
5501da177e4SLinus Torvalds 	if (th->ack) {
5511da177e4SLinus Torvalds 		rth.seq = th->ack_seq;
5521da177e4SLinus Torvalds 	} else {
5531da177e4SLinus Torvalds 		rth.ack = 1;
5541da177e4SLinus Torvalds 		rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
5551da177e4SLinus Torvalds 				    skb->len - (th->doff << 2));
5561da177e4SLinus Torvalds 	}
5571da177e4SLinus Torvalds 
5581da177e4SLinus Torvalds 	memset(&arg, 0, sizeof arg);
5591da177e4SLinus Torvalds 	arg.iov[0].iov_base = (unsigned char *)&rth;
5601da177e4SLinus Torvalds 	arg.iov[0].iov_len  = sizeof rth;
5611da177e4SLinus Torvalds 	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
5621da177e4SLinus Torvalds 				      skb->nh.iph->saddr, /*XXX*/
5631da177e4SLinus Torvalds 				      sizeof(struct tcphdr), IPPROTO_TCP, 0);
5641da177e4SLinus Torvalds 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
5651da177e4SLinus Torvalds 
5661da177e4SLinus Torvalds 	ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
5671da177e4SLinus Torvalds 
5681da177e4SLinus Torvalds 	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
5691da177e4SLinus Torvalds 	TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
5701da177e4SLinus Torvalds }
5711da177e4SLinus Torvalds 
5721da177e4SLinus Torvalds /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
5731da177e4SLinus Torvalds    outside socket context is ugly, certainly. What can I do?
5741da177e4SLinus Torvalds  */
5751da177e4SLinus Torvalds 
5761da177e4SLinus Torvalds static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
5771da177e4SLinus Torvalds 			    u32 win, u32 ts)
5781da177e4SLinus Torvalds {
5791da177e4SLinus Torvalds 	struct tcphdr *th = skb->h.th;
5801da177e4SLinus Torvalds 	struct {
5811da177e4SLinus Torvalds 		struct tcphdr th;
5821da177e4SLinus Torvalds 		u32 tsopt[3];
5831da177e4SLinus Torvalds 	} rep;
5841da177e4SLinus Torvalds 	struct ip_reply_arg arg;
5851da177e4SLinus Torvalds 
5861da177e4SLinus Torvalds 	memset(&rep.th, 0, sizeof(struct tcphdr));
5871da177e4SLinus Torvalds 	memset(&arg, 0, sizeof arg);
5881da177e4SLinus Torvalds 
5891da177e4SLinus Torvalds 	arg.iov[0].iov_base = (unsigned char *)&rep;
5901da177e4SLinus Torvalds 	arg.iov[0].iov_len  = sizeof(rep.th);
5911da177e4SLinus Torvalds 	if (ts) {
5921da177e4SLinus Torvalds 		rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
5931da177e4SLinus Torvalds 				     (TCPOPT_TIMESTAMP << 8) |
5941da177e4SLinus Torvalds 				     TCPOLEN_TIMESTAMP);
5951da177e4SLinus Torvalds 		rep.tsopt[1] = htonl(tcp_time_stamp);
5961da177e4SLinus Torvalds 		rep.tsopt[2] = htonl(ts);
5971da177e4SLinus Torvalds 		arg.iov[0].iov_len = sizeof(rep);
5981da177e4SLinus Torvalds 	}
5991da177e4SLinus Torvalds 
6001da177e4SLinus Torvalds 	/* Swap the send and the receive. */
6011da177e4SLinus Torvalds 	rep.th.dest    = th->source;
6021da177e4SLinus Torvalds 	rep.th.source  = th->dest;
6031da177e4SLinus Torvalds 	rep.th.doff    = arg.iov[0].iov_len / 4;
6041da177e4SLinus Torvalds 	rep.th.seq     = htonl(seq);
6051da177e4SLinus Torvalds 	rep.th.ack_seq = htonl(ack);
6061da177e4SLinus Torvalds 	rep.th.ack     = 1;
6071da177e4SLinus Torvalds 	rep.th.window  = htons(win);
6081da177e4SLinus Torvalds 
6091da177e4SLinus Torvalds 	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
6101da177e4SLinus Torvalds 				      skb->nh.iph->saddr, /*XXX*/
6111da177e4SLinus Torvalds 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
6121da177e4SLinus Torvalds 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
6131da177e4SLinus Torvalds 
6141da177e4SLinus Torvalds 	ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
6151da177e4SLinus Torvalds 
6161da177e4SLinus Torvalds 	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
6171da177e4SLinus Torvalds }
6181da177e4SLinus Torvalds 
6191da177e4SLinus Torvalds static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
6201da177e4SLinus Torvalds {
6218feaf0c0SArnaldo Carvalho de Melo 	struct inet_timewait_sock *tw = inet_twsk(sk);
6228feaf0c0SArnaldo Carvalho de Melo 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
6231da177e4SLinus Torvalds 
6248feaf0c0SArnaldo Carvalho de Melo 	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
6258feaf0c0SArnaldo Carvalho de Melo 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcptw->tw_ts_recent);
6261da177e4SLinus Torvalds 
6278feaf0c0SArnaldo Carvalho de Melo 	inet_twsk_put(tw);
6281da177e4SLinus Torvalds }
6291da177e4SLinus Torvalds 
63060236fddSArnaldo Carvalho de Melo static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
6311da177e4SLinus Torvalds {
6322e6599cbSArnaldo Carvalho de Melo 	tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
6331da177e4SLinus Torvalds 			req->ts_recent);
6341da177e4SLinus Torvalds }
6351da177e4SLinus Torvalds 
6361da177e4SLinus Torvalds /*
6371da177e4SLinus Torvalds  *	Send a SYN-ACK after having received an ACK.
63860236fddSArnaldo Carvalho de Melo  *	This still operates on a request_sock only, not on a big
6391da177e4SLinus Torvalds  *	socket.
6401da177e4SLinus Torvalds  */
64160236fddSArnaldo Carvalho de Melo static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
6421da177e4SLinus Torvalds 			      struct dst_entry *dst)
6431da177e4SLinus Torvalds {
6442e6599cbSArnaldo Carvalho de Melo 	const struct inet_request_sock *ireq = inet_rsk(req);
6451da177e4SLinus Torvalds 	int err = -1;
6461da177e4SLinus Torvalds 	struct sk_buff * skb;
6471da177e4SLinus Torvalds 
6481da177e4SLinus Torvalds 	/* First, grab a route. */
649463c84b9SArnaldo Carvalho de Melo 	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
6501da177e4SLinus Torvalds 		goto out;
6511da177e4SLinus Torvalds 
6521da177e4SLinus Torvalds 	skb = tcp_make_synack(sk, dst, req);
6531da177e4SLinus Torvalds 
6541da177e4SLinus Torvalds 	if (skb) {
6551da177e4SLinus Torvalds 		struct tcphdr *th = skb->h.th;
6561da177e4SLinus Torvalds 
6571da177e4SLinus Torvalds 		th->check = tcp_v4_check(th, skb->len,
6582e6599cbSArnaldo Carvalho de Melo 					 ireq->loc_addr,
6592e6599cbSArnaldo Carvalho de Melo 					 ireq->rmt_addr,
6601da177e4SLinus Torvalds 					 csum_partial((char *)th, skb->len,
6611da177e4SLinus Torvalds 						      skb->csum));
6621da177e4SLinus Torvalds 
6632e6599cbSArnaldo Carvalho de Melo 		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
6642e6599cbSArnaldo Carvalho de Melo 					    ireq->rmt_addr,
6652e6599cbSArnaldo Carvalho de Melo 					    ireq->opt);
6661da177e4SLinus Torvalds 		if (err == NET_XMIT_CN)
6671da177e4SLinus Torvalds 			err = 0;
6681da177e4SLinus Torvalds 	}
6691da177e4SLinus Torvalds 
6701da177e4SLinus Torvalds out:
6711da177e4SLinus Torvalds 	dst_release(dst);
6721da177e4SLinus Torvalds 	return err;
6731da177e4SLinus Torvalds }
6741da177e4SLinus Torvalds 
6751da177e4SLinus Torvalds /*
67660236fddSArnaldo Carvalho de Melo  *	IPv4 request_sock destructor.
6771da177e4SLinus Torvalds  */
67860236fddSArnaldo Carvalho de Melo static void tcp_v4_reqsk_destructor(struct request_sock *req)
6791da177e4SLinus Torvalds {
6802e6599cbSArnaldo Carvalho de Melo 	kfree(inet_rsk(req)->opt);
6811da177e4SLinus Torvalds }
6821da177e4SLinus Torvalds 
#ifdef CONFIG_SYN_COOKIES
/*
 * Log that syncookies are being sent for the destination port of @skb.
 * The message is emitted at most once per 60 seconds of jiffies time.
 */
static void syn_flood_warning(struct sk_buff *skb)
{
	static unsigned long last_warned;

	if (!time_after(jiffies, (last_warned + HZ * 60)))
		return;

	last_warned = jiffies;
	printk(KERN_INFO
	       "possible SYN flooding on port %d. Sending cookies.\n",
	       ntohs(skb->h.th->dest));
}
#endif
6961da177e4SLinus Torvalds 
6971da177e4SLinus Torvalds /*
69860236fddSArnaldo Carvalho de Melo  * Save and compile IPv4 options into the request_sock if needed.
6991da177e4SLinus Torvalds  */
70040efc6faSStephen Hemminger static struct ip_options *tcp_v4_save_options(struct sock *sk,
7011da177e4SLinus Torvalds 					      struct sk_buff *skb)
7021da177e4SLinus Torvalds {
7031da177e4SLinus Torvalds 	struct ip_options *opt = &(IPCB(skb)->opt);
7041da177e4SLinus Torvalds 	struct ip_options *dopt = NULL;
7051da177e4SLinus Torvalds 
7061da177e4SLinus Torvalds 	if (opt && opt->optlen) {
7071da177e4SLinus Torvalds 		int opt_size = optlength(opt);
7081da177e4SLinus Torvalds 		dopt = kmalloc(opt_size, GFP_ATOMIC);
7091da177e4SLinus Torvalds 		if (dopt) {
7101da177e4SLinus Torvalds 			if (ip_options_echo(dopt, skb)) {
7111da177e4SLinus Torvalds 				kfree(dopt);
7121da177e4SLinus Torvalds 				dopt = NULL;
7131da177e4SLinus Torvalds 			}
7141da177e4SLinus Torvalds 		}
7151da177e4SLinus Torvalds 	}
7161da177e4SLinus Torvalds 	return dopt;
7171da177e4SLinus Torvalds }
7181da177e4SLinus Torvalds 
/*
 * request_sock operations for IPv4 TCP.  tcp_v4_conn_request() passes this
 * table to reqsk_alloc(); obj_size is the size of struct tcp_request_sock
 * and the callbacks are the IPv4 helpers of this file.
 */
71960236fddSArnaldo Carvalho de Melo struct request_sock_ops tcp_request_sock_ops = {
7201da177e4SLinus Torvalds 	.family		=	PF_INET,
7212e6599cbSArnaldo Carvalho de Melo 	.obj_size	=	sizeof(struct tcp_request_sock),
7221da177e4SLinus Torvalds 	.rtx_syn_ack	=	tcp_v4_send_synack,
72360236fddSArnaldo Carvalho de Melo 	.send_ack	=	tcp_v4_reqsk_send_ack,
72460236fddSArnaldo Carvalho de Melo 	.destructor	=	tcp_v4_reqsk_destructor,
7251da177e4SLinus Torvalds 	.send_reset	=	tcp_v4_send_reset,
7261da177e4SLinus Torvalds };
7271da177e4SLinus Torvalds 
/*
 * timewait_sock operations for TCP: the object size is that of
 * struct tcp_timewait_sock and tcp_twsk_unique is the twsk_unique callback.
 */
7286d6ee43eSArnaldo Carvalho de Melo static struct timewait_sock_ops tcp_timewait_sock_ops = {
7296d6ee43eSArnaldo Carvalho de Melo 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
7306d6ee43eSArnaldo Carvalho de Melo 	.twsk_unique	= tcp_twsk_unique,
7316d6ee43eSArnaldo Carvalho de Melo };
7326d6ee43eSArnaldo Carvalho de Melo 
/*
 * Handle a connection request carried by @skb for listening socket @sk.
 *
 * A request_sock is allocated, the TCP options of @skb are parsed into it,
 * an initial sequence number is chosen and a SYN-ACK is sent through
 * tcp_v4_send_synack().  On success the request is queued with
 * inet_csk_reqsk_queue_hash_add(); when a syncookie was used instead the
 * request_sock is freed again right after the SYN-ACK went out.
 *
 * The request is dropped (counted as TCP_MIB_ATTEMPTFAILS) when the route
 * of @skb is broadcast/multicast, when the request queue is full and
 * syncookies are not used, when the accept queue is full with more than
 * one young request pending, on allocation failure, on the PAWS and
 * backlog-pressure checks below, or when sending the SYN-ACK fails.
 *
 * Returns 0 on every path.
 *
 * NOTE(review): without CONFIG_SYN_COOKIES, want_cookie is a macro that is
 * not #undef'ed inside this function - check the rest of the file.
 */
7331da177e4SLinus Torvalds int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
7341da177e4SLinus Torvalds {
7352e6599cbSArnaldo Carvalho de Melo 	struct inet_request_sock *ireq;
7361da177e4SLinus Torvalds 	struct tcp_options_received tmp_opt;
73760236fddSArnaldo Carvalho de Melo 	struct request_sock *req;
7381da177e4SLinus Torvalds 	__u32 saddr = skb->nh.iph->saddr;
7391da177e4SLinus Torvalds 	__u32 daddr = skb->nh.iph->daddr;
7401da177e4SLinus Torvalds 	__u32 isn = TCP_SKB_CB(skb)->when;
7411da177e4SLinus Torvalds 	struct dst_entry *dst = NULL;
7421da177e4SLinus Torvalds #ifdef CONFIG_SYN_COOKIES
7431da177e4SLinus Torvalds 	int want_cookie = 0;
7441da177e4SLinus Torvalds #else
7451da177e4SLinus Torvalds #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
7461da177e4SLinus Torvalds #endif
7471da177e4SLinus Torvalds 
7481da177e4SLinus Torvalds 	/* Never answer to SYNs sent to broadcast or multicast */
7491da177e4SLinus Torvalds 	if (((struct rtable *)skb->dst)->rt_flags &
7501da177e4SLinus Torvalds 	    (RTCF_BROADCAST | RTCF_MULTICAST))
7511da177e4SLinus Torvalds 		goto drop;
7521da177e4SLinus Torvalds 
7531da177e4SLinus Torvalds 	/* TW buckets are converted to open requests without
7541da177e4SLinus Torvalds 	 * limitations, they conserve resources and peer is
7551da177e4SLinus Torvalds 	 * evidently real one.
7561da177e4SLinus Torvalds 	 */
757463c84b9SArnaldo Carvalho de Melo 	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
7581da177e4SLinus Torvalds #ifdef CONFIG_SYN_COOKIES
7591da177e4SLinus Torvalds 		if (sysctl_tcp_syncookies) {
7601da177e4SLinus Torvalds 			want_cookie = 1;
7611da177e4SLinus Torvalds 		} else
7621da177e4SLinus Torvalds #endif
7631da177e4SLinus Torvalds 		goto drop;
7641da177e4SLinus Torvalds 	}
7651da177e4SLinus Torvalds 
7661da177e4SLinus Torvalds 	/* Accept backlog is full. If we have already queued enough
7671da177e4SLinus Torvalds 	 * of warm entries in syn queue, drop request. It is better than
7681da177e4SLinus Torvalds 	 * clogging syn queue with openreqs with exponentially increasing
7691da177e4SLinus Torvalds 	 * timeout.
7701da177e4SLinus Torvalds 	 */
771463c84b9SArnaldo Carvalho de Melo 	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
7721da177e4SLinus Torvalds 		goto drop;
7731da177e4SLinus Torvalds 
77460236fddSArnaldo Carvalho de Melo 	req = reqsk_alloc(&tcp_request_sock_ops);
7751da177e4SLinus Torvalds 	if (!req)
7761da177e4SLinus Torvalds 		goto drop;
7771da177e4SLinus Torvalds 
7781da177e4SLinus Torvalds 	tcp_clear_options(&tmp_opt);
7791da177e4SLinus Torvalds 	tmp_opt.mss_clamp = 536;
7801da177e4SLinus Torvalds 	tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
7811da177e4SLinus Torvalds 
7821da177e4SLinus Torvalds 	tcp_parse_options(skb, &tmp_opt, 0);
7831da177e4SLinus Torvalds 
	/* In cookie mode the options just parsed are cleared again. */
7841da177e4SLinus Torvalds 	if (want_cookie) {
7851da177e4SLinus Torvalds 		tcp_clear_options(&tmp_opt);
7861da177e4SLinus Torvalds 		tmp_opt.saw_tstamp = 0;
7871da177e4SLinus Torvalds 	}
7881da177e4SLinus Torvalds 
7891da177e4SLinus Torvalds 	if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
7901da177e4SLinus Torvalds 		/* Some OSes (unknown ones, but I see them on web server, which
7911da177e4SLinus Torvalds 		 * contains information interesting only for windows'
7921da177e4SLinus Torvalds 		 * users) do not send their stamp in SYN. It is easy case.
7931da177e4SLinus Torvalds 		 * We simply do not advertise TS support.
7941da177e4SLinus Torvalds 		 */
7951da177e4SLinus Torvalds 		tmp_opt.saw_tstamp = 0;
7961da177e4SLinus Torvalds 		tmp_opt.tstamp_ok  = 0;
7971da177e4SLinus Torvalds 	}
7981da177e4SLinus Torvalds 	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
7991da177e4SLinus Torvalds 
8001da177e4SLinus Torvalds 	tcp_openreq_init(req, &tmp_opt, skb);
8011da177e4SLinus Torvalds 
	/* Local address is the destination of the SYN, remote its source. */
8022e6599cbSArnaldo Carvalho de Melo 	ireq = inet_rsk(req);
8032e6599cbSArnaldo Carvalho de Melo 	ireq->loc_addr = daddr;
8042e6599cbSArnaldo Carvalho de Melo 	ireq->rmt_addr = saddr;
8052e6599cbSArnaldo Carvalho de Melo 	ireq->opt = tcp_v4_save_options(sk, skb);
8061da177e4SLinus Torvalds 	if (!want_cookie)
8071da177e4SLinus Torvalds 		TCP_ECN_create_request(req, skb->h.th);
8081da177e4SLinus Torvalds 
8091da177e4SLinus Torvalds 	if (want_cookie) {
8101da177e4SLinus Torvalds #ifdef CONFIG_SYN_COOKIES
8111da177e4SLinus Torvalds 		syn_flood_warning(skb);
8121da177e4SLinus Torvalds #endif
8131da177e4SLinus Torvalds 		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
8141da177e4SLinus Torvalds 	} else if (!isn) {
8151da177e4SLinus Torvalds 		struct inet_peer *peer = NULL;
8161da177e4SLinus Torvalds 
8171da177e4SLinus Torvalds 		/* VJ's idea. We save last timestamp seen
8181da177e4SLinus Torvalds 		 * from the destination in peer table, when entering
8191da177e4SLinus Torvalds 		 * state TIME-WAIT, and check against it before
8201da177e4SLinus Torvalds 		 * accepting new connection request.
8211da177e4SLinus Torvalds 		 *
8221da177e4SLinus Torvalds 		 * If "isn" is not zero, this request hit alive
8231da177e4SLinus Torvalds 		 * timewait bucket, so that all the necessary checks
8241da177e4SLinus Torvalds 		 * are made in the function processing timewait state.
8251da177e4SLinus Torvalds 		 */
8261da177e4SLinus Torvalds 		if (tmp_opt.saw_tstamp &&
827295ff7edSArnaldo Carvalho de Melo 		    tcp_death_row.sysctl_tw_recycle &&
828463c84b9SArnaldo Carvalho de Melo 		    (dst = inet_csk_route_req(sk, req)) != NULL &&
8291da177e4SLinus Torvalds 		    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
8301da177e4SLinus Torvalds 		    peer->v4daddr == saddr) {
8311da177e4SLinus Torvalds 			if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
8321da177e4SLinus Torvalds 			    (s32)(peer->tcp_ts - req->ts_recent) >
8331da177e4SLinus Torvalds 							TCP_PAWS_WINDOW) {
8341da177e4SLinus Torvalds 				NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
8351da177e4SLinus Torvalds 				dst_release(dst);
8361da177e4SLinus Torvalds 				goto drop_and_free;
8371da177e4SLinus Torvalds 			}
8381da177e4SLinus Torvalds 		}
8391da177e4SLinus Torvalds 		/* Kill the following clause, if you dislike this way. */
8401da177e4SLinus Torvalds 		else if (!sysctl_tcp_syncookies &&
841463c84b9SArnaldo Carvalho de Melo 			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
8421da177e4SLinus Torvalds 			  (sysctl_max_syn_backlog >> 2)) &&
8431da177e4SLinus Torvalds 			 (!peer || !peer->tcp_ts_stamp) &&
8441da177e4SLinus Torvalds 			 (!dst || !dst_metric(dst, RTAX_RTT))) {
8451da177e4SLinus Torvalds 			/* Without syncookies last quarter of
8461da177e4SLinus Torvalds 			 * backlog is filled with destinations,
8471da177e4SLinus Torvalds 			 * proven to be alive.
8481da177e4SLinus Torvalds 			 * It means that we continue to communicate
8491da177e4SLinus Torvalds 			 * to destinations, already remembered
8501da177e4SLinus Torvalds 			 * to the moment of synflood.
8511da177e4SLinus Torvalds 			 */
85264ce2073SPatrick McHardy 			LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
85364ce2073SPatrick McHardy 				       "request from %u.%u.%u.%u/%u\n",
8541da177e4SLinus Torvalds 				       NIPQUAD(saddr),
85564ce2073SPatrick McHardy 				       ntohs(skb->h.th->source));
8561da177e4SLinus Torvalds 			dst_release(dst);
8571da177e4SLinus Torvalds 			goto drop_and_free;
8581da177e4SLinus Torvalds 		}
8591da177e4SLinus Torvalds 
8601da177e4SLinus Torvalds 		isn = tcp_v4_init_sequence(sk, skb);
8611da177e4SLinus Torvalds 	}
8622e6599cbSArnaldo Carvalho de Melo 	tcp_rsk(req)->snt_isn = isn;
8631da177e4SLinus Torvalds 
	/* tcp_v4_send_synack() releases dst itself, also when it fails. */
8641da177e4SLinus Torvalds 	if (tcp_v4_send_synack(sk, req, dst))
8651da177e4SLinus Torvalds 		goto drop_and_free;
8661da177e4SLinus Torvalds 
	/* With a cookie no request is kept; otherwise queue it on sk. */
8671da177e4SLinus Torvalds 	if (want_cookie) {
86860236fddSArnaldo Carvalho de Melo 	   	reqsk_free(req);
8691da177e4SLinus Torvalds 	} else {
8703f421baaSArnaldo Carvalho de Melo 		inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
8711da177e4SLinus Torvalds 	}
8721da177e4SLinus Torvalds 	return 0;
8731da177e4SLinus Torvalds 
8741da177e4SLinus Torvalds drop_and_free:
87560236fddSArnaldo Carvalho de Melo 	reqsk_free(req);
8761da177e4SLinus Torvalds drop:
8771da177e4SLinus Torvalds 	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
8781da177e4SLinus Torvalds 	return 0;
8791da177e4SLinus Torvalds }
8801da177e4SLinus Torvalds 
8811da177e4SLinus Torvalds 
8821da177e4SLinus Torvalds /*
8831da177e4SLinus Torvalds  * The three way handshake has completed - we got a valid synack -
8841da177e4SLinus Torvalds  * now create the new socket.
8851da177e4SLinus Torvalds  */
8861da177e4SLinus Torvalds struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
88760236fddSArnaldo Carvalho de Melo 				  struct request_sock *req,
8881da177e4SLinus Torvalds 				  struct dst_entry *dst)
8891da177e4SLinus Torvalds {
8902e6599cbSArnaldo Carvalho de Melo 	struct inet_request_sock *ireq;
8911da177e4SLinus Torvalds 	struct inet_sock *newinet;
8921da177e4SLinus Torvalds 	struct tcp_sock *newtp;
8931da177e4SLinus Torvalds 	struct sock *newsk;
8941da177e4SLinus Torvalds 
8951da177e4SLinus Torvalds 	if (sk_acceptq_is_full(sk))
8961da177e4SLinus Torvalds 		goto exit_overflow;
8971da177e4SLinus Torvalds 
898463c84b9SArnaldo Carvalho de Melo 	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
8991da177e4SLinus Torvalds 		goto exit;
9001da177e4SLinus Torvalds 
9011da177e4SLinus Torvalds 	newsk = tcp_create_openreq_child(sk, req, skb);
9021da177e4SLinus Torvalds 	if (!newsk)
9031da177e4SLinus Torvalds 		goto exit;
9041da177e4SLinus Torvalds 
905bcd76111SHerbert Xu 	newsk->sk_gso_type = SKB_GSO_TCPV4;
9066cbb0df7SArnaldo Carvalho de Melo 	sk_setup_caps(newsk, dst);
9071da177e4SLinus Torvalds 
9081da177e4SLinus Torvalds 	newtp		      = tcp_sk(newsk);
9091da177e4SLinus Torvalds 	newinet		      = inet_sk(newsk);
9102e6599cbSArnaldo Carvalho de Melo 	ireq		      = inet_rsk(req);
9112e6599cbSArnaldo Carvalho de Melo 	newinet->daddr	      = ireq->rmt_addr;
9122e6599cbSArnaldo Carvalho de Melo 	newinet->rcv_saddr    = ireq->loc_addr;
9132e6599cbSArnaldo Carvalho de Melo 	newinet->saddr	      = ireq->loc_addr;
9142e6599cbSArnaldo Carvalho de Melo 	newinet->opt	      = ireq->opt;
9152e6599cbSArnaldo Carvalho de Melo 	ireq->opt	      = NULL;
916463c84b9SArnaldo Carvalho de Melo 	newinet->mc_index     = inet_iif(skb);
9171da177e4SLinus Torvalds 	newinet->mc_ttl	      = skb->nh.iph->ttl;
918d83d8461SArnaldo Carvalho de Melo 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
9191da177e4SLinus Torvalds 	if (newinet->opt)
920d83d8461SArnaldo Carvalho de Melo 		inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
9211da177e4SLinus Torvalds 	newinet->id = newtp->write_seq ^ jiffies;
9221da177e4SLinus Torvalds 
9235d424d5aSJohn Heffner 	tcp_mtup_init(newsk);
9241da177e4SLinus Torvalds 	tcp_sync_mss(newsk, dst_mtu(dst));
9251da177e4SLinus Torvalds 	newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
9261da177e4SLinus Torvalds 	tcp_initialize_rcv_mss(newsk);
9271da177e4SLinus Torvalds 
928f3f05f70SArnaldo Carvalho de Melo 	__inet_hash(&tcp_hashinfo, newsk, 0);
9292d8c4ce5SArnaldo Carvalho de Melo 	__inet_inherit_port(&tcp_hashinfo, sk, newsk);
9301da177e4SLinus Torvalds 
9311da177e4SLinus Torvalds 	return newsk;
9321da177e4SLinus Torvalds 
9331da177e4SLinus Torvalds exit_overflow:
9341da177e4SLinus Torvalds 	NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
9351da177e4SLinus Torvalds exit:
9361da177e4SLinus Torvalds 	NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
9371da177e4SLinus Torvalds 	dst_release(dst);
9381da177e4SLinus Torvalds 	return NULL;
9391da177e4SLinus Torvalds }
9401da177e4SLinus Torvalds 
9411da177e4SLinus Torvalds static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
9421da177e4SLinus Torvalds {
9431da177e4SLinus Torvalds 	struct tcphdr *th = skb->h.th;
9441da177e4SLinus Torvalds 	struct iphdr *iph = skb->nh.iph;
9451da177e4SLinus Torvalds 	struct sock *nsk;
94660236fddSArnaldo Carvalho de Melo 	struct request_sock **prev;
9471da177e4SLinus Torvalds 	/* Find possible connection requests. */
948463c84b9SArnaldo Carvalho de Melo 	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
9491da177e4SLinus Torvalds 						       iph->saddr, iph->daddr);
9501da177e4SLinus Torvalds 	if (req)
9511da177e4SLinus Torvalds 		return tcp_check_req(sk, skb, req, prev);
9521da177e4SLinus Torvalds 
953e48c414eSArnaldo Carvalho de Melo 	nsk = __inet_lookup_established(&tcp_hashinfo, skb->nh.iph->saddr,
954e48c414eSArnaldo Carvalho de Melo 					th->source, skb->nh.iph->daddr,
955463c84b9SArnaldo Carvalho de Melo 					ntohs(th->dest), inet_iif(skb));
9561da177e4SLinus Torvalds 
9571da177e4SLinus Torvalds 	if (nsk) {
9581da177e4SLinus Torvalds 		if (nsk->sk_state != TCP_TIME_WAIT) {
9591da177e4SLinus Torvalds 			bh_lock_sock(nsk);
9601da177e4SLinus Torvalds 			return nsk;
9611da177e4SLinus Torvalds 		}
9628feaf0c0SArnaldo Carvalho de Melo 		inet_twsk_put((struct inet_timewait_sock *)nsk);
9631da177e4SLinus Torvalds 		return NULL;
9641da177e4SLinus Torvalds 	}
9651da177e4SLinus Torvalds 
9661da177e4SLinus Torvalds #ifdef CONFIG_SYN_COOKIES
9671da177e4SLinus Torvalds 	if (!th->rst && !th->syn && th->ack)
9681da177e4SLinus Torvalds 		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
9691da177e4SLinus Torvalds #endif
9701da177e4SLinus Torvalds 	return sk;
9711da177e4SLinus Torvalds }
9721da177e4SLinus Torvalds 
9731da177e4SLinus Torvalds static int tcp_v4_checksum_init(struct sk_buff *skb)
9741da177e4SLinus Torvalds {
9751da177e4SLinus Torvalds 	if (skb->ip_summed == CHECKSUM_HW) {
9761da177e4SLinus Torvalds 		if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
977fb286bb2SHerbert Xu 				  skb->nh.iph->daddr, skb->csum)) {
9781da177e4SLinus Torvalds 			skb->ip_summed = CHECKSUM_UNNECESSARY;
979fb286bb2SHerbert Xu 			return 0;
980fb286bb2SHerbert Xu 		}
981fb286bb2SHerbert Xu 	}
982fb286bb2SHerbert Xu 
983fb286bb2SHerbert Xu 	skb->csum = csum_tcpudp_nofold(skb->nh.iph->saddr, skb->nh.iph->daddr,
984fb286bb2SHerbert Xu 				       skb->len, IPPROTO_TCP, 0);
985fb286bb2SHerbert Xu 
986fb286bb2SHerbert Xu 	if (skb->len <= 76) {
987fb286bb2SHerbert Xu 		return __skb_checksum_complete(skb);
9881da177e4SLinus Torvalds 	}
9891da177e4SLinus Torvalds 	return 0;
9901da177e4SLinus Torvalds }
9911da177e4SLinus Torvalds 
9921da177e4SLinus Torvalds 
9931da177e4SLinus Torvalds /* The socket must have it's spinlock held when we get
9941da177e4SLinus Torvalds  * here.
9951da177e4SLinus Torvalds  *
9961da177e4SLinus Torvalds  * We have a potential double-lock case here, so even when
9971da177e4SLinus Torvalds  * doing backlog processing we use the BH locking scheme.
9981da177e4SLinus Torvalds  * This is because we cannot sleep with the original spinlock
9991da177e4SLinus Torvalds  * held.
10001da177e4SLinus Torvalds  */
10011da177e4SLinus Torvalds int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
10021da177e4SLinus Torvalds {
10031da177e4SLinus Torvalds 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
10041da177e4SLinus Torvalds 		TCP_CHECK_TIMER(sk);
10051da177e4SLinus Torvalds 		if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
10061da177e4SLinus Torvalds 			goto reset;
10071da177e4SLinus Torvalds 		TCP_CHECK_TIMER(sk);
10081da177e4SLinus Torvalds 		return 0;
10091da177e4SLinus Torvalds 	}
10101da177e4SLinus Torvalds 
10111da177e4SLinus Torvalds 	if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
10121da177e4SLinus Torvalds 		goto csum_err;
10131da177e4SLinus Torvalds 
10141da177e4SLinus Torvalds 	if (sk->sk_state == TCP_LISTEN) {
10151da177e4SLinus Torvalds 		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
10161da177e4SLinus Torvalds 		if (!nsk)
10171da177e4SLinus Torvalds 			goto discard;
10181da177e4SLinus Torvalds 
10191da177e4SLinus Torvalds 		if (nsk != sk) {
10201da177e4SLinus Torvalds 			if (tcp_child_process(sk, nsk, skb))
10211da177e4SLinus Torvalds 				goto reset;
10221da177e4SLinus Torvalds 			return 0;
10231da177e4SLinus Torvalds 		}
10241da177e4SLinus Torvalds 	}
10251da177e4SLinus Torvalds 
10261da177e4SLinus Torvalds 	TCP_CHECK_TIMER(sk);
10271da177e4SLinus Torvalds 	if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
10281da177e4SLinus Torvalds 		goto reset;
10291da177e4SLinus Torvalds 	TCP_CHECK_TIMER(sk);
10301da177e4SLinus Torvalds 	return 0;
10311da177e4SLinus Torvalds 
10321da177e4SLinus Torvalds reset:
10331da177e4SLinus Torvalds 	tcp_v4_send_reset(skb);
10341da177e4SLinus Torvalds discard:
10351da177e4SLinus Torvalds 	kfree_skb(skb);
10361da177e4SLinus Torvalds 	/* Be careful here. If this function gets more complicated and
10371da177e4SLinus Torvalds 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
10381da177e4SLinus Torvalds 	 * might be destroyed here. This current version compiles correctly,
10391da177e4SLinus Torvalds 	 * but you have been warned.
10401da177e4SLinus Torvalds 	 */
10411da177e4SLinus Torvalds 	return 0;
10421da177e4SLinus Torvalds 
10431da177e4SLinus Torvalds csum_err:
10441da177e4SLinus Torvalds 	TCP_INC_STATS_BH(TCP_MIB_INERRS);
10451da177e4SLinus Torvalds 	goto discard;
10461da177e4SLinus Torvalds }
10471da177e4SLinus Torvalds 
10481da177e4SLinus Torvalds /*
10491da177e4SLinus Torvalds  *	From tcp_input.c
10501da177e4SLinus Torvalds  */
10511da177e4SLinus Torvalds 
/*
 * Receive entry point for an IPv4 TCP segment.
 *
 * The header length and checksum state are validated, TCP_SKB_CB(skb) is
 * filled in and the owning socket is looked up with __inet_lookup().
 * With the socket BH-locked the segment is then either handled by
 * tcp_v4_do_rcv() (when tcp_prequeue() returns zero, or directly on the
 * CONFIG_NET_DMA path), or added to the socket backlog when the socket is
 * owned by user context.
 *
 * A segment without a matching socket is answered with a reset unless it
 * is too short or fails the checksum, in which case only TCP_MIB_INERRS is
 * counted.  TIME-WAIT sockets are handled at the do_time_wait label.
 *
 * Returns the value of tcp_v4_do_rcv() when it was called, 0 otherwise.
 */
10521da177e4SLinus Torvalds int tcp_v4_rcv(struct sk_buff *skb)
10531da177e4SLinus Torvalds {
10541da177e4SLinus Torvalds 	struct tcphdr *th;
10551da177e4SLinus Torvalds 	struct sock *sk;
10561da177e4SLinus Torvalds 	int ret;
10571da177e4SLinus Torvalds 
10581da177e4SLinus Torvalds 	if (skb->pkt_type != PACKET_HOST)
10591da177e4SLinus Torvalds 		goto discard_it;
10601da177e4SLinus Torvalds 
10611da177e4SLinus Torvalds 	/* Count it even if it's bad */
10621da177e4SLinus Torvalds 	TCP_INC_STATS_BH(TCP_MIB_INSEGS);
10631da177e4SLinus Torvalds 
10641da177e4SLinus Torvalds 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
10651da177e4SLinus Torvalds 		goto discard_it;
10661da177e4SLinus Torvalds 
10671da177e4SLinus Torvalds 	th = skb->h.th;
10681da177e4SLinus Torvalds 
10691da177e4SLinus Torvalds 	if (th->doff < sizeof(struct tcphdr) / 4)
10701da177e4SLinus Torvalds 		goto bad_packet;
10711da177e4SLinus Torvalds 	if (!pskb_may_pull(skb, th->doff * 4))
10721da177e4SLinus Torvalds 		goto discard_it;
10731da177e4SLinus Torvalds 
10741da177e4SLinus Torvalds 	/* An explanation is required here, I think.
10751da177e4SLinus Torvalds 	 * Packet length and doff are validated by header prediction,
1076caa20d9aSStephen Hemminger 	 * provided case of th->doff==0 is eliminated.
10771da177e4SLinus Torvalds 	 * So, we defer the checks. */
10781da177e4SLinus Torvalds 	if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1079fb286bb2SHerbert Xu 	     tcp_v4_checksum_init(skb)))
10801da177e4SLinus Torvalds 		goto bad_packet;
10811da177e4SLinus Torvalds 
	/* th is loaded again after the pskb_may_pull() calls above. */
10821da177e4SLinus Torvalds 	th = skb->h.th;
10831da177e4SLinus Torvalds 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
10841da177e4SLinus Torvalds 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
10851da177e4SLinus Torvalds 				    skb->len - th->doff * 4);
10861da177e4SLinus Torvalds 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
10871da177e4SLinus Torvalds 	TCP_SKB_CB(skb)->when	 = 0;
10881da177e4SLinus Torvalds 	TCP_SKB_CB(skb)->flags	 = skb->nh.iph->tos;
10891da177e4SLinus Torvalds 	TCP_SKB_CB(skb)->sacked	 = 0;
10901da177e4SLinus Torvalds 
1091e48c414eSArnaldo Carvalho de Melo 	sk = __inet_lookup(&tcp_hashinfo, skb->nh.iph->saddr, th->source,
10921da177e4SLinus Torvalds 			   skb->nh.iph->daddr, ntohs(th->dest),
1093463c84b9SArnaldo Carvalho de Melo 			   inet_iif(skb));
10941da177e4SLinus Torvalds 
10951da177e4SLinus Torvalds 	if (!sk)
10961da177e4SLinus Torvalds 		goto no_tcp_socket;
10971da177e4SLinus Torvalds 
	/* Also entered from do_time_wait with sk replaced by a listener. */
10981da177e4SLinus Torvalds process:
10991da177e4SLinus Torvalds 	if (sk->sk_state == TCP_TIME_WAIT)
11001da177e4SLinus Torvalds 		goto do_time_wait;
11011da177e4SLinus Torvalds 
11021da177e4SLinus Torvalds 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
11031da177e4SLinus Torvalds 		goto discard_and_relse;
1104b59c2701SPatrick McHardy 	nf_reset(skb);
11051da177e4SLinus Torvalds 
11061da177e4SLinus Torvalds 	if (sk_filter(sk, skb, 0))
11071da177e4SLinus Torvalds 		goto discard_and_relse;
11081da177e4SLinus Torvalds 
11091da177e4SLinus Torvalds 	skb->dev = NULL;
11101da177e4SLinus Torvalds 
1111c6366184SIngo Molnar 	bh_lock_sock_nested(sk);
11121da177e4SLinus Torvalds 	ret = 0;
11131da177e4SLinus Torvalds 	if (!sock_owned_by_user(sk)) {
11141a2449a8SChris Leech #ifdef CONFIG_NET_DMA
11151a2449a8SChris Leech 		struct tcp_sock *tp = tcp_sk(sk);
11161a2449a8SChris Leech 		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
11171a2449a8SChris Leech 			tp->ucopy.dma_chan = get_softnet_dma();
11181a2449a8SChris Leech 		if (tp->ucopy.dma_chan)
11191a2449a8SChris Leech 			ret = tcp_v4_do_rcv(sk, skb);
11201a2449a8SChris Leech 		else
11211a2449a8SChris Leech #endif
11221a2449a8SChris Leech 		{
			/* The call below is the body of this if, despite
			 * its indentation: it runs only when
			 * tcp_prequeue() returns zero.
			 */
11231da177e4SLinus Torvalds 			if (!tcp_prequeue(sk, skb))
11241da177e4SLinus Torvalds 			ret = tcp_v4_do_rcv(sk, skb);
11251a2449a8SChris Leech 		}
11261da177e4SLinus Torvalds 	} else
11271da177e4SLinus Torvalds 		sk_add_backlog(sk, skb);
11281da177e4SLinus Torvalds 	bh_unlock_sock(sk);
11291da177e4SLinus Torvalds 
11301da177e4SLinus Torvalds 	sock_put(sk);
11311da177e4SLinus Torvalds 
11321da177e4SLinus Torvalds 	return ret;
11331da177e4SLinus Torvalds 
11341da177e4SLinus Torvalds no_tcp_socket:
11351da177e4SLinus Torvalds 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
11361da177e4SLinus Torvalds 		goto discard_it;
11371da177e4SLinus Torvalds 
	/* bad_packet sits inside the if body: the header and checksum
	 * checks at the top of the function jump straight to it.
	 */
11381da177e4SLinus Torvalds 	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
11391da177e4SLinus Torvalds bad_packet:
11401da177e4SLinus Torvalds 		TCP_INC_STATS_BH(TCP_MIB_INERRS);
11411da177e4SLinus Torvalds 	} else {
11421da177e4SLinus Torvalds 		tcp_v4_send_reset(skb);
11431da177e4SLinus Torvalds 	}
11441da177e4SLinus Torvalds 
11451da177e4SLinus Torvalds discard_it:
11461da177e4SLinus Torvalds 	/* Discard frame. */
11471da177e4SLinus Torvalds 	kfree_skb(skb);
11481da177e4SLinus Torvalds   	return 0;
11491da177e4SLinus Torvalds 
11501da177e4SLinus Torvalds discard_and_relse:
11511da177e4SLinus Torvalds 	sock_put(sk);
11521da177e4SLinus Torvalds 	goto discard_it;
11531da177e4SLinus Torvalds 
11541da177e4SLinus Torvalds do_time_wait:
11551da177e4SLinus Torvalds 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
11568feaf0c0SArnaldo Carvalho de Melo 		inet_twsk_put((struct inet_timewait_sock *) sk);
11571da177e4SLinus Torvalds 		goto discard_it;
11581da177e4SLinus Torvalds 	}
11591da177e4SLinus Torvalds 
11601da177e4SLinus Torvalds 	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
11611da177e4SLinus Torvalds 		TCP_INC_STATS_BH(TCP_MIB_INERRS);
11628feaf0c0SArnaldo Carvalho de Melo 		inet_twsk_put((struct inet_timewait_sock *) sk);
11631da177e4SLinus Torvalds 		goto discard_it;
11641da177e4SLinus Torvalds 	}
11658feaf0c0SArnaldo Carvalho de Melo 	switch (tcp_timewait_state_process((struct inet_timewait_sock *)sk,
11668feaf0c0SArnaldo Carvalho de Melo 					   skb, th)) {
11671da177e4SLinus Torvalds 	case TCP_TW_SYN: {
		/* A listener for this port takes over from the timewait
		 * socket, which is descheduled and released first.
		 */
116833b62231SArnaldo Carvalho de Melo 		struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo,
116933b62231SArnaldo Carvalho de Melo 							skb->nh.iph->daddr,
11701da177e4SLinus Torvalds 							ntohs(th->dest),
1171463c84b9SArnaldo Carvalho de Melo 							inet_iif(skb));
11721da177e4SLinus Torvalds 		if (sk2) {
1173295ff7edSArnaldo Carvalho de Melo 			inet_twsk_deschedule((struct inet_timewait_sock *)sk,
1174295ff7edSArnaldo Carvalho de Melo 					     &tcp_death_row);
11758feaf0c0SArnaldo Carvalho de Melo 			inet_twsk_put((struct inet_timewait_sock *)sk);
11761da177e4SLinus Torvalds 			sk = sk2;
11771da177e4SLinus Torvalds 			goto process;
11781da177e4SLinus Torvalds 		}
11791da177e4SLinus Torvalds 		/* Fall through to ACK */
11801da177e4SLinus Torvalds 	}
11811da177e4SLinus Torvalds 	case TCP_TW_ACK:
		/* tcp_v4_timewait_ack() also drops the timewait reference. */
11821da177e4SLinus Torvalds 		tcp_v4_timewait_ack(sk, skb);
11831da177e4SLinus Torvalds 		break;
11841da177e4SLinus Torvalds 	case TCP_TW_RST:
		/* NOTE(review): no inet_twsk_put() on this path or on
		 * TCP_TW_SUCCESS; presumably tcp_timewait_state_process()
		 * already dropped the reference - confirm.
		 */
11851da177e4SLinus Torvalds 		goto no_tcp_socket;
11861da177e4SLinus Torvalds 	case TCP_TW_SUCCESS:;
11871da177e4SLinus Torvalds 	}
11881da177e4SLinus Torvalds 	goto discard_it;
11891da177e4SLinus Torvalds }
11901da177e4SLinus Torvalds 
11911da177e4SLinus Torvalds /* VJ's idea. Save last timestamp seen from this destination
11921da177e4SLinus Torvalds  * and hold it at least for normal timewait interval to use for duplicate
11931da177e4SLinus Torvalds  * segment detection in subsequent connections, before they enter synchronized
11941da177e4SLinus Torvalds  * state.
11951da177e4SLinus Torvalds  */
11961da177e4SLinus Torvalds 
11971da177e4SLinus Torvalds int tcp_v4_remember_stamp(struct sock *sk)
11981da177e4SLinus Torvalds {
11991da177e4SLinus Torvalds 	struct inet_sock *inet = inet_sk(sk);
12001da177e4SLinus Torvalds 	struct tcp_sock *tp = tcp_sk(sk);
12011da177e4SLinus Torvalds 	struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
12021da177e4SLinus Torvalds 	struct inet_peer *peer = NULL;
12031da177e4SLinus Torvalds 	int release_it = 0;
12041da177e4SLinus Torvalds 
12051da177e4SLinus Torvalds 	if (!rt || rt->rt_dst != inet->daddr) {
12061da177e4SLinus Torvalds 		peer = inet_getpeer(inet->daddr, 1);
12071da177e4SLinus Torvalds 		release_it = 1;
12081da177e4SLinus Torvalds 	} else {
12091da177e4SLinus Torvalds 		if (!rt->peer)
12101da177e4SLinus Torvalds 			rt_bind_peer(rt, 1);
12111da177e4SLinus Torvalds 		peer = rt->peer;
12121da177e4SLinus Torvalds 	}
12131da177e4SLinus Torvalds 
12141da177e4SLinus Torvalds 	if (peer) {
12151da177e4SLinus Torvalds 		if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
12161da177e4SLinus Torvalds 		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
12171da177e4SLinus Torvalds 		     peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
12181da177e4SLinus Torvalds 			peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
12191da177e4SLinus Torvalds 			peer->tcp_ts = tp->rx_opt.ts_recent;
12201da177e4SLinus Torvalds 		}
12211da177e4SLinus Torvalds 		if (release_it)
12221da177e4SLinus Torvalds 			inet_putpeer(peer);
12231da177e4SLinus Torvalds 		return 1;
12241da177e4SLinus Torvalds 	}
12251da177e4SLinus Torvalds 
12261da177e4SLinus Torvalds 	return 0;
12271da177e4SLinus Torvalds }
12281da177e4SLinus Torvalds 
12298feaf0c0SArnaldo Carvalho de Melo int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
12301da177e4SLinus Torvalds {
12318feaf0c0SArnaldo Carvalho de Melo 	struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
12321da177e4SLinus Torvalds 
12331da177e4SLinus Torvalds 	if (peer) {
12348feaf0c0SArnaldo Carvalho de Melo 		const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
12358feaf0c0SArnaldo Carvalho de Melo 
12368feaf0c0SArnaldo Carvalho de Melo 		if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
12371da177e4SLinus Torvalds 		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
12388feaf0c0SArnaldo Carvalho de Melo 		     peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
12398feaf0c0SArnaldo Carvalho de Melo 			peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
12408feaf0c0SArnaldo Carvalho de Melo 			peer->tcp_ts	   = tcptw->tw_ts_recent;
12411da177e4SLinus Torvalds 		}
12421da177e4SLinus Torvalds 		inet_putpeer(peer);
12431da177e4SLinus Torvalds 		return 1;
12441da177e4SLinus Torvalds 	}
12451da177e4SLinus Torvalds 
12461da177e4SLinus Torvalds 	return 0;
12471da177e4SLinus Torvalds }
12481da177e4SLinus Torvalds 
12498292a17aSArnaldo Carvalho de Melo struct inet_connection_sock_af_ops ipv4_specific = {
12501da177e4SLinus Torvalds 	.queue_xmit	   = ip_queue_xmit,
12511da177e4SLinus Torvalds 	.send_check	   = tcp_v4_send_check,
125232519f11SArnaldo Carvalho de Melo 	.rebuild_header	   = inet_sk_rebuild_header,
12531da177e4SLinus Torvalds 	.conn_request	   = tcp_v4_conn_request,
12541da177e4SLinus Torvalds 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
12551da177e4SLinus Torvalds 	.remember_stamp	   = tcp_v4_remember_stamp,
12561da177e4SLinus Torvalds 	.net_header_len	   = sizeof(struct iphdr),
12571da177e4SLinus Torvalds 	.setsockopt	   = ip_setsockopt,
12581da177e4SLinus Torvalds 	.getsockopt	   = ip_getsockopt,
1259543d9cfeSArnaldo Carvalho de Melo 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
1260543d9cfeSArnaldo Carvalho de Melo 	.sockaddr_len	   = sizeof(struct sockaddr_in),
12613fdadf7dSDmitry Mishin #ifdef CONFIG_COMPAT
12623fdadf7dSDmitry Mishin 	.compat_setsockopt = compat_ip_setsockopt,
12633fdadf7dSDmitry Mishin 	.compat_getsockopt = compat_ip_getsockopt,
12643fdadf7dSDmitry Mishin #endif
12651da177e4SLinus Torvalds };
12661da177e4SLinus Torvalds 
12671da177e4SLinus Torvalds /* NOTE: A lot of things set to zero explicitly by call to
12681da177e4SLinus Torvalds  *       sk_alloc() so need not be done here.
12691da177e4SLinus Torvalds  */
12701da177e4SLinus Torvalds static int tcp_v4_init_sock(struct sock *sk)
12711da177e4SLinus Torvalds {
12726687e988SArnaldo Carvalho de Melo 	struct inet_connection_sock *icsk = inet_csk(sk);
12731da177e4SLinus Torvalds 	struct tcp_sock *tp = tcp_sk(sk);
12741da177e4SLinus Torvalds 
12751da177e4SLinus Torvalds 	skb_queue_head_init(&tp->out_of_order_queue);
12761da177e4SLinus Torvalds 	tcp_init_xmit_timers(sk);
12771da177e4SLinus Torvalds 	tcp_prequeue_init(tp);
12781da177e4SLinus Torvalds 
12796687e988SArnaldo Carvalho de Melo 	icsk->icsk_rto = TCP_TIMEOUT_INIT;
12801da177e4SLinus Torvalds 	tp->mdev = TCP_TIMEOUT_INIT;
12811da177e4SLinus Torvalds 
12821da177e4SLinus Torvalds 	/* So many TCP implementations out there (incorrectly) count the
12831da177e4SLinus Torvalds 	 * initial SYN frame in their delayed-ACK and congestion control
12841da177e4SLinus Torvalds 	 * algorithms that we must have the following bandaid to talk
12851da177e4SLinus Torvalds 	 * efficiently to them.  -DaveM
12861da177e4SLinus Torvalds 	 */
12871da177e4SLinus Torvalds 	tp->snd_cwnd = 2;
12881da177e4SLinus Torvalds 
12891da177e4SLinus Torvalds 	/* See draft-stevens-tcpca-spec-01 for discussion of the
12901da177e4SLinus Torvalds 	 * initialization of these values.
12911da177e4SLinus Torvalds 	 */
12921da177e4SLinus Torvalds 	tp->snd_ssthresh = 0x7fffffff;	/* Infinity */
12931da177e4SLinus Torvalds 	tp->snd_cwnd_clamp = ~0;
1294c1b4a7e6SDavid S. Miller 	tp->mss_cache = 536;
12951da177e4SLinus Torvalds 
12961da177e4SLinus Torvalds 	tp->reordering = sysctl_tcp_reordering;
12976687e988SArnaldo Carvalho de Melo 	icsk->icsk_ca_ops = &tcp_init_congestion_ops;
12981da177e4SLinus Torvalds 
12991da177e4SLinus Torvalds 	sk->sk_state = TCP_CLOSE;
13001da177e4SLinus Torvalds 
13011da177e4SLinus Torvalds 	sk->sk_write_space = sk_stream_write_space;
13021da177e4SLinus Torvalds 	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
13031da177e4SLinus Torvalds 
13048292a17aSArnaldo Carvalho de Melo 	icsk->icsk_af_ops = &ipv4_specific;
1305d83d8461SArnaldo Carvalho de Melo 	icsk->icsk_sync_mss = tcp_sync_mss;
13061da177e4SLinus Torvalds 
13071da177e4SLinus Torvalds 	sk->sk_sndbuf = sysctl_tcp_wmem[1];
13081da177e4SLinus Torvalds 	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
13091da177e4SLinus Torvalds 
13101da177e4SLinus Torvalds 	atomic_inc(&tcp_sockets_allocated);
13111da177e4SLinus Torvalds 
13121da177e4SLinus Torvalds 	return 0;
13131da177e4SLinus Torvalds }
13141da177e4SLinus Torvalds 
13151da177e4SLinus Torvalds int tcp_v4_destroy_sock(struct sock *sk)
13161da177e4SLinus Torvalds {
13171da177e4SLinus Torvalds 	struct tcp_sock *tp = tcp_sk(sk);
13181da177e4SLinus Torvalds 
13191da177e4SLinus Torvalds 	tcp_clear_xmit_timers(sk);
13201da177e4SLinus Torvalds 
13216687e988SArnaldo Carvalho de Melo 	tcp_cleanup_congestion_control(sk);
1322317a76f9SStephen Hemminger 
13231da177e4SLinus Torvalds 	/* Cleanup up the write buffer. */
13241da177e4SLinus Torvalds   	sk_stream_writequeue_purge(sk);
13251da177e4SLinus Torvalds 
13261da177e4SLinus Torvalds 	/* Cleans up our, hopefully empty, out_of_order_queue. */
13271da177e4SLinus Torvalds   	__skb_queue_purge(&tp->out_of_order_queue);
13281da177e4SLinus Torvalds 
13291a2449a8SChris Leech #ifdef CONFIG_NET_DMA
13301a2449a8SChris Leech 	/* Cleans up our sk_async_wait_queue */
13311a2449a8SChris Leech   	__skb_queue_purge(&sk->sk_async_wait_queue);
13321a2449a8SChris Leech #endif
13331a2449a8SChris Leech 
13341da177e4SLinus Torvalds 	/* Clean prequeue, it must be empty really */
13351da177e4SLinus Torvalds 	__skb_queue_purge(&tp->ucopy.prequeue);
13361da177e4SLinus Torvalds 
13371da177e4SLinus Torvalds 	/* Clean up a referenced TCP bind bucket. */
1338463c84b9SArnaldo Carvalho de Melo 	if (inet_csk(sk)->icsk_bind_hash)
13392d8c4ce5SArnaldo Carvalho de Melo 		inet_put_port(&tcp_hashinfo, sk);
13401da177e4SLinus Torvalds 
13411da177e4SLinus Torvalds 	/*
13421da177e4SLinus Torvalds 	 * If sendmsg cached page exists, toss it.
13431da177e4SLinus Torvalds 	 */
13441da177e4SLinus Torvalds 	if (sk->sk_sndmsg_page) {
13451da177e4SLinus Torvalds 		__free_page(sk->sk_sndmsg_page);
13461da177e4SLinus Torvalds 		sk->sk_sndmsg_page = NULL;
13471da177e4SLinus Torvalds 	}
13481da177e4SLinus Torvalds 
13491da177e4SLinus Torvalds 	atomic_dec(&tcp_sockets_allocated);
13501da177e4SLinus Torvalds 
13511da177e4SLinus Torvalds 	return 0;
13521da177e4SLinus Torvalds }
13531da177e4SLinus Torvalds 
13541da177e4SLinus Torvalds EXPORT_SYMBOL(tcp_v4_destroy_sock);
13551da177e4SLinus Torvalds 
13561da177e4SLinus Torvalds #ifdef CONFIG_PROC_FS
13571da177e4SLinus Torvalds /* Proc filesystem TCP sock list dumping. */
13581da177e4SLinus Torvalds 
13598feaf0c0SArnaldo Carvalho de Melo static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
13601da177e4SLinus Torvalds {
13611da177e4SLinus Torvalds 	return hlist_empty(head) ? NULL :
13628feaf0c0SArnaldo Carvalho de Melo 		list_entry(head->first, struct inet_timewait_sock, tw_node);
13631da177e4SLinus Torvalds }
13641da177e4SLinus Torvalds 
13658feaf0c0SArnaldo Carvalho de Melo static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
13661da177e4SLinus Torvalds {
13671da177e4SLinus Torvalds 	return tw->tw_node.next ?
13681da177e4SLinus Torvalds 		hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
13691da177e4SLinus Torvalds }
13701da177e4SLinus Torvalds 
/*
 * Step the /proc iterator through the listening half of the TCP tables.
 *
 * @seq: seq_file whose ->private is the struct tcp_iter_state.
 * @cur: entry returned by the previous step, or NULL to start from
 *       bucket 0 of tcp_hashinfo.listening_hash.
 *
 * Two kinds of entries are produced:
 *   - listening sockets whose sk_family matches st->family
 *     (st->state == TCP_SEQ_STATE_LISTENING), and
 *   - pending request_socks from a listener's syn_table whose family
 *     matches st->family (st->state == TCP_SEQ_STATE_OPENREQ).
 *
 * Returns the next entry, or NULL once every bucket has been visited.
 *
 * Locking: while a request_sock is being returned (OPENREQ state) the
 * syn_wait_lock of st->syn_wait_sk stays read-locked; it is dropped here
 * when that listener's syn_table is exhausted, or by tcp_seq_stop().  The
 * listening-hash lock itself is not taken in this function; tcp_get_idx()
 * takes it with inet_listen_lock() before iterating.
 */
13711da177e4SLinus Torvalds static void *listening_get_next(struct seq_file *seq, void *cur)
13721da177e4SLinus Torvalds {
1373463c84b9SArnaldo Carvalho de Melo 	struct inet_connection_sock *icsk;
13741da177e4SLinus Torvalds 	struct hlist_node *node;
13751da177e4SLinus Torvalds 	struct sock *sk = cur;
13761da177e4SLinus Torvalds 	struct tcp_iter_state* st = seq->private;
13771da177e4SLinus Torvalds 
	/* First call: begin with the head of listening bucket 0. */
13781da177e4SLinus Torvalds 	if (!sk) {
13791da177e4SLinus Torvalds 		st->bucket = 0;
13806e04e021SArnaldo Carvalho de Melo 		sk = sk_head(&tcp_hashinfo.listening_hash[0]);
13811da177e4SLinus Torvalds 		goto get_sk;
13821da177e4SLinus Torvalds 	}
13831da177e4SLinus Torvalds 
13841da177e4SLinus Torvalds 	++st->num;
13851da177e4SLinus Torvalds 
13861da177e4SLinus Torvalds 	if (st->state == TCP_SEQ_STATE_OPENREQ) {
		/* cur is a request_sock; syn_wait_lock is already held. */
138760236fddSArnaldo Carvalho de Melo 		struct request_sock *req = cur;
13881da177e4SLinus Torvalds 
1389463c84b9SArnaldo Carvalho de Melo 	       	icsk = inet_csk(st->syn_wait_sk);
13901da177e4SLinus Torvalds 		req = req->dl_next;
		/* Walk the rest of this chain, then the remaining
		 * syn_table buckets of the same listener.
		 */
13911da177e4SLinus Torvalds 		while (1) {
13921da177e4SLinus Torvalds 			while (req) {
139360236fddSArnaldo Carvalho de Melo 				if (req->rsk_ops->family == st->family) {
13941da177e4SLinus Torvalds 					cur = req;
13951da177e4SLinus Torvalds 					goto out;
13961da177e4SLinus Torvalds 				}
13971da177e4SLinus Torvalds 				req = req->dl_next;
13981da177e4SLinus Torvalds 			}
13991da177e4SLinus Torvalds 			if (++st->sbucket >= TCP_SYNQ_HSIZE)
14001da177e4SLinus Torvalds 				break;
			/* Also entered from start_req below with sbucket == 0. */
14011da177e4SLinus Torvalds get_req:
1402463c84b9SArnaldo Carvalho de Melo 			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
14031da177e4SLinus Torvalds 		}
		/* syn_table exhausted: move on to the next listener. */
14041da177e4SLinus Torvalds 		sk	  = sk_next(st->syn_wait_sk);
14051da177e4SLinus Torvalds 		st->state = TCP_SEQ_STATE_LISTENING;
1406463c84b9SArnaldo Carvalho de Melo 		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
14071da177e4SLinus Torvalds 	} else {
		/* cur is a listening socket that was just returned: visit
		 * its pending requests (if any) before the next socket.
		 */
1408463c84b9SArnaldo Carvalho de Melo 	       	icsk = inet_csk(sk);
1409463c84b9SArnaldo Carvalho de Melo 		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1410463c84b9SArnaldo Carvalho de Melo 		if (reqsk_queue_len(&icsk->icsk_accept_queue))
14111da177e4SLinus Torvalds 			goto start_req;
1412463c84b9SArnaldo Carvalho de Melo 		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
14131da177e4SLinus Torvalds 		sk = sk_next(sk);
14141da177e4SLinus Torvalds 	}
14151da177e4SLinus Torvalds get_sk:
14161da177e4SLinus Torvalds 	sk_for_each_from(sk, node) {
14171da177e4SLinus Torvalds 		if (sk->sk_family == st->family) {
14181da177e4SLinus Torvalds 			cur = sk;
14191da177e4SLinus Torvalds 			goto out;
14201da177e4SLinus Torvalds 		}
		/* Family mismatch: the socket itself is skipped, but its
		 * request queue is still scanned (requests are filtered
		 * by their own family above).
		 */
1421463c84b9SArnaldo Carvalho de Melo 	       	icsk = inet_csk(sk);
1422463c84b9SArnaldo Carvalho de Melo 		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1423463c84b9SArnaldo Carvalho de Melo 		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
			/* Reached with syn_wait_lock read-locked. */
14241da177e4SLinus Torvalds start_req:
14251da177e4SLinus Torvalds 			st->uid		= sock_i_uid(sk);
14261da177e4SLinus Torvalds 			st->syn_wait_sk = sk;
14271da177e4SLinus Torvalds 			st->state	= TCP_SEQ_STATE_OPENREQ;
14281da177e4SLinus Torvalds 			st->sbucket	= 0;
14291da177e4SLinus Torvalds 			goto get_req;
14301da177e4SLinus Torvalds 		}
1431463c84b9SArnaldo Carvalho de Melo 		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
14321da177e4SLinus Torvalds 	}
	/* Chain finished: continue with the next listening bucket. */
14330f7ff927SArnaldo Carvalho de Melo 	if (++st->bucket < INET_LHTABLE_SIZE) {
14346e04e021SArnaldo Carvalho de Melo 		sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
14351da177e4SLinus Torvalds 		goto get_sk;
14361da177e4SLinus Torvalds 	}
14371da177e4SLinus Torvalds 	cur = NULL;
14381da177e4SLinus Torvalds out:
14391da177e4SLinus Torvalds 	return cur;
14401da177e4SLinus Torvalds }
14411da177e4SLinus Torvalds 
14421da177e4SLinus Torvalds static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
14431da177e4SLinus Torvalds {
14441da177e4SLinus Torvalds 	void *rc = listening_get_next(seq, NULL);
14451da177e4SLinus Torvalds 
14461da177e4SLinus Torvalds 	while (rc && *pos) {
14471da177e4SLinus Torvalds 		rc = listening_get_next(seq, rc);
14481da177e4SLinus Torvalds 		--*pos;
14491da177e4SLinus Torvalds 	}
14501da177e4SLinus Torvalds 	return rc;
14511da177e4SLinus Torvalds }
14521da177e4SLinus Torvalds 
14531da177e4SLinus Torvalds static void *established_get_first(struct seq_file *seq)
14541da177e4SLinus Torvalds {
14551da177e4SLinus Torvalds 	struct tcp_iter_state* st = seq->private;
14561da177e4SLinus Torvalds 	void *rc = NULL;
14571da177e4SLinus Torvalds 
14586e04e021SArnaldo Carvalho de Melo 	for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
14591da177e4SLinus Torvalds 		struct sock *sk;
14601da177e4SLinus Torvalds 		struct hlist_node *node;
14618feaf0c0SArnaldo Carvalho de Melo 		struct inet_timewait_sock *tw;
14621da177e4SLinus Torvalds 
14631da177e4SLinus Torvalds 		/* We can reschedule _before_ having picked the target: */
14641da177e4SLinus Torvalds 		cond_resched_softirq();
14651da177e4SLinus Torvalds 
14666e04e021SArnaldo Carvalho de Melo 		read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
14676e04e021SArnaldo Carvalho de Melo 		sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
14681da177e4SLinus Torvalds 			if (sk->sk_family != st->family) {
14691da177e4SLinus Torvalds 				continue;
14701da177e4SLinus Torvalds 			}
14711da177e4SLinus Torvalds 			rc = sk;
14721da177e4SLinus Torvalds 			goto out;
14731da177e4SLinus Torvalds 		}
14741da177e4SLinus Torvalds 		st->state = TCP_SEQ_STATE_TIME_WAIT;
14758feaf0c0SArnaldo Carvalho de Melo 		inet_twsk_for_each(tw, node,
14766e04e021SArnaldo Carvalho de Melo 				   &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) {
14771da177e4SLinus Torvalds 			if (tw->tw_family != st->family) {
14781da177e4SLinus Torvalds 				continue;
14791da177e4SLinus Torvalds 			}
14801da177e4SLinus Torvalds 			rc = tw;
14811da177e4SLinus Torvalds 			goto out;
14821da177e4SLinus Torvalds 		}
14836e04e021SArnaldo Carvalho de Melo 		read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
14841da177e4SLinus Torvalds 		st->state = TCP_SEQ_STATE_ESTABLISHED;
14851da177e4SLinus Torvalds 	}
14861da177e4SLinus Torvalds out:
14871da177e4SLinus Torvalds 	return rc;
14881da177e4SLinus Torvalds }
14891da177e4SLinus Torvalds 
/*
 * Step the /proc iterator through the established hash.
 *
 * @seq: seq_file whose ->private is the struct tcp_iter_state.
 * @cur: entry returned by the previous step; a struct sock when
 *       st->state is TCP_SEQ_STATE_ESTABLISHED, a struct
 *       inet_timewait_sock when it is TCP_SEQ_STATE_TIME_WAIT.
 *
 * Within one bucket the regular chain is visited first, then the
 * TIME_WAIT chain at ehash[st->bucket + ehash_size]; only entries whose
 * family equals st->family are returned.
 *
 * Returns the next entry, or NULL after the last bucket.
 *
 * Locking: entered with the read lock of ehash[st->bucket] held.  A
 * non-NULL return leaves the lock of the (possibly new) st->bucket held;
 * a NULL return leaves no bucket lock held.
 */
14901da177e4SLinus Torvalds static void *established_get_next(struct seq_file *seq, void *cur)
14911da177e4SLinus Torvalds {
14921da177e4SLinus Torvalds 	struct sock *sk = cur;
14938feaf0c0SArnaldo Carvalho de Melo 	struct inet_timewait_sock *tw;
14941da177e4SLinus Torvalds 	struct hlist_node *node;
14951da177e4SLinus Torvalds 	struct tcp_iter_state* st = seq->private;
14961da177e4SLinus Torvalds 
14971da177e4SLinus Torvalds 	++st->num;
14981da177e4SLinus Torvalds 
14991da177e4SLinus Torvalds 	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
15001da177e4SLinus Torvalds 		tw = cur;
15011da177e4SLinus Torvalds 		tw = tw_next(tw);
		/* Also entered from the bottom of the function with tw set
		 * to the head of the current bucket's TIME_WAIT chain.
		 */
15021da177e4SLinus Torvalds get_tw:
15031da177e4SLinus Torvalds 		while (tw && tw->tw_family != st->family) {
15041da177e4SLinus Torvalds 			tw = tw_next(tw);
15051da177e4SLinus Torvalds 		}
15061da177e4SLinus Torvalds 		if (tw) {
15071da177e4SLinus Torvalds 			cur = tw;
15081da177e4SLinus Torvalds 			goto out;
15091da177e4SLinus Torvalds 		}
		/* TIME_WAIT chain finished: this bucket is done. */
15106e04e021SArnaldo Carvalho de Melo 		read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
15111da177e4SLinus Torvalds 		st->state = TCP_SEQ_STATE_ESTABLISHED;
15121da177e4SLinus Torvalds 
15131da177e4SLinus Torvalds 		/* We can reschedule between buckets: */
15141da177e4SLinus Torvalds 		cond_resched_softirq();
15151da177e4SLinus Torvalds 
15166e04e021SArnaldo Carvalho de Melo 		if (++st->bucket < tcp_hashinfo.ehash_size) {
15176e04e021SArnaldo Carvalho de Melo 			read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
15186e04e021SArnaldo Carvalho de Melo 			sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
15191da177e4SLinus Torvalds 		} else {
			/* Past the last bucket; no lock is held here. */
15201da177e4SLinus Torvalds 			cur = NULL;
15211da177e4SLinus Torvalds 			goto out;
15221da177e4SLinus Torvalds 		}
15231da177e4SLinus Torvalds 	} else
15241da177e4SLinus Torvalds 		sk = sk_next(sk);
15251da177e4SLinus Torvalds 
15261da177e4SLinus Torvalds 	sk_for_each_from(sk, node) {
15271da177e4SLinus Torvalds 		if (sk->sk_family == st->family)
15281da177e4SLinus Torvalds 			goto found;
15291da177e4SLinus Torvalds 	}
15301da177e4SLinus Torvalds 
	/* Regular chain finished: switch to this bucket's TIME_WAIT chain,
	 * still under the same bucket lock.
	 */
15311da177e4SLinus Torvalds 	st->state = TCP_SEQ_STATE_TIME_WAIT;
15326e04e021SArnaldo Carvalho de Melo 	tw = tw_head(&tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain);
15331da177e4SLinus Torvalds 	goto get_tw;
15341da177e4SLinus Torvalds found:
15351da177e4SLinus Torvalds 	cur = sk;
15361da177e4SLinus Torvalds out:
15371da177e4SLinus Torvalds 	return cur;
15381da177e4SLinus Torvalds }
15391da177e4SLinus Torvalds 
15401da177e4SLinus Torvalds static void *established_get_idx(struct seq_file *seq, loff_t pos)
15411da177e4SLinus Torvalds {
15421da177e4SLinus Torvalds 	void *rc = established_get_first(seq);
15431da177e4SLinus Torvalds 
15441da177e4SLinus Torvalds 	while (rc && pos) {
15451da177e4SLinus Torvalds 		rc = established_get_next(seq, rc);
15461da177e4SLinus Torvalds 		--pos;
15471da177e4SLinus Torvalds 	}
15481da177e4SLinus Torvalds 	return rc;
15491da177e4SLinus Torvalds }
15501da177e4SLinus Torvalds 
15511da177e4SLinus Torvalds static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
15521da177e4SLinus Torvalds {
15531da177e4SLinus Torvalds 	void *rc;
15541da177e4SLinus Torvalds 	struct tcp_iter_state* st = seq->private;
15551da177e4SLinus Torvalds 
1556f3f05f70SArnaldo Carvalho de Melo 	inet_listen_lock(&tcp_hashinfo);
15571da177e4SLinus Torvalds 	st->state = TCP_SEQ_STATE_LISTENING;
15581da177e4SLinus Torvalds 	rc	  = listening_get_idx(seq, &pos);
15591da177e4SLinus Torvalds 
15601da177e4SLinus Torvalds 	if (!rc) {
1561f3f05f70SArnaldo Carvalho de Melo 		inet_listen_unlock(&tcp_hashinfo);
15621da177e4SLinus Torvalds 		local_bh_disable();
15631da177e4SLinus Torvalds 		st->state = TCP_SEQ_STATE_ESTABLISHED;
15641da177e4SLinus Torvalds 		rc	  = established_get_idx(seq, pos);
15651da177e4SLinus Torvalds 	}
15661da177e4SLinus Torvalds 
15671da177e4SLinus Torvalds 	return rc;
15681da177e4SLinus Torvalds }
15691da177e4SLinus Torvalds 
15701da177e4SLinus Torvalds static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
15711da177e4SLinus Torvalds {
15721da177e4SLinus Torvalds 	struct tcp_iter_state* st = seq->private;
15731da177e4SLinus Torvalds 	st->state = TCP_SEQ_STATE_LISTENING;
15741da177e4SLinus Torvalds 	st->num = 0;
15751da177e4SLinus Torvalds 	return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
15761da177e4SLinus Torvalds }
15771da177e4SLinus Torvalds 
15781da177e4SLinus Torvalds static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
15791da177e4SLinus Torvalds {
15801da177e4SLinus Torvalds 	void *rc = NULL;
15811da177e4SLinus Torvalds 	struct tcp_iter_state* st;
15821da177e4SLinus Torvalds 
15831da177e4SLinus Torvalds 	if (v == SEQ_START_TOKEN) {
15841da177e4SLinus Torvalds 		rc = tcp_get_idx(seq, 0);
15851da177e4SLinus Torvalds 		goto out;
15861da177e4SLinus Torvalds 	}
15871da177e4SLinus Torvalds 	st = seq->private;
15881da177e4SLinus Torvalds 
15891da177e4SLinus Torvalds 	switch (st->state) {
15901da177e4SLinus Torvalds 	case TCP_SEQ_STATE_OPENREQ:
15911da177e4SLinus Torvalds 	case TCP_SEQ_STATE_LISTENING:
15921da177e4SLinus Torvalds 		rc = listening_get_next(seq, v);
15931da177e4SLinus Torvalds 		if (!rc) {
1594f3f05f70SArnaldo Carvalho de Melo 			inet_listen_unlock(&tcp_hashinfo);
15951da177e4SLinus Torvalds 			local_bh_disable();
15961da177e4SLinus Torvalds 			st->state = TCP_SEQ_STATE_ESTABLISHED;
15971da177e4SLinus Torvalds 			rc	  = established_get_first(seq);
15981da177e4SLinus Torvalds 		}
15991da177e4SLinus Torvalds 		break;
16001da177e4SLinus Torvalds 	case TCP_SEQ_STATE_ESTABLISHED:
16011da177e4SLinus Torvalds 	case TCP_SEQ_STATE_TIME_WAIT:
16021da177e4SLinus Torvalds 		rc = established_get_next(seq, v);
16031da177e4SLinus Torvalds 		break;
16041da177e4SLinus Torvalds 	}
16051da177e4SLinus Torvalds out:
16061da177e4SLinus Torvalds 	++*pos;
16071da177e4SLinus Torvalds 	return rc;
16081da177e4SLinus Torvalds }
16091da177e4SLinus Torvalds 
16101da177e4SLinus Torvalds static void tcp_seq_stop(struct seq_file *seq, void *v)
16111da177e4SLinus Torvalds {
16121da177e4SLinus Torvalds 	struct tcp_iter_state* st = seq->private;
16131da177e4SLinus Torvalds 
16141da177e4SLinus Torvalds 	switch (st->state) {
16151da177e4SLinus Torvalds 	case TCP_SEQ_STATE_OPENREQ:
16161da177e4SLinus Torvalds 		if (v) {
1617463c84b9SArnaldo Carvalho de Melo 			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
1618463c84b9SArnaldo Carvalho de Melo 			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
16191da177e4SLinus Torvalds 		}
16201da177e4SLinus Torvalds 	case TCP_SEQ_STATE_LISTENING:
16211da177e4SLinus Torvalds 		if (v != SEQ_START_TOKEN)
1622f3f05f70SArnaldo Carvalho de Melo 			inet_listen_unlock(&tcp_hashinfo);
16231da177e4SLinus Torvalds 		break;
16241da177e4SLinus Torvalds 	case TCP_SEQ_STATE_TIME_WAIT:
16251da177e4SLinus Torvalds 	case TCP_SEQ_STATE_ESTABLISHED:
16261da177e4SLinus Torvalds 		if (v)
16276e04e021SArnaldo Carvalho de Melo 			read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
16281da177e4SLinus Torvalds 		local_bh_enable();
16291da177e4SLinus Torvalds 		break;
16301da177e4SLinus Torvalds 	}
16311da177e4SLinus Torvalds }
16321da177e4SLinus Torvalds 
16331da177e4SLinus Torvalds static int tcp_seq_open(struct inode *inode, struct file *file)
16341da177e4SLinus Torvalds {
16351da177e4SLinus Torvalds 	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
16361da177e4SLinus Torvalds 	struct seq_file *seq;
16371da177e4SLinus Torvalds 	struct tcp_iter_state *s;
16381da177e4SLinus Torvalds 	int rc;
16391da177e4SLinus Torvalds 
16401da177e4SLinus Torvalds 	if (unlikely(afinfo == NULL))
16411da177e4SLinus Torvalds 		return -EINVAL;
16421da177e4SLinus Torvalds 
16431da177e4SLinus Torvalds 	s = kmalloc(sizeof(*s), GFP_KERNEL);
16441da177e4SLinus Torvalds 	if (!s)
16451da177e4SLinus Torvalds 		return -ENOMEM;
16461da177e4SLinus Torvalds 	memset(s, 0, sizeof(*s));
16471da177e4SLinus Torvalds 	s->family		= afinfo->family;
16481da177e4SLinus Torvalds 	s->seq_ops.start	= tcp_seq_start;
16491da177e4SLinus Torvalds 	s->seq_ops.next		= tcp_seq_next;
16501da177e4SLinus Torvalds 	s->seq_ops.show		= afinfo->seq_show;
16511da177e4SLinus Torvalds 	s->seq_ops.stop		= tcp_seq_stop;
16521da177e4SLinus Torvalds 
16531da177e4SLinus Torvalds 	rc = seq_open(file, &s->seq_ops);
16541da177e4SLinus Torvalds 	if (rc)
16551da177e4SLinus Torvalds 		goto out_kfree;
16561da177e4SLinus Torvalds 	seq	     = file->private_data;
16571da177e4SLinus Torvalds 	seq->private = s;
16581da177e4SLinus Torvalds out:
16591da177e4SLinus Torvalds 	return rc;
16601da177e4SLinus Torvalds out_kfree:
16611da177e4SLinus Torvalds 	kfree(s);
16621da177e4SLinus Torvalds 	goto out;
16631da177e4SLinus Torvalds }
16641da177e4SLinus Torvalds 
16651da177e4SLinus Torvalds int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
16661da177e4SLinus Torvalds {
16671da177e4SLinus Torvalds 	int rc = 0;
16681da177e4SLinus Torvalds 	struct proc_dir_entry *p;
16691da177e4SLinus Torvalds 
16701da177e4SLinus Torvalds 	if (!afinfo)
16711da177e4SLinus Torvalds 		return -EINVAL;
16721da177e4SLinus Torvalds 	afinfo->seq_fops->owner		= afinfo->owner;
16731da177e4SLinus Torvalds 	afinfo->seq_fops->open		= tcp_seq_open;
16741da177e4SLinus Torvalds 	afinfo->seq_fops->read		= seq_read;
16751da177e4SLinus Torvalds 	afinfo->seq_fops->llseek	= seq_lseek;
16761da177e4SLinus Torvalds 	afinfo->seq_fops->release	= seq_release_private;
16771da177e4SLinus Torvalds 
16781da177e4SLinus Torvalds 	p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
16791da177e4SLinus Torvalds 	if (p)
16801da177e4SLinus Torvalds 		p->data = afinfo;
16811da177e4SLinus Torvalds 	else
16821da177e4SLinus Torvalds 		rc = -ENOMEM;
16831da177e4SLinus Torvalds 	return rc;
16841da177e4SLinus Torvalds }
16851da177e4SLinus Torvalds 
16861da177e4SLinus Torvalds void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
16871da177e4SLinus Torvalds {
16881da177e4SLinus Torvalds 	if (!afinfo)
16891da177e4SLinus Torvalds 		return;
16901da177e4SLinus Torvalds 	proc_net_remove(afinfo->name);
16911da177e4SLinus Torvalds 	memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
16921da177e4SLinus Torvalds }
16931da177e4SLinus Torvalds 
169460236fddSArnaldo Carvalho de Melo static void get_openreq4(struct sock *sk, struct request_sock *req,
16951da177e4SLinus Torvalds 			 char *tmpbuf, int i, int uid)
16961da177e4SLinus Torvalds {
16972e6599cbSArnaldo Carvalho de Melo 	const struct inet_request_sock *ireq = inet_rsk(req);
16981da177e4SLinus Torvalds 	int ttd = req->expires - jiffies;
16991da177e4SLinus Torvalds 
17001da177e4SLinus Torvalds 	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
17011da177e4SLinus Torvalds 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
17021da177e4SLinus Torvalds 		i,
17032e6599cbSArnaldo Carvalho de Melo 		ireq->loc_addr,
17041da177e4SLinus Torvalds 		ntohs(inet_sk(sk)->sport),
17052e6599cbSArnaldo Carvalho de Melo 		ireq->rmt_addr,
17062e6599cbSArnaldo Carvalho de Melo 		ntohs(ireq->rmt_port),
17071da177e4SLinus Torvalds 		TCP_SYN_RECV,
17081da177e4SLinus Torvalds 		0, 0, /* could print option size, but that is af dependent. */
17091da177e4SLinus Torvalds 		1,    /* timers active (only the expire timer) */
17101da177e4SLinus Torvalds 		jiffies_to_clock_t(ttd),
17111da177e4SLinus Torvalds 		req->retrans,
17121da177e4SLinus Torvalds 		uid,
17131da177e4SLinus Torvalds 		0,  /* non standard timer */
17141da177e4SLinus Torvalds 		0, /* open_requests have no inode */
17151da177e4SLinus Torvalds 		atomic_read(&sk->sk_refcnt),
17161da177e4SLinus Torvalds 		req);
17171da177e4SLinus Torvalds }
17181da177e4SLinus Torvalds 
17191da177e4SLinus Torvalds static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
17201da177e4SLinus Torvalds {
17211da177e4SLinus Torvalds 	int timer_active;
17221da177e4SLinus Torvalds 	unsigned long timer_expires;
17231da177e4SLinus Torvalds 	struct tcp_sock *tp = tcp_sk(sp);
1724463c84b9SArnaldo Carvalho de Melo 	const struct inet_connection_sock *icsk = inet_csk(sp);
17251da177e4SLinus Torvalds 	struct inet_sock *inet = inet_sk(sp);
17261da177e4SLinus Torvalds 	unsigned int dest = inet->daddr;
17271da177e4SLinus Torvalds 	unsigned int src = inet->rcv_saddr;
17281da177e4SLinus Torvalds 	__u16 destp = ntohs(inet->dport);
17291da177e4SLinus Torvalds 	__u16 srcp = ntohs(inet->sport);
17301da177e4SLinus Torvalds 
1731463c84b9SArnaldo Carvalho de Melo 	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
17321da177e4SLinus Torvalds 		timer_active	= 1;
1733463c84b9SArnaldo Carvalho de Melo 		timer_expires	= icsk->icsk_timeout;
1734463c84b9SArnaldo Carvalho de Melo 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
17351da177e4SLinus Torvalds 		timer_active	= 4;
1736463c84b9SArnaldo Carvalho de Melo 		timer_expires	= icsk->icsk_timeout;
17371da177e4SLinus Torvalds 	} else if (timer_pending(&sp->sk_timer)) {
17381da177e4SLinus Torvalds 		timer_active	= 2;
17391da177e4SLinus Torvalds 		timer_expires	= sp->sk_timer.expires;
17401da177e4SLinus Torvalds 	} else {
17411da177e4SLinus Torvalds 		timer_active	= 0;
17421da177e4SLinus Torvalds 		timer_expires = jiffies;
17431da177e4SLinus Torvalds 	}
17441da177e4SLinus Torvalds 
17451da177e4SLinus Torvalds 	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
17461da177e4SLinus Torvalds 			"%08X %5d %8d %lu %d %p %u %u %u %u %d",
17471da177e4SLinus Torvalds 		i, src, srcp, dest, destp, sp->sk_state,
174847da8ee6SSridhar Samudrala 		tp->write_seq - tp->snd_una,
174947da8ee6SSridhar Samudrala 		(sp->sk_state == TCP_LISTEN) ? sp->sk_ack_backlog : (tp->rcv_nxt - tp->copied_seq),
17501da177e4SLinus Torvalds 		timer_active,
17511da177e4SLinus Torvalds 		jiffies_to_clock_t(timer_expires - jiffies),
1752463c84b9SArnaldo Carvalho de Melo 		icsk->icsk_retransmits,
17531da177e4SLinus Torvalds 		sock_i_uid(sp),
17546687e988SArnaldo Carvalho de Melo 		icsk->icsk_probes_out,
17551da177e4SLinus Torvalds 		sock_i_ino(sp),
17561da177e4SLinus Torvalds 		atomic_read(&sp->sk_refcnt), sp,
1757463c84b9SArnaldo Carvalho de Melo 		icsk->icsk_rto,
1758463c84b9SArnaldo Carvalho de Melo 		icsk->icsk_ack.ato,
1759463c84b9SArnaldo Carvalho de Melo 		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
17601da177e4SLinus Torvalds 		tp->snd_cwnd,
17611da177e4SLinus Torvalds 		tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
17621da177e4SLinus Torvalds }
17631da177e4SLinus Torvalds 
17648feaf0c0SArnaldo Carvalho de Melo static void get_timewait4_sock(struct inet_timewait_sock *tw, char *tmpbuf, int i)
17651da177e4SLinus Torvalds {
17661da177e4SLinus Torvalds 	unsigned int dest, src;
17671da177e4SLinus Torvalds 	__u16 destp, srcp;
17681da177e4SLinus Torvalds 	int ttd = tw->tw_ttd - jiffies;
17691da177e4SLinus Torvalds 
17701da177e4SLinus Torvalds 	if (ttd < 0)
17711da177e4SLinus Torvalds 		ttd = 0;
17721da177e4SLinus Torvalds 
17731da177e4SLinus Torvalds 	dest  = tw->tw_daddr;
17741da177e4SLinus Torvalds 	src   = tw->tw_rcv_saddr;
17751da177e4SLinus Torvalds 	destp = ntohs(tw->tw_dport);
17761da177e4SLinus Torvalds 	srcp  = ntohs(tw->tw_sport);
17771da177e4SLinus Torvalds 
17781da177e4SLinus Torvalds 	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
17791da177e4SLinus Torvalds 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
17801da177e4SLinus Torvalds 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
17811da177e4SLinus Torvalds 		3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
17821da177e4SLinus Torvalds 		atomic_read(&tw->tw_refcnt), tw);
17831da177e4SLinus Torvalds }
17841da177e4SLinus Torvalds 
17851da177e4SLinus Torvalds #define TMPSZ 150
17861da177e4SLinus Torvalds 
17871da177e4SLinus Torvalds static int tcp4_seq_show(struct seq_file *seq, void *v)
17881da177e4SLinus Torvalds {
17891da177e4SLinus Torvalds 	struct tcp_iter_state* st;
17901da177e4SLinus Torvalds 	char tmpbuf[TMPSZ + 1];
17911da177e4SLinus Torvalds 
17921da177e4SLinus Torvalds 	if (v == SEQ_START_TOKEN) {
17931da177e4SLinus Torvalds 		seq_printf(seq, "%-*s\n", TMPSZ - 1,
17941da177e4SLinus Torvalds 			   "  sl  local_address rem_address   st tx_queue "
17951da177e4SLinus Torvalds 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
17961da177e4SLinus Torvalds 			   "inode");
17971da177e4SLinus Torvalds 		goto out;
17981da177e4SLinus Torvalds 	}
17991da177e4SLinus Torvalds 	st = seq->private;
18001da177e4SLinus Torvalds 
18011da177e4SLinus Torvalds 	switch (st->state) {
18021da177e4SLinus Torvalds 	case TCP_SEQ_STATE_LISTENING:
18031da177e4SLinus Torvalds 	case TCP_SEQ_STATE_ESTABLISHED:
18041da177e4SLinus Torvalds 		get_tcp4_sock(v, tmpbuf, st->num);
18051da177e4SLinus Torvalds 		break;
18061da177e4SLinus Torvalds 	case TCP_SEQ_STATE_OPENREQ:
18071da177e4SLinus Torvalds 		get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
18081da177e4SLinus Torvalds 		break;
18091da177e4SLinus Torvalds 	case TCP_SEQ_STATE_TIME_WAIT:
18101da177e4SLinus Torvalds 		get_timewait4_sock(v, tmpbuf, st->num);
18111da177e4SLinus Torvalds 		break;
18121da177e4SLinus Torvalds 	}
18131da177e4SLinus Torvalds 	seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
18141da177e4SLinus Torvalds out:
18151da177e4SLinus Torvalds 	return 0;
18161da177e4SLinus Torvalds }
18171da177e4SLinus Torvalds 
18181da177e4SLinus Torvalds static struct file_operations tcp4_seq_fops;
18191da177e4SLinus Torvalds static struct tcp_seq_afinfo tcp4_seq_afinfo = {
18201da177e4SLinus Torvalds 	.owner		= THIS_MODULE,
18211da177e4SLinus Torvalds 	.name		= "tcp",
18221da177e4SLinus Torvalds 	.family		= AF_INET,
18231da177e4SLinus Torvalds 	.seq_show	= tcp4_seq_show,
18241da177e4SLinus Torvalds 	.seq_fops	= &tcp4_seq_fops,
18251da177e4SLinus Torvalds };
18261da177e4SLinus Torvalds 
18271da177e4SLinus Torvalds int __init tcp4_proc_init(void)
18281da177e4SLinus Torvalds {
18291da177e4SLinus Torvalds 	return tcp_proc_register(&tcp4_seq_afinfo);
18301da177e4SLinus Torvalds }
18311da177e4SLinus Torvalds 
18321da177e4SLinus Torvalds void tcp4_proc_exit(void)
18331da177e4SLinus Torvalds {
18341da177e4SLinus Torvalds 	tcp_proc_unregister(&tcp4_seq_afinfo);
18351da177e4SLinus Torvalds }
18361da177e4SLinus Torvalds #endif /* CONFIG_PROC_FS */
18371da177e4SLinus Torvalds 
18381da177e4SLinus Torvalds struct proto tcp_prot = {
18391da177e4SLinus Torvalds 	.name			= "TCP",
18401da177e4SLinus Torvalds 	.owner			= THIS_MODULE,
18411da177e4SLinus Torvalds 	.close			= tcp_close,
18421da177e4SLinus Torvalds 	.connect		= tcp_v4_connect,
18431da177e4SLinus Torvalds 	.disconnect		= tcp_disconnect,
1844463c84b9SArnaldo Carvalho de Melo 	.accept			= inet_csk_accept,
18451da177e4SLinus Torvalds 	.ioctl			= tcp_ioctl,
18461da177e4SLinus Torvalds 	.init			= tcp_v4_init_sock,
18471da177e4SLinus Torvalds 	.destroy		= tcp_v4_destroy_sock,
18481da177e4SLinus Torvalds 	.shutdown		= tcp_shutdown,
18491da177e4SLinus Torvalds 	.setsockopt		= tcp_setsockopt,
18501da177e4SLinus Torvalds 	.getsockopt		= tcp_getsockopt,
18511da177e4SLinus Torvalds 	.sendmsg		= tcp_sendmsg,
18521da177e4SLinus Torvalds 	.recvmsg		= tcp_recvmsg,
18531da177e4SLinus Torvalds 	.backlog_rcv		= tcp_v4_do_rcv,
18541da177e4SLinus Torvalds 	.hash			= tcp_v4_hash,
18551da177e4SLinus Torvalds 	.unhash			= tcp_unhash,
18561da177e4SLinus Torvalds 	.get_port		= tcp_v4_get_port,
18571da177e4SLinus Torvalds 	.enter_memory_pressure	= tcp_enter_memory_pressure,
18581da177e4SLinus Torvalds 	.sockets_allocated	= &tcp_sockets_allocated,
18590a5578cfSArnaldo Carvalho de Melo 	.orphan_count		= &tcp_orphan_count,
18601da177e4SLinus Torvalds 	.memory_allocated	= &tcp_memory_allocated,
18611da177e4SLinus Torvalds 	.memory_pressure	= &tcp_memory_pressure,
18621da177e4SLinus Torvalds 	.sysctl_mem		= sysctl_tcp_mem,
18631da177e4SLinus Torvalds 	.sysctl_wmem		= sysctl_tcp_wmem,
18641da177e4SLinus Torvalds 	.sysctl_rmem		= sysctl_tcp_rmem,
18651da177e4SLinus Torvalds 	.max_header		= MAX_TCP_HEADER,
18661da177e4SLinus Torvalds 	.obj_size		= sizeof(struct tcp_sock),
18676d6ee43eSArnaldo Carvalho de Melo 	.twsk_prot		= &tcp_timewait_sock_ops,
186860236fddSArnaldo Carvalho de Melo 	.rsk_prot		= &tcp_request_sock_ops,
1869543d9cfeSArnaldo Carvalho de Melo #ifdef CONFIG_COMPAT
1870543d9cfeSArnaldo Carvalho de Melo 	.compat_setsockopt	= compat_tcp_setsockopt,
1871543d9cfeSArnaldo Carvalho de Melo 	.compat_getsockopt	= compat_tcp_getsockopt,
1872543d9cfeSArnaldo Carvalho de Melo #endif
18731da177e4SLinus Torvalds };
18741da177e4SLinus Torvalds 
18751da177e4SLinus Torvalds void __init tcp_v4_init(struct net_proto_family *ops)
18761da177e4SLinus Torvalds {
1877c4d93909SArnaldo Carvalho de Melo 	if (inet_csk_ctl_sock_create(&tcp_socket, PF_INET, SOCK_RAW, IPPROTO_TCP) < 0)
18781da177e4SLinus Torvalds 		panic("Failed to create the TCP control socket.\n");
18791da177e4SLinus Torvalds }
18801da177e4SLinus Torvalds 
/* Objects and tables exported from this file. */
EXPORT_SYMBOL(ipv4_specific);
EXPORT_SYMBOL(tcp_hashinfo);
EXPORT_SYMBOL(tcp_prot);

/* Functions exported from this file. */
EXPORT_SYMBOL(tcp_unhash);
EXPORT_SYMBOL(tcp_v4_conn_request);
EXPORT_SYMBOL(tcp_v4_connect);
EXPORT_SYMBOL(tcp_v4_do_rcv);
EXPORT_SYMBOL(tcp_v4_remember_stamp);
EXPORT_SYMBOL(tcp_v4_send_check);
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

/* Sysctl-named variables. */
EXPORT_SYMBOL(sysctl_local_port_range);
EXPORT_SYMBOL(sysctl_tcp_low_latency);

/* Proc registration helpers exist only when procfs support is built in. */
#ifdef CONFIG_PROC_FS
EXPORT_SYMBOL(tcp_proc_register);
EXPORT_SYMBOL(tcp_proc_unregister);
#endif
18981da177e4SLinus Torvalds 
1899