xref: /linux/net/ipv4/tcp_output.c (revision 46d3ceabd8d98ed0ad10f20c595ca784e34786c5)
11da177e4SLinus Torvalds /*
21da177e4SLinus Torvalds  * INET		An implementation of the TCP/IP protocol suite for the LINUX
31da177e4SLinus Torvalds  *		operating system.  INET is implemented using the  BSD Socket
41da177e4SLinus Torvalds  *		interface as the means of communication with the user level.
51da177e4SLinus Torvalds  *
61da177e4SLinus Torvalds  *		Implementation of the Transmission Control Protocol(TCP).
71da177e4SLinus Torvalds  *
802c30a84SJesper Juhl  * Authors:	Ross Biro
91da177e4SLinus Torvalds  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
101da177e4SLinus Torvalds  *		Mark Evans, <evansmp@uhura.aston.ac.uk>
111da177e4SLinus Torvalds  *		Corey Minyard <wf-rch!minyard@relay.EU.net>
121da177e4SLinus Torvalds  *		Florian La Roche, <flla@stud.uni-sb.de>
131da177e4SLinus Torvalds  *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
141da177e4SLinus Torvalds  *		Linus Torvalds, <torvalds@cs.helsinki.fi>
151da177e4SLinus Torvalds  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
161da177e4SLinus Torvalds  *		Matthew Dillon, <dillon@apollo.west.oic.com>
171da177e4SLinus Torvalds  *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
181da177e4SLinus Torvalds  *		Jorge Cwik, <jorge@laser.satlink.net>
191da177e4SLinus Torvalds  */
201da177e4SLinus Torvalds 
211da177e4SLinus Torvalds /*
221da177e4SLinus Torvalds  * Changes:	Pedro Roque	:	Retransmit queue handled by TCP.
231da177e4SLinus Torvalds  *				:	Fragmentation on mtu decrease
241da177e4SLinus Torvalds  *				:	Segment collapse on retransmit
251da177e4SLinus Torvalds  *				:	AF independence
261da177e4SLinus Torvalds  *
271da177e4SLinus Torvalds  *		Linus Torvalds	:	send_delayed_ack
281da177e4SLinus Torvalds  *		David S. Miller	:	Charge memory using the right skb
291da177e4SLinus Torvalds  *					during syn/ack processing.
301da177e4SLinus Torvalds  *		David S. Miller :	Output engine completely rewritten.
311da177e4SLinus Torvalds  *		Andrea Arcangeli:	SYNACK carry ts_recent in tsecr.
321da177e4SLinus Torvalds  *		Cacophonix Gaul :	draft-minshall-nagle-01
331da177e4SLinus Torvalds  *		J Hadi Salim	:	ECN support
341da177e4SLinus Torvalds  *
351da177e4SLinus Torvalds  */
361da177e4SLinus Torvalds 
3791df42beSJoe Perches #define pr_fmt(fmt) "TCP: " fmt
3891df42beSJoe Perches 
391da177e4SLinus Torvalds #include <net/tcp.h>
401da177e4SLinus Torvalds 
411da177e4SLinus Torvalds #include <linux/compiler.h>
425a0e3ad6STejun Heo #include <linux/gfp.h>
431da177e4SLinus Torvalds #include <linux/module.h>
441da177e4SLinus Torvalds 
451da177e4SLinus Torvalds /* People can turn this off for buggy TCP's found in printers etc. */
46ab32ea5dSBrian Haley int sysctl_tcp_retrans_collapse __read_mostly = 1;
471da177e4SLinus Torvalds 
4815d99e02SRick Jones /* People can turn this on to work with those rare, broken TCPs that
4915d99e02SRick Jones  * interpret the window field as a signed quantity.
5015d99e02SRick Jones  */
51ab32ea5dSBrian Haley int sysctl_tcp_workaround_signed_windows __read_mostly = 0;
5215d99e02SRick Jones 
53*46d3ceabSEric Dumazet /* Default TSQ limit of two TSO segments */
54*46d3ceabSEric Dumazet int sysctl_tcp_limit_output_bytes __read_mostly = 131072;
55*46d3ceabSEric Dumazet 
561da177e4SLinus Torvalds /* This limits the percentage of the congestion window which we
571da177e4SLinus Torvalds  * will allow a single TSO frame to consume.  Building TSO frames
581da177e4SLinus Torvalds  * which are too large can cause TCP streams to be bursty.
591da177e4SLinus Torvalds  */
60ab32ea5dSBrian Haley int sysctl_tcp_tso_win_divisor __read_mostly = 3;
611da177e4SLinus Torvalds 
62ab32ea5dSBrian Haley int sysctl_tcp_mtu_probing __read_mostly = 0;
6397b1ce25SShan Wei int sysctl_tcp_base_mss __read_mostly = TCP_BASE_MSS;
645d424d5aSJohn Heffner 
6535089bb2SDavid S. Miller /* By default, RFC2861 behavior.  */
66ab32ea5dSBrian Haley int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
6735089bb2SDavid S. Miller 
68519855c5SWilliam Allen Simpson int sysctl_tcp_cookie_size __read_mostly = 0; /* TCP_COOKIE_MAX */
69e6b09ccaSDavid S. Miller EXPORT_SYMBOL_GPL(sysctl_tcp_cookie_size);
70519855c5SWilliam Allen Simpson 
71*46d3ceabSEric Dumazet static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
72*46d3ceabSEric Dumazet 			   int push_one, gfp_t gfp);
73519855c5SWilliam Allen Simpson 
7467edfef7SAndi Kleen /* Account for new data that has been sent to the network. */
75cf533ea5SEric Dumazet static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
766ff03ac3SIlpo Järvinen {
776ff03ac3SIlpo Järvinen 	struct tcp_sock *tp = tcp_sk(sk);
7866f5fe62SIlpo Järvinen 	unsigned int prior_packets = tp->packets_out;
799e412ba7SIlpo Järvinen 
80fe067e8aSDavid S. Miller 	tcp_advance_send_head(sk, skb);
811da177e4SLinus Torvalds 	tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
828512430eSIlpo Järvinen 
8325985edcSLucas De Marchi 	/* Don't override Nagle indefinitely with F-RTO */
848512430eSIlpo Järvinen 	if (tp->frto_counter == 2)
858512430eSIlpo Järvinen 		tp->frto_counter = 3;
8666f5fe62SIlpo Järvinen 
8766f5fe62SIlpo Järvinen 	tp->packets_out += tcp_skb_pcount(skb);
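	/* (Re)arm the retransmit timer when this is the first packet in
	 * flight, or when the early-retransmit logic had delayed the timer. */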
88750ea2baSYuchung Cheng 	if (!prior_packets || tp->early_retrans_delayed)
89750ea2baSYuchung Cheng 		tcp_rearm_rto(sk);
901da177e4SLinus Torvalds }
911da177e4SLinus Torvalds 
921da177e4SLinus Torvalds /* SND.NXT, if window was not shrunk.
931da177e4SLinus Torvalds  * If the window has been shrunk, what should we do? It is not clear at all.
941da177e4SLinus Torvalds  * Using SND.UNA we will fail to open the window, and SND.NXT is out of window. :-(
951da177e4SLinus Torvalds  * Anything in between SND.UNA...SND.UNA+SND.WND can also already be
961da177e4SLinus Torvalds  * invalid. OK, let's go with this for now:
971da177e4SLinus Torvalds  */
98cf533ea5SEric Dumazet static inline __u32 tcp_acceptable_seq(const struct sock *sk)
991da177e4SLinus Torvalds {
100cf533ea5SEric Dumazet 	const struct tcp_sock *tp = tcp_sk(sk);
1019e412ba7SIlpo Järvinen 
10290840defSIlpo Järvinen 	if (!before(tcp_wnd_end(tp), tp->snd_nxt))
1031da177e4SLinus Torvalds 		return tp->snd_nxt;
1041da177e4SLinus Torvalds 	else
10590840defSIlpo Järvinen 		return tcp_wnd_end(tp);
1061da177e4SLinus Torvalds }
1071da177e4SLinus Torvalds 
1081da177e4SLinus Torvalds /* Calculate mss to advertise in SYN segment.
1091da177e4SLinus Torvalds  * RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that:
1101da177e4SLinus Torvalds  *
1111da177e4SLinus Torvalds  * 1. It is independent of path mtu.
1121da177e4SLinus Torvalds  * 2. Ideally, it is the maximal possible segment size, i.e. 65535-40.
1131da177e4SLinus Torvalds  * 3. For IPv4 it is reasonable to calculate it from maximal MTU of
1141da177e4SLinus Torvalds  *    attached devices, because some buggy hosts are confused by
1151da177e4SLinus Torvalds  *    large MSS.
1161da177e4SLinus Torvalds  * 4. We do not do 3; we advertise an MSS calculated from the first
1171da177e4SLinus Torvalds  *    hop device mtu, but allow it to be raised to ip_rt_min_advmss.
1181da177e4SLinus Torvalds  *    This may be overridden via information stored in routing table.
1191da177e4SLinus Torvalds  * 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible,
1201da177e4SLinus Torvalds  *    probably even Jumbo".
1211da177e4SLinus Torvalds  */
1221da177e4SLinus Torvalds static __u16 tcp_advertise_mss(struct sock *sk)
1231da177e4SLinus Torvalds {
1241da177e4SLinus Torvalds 	struct tcp_sock *tp = tcp_sk(sk);
125cf533ea5SEric Dumazet 	const struct dst_entry *dst = __sk_dst_get(sk);
1261da177e4SLinus Torvalds 	int mss = tp->advmss;
1271da177e4SLinus Torvalds 
1280dbaee3bSDavid S. Miller 	if (dst) {
1290dbaee3bSDavid S. Miller 		unsigned int metric = dst_metric_advmss(dst);
1300dbaee3bSDavid S. Miller 
1310dbaee3bSDavid S. Miller 		if (metric < mss) {
1320dbaee3bSDavid S. Miller 			mss = metric;
1331da177e4SLinus Torvalds 			tp->advmss = mss;
1341da177e4SLinus Torvalds 		}
1350dbaee3bSDavid S. Miller 	}
1361da177e4SLinus Torvalds 
1371da177e4SLinus Torvalds 	return (__u16)mss;
1381da177e4SLinus Torvalds }
1391da177e4SLinus Torvalds 
1401da177e4SLinus Torvalds /* RFC2861. Reset CWND after an idle period longer than RTO to the "restart window".
1411da177e4SLinus Torvalds  * This is the first part of the cwnd validation mechanism. */
142cf533ea5SEric Dumazet static void tcp_cwnd_restart(struct sock *sk, const struct dst_entry *dst)
1431da177e4SLinus Torvalds {
144463c84b9SArnaldo Carvalho de Melo 	struct tcp_sock *tp = tcp_sk(sk);
1451da177e4SLinus Torvalds 	s32 delta = tcp_time_stamp - tp->lsndtime;
1461da177e4SLinus Torvalds 	u32 restart_cwnd = tcp_init_cwnd(tp, dst);
1471da177e4SLinus Torvalds 	u32 cwnd = tp->snd_cwnd;
1481da177e4SLinus Torvalds 
1496687e988SArnaldo Carvalho de Melo 	tcp_ca_event(sk, CA_EVENT_CWND_RESTART);
1501da177e4SLinus Torvalds 
1516687e988SArnaldo Carvalho de Melo 	tp->snd_ssthresh = tcp_current_ssthresh(sk);
1521da177e4SLinus Torvalds 	restart_cwnd = min(restart_cwnd, cwnd);
1531da177e4SLinus Torvalds 
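	/* Halve cwnd once for each full RTO that elapsed while the connection
	 * was idle, but never below the restart window (RFC 2861). */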
154463c84b9SArnaldo Carvalho de Melo 	while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd)
1551da177e4SLinus Torvalds 		cwnd >>= 1;
1561da177e4SLinus Torvalds 	tp->snd_cwnd = max(cwnd, restart_cwnd);
1571da177e4SLinus Torvalds 	tp->snd_cwnd_stamp = tcp_time_stamp;
1581da177e4SLinus Torvalds 	tp->snd_cwnd_used = 0;
1591da177e4SLinus Torvalds }
1601da177e4SLinus Torvalds 
16167edfef7SAndi Kleen /* Congestion state accounting after a packet has been sent. */
16240efc6faSStephen Hemminger static void tcp_event_data_sent(struct tcp_sock *tp,
163cf533ea5SEric Dumazet 				struct sock *sk)
1641da177e4SLinus Torvalds {
165463c84b9SArnaldo Carvalho de Melo 	struct inet_connection_sock *icsk = inet_csk(sk);
166463c84b9SArnaldo Carvalho de Melo 	const u32 now = tcp_time_stamp;
1671da177e4SLinus Torvalds 
16835089bb2SDavid S. Miller 	if (sysctl_tcp_slow_start_after_idle &&
16935089bb2SDavid S. Miller 	    (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto))
170463c84b9SArnaldo Carvalho de Melo 		tcp_cwnd_restart(sk, __sk_dst_get(sk));
1711da177e4SLinus Torvalds 
1721da177e4SLinus Torvalds 	tp->lsndtime = now;
1731da177e4SLinus Torvalds 
1741da177e4SLinus Torvalds 	/* If this is a reply within ATO of the last received
1751da177e4SLinus Torvalds 	 * packet, enter pingpong mode.
1761da177e4SLinus Torvalds 	 */
177463c84b9SArnaldo Carvalho de Melo 	if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
178463c84b9SArnaldo Carvalho de Melo 		icsk->icsk_ack.pingpong = 1;
1791da177e4SLinus Torvalds }
1801da177e4SLinus Torvalds 
18167edfef7SAndi Kleen /* Account for an ACK we sent. */
18240efc6faSStephen Hemminger static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
1831da177e4SLinus Torvalds {
184463c84b9SArnaldo Carvalho de Melo 	tcp_dec_quickack_mode(sk, pkts);
185463c84b9SArnaldo Carvalho de Melo 	inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
1861da177e4SLinus Torvalds }
1871da177e4SLinus Torvalds 
1881da177e4SLinus Torvalds /* Determine a window scaling and initial window to offer.
1891da177e4SLinus Torvalds  * Based on the assumption that the given amount of space
1901da177e4SLinus Torvalds  * will be offered. Store the results in the tp structure.
1911da177e4SLinus Torvalds  * NOTE: for smooth operation initial space offering should
1921da177e4SLinus Torvalds  * be a multiple of mss if possible. We assume here that mss >= 1.
1931da177e4SLinus Torvalds  * This MUST be enforced by all callers.
1941da177e4SLinus Torvalds  */
1951da177e4SLinus Torvalds void tcp_select_initial_window(int __space, __u32 mss,
1961da177e4SLinus Torvalds 			       __u32 *rcv_wnd, __u32 *window_clamp,
19731d12926Slaurent chavey 			       int wscale_ok, __u8 *rcv_wscale,
19831d12926Slaurent chavey 			       __u32 init_rcv_wnd)
1991da177e4SLinus Torvalds {
2001da177e4SLinus Torvalds 	unsigned int space = (__space < 0 ? 0 : __space);
2011da177e4SLinus Torvalds 
2021da177e4SLinus Torvalds 	/* If no clamp set the clamp to the max possible scaled window */
2031da177e4SLinus Torvalds 	if (*window_clamp == 0)
2041da177e4SLinus Torvalds 		(*window_clamp) = (65535 << 14);
2051da177e4SLinus Torvalds 	space = min(*window_clamp, space);
2061da177e4SLinus Torvalds 
2071da177e4SLinus Torvalds 	/* Quantize space offering to a multiple of mss if possible. */
2081da177e4SLinus Torvalds 	if (space > mss)
2091da177e4SLinus Torvalds 		space = (space / mss) * mss;
2101da177e4SLinus Torvalds 
2111da177e4SLinus Torvalds 	/* NOTE: offering an initial window larger than 32767
21215d99e02SRick Jones 	 * will break some buggy TCP stacks. If the admin tells us
21315d99e02SRick Jones 	 * it is likely we could be speaking with such a buggy stack
21415d99e02SRick Jones 	 * we will truncate our initial window offering to 32K-1
21515d99e02SRick Jones 	 * unless the remote has sent us a window scaling option,
21615d99e02SRick Jones 	 * which we interpret as a sign the remote TCP is not
21715d99e02SRick Jones 	 * misinterpreting the window field as a signed quantity.
2181da177e4SLinus Torvalds 	 */
21915d99e02SRick Jones 	if (sysctl_tcp_workaround_signed_windows)
2201da177e4SLinus Torvalds 		(*rcv_wnd) = min(space, MAX_TCP_WINDOW);
22115d99e02SRick Jones 	else
22215d99e02SRick Jones 		(*rcv_wnd) = space;
22315d99e02SRick Jones 
2241da177e4SLinus Torvalds 	(*rcv_wscale) = 0;
2251da177e4SLinus Torvalds 	if (wscale_ok) {
2261da177e4SLinus Torvalds 		/* Set window scaling on max possible window
2271da177e4SLinus Torvalds 		 * See RFC1323 for an explanation of the limit to 14
2281da177e4SLinus Torvalds 		 */
2291da177e4SLinus Torvalds 		space = max_t(u32, sysctl_tcp_rmem[2], sysctl_rmem_max);
230316c1592SStephen Hemminger 		space = min_t(u32, space, *window_clamp);
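		/* For example, a 4 MB space needs rcv_wscale = 7, since
		 * 4 MB >> 7 = 32 KB fits in the 16-bit window field. */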
2311da177e4SLinus Torvalds 		while (space > 65535 && (*rcv_wscale) < 14) {
2321da177e4SLinus Torvalds 			space >>= 1;
2331da177e4SLinus Torvalds 			(*rcv_wscale)++;
2341da177e4SLinus Torvalds 		}
2351da177e4SLinus Torvalds 	}
2361da177e4SLinus Torvalds 
237356f0398SNandita Dukkipati 	/* Set the initial window to a value large enough for senders starting
238356f0398SNandita Dukkipati 	 * with an initial congestion window of TCP_DEFAULT_INIT_RCVWND. Place
239356f0398SNandita Dukkipati 	 * a limit on the initial window when mss is larger than 1460.
240356f0398SNandita Dukkipati 	 */
2411da177e4SLinus Torvalds 	if (mss > (1 << *rcv_wscale)) {
242356f0398SNandita Dukkipati 		int init_cwnd = TCP_DEFAULT_INIT_RCVWND;
243356f0398SNandita Dukkipati 		if (mss > 1460)
244356f0398SNandita Dukkipati 			init_cwnd =
245356f0398SNandita Dukkipati 			max_t(u32, (1460 * TCP_DEFAULT_INIT_RCVWND) / mss, 2);
24631d12926Slaurent chavey 		/* when initializing use the value from init_rcv_wnd
24731d12926Slaurent chavey 		 * rather than the default from above
24831d12926Slaurent chavey 		 */
249b1afde60SNandita Dukkipati 		if (init_rcv_wnd)
250b1afde60SNandita Dukkipati 			*rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);
251b1afde60SNandita Dukkipati 		else
252b1afde60SNandita Dukkipati 			*rcv_wnd = min(*rcv_wnd, init_cwnd * mss);
2531da177e4SLinus Torvalds 	}
2541da177e4SLinus Torvalds 
2551da177e4SLinus Torvalds 	/* Set the clamp no higher than max representable value */
2561da177e4SLinus Torvalds 	(*window_clamp) = min(65535U << (*rcv_wscale), *window_clamp);
2571da177e4SLinus Torvalds }
2584bc2f18bSEric Dumazet EXPORT_SYMBOL(tcp_select_initial_window);
2591da177e4SLinus Torvalds 
2601da177e4SLinus Torvalds /* Choose a new window to advertise, update state in tcp_sock for the
2611da177e4SLinus Torvalds  * socket, and return result with RFC1323 scaling applied.  The return
2621da177e4SLinus Torvalds  * value can be stuffed directly into th->window for an outgoing
2631da177e4SLinus Torvalds  * frame.
2641da177e4SLinus Torvalds  */
26540efc6faSStephen Hemminger static u16 tcp_select_window(struct sock *sk)
2661da177e4SLinus Torvalds {
2671da177e4SLinus Torvalds 	struct tcp_sock *tp = tcp_sk(sk);
2681da177e4SLinus Torvalds 	u32 cur_win = tcp_receive_window(tp);
2691da177e4SLinus Torvalds 	u32 new_win = __tcp_select_window(sk);
2701da177e4SLinus Torvalds 
2711da177e4SLinus Torvalds 	/* Never shrink the offered window */
2721da177e4SLinus Torvalds 	if (new_win < cur_win) {
2731da177e4SLinus Torvalds 		/* Danger Will Robinson!
2741da177e4SLinus Torvalds 		 * Don't update rcv_wup/rcv_wnd here or else
2751da177e4SLinus Torvalds 		 * we will not be able to advertise a zero
2761da177e4SLinus Torvalds 		 * window in time.  --DaveM
2771da177e4SLinus Torvalds 		 *
2781da177e4SLinus Torvalds 		 * Relax Will Robinson.
2791da177e4SLinus Torvalds 		 */
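		/* Round cur_win up to a multiple of the scale factor, so that
		 * the value actually advertised (after >> rcv_wscale) never
		 * shrinks the window we offered earlier. */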
280607bfbf2SPatrick McHardy 		new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);
2811da177e4SLinus Torvalds 	}
2821da177e4SLinus Torvalds 	tp->rcv_wnd = new_win;
2831da177e4SLinus Torvalds 	tp->rcv_wup = tp->rcv_nxt;
2841da177e4SLinus Torvalds 
2851da177e4SLinus Torvalds 	/* Make sure we do not exceed the maximum possible
2861da177e4SLinus Torvalds 	 * scaled window.
2871da177e4SLinus Torvalds 	 */
28815d99e02SRick Jones 	if (!tp->rx_opt.rcv_wscale && sysctl_tcp_workaround_signed_windows)
2891da177e4SLinus Torvalds 		new_win = min(new_win, MAX_TCP_WINDOW);
2901da177e4SLinus Torvalds 	else
2911da177e4SLinus Torvalds 		new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));
2921da177e4SLinus Torvalds 
2931da177e4SLinus Torvalds 	/* RFC1323 scaling applied */
2941da177e4SLinus Torvalds 	new_win >>= tp->rx_opt.rcv_wscale;
2951da177e4SLinus Torvalds 
2961da177e4SLinus Torvalds 	/* If we advertise zero window, disable fast path. */
2971da177e4SLinus Torvalds 	if (new_win == 0)
2981da177e4SLinus Torvalds 		tp->pred_flags = 0;
2991da177e4SLinus Torvalds 
3001da177e4SLinus Torvalds 	return new_win;
3011da177e4SLinus Torvalds }
3021da177e4SLinus Torvalds 
30367edfef7SAndi Kleen /* Packet ECN state for a SYN-ACK */
304cf533ea5SEric Dumazet static inline void TCP_ECN_send_synack(const struct tcp_sock *tp, struct sk_buff *skb)
305bdf1ee5dSIlpo Järvinen {
3064de075e0SEric Dumazet 	TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR;
307bdf1ee5dSIlpo Järvinen 	if (!(tp->ecn_flags & TCP_ECN_OK))
3084de075e0SEric Dumazet 		TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE;
309bdf1ee5dSIlpo Järvinen }
310bdf1ee5dSIlpo Järvinen 
31167edfef7SAndi Kleen /* Packet ECN state for a SYN.  */
312bdf1ee5dSIlpo Järvinen static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb)
313bdf1ee5dSIlpo Järvinen {
314bdf1ee5dSIlpo Järvinen 	struct tcp_sock *tp = tcp_sk(sk);
315bdf1ee5dSIlpo Järvinen 
316bdf1ee5dSIlpo Järvinen 	tp->ecn_flags = 0;
317255cac91SIlpo Järvinen 	if (sysctl_tcp_ecn == 1) {
3184de075e0SEric Dumazet 		TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
319bdf1ee5dSIlpo Järvinen 		tp->ecn_flags = TCP_ECN_OK;
320bdf1ee5dSIlpo Järvinen 	}
321bdf1ee5dSIlpo Järvinen }
322bdf1ee5dSIlpo Järvinen 
323bdf1ee5dSIlpo Järvinen static __inline__ void
324cf533ea5SEric Dumazet TCP_ECN_make_synack(const struct request_sock *req, struct tcphdr *th)
325bdf1ee5dSIlpo Järvinen {
326bdf1ee5dSIlpo Järvinen 	if (inet_rsk(req)->ecn_ok)
327bdf1ee5dSIlpo Järvinen 		th->ece = 1;
328bdf1ee5dSIlpo Järvinen }
329bdf1ee5dSIlpo Järvinen 
33067edfef7SAndi Kleen /* Set up ECN state for a packet on a ESTABLISHED socket that is about to
33167edfef7SAndi Kleen  * be sent.
33267edfef7SAndi Kleen  */
333bdf1ee5dSIlpo Järvinen static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb,
334bdf1ee5dSIlpo Järvinen 				int tcp_header_len)
335bdf1ee5dSIlpo Järvinen {
336bdf1ee5dSIlpo Järvinen 	struct tcp_sock *tp = tcp_sk(sk);
337bdf1ee5dSIlpo Järvinen 
338bdf1ee5dSIlpo Järvinen 	if (tp->ecn_flags & TCP_ECN_OK) {
339bdf1ee5dSIlpo Järvinen 		/* Not-retransmitted data segment: set ECT and inject CWR. */
340bdf1ee5dSIlpo Järvinen 		if (skb->len != tcp_header_len &&
341bdf1ee5dSIlpo Järvinen 		    !before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) {
342bdf1ee5dSIlpo Järvinen 			INET_ECN_xmit(sk);
343bdf1ee5dSIlpo Järvinen 			if (tp->ecn_flags & TCP_ECN_QUEUE_CWR) {
344bdf1ee5dSIlpo Järvinen 				tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
345bdf1ee5dSIlpo Järvinen 				tcp_hdr(skb)->cwr = 1;
346bdf1ee5dSIlpo Järvinen 				skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
347bdf1ee5dSIlpo Järvinen 			}
348bdf1ee5dSIlpo Järvinen 		} else {
349bdf1ee5dSIlpo Järvinen 			/* ACK or retransmitted segment: clear ECT|CE */
350bdf1ee5dSIlpo Järvinen 			INET_ECN_dontxmit(sk);
351bdf1ee5dSIlpo Järvinen 		}
352bdf1ee5dSIlpo Järvinen 		if (tp->ecn_flags & TCP_ECN_DEMAND_CWR)
353bdf1ee5dSIlpo Järvinen 			tcp_hdr(skb)->ece = 1;
354bdf1ee5dSIlpo Järvinen 	}
355bdf1ee5dSIlpo Järvinen }
356bdf1ee5dSIlpo Järvinen 
357e870a8efSIlpo Järvinen /* Construct the common control bits of a non-data skb. If SYN/FIN is present,
358e870a8efSIlpo Järvinen  * auto-increment the end seqno.
359e870a8efSIlpo Järvinen  */
360e870a8efSIlpo Järvinen static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
361e870a8efSIlpo Järvinen {
3622e8e18efSDavid S. Miller 	skb->ip_summed = CHECKSUM_PARTIAL;
363e870a8efSIlpo Järvinen 	skb->csum = 0;
364e870a8efSIlpo Järvinen 
3654de075e0SEric Dumazet 	TCP_SKB_CB(skb)->tcp_flags = flags;
366e870a8efSIlpo Järvinen 	TCP_SKB_CB(skb)->sacked = 0;
367e870a8efSIlpo Järvinen 
368e870a8efSIlpo Järvinen 	skb_shinfo(skb)->gso_segs = 1;
369e870a8efSIlpo Järvinen 	skb_shinfo(skb)->gso_size = 0;
370e870a8efSIlpo Järvinen 	skb_shinfo(skb)->gso_type = 0;
371e870a8efSIlpo Järvinen 
372e870a8efSIlpo Järvinen 	TCP_SKB_CB(skb)->seq = seq;
373a3433f35SChangli Gao 	if (flags & (TCPHDR_SYN | TCPHDR_FIN))
374e870a8efSIlpo Järvinen 		seq++;
375e870a8efSIlpo Järvinen 	TCP_SKB_CB(skb)->end_seq = seq;
376e870a8efSIlpo Järvinen }
377e870a8efSIlpo Järvinen 
378a2a385d6SEric Dumazet static inline bool tcp_urg_mode(const struct tcp_sock *tp)
37933f5f57eSIlpo Järvinen {
38033f5f57eSIlpo Järvinen 	return tp->snd_una != tp->snd_up;
38133f5f57eSIlpo Järvinen }
38233f5f57eSIlpo Järvinen 
38333ad798cSAdam Langley #define OPTION_SACK_ADVERTISE	(1 << 0)
38433ad798cSAdam Langley #define OPTION_TS		(1 << 1)
38533ad798cSAdam Langley #define OPTION_MD5		(1 << 2)
38689e95a61SOri Finkelman #define OPTION_WSCALE		(1 << 3)
387bd0388aeSWilliam Allen Simpson #define OPTION_COOKIE_EXTENSION	(1 << 4)
38833ad798cSAdam Langley 
38933ad798cSAdam Langley struct tcp_out_options {
39033ad798cSAdam Langley 	u8 options;		/* bit field of OPTION_* */
39133ad798cSAdam Langley 	u8 ws;			/* window scale, 0 to disable */
39233ad798cSAdam Langley 	u8 num_sack_blocks;	/* number of SACK blocks to include */
393bd0388aeSWilliam Allen Simpson 	u8 hash_size;		/* bytes in hash_location */
39433ad798cSAdam Langley 	u16 mss;		/* 0 to disable */
39533ad798cSAdam Langley 	__u32 tsval, tsecr;	/* need to include OPTION_TS */
396bd0388aeSWilliam Allen Simpson 	__u8 *hash_location;	/* temporary pointer, overloaded */
39733ad798cSAdam Langley };
39833ad798cSAdam Langley 
399bd0388aeSWilliam Allen Simpson /* The sysctl int routines are generic, so check consistency here.
400bd0388aeSWilliam Allen Simpson  */
401bd0388aeSWilliam Allen Simpson static u8 tcp_cookie_size_check(u8 desired)
402bd0388aeSWilliam Allen Simpson {
403f1987257SEric Dumazet 	int cookie_size;
404f1987257SEric Dumazet 
405f1987257SEric Dumazet 	if (desired > 0)
406bd0388aeSWilliam Allen Simpson 		/* previously specified */
407bd0388aeSWilliam Allen Simpson 		return desired;
408f1987257SEric Dumazet 
409f1987257SEric Dumazet 	cookie_size = ACCESS_ONCE(sysctl_tcp_cookie_size);
410f1987257SEric Dumazet 	if (cookie_size <= 0)
411bd0388aeSWilliam Allen Simpson 		/* no default specified */
412bd0388aeSWilliam Allen Simpson 		return 0;
413f1987257SEric Dumazet 
414f1987257SEric Dumazet 	if (cookie_size <= TCP_COOKIE_MIN)
415bd0388aeSWilliam Allen Simpson 		/* value too small, specify minimum */
416bd0388aeSWilliam Allen Simpson 		return TCP_COOKIE_MIN;
417f1987257SEric Dumazet 
418f1987257SEric Dumazet 	if (cookie_size >= TCP_COOKIE_MAX)
419bd0388aeSWilliam Allen Simpson 		/* value too large, specify maximum */
420bd0388aeSWilliam Allen Simpson 		return TCP_COOKIE_MAX;
421f1987257SEric Dumazet 
422f1987257SEric Dumazet 	if (cookie_size & 1)
423bd0388aeSWilliam Allen Simpson 		/* 8-bit multiple, illegal, fix it */
424f1987257SEric Dumazet 		cookie_size++;
425f1987257SEric Dumazet 
426f1987257SEric Dumazet 	return (u8)cookie_size;
427bd0388aeSWilliam Allen Simpson }
428bd0388aeSWilliam Allen Simpson 
42967edfef7SAndi Kleen /* Write previously computed TCP options to the packet.
43067edfef7SAndi Kleen  *
43167edfef7SAndi Kleen  * Beware: Something in the Internet is very sensitive to the ordering of
432fd6149d3SIlpo Järvinen  * TCP options; we learned this the hard way, so be careful here.
433fd6149d3SIlpo Järvinen  * Luckily we can at least blame others for their non-compliance, but from
434fd6149d3SIlpo Järvinen  * an interoperability perspective it seems that we're somewhat stuck with
435fd6149d3SIlpo Järvinen  * the ordering which we have been using if we want to keep working with
436fd6149d3SIlpo Järvinen  * those broken things (not that it currently hurts anybody as there isn't
437fd6149d3SIlpo Järvinen  * a particular reason why the ordering would need to be changed).
438fd6149d3SIlpo Järvinen  *
439fd6149d3SIlpo Järvinen  * At least SACK_PERM as the first option is known to lead to a disaster
440fd6149d3SIlpo Järvinen  * (but it may well be that other scenarios fail similarly).
441fd6149d3SIlpo Järvinen  */
44233ad798cSAdam Langley static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
443bd0388aeSWilliam Allen Simpson 			      struct tcp_out_options *opts)
444bd0388aeSWilliam Allen Simpson {
445bd0388aeSWilliam Allen Simpson 	u8 options = opts->options;	/* mungable copy */
446bd0388aeSWilliam Allen Simpson 
447bd0388aeSWilliam Allen Simpson 	/* Having both authentication and cookies for security is redundant,
448bd0388aeSWilliam Allen Simpson 	 * and there's certainly not enough room.  Instead, the cookie-less
449bd0388aeSWilliam Allen Simpson 	 * extension variant is proposed.
450bd0388aeSWilliam Allen Simpson 	 *
451bd0388aeSWilliam Allen Simpson 	 * Consider the pessimal case with authentication.  The options
452bd0388aeSWilliam Allen Simpson 	 * could look like:
453bd0388aeSWilliam Allen Simpson 	 *   COOKIE|MD5(20) + MSS(4) + SACK|TS(12) + WSCALE(4) == 40
454bd0388aeSWilliam Allen Simpson 	 */
455bd0388aeSWilliam Allen Simpson 	if (unlikely(OPTION_MD5 & options)) {
456bd0388aeSWilliam Allen Simpson 		if (unlikely(OPTION_COOKIE_EXTENSION & options)) {
457bd0388aeSWilliam Allen Simpson 			*ptr++ = htonl((TCPOPT_COOKIE << 24) |
458bd0388aeSWilliam Allen Simpson 				       (TCPOLEN_COOKIE_BASE << 16) |
459bd0388aeSWilliam Allen Simpson 				       (TCPOPT_MD5SIG << 8) |
460bd0388aeSWilliam Allen Simpson 				       TCPOLEN_MD5SIG);
461bd0388aeSWilliam Allen Simpson 		} else {
46233ad798cSAdam Langley 			*ptr++ = htonl((TCPOPT_NOP << 24) |
46333ad798cSAdam Langley 				       (TCPOPT_NOP << 16) |
46433ad798cSAdam Langley 				       (TCPOPT_MD5SIG << 8) |
46533ad798cSAdam Langley 				       TCPOLEN_MD5SIG);
466bd0388aeSWilliam Allen Simpson 		}
467bd0388aeSWilliam Allen Simpson 		options &= ~OPTION_COOKIE_EXTENSION;
468bd0388aeSWilliam Allen Simpson 		/* overload cookie hash location */
469bd0388aeSWilliam Allen Simpson 		opts->hash_location = (__u8 *)ptr;
47033ad798cSAdam Langley 		ptr += 4;
47133ad798cSAdam Langley 	}
47233ad798cSAdam Langley 
473fd6149d3SIlpo Järvinen 	if (unlikely(opts->mss)) {
474fd6149d3SIlpo Järvinen 		*ptr++ = htonl((TCPOPT_MSS << 24) |
475fd6149d3SIlpo Järvinen 			       (TCPOLEN_MSS << 16) |
476fd6149d3SIlpo Järvinen 			       opts->mss);
477fd6149d3SIlpo Järvinen 	}
478fd6149d3SIlpo Järvinen 
479bd0388aeSWilliam Allen Simpson 	if (likely(OPTION_TS & options)) {
480bd0388aeSWilliam Allen Simpson 		if (unlikely(OPTION_SACK_ADVERTISE & options)) {
48133ad798cSAdam Langley 			*ptr++ = htonl((TCPOPT_SACK_PERM << 24) |
48233ad798cSAdam Langley 				       (TCPOLEN_SACK_PERM << 16) |
48333ad798cSAdam Langley 				       (TCPOPT_TIMESTAMP << 8) |
48433ad798cSAdam Langley 				       TCPOLEN_TIMESTAMP);
485bd0388aeSWilliam Allen Simpson 			options &= ~OPTION_SACK_ADVERTISE;
48633ad798cSAdam Langley 		} else {
487496c98dfSYOSHIFUJI Hideaki 			*ptr++ = htonl((TCPOPT_NOP << 24) |
48840efc6faSStephen Hemminger 				       (TCPOPT_NOP << 16) |
48940efc6faSStephen Hemminger 				       (TCPOPT_TIMESTAMP << 8) |
49040efc6faSStephen Hemminger 				       TCPOLEN_TIMESTAMP);
49140efc6faSStephen Hemminger 		}
49233ad798cSAdam Langley 		*ptr++ = htonl(opts->tsval);
49333ad798cSAdam Langley 		*ptr++ = htonl(opts->tsecr);
49433ad798cSAdam Langley 	}
49533ad798cSAdam Langley 
496bd0388aeSWilliam Allen Simpson 	/* The specification requires the cookie after the timestamp, so do it now.
497bd0388aeSWilliam Allen Simpson 	 *
498bd0388aeSWilliam Allen Simpson 	 * Consider the pessimal case without authentication.  The options
499bd0388aeSWilliam Allen Simpson 	 * could look like:
500bd0388aeSWilliam Allen Simpson 	 *   MSS(4) + SACK|TS(12) + COOKIE(20) + WSCALE(4) == 40
501bd0388aeSWilliam Allen Simpson 	 */
502bd0388aeSWilliam Allen Simpson 	if (unlikely(OPTION_COOKIE_EXTENSION & options)) {
503bd0388aeSWilliam Allen Simpson 		__u8 *cookie_copy = opts->hash_location;
504bd0388aeSWilliam Allen Simpson 		u8 cookie_size = opts->hash_size;
505bd0388aeSWilliam Allen Simpson 
506bd0388aeSWilliam Allen Simpson 		/* 8-bit multiple handled in tcp_cookie_size_check() above,
507bd0388aeSWilliam Allen Simpson 		 * and elsewhere.
508bd0388aeSWilliam Allen Simpson 		 */
509bd0388aeSWilliam Allen Simpson 		if (0x2 & cookie_size) {
510bd0388aeSWilliam Allen Simpson 			__u8 *p = (__u8 *)ptr;
511bd0388aeSWilliam Allen Simpson 
512bd0388aeSWilliam Allen Simpson 			/* 16-bit multiple */
513bd0388aeSWilliam Allen Simpson 			*p++ = TCPOPT_COOKIE;
514bd0388aeSWilliam Allen Simpson 			*p++ = TCPOLEN_COOKIE_BASE + cookie_size;
515bd0388aeSWilliam Allen Simpson 			*p++ = *cookie_copy++;
516bd0388aeSWilliam Allen Simpson 			*p++ = *cookie_copy++;
517bd0388aeSWilliam Allen Simpson 			ptr++;
518bd0388aeSWilliam Allen Simpson 			cookie_size -= 2;
519bd0388aeSWilliam Allen Simpson 		} else {
520bd0388aeSWilliam Allen Simpson 			/* 32-bit multiple */
521bd0388aeSWilliam Allen Simpson 			*ptr++ = htonl(((TCPOPT_NOP << 24) |
522bd0388aeSWilliam Allen Simpson 					(TCPOPT_NOP << 16) |
523bd0388aeSWilliam Allen Simpson 					(TCPOPT_COOKIE << 8) |
524bd0388aeSWilliam Allen Simpson 					TCPOLEN_COOKIE_BASE) +
525bd0388aeSWilliam Allen Simpson 				       cookie_size);
526bd0388aeSWilliam Allen Simpson 		}
527bd0388aeSWilliam Allen Simpson 
528bd0388aeSWilliam Allen Simpson 		if (cookie_size > 0) {
529bd0388aeSWilliam Allen Simpson 			memcpy(ptr, cookie_copy, cookie_size);
530bd0388aeSWilliam Allen Simpson 			ptr += (cookie_size / 4);
531bd0388aeSWilliam Allen Simpson 		}
532bd0388aeSWilliam Allen Simpson 	}
533bd0388aeSWilliam Allen Simpson 
534bd0388aeSWilliam Allen Simpson 	if (unlikely(OPTION_SACK_ADVERTISE & options)) {
53533ad798cSAdam Langley 		*ptr++ = htonl((TCPOPT_NOP << 24) |
53633ad798cSAdam Langley 			       (TCPOPT_NOP << 16) |
53733ad798cSAdam Langley 			       (TCPOPT_SACK_PERM << 8) |
53833ad798cSAdam Langley 			       TCPOLEN_SACK_PERM);
53933ad798cSAdam Langley 	}
54033ad798cSAdam Langley 
541bd0388aeSWilliam Allen Simpson 	if (unlikely(OPTION_WSCALE & options)) {
54233ad798cSAdam Langley 		*ptr++ = htonl((TCPOPT_NOP << 24) |
54333ad798cSAdam Langley 			       (TCPOPT_WINDOW << 16) |
54433ad798cSAdam Langley 			       (TCPOLEN_WINDOW << 8) |
54533ad798cSAdam Langley 			       opts->ws);
54633ad798cSAdam Langley 	}
54733ad798cSAdam Langley 
54833ad798cSAdam Langley 	if (unlikely(opts->num_sack_blocks)) {
54933ad798cSAdam Langley 		struct tcp_sack_block *sp = tp->rx_opt.dsack ?
55033ad798cSAdam Langley 			tp->duplicate_sack : tp->selective_acks;
55140efc6faSStephen Hemminger 		int this_sack;
55240efc6faSStephen Hemminger 
55340efc6faSStephen Hemminger 		*ptr++ = htonl((TCPOPT_NOP  << 24) |
55440efc6faSStephen Hemminger 			       (TCPOPT_NOP  << 16) |
55540efc6faSStephen Hemminger 			       (TCPOPT_SACK <<  8) |
55633ad798cSAdam Langley 			       (TCPOLEN_SACK_BASE + (opts->num_sack_blocks *
55740efc6faSStephen Hemminger 						     TCPOLEN_SACK_PERBLOCK)));
5582de979bdSStephen Hemminger 
55933ad798cSAdam Langley 		for (this_sack = 0; this_sack < opts->num_sack_blocks;
56033ad798cSAdam Langley 		     ++this_sack) {
56140efc6faSStephen Hemminger 			*ptr++ = htonl(sp[this_sack].start_seq);
56240efc6faSStephen Hemminger 			*ptr++ = htonl(sp[this_sack].end_seq);
56340efc6faSStephen Hemminger 		}
5642de979bdSStephen Hemminger 
56540efc6faSStephen Hemminger 		tp->rx_opt.dsack = 0;
56640efc6faSStephen Hemminger 	}
56740efc6faSStephen Hemminger }
56840efc6faSStephen Hemminger 
56967edfef7SAndi Kleen /* Compute TCP options for SYN packets. This is not the final
57067edfef7SAndi Kleen  * network wire format yet.
57167edfef7SAndi Kleen  */
57295c96174SEric Dumazet static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
57333ad798cSAdam Langley 				struct tcp_out_options *opts,
574cf533ea5SEric Dumazet 				struct tcp_md5sig_key **md5)
575cf533ea5SEric Dumazet {
57633ad798cSAdam Langley 	struct tcp_sock *tp = tcp_sk(sk);
577bd0388aeSWilliam Allen Simpson 	struct tcp_cookie_values *cvp = tp->cookie_values;
57895c96174SEric Dumazet 	unsigned int remaining = MAX_TCP_OPTION_SPACE;
579bd0388aeSWilliam Allen Simpson 	u8 cookie_size = (!tp->rx_opt.cookie_out_never && cvp != NULL) ?
580bd0388aeSWilliam Allen Simpson 			 tcp_cookie_size_check(cvp->cookie_desired) :
581bd0388aeSWilliam Allen Simpson 			 0;
58233ad798cSAdam Langley 
583cfb6eeb4SYOSHIFUJI Hideaki #ifdef CONFIG_TCP_MD5SIG
58433ad798cSAdam Langley 	*md5 = tp->af_specific->md5_lookup(sk, sk);
58533ad798cSAdam Langley 	if (*md5) {
58633ad798cSAdam Langley 		opts->options |= OPTION_MD5;
587bd0388aeSWilliam Allen Simpson 		remaining -= TCPOLEN_MD5SIG_ALIGNED;
588cfb6eeb4SYOSHIFUJI Hideaki 	}
58933ad798cSAdam Langley #else
59033ad798cSAdam Langley 	*md5 = NULL;
591cfb6eeb4SYOSHIFUJI Hideaki #endif
59233ad798cSAdam Langley 
59333ad798cSAdam Langley 	/* We always get an MSS option.  The option bytes which will be seen in
59433ad798cSAdam Langley 	 * normal data packets, should timestamps be used, must be in the MSS
59533ad798cSAdam Langley 	 * advertised.  But we subtract them from tp->mss_cache so that
59633ad798cSAdam Langley 	 * calculations in tcp_sendmsg are simpler etc.  So account for this
59733ad798cSAdam Langley 	 * fact here if necessary.  If we don't do this correctly, as a
59833ad798cSAdam Langley 	 * receiver we won't recognize data packets as being full sized when we
59933ad798cSAdam Langley 	 * should, and thus we won't abide by the delayed ACK rules correctly.
60033ad798cSAdam Langley 	 * SACKs don't matter, we never delay an ACK when we have any of those
60133ad798cSAdam Langley 	 * going out.  */
60233ad798cSAdam Langley 	opts->mss = tcp_advertise_mss(sk);
603bd0388aeSWilliam Allen Simpson 	remaining -= TCPOLEN_MSS_ALIGNED;
60433ad798cSAdam Langley 
605bb5b7c11SDavid S. Miller 	if (likely(sysctl_tcp_timestamps && *md5 == NULL)) {
60633ad798cSAdam Langley 		opts->options |= OPTION_TS;
60733ad798cSAdam Langley 		opts->tsval = TCP_SKB_CB(skb)->when;
60833ad798cSAdam Langley 		opts->tsecr = tp->rx_opt.ts_recent;
609bd0388aeSWilliam Allen Simpson 		remaining -= TCPOLEN_TSTAMP_ALIGNED;
61033ad798cSAdam Langley 	}
611bb5b7c11SDavid S. Miller 	if (likely(sysctl_tcp_window_scaling)) {
61233ad798cSAdam Langley 		opts->ws = tp->rx_opt.rcv_wscale;
61389e95a61SOri Finkelman 		opts->options |= OPTION_WSCALE;
614bd0388aeSWilliam Allen Simpson 		remaining -= TCPOLEN_WSCALE_ALIGNED;
61533ad798cSAdam Langley 	}
616bb5b7c11SDavid S. Miller 	if (likely(sysctl_tcp_sack)) {
61733ad798cSAdam Langley 		opts->options |= OPTION_SACK_ADVERTISE;
618b32d1310SDavid S. Miller 		if (unlikely(!(OPTION_TS & opts->options)))
619bd0388aeSWilliam Allen Simpson 			remaining -= TCPOLEN_SACKPERM_ALIGNED;
62033ad798cSAdam Langley 	}
62133ad798cSAdam Langley 
622bd0388aeSWilliam Allen Simpson 	/* Note that timestamps are required by the specification.
623bd0388aeSWilliam Allen Simpson 	 *
624bd0388aeSWilliam Allen Simpson 	 * Odd numbers of bytes are prohibited by the specification, ensuring
625bd0388aeSWilliam Allen Simpson 	 * that the cookie is 16-bit aligned, and the resulting cookie pair is
626bd0388aeSWilliam Allen Simpson 	 * 32-bit aligned.
627bd0388aeSWilliam Allen Simpson 	 */
628bd0388aeSWilliam Allen Simpson 	if (*md5 == NULL &&
629bd0388aeSWilliam Allen Simpson 	    (OPTION_TS & opts->options) &&
630bd0388aeSWilliam Allen Simpson 	    cookie_size > 0) {
631bd0388aeSWilliam Allen Simpson 		int need = TCPOLEN_COOKIE_BASE + cookie_size;
632bd0388aeSWilliam Allen Simpson 
633bd0388aeSWilliam Allen Simpson 		if (0x2 & need) {
634bd0388aeSWilliam Allen Simpson 			/* 32-bit multiple */
635bd0388aeSWilliam Allen Simpson 			need += 2; /* NOPs */
636bd0388aeSWilliam Allen Simpson 
637bd0388aeSWilliam Allen Simpson 			if (need > remaining) {
638bd0388aeSWilliam Allen Simpson 				/* try shrinking cookie to fit */
639bd0388aeSWilliam Allen Simpson 				cookie_size -= 2;
640bd0388aeSWilliam Allen Simpson 				need -= 4;
641bd0388aeSWilliam Allen Simpson 			}
642bd0388aeSWilliam Allen Simpson 		}
643bd0388aeSWilliam Allen Simpson 		while (need > remaining && TCP_COOKIE_MIN <= cookie_size) {
644bd0388aeSWilliam Allen Simpson 			cookie_size -= 4;
645bd0388aeSWilliam Allen Simpson 			need -= 4;
646bd0388aeSWilliam Allen Simpson 		}
647bd0388aeSWilliam Allen Simpson 		if (TCP_COOKIE_MIN <= cookie_size) {
648bd0388aeSWilliam Allen Simpson 			opts->options |= OPTION_COOKIE_EXTENSION;
649bd0388aeSWilliam Allen Simpson 			opts->hash_location = (__u8 *)&cvp->cookie_pair[0];
650bd0388aeSWilliam Allen Simpson 			opts->hash_size = cookie_size;
651bd0388aeSWilliam Allen Simpson 
652bd0388aeSWilliam Allen Simpson 			/* Remember for future incarnations. */
653bd0388aeSWilliam Allen Simpson 			cvp->cookie_desired = cookie_size;
654bd0388aeSWilliam Allen Simpson 
655bd0388aeSWilliam Allen Simpson 			if (cvp->cookie_desired != cvp->cookie_pair_size) {
656bd0388aeSWilliam Allen Simpson 				/* Currently use random bytes as a nonce,
657bd0388aeSWilliam Allen Simpson 				 * assuming these are completely unpredictable
658bd0388aeSWilliam Allen Simpson 				 * by hostile users of the same system.
659bd0388aeSWilliam Allen Simpson 				 */
660bd0388aeSWilliam Allen Simpson 				get_random_bytes(&cvp->cookie_pair[0],
661bd0388aeSWilliam Allen Simpson 						 cookie_size);
662bd0388aeSWilliam Allen Simpson 				cvp->cookie_pair_size = cookie_size;
663bd0388aeSWilliam Allen Simpson 			}
664bd0388aeSWilliam Allen Simpson 
665bd0388aeSWilliam Allen Simpson 			remaining -= need;
666bd0388aeSWilliam Allen Simpson 		}
667bd0388aeSWilliam Allen Simpson 	}
668bd0388aeSWilliam Allen Simpson 	return MAX_TCP_OPTION_SPACE - remaining;
66933ad798cSAdam Langley }
67033ad798cSAdam Langley 
67167edfef7SAndi Kleen /* Set up TCP options for SYN-ACKs. */
67295c96174SEric Dumazet static unsigned int tcp_synack_options(struct sock *sk,
67333ad798cSAdam Langley 				   struct request_sock *req,
67495c96174SEric Dumazet 				   unsigned int mss, struct sk_buff *skb,
67533ad798cSAdam Langley 				   struct tcp_out_options *opts,
6764957faadSWilliam Allen Simpson 				   struct tcp_md5sig_key **md5,
6774957faadSWilliam Allen Simpson 				   struct tcp_extend_values *xvp)
6784957faadSWilliam Allen Simpson {
67933ad798cSAdam Langley 	struct inet_request_sock *ireq = inet_rsk(req);
68095c96174SEric Dumazet 	unsigned int remaining = MAX_TCP_OPTION_SPACE;
6814957faadSWilliam Allen Simpson 	u8 cookie_plus = (xvp != NULL && !xvp->cookie_out_never) ?
6824957faadSWilliam Allen Simpson 			 xvp->cookie_plus :
6834957faadSWilliam Allen Simpson 			 0;
68433ad798cSAdam Langley 
68533ad798cSAdam Langley #ifdef CONFIG_TCP_MD5SIG
68633ad798cSAdam Langley 	*md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req);
68733ad798cSAdam Langley 	if (*md5) {
68833ad798cSAdam Langley 		opts->options |= OPTION_MD5;
6894957faadSWilliam Allen Simpson 		remaining -= TCPOLEN_MD5SIG_ALIGNED;
6904957faadSWilliam Allen Simpson 
6914957faadSWilliam Allen Simpson 		/* We can't fit any SACK blocks in a packet with MD5 + TS
6924957faadSWilliam Allen Simpson 		 * options. There was discussion about disabling SACK
6934957faadSWilliam Allen Simpson 		 * rather than TS in order to fit in better with old,
6944957faadSWilliam Allen Simpson 		 * buggy kernels, but that was deemed to be unnecessary.
6954957faadSWilliam Allen Simpson 		 */
696de213e5eSEric Dumazet 		ireq->tstamp_ok &= !ireq->sack_ok;
69733ad798cSAdam Langley 	}
69833ad798cSAdam Langley #else
69933ad798cSAdam Langley 	*md5 = NULL;
70033ad798cSAdam Langley #endif
70133ad798cSAdam Langley 
7024957faadSWilliam Allen Simpson 	/* We always send an MSS option. */
70333ad798cSAdam Langley 	opts->mss = mss;
7044957faadSWilliam Allen Simpson 	remaining -= TCPOLEN_MSS_ALIGNED;
70533ad798cSAdam Langley 
70633ad798cSAdam Langley 	if (likely(ireq->wscale_ok)) {
70733ad798cSAdam Langley 		opts->ws = ireq->rcv_wscale;
70889e95a61SOri Finkelman 		opts->options |= OPTION_WSCALE;
7094957faadSWilliam Allen Simpson 		remaining -= TCPOLEN_WSCALE_ALIGNED;
71033ad798cSAdam Langley 	}
711de213e5eSEric Dumazet 	if (likely(ireq->tstamp_ok)) {
71233ad798cSAdam Langley 		opts->options |= OPTION_TS;
71333ad798cSAdam Langley 		opts->tsval = TCP_SKB_CB(skb)->when;
71433ad798cSAdam Langley 		opts->tsecr = req->ts_recent;
7154957faadSWilliam Allen Simpson 		remaining -= TCPOLEN_TSTAMP_ALIGNED;
71633ad798cSAdam Langley 	}
71733ad798cSAdam Langley 	if (likely(ireq->sack_ok)) {
71833ad798cSAdam Langley 		opts->options |= OPTION_SACK_ADVERTISE;
719de213e5eSEric Dumazet 		if (unlikely(!ireq->tstamp_ok))
7204957faadSWilliam Allen Simpson 			remaining -= TCPOLEN_SACKPERM_ALIGNED;
72133ad798cSAdam Langley 	}
72233ad798cSAdam Langley 
7234957faadSWilliam Allen Simpson 	/* Similar rationale to tcp_syn_options() applies here, too.
7244957faadSWilliam Allen Simpson 	 * If the <SYN> options fit, the same options should fit now!
7254957faadSWilliam Allen Simpson 	 */
7264957faadSWilliam Allen Simpson 	if (*md5 == NULL &&
727de213e5eSEric Dumazet 	    ireq->tstamp_ok &&
7284957faadSWilliam Allen Simpson 	    cookie_plus > TCPOLEN_COOKIE_BASE) {
7294957faadSWilliam Allen Simpson 		int need = cookie_plus; /* has TCPOLEN_COOKIE_BASE */
7304957faadSWilliam Allen Simpson 
7314957faadSWilliam Allen Simpson 		if (0x2 & need) {
7324957faadSWilliam Allen Simpson 			/* 32-bit multiple */
7334957faadSWilliam Allen Simpson 			need += 2; /* NOPs */
7344957faadSWilliam Allen Simpson 		}
7354957faadSWilliam Allen Simpson 		if (need <= remaining) {
7364957faadSWilliam Allen Simpson 			opts->options |= OPTION_COOKIE_EXTENSION;
7374957faadSWilliam Allen Simpson 			opts->hash_size = cookie_plus - TCPOLEN_COOKIE_BASE;
7384957faadSWilliam Allen Simpson 			remaining -= need;
7394957faadSWilliam Allen Simpson 		} else {
7404957faadSWilliam Allen Simpson 			/* There's no error return, so flag it. */
7414957faadSWilliam Allen Simpson 			xvp->cookie_out_never = 1; /* true */
7424957faadSWilliam Allen Simpson 			opts->hash_size = 0;
7434957faadSWilliam Allen Simpson 		}
7444957faadSWilliam Allen Simpson 	}
7454957faadSWilliam Allen Simpson 	return MAX_TCP_OPTION_SPACE - remaining;
74633ad798cSAdam Langley }
74733ad798cSAdam Langley 
74867edfef7SAndi Kleen /* Compute TCP options for ESTABLISHED sockets. This is not the
74967edfef7SAndi Kleen  * final wire format yet.
75067edfef7SAndi Kleen  */
75195c96174SEric Dumazet static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb,
75233ad798cSAdam Langley 					struct tcp_out_options *opts,
753cf533ea5SEric Dumazet 					struct tcp_md5sig_key **md5)
754cf533ea5SEric Dumazet {
75533ad798cSAdam Langley 	struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL;
75633ad798cSAdam Langley 	struct tcp_sock *tp = tcp_sk(sk);
75795c96174SEric Dumazet 	unsigned int size = 0;
758cabeccbdSIlpo Järvinen 	unsigned int eff_sacks;
75933ad798cSAdam Langley 
76033ad798cSAdam Langley #ifdef CONFIG_TCP_MD5SIG
76133ad798cSAdam Langley 	*md5 = tp->af_specific->md5_lookup(sk, sk);
76233ad798cSAdam Langley 	if (unlikely(*md5)) {
76333ad798cSAdam Langley 		opts->options |= OPTION_MD5;
76433ad798cSAdam Langley 		size += TCPOLEN_MD5SIG_ALIGNED;
76533ad798cSAdam Langley 	}
76633ad798cSAdam Langley #else
76733ad798cSAdam Langley 	*md5 = NULL;
76833ad798cSAdam Langley #endif
76933ad798cSAdam Langley 
77033ad798cSAdam Langley 	if (likely(tp->rx_opt.tstamp_ok)) {
77133ad798cSAdam Langley 		opts->options |= OPTION_TS;
77233ad798cSAdam Langley 		opts->tsval = tcb ? tcb->when : 0;
77333ad798cSAdam Langley 		opts->tsecr = tp->rx_opt.ts_recent;
77433ad798cSAdam Langley 		size += TCPOLEN_TSTAMP_ALIGNED;
77533ad798cSAdam Langley 	}
77633ad798cSAdam Langley 
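	/* e.g. with timestamps in use (12 bytes), 40 - 12 = 28 option bytes
	 * remain, enough for (28 - 4) / 8 = 3 SACK blocks at most. */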
777cabeccbdSIlpo Järvinen 	eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
778cabeccbdSIlpo Järvinen 	if (unlikely(eff_sacks)) {
77995c96174SEric Dumazet 		const unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
78033ad798cSAdam Langley 		opts->num_sack_blocks =
78195c96174SEric Dumazet 			min_t(unsigned int, eff_sacks,
78233ad798cSAdam Langley 			      (remaining - TCPOLEN_SACK_BASE_ALIGNED) /
78333ad798cSAdam Langley 			      TCPOLEN_SACK_PERBLOCK);
78433ad798cSAdam Langley 		size += TCPOLEN_SACK_BASE_ALIGNED +
78533ad798cSAdam Langley 			opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
78633ad798cSAdam Langley 	}
78733ad798cSAdam Langley 
78833ad798cSAdam Langley 	return size;
78940efc6faSStephen Hemminger }
7901da177e4SLinus Torvalds 
791*46d3ceabSEric Dumazet 
792*46d3ceabSEric Dumazet /* TCP SMALL QUEUES (TSQ)
793*46d3ceabSEric Dumazet  *
794*46d3ceabSEric Dumazet  * TSQ's goal is to keep a small number of skbs per tcp flow in tx queues (qdisc+dev)
795*46d3ceabSEric Dumazet  * to reduce RTT and bufferbloat.
796*46d3ceabSEric Dumazet  * We do this using a special skb destructor (tcp_wfree).
797*46d3ceabSEric Dumazet  *
798*46d3ceabSEric Dumazet  * It's important that tcp_wfree() can be replaced by sock_wfree() in the event the skb
799*46d3ceabSEric Dumazet  * needs to be reallocated in a driver.
800*46d3ceabSEric Dumazet  * The invariant being that skb->truesize is subtracted from sk->sk_wmem_alloc.
801*46d3ceabSEric Dumazet  *
802*46d3ceabSEric Dumazet  * Since transmitting from an skb destructor is forbidden, we use a tasklet
803*46d3ceabSEric Dumazet  * to process all sockets that eventually need to send more skbs.
804*46d3ceabSEric Dumazet  * We use one tasklet per cpu, with its own queue of sockets.
805*46d3ceabSEric Dumazet  */
806*46d3ceabSEric Dumazet struct tsq_tasklet {
807*46d3ceabSEric Dumazet 	struct tasklet_struct	tasklet;
808*46d3ceabSEric Dumazet 	struct list_head	head; /* queue of tcp sockets */
809*46d3ceabSEric Dumazet };
810*46d3ceabSEric Dumazet static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet);
811*46d3ceabSEric Dumazet 
812*46d3ceabSEric Dumazet /*
813*46d3ceabSEric Dumazet  * One tasklet per cpu tries to send more skbs.
814*46d3ceabSEric Dumazet  * We run in tasklet context but need to disable irqs when
815*46d3ceabSEric Dumazet  * transferring tsq->head because tcp_wfree() might
816*46d3ceabSEric Dumazet  * interrupt us (non-NAPI drivers).
817*46d3ceabSEric Dumazet  */
818*46d3ceabSEric Dumazet static void tcp_tasklet_func(unsigned long data)
819*46d3ceabSEric Dumazet {
820*46d3ceabSEric Dumazet 	struct tsq_tasklet *tsq = (struct tsq_tasklet *)data;
821*46d3ceabSEric Dumazet 	LIST_HEAD(list);
822*46d3ceabSEric Dumazet 	unsigned long flags;
823*46d3ceabSEric Dumazet 	struct list_head *q, *n;
824*46d3ceabSEric Dumazet 	struct tcp_sock *tp;
825*46d3ceabSEric Dumazet 	struct sock *sk;
826*46d3ceabSEric Dumazet 
827*46d3ceabSEric Dumazet 	local_irq_save(flags);
828*46d3ceabSEric Dumazet 	list_splice_init(&tsq->head, &list);
829*46d3ceabSEric Dumazet 	local_irq_restore(flags);
830*46d3ceabSEric Dumazet 
831*46d3ceabSEric Dumazet 	list_for_each_safe(q, n, &list) {
832*46d3ceabSEric Dumazet 		tp = list_entry(q, struct tcp_sock, tsq_node);
833*46d3ceabSEric Dumazet 		list_del(&tp->tsq_node);
834*46d3ceabSEric Dumazet 
835*46d3ceabSEric Dumazet 		sk = (struct sock *)tp;
836*46d3ceabSEric Dumazet 		bh_lock_sock(sk);
837*46d3ceabSEric Dumazet 
838*46d3ceabSEric Dumazet 		if (!sock_owned_by_user(sk)) {
839*46d3ceabSEric Dumazet 			if ((1 << sk->sk_state) &
840*46d3ceabSEric Dumazet 			    (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 |
841*46d3ceabSEric Dumazet 			     TCPF_CLOSING | TCPF_CLOSE_WAIT))
842*46d3ceabSEric Dumazet 				tcp_write_xmit(sk,
843*46d3ceabSEric Dumazet 					       tcp_current_mss(sk),
844*46d3ceabSEric Dumazet 					       0, 0,
845*46d3ceabSEric Dumazet 					       GFP_ATOMIC);
846*46d3ceabSEric Dumazet 		} else {
847*46d3ceabSEric Dumazet 			/* defer the work to tcp_release_cb() */
848*46d3ceabSEric Dumazet 			set_bit(TSQ_OWNED, &tp->tsq_flags);
849*46d3ceabSEric Dumazet 		}
850*46d3ceabSEric Dumazet 		bh_unlock_sock(sk);
851*46d3ceabSEric Dumazet 
852*46d3ceabSEric Dumazet 		clear_bit(TSQ_QUEUED, &tp->tsq_flags);
853*46d3ceabSEric Dumazet 		sk_free(sk);
854*46d3ceabSEric Dumazet 	}
855*46d3ceabSEric Dumazet }
856*46d3ceabSEric Dumazet 
857*46d3ceabSEric Dumazet /**
858*46d3ceabSEric Dumazet  * tcp_release_cb - tcp release_sock() callback
859*46d3ceabSEric Dumazet  * @sk: socket
860*46d3ceabSEric Dumazet  *
861*46d3ceabSEric Dumazet  * called from release_sock() to perform protocol dependent
862*46d3ceabSEric Dumazet  * actions before socket release.
863*46d3ceabSEric Dumazet  */
864*46d3ceabSEric Dumazet void tcp_release_cb(struct sock *sk)
865*46d3ceabSEric Dumazet {
866*46d3ceabSEric Dumazet 	struct tcp_sock *tp = tcp_sk(sk);
867*46d3ceabSEric Dumazet 
868*46d3ceabSEric Dumazet 	if (test_and_clear_bit(TSQ_OWNED, &tp->tsq_flags)) {
869*46d3ceabSEric Dumazet 		if ((1 << sk->sk_state) &
870*46d3ceabSEric Dumazet 		    (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 |
871*46d3ceabSEric Dumazet 		     TCPF_CLOSING | TCPF_CLOSE_WAIT))
872*46d3ceabSEric Dumazet 			tcp_write_xmit(sk,
873*46d3ceabSEric Dumazet 				       tcp_current_mss(sk),
874*46d3ceabSEric Dumazet 				       0, 0,
875*46d3ceabSEric Dumazet 				       GFP_ATOMIC);
876*46d3ceabSEric Dumazet 	}
877*46d3ceabSEric Dumazet }
878*46d3ceabSEric Dumazet EXPORT_SYMBOL(tcp_release_cb);
879*46d3ceabSEric Dumazet 
880*46d3ceabSEric Dumazet void __init tcp_tasklet_init(void)
881*46d3ceabSEric Dumazet {
882*46d3ceabSEric Dumazet 	int i;
883*46d3ceabSEric Dumazet 
884*46d3ceabSEric Dumazet 	for_each_possible_cpu(i) {
885*46d3ceabSEric Dumazet 		struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i);
886*46d3ceabSEric Dumazet 
887*46d3ceabSEric Dumazet 		INIT_LIST_HEAD(&tsq->head);
888*46d3ceabSEric Dumazet 		tasklet_init(&tsq->tasklet,
889*46d3ceabSEric Dumazet 			     tcp_tasklet_func,
890*46d3ceabSEric Dumazet 			     (unsigned long)tsq);
891*46d3ceabSEric Dumazet 	}
892*46d3ceabSEric Dumazet }
893*46d3ceabSEric Dumazet 
894*46d3ceabSEric Dumazet /*
895*46d3ceabSEric Dumazet  * Write buffer destructor automatically called from kfree_skb.
896*46d3ceabSEric Dumazet  * We can't xmit new skbs from this context, as we might already
897*46d3ceabSEric Dumazet  * hold qdisc lock.
898*46d3ceabSEric Dumazet  */
899*46d3ceabSEric Dumazet void tcp_wfree(struct sk_buff *skb)
900*46d3ceabSEric Dumazet {
901*46d3ceabSEric Dumazet 	struct sock *sk = skb->sk;
902*46d3ceabSEric Dumazet 	struct tcp_sock *tp = tcp_sk(sk);
903*46d3ceabSEric Dumazet 
904*46d3ceabSEric Dumazet 	if (test_and_clear_bit(TSQ_THROTTLED, &tp->tsq_flags) &&
905*46d3ceabSEric Dumazet 	    !test_and_set_bit(TSQ_QUEUED, &tp->tsq_flags)) {
906*46d3ceabSEric Dumazet 		unsigned long flags;
907*46d3ceabSEric Dumazet 		struct tsq_tasklet *tsq;
908*46d3ceabSEric Dumazet 
909*46d3ceabSEric Dumazet 		/* Keep a ref on socket.
910*46d3ceabSEric Dumazet 		 * This last ref will be released in tcp_tasklet_func()
911*46d3ceabSEric Dumazet 		 */
912*46d3ceabSEric Dumazet 		atomic_sub(skb->truesize - 1, &sk->sk_wmem_alloc);
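		/* Subtracting truesize - 1 leaves one unit of sk_wmem_alloc
		 * charged; sk_free() in tcp_tasklet_func() drops that last ref. */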
913*46d3ceabSEric Dumazet 
914*46d3ceabSEric Dumazet 		/* queue this socket to tasklet queue */
915*46d3ceabSEric Dumazet 		local_irq_save(flags);
916*46d3ceabSEric Dumazet 		tsq = &__get_cpu_var(tsq_tasklet);
917*46d3ceabSEric Dumazet 		list_add(&tp->tsq_node, &tsq->head);
918*46d3ceabSEric Dumazet 		tasklet_schedule(&tsq->tasklet);
919*46d3ceabSEric Dumazet 		local_irq_restore(flags);
920*46d3ceabSEric Dumazet 	} else {
921*46d3ceabSEric Dumazet 		sock_wfree(skb);
922*46d3ceabSEric Dumazet 	}
923*46d3ceabSEric Dumazet }
924*46d3ceabSEric Dumazet 
9251da177e4SLinus Torvalds /* This routine actually transmits TCP packets queued up by
9261da177e4SLinus Torvalds  * tcp_do_sendmsg().  This is used by both the initial
9271da177e4SLinus Torvalds  * transmission and possible later retransmissions.
9281da177e4SLinus Torvalds  * All SKB's seen here are completely headerless.  It is our
9291da177e4SLinus Torvalds  * job to build the TCP header, and pass the packet down to
9301da177e4SLinus Torvalds  * IP so it can do the same plus pass the packet off to the
9311da177e4SLinus Torvalds  * device.
9321da177e4SLinus Torvalds  *
9331da177e4SLinus Torvalds  * We are working here with either a clone of the original
9341da177e4SLinus Torvalds  * SKB, or a fresh unique copy made by the retransmit engine.
9351da177e4SLinus Torvalds  */
936056834d9SIlpo Järvinen static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
937056834d9SIlpo Järvinen 			    gfp_t gfp_mask)
9381da177e4SLinus Torvalds {
9396687e988SArnaldo Carvalho de Melo 	const struct inet_connection_sock *icsk = inet_csk(sk);
940dfb4b9dcSDavid S. Miller 	struct inet_sock *inet;
941dfb4b9dcSDavid S. Miller 	struct tcp_sock *tp;
942dfb4b9dcSDavid S. Miller 	struct tcp_skb_cb *tcb;
94333ad798cSAdam Langley 	struct tcp_out_options opts;
94495c96174SEric Dumazet 	unsigned int tcp_options_size, tcp_header_size;
945cfb6eeb4SYOSHIFUJI Hideaki 	struct tcp_md5sig_key *md5;
9461da177e4SLinus Torvalds 	struct tcphdr *th;
9471da177e4SLinus Torvalds 	int err;
9481da177e4SLinus Torvalds 
949dfb4b9dcSDavid S. Miller 	BUG_ON(!skb || !tcp_skb_pcount(skb));
950dfb4b9dcSDavid S. Miller 
951dfb4b9dcSDavid S. Miller 	/* If congestion control is doing timestamping, we must
952dfb4b9dcSDavid S. Miller 	 * take such a timestamp before we potentially clone/copy.
953dfb4b9dcSDavid S. Miller 	 */
954164891aaSStephen Hemminger 	if (icsk->icsk_ca_ops->flags & TCP_CONG_RTT_STAMP)
955dfb4b9dcSDavid S. Miller 		__net_timestamp(skb);
956dfb4b9dcSDavid S. Miller 
957dfb4b9dcSDavid S. Miller 	if (likely(clone_it)) {
958dfb4b9dcSDavid S. Miller 		if (unlikely(skb_cloned(skb)))
959dfb4b9dcSDavid S. Miller 			skb = pskb_copy(skb, gfp_mask);
960dfb4b9dcSDavid S. Miller 		else
961dfb4b9dcSDavid S. Miller 			skb = skb_clone(skb, gfp_mask);
962dfb4b9dcSDavid S. Miller 		if (unlikely(!skb))
963dfb4b9dcSDavid S. Miller 			return -ENOBUFS;
964dfb4b9dcSDavid S. Miller 	}
965dfb4b9dcSDavid S. Miller 
966dfb4b9dcSDavid S. Miller 	inet = inet_sk(sk);
967dfb4b9dcSDavid S. Miller 	tp = tcp_sk(sk);
968dfb4b9dcSDavid S. Miller 	tcb = TCP_SKB_CB(skb);
96933ad798cSAdam Langley 	memset(&opts, 0, sizeof(opts));
9701da177e4SLinus Torvalds 
9714de075e0SEric Dumazet 	if (unlikely(tcb->tcp_flags & TCPHDR_SYN))
97233ad798cSAdam Langley 		tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
97333ad798cSAdam Langley 	else
97433ad798cSAdam Langley 		tcp_options_size = tcp_established_options(sk, skb, &opts,
97533ad798cSAdam Langley 							   &md5);
97633ad798cSAdam Langley 	tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
9771da177e4SLinus Torvalds 
9783853b584STom Herbert 	if (tcp_packets_in_flight(tp) == 0) {
9796687e988SArnaldo Carvalho de Melo 		tcp_ca_event(sk, CA_EVENT_TX_START);
9803853b584STom Herbert 		skb->ooo_okay = 1;
9813853b584STom Herbert 	} else
9823853b584STom Herbert 		skb->ooo_okay = 0;
9831da177e4SLinus Torvalds 
984aa8223c7SArnaldo Carvalho de Melo 	skb_push(skb, tcp_header_size);
985aa8223c7SArnaldo Carvalho de Melo 	skb_reset_transport_header(skb);
986*46d3ceabSEric Dumazet 
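	/* Charge the skb to this socket and pick its destructor: tcp_wfree()
	 * for TSQ accounting when the output byte limit is enabled, otherwise
	 * plain sock_wfree(). */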
987*46d3ceabSEric Dumazet 	skb_orphan(skb);
988*46d3ceabSEric Dumazet 	skb->sk = sk;
989*46d3ceabSEric Dumazet 	skb->destructor = (sysctl_tcp_limit_output_bytes > 0) ?
990*46d3ceabSEric Dumazet 			  tcp_wfree : sock_wfree;
991*46d3ceabSEric Dumazet 	atomic_add(skb->truesize, &sk->sk_wmem_alloc);
9921da177e4SLinus Torvalds 
9931da177e4SLinus Torvalds 	/* Build TCP header and checksum it. */
994aa8223c7SArnaldo Carvalho de Melo 	th = tcp_hdr(skb);
995c720c7e8SEric Dumazet 	th->source		= inet->inet_sport;
996c720c7e8SEric Dumazet 	th->dest		= inet->inet_dport;
9971da177e4SLinus Torvalds 	th->seq			= htonl(tcb->seq);
9981da177e4SLinus Torvalds 	th->ack_seq		= htonl(tp->rcv_nxt);
999df7a3b07SAl Viro 	*(((__be16 *)th) + 6)	= htons(((tcp_header_size >> 2) << 12) |
10004de075e0SEric Dumazet 					tcb->tcp_flags);
1001dfb4b9dcSDavid S. Miller 
10024de075e0SEric Dumazet 	if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) {
10031da177e4SLinus Torvalds 		/* RFC1323: The window in SYN & SYN/ACK segments
10041da177e4SLinus Torvalds 		 * is never scaled.
10051da177e4SLinus Torvalds 		 */
1006600ff0c2SIlpo Järvinen 		th->window	= htons(min(tp->rcv_wnd, 65535U));
10071da177e4SLinus Torvalds 	} else {
10081da177e4SLinus Torvalds 		th->window	= htons(tcp_select_window(sk));
10091da177e4SLinus Torvalds 	}
10101da177e4SLinus Torvalds 	th->check		= 0;
10111da177e4SLinus Torvalds 	th->urg_ptr		= 0;
10121da177e4SLinus Torvalds 
101333f5f57eSIlpo Järvinen 	/* The urg_mode check is necessary during a window probe below snd_una */
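	/* The urgent pointer is a 16-bit offset from th->seq, so snd_up can
	 * only be encoded exactly while it lies within 64K of this segment's
	 * sequence number; beyond that, an all-ones pointer is advertised as
	 * an approximation.
	 */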
10147691367dSHerbert Xu 	if (unlikely(tcp_urg_mode(tp) && before(tcb->seq, tp->snd_up))) {
10157691367dSHerbert Xu 		if (before(tp->snd_up, tcb->seq + 0x10000)) {
10161da177e4SLinus Torvalds 			th->urg_ptr = htons(tp->snd_up - tcb->seq);
10171da177e4SLinus Torvalds 			th->urg = 1;
10187691367dSHerbert Xu 		} else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) {
10190eae88f3SEric Dumazet 			th->urg_ptr = htons(0xFFFF);
10207691367dSHerbert Xu 			th->urg = 1;
10217691367dSHerbert Xu 		}
10221da177e4SLinus Torvalds 	}
10231da177e4SLinus Torvalds 
1024bd0388aeSWilliam Allen Simpson 	tcp_options_write((__be32 *)(th + 1), tp, &opts);
10254de075e0SEric Dumazet 	if (likely((tcb->tcp_flags & TCPHDR_SYN) == 0))
10269e412ba7SIlpo Järvinen 		TCP_ECN_send(sk, skb, tcp_header_size);
1027dfb4b9dcSDavid S. Miller 
1028cfb6eeb4SYOSHIFUJI Hideaki #ifdef CONFIG_TCP_MD5SIG
1029cfb6eeb4SYOSHIFUJI Hideaki 	/* Calculate the MD5 hash, as we have all we need now */
1030cfb6eeb4SYOSHIFUJI Hideaki 	if (md5) {
1031a465419bSEric Dumazet 		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1032bd0388aeSWilliam Allen Simpson 		tp->af_specific->calc_md5_hash(opts.hash_location,
103349a72dfbSAdam Langley 					       md5, sk, NULL, skb);
1034cfb6eeb4SYOSHIFUJI Hideaki 	}
1035cfb6eeb4SYOSHIFUJI Hideaki #endif
1036cfb6eeb4SYOSHIFUJI Hideaki 
1037bb296246SHerbert Xu 	icsk->icsk_af_ops->send_check(sk, skb);
10381da177e4SLinus Torvalds 
10394de075e0SEric Dumazet 	if (likely(tcb->tcp_flags & TCPHDR_ACK))
1040fc6415bcSDavid S. Miller 		tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
10411da177e4SLinus Torvalds 
10421da177e4SLinus Torvalds 	if (skb->len != tcp_header_size)
1043cf533ea5SEric Dumazet 		tcp_event_data_sent(tp, sk);
10441da177e4SLinus Torvalds 
1045bd37a088SWei Yongjun 	if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
1046aa2ea058STom Herbert 		TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,
1047aa2ea058STom Herbert 			      tcp_skb_pcount(skb));
10481da177e4SLinus Torvalds 
1049d9d8da80SDavid S. Miller 	err = icsk->icsk_af_ops->queue_xmit(skb, &inet->cork.fl);
105083de47cdSHua Zhong 	if (likely(err <= 0))
10511da177e4SLinus Torvalds 		return err;
10521da177e4SLinus Torvalds 
10533cfe3baaSIlpo Järvinen 	tcp_enter_cwr(sk, 1);
10541da177e4SLinus Torvalds 
1055b9df3cb8SGerrit Renker 	return net_xmit_eval(err);
10561da177e4SLinus Torvalds }
10571da177e4SLinus Torvalds 
105867edfef7SAndi Kleen /* This routine just queues the buffer for sending.
10591da177e4SLinus Torvalds  *
10601da177e4SLinus Torvalds  * NOTE: the probe0 timer is not checked, so do not forget to call
10611da177e4SLinus Torvalds  * tcp_push_pending_frames(), otherwise the socket can stall.
10621da177e4SLinus Torvalds  */
10631da177e4SLinus Torvalds static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
10641da177e4SLinus Torvalds {
10651da177e4SLinus Torvalds 	struct tcp_sock *tp = tcp_sk(sk);
10661da177e4SLinus Torvalds 
10671da177e4SLinus Torvalds 	/* Advance write_seq and place onto the write_queue. */
10681da177e4SLinus Torvalds 	tp->write_seq = TCP_SKB_CB(skb)->end_seq;
10691da177e4SLinus Torvalds 	skb_header_release(skb);
1070fe067e8aSDavid S. Miller 	tcp_add_write_queue_tail(sk, skb);
10713ab224beSHideo Aoki 	sk->sk_wmem_queued += skb->truesize;
10723ab224beSHideo Aoki 	sk_mem_charge(sk, skb->truesize);
10731da177e4SLinus Torvalds }
10741da177e4SLinus Torvalds 
107567edfef7SAndi Kleen /* Initialize TSO segments for a packet. */
1076cf533ea5SEric Dumazet static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
1077056834d9SIlpo Järvinen 				 unsigned int mss_now)
1078f6302d1dSDavid S. Miller {
10798e5b9ddaSHerbert Xu 	if (skb->len <= mss_now || !sk_can_gso(sk) ||
10808e5b9ddaSHerbert Xu 	    skb->ip_summed == CHECKSUM_NONE) {
1081f6302d1dSDavid S. Miller 		/* Avoid the costly divide in the normal
1082f6302d1dSDavid S. Miller 		 * non-TSO case.
1083f6302d1dSDavid S. Miller 		 */
10847967168cSHerbert Xu 		skb_shinfo(skb)->gso_segs = 1;
10857967168cSHerbert Xu 		skb_shinfo(skb)->gso_size = 0;
10867967168cSHerbert Xu 		skb_shinfo(skb)->gso_type = 0;
1087f6302d1dSDavid S. Miller 	} else {
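		/* For example, a 4500 byte skb with an mss of 1448 yields
		 * DIV_ROUND_UP(4500, 1448) = 4 GSO segments.
		 */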
1088356f89e1SIlpo Järvinen 		skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss_now);
10897967168cSHerbert Xu 		skb_shinfo(skb)->gso_size = mss_now;
1090bcd76111SHerbert Xu 		skb_shinfo(skb)->gso_type = sk->sk_gso_type;
10911da177e4SLinus Torvalds 	}
10921da177e4SLinus Torvalds }
10931da177e4SLinus Torvalds 
109491fed7a1SIlpo Järvinen /* When a modification to fackets_out becomes necessary, we need to check
109568f8353bSIlpo Järvinen  * whether the skb is counted in fackets_out or not.
109691fed7a1SIlpo Järvinen  */
1097cf533ea5SEric Dumazet static void tcp_adjust_fackets_out(struct sock *sk, const struct sk_buff *skb,
109891fed7a1SIlpo Järvinen 				   int decr)
109991fed7a1SIlpo Järvinen {
1100a47e5a98SIlpo Järvinen 	struct tcp_sock *tp = tcp_sk(sk);
1101a47e5a98SIlpo Järvinen 
1102dc86967bSIlpo Järvinen 	if (!tp->sacked_out || tcp_is_reno(tp))
110391fed7a1SIlpo Järvinen 		return;
110491fed7a1SIlpo Järvinen 
11056859d494SIlpo Järvinen 	if (after(tcp_highest_sack_seq(tp), TCP_SKB_CB(skb)->seq))
110691fed7a1SIlpo Järvinen 		tp->fackets_out -= decr;
110791fed7a1SIlpo Järvinen }
110891fed7a1SIlpo Järvinen 
1109797108d1SIlpo Järvinen /* Pcount of an skb in the middle of the write queue got changed; we need
1110797108d1SIlpo Järvinen  * to apply various tweaks to fix the counters.
1111797108d1SIlpo Järvinen  */
1112cf533ea5SEric Dumazet static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr)
1113797108d1SIlpo Järvinen {
1114797108d1SIlpo Järvinen 	struct tcp_sock *tp = tcp_sk(sk);
1115797108d1SIlpo Järvinen 
1116797108d1SIlpo Järvinen 	tp->packets_out -= decr;
1117797108d1SIlpo Järvinen 
1118797108d1SIlpo Järvinen 	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
1119797108d1SIlpo Järvinen 		tp->sacked_out -= decr;
1120797108d1SIlpo Järvinen 	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
1121797108d1SIlpo Järvinen 		tp->retrans_out -= decr;
1122797108d1SIlpo Järvinen 	if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST)
1123797108d1SIlpo Järvinen 		tp->lost_out -= decr;
1124797108d1SIlpo Järvinen 
1125797108d1SIlpo Järvinen 	/* Reno case is special. Sigh... */
1126797108d1SIlpo Järvinen 	if (tcp_is_reno(tp) && decr > 0)
1127797108d1SIlpo Järvinen 		tp->sacked_out -= min_t(u32, tp->sacked_out, decr);
1128797108d1SIlpo Järvinen 
1129797108d1SIlpo Järvinen 	tcp_adjust_fackets_out(sk, skb, decr);
1130797108d1SIlpo Järvinen 
1131797108d1SIlpo Järvinen 	if (tp->lost_skb_hint &&
1132797108d1SIlpo Järvinen 	    before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq) &&
113352cf3cc8SIlpo Järvinen 	    (tcp_is_fack(tp) || (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)))
1134797108d1SIlpo Järvinen 		tp->lost_cnt_hint -= decr;
1135797108d1SIlpo Järvinen 
1136797108d1SIlpo Järvinen 	tcp_verify_left_out(tp);
1137797108d1SIlpo Järvinen }
1138797108d1SIlpo Järvinen 
11391da177e4SLinus Torvalds /* Function to create two new TCP segments.  Shrinks the given segment
11401da177e4SLinus Torvalds  * to the specified size and appends a new segment with the rest of the
11411da177e4SLinus Torvalds  * packet to the list.  This won't be called frequently, I hope.
11421da177e4SLinus Torvalds  * Remember, these are still headerless SKBs at this point.
11431da177e4SLinus Torvalds  */
1144056834d9SIlpo Järvinen int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
1145056834d9SIlpo Järvinen 		 unsigned int mss_now)
11461da177e4SLinus Torvalds {
11471da177e4SLinus Torvalds 	struct tcp_sock *tp = tcp_sk(sk);
11481da177e4SLinus Torvalds 	struct sk_buff *buff;
11496475be16SDavid S. Miller 	int nsize, old_factor;
1150b60b49eaSHerbert Xu 	int nlen;
11519ce01461SIlpo Järvinen 	u8 flags;
11521da177e4SLinus Torvalds 
11532fceec13SIlpo Järvinen 	if (WARN_ON(len > skb->len))
11542fceec13SIlpo Järvinen 		return -EINVAL;
11556a438bbeSStephen Hemminger 
11561da177e4SLinus Torvalds 	nsize = skb_headlen(skb) - len;
11571da177e4SLinus Torvalds 	if (nsize < 0)
11581da177e4SLinus Torvalds 		nsize = 0;
11591da177e4SLinus Torvalds 
11601da177e4SLinus Torvalds 	if (skb_cloned(skb) &&
11611da177e4SLinus Torvalds 	    skb_is_nonlinear(skb) &&
11621da177e4SLinus Torvalds 	    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
11631da177e4SLinus Torvalds 		return -ENOMEM;
11641da177e4SLinus Torvalds 
11651da177e4SLinus Torvalds 	/* Get a new skb... force flag on. */
11661da177e4SLinus Torvalds 	buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC);
11671da177e4SLinus Torvalds 	if (buff == NULL)
11681da177e4SLinus Torvalds 		return -ENOMEM; /* We'll just try again later. */
1169ef5cb973SHerbert Xu 
11703ab224beSHideo Aoki 	sk->sk_wmem_queued += buff->truesize;
11713ab224beSHideo Aoki 	sk_mem_charge(sk, buff->truesize);
1172b60b49eaSHerbert Xu 	nlen = skb->len - len - nsize;
1173b60b49eaSHerbert Xu 	buff->truesize += nlen;
1174b60b49eaSHerbert Xu 	skb->truesize -= nlen;
11751da177e4SLinus Torvalds 
11761da177e4SLinus Torvalds 	/* Correct the sequence numbers. */
11771da177e4SLinus Torvalds 	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
11781da177e4SLinus Torvalds 	TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
11791da177e4SLinus Torvalds 	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
11801da177e4SLinus Torvalds 
11811da177e4SLinus Torvalds 	/* PSH and FIN should only be set in the second packet. */
11824de075e0SEric Dumazet 	flags = TCP_SKB_CB(skb)->tcp_flags;
11834de075e0SEric Dumazet 	TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
11844de075e0SEric Dumazet 	TCP_SKB_CB(buff)->tcp_flags = flags;
1185e14c3cafSHerbert Xu 	TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;
11861da177e4SLinus Torvalds 
118784fa7933SPatrick McHardy 	if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_PARTIAL) {
11881da177e4SLinus Torvalds 		/* Copy and checksum data tail into the new buffer. */
1189056834d9SIlpo Järvinen 		buff->csum = csum_partial_copy_nocheck(skb->data + len,
1190056834d9SIlpo Järvinen 						       skb_put(buff, nsize),
11911da177e4SLinus Torvalds 						       nsize, 0);
11921da177e4SLinus Torvalds 
11931da177e4SLinus Torvalds 		skb_trim(skb, len);
11941da177e4SLinus Torvalds 
11951da177e4SLinus Torvalds 		skb->csum = csum_block_sub(skb->csum, buff->csum, len);
11961da177e4SLinus Torvalds 	} else {
119784fa7933SPatrick McHardy 		skb->ip_summed = CHECKSUM_PARTIAL;
11981da177e4SLinus Torvalds 		skb_split(skb, buff, len);
11991da177e4SLinus Torvalds 	}
12001da177e4SLinus Torvalds 
12011da177e4SLinus Torvalds 	buff->ip_summed = skb->ip_summed;
12021da177e4SLinus Torvalds 
12031da177e4SLinus Torvalds 	/* Looks stupid, but our code really uses the 'when' field of
12041da177e4SLinus Torvalds 	 * skbs which it has never sent before. --ANK
12051da177e4SLinus Torvalds 	 */
12061da177e4SLinus Torvalds 	TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
1207a61bbcf2SPatrick McHardy 	buff->tstamp = skb->tstamp;
12081da177e4SLinus Torvalds 
12096475be16SDavid S. Miller 	old_factor = tcp_skb_pcount(skb);
12106475be16SDavid S. Miller 
12111da177e4SLinus Torvalds 	/* Fix up tso_factor for both original and new SKB.  */
1212846998aeSDavid S. Miller 	tcp_set_skb_tso_segs(sk, skb, mss_now);
1213846998aeSDavid S. Miller 	tcp_set_skb_tso_segs(sk, buff, mss_now);
12141da177e4SLinus Torvalds 
12156475be16SDavid S. Miller 	/* If this packet has been sent out already, we must
12166475be16SDavid S. Miller 	 * adjust the various packet counters.
12176475be16SDavid S. Miller 	 */
1218cf0b450cSHerbert Xu 	if (!before(tp->snd_nxt, TCP_SKB_CB(buff)->end_seq)) {
12196475be16SDavid S. Miller 		int diff = old_factor - tcp_skb_pcount(skb) -
12206475be16SDavid S. Miller 			tcp_skb_pcount(buff);
12211da177e4SLinus Torvalds 
1222797108d1SIlpo Järvinen 		if (diff)
1223797108d1SIlpo Järvinen 			tcp_adjust_pcount(sk, skb, diff);
12241da177e4SLinus Torvalds 	}
12251da177e4SLinus Torvalds 
12261da177e4SLinus Torvalds 	/* Link BUFF into the send queue. */
1227f44b5271SDavid S. Miller 	skb_header_release(buff);
1228fe067e8aSDavid S. Miller 	tcp_insert_write_queue_after(skb, buff, sk);
12291da177e4SLinus Torvalds 
12301da177e4SLinus Torvalds 	return 0;
12311da177e4SLinus Torvalds }
12321da177e4SLinus Torvalds 
12331da177e4SLinus Torvalds /* This is similar to __pskb_pull_head() (it will go to core/skbuff.c
12341da177e4SLinus Torvalds  * eventually). The difference is that the pulled data is not copied, but
12351da177e4SLinus Torvalds  * immediately discarded.
12361da177e4SLinus Torvalds  */
1237f2911969SHerbert Xu static void __pskb_trim_head(struct sk_buff *skb, int len)
12381da177e4SLinus Torvalds {
12391da177e4SLinus Torvalds 	int i, k, eat;
12401da177e4SLinus Torvalds 
12414fa48bf3SEric Dumazet 	eat = min_t(int, len, skb_headlen(skb));
12424fa48bf3SEric Dumazet 	if (eat) {
12434fa48bf3SEric Dumazet 		__skb_pull(skb, eat);
124422b4a4f2SEric Dumazet 		skb->avail_size -= eat;
12454fa48bf3SEric Dumazet 		len -= eat;
12464fa48bf3SEric Dumazet 		if (!len)
12474fa48bf3SEric Dumazet 			return;
12484fa48bf3SEric Dumazet 	}
12491da177e4SLinus Torvalds 	eat = len;
12501da177e4SLinus Torvalds 	k = 0;
12511da177e4SLinus Torvalds 	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
12529e903e08SEric Dumazet 		int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
12539e903e08SEric Dumazet 
12549e903e08SEric Dumazet 		if (size <= eat) {
1255aff65da0SIan Campbell 			skb_frag_unref(skb, i);
12569e903e08SEric Dumazet 			eat -= size;
12571da177e4SLinus Torvalds 		} else {
12581da177e4SLinus Torvalds 			skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
12591da177e4SLinus Torvalds 			if (eat) {
12601da177e4SLinus Torvalds 				skb_shinfo(skb)->frags[k].page_offset += eat;
12619e903e08SEric Dumazet 				skb_frag_size_sub(&skb_shinfo(skb)->frags[k], eat);
12621da177e4SLinus Torvalds 				eat = 0;
12631da177e4SLinus Torvalds 			}
12641da177e4SLinus Torvalds 			k++;
12651da177e4SLinus Torvalds 		}
12661da177e4SLinus Torvalds 	}
12671da177e4SLinus Torvalds 	skb_shinfo(skb)->nr_frags = k;
12681da177e4SLinus Torvalds 
126927a884dcSArnaldo Carvalho de Melo 	skb_reset_tail_pointer(skb);
12701da177e4SLinus Torvalds 	skb->data_len -= len;
12711da177e4SLinus Torvalds 	skb->len = skb->data_len;
12721da177e4SLinus Torvalds }
12731da177e4SLinus Torvalds 
127467edfef7SAndi Kleen /* Remove acked data from a packet in the transmit queue. */
12751da177e4SLinus Torvalds int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
12761da177e4SLinus Torvalds {
1277056834d9SIlpo Järvinen 	if (skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
12781da177e4SLinus Torvalds 		return -ENOMEM;
12791da177e4SLinus Torvalds 
12804fa48bf3SEric Dumazet 	__pskb_trim_head(skb, len);
12811da177e4SLinus Torvalds 
12821da177e4SLinus Torvalds 	TCP_SKB_CB(skb)->seq += len;
128384fa7933SPatrick McHardy 	skb->ip_summed = CHECKSUM_PARTIAL;
12841da177e4SLinus Torvalds 
12851da177e4SLinus Torvalds 	skb->truesize	     -= len;
12861da177e4SLinus Torvalds 	sk->sk_wmem_queued   -= len;
12873ab224beSHideo Aoki 	sk_mem_uncharge(sk, len);
12881da177e4SLinus Torvalds 	sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
12891da177e4SLinus Torvalds 
12905b35e1e6SNeal Cardwell 	/* Any change of skb->len requires recalculation of tso factor. */
12911da177e4SLinus Torvalds 	if (tcp_skb_pcount(skb) > 1)
12925b35e1e6SNeal Cardwell 		tcp_set_skb_tso_segs(sk, skb, tcp_skb_mss(skb));
12931da177e4SLinus Torvalds 
12941da177e4SLinus Torvalds 	return 0;
12951da177e4SLinus Torvalds }
12961da177e4SLinus Torvalds 
129767edfef7SAndi Kleen /* Calculate MSS. Not accounting for SACKs here.  */
129867469601SEric Dumazet int tcp_mtu_to_mss(struct sock *sk, int pmtu)
12995d424d5aSJohn Heffner {
1300cf533ea5SEric Dumazet 	const struct tcp_sock *tp = tcp_sk(sk);
1301cf533ea5SEric Dumazet 	const struct inet_connection_sock *icsk = inet_csk(sk);
13025d424d5aSJohn Heffner 	int mss_now;
13035d424d5aSJohn Heffner 
13045d424d5aSJohn Heffner 	/* Calculate base mss without TCP options:
13055d424d5aSJohn Heffner 	   It is MMS_S - sizeof(tcphdr) of rfc1122
13065d424d5aSJohn Heffner 	 */
13075d424d5aSJohn Heffner 	mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr);
13085d424d5aSJohn Heffner 
130967469601SEric Dumazet 	/* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */
131067469601SEric Dumazet 	if (icsk->icsk_af_ops->net_frag_header_len) {
131167469601SEric Dumazet 		const struct dst_entry *dst = __sk_dst_get(sk);
131267469601SEric Dumazet 
131367469601SEric Dumazet 		if (dst && dst_allfrag(dst))
131467469601SEric Dumazet 			mss_now -= icsk->icsk_af_ops->net_frag_header_len;
131567469601SEric Dumazet 	}
131667469601SEric Dumazet 
13175d424d5aSJohn Heffner 	/* Clamp it (mss_clamp does not include tcp options) */
13185d424d5aSJohn Heffner 	if (mss_now > tp->rx_opt.mss_clamp)
13195d424d5aSJohn Heffner 		mss_now = tp->rx_opt.mss_clamp;
13205d424d5aSJohn Heffner 
13215d424d5aSJohn Heffner 	/* Now subtract optional transport overhead */
13225d424d5aSJohn Heffner 	mss_now -= icsk->icsk_ext_hdr_len;
13235d424d5aSJohn Heffner 
13245d424d5aSJohn Heffner 	/* Then reserve room for full set of TCP options and 8 bytes of data */
13255d424d5aSJohn Heffner 	if (mss_now < 48)
13265d424d5aSJohn Heffner 		mss_now = 48;
13275d424d5aSJohn Heffner 
13285d424d5aSJohn Heffner 	/* Now subtract TCP options size, not including SACKs */
13295d424d5aSJohn Heffner 	mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);
13305d424d5aSJohn Heffner 
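	/* For example, for IPv4 with a 1500 byte PMTU, no extension headers
	 * and timestamps enabled (tcp_header_len = 20 + 12): 1500 - 20 - 20
	 * gives 1460, and subtracting the 12 option bytes leaves 1448.
	 */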
13315d424d5aSJohn Heffner 	return mss_now;
13325d424d5aSJohn Heffner }
13335d424d5aSJohn Heffner 
13345d424d5aSJohn Heffner /* Inverse of above */
133567469601SEric Dumazet int tcp_mss_to_mtu(struct sock *sk, int mss)
13365d424d5aSJohn Heffner {
1337cf533ea5SEric Dumazet 	const struct tcp_sock *tp = tcp_sk(sk);
1338cf533ea5SEric Dumazet 	const struct inet_connection_sock *icsk = inet_csk(sk);
13395d424d5aSJohn Heffner 	int mtu;
13405d424d5aSJohn Heffner 
13415d424d5aSJohn Heffner 	mtu = mss +
13425d424d5aSJohn Heffner 	      tp->tcp_header_len +
13435d424d5aSJohn Heffner 	      icsk->icsk_ext_hdr_len +
13445d424d5aSJohn Heffner 	      icsk->icsk_af_ops->net_header_len;
13455d424d5aSJohn Heffner 
134667469601SEric Dumazet 	/* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */
134767469601SEric Dumazet 	if (icsk->icsk_af_ops->net_frag_header_len) {
134867469601SEric Dumazet 		const struct dst_entry *dst = __sk_dst_get(sk);
134967469601SEric Dumazet 
135067469601SEric Dumazet 		if (dst && dst_allfrag(dst))
135167469601SEric Dumazet 			mtu += icsk->icsk_af_ops->net_frag_header_len;
135267469601SEric Dumazet 	}
13535d424d5aSJohn Heffner 	return mtu;
13545d424d5aSJohn Heffner }
13555d424d5aSJohn Heffner 
135667edfef7SAndi Kleen /* MTU probing init per socket */
13575d424d5aSJohn Heffner void tcp_mtup_init(struct sock *sk)
13585d424d5aSJohn Heffner {
13595d424d5aSJohn Heffner 	struct tcp_sock *tp = tcp_sk(sk);
13605d424d5aSJohn Heffner 	struct inet_connection_sock *icsk = inet_csk(sk);
13615d424d5aSJohn Heffner 
13625d424d5aSJohn Heffner 	icsk->icsk_mtup.enabled = sysctl_tcp_mtu_probing > 1;
13635d424d5aSJohn Heffner 	icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) +
13645d424d5aSJohn Heffner 			       icsk->icsk_af_ops->net_header_len;
13655d424d5aSJohn Heffner 	icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, sysctl_tcp_base_mss);
13665d424d5aSJohn Heffner 	icsk->icsk_mtup.probe_size = 0;
13675d424d5aSJohn Heffner }
13684bc2f18bSEric Dumazet EXPORT_SYMBOL(tcp_mtup_init);
13695d424d5aSJohn Heffner 
13701da177e4SLinus Torvalds /* This function synchronizes snd mss to the current pmtu/exthdr set.
13711da177e4SLinus Torvalds 
13721da177e4SLinus Torvalds    tp->rx_opt.user_mss is the mss set by the user via TCP_MAXSEG. It does NOT
13731da177e4SLinus Torvalds    account for TCP options, but includes only the bare TCP header.
13741da177e4SLinus Torvalds 
13751da177e4SLinus Torvalds    tp->rx_opt.mss_clamp is the mss negotiated at connection setup.
1376caa20d9aSStephen Hemminger    It is the minimum of user_mss and the mss received with the SYN.
13771da177e4SLinus Torvalds    It also does not include TCP options.
13781da177e4SLinus Torvalds 
1379d83d8461SArnaldo Carvalho de Melo    inet_csk(sk)->icsk_pmtu_cookie is the last pmtu seen by this function.
13801da177e4SLinus Torvalds 
13811da177e4SLinus Torvalds    tp->mss_cache is current effective sending mss, including
13821da177e4SLinus Torvalds    all tcp options except for SACKs. It is evaluated,
13831da177e4SLinus Torvalds    taking into account current pmtu, but never exceeds
13841da177e4SLinus Torvalds    tp->rx_opt.mss_clamp.
13851da177e4SLinus Torvalds 
13861da177e4SLinus Torvalds    NOTE1. rfc1122 clearly states that advertised MSS
13871da177e4SLinus Torvalds    DOES NOT include either tcp or ip options.
13881da177e4SLinus Torvalds 
1389d83d8461SArnaldo Carvalho de Melo    NOTE2. inet_csk(sk)->icsk_pmtu_cookie and tp->mss_cache
1390d83d8461SArnaldo Carvalho de Melo    are READ ONLY outside this function.		--ANK (980731)
13911da177e4SLinus Torvalds  */
13921da177e4SLinus Torvalds unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
13931da177e4SLinus Torvalds {
13941da177e4SLinus Torvalds 	struct tcp_sock *tp = tcp_sk(sk);
1395d83d8461SArnaldo Carvalho de Melo 	struct inet_connection_sock *icsk = inet_csk(sk);
13965d424d5aSJohn Heffner 	int mss_now;
13971da177e4SLinus Torvalds 
13985d424d5aSJohn Heffner 	if (icsk->icsk_mtup.search_high > pmtu)
13995d424d5aSJohn Heffner 		icsk->icsk_mtup.search_high = pmtu;
14001da177e4SLinus Torvalds 
14015d424d5aSJohn Heffner 	mss_now = tcp_mtu_to_mss(sk, pmtu);
1402409d22b4SIlpo Järvinen 	mss_now = tcp_bound_to_half_wnd(tp, mss_now);
14031da177e4SLinus Torvalds 
14041da177e4SLinus Torvalds 	/* And store cached results */
1405d83d8461SArnaldo Carvalho de Melo 	icsk->icsk_pmtu_cookie = pmtu;
14065d424d5aSJohn Heffner 	if (icsk->icsk_mtup.enabled)
14075d424d5aSJohn Heffner 		mss_now = min(mss_now, tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low));
1408c1b4a7e6SDavid S. Miller 	tp->mss_cache = mss_now;
14091da177e4SLinus Torvalds 
14101da177e4SLinus Torvalds 	return mss_now;
14111da177e4SLinus Torvalds }
14124bc2f18bSEric Dumazet EXPORT_SYMBOL(tcp_sync_mss);
14131da177e4SLinus Torvalds 
14141da177e4SLinus Torvalds /* Compute the current effective MSS, taking SACKs and IP options,
14151da177e4SLinus Torvalds  * and even PMTU discovery events into account.
14161da177e4SLinus Torvalds  */
14170c54b85fSIlpo Järvinen unsigned int tcp_current_mss(struct sock *sk)
14181da177e4SLinus Torvalds {
1419cf533ea5SEric Dumazet 	const struct tcp_sock *tp = tcp_sk(sk);
1420cf533ea5SEric Dumazet 	const struct dst_entry *dst = __sk_dst_get(sk);
1421c1b4a7e6SDavid S. Miller 	u32 mss_now;
142295c96174SEric Dumazet 	unsigned int header_len;
142333ad798cSAdam Langley 	struct tcp_out_options opts;
142433ad798cSAdam Langley 	struct tcp_md5sig_key *md5;
14251da177e4SLinus Torvalds 
1426c1b4a7e6SDavid S. Miller 	mss_now = tp->mss_cache;
1427c1b4a7e6SDavid S. Miller 
14281da177e4SLinus Torvalds 	if (dst) {
14291da177e4SLinus Torvalds 		u32 mtu = dst_mtu(dst);
1430d83d8461SArnaldo Carvalho de Melo 		if (mtu != inet_csk(sk)->icsk_pmtu_cookie)
14311da177e4SLinus Torvalds 			mss_now = tcp_sync_mss(sk, mtu);
14321da177e4SLinus Torvalds 	}
14331da177e4SLinus Torvalds 
143433ad798cSAdam Langley 	header_len = tcp_established_options(sk, NULL, &opts, &md5) +
143533ad798cSAdam Langley 		     sizeof(struct tcphdr);
143633ad798cSAdam Langley 	/* The mss_cache is sized based on tp->tcp_header_len, which assumes
143733ad798cSAdam Langley 	 * some common options. If this is an odd packet (because we have SACK
143833ad798cSAdam Langley 	 * blocks etc) then our calculated header_len will be different, and
143933ad798cSAdam Langley 	 * we have to adjust mss_now correspondingly */
144033ad798cSAdam Langley 	if (header_len != tp->tcp_header_len) {
144133ad798cSAdam Langley 		int delta = (int) header_len - tp->tcp_header_len;
144233ad798cSAdam Langley 		mss_now -= delta;
144333ad798cSAdam Langley 	}
1444cfb6eeb4SYOSHIFUJI Hideaki 
14451da177e4SLinus Torvalds 	return mss_now;
14461da177e4SLinus Torvalds }
14471da177e4SLinus Torvalds 
1448a762a980SDavid S. Miller /* Congestion window validation. (RFC2861) */
14499e412ba7SIlpo Järvinen static void tcp_cwnd_validate(struct sock *sk)
1450a762a980SDavid S. Miller {
14519e412ba7SIlpo Järvinen 	struct tcp_sock *tp = tcp_sk(sk);
1452a762a980SDavid S. Miller 
1453d436d686SIlpo Järvinen 	if (tp->packets_out >= tp->snd_cwnd) {
1454a762a980SDavid S. Miller 		/* Network is fed fully. */
1455a762a980SDavid S. Miller 		tp->snd_cwnd_used = 0;
1456a762a980SDavid S. Miller 		tp->snd_cwnd_stamp = tcp_time_stamp;
1457a762a980SDavid S. Miller 	} else {
1458a762a980SDavid S. Miller 		/* Network starves. */
1459a762a980SDavid S. Miller 		if (tp->packets_out > tp->snd_cwnd_used)
1460a762a980SDavid S. Miller 			tp->snd_cwnd_used = tp->packets_out;
1461a762a980SDavid S. Miller 
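		/* When tcp_slow_start_after_idle is enabled and the flow has
		 * been limited by the application for at least one RTO,
		 * tcp_cwnd_application_limited() gets a chance to decay
		 * snd_cwnd toward what was actually used (RFC 2861).
		 */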
146215d33c07SDavid S. Miller 		if (sysctl_tcp_slow_start_after_idle &&
146315d33c07SDavid S. Miller 		    (s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto)
1464a762a980SDavid S. Miller 			tcp_cwnd_application_limited(sk);
1465a762a980SDavid S. Miller 	}
1466a762a980SDavid S. Miller }
1467a762a980SDavid S. Miller 
14680e3a4803SIlpo Järvinen /* Returns the portion of skb which can be sent right away without
14690e3a4803SIlpo Järvinen  * introducing MSS oddities to segment boundaries. In rare cases where
14700e3a4803SIlpo Järvinen  * mss_now != mss_cache, we will request the caller to create a small skb
14710e3a4803SIlpo Järvinen  * per input skb which could be mostly avoided here (if desired).
14725ea3a748SIlpo Järvinen  *
14735ea3a748SIlpo Järvinen  * We explicitly want to create a request for splitting write queue tail
14745ea3a748SIlpo Järvinen  * to a small skb for Nagle purposes while avoiding unnecessary modulos,
14755ea3a748SIlpo Järvinen  * thus all the complexity (cwnd_len is always an MSS multiple, which we
14765ea3a748SIlpo Järvinen  * return whenever allowed by the other factors). Basically we need the
14775ea3a748SIlpo Järvinen  * modulo only when the receiver window alone is the limiting factor or
14785ea3a748SIlpo Järvinen  * when we would be allowed to send the split-due-to-Nagle skb fully.
14790e3a4803SIlpo Järvinen  */
1480cf533ea5SEric Dumazet static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_buff *skb,
1481056834d9SIlpo Järvinen 					unsigned int mss_now, unsigned int cwnd)
1482c1b4a7e6SDavid S. Miller {
1483cf533ea5SEric Dumazet 	const struct tcp_sock *tp = tcp_sk(sk);
14840e3a4803SIlpo Järvinen 	u32 needed, window, cwnd_len;
1485c1b4a7e6SDavid S. Miller 
148690840defSIlpo Järvinen 	window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
1487c1b4a7e6SDavid S. Miller 	cwnd_len = mss_now * cwnd;
14880e3a4803SIlpo Järvinen 
14890e3a4803SIlpo Järvinen 	if (likely(cwnd_len <= window && skb != tcp_write_queue_tail(sk)))
14900e3a4803SIlpo Järvinen 		return cwnd_len;
14910e3a4803SIlpo Järvinen 
14925ea3a748SIlpo Järvinen 	needed = min(skb->len, window);
14935ea3a748SIlpo Järvinen 
149417515408SIlpo Järvinen 	if (cwnd_len <= needed)
14950e3a4803SIlpo Järvinen 		return cwnd_len;
14960e3a4803SIlpo Järvinen 
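	/* Round the sendable amount down to an MSS boundary, e.g. a 4321 byte
	 * window with a 1000 byte mss_now yields 4000 bytes.
	 */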
14970e3a4803SIlpo Järvinen 	return needed - needed % mss_now;
1498c1b4a7e6SDavid S. Miller }
1499c1b4a7e6SDavid S. Miller 
1500c1b4a7e6SDavid S. Miller /* Can at least one segment of SKB be sent right now, according to the
1501c1b4a7e6SDavid S. Miller  * congestion window rules?  If so, return how many segments are allowed.
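 * For example, with a snd_cwnd of 10 and 7 packets in flight, up to three
 * more segments may be sent (the final FIN is exempted below).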
1502c1b4a7e6SDavid S. Miller  */
1503cf533ea5SEric Dumazet static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
1504cf533ea5SEric Dumazet 					 const struct sk_buff *skb)
1505c1b4a7e6SDavid S. Miller {
1506c1b4a7e6SDavid S. Miller 	u32 in_flight, cwnd;
1507c1b4a7e6SDavid S. Miller 
1508c1b4a7e6SDavid S. Miller 	/* Don't be strict about the congestion window for the final FIN.  */
15094de075e0SEric Dumazet 	if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) &&
15104de075e0SEric Dumazet 	    tcp_skb_pcount(skb) == 1)
1511c1b4a7e6SDavid S. Miller 		return 1;
1512c1b4a7e6SDavid S. Miller 
1513c1b4a7e6SDavid S. Miller 	in_flight = tcp_packets_in_flight(tp);
1514c1b4a7e6SDavid S. Miller 	cwnd = tp->snd_cwnd;
1515c1b4a7e6SDavid S. Miller 	if (in_flight < cwnd)
1516c1b4a7e6SDavid S. Miller 		return (cwnd - in_flight);
1517c1b4a7e6SDavid S. Miller 
1518c1b4a7e6SDavid S. Miller 	return 0;
1519c1b4a7e6SDavid S. Miller }
1520c1b4a7e6SDavid S. Miller 
1521b595076aSUwe Kleine-König /* Initialize TSO state of a skb.
152267edfef7SAndi Kleen  * This must be invoked the first time we consider transmitting
1523c1b4a7e6SDavid S. Miller  * SKB onto the wire.
1524c1b4a7e6SDavid S. Miller  */
1525cf533ea5SEric Dumazet static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb,
1526056834d9SIlpo Järvinen 			     unsigned int mss_now)
1527c1b4a7e6SDavid S. Miller {
1528c1b4a7e6SDavid S. Miller 	int tso_segs = tcp_skb_pcount(skb);
1529c1b4a7e6SDavid S. Miller 
1530f8269a49SIlpo Järvinen 	if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) {
1531846998aeSDavid S. Miller 		tcp_set_skb_tso_segs(sk, skb, mss_now);
1532c1b4a7e6SDavid S. Miller 		tso_segs = tcp_skb_pcount(skb);
1533c1b4a7e6SDavid S. Miller 	}
1534c1b4a7e6SDavid S. Miller 	return tso_segs;
1535c1b4a7e6SDavid S. Miller }
1536c1b4a7e6SDavid S. Miller 
153767edfef7SAndi Kleen /* Minshall's variant of the Nagle send check. */
1538a2a385d6SEric Dumazet static inline bool tcp_minshall_check(const struct tcp_sock *tp)
1539c1b4a7e6SDavid S. Miller {
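	/* snd_sml records the end sequence of the last small (sub-MSS)
	 * segment we sent; it is still unacknowledged if it lies after
	 * snd_una but not after snd_nxt, in which case Minshall's rule
	 * forbids sending another small segment.
	 */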
1540c1b4a7e6SDavid S. Miller 	return after(tp->snd_sml, tp->snd_una) &&
1541c1b4a7e6SDavid S. Miller 		!after(tp->snd_sml, tp->snd_nxt);
1542c1b4a7e6SDavid S. Miller }
1543c1b4a7e6SDavid S. Miller 
1544a2a385d6SEric Dumazet /* Return false if the packet can be sent now without violating Nagle's rules:
1545c1b4a7e6SDavid S. Miller  * 1. It is full sized.
1546c1b4a7e6SDavid S. Miller  * 2. Or it contains FIN. (already checked by caller)
15476d67e9beSFeng King  * 3. Or TCP_CORK is not set, and TCP_NODELAY is set.
1548c1b4a7e6SDavid S. Miller  * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
1549c1b4a7e6SDavid S. Miller  *    With Minshall's modification: all sent small packets are ACKed.
1550c1b4a7e6SDavid S. Miller  */
1551a2a385d6SEric Dumazet static inline bool tcp_nagle_check(const struct tcp_sock *tp,
1552c1b4a7e6SDavid S. Miller 				  const struct sk_buff *skb,
155395c96174SEric Dumazet 				  unsigned int mss_now, int nonagle)
1554c1b4a7e6SDavid S. Miller {
1555a02cec21SEric Dumazet 	return skb->len < mss_now &&
1556c1b4a7e6SDavid S. Miller 		((nonagle & TCP_NAGLE_CORK) ||
1557a02cec21SEric Dumazet 		 (!nonagle && tp->packets_out && tcp_minshall_check(tp)));
1558c1b4a7e6SDavid S. Miller }
1559c1b4a7e6SDavid S. Miller 
1560a2a385d6SEric Dumazet /* Return true if the Nagle test allows this packet to be
1561c1b4a7e6SDavid S. Miller  * sent now.
1562c1b4a7e6SDavid S. Miller  */
1563a2a385d6SEric Dumazet static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb,
1564c1b4a7e6SDavid S. Miller 				  unsigned int cur_mss, int nonagle)
1565c1b4a7e6SDavid S. Miller {
1566c1b4a7e6SDavid S. Miller 	/* The Nagle rule does not apply to frames which sit in the middle of
1567c1b4a7e6SDavid S. Miller 	 * the write_queue (they have no chance to get new data).
1568c1b4a7e6SDavid S. Miller 	 *
1569c1b4a7e6SDavid S. Miller 	 * This is implemented in the callers, where they modify the 'nonagle'
1570c1b4a7e6SDavid S. Miller 	 * argument based upon the location of SKB in the send queue.
1571c1b4a7e6SDavid S. Miller 	 */
1572c1b4a7e6SDavid S. Miller 	if (nonagle & TCP_NAGLE_PUSH)
1573a2a385d6SEric Dumazet 		return true;
1574c1b4a7e6SDavid S. Miller 
1575d551e454SIlpo Järvinen 	/* Don't use the nagle rule for urgent data (or for the final FIN).
1576d551e454SIlpo Järvinen 	 * Nagle can be ignored during F-RTO too (see RFC4138).
1577d551e454SIlpo Järvinen 	 */
157833f5f57eSIlpo Järvinen 	if (tcp_urg_mode(tp) || (tp->frto_counter == 2) ||
15794de075e0SEric Dumazet 	    (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
1580a2a385d6SEric Dumazet 		return true;
1581c1b4a7e6SDavid S. Miller 
1582c1b4a7e6SDavid S. Miller 	if (!tcp_nagle_check(tp, skb, cur_mss, nonagle))
1583a2a385d6SEric Dumazet 		return true;
1584c1b4a7e6SDavid S. Miller 
1585a2a385d6SEric Dumazet 	return false;
1586c1b4a7e6SDavid S. Miller }
1587c1b4a7e6SDavid S. Miller 
1588c1b4a7e6SDavid S. Miller /* Does at least the first segment of SKB fit into the send window? */
1589a2a385d6SEric Dumazet static bool tcp_snd_wnd_test(const struct tcp_sock *tp,
1590a2a385d6SEric Dumazet 			     const struct sk_buff *skb,
1591056834d9SIlpo Järvinen 			     unsigned int cur_mss)
1592c1b4a7e6SDavid S. Miller {
1593c1b4a7e6SDavid S. Miller 	u32 end_seq = TCP_SKB_CB(skb)->end_seq;
1594c1b4a7e6SDavid S. Miller 
1595c1b4a7e6SDavid S. Miller 	if (skb->len > cur_mss)
1596c1b4a7e6SDavid S. Miller 		end_seq = TCP_SKB_CB(skb)->seq + cur_mss;
1597c1b4a7e6SDavid S. Miller 
159890840defSIlpo Järvinen 	return !after(end_seq, tcp_wnd_end(tp));
1599c1b4a7e6SDavid S. Miller }
1600c1b4a7e6SDavid S. Miller 
1601fe067e8aSDavid S. Miller /* This checks if the data bearing packet SKB (usually tcp_send_head(sk))
1602c1b4a7e6SDavid S. Miller  * should be put on the wire right now.  If so, it returns the number of
1603c1b4a7e6SDavid S. Miller  * packets allowed by the congestion window.
1604c1b4a7e6SDavid S. Miller  */
1605cf533ea5SEric Dumazet static unsigned int tcp_snd_test(const struct sock *sk, struct sk_buff *skb,
1606c1b4a7e6SDavid S. Miller 				 unsigned int cur_mss, int nonagle)
1607c1b4a7e6SDavid S. Miller {
1608cf533ea5SEric Dumazet 	const struct tcp_sock *tp = tcp_sk(sk);
1609c1b4a7e6SDavid S. Miller 	unsigned int cwnd_quota;
1610c1b4a7e6SDavid S. Miller 
1611846998aeSDavid S. Miller 	tcp_init_tso_segs(sk, skb, cur_mss);
1612c1b4a7e6SDavid S. Miller 
1613c1b4a7e6SDavid S. Miller 	if (!tcp_nagle_test(tp, skb, cur_mss, nonagle))
1614c1b4a7e6SDavid S. Miller 		return 0;
1615c1b4a7e6SDavid S. Miller 
1616c1b4a7e6SDavid S. Miller 	cwnd_quota = tcp_cwnd_test(tp, skb);
1617056834d9SIlpo Järvinen 	if (cwnd_quota && !tcp_snd_wnd_test(tp, skb, cur_mss))
1618c1b4a7e6SDavid S. Miller 		cwnd_quota = 0;
1619c1b4a7e6SDavid S. Miller 
1620c1b4a7e6SDavid S. Miller 	return cwnd_quota;
1621c1b4a7e6SDavid S. Miller }
1622c1b4a7e6SDavid S. Miller 
162367edfef7SAndi Kleen /* Test if sending is allowed right now. */
1624a2a385d6SEric Dumazet bool tcp_may_send_now(struct sock *sk)
1625c1b4a7e6SDavid S. Miller {
1626cf533ea5SEric Dumazet 	const struct tcp_sock *tp = tcp_sk(sk);
1627fe067e8aSDavid S. Miller 	struct sk_buff *skb = tcp_send_head(sk);
1628c1b4a7e6SDavid S. Miller 
1629a02cec21SEric Dumazet 	return skb &&
16300c54b85fSIlpo Järvinen 		tcp_snd_test(sk, skb, tcp_current_mss(sk),
1631c1b4a7e6SDavid S. Miller 			     (tcp_skb_is_last(sk, skb) ?
1632a02cec21SEric Dumazet 			      tp->nonagle : TCP_NAGLE_PUSH));
1633c1b4a7e6SDavid S. Miller }
1634c1b4a7e6SDavid S. Miller 
1635c1b4a7e6SDavid S. Miller /* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
1636c1b4a7e6SDavid S. Miller  * which is put after SKB on the list.  It is very much like
1637c1b4a7e6SDavid S. Miller  * tcp_fragment() except that it may make several kinds of assumptions
1638c1b4a7e6SDavid S. Miller  * in order to speed up the splitting operation.  In particular, we
1639c1b4a7e6SDavid S. Miller  * know that all the data is in scatter-gather pages, and that the
1640c1b4a7e6SDavid S. Miller  * packet has never been sent out before (and thus is not cloned).
1641c1b4a7e6SDavid S. Miller  */
1642056834d9SIlpo Järvinen static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
1643c4ead4c5SEric Dumazet 			unsigned int mss_now, gfp_t gfp)
1644c1b4a7e6SDavid S. Miller {
1645c1b4a7e6SDavid S. Miller 	struct sk_buff *buff;
1646c1b4a7e6SDavid S. Miller 	int nlen = skb->len - len;
16479ce01461SIlpo Järvinen 	u8 flags;
1648c1b4a7e6SDavid S. Miller 
1649c1b4a7e6SDavid S. Miller 	/* All of a TSO frame must be composed of paged data.  */
1650c8ac3774SHerbert Xu 	if (skb->len != skb->data_len)
1651c8ac3774SHerbert Xu 		return tcp_fragment(sk, skb, len, mss_now);
1652c1b4a7e6SDavid S. Miller 
1653c4ead4c5SEric Dumazet 	buff = sk_stream_alloc_skb(sk, 0, gfp);
1654c1b4a7e6SDavid S. Miller 	if (unlikely(buff == NULL))
1655c1b4a7e6SDavid S. Miller 		return -ENOMEM;
1656c1b4a7e6SDavid S. Miller 
16573ab224beSHideo Aoki 	sk->sk_wmem_queued += buff->truesize;
16583ab224beSHideo Aoki 	sk_mem_charge(sk, buff->truesize);
1659b60b49eaSHerbert Xu 	buff->truesize += nlen;
1660c1b4a7e6SDavid S. Miller 	skb->truesize -= nlen;
1661c1b4a7e6SDavid S. Miller 
1662c1b4a7e6SDavid S. Miller 	/* Correct the sequence numbers. */
1663c1b4a7e6SDavid S. Miller 	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
1664c1b4a7e6SDavid S. Miller 	TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
1665c1b4a7e6SDavid S. Miller 	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
1666c1b4a7e6SDavid S. Miller 
1667c1b4a7e6SDavid S. Miller 	/* PSH and FIN should only be set in the second packet. */
16684de075e0SEric Dumazet 	flags = TCP_SKB_CB(skb)->tcp_flags;
16694de075e0SEric Dumazet 	TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
16704de075e0SEric Dumazet 	TCP_SKB_CB(buff)->tcp_flags = flags;
1671c1b4a7e6SDavid S. Miller 
1672c1b4a7e6SDavid S. Miller 	/* This packet was never sent out yet, so no SACK bits. */
1673c1b4a7e6SDavid S. Miller 	TCP_SKB_CB(buff)->sacked = 0;
1674c1b4a7e6SDavid S. Miller 
167584fa7933SPatrick McHardy 	buff->ip_summed = skb->ip_summed = CHECKSUM_PARTIAL;
1676c1b4a7e6SDavid S. Miller 	skb_split(skb, buff, len);
1677c1b4a7e6SDavid S. Miller 
1678c1b4a7e6SDavid S. Miller 	/* Fix up tso_factor for both original and new SKB.  */
1679846998aeSDavid S. Miller 	tcp_set_skb_tso_segs(sk, skb, mss_now);
1680846998aeSDavid S. Miller 	tcp_set_skb_tso_segs(sk, buff, mss_now);
1681c1b4a7e6SDavid S. Miller 
1682c1b4a7e6SDavid S. Miller 	/* Link BUFF into the send queue. */
1683c1b4a7e6SDavid S. Miller 	skb_header_release(buff);
1684fe067e8aSDavid S. Miller 	tcp_insert_write_queue_after(skb, buff, sk);
1685c1b4a7e6SDavid S. Miller 
1686c1b4a7e6SDavid S. Miller 	return 0;
1687c1b4a7e6SDavid S. Miller }
1688c1b4a7e6SDavid S. Miller 
1689c1b4a7e6SDavid S. Miller /* Try to defer sending, if possible, in order to minimize the amount
1690c1b4a7e6SDavid S. Miller  * of TSO splitting we do.  View it as a kind of TSO Nagle test.
1691c1b4a7e6SDavid S. Miller  *
1692c1b4a7e6SDavid S. Miller  * This algorithm is from John Heffner.
1693c1b4a7e6SDavid S. Miller  */
1694a2a385d6SEric Dumazet static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
1695c1b4a7e6SDavid S. Miller {
16969e412ba7SIlpo Järvinen 	struct tcp_sock *tp = tcp_sk(sk);
16976687e988SArnaldo Carvalho de Melo 	const struct inet_connection_sock *icsk = inet_csk(sk);
1698c1b4a7e6SDavid S. Miller 	u32 send_win, cong_win, limit, in_flight;
1699ad9f4f50SEric Dumazet 	int win_divisor;
1700c1b4a7e6SDavid S. Miller 
17014de075e0SEric Dumazet 	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
1702ae8064acSJohn Heffner 		goto send_now;
1703c1b4a7e6SDavid S. Miller 
17046687e988SArnaldo Carvalho de Melo 	if (icsk->icsk_ca_state != TCP_CA_Open)
1705ae8064acSJohn Heffner 		goto send_now;
1706ae8064acSJohn Heffner 
1707ae8064acSJohn Heffner 	/* Defer for less than two clock ticks. */
1708bd515c3eSIlpo Järvinen 	if (tp->tso_deferred &&
1709a2acde07SIlpo Järvinen 	    (((u32)jiffies << 1) >> 1) - (tp->tso_deferred >> 1) > 1)
1710ae8064acSJohn Heffner 		goto send_now;
1711908a75c1SDavid S. Miller 
1712c1b4a7e6SDavid S. Miller 	in_flight = tcp_packets_in_flight(tp);
1713c1b4a7e6SDavid S. Miller 
1714056834d9SIlpo Järvinen 	BUG_ON(tcp_skb_pcount(skb) <= 1 || (tp->snd_cwnd <= in_flight));
1715c1b4a7e6SDavid S. Miller 
171690840defSIlpo Järvinen 	send_win = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
1717c1b4a7e6SDavid S. Miller 
1718c1b4a7e6SDavid S. Miller 	/* From in_flight test above, we know that cwnd > in_flight.  */
1719c1b4a7e6SDavid S. Miller 	cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache;
1720c1b4a7e6SDavid S. Miller 
1721c1b4a7e6SDavid S. Miller 	limit = min(send_win, cong_win);
1722c1b4a7e6SDavid S. Miller 
1723ba244fe9SDavid S. Miller 	/* If a full-sized TSO skb can be sent, do it. */
172482cc1a7aSPeter P Waskiewicz Jr 	if (limit >= sk->sk_gso_max_size)
1725ae8064acSJohn Heffner 		goto send_now;
1726ba244fe9SDavid S. Miller 
172762ad2761SIlpo Järvinen 	/* Middle in queue won't get any more data, full sendable already? */
172862ad2761SIlpo Järvinen 	if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len))
172962ad2761SIlpo Järvinen 		goto send_now;
173062ad2761SIlpo Järvinen 
1731ad9f4f50SEric Dumazet 	win_divisor = ACCESS_ONCE(sysctl_tcp_tso_win_divisor);
1732ad9f4f50SEric Dumazet 	if (win_divisor) {
1733c1b4a7e6SDavid S. Miller 		u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
1734c1b4a7e6SDavid S. Miller 
1735c1b4a7e6SDavid S. Miller 		/* If at least some fraction of a window is available,
1736c1b4a7e6SDavid S. Miller 		 * just use it.
1737c1b4a7e6SDavid S. Miller 		 */
1738ad9f4f50SEric Dumazet 		chunk /= win_divisor;
1739c1b4a7e6SDavid S. Miller 		if (limit >= chunk)
1740ae8064acSJohn Heffner 			goto send_now;
1741c1b4a7e6SDavid S. Miller 	} else {
1742c1b4a7e6SDavid S. Miller 		/* Different approach, try not to defer past a single
1743c1b4a7e6SDavid S. Miller 		 * ACK.  Receiver should ACK every other full sized
1744c1b4a7e6SDavid S. Miller 		 * frame, so if we have space for more than 3 frames
1745c1b4a7e6SDavid S. Miller 		 * then send now.
1746c1b4a7e6SDavid S. Miller 		 */
17476b5a5c0dSNeal Cardwell 		if (limit > tcp_max_tso_deferred_mss(tp) * tp->mss_cache)
1748ae8064acSJohn Heffner 			goto send_now;
1749c1b4a7e6SDavid S. Miller 	}
1750c1b4a7e6SDavid S. Miller 
1751c1b4a7e6SDavid S. Miller 	/* Ok, it looks like it is advisable to defer.  */
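	/* Store the jiffies timestamp shifted left by one with bit 0 forced
	 * on, so the stored value is never zero (zero means "not deferring");
	 * the age test near the top of this function shifts it back.
	 */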
1752ae8064acSJohn Heffner 	tp->tso_deferred = 1 | (jiffies << 1);
1753ae8064acSJohn Heffner 
1754a2a385d6SEric Dumazet 	return true;
1755ae8064acSJohn Heffner 
1756ae8064acSJohn Heffner send_now:
1757ae8064acSJohn Heffner 	tp->tso_deferred = 0;
1758a2a385d6SEric Dumazet 	return false;
1759c1b4a7e6SDavid S. Miller }
1760c1b4a7e6SDavid S. Miller 
17615d424d5aSJohn Heffner /* Create a new MTU probe if we are ready.
176267edfef7SAndi Kleen  * MTU probing regularly attempts to increase the path MTU by
176367edfef7SAndi Kleen  * deliberately sending larger packets.  This discovers routing
176467edfef7SAndi Kleen  * changes resulting in larger path MTUs.
176567edfef7SAndi Kleen  *
17665d424d5aSJohn Heffner  * Returns 0 if we should wait to probe (no cwnd available),
17675d424d5aSJohn Heffner  *         1 if a probe was sent,
1768056834d9SIlpo Järvinen  *         -1 otherwise
1769056834d9SIlpo Järvinen  */
17705d424d5aSJohn Heffner static int tcp_mtu_probe(struct sock *sk)
17715d424d5aSJohn Heffner {
17725d424d5aSJohn Heffner 	struct tcp_sock *tp = tcp_sk(sk);
17735d424d5aSJohn Heffner 	struct inet_connection_sock *icsk = inet_csk(sk);
17745d424d5aSJohn Heffner 	struct sk_buff *skb, *nskb, *next;
17755d424d5aSJohn Heffner 	int len;
17765d424d5aSJohn Heffner 	int probe_size;
177791cc17c0SIlpo Järvinen 	int size_needed;
17785d424d5aSJohn Heffner 	int copy;
17795d424d5aSJohn Heffner 	int mss_now;
17805d424d5aSJohn Heffner 
17815d424d5aSJohn Heffner 	/* Not currently probing/verifying,
17825d424d5aSJohn Heffner 	 * not in recovery,
17835d424d5aSJohn Heffner 	 * have enough cwnd, and
17845d424d5aSJohn Heffner 	 * not SACKing (the variable headers throw things off) */
17855d424d5aSJohn Heffner 	if (!icsk->icsk_mtup.enabled ||
17865d424d5aSJohn Heffner 	    icsk->icsk_mtup.probe_size ||
17875d424d5aSJohn Heffner 	    inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
17885d424d5aSJohn Heffner 	    tp->snd_cwnd < 11 ||
1789cabeccbdSIlpo Järvinen 	    tp->rx_opt.num_sacks || tp->rx_opt.dsack)
17905d424d5aSJohn Heffner 		return -1;
17915d424d5aSJohn Heffner 
17925d424d5aSJohn Heffner 	/* Very simple search strategy: just double the MSS. */
17930c54b85fSIlpo Järvinen 	mss_now = tcp_current_mss(sk);
17945d424d5aSJohn Heffner 	probe_size = 2 * tp->mss_cache;
179591cc17c0SIlpo Järvinen 	size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache;
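	/* For example, with an mss_cache of 1460 and the default reordering
	 * of 3, probe_size is 2920 bytes and size_needed is
	 * 2920 + 4 * 1460 = 8760 bytes of queued, in-window data.
	 */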
17965d424d5aSJohn Heffner 	if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high)) {
17975d424d5aSJohn Heffner 		/* TODO: set timer for probe_converge_event */
17985d424d5aSJohn Heffner 		return -1;
17995d424d5aSJohn Heffner 	}
18005d424d5aSJohn Heffner 
18015d424d5aSJohn Heffner 	/* Have enough data in the send queue to probe? */
18027f9c33e5SIlpo Järvinen 	if (tp->write_seq - tp->snd_nxt < size_needed)
18035d424d5aSJohn Heffner 		return -1;
18045d424d5aSJohn Heffner 
180591cc17c0SIlpo Järvinen 	if (tp->snd_wnd < size_needed)
18065d424d5aSJohn Heffner 		return -1;
180790840defSIlpo Järvinen 	if (after(tp->snd_nxt + size_needed, tcp_wnd_end(tp)))
18085d424d5aSJohn Heffner 		return 0;
18095d424d5aSJohn Heffner 
1810d67c58e9SIlpo Järvinen 	/* Do we need to wait to drain cwnd? With none in flight, don't stall */
1811d67c58e9SIlpo Järvinen 	if (tcp_packets_in_flight(tp) + 2 > tp->snd_cwnd) {
1812d67c58e9SIlpo Järvinen 		if (!tcp_packets_in_flight(tp))
18135d424d5aSJohn Heffner 			return -1;
18145d424d5aSJohn Heffner 		else
18155d424d5aSJohn Heffner 			return 0;
18165d424d5aSJohn Heffner 	}
18175d424d5aSJohn Heffner 
18185d424d5aSJohn Heffner 	/* We're allowed to probe.  Build it now. */
18195d424d5aSJohn Heffner 	if ((nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC)) == NULL)
18205d424d5aSJohn Heffner 		return -1;
18213ab224beSHideo Aoki 	sk->sk_wmem_queued += nskb->truesize;
18223ab224beSHideo Aoki 	sk_mem_charge(sk, nskb->truesize);
18235d424d5aSJohn Heffner 
1824fe067e8aSDavid S. Miller 	skb = tcp_send_head(sk);
18255d424d5aSJohn Heffner 
18265d424d5aSJohn Heffner 	TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
18275d424d5aSJohn Heffner 	TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
18284de075e0SEric Dumazet 	TCP_SKB_CB(nskb)->tcp_flags = TCPHDR_ACK;
18295d424d5aSJohn Heffner 	TCP_SKB_CB(nskb)->sacked = 0;
18305d424d5aSJohn Heffner 	nskb->csum = 0;
183184fa7933SPatrick McHardy 	nskb->ip_summed = skb->ip_summed;
18325d424d5aSJohn Heffner 
183350c4817eSIlpo Järvinen 	tcp_insert_write_queue_before(nskb, skb, sk);
183450c4817eSIlpo Järvinen 
18355d424d5aSJohn Heffner 	len = 0;
1836234b6860SIlpo Järvinen 	tcp_for_write_queue_from_safe(skb, next, sk) {
18375d424d5aSJohn Heffner 		copy = min_t(int, skb->len, probe_size - len);
18385d424d5aSJohn Heffner 		if (nskb->ip_summed)
18395d424d5aSJohn Heffner 			skb_copy_bits(skb, 0, skb_put(nskb, copy), copy);
18405d424d5aSJohn Heffner 		else
18415d424d5aSJohn Heffner 			nskb->csum = skb_copy_and_csum_bits(skb, 0,
1842056834d9SIlpo Järvinen 							    skb_put(nskb, copy),
1843056834d9SIlpo Järvinen 							    copy, nskb->csum);
18445d424d5aSJohn Heffner 
18455d424d5aSJohn Heffner 		if (skb->len <= copy) {
18465d424d5aSJohn Heffner 			/* We've eaten all the data from this skb.
18475d424d5aSJohn Heffner 			 * Throw it away. */
18484de075e0SEric Dumazet 			TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1849fe067e8aSDavid S. Miller 			tcp_unlink_write_queue(skb, sk);
18503ab224beSHideo Aoki 			sk_wmem_free_skb(sk, skb);
18515d424d5aSJohn Heffner 		} else {
18524de075e0SEric Dumazet 			TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags &
1853a3433f35SChangli Gao 						   ~(TCPHDR_FIN|TCPHDR_PSH);
18545d424d5aSJohn Heffner 			if (!skb_shinfo(skb)->nr_frags) {
18555d424d5aSJohn Heffner 				skb_pull(skb, copy);
185684fa7933SPatrick McHardy 				if (skb->ip_summed != CHECKSUM_PARTIAL)
1857056834d9SIlpo Järvinen 					skb->csum = csum_partial(skb->data,
1858056834d9SIlpo Järvinen 								 skb->len, 0);
18595d424d5aSJohn Heffner 			} else {
18605d424d5aSJohn Heffner 				__pskb_trim_head(skb, copy);
18615d424d5aSJohn Heffner 				tcp_set_skb_tso_segs(sk, skb, mss_now);
18625d424d5aSJohn Heffner 			}
18635d424d5aSJohn Heffner 			TCP_SKB_CB(skb)->seq += copy;
18645d424d5aSJohn Heffner 		}
18655d424d5aSJohn Heffner 
18665d424d5aSJohn Heffner 		len += copy;
1867234b6860SIlpo Järvinen 
1868234b6860SIlpo Järvinen 		if (len >= probe_size)
1869234b6860SIlpo Järvinen 			break;
18705d424d5aSJohn Heffner 	}
18715d424d5aSJohn Heffner 	tcp_init_tso_segs(sk, nskb, nskb->len);
18725d424d5aSJohn Heffner 
18735d424d5aSJohn Heffner 	/* We're ready to send.  If this fails, the probe will
18745d424d5aSJohn Heffner 	 * be resegmented into mss-sized pieces by tcp_write_xmit(). */
18755d424d5aSJohn Heffner 	TCP_SKB_CB(nskb)->when = tcp_time_stamp;
18765d424d5aSJohn Heffner 	if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) {
18775d424d5aSJohn Heffner 		/* Decrement cwnd here because we are sending
18785d424d5aSJohn Heffner 		 * effectively two packets. */
18795d424d5aSJohn Heffner 		tp->snd_cwnd--;
188066f5fe62SIlpo Järvinen 		tcp_event_new_data_sent(sk, nskb);
18815d424d5aSJohn Heffner 
18825d424d5aSJohn Heffner 		icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len);
18830e7b1368SJohn Heffner 		tp->mtu_probe.probe_seq_start = TCP_SKB_CB(nskb)->seq;
18840e7b1368SJohn Heffner 		tp->mtu_probe.probe_seq_end = TCP_SKB_CB(nskb)->end_seq;
18855d424d5aSJohn Heffner 
18865d424d5aSJohn Heffner 		return 1;
18875d424d5aSJohn Heffner 	}
18885d424d5aSJohn Heffner 
18895d424d5aSJohn Heffner 	return -1;
18905d424d5aSJohn Heffner }
18915d424d5aSJohn Heffner 
18921da177e4SLinus Torvalds /* This routine writes packets to the network.  It advances the
18931da177e4SLinus Torvalds  * send_head.  This happens as incoming acks open up the remote
18941da177e4SLinus Torvalds  * window for us.
18951da177e4SLinus Torvalds  *
1896f8269a49SIlpo Järvinen  * LARGESEND note: !tcp_urg_mode is overkill, only frames between
1897f8269a49SIlpo Järvinen  * snd_up-64k-mss .. snd_up cannot be large. However, taking into
1898f8269a49SIlpo Järvinen  * account rare use of URG, this is not a big flaw.
1899f8269a49SIlpo Järvinen  *
1900a2a385d6SEric Dumazet  * Returns true, if no segments are in flight and we have queued segments,
1901a2a385d6SEric Dumazet  * but cannot send anything now because of SWS or another problem.
19021da177e4SLinus Torvalds  */
1903a2a385d6SEric Dumazet static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
1904d5dd9175SIlpo Järvinen 			   int push_one, gfp_t gfp)
19051da177e4SLinus Torvalds {
19061da177e4SLinus Torvalds 	struct tcp_sock *tp = tcp_sk(sk);
190792df7b51SDavid S. Miller 	struct sk_buff *skb;
1908c1b4a7e6SDavid S. Miller 	unsigned int tso_segs, sent_pkts;
1909c1b4a7e6SDavid S. Miller 	int cwnd_quota;
19105d424d5aSJohn Heffner 	int result;
19111da177e4SLinus Torvalds 
1912c1b4a7e6SDavid S. Miller 	sent_pkts = 0;
19135d424d5aSJohn Heffner 
1914d5dd9175SIlpo Järvinen 	if (!push_one) {
19155d424d5aSJohn Heffner 		/* Do MTU probing. */
1916d5dd9175SIlpo Järvinen 		result = tcp_mtu_probe(sk);
1917d5dd9175SIlpo Järvinen 		if (!result) {
1918a2a385d6SEric Dumazet 			return false;
19195d424d5aSJohn Heffner 		} else if (result > 0) {
19205d424d5aSJohn Heffner 			sent_pkts = 1;
19215d424d5aSJohn Heffner 		}
1922d5dd9175SIlpo Järvinen 	}
19235d424d5aSJohn Heffner 
1924fe067e8aSDavid S. Miller 	while ((skb = tcp_send_head(sk))) {
1925c8ac3774SHerbert Xu 		unsigned int limit;
1926c8ac3774SHerbert Xu 
1927*46d3ceabSEric Dumazet 
1928b68e9f85SHerbert Xu 		tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
1929c1b4a7e6SDavid S. Miller 		BUG_ON(!tso_segs);
1930c1b4a7e6SDavid S. Miller 
1931b68e9f85SHerbert Xu 		cwnd_quota = tcp_cwnd_test(tp, skb);
1932b68e9f85SHerbert Xu 		if (!cwnd_quota)
1933b68e9f85SHerbert Xu 			break;
1934b68e9f85SHerbert Xu 
1935b68e9f85SHerbert Xu 		if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now)))
1936b68e9f85SHerbert Xu 			break;
1937b68e9f85SHerbert Xu 
1938c1b4a7e6SDavid S. Miller 		if (tso_segs == 1) {
1939aa93466bSDavid S. Miller 			if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
1940aa93466bSDavid S. Miller 						     (tcp_skb_is_last(sk, skb) ?
1941aa93466bSDavid S. Miller 						      nonagle : TCP_NAGLE_PUSH))))
1942aa93466bSDavid S. Miller 				break;
1943c1b4a7e6SDavid S. Miller 		} else {
1944d5dd9175SIlpo Järvinen 			if (!push_one && tcp_tso_should_defer(sk, skb))
1945aa93466bSDavid S. Miller 				break;
1946c1b4a7e6SDavid S. Miller 		}
1947aa93466bSDavid S. Miller 
1948*46d3ceabSEric Dumazet 		/* TSQ : sk_wmem_alloc accounts skb truesize,
1949*46d3ceabSEric Dumazet 		 * including skb overhead. But that's OK.
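		 * When the limit is exceeded, TSQ_THROTTLED is set and the
		 * loop stops; the tcp_wfree() destructor installed in
		 * tcp_transmit_skb() is then expected to restart transmission
		 * once enough bytes have been freed.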
1950*46d3ceabSEric Dumazet 		 */
1951*46d3ceabSEric Dumazet 		if (atomic_read(&sk->sk_wmem_alloc) >= sysctl_tcp_limit_output_bytes) {
1952*46d3ceabSEric Dumazet 			set_bit(TSQ_THROTTLED, &tp->tsq_flags);
1953*46d3ceabSEric Dumazet 			break;
1954*46d3ceabSEric Dumazet 		}
1955c8ac3774SHerbert Xu 		limit = mss_now;
1956f8269a49SIlpo Järvinen 		if (tso_segs > 1 && !tcp_urg_mode(tp))
19570e3a4803SIlpo Järvinen 			limit = tcp_mss_split_point(sk, skb, mss_now,
19580e3a4803SIlpo Järvinen 						    cwnd_quota);
1959c8ac3774SHerbert Xu 
1960c8ac3774SHerbert Xu 		if (skb->len > limit &&
1961c4ead4c5SEric Dumazet 		    unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
19621da177e4SLinus Torvalds 			break;
19631da177e4SLinus Torvalds 
19641da177e4SLinus Torvalds 		TCP_SKB_CB(skb)->when = tcp_time_stamp;
1965c1b4a7e6SDavid S. Miller 
1966d5dd9175SIlpo Järvinen 		if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
19671da177e4SLinus Torvalds 			break;
19681da177e4SLinus Torvalds 
19691da177e4SLinus Torvalds 		/* Advance the send_head.  This one is sent out.
19701da177e4SLinus Torvalds 		 * This call will increment packets_out.
19711da177e4SLinus Torvalds 		 */
197266f5fe62SIlpo Järvinen 		tcp_event_new_data_sent(sk, skb);
19731da177e4SLinus Torvalds 
19741da177e4SLinus Torvalds 		tcp_minshall_update(tp, mss_now, skb);
1975a262f0cdSNandita Dukkipati 		sent_pkts += tcp_skb_pcount(skb);
1976d5dd9175SIlpo Järvinen 
1977d5dd9175SIlpo Järvinen 		if (push_one)
1978d5dd9175SIlpo Järvinen 			break;
19791da177e4SLinus Torvalds 	}
1980a262f0cdSNandita Dukkipati 	if (inet_csk(sk)->icsk_ca_state == TCP_CA_Recovery)
1981a262f0cdSNandita Dukkipati 		tp->prr_out += sent_pkts;
19821da177e4SLinus Torvalds 
1983aa93466bSDavid S. Miller 	if (likely(sent_pkts)) {
19849e412ba7SIlpo Järvinen 		tcp_cwnd_validate(sk);
1985a2a385d6SEric Dumazet 		return false;
19861da177e4SLinus Torvalds 	}
1987fe067e8aSDavid S. Miller 	return !tp->packets_out && tcp_send_head(sk);
19881da177e4SLinus Torvalds }
19891da177e4SLinus Torvalds 
1990a762a980SDavid S. Miller /* Push out any pending frames which were held back due to
1991a762a980SDavid S. Miller  * TCP_CORK or attempt at coalescing tiny packets.
1992a762a980SDavid S. Miller  * The socket must be locked by the caller.
1993a762a980SDavid S. Miller  */
19949e412ba7SIlpo Järvinen void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
19959e412ba7SIlpo Järvinen 			       int nonagle)
1996a762a980SDavid S. Miller {
1997726e07a8SIlpo Järvinen 	/* If we are closed, the bytes will have to remain here.
1998726e07a8SIlpo Järvinen 	 * In time closedown will finish, we empty the write queue and
1999726e07a8SIlpo Järvinen 	 * all will be happy.
2000726e07a8SIlpo Järvinen 	 */
2001726e07a8SIlpo Järvinen 	if (unlikely(sk->sk_state == TCP_CLOSE))
2002726e07a8SIlpo Järvinen 		return;
2003726e07a8SIlpo Järvinen 
2004d5dd9175SIlpo Järvinen 	if (tcp_write_xmit(sk, cur_mss, nonagle, 0, GFP_ATOMIC))
20059e412ba7SIlpo Järvinen 		tcp_check_probe_timer(sk);
2006a762a980SDavid S. Miller }
2007a762a980SDavid S. Miller 
2008c1b4a7e6SDavid S. Miller /* Send _single_ skb sitting at the send head. This function requires
2009c1b4a7e6SDavid S. Miller  * true push pending frames to setup probe timer etc.
2010c1b4a7e6SDavid S. Miller  */
2011c1b4a7e6SDavid S. Miller void tcp_push_one(struct sock *sk, unsigned int mss_now)
2012c1b4a7e6SDavid S. Miller {
2013fe067e8aSDavid S. Miller 	struct sk_buff *skb = tcp_send_head(sk);
2014c1b4a7e6SDavid S. Miller 
2015c1b4a7e6SDavid S. Miller 	BUG_ON(!skb || skb->len < mss_now);
2016c1b4a7e6SDavid S. Miller 
2017d5dd9175SIlpo Järvinen 	tcp_write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1, sk->sk_allocation);
2018c1b4a7e6SDavid S. Miller }
2019c1b4a7e6SDavid S. Miller 
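/* Illustrative sketch, not part of this file: how a hypothetical caller might
 * pick between the two push paths above.  __tcp_push_pending_frames() is the
 * general "uncork" path and also arms the zero-window probe timer when
 * nothing could be sent, while tcp_push_one() pushes exactly the skb at the
 * send head (which must exist and hold at least mss_now bytes).  The helper
 * name is made up for illustration.
 */
static inline void example_push(struct sock *sk, unsigned int mss_now,
				bool only_head)
{
	if (only_head)
		tcp_push_one(sk, mss_now);
	else
		__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF);
}
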
20201da177e4SLinus Torvalds /* This function returns the amount that we can raise the
20211da177e4SLinus Torvalds  * usable window based on the following constraints
20221da177e4SLinus Torvalds  *
20231da177e4SLinus Torvalds  * 1. The window can never be shrunk once it is offered (RFC 793)
20241da177e4SLinus Torvalds  * 2. We limit memory per socket
20251da177e4SLinus Torvalds  *
20261da177e4SLinus Torvalds  * RFC 1122:
20271da177e4SLinus Torvalds  * "the suggested [SWS] avoidance algorithm for the receiver is to keep
20281da177e4SLinus Torvalds  *  RCV.NXT + RCV.WND fixed until:
20291da177e4SLinus Torvalds  *  RCV.BUFF - RCV.USER - RCV.WND >= min(1/2 RCV.BUFF, MSS)"
20301da177e4SLinus Torvalds  *
20311da177e4SLinus Torvalds  * i.e. don't raise the right edge of the window until you can raise
20321da177e4SLinus Torvalds  * it at least MSS bytes.
20331da177e4SLinus Torvalds  *
20341da177e4SLinus Torvalds  * Unfortunately, the recommended algorithm breaks header prediction,
20351da177e4SLinus Torvalds  * since header prediction assumes th->window stays fixed.
20361da177e4SLinus Torvalds  *
20371da177e4SLinus Torvalds  * Strictly speaking, keeping th->window fixed violates the receiver
20381da177e4SLinus Torvalds  * side SWS prevention criteria. The problem is that under this rule
20391da177e4SLinus Torvalds  * a stream of single byte packets will cause the right side of the
20401da177e4SLinus Torvalds  * window to always advance by a single byte.
20411da177e4SLinus Torvalds  *
20421da177e4SLinus Torvalds  * Of course, if the sender implements sender side SWS prevention
20431da177e4SLinus Torvalds  * then this will not be a problem.
20441da177e4SLinus Torvalds  *
20451da177e4SLinus Torvalds  * BSD seems to make the following compromise:
20461da177e4SLinus Torvalds  *
20471da177e4SLinus Torvalds  *	If the free space is less than 1/4 of the maximum
20481da177e4SLinus Torvalds  *	space available and the free space is less than 1/2 mss,
20491da177e4SLinus Torvalds  *	then set the window to 0.
20501da177e4SLinus Torvalds  *	[ Actually, bsd uses MSS and 1/4 of maximal _window_ ]
20511da177e4SLinus Torvalds  *	Otherwise, just prevent the window from shrinking
20521da177e4SLinus Torvalds  *	and from being larger than the largest representable value.
20531da177e4SLinus Torvalds  *
20541da177e4SLinus Torvalds  * This prevents incremental opening of the window in the regime
20551da177e4SLinus Torvalds  * where TCP is limited by the speed of the reader side taking
20561da177e4SLinus Torvalds  * data out of the TCP receive queue. It does nothing about
20571da177e4SLinus Torvalds  * those cases where the window is constrained on the sender side
20581da177e4SLinus Torvalds  * because the pipeline is full.
20591da177e4SLinus Torvalds  *
20601da177e4SLinus Torvalds  * BSD also seems to "accidentally" limit itself to windows that are a
20611da177e4SLinus Torvalds  * multiple of MSS, at least until the free space gets quite small.
20621da177e4SLinus Torvalds  * This would appear to be a side effect of the mbuf implementation.
20631da177e4SLinus Torvalds  * Combining these two algorithms results in the observed behavior
20641da177e4SLinus Torvalds  * of having a fixed window size at almost all times.
20651da177e4SLinus Torvalds  *
20661da177e4SLinus Torvalds  * Below we obtain similar behavior by forcing the offered window to
20671da177e4SLinus Torvalds  * a multiple of the mss when it is feasible to do so.
20681da177e4SLinus Torvalds  *
20691da177e4SLinus Torvalds  * Note, we don't "adjust" for TIMESTAMP or SACK option bytes.
20701da177e4SLinus Torvalds  * Regular options like TIMESTAMP are taken into account.
20711da177e4SLinus Torvalds  */
20721da177e4SLinus Torvalds u32 __tcp_select_window(struct sock *sk)
20731da177e4SLinus Torvalds {
2074463c84b9SArnaldo Carvalho de Melo 	struct inet_connection_sock *icsk = inet_csk(sk);
20751da177e4SLinus Torvalds 	struct tcp_sock *tp = tcp_sk(sk);
2076caa20d9aSStephen Hemminger 	/* MSS for the peer's data.  Previous versions used mss_clamp
20771da177e4SLinus Torvalds 	 * here.  I don't know if the value based on our guesses
20781da177e4SLinus Torvalds 	 * of peer's MSS is better for the performance.  It's more correct
20791da177e4SLinus Torvalds 	 * but may be worse for the performance because of rcv_mss
20801da177e4SLinus Torvalds 	 * fluctuations.  --SAW  1998/11/1
20811da177e4SLinus Torvalds 	 */
2082463c84b9SArnaldo Carvalho de Melo 	int mss = icsk->icsk_ack.rcv_mss;
20831da177e4SLinus Torvalds 	int free_space = tcp_space(sk);
20841da177e4SLinus Torvalds 	int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk));
20851da177e4SLinus Torvalds 	int window;
20861da177e4SLinus Torvalds 
20871da177e4SLinus Torvalds 	if (mss > full_space)
20881da177e4SLinus Torvalds 		mss = full_space;
20891da177e4SLinus Torvalds 
2090b92edbe0SEric Dumazet 	if (free_space < (full_space >> 1)) {
2091463c84b9SArnaldo Carvalho de Melo 		icsk->icsk_ack.quick = 0;
20921da177e4SLinus Torvalds 
2093180d8cd9SGlauber Costa 		if (sk_under_memory_pressure(sk))
2094056834d9SIlpo Järvinen 			tp->rcv_ssthresh = min(tp->rcv_ssthresh,
2095056834d9SIlpo Järvinen 					       4U * tp->advmss);
20961da177e4SLinus Torvalds 
20971da177e4SLinus Torvalds 		if (free_space < mss)
20981da177e4SLinus Torvalds 			return 0;
20991da177e4SLinus Torvalds 	}
21001da177e4SLinus Torvalds 
21011da177e4SLinus Torvalds 	if (free_space > tp->rcv_ssthresh)
21021da177e4SLinus Torvalds 		free_space = tp->rcv_ssthresh;
21031da177e4SLinus Torvalds 
21041da177e4SLinus Torvalds 	/* Don't do rounding if we are using window scaling, since the
21051da177e4SLinus Torvalds 	 * scaled window will not line up with the MSS boundary anyway.
21061da177e4SLinus Torvalds 	 */
21071da177e4SLinus Torvalds 	window = tp->rcv_wnd;
21081da177e4SLinus Torvalds 	if (tp->rx_opt.rcv_wscale) {
21091da177e4SLinus Torvalds 		window = free_space;
21101da177e4SLinus Torvalds 
21111da177e4SLinus Torvalds 		/* Advertise enough space so that it won't get scaled away.
21121da177e4SLinus Torvalds 		 * Important case: prevent zero window announcement if
21131da177e4SLinus Torvalds 		 * 1<<rcv_wscale > mss.
21141da177e4SLinus Torvalds 		 */
21151da177e4SLinus Torvalds 		if (((window >> tp->rx_opt.rcv_wscale) << tp->rx_opt.rcv_wscale) != window)
21161da177e4SLinus Torvalds 			window = (((window >> tp->rx_opt.rcv_wscale) + 1)
21171da177e4SLinus Torvalds 				  << tp->rx_opt.rcv_wscale);
21181da177e4SLinus Torvalds 	} else {
21191da177e4SLinus Torvalds 		/* Get the largest window that is a nice multiple of mss.
21201da177e4SLinus Torvalds 		 * Window clamp already applied above.
21211da177e4SLinus Torvalds 		 * If our current window offering is within 1 mss of the
21221da177e4SLinus Torvalds 		 * free space we just keep it. This prevents the divide
21231da177e4SLinus Torvalds 		 * and multiply from happening most of the time.
21241da177e4SLinus Torvalds 		 * We also don't do any window rounding when the free space
21251da177e4SLinus Torvalds 		 * is too small.
21261da177e4SLinus Torvalds 		 */
21271da177e4SLinus Torvalds 		if (window <= free_space - mss || window > free_space)
21281da177e4SLinus Torvalds 			window = (free_space / mss) * mss;
212984565070SJohn Heffner 		else if (mss == full_space &&
2130b92edbe0SEric Dumazet 			 free_space > window + (full_space >> 1))
213184565070SJohn Heffner 			window = free_space;
21321da177e4SLinus Torvalds 	}
21331da177e4SLinus Torvalds 
21341da177e4SLinus Torvalds 	return window;
21351da177e4SLinus Torvalds }
21361da177e4SLinus Torvalds 
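/* Illustrative sketch, not part of this file: the two rounding rules used in
 * __tcp_select_window() above, taken in isolation.  Without window scaling
 * the offered window is rounded down to a multiple of the MSS, e.g.
 * free_space = 10000 and mss = 1460 give (10000 / 1460) * 1460 = 8760.  With
 * scaling, the window is instead rounded up to a multiple of
 * (1 << rcv_wscale) so that a small but non-zero window is not scaled away
 * to an apparent zero, e.g. 100 bytes with rcv_wscale = 7 become 128.  The
 * helper name and the numbers are hypothetical.
 */
static inline u32 example_round_window(u32 free_space, u32 mss, u8 rcv_wscale)
{
	if (!rcv_wscale)
		return (free_space / mss) * mss;

	if (((free_space >> rcv_wscale) << rcv_wscale) != free_space)
		free_space = ((free_space >> rcv_wscale) + 1) << rcv_wscale;
	return free_space;
}
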
21374a17fc3aSIlpo Järvinen /* Collapses two adjacent SKB's during retransmission. */
21384a17fc3aSIlpo Järvinen static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
21391da177e4SLinus Torvalds {
21401da177e4SLinus Torvalds 	struct tcp_sock *tp = tcp_sk(sk);
2141fe067e8aSDavid S. Miller 	struct sk_buff *next_skb = tcp_write_queue_next(sk, skb);
2142058dc334SIlpo Järvinen 	int skb_size, next_skb_size;
21431da177e4SLinus Torvalds 
2144058dc334SIlpo Järvinen 	skb_size = skb->len;
2145058dc334SIlpo Järvinen 	next_skb_size = next_skb->len;
21461da177e4SLinus Torvalds 
2147058dc334SIlpo Järvinen 	BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1);
21481da177e4SLinus Torvalds 
21496859d494SIlpo Järvinen 	tcp_highest_sack_combine(sk, next_skb, skb);
2150a6963a6bSIlpo Järvinen 
2151fe067e8aSDavid S. Miller 	tcp_unlink_write_queue(next_skb, sk);
21521da177e4SLinus Torvalds 
2153058dc334SIlpo Järvinen 	skb_copy_from_linear_data(next_skb, skb_put(skb, next_skb_size),
21541a4e2d09SArnaldo Carvalho de Melo 				  next_skb_size);
21551da177e4SLinus Torvalds 
215652d570aaSJarek Poplawski 	if (next_skb->ip_summed == CHECKSUM_PARTIAL)
215752d570aaSJarek Poplawski 		skb->ip_summed = CHECKSUM_PARTIAL;
21581da177e4SLinus Torvalds 
215984fa7933SPatrick McHardy 	if (skb->ip_summed != CHECKSUM_PARTIAL)
21601da177e4SLinus Torvalds 		skb->csum = csum_block_add(skb->csum, next_skb->csum, skb_size);
21611da177e4SLinus Torvalds 
21621da177e4SLinus Torvalds 	/* Update sequence range on original skb. */
21631da177e4SLinus Torvalds 	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;
21641da177e4SLinus Torvalds 
2165e6c7d085SIlpo Järvinen 	/* Merge over control information. This moves PSH/FIN etc. over */
21664de075e0SEric Dumazet 	TCP_SKB_CB(skb)->tcp_flags |= TCP_SKB_CB(next_skb)->tcp_flags;
21671da177e4SLinus Torvalds 
21681da177e4SLinus Torvalds 	/* All done, get rid of second SKB and account for it so
21691da177e4SLinus Torvalds 	 * packet counting does not break.
21701da177e4SLinus Torvalds 	 */
21714828e7f4SIlpo Järvinen 	TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked & TCPCB_EVER_RETRANS;
2172b7689205SIlpo Järvinen 
2173b7689205SIlpo Järvinen 	/* changed transmit queue under us so clear hints */
2174ef9da47cSIlpo Järvinen 	tcp_clear_retrans_hints_partial(tp);
2175ef9da47cSIlpo Järvinen 	if (next_skb == tp->retransmit_skb_hint)
2176ef9da47cSIlpo Järvinen 		tp->retransmit_skb_hint = skb;
2177b7689205SIlpo Järvinen 
2178797108d1SIlpo Järvinen 	tcp_adjust_pcount(sk, next_skb, tcp_skb_pcount(next_skb));
2179797108d1SIlpo Järvinen 
21803ab224beSHideo Aoki 	sk_wmem_free_skb(sk, next_skb);
21811da177e4SLinus Torvalds }
21821da177e4SLinus Torvalds 
218367edfef7SAndi Kleen /* Check if coalescing SKBs is legal. */
2184a2a385d6SEric Dumazet static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb)
21854a17fc3aSIlpo Järvinen {
21864a17fc3aSIlpo Järvinen 	if (tcp_skb_pcount(skb) > 1)
2187a2a385d6SEric Dumazet 		return false;
21884a17fc3aSIlpo Järvinen 	/* TODO: SACK collapsing could be used to remove this condition */
21894a17fc3aSIlpo Järvinen 	if (skb_shinfo(skb)->nr_frags != 0)
2190a2a385d6SEric Dumazet 		return false;
21914a17fc3aSIlpo Järvinen 	if (skb_cloned(skb))
2192a2a385d6SEric Dumazet 		return false;
21934a17fc3aSIlpo Järvinen 	if (skb == tcp_send_head(sk))
2194a2a385d6SEric Dumazet 		return false;
21954a17fc3aSIlpo Järvinen 	/* Some heuristics for collapsing over SACK'd could be invented */
21964a17fc3aSIlpo Järvinen 	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
2197a2a385d6SEric Dumazet 		return false;
21984a17fc3aSIlpo Järvinen 
2199a2a385d6SEric Dumazet 	return true;
22004a17fc3aSIlpo Järvinen }
22014a17fc3aSIlpo Järvinen 
220267edfef7SAndi Kleen /* Collapse packets in the retransmit queue to create
220367edfef7SAndi Kleen  * fewer packets on the wire. This is only done on retransmission.
220467edfef7SAndi Kleen  */
22054a17fc3aSIlpo Järvinen static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
22064a17fc3aSIlpo Järvinen 				     int space)
22074a17fc3aSIlpo Järvinen {
22084a17fc3aSIlpo Järvinen 	struct tcp_sock *tp = tcp_sk(sk);
22094a17fc3aSIlpo Järvinen 	struct sk_buff *skb = to, *tmp;
2210a2a385d6SEric Dumazet 	bool first = true;
22114a17fc3aSIlpo Järvinen 
22124a17fc3aSIlpo Järvinen 	if (!sysctl_tcp_retrans_collapse)
22134a17fc3aSIlpo Järvinen 		return;
22144de075e0SEric Dumazet 	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
22154a17fc3aSIlpo Järvinen 		return;
22164a17fc3aSIlpo Järvinen 
22174a17fc3aSIlpo Järvinen 	tcp_for_write_queue_from_safe(skb, tmp, sk) {
22184a17fc3aSIlpo Järvinen 		if (!tcp_can_collapse(sk, skb))
22194a17fc3aSIlpo Järvinen 			break;
22204a17fc3aSIlpo Järvinen 
22214a17fc3aSIlpo Järvinen 		space -= skb->len;
22224a17fc3aSIlpo Järvinen 
22234a17fc3aSIlpo Järvinen 		if (first) {
2224a2a385d6SEric Dumazet 			first = false;
22254a17fc3aSIlpo Järvinen 			continue;
22264a17fc3aSIlpo Järvinen 		}
22274a17fc3aSIlpo Järvinen 
22284a17fc3aSIlpo Järvinen 		if (space < 0)
22294a17fc3aSIlpo Järvinen 			break;
22304a17fc3aSIlpo Järvinen 		/* Punt if not enough space exists in the first SKB for
22314a17fc3aSIlpo Järvinen 		 * the data in the second
22324a17fc3aSIlpo Järvinen 		 */
2233a21d4572SEric Dumazet 		if (skb->len > skb_availroom(to))
22344a17fc3aSIlpo Järvinen 			break;
22354a17fc3aSIlpo Järvinen 
22364a17fc3aSIlpo Järvinen 		if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp)))
22374a17fc3aSIlpo Järvinen 			break;
22384a17fc3aSIlpo Järvinen 
22394a17fc3aSIlpo Järvinen 		tcp_collapse_retrans(sk, to);
22404a17fc3aSIlpo Järvinen 	}
22414a17fc3aSIlpo Järvinen }
22424a17fc3aSIlpo Järvinen 
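/* Illustrative sketch, not part of this file: the byte budget that bounds
 * tcp_retrans_try_collapse() above (other checks such as skb_availroom()
 * and the send window are omitted).  The budget starts at "space" (cur_mss
 * in the retransmit path); the head skb and every skb merged into it are
 * charged against it, and collapsing stops once the budget would go
 * negative.  E.g. with space = 1460 and queued lengths 500, 400, 300 and
 * 600, the 400- and 300-byte skbs are merged but the 600-byte one is not.
 * The helper name and the lengths are hypothetical.
 */
static inline int example_collapse_count(const int *len, int nr, int space)
{
	int merged = 0;
	int i;

	for (i = 0; i < nr; i++) {
		space -= len[i];
		if (i == 0)
			continue;	/* the head skb only consumes budget */
		if (space < 0)
			break;
		merged++;
	}
	return merged;
}
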
22431da177e4SLinus Torvalds /* This retransmits one SKB.  Policy decisions and retransmit queue
22441da177e4SLinus Torvalds  * state updates are done by the caller.  Returns non-zero if an
22451da177e4SLinus Torvalds  * error occurred which prevented the send.
22461da177e4SLinus Torvalds  */
22471da177e4SLinus Torvalds int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
22481da177e4SLinus Torvalds {
22491da177e4SLinus Torvalds 	struct tcp_sock *tp = tcp_sk(sk);
22505d424d5aSJohn Heffner 	struct inet_connection_sock *icsk = inet_csk(sk);
22517d227cd2SSridhar Samudrala 	unsigned int cur_mss;
22521da177e4SLinus Torvalds 	int err;
22531da177e4SLinus Torvalds 
22545d424d5aSJohn Heffner 	/* Inconclusive MTU probe */
22555d424d5aSJohn Heffner 	if (icsk->icsk_mtup.probe_size) {
22565d424d5aSJohn Heffner 		icsk->icsk_mtup.probe_size = 0;
22575d424d5aSJohn Heffner 	}
22585d424d5aSJohn Heffner 
22591da177e4SLinus Torvalds 	/* Do not send more than we queued. 1/4 is reserved for possible
2260caa20d9aSStephen Hemminger 	 * copying overhead: fragmentation, tunneling, mangling etc.
22611da177e4SLinus Torvalds 	 */
22621da177e4SLinus Torvalds 	if (atomic_read(&sk->sk_wmem_alloc) >
22631da177e4SLinus Torvalds 	    min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf))
22641da177e4SLinus Torvalds 		return -EAGAIN;
22651da177e4SLinus Torvalds 
22661da177e4SLinus Torvalds 	if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
22671da177e4SLinus Torvalds 		if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
22681da177e4SLinus Torvalds 			BUG();
22691da177e4SLinus Torvalds 		if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
22701da177e4SLinus Torvalds 			return -ENOMEM;
22711da177e4SLinus Torvalds 	}
22721da177e4SLinus Torvalds 
22737d227cd2SSridhar Samudrala 	if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
22747d227cd2SSridhar Samudrala 		return -EHOSTUNREACH; /* Routing failure or similar. */
22757d227cd2SSridhar Samudrala 
22760c54b85fSIlpo Järvinen 	cur_mss = tcp_current_mss(sk);
22777d227cd2SSridhar Samudrala 
22781da177e4SLinus Torvalds 	/* If receiver has shrunk his window, and skb is out of
22791da177e4SLinus Torvalds 	 * new window, do not retransmit it. The exception is the
22801da177e4SLinus Torvalds 	 * case when the window is shrunk to zero. In this case
22811da177e4SLinus Torvalds 	 * our retransmit serves as a zero window probe.
22821da177e4SLinus Torvalds 	 */
22839d4fb27dSJoe Perches 	if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp)) &&
22849d4fb27dSJoe Perches 	    TCP_SKB_CB(skb)->seq != tp->snd_una)
22851da177e4SLinus Torvalds 		return -EAGAIN;
22861da177e4SLinus Torvalds 
22871da177e4SLinus Torvalds 	if (skb->len > cur_mss) {
2288846998aeSDavid S. Miller 		if (tcp_fragment(sk, skb, cur_mss, cur_mss))
22891da177e4SLinus Torvalds 			return -ENOMEM; /* We'll try again later. */
229002276f3cSIlpo Järvinen 	} else {
22919eb9362eSIlpo Järvinen 		int oldpcount = tcp_skb_pcount(skb);
22929eb9362eSIlpo Järvinen 
22939eb9362eSIlpo Järvinen 		if (unlikely(oldpcount > 1)) {
229402276f3cSIlpo Järvinen 			tcp_init_tso_segs(sk, skb, cur_mss);
22959eb9362eSIlpo Järvinen 			tcp_adjust_pcount(sk, skb, oldpcount - tcp_skb_pcount(skb));
22969eb9362eSIlpo Järvinen 		}
22971da177e4SLinus Torvalds 	}
22981da177e4SLinus Torvalds 
22991da177e4SLinus Torvalds 	tcp_retrans_try_collapse(sk, skb, cur_mss);
23001da177e4SLinus Torvalds 
23011da177e4SLinus Torvalds 	/* Some Solaris stacks overoptimize and ignore the FIN on a
23021da177e4SLinus Torvalds 	 * retransmit when old data is attached.  So strip it off
23031da177e4SLinus Torvalds 	 * since it is cheap to do so and saves bytes on the network.
23041da177e4SLinus Torvalds 	 */
23051da177e4SLinus Torvalds 	if (skb->len > 0 &&
23064de075e0SEric Dumazet 	    (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) &&
23071da177e4SLinus Torvalds 	    tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
23081da177e4SLinus Torvalds 		if (!pskb_trim(skb, 0)) {
2309e870a8efSIlpo Järvinen 			/* Reuse, even though it does some unnecessary work */
2310e870a8efSIlpo Järvinen 			tcp_init_nondata_skb(skb, TCP_SKB_CB(skb)->end_seq - 1,
23114de075e0SEric Dumazet 					     TCP_SKB_CB(skb)->tcp_flags);
23121da177e4SLinus Torvalds 			skb->ip_summed = CHECKSUM_NONE;
23131da177e4SLinus Torvalds 		}
23141da177e4SLinus Torvalds 	}
23151da177e4SLinus Torvalds 
23161da177e4SLinus Torvalds 	/* Make a copy, if the first transmission SKB clone we made
23171da177e4SLinus Torvalds 	 * is still in somebody's hands, else make a clone.
23181da177e4SLinus Torvalds 	 */
23191da177e4SLinus Torvalds 	TCP_SKB_CB(skb)->when = tcp_time_stamp;
23201da177e4SLinus Torvalds 
2321117632e6SEric Dumazet 	/* make sure skb->data is aligned on arches that require it */
2322117632e6SEric Dumazet 	if (unlikely(NET_IP_ALIGN && ((unsigned long)skb->data & 3))) {
2323117632e6SEric Dumazet 		struct sk_buff *nskb = __pskb_copy(skb, MAX_TCP_HEADER,
2324117632e6SEric Dumazet 						   GFP_ATOMIC);
2325117632e6SEric Dumazet 		err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
2326117632e6SEric Dumazet 			     -ENOBUFS;
2327117632e6SEric Dumazet 	} else {
2328dfb4b9dcSDavid S. Miller 		err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
2329117632e6SEric Dumazet 	}
23301da177e4SLinus Torvalds 
23311da177e4SLinus Torvalds 	if (err == 0) {
23321da177e4SLinus Torvalds 		/* Update global TCP statistics. */
233381cc8a75SPavel Emelyanov 		TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
23341da177e4SLinus Torvalds 
23351da177e4SLinus Torvalds 		tp->total_retrans++;
23361da177e4SLinus Torvalds 
23371da177e4SLinus Torvalds #if FASTRETRANS_DEBUG > 0
23381da177e4SLinus Torvalds 		if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
2339e87cc472SJoe Perches 			net_dbg_ratelimited("retrans_out leaked\n");
23401da177e4SLinus Torvalds 		}
23411da177e4SLinus Torvalds #endif
2342b08d6cb2SIlpo Järvinen 		if (!tp->retrans_out)
2343b08d6cb2SIlpo Järvinen 			tp->lost_retrans_low = tp->snd_nxt;
23441da177e4SLinus Torvalds 		TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
23451da177e4SLinus Torvalds 		tp->retrans_out += tcp_skb_pcount(skb);
23461da177e4SLinus Torvalds 
23471da177e4SLinus Torvalds 		/* Save stamp of the first retransmit. */
23481da177e4SLinus Torvalds 		if (!tp->retrans_stamp)
23491da177e4SLinus Torvalds 			tp->retrans_stamp = TCP_SKB_CB(skb)->when;
23501da177e4SLinus Torvalds 
2351c24f691bSYuchung Cheng 		tp->undo_retrans += tcp_skb_pcount(skb);
23521da177e4SLinus Torvalds 
23531da177e4SLinus Torvalds 		/* snd_nxt is stored to detect loss of retransmitted segment,
23541da177e4SLinus Torvalds 		 * see tcp_input.c tcp_sacktag_write_queue().
23551da177e4SLinus Torvalds 		 */
23561da177e4SLinus Torvalds 		TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt;
23571da177e4SLinus Torvalds 	}
23581da177e4SLinus Torvalds 	return err;
23591da177e4SLinus Torvalds }
23601da177e4SLinus Torvalds 
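/* Illustrative sketch, not part of this file: the send-buffer budget test at
 * the top of tcp_retransmit_skb() above.  A retransmit is refused with
 * -EAGAIN once allocated write memory exceeds the smaller of sk_sndbuf and
 * 1.25 * sk_wmem_queued; the extra quarter is headroom for copying overhead
 * (fragmentation, tunneling, mangling).  E.g. sk_wmem_queued = 100000 and
 * sk_sndbuf = 200000 give a limit of 125000 bytes.  The helper name and the
 * numbers are hypothetical.
 */
static inline bool example_retrans_within_budget(int wmem_alloc,
						 int wmem_queued, int sndbuf)
{
	return wmem_alloc <= min(wmem_queued + (wmem_queued >> 2), sndbuf);
}
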
236167edfef7SAndi Kleen /* Check if forward retransmits are possible in the current
236267edfef7SAndi Kleen  * window/congestion state.
236367edfef7SAndi Kleen  */
2364a2a385d6SEric Dumazet static bool tcp_can_forward_retransmit(struct sock *sk)
2365b5afe7bcSIlpo Järvinen {
2366b5afe7bcSIlpo Järvinen 	const struct inet_connection_sock *icsk = inet_csk(sk);
2367cf533ea5SEric Dumazet 	const struct tcp_sock *tp = tcp_sk(sk);
2368b5afe7bcSIlpo Järvinen 
2369b5afe7bcSIlpo Järvinen 	/* Forward retransmissions are possible only during Recovery. */
2370b5afe7bcSIlpo Järvinen 	if (icsk->icsk_ca_state != TCP_CA_Recovery)
2371a2a385d6SEric Dumazet 		return false;
2372b5afe7bcSIlpo Järvinen 
2373b5afe7bcSIlpo Järvinen 	/* No forward retransmissions in Reno are possible. */
2374b5afe7bcSIlpo Järvinen 	if (tcp_is_reno(tp))
2375a2a385d6SEric Dumazet 		return false;
2376b5afe7bcSIlpo Järvinen 
2377b5afe7bcSIlpo Järvinen 	/* Yeah, we have to make a difficult choice between forward transmission
2378b5afe7bcSIlpo Järvinen 	 * and retransmission... Both ways have their merits...
2379b5afe7bcSIlpo Järvinen 	 *
2380b5afe7bcSIlpo Järvinen 	 * For now we do not retransmit anything, while we have some new
2381b5afe7bcSIlpo Järvinen 	 * segments to send. In the other cases, follow rule 3 for
2382b5afe7bcSIlpo Järvinen 	 * NextSeg() specified in RFC3517.
2383b5afe7bcSIlpo Järvinen 	 */
2384b5afe7bcSIlpo Järvinen 
2385b5afe7bcSIlpo Järvinen 	if (tcp_may_send_now(sk))
2386a2a385d6SEric Dumazet 		return false;
2387b5afe7bcSIlpo Järvinen 
2388a2a385d6SEric Dumazet 	return true;
2389b5afe7bcSIlpo Järvinen }
2390b5afe7bcSIlpo Järvinen 
23911da177e4SLinus Torvalds /* This gets called after a retransmit timeout, and the initially
23921da177e4SLinus Torvalds  * retransmitted data is acknowledged.  It tries to continue
23931da177e4SLinus Torvalds  * resending the rest of the retransmit queue, until either
23941da177e4SLinus Torvalds  * we've sent it all or the congestion window limit is reached.
23951da177e4SLinus Torvalds  * If doing SACK, the first ACK which comes back for a timeout
23961da177e4SLinus Torvalds  * based retransmit packet might feed us FACK information again.
23971da177e4SLinus Torvalds  * If so, we use it to avoid unnecessary retransmissions.
23981da177e4SLinus Torvalds  */
23991da177e4SLinus Torvalds void tcp_xmit_retransmit_queue(struct sock *sk)
24001da177e4SLinus Torvalds {
24016687e988SArnaldo Carvalho de Melo 	const struct inet_connection_sock *icsk = inet_csk(sk);
24021da177e4SLinus Torvalds 	struct tcp_sock *tp = tcp_sk(sk);
24031da177e4SLinus Torvalds 	struct sk_buff *skb;
24040e1c54c2SIlpo Järvinen 	struct sk_buff *hole = NULL;
2405618d9f25SIlpo Järvinen 	u32 last_lost;
240661eb55f4SIlpo Järvinen 	int mib_idx;
24070e1c54c2SIlpo Järvinen 	int fwd_rexmitting = 0;
24086a438bbeSStephen Hemminger 
240945e77d31SIlpo Järvinen 	if (!tp->packets_out)
241045e77d31SIlpo Järvinen 		return;
241145e77d31SIlpo Järvinen 
241208ebd172SIlpo Järvinen 	if (!tp->lost_out)
241308ebd172SIlpo Järvinen 		tp->retransmit_high = tp->snd_una;
241408ebd172SIlpo Järvinen 
2415618d9f25SIlpo Järvinen 	if (tp->retransmit_skb_hint) {
24166a438bbeSStephen Hemminger 		skb = tp->retransmit_skb_hint;
2417618d9f25SIlpo Järvinen 		last_lost = TCP_SKB_CB(skb)->end_seq;
2418618d9f25SIlpo Järvinen 		if (after(last_lost, tp->retransmit_high))
2419618d9f25SIlpo Järvinen 			last_lost = tp->retransmit_high;
2420618d9f25SIlpo Järvinen 	} else {
2421fe067e8aSDavid S. Miller 		skb = tcp_write_queue_head(sk);
2422618d9f25SIlpo Järvinen 		last_lost = tp->snd_una;
2423618d9f25SIlpo Järvinen 	}
24241da177e4SLinus Torvalds 
2425fe067e8aSDavid S. Miller 	tcp_for_write_queue_from(skb, sk) {
24261da177e4SLinus Torvalds 		__u8 sacked = TCP_SKB_CB(skb)->sacked;
24271da177e4SLinus Torvalds 
2428fe067e8aSDavid S. Miller 		if (skb == tcp_send_head(sk))
2429fe067e8aSDavid S. Miller 			break;
24306a438bbeSStephen Hemminger 		/* we could do better than to assign each time */
24310e1c54c2SIlpo Järvinen 		if (hole == NULL)
24326a438bbeSStephen Hemminger 			tp->retransmit_skb_hint = skb;
24336a438bbeSStephen Hemminger 
24341da177e4SLinus Torvalds 		/* Assume this retransmit will generate
24351da177e4SLinus Torvalds 		 * only one packet for congestion window
24361da177e4SLinus Torvalds 		 * calculation purposes.  This works because
24371da177e4SLinus Torvalds 		 * tcp_retransmit_skb() will chop up the
24381da177e4SLinus Torvalds 		 * packet to be MSS sized and all the
24391da177e4SLinus Torvalds 		 * packet counting works out.
24401da177e4SLinus Torvalds 		 */
24411da177e4SLinus Torvalds 		if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
24421da177e4SLinus Torvalds 			return;
24430e1c54c2SIlpo Järvinen 
24440e1c54c2SIlpo Järvinen 		if (fwd_rexmitting) {
24450e1c54c2SIlpo Järvinen begin_fwd:
24460e1c54c2SIlpo Järvinen 			if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp)))
2447006f582cSIlpo Järvinen 				break;
24480e1c54c2SIlpo Järvinen 			mib_idx = LINUX_MIB_TCPFORWARDRETRANS;
24490e1c54c2SIlpo Järvinen 
24500e1c54c2SIlpo Järvinen 		} else if (!before(TCP_SKB_CB(skb)->seq, tp->retransmit_high)) {
2451618d9f25SIlpo Järvinen 			tp->retransmit_high = last_lost;
24520e1c54c2SIlpo Järvinen 			if (!tcp_can_forward_retransmit(sk))
24530e1c54c2SIlpo Järvinen 				break;
24540e1c54c2SIlpo Järvinen 			/* Backtrack if necessary to non-L'ed skb */
24550e1c54c2SIlpo Järvinen 			if (hole != NULL) {
24560e1c54c2SIlpo Järvinen 				skb = hole;
24570e1c54c2SIlpo Järvinen 				hole = NULL;
24580e1c54c2SIlpo Järvinen 			}
24590e1c54c2SIlpo Järvinen 			fwd_rexmitting = 1;
24600e1c54c2SIlpo Järvinen 			goto begin_fwd;
24610e1c54c2SIlpo Järvinen 
24620e1c54c2SIlpo Järvinen 		} else if (!(sacked & TCPCB_LOST)) {
2463ac11ba75SIlpo Järvinen 			if (hole == NULL && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED)))
24640e1c54c2SIlpo Järvinen 				hole = skb;
246561eb55f4SIlpo Järvinen 			continue;
24661da177e4SLinus Torvalds 
24670e1c54c2SIlpo Järvinen 		} else {
2468618d9f25SIlpo Järvinen 			last_lost = TCP_SKB_CB(skb)->end_seq;
24690e1c54c2SIlpo Järvinen 			if (icsk->icsk_ca_state != TCP_CA_Loss)
24700e1c54c2SIlpo Järvinen 				mib_idx = LINUX_MIB_TCPFASTRETRANS;
24710e1c54c2SIlpo Järvinen 			else
24720e1c54c2SIlpo Järvinen 				mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS;
24730e1c54c2SIlpo Järvinen 		}
24740e1c54c2SIlpo Järvinen 
24750e1c54c2SIlpo Järvinen 		if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))
247661eb55f4SIlpo Järvinen 			continue;
247740b215e5SPavel Emelyanov 
247809e9b813SEric Dumazet 		if (tcp_retransmit_skb(sk, skb)) {
247909e9b813SEric Dumazet 			NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
24801da177e4SLinus Torvalds 			return;
248109e9b813SEric Dumazet 		}
2482de0744afSPavel Emelyanov 		NET_INC_STATS_BH(sock_net(sk), mib_idx);
24831da177e4SLinus Torvalds 
2484a262f0cdSNandita Dukkipati 		if (inet_csk(sk)->icsk_ca_state == TCP_CA_Recovery)
2485a262f0cdSNandita Dukkipati 			tp->prr_out += tcp_skb_pcount(skb);
2486a262f0cdSNandita Dukkipati 
2487fe067e8aSDavid S. Miller 		if (skb == tcp_write_queue_head(sk))
2488463c84b9SArnaldo Carvalho de Melo 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
24893f421baaSArnaldo Carvalho de Melo 						  inet_csk(sk)->icsk_rto,
24903f421baaSArnaldo Carvalho de Melo 						  TCP_RTO_MAX);
24911da177e4SLinus Torvalds 	}
24921da177e4SLinus Torvalds }
24931da177e4SLinus Torvalds 
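/* Illustrative sketch, not part of this file: the per-skb decision order of
 * tcp_xmit_retransmit_queue() above, reduced to a simple classification.
 * Segments already SACKed or already retransmitted are skipped, lost
 * segments are retransmitted, and the first remaining segment is remembered
 * as the "hole" to back up to when forward retransmission starts.  The enum
 * and helper name are made up for illustration; the TCPCB_* bits are real.
 */
enum example_rexmit_class { EX_SKIP, EX_RETRANSMIT, EX_HOLE_CANDIDATE };

static inline enum example_rexmit_class example_classify(u8 sacked)
{
	if (sacked & (TCPCB_SACKED_ACKED | TCPCB_SACKED_RETRANS))
		return EX_SKIP;
	if (sacked & TCPCB_LOST)
		return EX_RETRANSMIT;
	return EX_HOLE_CANDIDATE;
}
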
24941da177e4SLinus Torvalds /* Send a fin.  The caller locks the socket for us.  This cannot be
24951da177e4SLinus Torvalds  * allowed to fail queueing a FIN frame under any circumstances.
24961da177e4SLinus Torvalds  */
24971da177e4SLinus Torvalds void tcp_send_fin(struct sock *sk)
24981da177e4SLinus Torvalds {
24991da177e4SLinus Torvalds 	struct tcp_sock *tp = tcp_sk(sk);
2500fe067e8aSDavid S. Miller 	struct sk_buff *skb = tcp_write_queue_tail(sk);
25011da177e4SLinus Torvalds 	int mss_now;
25021da177e4SLinus Torvalds 
25031da177e4SLinus Torvalds 	/* Optimization, tack on the FIN if we have a queue of
25041da177e4SLinus Torvalds 	 * unsent frames.  But be careful about outgoing SACKS
25051da177e4SLinus Torvalds 	 * and IP options.
25061da177e4SLinus Torvalds 	 */
25070c54b85fSIlpo Järvinen 	mss_now = tcp_current_mss(sk);
25081da177e4SLinus Torvalds 
2509fe067e8aSDavid S. Miller 	if (tcp_send_head(sk) != NULL) {
25104de075e0SEric Dumazet 		TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN;
25111da177e4SLinus Torvalds 		TCP_SKB_CB(skb)->end_seq++;
25121da177e4SLinus Torvalds 		tp->write_seq++;
25131da177e4SLinus Torvalds 	} else {
25141da177e4SLinus Torvalds 		/* Socket is locked, keep trying until memory is available. */
25151da177e4SLinus Torvalds 		for (;;) {
2516aa133076SWu Fengguang 			skb = alloc_skb_fclone(MAX_TCP_HEADER,
2517aa133076SWu Fengguang 					       sk->sk_allocation);
25181da177e4SLinus Torvalds 			if (skb)
25191da177e4SLinus Torvalds 				break;
25201da177e4SLinus Torvalds 			yield();
25211da177e4SLinus Torvalds 		}
25221da177e4SLinus Torvalds 
25231da177e4SLinus Torvalds 		/* Reserve space for headers and prepare control bits. */
25241da177e4SLinus Torvalds 		skb_reserve(skb, MAX_TCP_HEADER);
25251da177e4SLinus Torvalds 		/* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
2526e870a8efSIlpo Järvinen 		tcp_init_nondata_skb(skb, tp->write_seq,
2527a3433f35SChangli Gao 				     TCPHDR_ACK | TCPHDR_FIN);
25281da177e4SLinus Torvalds 		tcp_queue_skb(sk, skb);
25291da177e4SLinus Torvalds 	}
25309e412ba7SIlpo Järvinen 	__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF);
25311da177e4SLinus Torvalds }
25321da177e4SLinus Torvalds 
25331da177e4SLinus Torvalds /* We get here when a process closes a file descriptor (either due to
25341da177e4SLinus Torvalds  * an explicit close() or as a byproduct of exit()'ing) and there
25351da177e4SLinus Torvalds  * was unread data in the receive queue.  This behavior is recommended
253665bb723cSGerrit Renker  * by RFC 2525, section 2.17.  -DaveM
25371da177e4SLinus Torvalds  */
2538dd0fc66fSAl Viro void tcp_send_active_reset(struct sock *sk, gfp_t priority)
25391da177e4SLinus Torvalds {
25401da177e4SLinus Torvalds 	struct sk_buff *skb;
25411da177e4SLinus Torvalds 
25421da177e4SLinus Torvalds 	/* NOTE: No TCP options attached and we never retransmit this. */
25431da177e4SLinus Torvalds 	skb = alloc_skb(MAX_TCP_HEADER, priority);
25441da177e4SLinus Torvalds 	if (!skb) {
25454e673444SPavel Emelyanov 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
25461da177e4SLinus Torvalds 		return;
25471da177e4SLinus Torvalds 	}
25481da177e4SLinus Torvalds 
25491da177e4SLinus Torvalds 	/* Reserve space for headers and prepare control bits. */
25501da177e4SLinus Torvalds 	skb_reserve(skb, MAX_TCP_HEADER);
2551e870a8efSIlpo Järvinen 	tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk),
2552a3433f35SChangli Gao 			     TCPHDR_ACK | TCPHDR_RST);
25531da177e4SLinus Torvalds 	/* Send it off. */
25541da177e4SLinus Torvalds 	TCP_SKB_CB(skb)->when = tcp_time_stamp;
2555dfb4b9dcSDavid S. Miller 	if (tcp_transmit_skb(sk, skb, 0, priority))
25564e673444SPavel Emelyanov 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
255726af65cbSSridhar Samudrala 
255881cc8a75SPavel Emelyanov 	TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS);
25591da177e4SLinus Torvalds }
25601da177e4SLinus Torvalds 
256167edfef7SAndi Kleen /* Send a crossed SYN-ACK during socket establishment.
256267edfef7SAndi Kleen  * WARNING: This routine must only be called when we have already sent
25631da177e4SLinus Torvalds  * a SYN packet that crossed the incoming SYN that caused this routine
25641da177e4SLinus Torvalds  * to get called. If this assumption fails then the initial rcv_wnd
25651da177e4SLinus Torvalds  * and rcv_wscale values will not be correct.
25661da177e4SLinus Torvalds  */
25671da177e4SLinus Torvalds int tcp_send_synack(struct sock *sk)
25681da177e4SLinus Torvalds {
25691da177e4SLinus Torvalds 	struct sk_buff *skb;
25701da177e4SLinus Torvalds 
2571fe067e8aSDavid S. Miller 	skb = tcp_write_queue_head(sk);
25724de075e0SEric Dumazet 	if (skb == NULL || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
257391df42beSJoe Perches 		pr_debug("%s: wrong queue state\n", __func__);
25741da177e4SLinus Torvalds 		return -EFAULT;
25751da177e4SLinus Torvalds 	}
25764de075e0SEric Dumazet 	if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) {
25771da177e4SLinus Torvalds 		if (skb_cloned(skb)) {
25781da177e4SLinus Torvalds 			struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
25791da177e4SLinus Torvalds 			if (nskb == NULL)
25801da177e4SLinus Torvalds 				return -ENOMEM;
2581fe067e8aSDavid S. Miller 			tcp_unlink_write_queue(skb, sk);
25821da177e4SLinus Torvalds 			skb_header_release(nskb);
2583fe067e8aSDavid S. Miller 			__tcp_add_write_queue_head(sk, nskb);
25843ab224beSHideo Aoki 			sk_wmem_free_skb(sk, skb);
25853ab224beSHideo Aoki 			sk->sk_wmem_queued += nskb->truesize;
25863ab224beSHideo Aoki 			sk_mem_charge(sk, nskb->truesize);
25871da177e4SLinus Torvalds 			skb = nskb;
25881da177e4SLinus Torvalds 		}
25891da177e4SLinus Torvalds 
25904de075e0SEric Dumazet 		TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK;
25911da177e4SLinus Torvalds 		TCP_ECN_send_synack(tcp_sk(sk), skb);
25921da177e4SLinus Torvalds 	}
25931da177e4SLinus Torvalds 	TCP_SKB_CB(skb)->when = tcp_time_stamp;
2594dfb4b9dcSDavid S. Miller 	return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
25951da177e4SLinus Torvalds }
25961da177e4SLinus Torvalds 
25974aea39c1SEric Dumazet /**
25984aea39c1SEric Dumazet  * tcp_make_synack - Prepare a SYN-ACK.
25994aea39c1SEric Dumazet  * @sk: listener socket
26004aea39c1SEric Dumazet  * @dst: dst entry attached to the SYNACK
26014aea39c1SEric Dumazet  * @req: request_sock pointer
26024aea39c1SEric Dumazet  * @rvp: request_values pointer
26034aea39c1SEric Dumazet  *
26044aea39c1SEric Dumazet  * Allocate one skb and build a SYNACK packet.
26054aea39c1SEric Dumazet  * @dst is consumed : Caller should not use it again.
26064aea39c1SEric Dumazet  */
26071da177e4SLinus Torvalds struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2608e6b4d113SWilliam Allen Simpson 				struct request_sock *req,
2609e6b4d113SWilliam Allen Simpson 				struct request_values *rvp)
26101da177e4SLinus Torvalds {
2611bd0388aeSWilliam Allen Simpson 	struct tcp_out_options opts;
26124957faadSWilliam Allen Simpson 	struct tcp_extend_values *xvp = tcp_xv(rvp);
26132e6599cbSArnaldo Carvalho de Melo 	struct inet_request_sock *ireq = inet_rsk(req);
26141da177e4SLinus Torvalds 	struct tcp_sock *tp = tcp_sk(sk);
261528b2774aSEric Dumazet 	const struct tcp_cookie_values *cvp = tp->cookie_values;
26161da177e4SLinus Torvalds 	struct tcphdr *th;
26171da177e4SLinus Torvalds 	struct sk_buff *skb;
2618cfb6eeb4SYOSHIFUJI Hideaki 	struct tcp_md5sig_key *md5;
2619bd0388aeSWilliam Allen Simpson 	int tcp_header_size;
2620f5fff5dcSTom Quetchenbach 	int mss;
262128b2774aSEric Dumazet 	int s_data_desired = 0;
26221da177e4SLinus Torvalds 
262328b2774aSEric Dumazet 	if (cvp != NULL && cvp->s_data_constant && cvp->s_data_desired)
262428b2774aSEric Dumazet 		s_data_desired = cvp->s_data_desired;
262590ba9b19SEric Dumazet 	skb = alloc_skb(MAX_TCP_HEADER + 15 + s_data_desired, GFP_ATOMIC);
26264aea39c1SEric Dumazet 	if (unlikely(!skb)) {
26274aea39c1SEric Dumazet 		dst_release(dst);
26281da177e4SLinus Torvalds 		return NULL;
26294aea39c1SEric Dumazet 	}
26301da177e4SLinus Torvalds 	/* Reserve space for headers. */
26311da177e4SLinus Torvalds 	skb_reserve(skb, MAX_TCP_HEADER);
26321da177e4SLinus Torvalds 
26334aea39c1SEric Dumazet 	skb_dst_set(skb, dst);
26341da177e4SLinus Torvalds 
26350dbaee3bSDavid S. Miller 	mss = dst_metric_advmss(dst);
2636f5fff5dcSTom Quetchenbach 	if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
2637f5fff5dcSTom Quetchenbach 		mss = tp->rx_opt.user_mss;
2638f5fff5dcSTom Quetchenbach 
263933ad798cSAdam Langley 	if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
264033ad798cSAdam Langley 		__u8 rcv_wscale;
264133ad798cSAdam Langley 		/* Set this up on the first call only */
264233ad798cSAdam Langley 		req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
2643e88c64f0SHagen Paul Pfeifer 
2644e88c64f0SHagen Paul Pfeifer 		/* limit the window selection if the user enforce a smaller rx buffer */
2645e88c64f0SHagen Paul Pfeifer 		if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
2646e88c64f0SHagen Paul Pfeifer 		    (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0))
2647e88c64f0SHagen Paul Pfeifer 			req->window_clamp = tcp_full_space(sk);
2648e88c64f0SHagen Paul Pfeifer 
264933ad798cSAdam Langley 		/* tcp_full_space because it is guaranteed to be the first packet */
265033ad798cSAdam Langley 		tcp_select_initial_window(tcp_full_space(sk),
2651f5fff5dcSTom Quetchenbach 			mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
265233ad798cSAdam Langley 			&req->rcv_wnd,
265333ad798cSAdam Langley 			&req->window_clamp,
265433ad798cSAdam Langley 			ireq->wscale_ok,
265531d12926Slaurent chavey 			&rcv_wscale,
265631d12926Slaurent chavey 			dst_metric(dst, RTAX_INITRWND));
265733ad798cSAdam Langley 		ireq->rcv_wscale = rcv_wscale;
265833ad798cSAdam Langley 	}
2659cfb6eeb4SYOSHIFUJI Hideaki 
266033ad798cSAdam Langley 	memset(&opts, 0, sizeof(opts));
26618b5f12d0SFlorian Westphal #ifdef CONFIG_SYN_COOKIES
26628b5f12d0SFlorian Westphal 	if (unlikely(req->cookie_ts))
26638b5f12d0SFlorian Westphal 		TCP_SKB_CB(skb)->when = cookie_init_timestamp(req);
26648b5f12d0SFlorian Westphal 	else
26658b5f12d0SFlorian Westphal #endif
266633ad798cSAdam Langley 	TCP_SKB_CB(skb)->when = tcp_time_stamp;
2667f5fff5dcSTom Quetchenbach 	tcp_header_size = tcp_synack_options(sk, req, mss,
26684957faadSWilliam Allen Simpson 					     skb, &opts, &md5, xvp)
26694957faadSWilliam Allen Simpson 			+ sizeof(*th);
267033ad798cSAdam Langley 
2671aa8223c7SArnaldo Carvalho de Melo 	skb_push(skb, tcp_header_size);
2672aa8223c7SArnaldo Carvalho de Melo 	skb_reset_transport_header(skb);
26731da177e4SLinus Torvalds 
2674aa8223c7SArnaldo Carvalho de Melo 	th = tcp_hdr(skb);
26751da177e4SLinus Torvalds 	memset(th, 0, sizeof(struct tcphdr));
26761da177e4SLinus Torvalds 	th->syn = 1;
26771da177e4SLinus Torvalds 	th->ack = 1;
26781da177e4SLinus Torvalds 	TCP_ECN_make_synack(req, th);
2679a3116ac5SKOVACS Krisztian 	th->source = ireq->loc_port;
26802e6599cbSArnaldo Carvalho de Melo 	th->dest = ireq->rmt_port;
2681e870a8efSIlpo Järvinen 	/* Setting of flags is superfluous here for callers (and ECE is
2682e870a8efSIlpo Järvinen 	 * not even correctly set)
2683e870a8efSIlpo Järvinen 	 */
2684e870a8efSIlpo Järvinen 	tcp_init_nondata_skb(skb, tcp_rsk(req)->snt_isn,
2685a3433f35SChangli Gao 			     TCPHDR_SYN | TCPHDR_ACK);
26864957faadSWilliam Allen Simpson 
26874957faadSWilliam Allen Simpson 	if (OPTION_COOKIE_EXTENSION & opts.options) {
268828b2774aSEric Dumazet 		if (s_data_desired) {
268928b2774aSEric Dumazet 			u8 *buf = skb_put(skb, s_data_desired);
26904957faadSWilliam Allen Simpson 
26914957faadSWilliam Allen Simpson 			/* copy data directly from the listening socket. */
269228b2774aSEric Dumazet 			memcpy(buf, cvp->s_data_payload, s_data_desired);
269328b2774aSEric Dumazet 			TCP_SKB_CB(skb)->end_seq += s_data_desired;
26944957faadSWilliam Allen Simpson 		}
26954957faadSWilliam Allen Simpson 
26964957faadSWilliam Allen Simpson 		if (opts.hash_size > 0) {
26974957faadSWilliam Allen Simpson 			__u32 workspace[SHA_WORKSPACE_WORDS];
26984957faadSWilliam Allen Simpson 			u32 *mess = &xvp->cookie_bakery[COOKIE_DIGEST_WORDS];
26994957faadSWilliam Allen Simpson 			u32 *tail = &mess[COOKIE_MESSAGE_WORDS-1];
27004957faadSWilliam Allen Simpson 
27014957faadSWilliam Allen Simpson 			/* Secret recipe depends on the Timestamp, (future)
27024957faadSWilliam Allen Simpson 			 * Sequence and Acknowledgment Numbers, Initiator
27034957faadSWilliam Allen Simpson 			 * Cookie, and others handled by IP variant caller.
27044957faadSWilliam Allen Simpson 			 */
27054957faadSWilliam Allen Simpson 			*tail-- ^= opts.tsval;
27064957faadSWilliam Allen Simpson 			*tail-- ^= tcp_rsk(req)->rcv_isn + 1;
27074957faadSWilliam Allen Simpson 			*tail-- ^= TCP_SKB_CB(skb)->seq + 1;
27084957faadSWilliam Allen Simpson 
27094957faadSWilliam Allen Simpson 			/* recommended */
27100eae88f3SEric Dumazet 			*tail-- ^= (((__force u32)th->dest << 16) | (__force u32)th->source);
2711f9a2e69eSDavid S. Miller 			*tail-- ^= (u32)(unsigned long)cvp; /* per sockopt */
27124957faadSWilliam Allen Simpson 
27134957faadSWilliam Allen Simpson 			sha_transform((__u32 *)&xvp->cookie_bakery[0],
27144957faadSWilliam Allen Simpson 				      (char *)mess,
27154957faadSWilliam Allen Simpson 				      &workspace[0]);
27164957faadSWilliam Allen Simpson 			opts.hash_location =
27174957faadSWilliam Allen Simpson 				(__u8 *)&xvp->cookie_bakery[0];
27184957faadSWilliam Allen Simpson 		}
27194957faadSWilliam Allen Simpson 	}
27204957faadSWilliam Allen Simpson 
27211da177e4SLinus Torvalds 	th->seq = htonl(TCP_SKB_CB(skb)->seq);
27222e6599cbSArnaldo Carvalho de Melo 	th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1);
27231da177e4SLinus Torvalds 
27241da177e4SLinus Torvalds 	/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
2725600ff0c2SIlpo Järvinen 	th->window = htons(min(req->rcv_wnd, 65535U));
2726bd0388aeSWilliam Allen Simpson 	tcp_options_write((__be32 *)(th + 1), tp, &opts);
27271da177e4SLinus Torvalds 	th->doff = (tcp_header_size >> 2);
2728aa2ea058STom Herbert 	TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb));
2729cfb6eeb4SYOSHIFUJI Hideaki 
2730cfb6eeb4SYOSHIFUJI Hideaki #ifdef CONFIG_TCP_MD5SIG
2731cfb6eeb4SYOSHIFUJI Hideaki 	/* Okay, we have all we need - do the md5 hash if needed */
2732cfb6eeb4SYOSHIFUJI Hideaki 	if (md5) {
2733bd0388aeSWilliam Allen Simpson 		tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location,
273449a72dfbSAdam Langley 					       md5, NULL, req, skb);
2735cfb6eeb4SYOSHIFUJI Hideaki 	}
2736cfb6eeb4SYOSHIFUJI Hideaki #endif
2737cfb6eeb4SYOSHIFUJI Hideaki 
27381da177e4SLinus Torvalds 	return skb;
27391da177e4SLinus Torvalds }
27404bc2f18bSEric Dumazet EXPORT_SYMBOL(tcp_make_synack);
27411da177e4SLinus Torvalds 
274267edfef7SAndi Kleen /* Do all connect socket setups that can be done AF independent. */
2743370816aeSPavel Emelyanov void tcp_connect_init(struct sock *sk)
27441da177e4SLinus Torvalds {
2745cf533ea5SEric Dumazet 	const struct dst_entry *dst = __sk_dst_get(sk);
27461da177e4SLinus Torvalds 	struct tcp_sock *tp = tcp_sk(sk);
27471da177e4SLinus Torvalds 	__u8 rcv_wscale;
27481da177e4SLinus Torvalds 
27491da177e4SLinus Torvalds 	/* We'll fix this up when we get a response from the other end.
27501da177e4SLinus Torvalds 	 * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
27511da177e4SLinus Torvalds 	 */
27521da177e4SLinus Torvalds 	tp->tcp_header_len = sizeof(struct tcphdr) +
2753bb5b7c11SDavid S. Miller 		(sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);
27541da177e4SLinus Torvalds 
2755cfb6eeb4SYOSHIFUJI Hideaki #ifdef CONFIG_TCP_MD5SIG
2756cfb6eeb4SYOSHIFUJI Hideaki 	if (tp->af_specific->md5_lookup(sk, sk) != NULL)
2757cfb6eeb4SYOSHIFUJI Hideaki 		tp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
2758cfb6eeb4SYOSHIFUJI Hideaki #endif
2759cfb6eeb4SYOSHIFUJI Hideaki 
27601da177e4SLinus Torvalds 	/* If user gave his TCP_MAXSEG, record it to clamp */
27611da177e4SLinus Torvalds 	if (tp->rx_opt.user_mss)
27621da177e4SLinus Torvalds 		tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
27631da177e4SLinus Torvalds 	tp->max_window = 0;
27645d424d5aSJohn Heffner 	tcp_mtup_init(sk);
27651da177e4SLinus Torvalds 	tcp_sync_mss(sk, dst_mtu(dst));
27661da177e4SLinus Torvalds 
27671da177e4SLinus Torvalds 	if (!tp->window_clamp)
27681da177e4SLinus Torvalds 		tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
27690dbaee3bSDavid S. Miller 	tp->advmss = dst_metric_advmss(dst);
2770f5fff5dcSTom Quetchenbach 	if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->advmss)
2771f5fff5dcSTom Quetchenbach 		tp->advmss = tp->rx_opt.user_mss;
2772f5fff5dcSTom Quetchenbach 
27731da177e4SLinus Torvalds 	tcp_initialize_rcv_mss(sk);
27741da177e4SLinus Torvalds 
2775e88c64f0SHagen Paul Pfeifer 	/* limit the window selection if the user enforce a smaller rx buffer */
2776e88c64f0SHagen Paul Pfeifer 	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
2777e88c64f0SHagen Paul Pfeifer 	    (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0))
2778e88c64f0SHagen Paul Pfeifer 		tp->window_clamp = tcp_full_space(sk);
2779e88c64f0SHagen Paul Pfeifer 
27801da177e4SLinus Torvalds 	tcp_select_initial_window(tcp_full_space(sk),
27811da177e4SLinus Torvalds 				  tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
27821da177e4SLinus Torvalds 				  &tp->rcv_wnd,
27831da177e4SLinus Torvalds 				  &tp->window_clamp,
2784bb5b7c11SDavid S. Miller 				  sysctl_tcp_window_scaling,
278531d12926Slaurent chavey 				  &rcv_wscale,
278631d12926Slaurent chavey 				  dst_metric(dst, RTAX_INITRWND));
27871da177e4SLinus Torvalds 
27881da177e4SLinus Torvalds 	tp->rx_opt.rcv_wscale = rcv_wscale;
27891da177e4SLinus Torvalds 	tp->rcv_ssthresh = tp->rcv_wnd;
27901da177e4SLinus Torvalds 
27911da177e4SLinus Torvalds 	sk->sk_err = 0;
27921da177e4SLinus Torvalds 	sock_reset_flag(sk, SOCK_DONE);
27931da177e4SLinus Torvalds 	tp->snd_wnd = 0;
2794ee7537b6SHantzis Fotis 	tcp_init_wl(tp, 0);
27951da177e4SLinus Torvalds 	tp->snd_una = tp->write_seq;
27961da177e4SLinus Torvalds 	tp->snd_sml = tp->write_seq;
279733f5f57eSIlpo Järvinen 	tp->snd_up = tp->write_seq;
2798370816aeSPavel Emelyanov 	tp->snd_nxt = tp->write_seq;
2799ee995283SPavel Emelyanov 
2800ee995283SPavel Emelyanov 	if (likely(!tp->repair))
28011da177e4SLinus Torvalds 		tp->rcv_nxt = 0;
2802ee995283SPavel Emelyanov 	tp->rcv_wup = tp->rcv_nxt;
2803ee995283SPavel Emelyanov 	tp->copied_seq = tp->rcv_nxt;
28041da177e4SLinus Torvalds 
2805463c84b9SArnaldo Carvalho de Melo 	inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
2806463c84b9SArnaldo Carvalho de Melo 	inet_csk(sk)->icsk_retransmits = 0;
28071da177e4SLinus Torvalds 	tcp_clear_retrans(tp);
28081da177e4SLinus Torvalds }
28091da177e4SLinus Torvalds 
281067edfef7SAndi Kleen /* Build a SYN and send it off. */
28111da177e4SLinus Torvalds int tcp_connect(struct sock *sk)
28121da177e4SLinus Torvalds {
28131da177e4SLinus Torvalds 	struct tcp_sock *tp = tcp_sk(sk);
28141da177e4SLinus Torvalds 	struct sk_buff *buff;
2815ee586811SEric Paris 	int err;
28161da177e4SLinus Torvalds 
28171da177e4SLinus Torvalds 	tcp_connect_init(sk);
28181da177e4SLinus Torvalds 
2819d179cd12SDavid S. Miller 	buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation);
28201da177e4SLinus Torvalds 	if (unlikely(buff == NULL))
28211da177e4SLinus Torvalds 		return -ENOBUFS;
28221da177e4SLinus Torvalds 
28231da177e4SLinus Torvalds 	/* Reserve space for headers. */
28241da177e4SLinus Torvalds 	skb_reserve(buff, MAX_TCP_HEADER);
28251da177e4SLinus Torvalds 
2826a3433f35SChangli Gao 	tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
2827e870a8efSIlpo Järvinen 	TCP_ECN_send_syn(sk, buff);
28281da177e4SLinus Torvalds 
28291da177e4SLinus Torvalds 	/* Send it off. */
28301da177e4SLinus Torvalds 	TCP_SKB_CB(buff)->when = tcp_time_stamp;
28311da177e4SLinus Torvalds 	tp->retrans_stamp = TCP_SKB_CB(buff)->when;
28321da177e4SLinus Torvalds 	skb_header_release(buff);
2833fe067e8aSDavid S. Miller 	__tcp_add_write_queue_tail(sk, buff);
28343ab224beSHideo Aoki 	sk->sk_wmem_queued += buff->truesize;
28353ab224beSHideo Aoki 	sk_mem_charge(sk, buff->truesize);
28361da177e4SLinus Torvalds 	tp->packets_out += tcp_skb_pcount(buff);
2837ee586811SEric Paris 	err = tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
2838ee586811SEric Paris 	if (err == -ECONNREFUSED)
2839ee586811SEric Paris 		return err;
2840bd37a088SWei Yongjun 
2841bd37a088SWei Yongjun 	/* We change tp->snd_nxt after the tcp_transmit_skb() call
2842bd37a088SWei Yongjun 	 * in order to make this packet get counted in tcpOutSegs.
2843bd37a088SWei Yongjun 	 */
2844bd37a088SWei Yongjun 	tp->snd_nxt = tp->write_seq;
2845bd37a088SWei Yongjun 	tp->pushed_seq = tp->write_seq;
284681cc8a75SPavel Emelyanov 	TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS);
28471da177e4SLinus Torvalds 
28481da177e4SLinus Torvalds 	/* Timer for repeating the SYN until an answer. */
28493f421baaSArnaldo Carvalho de Melo 	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
28503f421baaSArnaldo Carvalho de Melo 				  inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
28511da177e4SLinus Torvalds 	return 0;
28521da177e4SLinus Torvalds }
28534bc2f18bSEric Dumazet EXPORT_SYMBOL(tcp_connect);
28541da177e4SLinus Torvalds 
28551da177e4SLinus Torvalds /* Send out a delayed ack; the caller does the policy checking
28561da177e4SLinus Torvalds  * to see if we should even be here.  See tcp_input.c:tcp_ack_snd_check()
28571da177e4SLinus Torvalds  * for details.
28581da177e4SLinus Torvalds  */
28591da177e4SLinus Torvalds void tcp_send_delayed_ack(struct sock *sk)
28601da177e4SLinus Torvalds {
2861463c84b9SArnaldo Carvalho de Melo 	struct inet_connection_sock *icsk = inet_csk(sk);
2862463c84b9SArnaldo Carvalho de Melo 	int ato = icsk->icsk_ack.ato;
28631da177e4SLinus Torvalds 	unsigned long timeout;
28641da177e4SLinus Torvalds 
28651da177e4SLinus Torvalds 	if (ato > TCP_DELACK_MIN) {
2866463c84b9SArnaldo Carvalho de Melo 		const struct tcp_sock *tp = tcp_sk(sk);
28671da177e4SLinus Torvalds 		int max_ato = HZ / 2;
28681da177e4SLinus Torvalds 
2869056834d9SIlpo Järvinen 		if (icsk->icsk_ack.pingpong ||
2870056834d9SIlpo Järvinen 		    (icsk->icsk_ack.pending & ICSK_ACK_PUSHED))
28711da177e4SLinus Torvalds 			max_ato = TCP_DELACK_MAX;
28721da177e4SLinus Torvalds 
28731da177e4SLinus Torvalds 		/* Slow path, intersegment interval is "high". */
28741da177e4SLinus Torvalds 
28751da177e4SLinus Torvalds 		/* If some rtt estimate is known, use it to bound delayed ack.
2876463c84b9SArnaldo Carvalho de Melo 		 * Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements
28771da177e4SLinus Torvalds 		 * directly.
28781da177e4SLinus Torvalds 		 */
28791da177e4SLinus Torvalds 		if (tp->srtt) {
28801da177e4SLinus Torvalds 			int rtt = max(tp->srtt >> 3, TCP_DELACK_MIN);
28811da177e4SLinus Torvalds 
28821da177e4SLinus Torvalds 			if (rtt < max_ato)
28831da177e4SLinus Torvalds 				max_ato = rtt;
28841da177e4SLinus Torvalds 		}
28851da177e4SLinus Torvalds 
28861da177e4SLinus Torvalds 		ato = min(ato, max_ato);
28871da177e4SLinus Torvalds 	}
28881da177e4SLinus Torvalds 
28891da177e4SLinus Torvalds 	/* Stay within the limit we were given */
28901da177e4SLinus Torvalds 	timeout = jiffies + ato;
28911da177e4SLinus Torvalds 
28921da177e4SLinus Torvalds 	/* Use new timeout only if there wasn't an older one earlier. */
2893463c84b9SArnaldo Carvalho de Melo 	if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) {
28941da177e4SLinus Torvalds 		/* If delack timer was blocked or is about to expire,
28951da177e4SLinus Torvalds 		 * send ACK now.
28961da177e4SLinus Torvalds 		 */
2897463c84b9SArnaldo Carvalho de Melo 		if (icsk->icsk_ack.blocked ||
2898463c84b9SArnaldo Carvalho de Melo 		    time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) {
28991da177e4SLinus Torvalds 			tcp_send_ack(sk);
29001da177e4SLinus Torvalds 			return;
29011da177e4SLinus Torvalds 		}
29021da177e4SLinus Torvalds 
2903463c84b9SArnaldo Carvalho de Melo 		if (!time_before(timeout, icsk->icsk_ack.timeout))
2904463c84b9SArnaldo Carvalho de Melo 			timeout = icsk->icsk_ack.timeout;
29051da177e4SLinus Torvalds 	}
2906463c84b9SArnaldo Carvalho de Melo 	icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER;
2907463c84b9SArnaldo Carvalho de Melo 	icsk->icsk_ack.timeout = timeout;
2908463c84b9SArnaldo Carvalho de Melo 	sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
29091da177e4SLinus Torvalds }
29101da177e4SLinus Torvalds 
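/* Illustrative sketch, not part of this file: the clamping performed in
 * tcp_send_delayed_ack() above, assuming the smoothed RTT is already in
 * jiffies (the real code keeps tp->srtt shifted left by 3).  The ack timeout
 * starts from the ato estimate, is capped at HZ/2 (or TCP_DELACK_MAX in
 * pingpong mode), and is further bounded by the RTT when one is known, with
 * the RTT floored at TCP_DELACK_MIN: e.g. an ato of 200 ms with a 40 ms RTT
 * is clamped to 40 ms.  The helper name and the numbers are hypothetical.
 */
static inline unsigned long example_clamp_ato(unsigned long ato,
					      unsigned long srtt_jiffies)
{
	unsigned long max_ato = HZ / 2;

	if (srtt_jiffies) {
		unsigned long rtt = max(srtt_jiffies,
					(unsigned long)TCP_DELACK_MIN);

		if (rtt < max_ato)
			max_ato = rtt;
	}
	return min(ato, max_ato);
}
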
29111da177e4SLinus Torvalds /* This routine sends an ack and also updates the window. */
29121da177e4SLinus Torvalds void tcp_send_ack(struct sock *sk)
29131da177e4SLinus Torvalds {
29141da177e4SLinus Torvalds 	struct sk_buff *buff;
29151da177e4SLinus Torvalds 
2916058dc334SIlpo Järvinen 	/* If we have been reset, we may not send again. */
2917058dc334SIlpo Järvinen 	if (sk->sk_state == TCP_CLOSE)
2918058dc334SIlpo Järvinen 		return;
2919058dc334SIlpo Järvinen 
29201da177e4SLinus Torvalds 	/* We are not putting this on the write queue, so
29211da177e4SLinus Torvalds 	 * tcp_transmit_skb() will set the ownership to this
29221da177e4SLinus Torvalds 	 * sock.
29231da177e4SLinus Torvalds 	 */
29241da177e4SLinus Torvalds 	buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
29251da177e4SLinus Torvalds 	if (buff == NULL) {
2926463c84b9SArnaldo Carvalho de Melo 		inet_csk_schedule_ack(sk);
2927463c84b9SArnaldo Carvalho de Melo 		inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
29283f421baaSArnaldo Carvalho de Melo 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
29293f421baaSArnaldo Carvalho de Melo 					  TCP_DELACK_MAX, TCP_RTO_MAX);
29301da177e4SLinus Torvalds 		return;
29311da177e4SLinus Torvalds 	}
29321da177e4SLinus Torvalds 
29331da177e4SLinus Torvalds 	/* Reserve space for headers and prepare control bits. */
29341da177e4SLinus Torvalds 	skb_reserve(buff, MAX_TCP_HEADER);
2935a3433f35SChangli Gao 	tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK);
29361da177e4SLinus Torvalds 
29371da177e4SLinus Torvalds 	/* Send it off, this clears delayed acks for us. */
29381da177e4SLinus Torvalds 	TCP_SKB_CB(buff)->when = tcp_time_stamp;
2939dfb4b9dcSDavid S. Miller 	tcp_transmit_skb(sk, buff, 0, GFP_ATOMIC);
29401da177e4SLinus Torvalds }
29411da177e4SLinus Torvalds 
29421da177e4SLinus Torvalds /* This routine sends a packet with an out of date sequence
29431da177e4SLinus Torvalds  * number. It assumes the other end will try to ack it.
29441da177e4SLinus Torvalds  *
29451da177e4SLinus Torvalds  * Question: what should we do in urgent mode?
29461da177e4SLinus Torvalds  * 4.4BSD forces sending a single byte of data. We cannot send
29471da177e4SLinus Torvalds  * out of window data, because we have SND.NXT==SND.MAX...
29481da177e4SLinus Torvalds  *
29491da177e4SLinus Torvalds  * Current solution: to send TWO zero-length segments in urgent mode:
29501da177e4SLinus Torvalds  * one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is
29511da177e4SLinus Torvalds  * out-of-date with SND.UNA-1 to probe window.
29521da177e4SLinus Torvalds  */
29531da177e4SLinus Torvalds static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
29541da177e4SLinus Torvalds {
29551da177e4SLinus Torvalds 	struct tcp_sock *tp = tcp_sk(sk);
29561da177e4SLinus Torvalds 	struct sk_buff *skb;
29571da177e4SLinus Torvalds 
29581da177e4SLinus Torvalds 	/* We don't queue it, tcp_transmit_skb() sets ownership. */
29591da177e4SLinus Torvalds 	skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
29601da177e4SLinus Torvalds 	if (skb == NULL)
29611da177e4SLinus Torvalds 		return -1;
29621da177e4SLinus Torvalds 
29631da177e4SLinus Torvalds 	/* Reserve space for headers and set control bits. */
29641da177e4SLinus Torvalds 	skb_reserve(skb, MAX_TCP_HEADER);
29651da177e4SLinus Torvalds 	/* Use a previous sequence.  This should cause the other
29661da177e4SLinus Torvalds 	 * end to send an ack.  Don't queue or clone SKB, just
29671da177e4SLinus Torvalds 	 * send it.
29681da177e4SLinus Torvalds 	 */
2969a3433f35SChangli Gao 	tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK);
29701da177e4SLinus Torvalds 	TCP_SKB_CB(skb)->when = tcp_time_stamp;
2971dfb4b9dcSDavid S. Miller 	return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC);
29721da177e4SLinus Torvalds }
29731da177e4SLinus Torvalds 
2974ee995283SPavel Emelyanov void tcp_send_window_probe(struct sock *sk)
2975ee995283SPavel Emelyanov {
2976ee995283SPavel Emelyanov 	if (sk->sk_state == TCP_ESTABLISHED) {
2977ee995283SPavel Emelyanov 		tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1;
2978c0e88ff0SPavel Emelyanov 		tcp_sk(sk)->snd_nxt = tcp_sk(sk)->write_seq;
2979ee995283SPavel Emelyanov 		tcp_xmit_probe_skb(sk, 0);
2980ee995283SPavel Emelyanov 	}
2981ee995283SPavel Emelyanov }
2982ee995283SPavel Emelyanov 
298367edfef7SAndi Kleen /* Initiate keepalive or window probe from timer. */
29841da177e4SLinus Torvalds int tcp_write_wakeup(struct sock *sk)
29851da177e4SLinus Torvalds {
29861da177e4SLinus Torvalds 	struct tcp_sock *tp = tcp_sk(sk);
29871da177e4SLinus Torvalds 	struct sk_buff *skb;
29881da177e4SLinus Torvalds 
2989058dc334SIlpo Järvinen 	if (sk->sk_state == TCP_CLOSE)
2990058dc334SIlpo Järvinen 		return -1;
2991058dc334SIlpo Järvinen 
2992fe067e8aSDavid S. Miller 	if ((skb = tcp_send_head(sk)) != NULL &&
299390840defSIlpo Järvinen 	    before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) {
29941da177e4SLinus Torvalds 		int err;
29950c54b85fSIlpo Järvinen 		unsigned int mss = tcp_current_mss(sk);
299690840defSIlpo Järvinen 		unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
29971da177e4SLinus Torvalds 
29981da177e4SLinus Torvalds 		if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
29991da177e4SLinus Torvalds 			tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;
30001da177e4SLinus Torvalds 
30011da177e4SLinus Torvalds 		/* We are probing the opening of a window,
30021da177e4SLinus Torvalds 		 * but the window size is != 0: this must have been
30031da177e4SLinus Torvalds 		 * the result of sender-side SWS avoidance.
30041da177e4SLinus Torvalds 		 */
30051da177e4SLinus Torvalds 		if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
30061da177e4SLinus Torvalds 		    skb->len > mss) {
30071da177e4SLinus Torvalds 			seg_size = min(seg_size, mss);
30084de075e0SEric Dumazet 			TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
3009846998aeSDavid S. Miller 			if (tcp_fragment(sk, skb, seg_size, mss))
30101da177e4SLinus Torvalds 				return -1;
30111da177e4SLinus Torvalds 		} else if (!tcp_skb_pcount(skb))
3012846998aeSDavid S. Miller 			tcp_set_skb_tso_segs(sk, skb, mss);
30131da177e4SLinus Torvalds 
30144de075e0SEric Dumazet 		TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
30151da177e4SLinus Torvalds 		TCP_SKB_CB(skb)->when = tcp_time_stamp;
3016dfb4b9dcSDavid S. Miller 		err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
301766f5fe62SIlpo Järvinen 		if (!err)
301866f5fe62SIlpo Järvinen 			tcp_event_new_data_sent(sk, skb);
30191da177e4SLinus Torvalds 		return err;
30201da177e4SLinus Torvalds 	} else {
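		/* Nothing sendable within the window: if the urgent pointer is
		 * still ahead of SND.UNA (within 64K), first send an urgent-mode
		 * probe to (re)deliver it, then send the normal window probe.
		 */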
302133f5f57eSIlpo Järvinen 		if (between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF))
30224828e7f4SIlpo Järvinen 			tcp_xmit_probe_skb(sk, 1);
30231da177e4SLinus Torvalds 		return tcp_xmit_probe_skb(sk, 0);
30241da177e4SLinus Torvalds 	}
30251da177e4SLinus Torvalds }
30261da177e4SLinus Torvalds 
30271da177e4SLinus Torvalds /* A window probe timeout has occurred.  If the window is not closed,
30281da177e4SLinus Torvalds  * send a partial packet, else a zero-window probe.
30291da177e4SLinus Torvalds  */
30301da177e4SLinus Torvalds void tcp_send_probe0(struct sock *sk)
30311da177e4SLinus Torvalds {
3032463c84b9SArnaldo Carvalho de Melo 	struct inet_connection_sock *icsk = inet_csk(sk);
30331da177e4SLinus Torvalds 	struct tcp_sock *tp = tcp_sk(sk);
30341da177e4SLinus Torvalds 	int err;
30351da177e4SLinus Torvalds 
30361da177e4SLinus Torvalds 	err = tcp_write_wakeup(sk);
30371da177e4SLinus Torvalds 
3038fe067e8aSDavid S. Miller 	if (tp->packets_out || !tcp_send_head(sk)) {
30391da177e4SLinus Torvalds 		/* Cancel probe timer, if it is not required. */
30406687e988SArnaldo Carvalho de Melo 		icsk->icsk_probes_out = 0;
3041463c84b9SArnaldo Carvalho de Melo 		icsk->icsk_backoff = 0;
30421da177e4SLinus Torvalds 		return;
30431da177e4SLinus Torvalds 	}
30441da177e4SLinus Torvalds 
30451da177e4SLinus Torvalds 	if (err <= 0) {
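		/* Probe sent (or failed outright): back off exponentially,
		 * stopping at sysctl_tcp_retries2 doublings, and rearm the
		 * probe timer at min(RTO << backoff, TCP_RTO_MAX).
		 */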
3046463c84b9SArnaldo Carvalho de Melo 		if (icsk->icsk_backoff < sysctl_tcp_retries2)
3047463c84b9SArnaldo Carvalho de Melo 			icsk->icsk_backoff++;
30486687e988SArnaldo Carvalho de Melo 		icsk->icsk_probes_out++;
3049463c84b9SArnaldo Carvalho de Melo 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
30503f421baaSArnaldo Carvalho de Melo 					  min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX),
30513f421baaSArnaldo Carvalho de Melo 					  TCP_RTO_MAX);
30521da177e4SLinus Torvalds 	} else {
30531da177e4SLinus Torvalds 		/* If the packet was not sent due to local congestion,
30546687e988SArnaldo Carvalho de Melo 		 * do not back off and do not remember icsk_probes_out.
30551da177e4SLinus Torvalds 		 * Let local senders fight for local resources.
30561da177e4SLinus Torvalds 		 *
30571da177e4SLinus Torvalds 		 * Still use the accumulated backoff, though.
30581da177e4SLinus Torvalds 		 */
30596687e988SArnaldo Carvalho de Melo 		if (!icsk->icsk_probes_out)
30606687e988SArnaldo Carvalho de Melo 			icsk->icsk_probes_out = 1;
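		/* Keep the accumulated backoff, but cap the delay at
		 * TCP_RESOURCE_PROBE_INTERVAL so the probe is retried soon
		 * after local resources free up.
		 */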
3061463c84b9SArnaldo Carvalho de Melo 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
3062463c84b9SArnaldo Carvalho de Melo 					  min(icsk->icsk_rto << icsk->icsk_backoff,
30633f421baaSArnaldo Carvalho de Melo 					      TCP_RESOURCE_PROBE_INTERVAL),
30643f421baaSArnaldo Carvalho de Melo 					  TCP_RTO_MAX);
30651da177e4SLinus Torvalds 	}
30661da177e4SLinus Torvalds }
3067