xref: /linux/net/ipv4/tcp_ipv4.c (revision d39d0ed196aa1685bb24771e92f78633c66ac9cb)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Implementation of the Transmission Control Protocol(TCP).
7  *
8  *		IPv4 specific functions
9  *
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  *
18  *	This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23 
24 /*
25  * Changes:
26  *		David S. Miller	:	New socket lookup architecture.
27  *					This code is dedicated to John Dyson.
28  *		David S. Miller :	Change semantics of established hash,
29  *					half is devoted to TIME_WAIT sockets
30  *					and the rest go in the other half.
31  *		Andi Kleen :		Add support for syncookies and fixed
32  *					some bugs: ip options weren't passed to
33  *					the TCP layer, missed a check for an
34  *					ACK bit.
35  *		Andi Kleen :		Implemented fast path mtu discovery.
36  *	     				Fixed many serious bugs in the
37  *					request_sock handling and moved
38  *					most of it into the af independent code.
39  *					Added tail drop and some other bugfixes.
40  *					Added new listen semantics.
41  *		Mike McLagan	:	Routing by source
42  *	Juan Jose Ciarlante:		ip_dynaddr bits
43  *		Andi Kleen:		various fixes.
44  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
45  *					coma.
46  *	Andi Kleen		:	Fix new listen.
47  *	Andi Kleen		:	Fix accept error reporting.
48  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
49  *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
50  *					a single port at the same time.
51  */
52 
53 
54 #include <linux/bottom_half.h>
55 #include <linux/types.h>
56 #include <linux/fcntl.h>
57 #include <linux/module.h>
58 #include <linux/random.h>
59 #include <linux/cache.h>
60 #include <linux/jhash.h>
61 #include <linux/init.h>
62 #include <linux/times.h>
63 #include <linux/slab.h>
64 
65 #include <net/net_namespace.h>
66 #include <net/icmp.h>
67 #include <net/inet_hashtables.h>
68 #include <net/tcp.h>
69 #include <net/transp_v6.h>
70 #include <net/ipv6.h>
71 #include <net/inet_common.h>
72 #include <net/timewait_sock.h>
73 #include <net/xfrm.h>
74 #include <net/netdma.h>
75 
76 #include <linux/inet.h>
77 #include <linux/ipv6.h>
78 #include <linux/stddef.h>
79 #include <linux/proc_fs.h>
80 #include <linux/seq_file.h>
81 
82 #include <linux/crypto.h>
83 #include <linux/scatterlist.h>
84 
85 int sysctl_tcp_tw_reuse __read_mostly;
86 int sysctl_tcp_low_latency __read_mostly;
87 EXPORT_SYMBOL(sysctl_tcp_low_latency);
88 
89 
90 #ifdef CONFIG_TCP_MD5SIG
91 static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
92 						   __be32 addr);
93 static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
94 			       __be32 daddr, __be32 saddr, struct tcphdr *th);
95 #else
96 static inline
97 struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
98 {
99 	return NULL;
100 }
101 #endif
102 
103 struct inet_hashinfo tcp_hashinfo;
104 EXPORT_SYMBOL(tcp_hashinfo);
105 
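/* Derive the initial sequence number for a new connection from the
 * address/port 4-tuple of the packet in hand (see
 * secure_tcp_sequence_number()).
 */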
106 static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
107 {
108 	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
109 					  ip_hdr(skb)->saddr,
110 					  tcp_hdr(skb)->dest,
111 					  tcp_hdr(skb)->source);
112 }
113 
114 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
115 {
116 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
117 	struct tcp_sock *tp = tcp_sk(sk);
118 
119 	/* With PAWS, it is safe from the viewpoint
120 	   of data integrity. Even without PAWS it is safe provided sequence
121 	   spaces do not overlap, i.e. at data rates <= 80 Mbit/sec.
122 
123 	   Actually, the idea is close to VJ's: only the timestamp cache is
124 	   held not per host but per port pair, and the TW bucket is used as
125 	   the state holder.
126 
127 	   If the TW bucket has already been destroyed, we fall back to VJ's
128 	   scheme and use the initial timestamp retrieved from the peer table.
129 	 */
130 	if (tcptw->tw_ts_recent_stamp &&
131 	    (twp == NULL || (sysctl_tcp_tw_reuse &&
132 			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
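		/* Presumably the "+ 65535 + 2" below is meant to step
		 * write_seq past anything the old connection could still
		 * have outstanding (a maximal unscaled window plus a
		 * little), so old duplicates cannot be confused with data
		 * from the new connection.
		 */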
133 		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
134 		if (tp->write_seq == 0)
135 			tp->write_seq = 1;
136 		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
137 		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
138 		sock_hold(sktw);
139 		return 1;
140 	}
141 
142 	return 0;
143 }
144 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
145 
146 /* This will initiate an outgoing connection. */
147 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
148 {
149 	struct inet_sock *inet = inet_sk(sk);
150 	struct tcp_sock *tp = tcp_sk(sk);
151 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
152 	struct rtable *rt;
153 	__be32 daddr, nexthop;
154 	int tmp;
155 	int err;
156 
157 	if (addr_len < sizeof(struct sockaddr_in))
158 		return -EINVAL;
159 
160 	if (usin->sin_family != AF_INET)
161 		return -EAFNOSUPPORT;
162 
163 	nexthop = daddr = usin->sin_addr.s_addr;
164 	if (inet->opt && inet->opt->srr) {
165 		if (!daddr)
166 			return -EINVAL;
167 		nexthop = inet->opt->faddr;
168 	}
169 
170 	tmp = ip_route_connect(&rt, nexthop, inet->inet_saddr,
171 			       RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
172 			       IPPROTO_TCP,
173 			       inet->inet_sport, usin->sin_port, sk, 1);
174 	if (tmp < 0) {
175 		if (tmp == -ENETUNREACH)
176 			IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
177 		return tmp;
178 	}
179 
180 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
181 		ip_rt_put(rt);
182 		return -ENETUNREACH;
183 	}
184 
185 	if (!inet->opt || !inet->opt->srr)
186 		daddr = rt->rt_dst;
187 
188 	if (!inet->inet_saddr)
189 		inet->inet_saddr = rt->rt_src;
190 	inet->inet_rcv_saddr = inet->inet_saddr;
191 
192 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
193 		/* Reset inherited state */
194 		tp->rx_opt.ts_recent	   = 0;
195 		tp->rx_opt.ts_recent_stamp = 0;
196 		tp->write_seq		   = 0;
197 	}
198 
199 	if (tcp_death_row.sysctl_tw_recycle &&
200 	    !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
201 		struct inet_peer *peer = rt_get_peer(rt);
202 		/*
203 		 * VJ's idea. We save the last timestamp seen from
204 		 * the destination in the peer table when entering
205 		 * TIME-WAIT state and initialize rx_opt.ts_recent from it
206 		 * when trying a new connection.
207 		 */
208 		if (peer) {
209 			inet_peer_refcheck(peer);
210 			if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
211 				tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
212 				tp->rx_opt.ts_recent = peer->tcp_ts;
213 			}
214 		}
215 	}
216 
217 	inet->inet_dport = usin->sin_port;
218 	inet->inet_daddr = daddr;
219 
220 	inet_csk(sk)->icsk_ext_hdr_len = 0;
221 	if (inet->opt)
222 		inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
223 
224 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
225 
226 	/* Socket identity is still unknown (sport may be zero).
227 	 * However, we set the state to SYN-SENT and, without releasing the socket
228 	 * lock, select a source port, enter ourselves into the hash tables and
229 	 * complete initialization after this.
230 	 */
231 	tcp_set_state(sk, TCP_SYN_SENT);
232 	err = inet_hash_connect(&tcp_death_row, sk);
233 	if (err)
234 		goto failure;
235 
236 	err = ip_route_newports(&rt, IPPROTO_TCP,
237 				inet->inet_sport, inet->inet_dport, sk);
238 	if (err)
239 		goto failure;
240 
241 	/* OK, now commit destination to socket.  */
242 	sk->sk_gso_type = SKB_GSO_TCPV4;
243 	sk_setup_caps(sk, &rt->dst);
244 
245 	if (!tp->write_seq)
246 		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
247 							   inet->inet_daddr,
248 							   inet->inet_sport,
249 							   usin->sin_port);
250 
251 	inet->inet_id = tp->write_seq ^ jiffies;
252 
253 	err = tcp_connect(sk);
254 	rt = NULL;
255 	if (err)
256 		goto failure;
257 
258 	return 0;
259 
260 failure:
261 	/*
262 	 * This unhashes the socket and releases the local port,
263 	 * if necessary.
264 	 */
265 	tcp_set_state(sk, TCP_CLOSE);
266 	ip_rt_put(rt);
267 	sk->sk_route_caps = 0;
268 	inet->inet_dport = 0;
269 	return err;
270 }
271 EXPORT_SYMBOL(tcp_v4_connect);
272 
273 /*
274  * This routine does path MTU discovery as defined in RFC 1191.
275  */
276 static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
277 {
278 	struct dst_entry *dst;
279 	struct inet_sock *inet = inet_sk(sk);
280 
281 	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
282 	 * sent out by Linux are always < 576 bytes, so they should go through
283 	 * unfragmented).
284 	 */
285 	if (sk->sk_state == TCP_LISTEN)
286 		return;
287 
288 	 * We don't check in the dst entry whether PMTU discovery is forbidden
289 	 * on this route. We just assume that no packet-too-big packets
290 	 * are sent back when PMTU discovery is not active.
291 	 * There is a small race when the user changes this flag in the
292 	 * route, but I think that's acceptable.
293 	 */
294 	if ((dst = __sk_dst_check(sk, 0)) == NULL)
295 		return;
296 
297 	dst->ops->update_pmtu(dst, mtu);
298 
299 	/* Something is about to go wrong... Remember the soft error
300 	 * in case this connection is not able to recover.
301 	 */
302 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
303 		sk->sk_err_soft = EMSGSIZE;
304 
305 	mtu = dst_mtu(dst);
306 
307 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
308 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
309 		tcp_sync_mss(sk, mtu);
310 
311 		/* Resend the TCP packet because it's
312 		 * clear that the old packet has been
313 		 * dropped. This is the new "fast" path mtu
314 		 * discovery.
315 		 */
316 		tcp_simple_retransmit(sk);
317 	} /* else let the usual retransmit timer handle it */
318 }
319 
320 /*
321  * This routine is called by the ICMP module when it gets some
322  * sort of error condition.  If err < 0 then the socket should
323  * be closed and the error returned to the user.  If err > 0
324  * it's just the icmp type << 8 | icmp code.  After adjustment
325  * header points to the first 8 bytes of the tcp header.  We need
326  * to find the appropriate port.
327  *
328  * The locking strategy used here is very "optimistic". When
329  * someone else accesses the socket, the ICMP is just dropped
330  * and for some paths there is no check at all.
331  * A more general error queue to queue errors for later handling
332  * is probably better.
333  *
334  */
335 
336 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
337 {
338 	struct iphdr *iph = (struct iphdr *)icmp_skb->data;
339 	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
340 	struct inet_connection_sock *icsk;
341 	struct tcp_sock *tp;
342 	struct inet_sock *inet;
343 	const int type = icmp_hdr(icmp_skb)->type;
344 	const int code = icmp_hdr(icmp_skb)->code;
345 	struct sock *sk;
346 	struct sk_buff *skb;
347 	__u32 seq;
348 	__u32 remaining;
349 	int err;
350 	struct net *net = dev_net(icmp_skb->dev);
351 
352 	if (icmp_skb->len < (iph->ihl << 2) + 8) {
353 		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
354 		return;
355 	}
356 
357 	sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
358 			iph->saddr, th->source, inet_iif(icmp_skb));
359 	if (!sk) {
360 		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
361 		return;
362 	}
363 	if (sk->sk_state == TCP_TIME_WAIT) {
364 		inet_twsk_put(inet_twsk(sk));
365 		return;
366 	}
367 
368 	bh_lock_sock(sk);
369 	/* If too many ICMPs get dropped on busy
370 	 * servers this needs to be solved differently.
371 	 */
372 	if (sock_owned_by_user(sk))
373 		NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
374 
375 	if (sk->sk_state == TCP_CLOSE)
376 		goto out;
377 
378 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
379 		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
380 		goto out;
381 	}
382 
383 	icsk = inet_csk(sk);
384 	tp = tcp_sk(sk);
385 	seq = ntohl(th->seq);
386 	if (sk->sk_state != TCP_LISTEN &&
387 	    !between(seq, tp->snd_una, tp->snd_nxt)) {
388 		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
389 		goto out;
390 	}
391 
392 	switch (type) {
393 	case ICMP_SOURCE_QUENCH:
394 		/* Just silently ignore these. */
395 		goto out;
396 	case ICMP_PARAMETERPROB:
397 		err = EPROTO;
398 		break;
399 	case ICMP_DEST_UNREACH:
400 		if (code > NR_ICMP_UNREACH)
401 			goto out;
402 
403 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
404 			if (!sock_owned_by_user(sk))
405 				do_pmtu_discovery(sk, iph, info);
406 			goto out;
407 		}
408 
409 		err = icmp_err_convert[code].errno;
410 		/* check if icmp_skb allows revert of backoff
411 		 * (see draft-zimmermann-tcp-lcd) */
412 		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
413 			break;
414 		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
415 		    !icsk->icsk_backoff)
416 			break;
417 
418 		icsk->icsk_backoff--;
419 		inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) <<
420 					 icsk->icsk_backoff;
421 		tcp_bound_rto(sk);
422 
423 		skb = tcp_write_queue_head(sk);
424 		BUG_ON(!skb);
425 
426 		remaining = icsk->icsk_rto - min(icsk->icsk_rto,
427 				tcp_time_stamp - TCP_SKB_CB(skb)->when);
428 
429 		if (remaining) {
430 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
431 						  remaining, TCP_RTO_MAX);
432 		} else if (sock_owned_by_user(sk)) {
433 			/* RTO revert clocked out retransmission,
434 			 * but socket is locked. Will defer. */
435 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
436 						  HZ/20, TCP_RTO_MAX);
437 		} else {
438 			/* RTO revert clocked out retransmission.
439 			 * Will retransmit now */
440 			tcp_retransmit_timer(sk);
441 		}
442 
443 		break;
444 	case ICMP_TIME_EXCEEDED:
445 		err = EHOSTUNREACH;
446 		break;
447 	default:
448 		goto out;
449 	}
450 
451 	switch (sk->sk_state) {
452 		struct request_sock *req, **prev;
453 	case TCP_LISTEN:
454 		if (sock_owned_by_user(sk))
455 			goto out;
456 
457 		req = inet_csk_search_req(sk, &prev, th->dest,
458 					  iph->daddr, iph->saddr);
459 		if (!req)
460 			goto out;
461 
462 		/* ICMPs are not backlogged, hence we cannot get
463 		   an established socket here.
464 		 */
465 		WARN_ON(req->sk);
466 
467 		if (seq != tcp_rsk(req)->snt_isn) {
468 			NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
469 			goto out;
470 		}
471 
472 		/*
473 		 * Still in SYN_RECV, just remove it silently.
474 		 * There is no good way to pass the error to the newly
475 		 * created socket, and POSIX does not want network
476 		 * errors returned from accept().
477 		 */
478 		inet_csk_reqsk_queue_drop(sk, req, prev);
479 		goto out;
480 
481 	case TCP_SYN_SENT:
482 	case TCP_SYN_RECV:  /* Cannot happen.
483 			       It can happen, e.g., if SYNs crossed.
484 			     */
485 		if (!sock_owned_by_user(sk)) {
486 			sk->sk_err = err;
487 
488 			sk->sk_error_report(sk);
489 
490 			tcp_done(sk);
491 		} else {
492 			sk->sk_err_soft = err;
493 		}
494 		goto out;
495 	}
496 
497 	/* If we've already connected we will keep trying
498 	 * until we time out, or the user gives up.
499 	 *
500 	 * RFC 1122 4.2.3.9 allows us to consider as hard errors
501 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
502 	 * but it is obsoleted by pmtu discovery).
503 	 *
504 	 * Note that in the modern internet, where routing is unreliable
505 	 * and broken firewalls sit in every dark corner sending random
506 	 * errors ordered by their masters, even these two messages finally lose
507 	 * their original sense (even Linux sends invalid PORT_UNREACHs).
508 	 *
509 	 * Now we are in compliance with RFCs.
510 	 *							--ANK (980905)
511 	 */
512 
513 	inet = inet_sk(sk);
514 	if (!sock_owned_by_user(sk) && inet->recverr) {
515 		sk->sk_err = err;
516 		sk->sk_error_report(sk);
517 	} else	{ /* Only an error on timeout */
518 		sk->sk_err_soft = err;
519 	}
520 
521 out:
522 	bh_unlock_sock(sk);
523 	sock_put(sk);
524 }
525 
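/* Fill in the TCP checksum of an outgoing segment.  With CHECKSUM_PARTIAL
 * only the pseudo-header sum is stored in the header and the device (or a
 * later software fallback) finishes the job; otherwise the full checksum
 * over header and payload is computed here.
 */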
526 static void __tcp_v4_send_check(struct sk_buff *skb,
527 				__be32 saddr, __be32 daddr)
528 {
529 	struct tcphdr *th = tcp_hdr(skb);
530 
531 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
532 		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
533 		skb->csum_start = skb_transport_header(skb) - skb->head;
534 		skb->csum_offset = offsetof(struct tcphdr, check);
535 	} else {
536 		th->check = tcp_v4_check(skb->len, saddr, daddr,
537 					 csum_partial(th,
538 						      th->doff << 2,
539 						      skb->csum));
540 	}
541 }
542 
543 /* This routine computes an IPv4 TCP checksum. */
544 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
545 {
546 	struct inet_sock *inet = inet_sk(sk);
547 
548 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
549 }
550 EXPORT_SYMBOL(tcp_v4_send_check);
551 
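/* Prepare the checksum of a GSO super-packet: prime the pseudo-header sum
 * and leave the skb in CHECKSUM_PARTIAL state so the checksum is completed
 * per segment later on.
 */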
552 int tcp_v4_gso_send_check(struct sk_buff *skb)
553 {
554 	const struct iphdr *iph;
555 	struct tcphdr *th;
556 
557 	if (!pskb_may_pull(skb, sizeof(*th)))
558 		return -EINVAL;
559 
560 	iph = ip_hdr(skb);
561 	th = tcp_hdr(skb);
562 
563 	th->check = 0;
564 	skb->ip_summed = CHECKSUM_PARTIAL;
565 	__tcp_v4_send_check(skb, iph->saddr, iph->daddr);
566 	return 0;
567 }
568 
569 /*
570  *	This routine will send an RST to the other tcp.
571  *
572  *	Someone asks: why do I NEVER use socket parameters (TOS, TTL, etc.)
573  *		      for the reset?
574  *	Answer: if a packet caused the RST, it is not for a socket
575  *		existing in our system; if it is matched to a socket,
576  *		it is just a duplicate segment or a bug in the other side's TCP.
577  *		So we build the reply based only on the parameters that
578  *		arrived with the segment.
579  *	Exception: precedence violation. We do not implement it in any case.
580  */
581 
582 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
583 {
584 	struct tcphdr *th = tcp_hdr(skb);
585 	struct {
586 		struct tcphdr th;
587 #ifdef CONFIG_TCP_MD5SIG
588 		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
589 #endif
590 	} rep;
591 	struct ip_reply_arg arg;
592 #ifdef CONFIG_TCP_MD5SIG
593 	struct tcp_md5sig_key *key;
594 #endif
595 	struct net *net;
596 
597 	/* Never send a reset in response to a reset. */
598 	if (th->rst)
599 		return;
600 
601 	if (skb_rtable(skb)->rt_type != RTN_LOCAL)
602 		return;
603 
604 	/* Swap the send and the receive. */
605 	memset(&rep, 0, sizeof(rep));
606 	rep.th.dest   = th->source;
607 	rep.th.source = th->dest;
608 	rep.th.doff   = sizeof(struct tcphdr) / 4;
609 	rep.th.rst    = 1;
610 
611 	if (th->ack) {
612 		rep.th.seq = th->ack_seq;
613 	} else {
614 		rep.th.ack = 1;
615 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
616 				       skb->len - (th->doff << 2));
617 	}
618 
619 	memset(&arg, 0, sizeof(arg));
620 	arg.iov[0].iov_base = (unsigned char *)&rep;
621 	arg.iov[0].iov_len  = sizeof(rep.th);
622 
623 #ifdef CONFIG_TCP_MD5SIG
624 	key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
625 	if (key) {
626 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
627 				   (TCPOPT_NOP << 16) |
628 				   (TCPOPT_MD5SIG << 8) |
629 				   TCPOLEN_MD5SIG);
630 		/* Update length and the length the header thinks exists */
631 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
632 		rep.th.doff = arg.iov[0].iov_len / 4;
633 
634 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
635 				     key, ip_hdr(skb)->saddr,
636 				     ip_hdr(skb)->daddr, &rep.th);
637 	}
638 #endif
639 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
640 				      ip_hdr(skb)->saddr, /* XXX */
641 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
642 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
643 	arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
644 
645 	net = dev_net(skb_dst(skb)->dev);
646 	ip_send_reply(net->ipv4.tcp_sock, skb,
647 		      &arg, arg.iov[0].iov_len);
648 
649 	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
650 	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
651 }
652 
653 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
654    outside of socket context, is ugly, certainly. What can I do?
655  */
656 
657 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
658 			    u32 win, u32 ts, int oif,
659 			    struct tcp_md5sig_key *key,
660 			    int reply_flags)
661 {
662 	struct tcphdr *th = tcp_hdr(skb);
663 	struct {
664 		struct tcphdr th;
665 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
666 #ifdef CONFIG_TCP_MD5SIG
667 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
668 #endif
669 			];
670 	} rep;
671 	struct ip_reply_arg arg;
672 	struct net *net = dev_net(skb_dst(skb)->dev);
673 
674 	memset(&rep.th, 0, sizeof(struct tcphdr));
675 	memset(&arg, 0, sizeof(arg));
676 
677 	arg.iov[0].iov_base = (unsigned char *)&rep;
678 	arg.iov[0].iov_len  = sizeof(rep.th);
679 	if (ts) {
680 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
681 				   (TCPOPT_TIMESTAMP << 8) |
682 				   TCPOLEN_TIMESTAMP);
683 		rep.opt[1] = htonl(tcp_time_stamp);
684 		rep.opt[2] = htonl(ts);
685 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
686 	}
687 
688 	/* Swap the send and the receive. */
689 	rep.th.dest    = th->source;
690 	rep.th.source  = th->dest;
691 	rep.th.doff    = arg.iov[0].iov_len / 4;
692 	rep.th.seq     = htonl(seq);
693 	rep.th.ack_seq = htonl(ack);
694 	rep.th.ack     = 1;
695 	rep.th.window  = htons(win);
696 
697 #ifdef CONFIG_TCP_MD5SIG
698 	if (key) {
699 		int offset = (ts) ? 3 : 0;
700 
701 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
702 					  (TCPOPT_NOP << 16) |
703 					  (TCPOPT_MD5SIG << 8) |
704 					  TCPOLEN_MD5SIG);
705 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
706 		rep.th.doff = arg.iov[0].iov_len/4;
707 
708 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
709 				    key, ip_hdr(skb)->saddr,
710 				    ip_hdr(skb)->daddr, &rep.th);
711 	}
712 #endif
713 	arg.flags = reply_flags;
714 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
715 				      ip_hdr(skb)->saddr, /* XXX */
716 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
717 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
718 	if (oif)
719 		arg.bound_dev_if = oif;
720 
721 	ip_send_reply(net->ipv4.tcp_sock, skb,
722 		      &arg, arg.iov[0].iov_len);
723 
724 	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
725 }
726 
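/* ACK a segment that arrived for a TIME-WAIT socket, using the window and
 * timestamp state cached in the timewait bucket instead of a full socket.
 */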
727 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
728 {
729 	struct inet_timewait_sock *tw = inet_twsk(sk);
730 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
731 
732 	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
733 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
734 			tcptw->tw_ts_recent,
735 			tw->tw_bound_dev_if,
736 			tcp_twsk_md5_key(tcptw),
737 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0
738 			);
739 
740 	inet_twsk_put(tw);
741 }
742 
743 static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
744 				  struct request_sock *req)
745 {
746 	tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
747 			tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
748 			req->ts_recent,
749 			0,
750 			tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr),
751 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0);
752 }
753 
754 /*
755  *	Send a SYN-ACK after having received a SYN.
756  *	This still operates on a request_sock only, not on a big
757  *	socket.
758  */
759 static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
760 			      struct request_sock *req,
761 			      struct request_values *rvp)
762 {
763 	const struct inet_request_sock *ireq = inet_rsk(req);
764 	int err = -1;
765 	struct sk_buff * skb;
766 
767 	/* First, grab a route. */
768 	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
769 		return -1;
770 
771 	skb = tcp_make_synack(sk, dst, req, rvp);
772 
773 	if (skb) {
774 		__tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
775 
776 		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
777 					    ireq->rmt_addr,
778 					    ireq->opt);
779 		err = net_xmit_eval(err);
780 	}
781 
782 	dst_release(dst);
783 	return err;
784 }
785 
786 static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
787 			      struct request_values *rvp)
788 {
789 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
790 	return tcp_v4_send_synack(sk, NULL, req, rvp);
791 }
792 
793 /*
794  *	IPv4 request_sock destructor.
795  */
796 static void tcp_v4_reqsk_destructor(struct request_sock *req)
797 {
798 	kfree(inet_rsk(req)->opt);
799 }
800 
801 static void syn_flood_warning(const struct sk_buff *skb)
802 {
803 	const char *msg;
804 
805 #ifdef CONFIG_SYN_COOKIES
806 	if (sysctl_tcp_syncookies)
807 		msg = "Sending cookies";
808 	else
809 #endif
810 		msg = "Dropping request";
811 
812 	pr_info("TCP: Possible SYN flooding on port %d. %s.\n",
813 				ntohs(tcp_hdr(skb)->dest), msg);
814 }
815 
816 /*
817  * Save and compile IPv4 options into the request_sock if needed.
818  */
819 static struct ip_options *tcp_v4_save_options(struct sock *sk,
820 					      struct sk_buff *skb)
821 {
822 	struct ip_options *opt = &(IPCB(skb)->opt);
823 	struct ip_options *dopt = NULL;
824 
825 	if (opt && opt->optlen) {
826 		int opt_size = optlength(opt);
827 		dopt = kmalloc(opt_size, GFP_ATOMIC);
828 		if (dopt) {
829 			if (ip_options_echo(dopt, skb)) {
830 				kfree(dopt);
831 				dopt = NULL;
832 			}
833 		}
834 	}
835 	return dopt;
836 }
837 
838 #ifdef CONFIG_TCP_MD5SIG
839 /*
840  * RFC2385 MD5 checksumming requires a mapping of
841  * IP address->MD5 Key.
842  * We need to maintain these in the sk structure.
843  */
844 
845 /* Find the Key structure for an address.  */
846 static struct tcp_md5sig_key *
847 			tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
848 {
849 	struct tcp_sock *tp = tcp_sk(sk);
850 	int i;
851 
852 	if (!tp->md5sig_info || !tp->md5sig_info->entries4)
853 		return NULL;
854 	for (i = 0; i < tp->md5sig_info->entries4; i++) {
855 		if (tp->md5sig_info->keys4[i].addr == addr)
856 			return &tp->md5sig_info->keys4[i].base;
857 	}
858 	return NULL;
859 }
860 
861 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
862 					 struct sock *addr_sk)
863 {
864 	return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr);
865 }
866 EXPORT_SYMBOL(tcp_v4_md5_lookup);
867 
868 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
869 						      struct request_sock *req)
870 {
871 	return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
872 }
873 
874 /* This can be called on a newly created socket, from other files */
875 int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
876 		      u8 *newkey, u8 newkeylen)
877 {
878 	/* Add Key to the list */
879 	struct tcp_md5sig_key *key;
880 	struct tcp_sock *tp = tcp_sk(sk);
881 	struct tcp4_md5sig_key *keys;
882 
883 	key = tcp_v4_md5_do_lookup(sk, addr);
884 	if (key) {
885 		/* Pre-existing entry - just update that one. */
886 		kfree(key->key);
887 		key->key = newkey;
888 		key->keylen = newkeylen;
889 	} else {
890 		struct tcp_md5sig_info *md5sig;
891 
892 		if (!tp->md5sig_info) {
893 			tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
894 						  GFP_ATOMIC);
895 			if (!tp->md5sig_info) {
896 				kfree(newkey);
897 				return -ENOMEM;
898 			}
899 			sk_nocaps_add(sk, NETIF_F_GSO_MASK);
900 		}
901 		if (tcp_alloc_md5sig_pool(sk) == NULL) {
902 			kfree(newkey);
903 			return -ENOMEM;
904 		}
905 		md5sig = tp->md5sig_info;
906 
907 		if (md5sig->alloced4 == md5sig->entries4) {
908 			keys = kmalloc((sizeof(*keys) *
909 					(md5sig->entries4 + 1)), GFP_ATOMIC);
910 			if (!keys) {
911 				kfree(newkey);
912 				tcp_free_md5sig_pool();
913 				return -ENOMEM;
914 			}
915 
916 			if (md5sig->entries4)
917 				memcpy(keys, md5sig->keys4,
918 				       sizeof(*keys) * md5sig->entries4);
919 
920 			/* Free old key list, and reference new one */
921 			kfree(md5sig->keys4);
922 			md5sig->keys4 = keys;
923 			md5sig->alloced4++;
924 		}
925 		md5sig->entries4++;
926 		md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
927 		md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
928 		md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
929 	}
930 	return 0;
931 }
932 EXPORT_SYMBOL(tcp_v4_md5_do_add);
933 
934 static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
935 			       u8 *newkey, u8 newkeylen)
936 {
937 	return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->inet_daddr,
938 				 newkey, newkeylen);
939 }
940 
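/* Remove the MD5 key configured for the given peer address, if any,
 * compacting the key array and releasing our hold on the MD5 pool.
 */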
941 int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
942 {
943 	struct tcp_sock *tp = tcp_sk(sk);
944 	int i;
945 
946 	for (i = 0; i < tp->md5sig_info->entries4; i++) {
947 		if (tp->md5sig_info->keys4[i].addr == addr) {
948 			/* Free the key */
949 			kfree(tp->md5sig_info->keys4[i].base.key);
950 			tp->md5sig_info->entries4--;
951 
952 			if (tp->md5sig_info->entries4 == 0) {
953 				kfree(tp->md5sig_info->keys4);
954 				tp->md5sig_info->keys4 = NULL;
955 				tp->md5sig_info->alloced4 = 0;
956 			} else if (tp->md5sig_info->entries4 != i) {
957 				/* Shift the remaining entries down to fill the hole */
958 				memmove(&tp->md5sig_info->keys4[i],
959 					&tp->md5sig_info->keys4[i+1],
960 					(tp->md5sig_info->entries4 - i) *
961 					 sizeof(struct tcp4_md5sig_key));
962 			}
963 			tcp_free_md5sig_pool();
964 			return 0;
965 		}
966 	}
967 	return -ENOENT;
968 }
969 EXPORT_SYMBOL(tcp_v4_md5_do_del);
970 
971 static void tcp_v4_clear_md5_list(struct sock *sk)
972 {
973 	struct tcp_sock *tp = tcp_sk(sk);
974 
975 	/* Free each key, then the set of keys,
976 	 * the crypto element, and then decrement our
977 	 * hold on the last resort crypto.
978 	 */
979 	if (tp->md5sig_info->entries4) {
980 		int i;
981 		for (i = 0; i < tp->md5sig_info->entries4; i++)
982 			kfree(tp->md5sig_info->keys4[i].base.key);
983 		tp->md5sig_info->entries4 = 0;
984 		tcp_free_md5sig_pool();
985 	}
986 	if (tp->md5sig_info->keys4) {
987 		kfree(tp->md5sig_info->keys4);
988 		tp->md5sig_info->keys4 = NULL;
989 		tp->md5sig_info->alloced4  = 0;
990 	}
991 }
992 
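/* Parse a TCP_MD5SIG setsockopt() request from userspace.  Roughly, and
 * purely as an illustrative sketch, a user would install a key with
 * something like:
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 4 };
 *	struct sockaddr_in *a = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	a->sin_family = AF_INET;
 *	a->sin_addr.s_addr = peer_addr;
 *	memcpy(md5.tcpm_key, "key1", 4);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * where fd and peer_addr are hypothetical names for the TCP socket and the
 * peer's IPv4 address.
 */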
993 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
994 				 int optlen)
995 {
996 	struct tcp_md5sig cmd;
997 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
998 	u8 *newkey;
999 
1000 	if (optlen < sizeof(cmd))
1001 		return -EINVAL;
1002 
1003 	if (copy_from_user(&cmd, optval, sizeof(cmd)))
1004 		return -EFAULT;
1005 
1006 	if (sin->sin_family != AF_INET)
1007 		return -EINVAL;
1008 
1009 	if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
1010 		if (!tcp_sk(sk)->md5sig_info)
1011 			return -ENOENT;
1012 		return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
1013 	}
1014 
1015 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1016 		return -EINVAL;
1017 
1018 	if (!tcp_sk(sk)->md5sig_info) {
1019 		struct tcp_sock *tp = tcp_sk(sk);
1020 		struct tcp_md5sig_info *p;
1021 
1022 		p = kzalloc(sizeof(*p), sk->sk_allocation);
1023 		if (!p)
1024 			return -EINVAL;
1025 
1026 		tp->md5sig_info = p;
1027 		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1028 	}
1029 
1030 	newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, sk->sk_allocation);
1031 	if (!newkey)
1032 		return -ENOMEM;
1033 	return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
1034 				 newkey, cmd.tcpm_keylen);
1035 }
1036 
1037 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1038 					__be32 daddr, __be32 saddr, int nbytes)
1039 {
1040 	struct tcp4_pseudohdr *bp;
1041 	struct scatterlist sg;
1042 
1043 	bp = &hp->md5_blk.ip4;
1044 
1045 	/*
1046 	 * 1. the TCP pseudo-header (in the order: source IP address,
1047 	 * destination IP address, zero-padded protocol number, and
1048 	 * segment length)
1049 	 */
1050 	bp->saddr = saddr;
1051 	bp->daddr = daddr;
1052 	bp->pad = 0;
1053 	bp->protocol = IPPROTO_TCP;
1054 	bp->len = cpu_to_be16(nbytes);
1055 
1056 	sg_init_one(&sg, bp, sizeof(*bp));
1057 	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1058 }
1059 
1060 static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
1061 			       __be32 daddr, __be32 saddr, struct tcphdr *th)
1062 {
1063 	struct tcp_md5sig_pool *hp;
1064 	struct hash_desc *desc;
1065 
1066 	hp = tcp_get_md5sig_pool();
1067 	if (!hp)
1068 		goto clear_hash_noput;
1069 	desc = &hp->md5_desc;
1070 
1071 	if (crypto_hash_init(desc))
1072 		goto clear_hash;
1073 	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1074 		goto clear_hash;
1075 	if (tcp_md5_hash_header(hp, th))
1076 		goto clear_hash;
1077 	if (tcp_md5_hash_key(hp, key))
1078 		goto clear_hash;
1079 	if (crypto_hash_final(desc, md5_hash))
1080 		goto clear_hash;
1081 
1082 	tcp_put_md5sig_pool();
1083 	return 0;
1084 
1085 clear_hash:
1086 	tcp_put_md5sig_pool();
1087 clear_hash_noput:
1088 	memset(md5_hash, 0, 16);
1089 	return 1;
1090 }
1091 
1092 int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1093 			struct sock *sk, struct request_sock *req,
1094 			struct sk_buff *skb)
1095 {
1096 	struct tcp_md5sig_pool *hp;
1097 	struct hash_desc *desc;
1098 	struct tcphdr *th = tcp_hdr(skb);
1099 	__be32 saddr, daddr;
1100 
1101 	if (sk) {
1102 		saddr = inet_sk(sk)->inet_saddr;
1103 		daddr = inet_sk(sk)->inet_daddr;
1104 	} else if (req) {
1105 		saddr = inet_rsk(req)->loc_addr;
1106 		daddr = inet_rsk(req)->rmt_addr;
1107 	} else {
1108 		const struct iphdr *iph = ip_hdr(skb);
1109 		saddr = iph->saddr;
1110 		daddr = iph->daddr;
1111 	}
1112 
1113 	hp = tcp_get_md5sig_pool();
1114 	if (!hp)
1115 		goto clear_hash_noput;
1116 	desc = &hp->md5_desc;
1117 
1118 	if (crypto_hash_init(desc))
1119 		goto clear_hash;
1120 
1121 	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1122 		goto clear_hash;
1123 	if (tcp_md5_hash_header(hp, th))
1124 		goto clear_hash;
1125 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1126 		goto clear_hash;
1127 	if (tcp_md5_hash_key(hp, key))
1128 		goto clear_hash;
1129 	if (crypto_hash_final(desc, md5_hash))
1130 		goto clear_hash;
1131 
1132 	tcp_put_md5sig_pool();
1133 	return 0;
1134 
1135 clear_hash:
1136 	tcp_put_md5sig_pool();
1137 clear_hash_noput:
1138 	memset(md5_hash, 0, 16);
1139 	return 1;
1140 }
1141 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1142 
1143 static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1144 {
1145 	/*
1146 	 * This gets called for each TCP segment that arrives
1147 	 * so we want to be efficient.
1148 	 * We have 3 drop cases:
1149 	 * o No MD5 hash and one expected.
1150 	 * o MD5 hash and we're not expecting one.
1151 	 * o MD5 hash and it's wrong.
1152 	 */
1153 	__u8 *hash_location = NULL;
1154 	struct tcp_md5sig_key *hash_expected;
1155 	const struct iphdr *iph = ip_hdr(skb);
1156 	struct tcphdr *th = tcp_hdr(skb);
1157 	int genhash;
1158 	unsigned char newhash[16];
1159 
1160 	hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1161 	hash_location = tcp_parse_md5sig_option(th);
1162 
1163 	/* We've parsed the options - do we have a hash? */
1164 	if (!hash_expected && !hash_location)
1165 		return 0;
1166 
1167 	if (hash_expected && !hash_location) {
1168 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1169 		return 1;
1170 	}
1171 
1172 	if (!hash_expected && hash_location) {
1173 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1174 		return 1;
1175 	}
1176 
1177 	/* Okay, we have both hash_expected and hash_location,
1178 	 * so we need to calculate the MD5 hash.
1179 	 */
1180 	genhash = tcp_v4_md5_hash_skb(newhash,
1181 				      hash_expected,
1182 				      NULL, NULL, skb);
1183 
1184 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1185 		if (net_ratelimit()) {
1186 			printk(KERN_INFO "MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1187 			       &iph->saddr, ntohs(th->source),
1188 			       &iph->daddr, ntohs(th->dest),
1189 			       genhash ? " tcp_v4_calc_md5_hash failed" : "");
1190 		}
1191 		return 1;
1192 	}
1193 	return 0;
1194 }
1195 
1196 #endif
1197 
1198 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1199 	.family		=	PF_INET,
1200 	.obj_size	=	sizeof(struct tcp_request_sock),
1201 	.rtx_syn_ack	=	tcp_v4_rtx_synack,
1202 	.send_ack	=	tcp_v4_reqsk_send_ack,
1203 	.destructor	=	tcp_v4_reqsk_destructor,
1204 	.send_reset	=	tcp_v4_send_reset,
1205 	.syn_ack_timeout = 	tcp_syn_ack_timeout,
1206 };
1207 
1208 #ifdef CONFIG_TCP_MD5SIG
1209 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1210 	.md5_lookup	=	tcp_v4_reqsk_md5_lookup,
1211 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1212 };
1213 #endif
1214 
1215 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1216 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
1217 	.twsk_unique	= tcp_twsk_unique,
1218 	.twsk_destructor= tcp_twsk_destructor,
1219 };
1220 
1221 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1222 {
1223 	struct tcp_extend_values tmp_ext;
1224 	struct tcp_options_received tmp_opt;
1225 	u8 *hash_location;
1226 	struct request_sock *req;
1227 	struct inet_request_sock *ireq;
1228 	struct tcp_sock *tp = tcp_sk(sk);
1229 	struct dst_entry *dst = NULL;
1230 	__be32 saddr = ip_hdr(skb)->saddr;
1231 	__be32 daddr = ip_hdr(skb)->daddr;
1232 	__u32 isn = TCP_SKB_CB(skb)->when;
1233 #ifdef CONFIG_SYN_COOKIES
1234 	int want_cookie = 0;
1235 #else
1236 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1237 #endif
1238 
1239 	/* Never answer SYNs sent to broadcast or multicast */
1240 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1241 		goto drop;
1242 
1243 	/* TW buckets are converted to open requests without
1244 	 * limitation: they conserve resources and the peer is
1245 	 * evidently a real one.
1246 	 */
1247 	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1248 		if (net_ratelimit())
1249 			syn_flood_warning(skb);
1250 #ifdef CONFIG_SYN_COOKIES
1251 		if (sysctl_tcp_syncookies) {
1252 			want_cookie = 1;
1253 		} else
1254 #endif
1255 		goto drop;
1256 	}
1257 
1258 	/* Accept backlog is full. If we have already queued enough
1259 	 * warm entries in the SYN queue, drop the request. It is better than
1260 	 * clogging the SYN queue with openreqs with exponentially increasing
1261 	 * timeouts.
1262 	 */
1263 	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1264 		goto drop;
1265 
1266 	req = inet_reqsk_alloc(&tcp_request_sock_ops);
1267 	if (!req)
1268 		goto drop;
1269 
1270 #ifdef CONFIG_TCP_MD5SIG
1271 	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1272 #endif
1273 
1274 	tcp_clear_options(&tmp_opt);
1275 	tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1276 	tmp_opt.user_mss  = tp->rx_opt.user_mss;
1277 	tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
1278 
1279 	if (tmp_opt.cookie_plus > 0 &&
1280 	    tmp_opt.saw_tstamp &&
1281 	    !tp->rx_opt.cookie_out_never &&
1282 	    (sysctl_tcp_cookie_size > 0 ||
1283 	     (tp->cookie_values != NULL &&
1284 	      tp->cookie_values->cookie_desired > 0))) {
1285 		u8 *c;
1286 		u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
1287 		int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;
1288 
1289 		if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
1290 			goto drop_and_release;
1291 
1292 		/* Secret recipe starts with IP addresses */
1293 		*mess++ ^= (__force u32)daddr;
1294 		*mess++ ^= (__force u32)saddr;
1295 
1296 		/* plus variable length Initiator Cookie */
1297 		c = (u8 *)mess;
1298 		while (l-- > 0)
1299 			*c++ ^= *hash_location++;
1300 
1301 #ifdef CONFIG_SYN_COOKIES
1302 		want_cookie = 0;	/* not our kind of cookie */
1303 #endif
1304 		tmp_ext.cookie_out_never = 0; /* false */
1305 		tmp_ext.cookie_plus = tmp_opt.cookie_plus;
1306 	} else if (!tp->rx_opt.cookie_in_always) {
1307 		/* redundant indications, but ensure initialization. */
1308 		tmp_ext.cookie_out_never = 1; /* true */
1309 		tmp_ext.cookie_plus = 0;
1310 	} else {
1311 		goto drop_and_release;
1312 	}
1313 	tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
1314 
1315 	if (want_cookie && !tmp_opt.saw_tstamp)
1316 		tcp_clear_options(&tmp_opt);
1317 
1318 	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1319 	tcp_openreq_init(req, &tmp_opt, skb);
1320 
1321 	ireq = inet_rsk(req);
1322 	ireq->loc_addr = daddr;
1323 	ireq->rmt_addr = saddr;
1324 	ireq->no_srccheck = inet_sk(sk)->transparent;
1325 	ireq->opt = tcp_v4_save_options(sk, skb);
1326 
1327 	if (security_inet_conn_request(sk, skb, req))
1328 		goto drop_and_free;
1329 
1330 	if (!want_cookie || tmp_opt.tstamp_ok)
1331 		TCP_ECN_create_request(req, tcp_hdr(skb));
1332 
1333 	if (want_cookie) {
1334 		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1335 		req->cookie_ts = tmp_opt.tstamp_ok;
1336 	} else if (!isn) {
1337 		struct inet_peer *peer = NULL;
1338 
1339 		/* VJ's idea. We save the last timestamp seen
1340 		 * from the destination in the peer table when entering
1341 		 * TIME-WAIT state, and check against it before
1342 		 * accepting a new connection request.
1343 		 *
1344 		 * If "isn" is not zero, this request hit an alive
1345 		 * timewait bucket, so all the necessary checks
1346 		 * are made in the function processing timewait state.
1347 		 */
1348 		if (tmp_opt.saw_tstamp &&
1349 		    tcp_death_row.sysctl_tw_recycle &&
1350 		    (dst = inet_csk_route_req(sk, req)) != NULL &&
1351 		    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1352 		    peer->v4daddr == saddr) {
1353 			inet_peer_refcheck(peer);
1354 			if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
1355 			    (s32)(peer->tcp_ts - req->ts_recent) >
1356 							TCP_PAWS_WINDOW) {
1357 				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1358 				goto drop_and_release;
1359 			}
1360 		}
1361 		/* Kill the following clause, if you dislike this way. */
1362 		else if (!sysctl_tcp_syncookies &&
1363 			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1364 			  (sysctl_max_syn_backlog >> 2)) &&
1365 			 (!peer || !peer->tcp_ts_stamp) &&
1366 			 (!dst || !dst_metric(dst, RTAX_RTT))) {
1367 			/* Without syncookies, the last quarter of the
1368 			 * backlog is filled with destinations
1369 			 * proven to be alive.
1370 			 * It means that we continue to communicate
1371 			 * with destinations already remembered
1372 			 * at the moment of the synflood.
1373 			 */
1374 			LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n",
1375 				       &saddr, ntohs(tcp_hdr(skb)->source));
1376 			goto drop_and_release;
1377 		}
1378 
1379 		isn = tcp_v4_init_sequence(skb);
1380 	}
1381 	tcp_rsk(req)->snt_isn = isn;
1382 
1383 	if (tcp_v4_send_synack(sk, dst, req,
1384 			       (struct request_values *)&tmp_ext) ||
1385 	    want_cookie)
1386 		goto drop_and_free;
1387 
1388 	inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1389 	return 0;
1390 
1391 drop_and_release:
1392 	dst_release(dst);
1393 drop_and_free:
1394 	reqsk_free(req);
1395 drop:
1396 	return 0;
1397 }
1398 EXPORT_SYMBOL(tcp_v4_conn_request);
1399 
1400 
1401 /*
1402  * The three way handshake has completed - we got a valid synack -
1403  * now create the new socket.
1404  */
1405 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1406 				  struct request_sock *req,
1407 				  struct dst_entry *dst)
1408 {
1409 	struct inet_request_sock *ireq;
1410 	struct inet_sock *newinet;
1411 	struct tcp_sock *newtp;
1412 	struct sock *newsk;
1413 #ifdef CONFIG_TCP_MD5SIG
1414 	struct tcp_md5sig_key *key;
1415 #endif
1416 
1417 	if (sk_acceptq_is_full(sk))
1418 		goto exit_overflow;
1419 
1420 	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1421 		goto exit;
1422 
1423 	newsk = tcp_create_openreq_child(sk, req, skb);
1424 	if (!newsk)
1425 		goto exit;
1426 
1427 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1428 	sk_setup_caps(newsk, dst);
1429 
1430 	newtp		      = tcp_sk(newsk);
1431 	newinet		      = inet_sk(newsk);
1432 	ireq		      = inet_rsk(req);
1433 	newinet->inet_daddr   = ireq->rmt_addr;
1434 	newinet->inet_rcv_saddr = ireq->loc_addr;
1435 	newinet->inet_saddr	      = ireq->loc_addr;
1436 	newinet->opt	      = ireq->opt;
1437 	ireq->opt	      = NULL;
1438 	newinet->mc_index     = inet_iif(skb);
1439 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1440 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1441 	if (newinet->opt)
1442 		inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1443 	newinet->inet_id = newtp->write_seq ^ jiffies;
1444 
1445 	tcp_mtup_init(newsk);
1446 	tcp_sync_mss(newsk, dst_mtu(dst));
1447 	newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1448 	if (tcp_sk(sk)->rx_opt.user_mss &&
1449 	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1450 		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1451 
1452 	tcp_initialize_rcv_mss(newsk);
1453 
1454 #ifdef CONFIG_TCP_MD5SIG
1455 	/* Copy over the MD5 key from the original socket */
1456 	key = tcp_v4_md5_do_lookup(sk, newinet->inet_daddr);
1457 	if (key != NULL) {
1458 		/*
1459 		 * We're using one, so create a matching key
1460 		 * on the newsk structure. If we fail to get
1461 		 * memory, then we end up not copying the key
1462 		 * across. Shucks.
1463 		 */
1464 		char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1465 		if (newkey != NULL)
1466 			tcp_v4_md5_do_add(newsk, newinet->inet_daddr,
1467 					  newkey, key->keylen);
1468 		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1469 	}
1470 #endif
1471 
1472 	__inet_hash_nolisten(newsk, NULL);
1473 	__inet_inherit_port(sk, newsk);
1474 
1475 	return newsk;
1476 
1477 exit_overflow:
1478 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1479 exit:
1480 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1481 	dst_release(dst);
1482 	return NULL;
1483 }
1484 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1485 
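/* For a segment arriving on a listening socket, pick the socket that should
 * handle it: a pending request_sock, an already established child, or the
 * listener itself (possibly after a syncookie check).
 */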
1486 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1487 {
1488 	struct tcphdr *th = tcp_hdr(skb);
1489 	const struct iphdr *iph = ip_hdr(skb);
1490 	struct sock *nsk;
1491 	struct request_sock **prev;
1492 	/* Find possible connection requests. */
1493 	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1494 						       iph->saddr, iph->daddr);
1495 	if (req)
1496 		return tcp_check_req(sk, skb, req, prev);
1497 
1498 	nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1499 			th->source, iph->daddr, th->dest, inet_iif(skb));
1500 
1501 	if (nsk) {
1502 		if (nsk->sk_state != TCP_TIME_WAIT) {
1503 			bh_lock_sock(nsk);
1504 			return nsk;
1505 		}
1506 		inet_twsk_put(inet_twsk(nsk));
1507 		return NULL;
1508 	}
1509 
1510 #ifdef CONFIG_SYN_COOKIES
1511 	if (!th->syn)
1512 		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1513 #endif
1514 	return sk;
1515 }
1516 
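/* Verify the checksum of an incoming segment where possible.  Short packets
 * (<= 76 bytes) are checked immediately; for longer ones only the
 * pseudo-header sum is primed here and full verification is deferred.
 */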
1517 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1518 {
1519 	const struct iphdr *iph = ip_hdr(skb);
1520 
1521 	if (skb->ip_summed == CHECKSUM_COMPLETE) {
1522 		if (!tcp_v4_check(skb->len, iph->saddr,
1523 				  iph->daddr, skb->csum)) {
1524 			skb->ip_summed = CHECKSUM_UNNECESSARY;
1525 			return 0;
1526 		}
1527 	}
1528 
1529 	skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1530 				       skb->len, IPPROTO_TCP, 0);
1531 
1532 	if (skb->len <= 76) {
1533 		return __skb_checksum_complete(skb);
1534 	}
1535 	return 0;
1536 }
1537 
1538 
1539 /* The socket must have its spinlock held when we get
1540  * here.
1541  *
1542  * We have a potential double-lock case here, so even when
1543  * doing backlog processing we use the BH locking scheme.
1544  * This is because we cannot sleep with the original spinlock
1545  * held.
1546  */
1547 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1548 {
1549 	struct sock *rsk;
1550 #ifdef CONFIG_TCP_MD5SIG
1551 	/*
1552 	 * We really want to reject the packet as early as possible
1553 	 * if:
1554 	 *  o We're expecting an MD5'd packet and there is no MD5 TCP option
1555 	 *  o There is an MD5 option and we're not expecting one
1556 	 */
1557 	if (tcp_v4_inbound_md5_hash(sk, skb))
1558 		goto discard;
1559 #endif
1560 
1561 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1562 		sock_rps_save_rxhash(sk, skb->rxhash);
1563 		TCP_CHECK_TIMER(sk);
1564 		if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1565 			rsk = sk;
1566 			goto reset;
1567 		}
1568 		TCP_CHECK_TIMER(sk);
1569 		return 0;
1570 	}
1571 
1572 	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1573 		goto csum_err;
1574 
1575 	if (sk->sk_state == TCP_LISTEN) {
1576 		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1577 		if (!nsk)
1578 			goto discard;
1579 
1580 		if (nsk != sk) {
1581 			if (tcp_child_process(sk, nsk, skb)) {
1582 				rsk = nsk;
1583 				goto reset;
1584 			}
1585 			return 0;
1586 		}
1587 	} else
1588 		sock_rps_save_rxhash(sk, skb->rxhash);
1589 
1590 
1591 	TCP_CHECK_TIMER(sk);
1592 	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1593 		rsk = sk;
1594 		goto reset;
1595 	}
1596 	TCP_CHECK_TIMER(sk);
1597 	return 0;
1598 
1599 reset:
1600 	tcp_v4_send_reset(rsk, skb);
1601 discard:
1602 	kfree_skb(skb);
1603 	/* Be careful here. If this function gets more complicated and
1604 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1605 	 * might be destroyed here. This current version compiles correctly,
1606 	 * but you have been warned.
1607 	 */
1608 	return 0;
1609 
1610 csum_err:
1611 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1612 	goto discard;
1613 }
1614 EXPORT_SYMBOL(tcp_v4_do_rcv);
1615 
1616 /*
1617  *	From tcp_input.c
1618  */
1619 
1620 int tcp_v4_rcv(struct sk_buff *skb)
1621 {
1622 	const struct iphdr *iph;
1623 	struct tcphdr *th;
1624 	struct sock *sk;
1625 	int ret;
1626 	struct net *net = dev_net(skb->dev);
1627 
1628 	if (skb->pkt_type != PACKET_HOST)
1629 		goto discard_it;
1630 
1631 	/* Count it even if it's bad */
1632 	TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1633 
1634 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1635 		goto discard_it;
1636 
1637 	th = tcp_hdr(skb);
1638 
1639 	if (th->doff < sizeof(struct tcphdr) / 4)
1640 		goto bad_packet;
1641 	if (!pskb_may_pull(skb, th->doff * 4))
1642 		goto discard_it;
1643 
1644 	/* An explanation is required here, I think.
1645 	 * Packet length and doff are validated by header prediction,
1646 	 * provided the case of th->doff == 0 is eliminated.
1647 	 * So, we defer the checks. */
1648 	if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1649 		goto bad_packet;
1650 
1651 	th = tcp_hdr(skb);
1652 	iph = ip_hdr(skb);
1653 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1654 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1655 				    skb->len - th->doff * 4);
1656 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1657 	TCP_SKB_CB(skb)->when	 = 0;
1658 	TCP_SKB_CB(skb)->flags	 = iph->tos;
1659 	TCP_SKB_CB(skb)->sacked	 = 0;
1660 
1661 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1662 	if (!sk)
1663 		goto no_tcp_socket;
1664 
1665 process:
1666 	if (sk->sk_state == TCP_TIME_WAIT)
1667 		goto do_time_wait;
1668 
1669 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1670 		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1671 		goto discard_and_relse;
1672 	}
1673 
1674 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1675 		goto discard_and_relse;
1676 	nf_reset(skb);
1677 
1678 	if (sk_filter(sk, skb))
1679 		goto discard_and_relse;
1680 
1681 	skb->dev = NULL;
1682 
1683 	bh_lock_sock_nested(sk);
1684 	ret = 0;
1685 	if (!sock_owned_by_user(sk)) {
1686 #ifdef CONFIG_NET_DMA
1687 		struct tcp_sock *tp = tcp_sk(sk);
1688 		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1689 			tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
1690 		if (tp->ucopy.dma_chan)
1691 			ret = tcp_v4_do_rcv(sk, skb);
1692 		else
1693 #endif
1694 		{
1695 			if (!tcp_prequeue(sk, skb))
1696 				ret = tcp_v4_do_rcv(sk, skb);
1697 		}
1698 	} else if (unlikely(sk_add_backlog(sk, skb))) {
1699 		bh_unlock_sock(sk);
1700 		NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1701 		goto discard_and_relse;
1702 	}
1703 	bh_unlock_sock(sk);
1704 
1705 	sock_put(sk);
1706 
1707 	return ret;
1708 
1709 no_tcp_socket:
1710 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1711 		goto discard_it;
1712 
1713 	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1714 bad_packet:
1715 		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1716 	} else {
1717 		tcp_v4_send_reset(NULL, skb);
1718 	}
1719 
1720 discard_it:
1721 	/* Discard frame. */
1722 	kfree_skb(skb);
1723 	return 0;
1724 
1725 discard_and_relse:
1726 	sock_put(sk);
1727 	goto discard_it;
1728 
1729 do_time_wait:
1730 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1731 		inet_twsk_put(inet_twsk(sk));
1732 		goto discard_it;
1733 	}
1734 
1735 	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1736 		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1737 		inet_twsk_put(inet_twsk(sk));
1738 		goto discard_it;
1739 	}
1740 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1741 	case TCP_TW_SYN: {
1742 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1743 							&tcp_hashinfo,
1744 							iph->daddr, th->dest,
1745 							inet_iif(skb));
1746 		if (sk2) {
1747 			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1748 			inet_twsk_put(inet_twsk(sk));
1749 			sk = sk2;
1750 			goto process;
1751 		}
1752 		/* Fall through to ACK */
1753 	}
1754 	case TCP_TW_ACK:
1755 		tcp_v4_timewait_ack(sk, skb);
1756 		break;
1757 	case TCP_TW_RST:
1758 		goto no_tcp_socket;
1759 	case TCP_TW_SUCCESS:;
1760 	}
1761 	goto discard_it;
1762 }
1763 
1764 /* VJ's idea. Save the last timestamp seen from this destination
1765  * and hold it at least for the normal timewait interval, to use for duplicate
1766  * segment detection in subsequent connections, before they enter synchronized
1767  * state.
1768  */
1769 
1770 int tcp_v4_remember_stamp(struct sock *sk)
1771 {
1772 	struct inet_sock *inet = inet_sk(sk);
1773 	struct tcp_sock *tp = tcp_sk(sk);
1774 	struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1775 	struct inet_peer *peer = NULL;
1776 	int release_it = 0;
1777 
1778 	if (!rt || rt->rt_dst != inet->inet_daddr) {
1779 		peer = inet_getpeer(inet->inet_daddr, 1);
1780 		release_it = 1;
1781 	} else {
1782 		if (!rt->peer)
1783 			rt_bind_peer(rt, 1);
1784 		peer = rt->peer;
1785 	}
1786 
1787 	if (peer) {
1788 		if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1789 		    ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
1790 		     peer->tcp_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
1791 			peer->tcp_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
1792 			peer->tcp_ts = tp->rx_opt.ts_recent;
1793 		}
1794 		if (release_it)
1795 			inet_putpeer(peer);
1796 		return 1;
1797 	}
1798 
1799 	return 0;
1800 }
1801 EXPORT_SYMBOL(tcp_v4_remember_stamp);
1802 
1803 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1804 {
1805 	struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1806 
1807 	if (peer) {
1808 		const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1809 
1810 		if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1811 		    ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
1812 		     peer->tcp_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
1813 			peer->tcp_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
1814 			peer->tcp_ts	   = tcptw->tw_ts_recent;
1815 		}
1816 		inet_putpeer(peer);
1817 		return 1;
1818 	}
1819 
1820 	return 0;
1821 }
1822 
1823 const struct inet_connection_sock_af_ops ipv4_specific = {
1824 	.queue_xmit	   = ip_queue_xmit,
1825 	.send_check	   = tcp_v4_send_check,
1826 	.rebuild_header	   = inet_sk_rebuild_header,
1827 	.conn_request	   = tcp_v4_conn_request,
1828 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
1829 	.remember_stamp	   = tcp_v4_remember_stamp,
1830 	.net_header_len	   = sizeof(struct iphdr),
1831 	.setsockopt	   = ip_setsockopt,
1832 	.getsockopt	   = ip_getsockopt,
1833 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
1834 	.sockaddr_len	   = sizeof(struct sockaddr_in),
1835 	.bind_conflict	   = inet_csk_bind_conflict,
1836 #ifdef CONFIG_COMPAT
1837 	.compat_setsockopt = compat_ip_setsockopt,
1838 	.compat_getsockopt = compat_ip_getsockopt,
1839 #endif
1840 };
1841 EXPORT_SYMBOL(ipv4_specific);
1842 
1843 #ifdef CONFIG_TCP_MD5SIG
1844 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1845 	.md5_lookup		= tcp_v4_md5_lookup,
1846 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
1847 	.md5_add		= tcp_v4_md5_add_func,
1848 	.md5_parse		= tcp_v4_parse_md5_keys,
1849 };
1850 #endif
1851 
1852 /* NOTE: A lot of things are set to zero explicitly by the call to
1853  *       sk_alloc(), so they need not be done here.
1854  */
1855 static int tcp_v4_init_sock(struct sock *sk)
1856 {
1857 	struct inet_connection_sock *icsk = inet_csk(sk);
1858 	struct tcp_sock *tp = tcp_sk(sk);
1859 
1860 	skb_queue_head_init(&tp->out_of_order_queue);
1861 	tcp_init_xmit_timers(sk);
1862 	tcp_prequeue_init(tp);
1863 
1864 	icsk->icsk_rto = TCP_TIMEOUT_INIT;
1865 	tp->mdev = TCP_TIMEOUT_INIT;
1866 
1867 	/* So many TCP implementations out there (incorrectly) count the
1868 	 * initial SYN frame in their delayed-ACK and congestion control
1869 	 * algorithms that we must have the following bandaid to talk
1870 	 * efficiently to them.  -DaveM
1871 	 */
1872 	tp->snd_cwnd = 2;
1873 
1874 	/* See draft-stevens-tcpca-spec-01 for discussion of the
1875 	 * initialization of these values.
1876 	 */
1877 	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
1878 	tp->snd_cwnd_clamp = ~0;
1879 	tp->mss_cache = TCP_MSS_DEFAULT;
1880 
1881 	tp->reordering = sysctl_tcp_reordering;
1882 	icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1883 
1884 	sk->sk_state = TCP_CLOSE;
1885 
1886 	sk->sk_write_space = sk_stream_write_space;
1887 	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1888 
1889 	icsk->icsk_af_ops = &ipv4_specific;
1890 	icsk->icsk_sync_mss = tcp_sync_mss;
1891 #ifdef CONFIG_TCP_MD5SIG
1892 	tp->af_specific = &tcp_sock_ipv4_specific;
1893 #endif
1894 
1895 	/* TCP Cookie Transactions */
1896 	if (sysctl_tcp_cookie_size > 0) {
1897 		/* Default, cookies without s_data_payload. */
1898 		tp->cookie_values =
1899 			kzalloc(sizeof(*tp->cookie_values),
1900 				sk->sk_allocation);
1901 		if (tp->cookie_values != NULL)
1902 			kref_init(&tp->cookie_values->kref);
1903 	}
1904 	/* Presumed zeroed, in order of appearance:
1905 	 *	cookie_in_always, cookie_out_never,
1906 	 *	s_data_constant, s_data_in, s_data_out
1907 	 */
1908 	sk->sk_sndbuf = sysctl_tcp_wmem[1];
1909 	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1910 
1911 	local_bh_disable();
1912 	percpu_counter_inc(&tcp_sockets_allocated);
1913 	local_bh_enable();
1914 
1915 	return 0;
1916 }
1917 
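/* Per-socket teardown: stop the retransmit/delayed-ACK/keepalive timers,
 * release congestion-control state, purge the write, out-of-order and
 * prequeue queues, free any MD5 keys, the cached sendmsg page and the
 * cookie values, drop the bind-bucket reference and decrement the global
 * socket count.
 */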
1918 void tcp_v4_destroy_sock(struct sock *sk)
1919 {
1920 	struct tcp_sock *tp = tcp_sk(sk);
1921 
1922 	tcp_clear_xmit_timers(sk);
1923 
1924 	tcp_cleanup_congestion_control(sk);
1925 
1926 	/* Cleanup up the write buffer. */
1927 	/* Clean up the write buffer. */
1928 
1929 	/* Cleans up our, hopefully empty, out_of_order_queue. */
1930 	__skb_queue_purge(&tp->out_of_order_queue);
1931 
1932 #ifdef CONFIG_TCP_MD5SIG
1933 	/* Clean up the MD5 key list, if any */
1934 	if (tp->md5sig_info) {
1935 		tcp_v4_clear_md5_list(sk);
1936 		kfree(tp->md5sig_info);
1937 		tp->md5sig_info = NULL;
1938 	}
1939 #endif
1940 
1941 #ifdef CONFIG_NET_DMA
1942 	/* Cleans up our sk_async_wait_queue */
1943 	__skb_queue_purge(&sk->sk_async_wait_queue);
1944 #endif
1945 
1946 	/* Clean up the prequeue; it should already be empty. */
1947 	__skb_queue_purge(&tp->ucopy.prequeue);
1948 
1949 	/* Clean up a referenced TCP bind bucket. */
1950 	if (inet_csk(sk)->icsk_bind_hash)
1951 		inet_put_port(sk);
1952 
1953 	/*
1954 	 * If sendmsg cached page exists, toss it.
1955 	 */
1956 	if (sk->sk_sndmsg_page) {
1957 		__free_page(sk->sk_sndmsg_page);
1958 		sk->sk_sndmsg_page = NULL;
1959 	}
1960 
1961 	/* TCP Cookie Transactions */
1962 	if (tp->cookie_values != NULL) {
1963 		kref_put(&tp->cookie_values->kref,
1964 			 tcp_cookie_values_release);
1965 		tp->cookie_values = NULL;
1966 	}
1967 
1968 	percpu_counter_dec(&tcp_sockets_allocated);
1969 }
1970 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1971 
1972 #ifdef CONFIG_PROC_FS
1973 /* Proc filesystem TCP sock list dumping. */
1974 
1975 static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
1976 {
1977 	return hlist_nulls_empty(head) ? NULL :
1978 		hlist_nulls_entry(head->first, struct inet_timewait_sock, tw_node);
1979 }
1980 
1981 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1982 {
1983 	return !is_a_nulls(tw->tw_node.next) ?
1984 		hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1985 }
1986 
1987 /*
1988  * Get the next listener socket following cur.  If cur is NULL, get the
1989  * first socket starting from the bucket given in st->bucket; when
1990  * st->bucket is zero the very first socket in the hash table is returned.
1991  */
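/* Locking: each listening bucket is walked under ilb->lock, and a
 * listener's open-request (SYN) table is walked under that socket's
 * syn_wait_lock as well.  st->state records which of the two the iterator
 * is in, so tcp_seq_stop() can release whatever is still held.
 */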
1992 static void *listening_get_next(struct seq_file *seq, void *cur)
1993 {
1994 	struct inet_connection_sock *icsk;
1995 	struct hlist_nulls_node *node;
1996 	struct sock *sk = cur;
1997 	struct inet_listen_hashbucket *ilb;
1998 	struct tcp_iter_state *st = seq->private;
1999 	struct net *net = seq_file_net(seq);
2000 
2001 	if (!sk) {
2002 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
2003 		spin_lock_bh(&ilb->lock);
2004 		sk = sk_nulls_head(&ilb->head);
2005 		st->offset = 0;
2006 		goto get_sk;
2007 	}
2008 	ilb = &tcp_hashinfo.listening_hash[st->bucket];
2009 	++st->num;
2010 	++st->offset;
2011 
2012 	if (st->state == TCP_SEQ_STATE_OPENREQ) {
2013 		struct request_sock *req = cur;
2014 
2015 		icsk = inet_csk(st->syn_wait_sk);
2016 		req = req->dl_next;
2017 		while (1) {
2018 			while (req) {
2019 				if (req->rsk_ops->family == st->family) {
2020 					cur = req;
2021 					goto out;
2022 				}
2023 				req = req->dl_next;
2024 			}
2025 			st->offset = 0;
2026 			if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
2027 				break;
2028 get_req:
2029 			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
2030 		}
2031 		sk	  = sk_next(st->syn_wait_sk);
2032 		st->state = TCP_SEQ_STATE_LISTENING;
2033 		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2034 	} else {
2035 		icsk = inet_csk(sk);
2036 		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2037 		if (reqsk_queue_len(&icsk->icsk_accept_queue))
2038 			goto start_req;
2039 		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2040 		sk = sk_next(sk);
2041 	}
2042 get_sk:
2043 	sk_nulls_for_each_from(sk, node) {
2044 		if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) {
2045 			cur = sk;
2046 			goto out;
2047 		}
2048 		icsk = inet_csk(sk);
2049 		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2050 		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2051 start_req:
2052 			st->uid		= sock_i_uid(sk);
2053 			st->syn_wait_sk = sk;
2054 			st->state	= TCP_SEQ_STATE_OPENREQ;
2055 			st->sbucket	= 0;
2056 			goto get_req;
2057 		}
2058 		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2059 	}
2060 	spin_unlock_bh(&ilb->lock);
2061 	st->offset = 0;
2062 	if (++st->bucket < INET_LHTABLE_SIZE) {
2063 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
2064 		spin_lock_bh(&ilb->lock);
2065 		sk = sk_nulls_head(&ilb->head);
2066 		goto get_sk;
2067 	}
2068 	cur = NULL;
2069 out:
2070 	return cur;
2071 }
2072 
2073 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2074 {
2075 	struct tcp_iter_state *st = seq->private;
2076 	void *rc;
2077 
2078 	st->bucket = 0;
2079 	st->offset = 0;
2080 	rc = listening_get_next(seq, NULL);
2081 
2082 	while (rc && *pos) {
2083 		rc = listening_get_next(seq, rc);
2084 		--*pos;
2085 	}
2086 	return rc;
2087 }
2088 
2089 static inline int empty_bucket(struct tcp_iter_state *st)
2090 {
2091 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2092 		hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
2093 }
2094 
2095 /*
2096  * Get first established socket starting from bucket given in st->bucket.
2097  * If st->bucket is zero, the very first socket in the hash is returned.
2098  */
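/* Each ehash bucket carries two chains, one for established sockets and
 * one for TIME_WAIT sockets; st->state tracks which chain the iterator is
 * on, and empty_bucket() lets completely empty buckets be skipped without
 * taking the bucket lock.
 */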
2099 static void *established_get_first(struct seq_file *seq)
2100 {
2101 	struct tcp_iter_state *st = seq->private;
2102 	struct net *net = seq_file_net(seq);
2103 	void *rc = NULL;
2104 
2105 	st->offset = 0;
2106 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2107 		struct sock *sk;
2108 		struct hlist_nulls_node *node;
2109 		struct inet_timewait_sock *tw;
2110 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2111 
2112 		/* Lockless fast path for the common case of empty buckets */
2113 		if (empty_bucket(st))
2114 			continue;
2115 
2116 		spin_lock_bh(lock);
2117 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2118 			if (sk->sk_family != st->family ||
2119 			    !net_eq(sock_net(sk), net)) {
2120 				continue;
2121 			}
2122 			rc = sk;
2123 			goto out;
2124 		}
2125 		st->state = TCP_SEQ_STATE_TIME_WAIT;
2126 		inet_twsk_for_each(tw, node,
2127 				   &tcp_hashinfo.ehash[st->bucket].twchain) {
2128 			if (tw->tw_family != st->family ||
2129 			    !net_eq(twsk_net(tw), net)) {
2130 				continue;
2131 			}
2132 			rc = tw;
2133 			goto out;
2134 		}
2135 		spin_unlock_bh(lock);
2136 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2137 	}
2138 out:
2139 	return rc;
2140 }
2141 
2142 static void *established_get_next(struct seq_file *seq, void *cur)
2143 {
2144 	struct sock *sk = cur;
2145 	struct inet_timewait_sock *tw;
2146 	struct hlist_nulls_node *node;
2147 	struct tcp_iter_state *st = seq->private;
2148 	struct net *net = seq_file_net(seq);
2149 
2150 	++st->num;
2151 	++st->offset;
2152 
2153 	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2154 		tw = cur;
2155 		tw = tw_next(tw);
2156 get_tw:
2157 		while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2158 			tw = tw_next(tw);
2159 		}
2160 		if (tw) {
2161 			cur = tw;
2162 			goto out;
2163 		}
2164 		spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2165 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2166 
2167 		/* Look for the next non-empty bucket */
2168 		st->offset = 0;
2169 		while (++st->bucket <= tcp_hashinfo.ehash_mask &&
2170 				empty_bucket(st))
2171 			;
2172 		if (st->bucket > tcp_hashinfo.ehash_mask)
2173 			return NULL;
2174 
2175 		spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2176 		sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2177 	} else
2178 		sk = sk_nulls_next(sk);
2179 
2180 	sk_nulls_for_each_from(sk, node) {
2181 		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2182 			goto found;
2183 	}
2184 
2185 	st->state = TCP_SEQ_STATE_TIME_WAIT;
2186 	tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2187 	goto get_tw;
2188 found:
2189 	cur = sk;
2190 out:
2191 	return cur;
2192 }
2193 
2194 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2195 {
2196 	struct tcp_iter_state *st = seq->private;
2197 	void *rc;
2198 
2199 	st->bucket = 0;
2200 	rc = established_get_first(seq);
2201 
2202 	while (rc && pos) {
2203 		rc = established_get_next(seq, rc);
2204 		--pos;
2205 	}
2206 	return rc;
2207 }
2208 
2209 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2210 {
2211 	void *rc;
2212 	struct tcp_iter_state *st = seq->private;
2213 
2214 	st->state = TCP_SEQ_STATE_LISTENING;
2215 	rc	  = listening_get_idx(seq, &pos);
2216 
2217 	if (!rc) {
2218 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2219 		rc	  = established_get_idx(seq, pos);
2220 	}
2221 
2222 	return rc;
2223 }
2224 
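/* Called from tcp_seq_start() when the requested position matches where
 * the previous read stopped: instead of rescanning from the beginning,
 * restart at the remembered bucket, replay st->offset entries within it,
 * and restore st->num so the row numbering stays continuous.
 */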
2225 static void *tcp_seek_last_pos(struct seq_file *seq)
2226 {
2227 	struct tcp_iter_state *st = seq->private;
2228 	int offset = st->offset;
2229 	int orig_num = st->num;
2230 	void *rc = NULL;
2231 
2232 	switch (st->state) {
2233 	case TCP_SEQ_STATE_OPENREQ:
2234 	case TCP_SEQ_STATE_LISTENING:
2235 		if (st->bucket >= INET_LHTABLE_SIZE)
2236 			break;
2237 		st->state = TCP_SEQ_STATE_LISTENING;
2238 		rc = listening_get_next(seq, NULL);
2239 		while (offset-- && rc)
2240 			rc = listening_get_next(seq, rc);
2241 		if (rc)
2242 			break;
2243 		st->bucket = 0;
2244 		/* Fallthrough */
2245 	case TCP_SEQ_STATE_ESTABLISHED:
2246 	case TCP_SEQ_STATE_TIME_WAIT:
2247 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2248 		if (st->bucket > tcp_hashinfo.ehash_mask)
2249 			break;
2250 		rc = established_get_first(seq);
2251 		while (offset-- && rc)
2252 			rc = established_get_next(seq, rc);
2253 	}
2254 
2255 	st->num = orig_num;
2256 
2257 	return rc;
2258 }
2259 
2260 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2261 {
2262 	struct tcp_iter_state *st = seq->private;
2263 	void *rc;
2264 
2265 	if (*pos && *pos == st->last_pos) {
2266 		rc = tcp_seek_last_pos(seq);
2267 		if (rc)
2268 			goto out;
2269 	}
2270 
2271 	st->state = TCP_SEQ_STATE_LISTENING;
2272 	st->num = 0;
2273 	st->bucket = 0;
2274 	st->offset = 0;
2275 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2276 
2277 out:
2278 	st->last_pos = *pos;
2279 	return rc;
2280 }
2281 
2282 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2283 {
2284 	struct tcp_iter_state *st = seq->private;
2285 	void *rc = NULL;
2286 
2287 	if (v == SEQ_START_TOKEN) {
2288 		rc = tcp_get_idx(seq, 0);
2289 		goto out;
2290 	}
2291 
2292 	switch (st->state) {
2293 	case TCP_SEQ_STATE_OPENREQ:
2294 	case TCP_SEQ_STATE_LISTENING:
2295 		rc = listening_get_next(seq, v);
2296 		if (!rc) {
2297 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2298 			st->bucket = 0;
2299 			st->offset = 0;
2300 			rc	  = established_get_first(seq);
2301 		}
2302 		break;
2303 	case TCP_SEQ_STATE_ESTABLISHED:
2304 	case TCP_SEQ_STATE_TIME_WAIT:
2305 		rc = established_get_next(seq, v);
2306 		break;
2307 	}
2308 out:
2309 	++*pos;
2310 	st->last_pos = *pos;
2311 	return rc;
2312 }
2313 
2314 static void tcp_seq_stop(struct seq_file *seq, void *v)
2315 {
2316 	struct tcp_iter_state *st = seq->private;
2317 
2318 	switch (st->state) {
2319 	case TCP_SEQ_STATE_OPENREQ:
2320 		if (v) {
2321 			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2322 			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2323 		}
2324 	case TCP_SEQ_STATE_LISTENING:
2325 		if (v != SEQ_START_TOKEN)
2326 			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2327 		break;
2328 	case TCP_SEQ_STATE_TIME_WAIT:
2329 	case TCP_SEQ_STATE_ESTABLISHED:
2330 		if (v)
2331 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2332 		break;
2333 	}
2334 }
2335 
2336 static int tcp_seq_open(struct inode *inode, struct file *file)
2337 {
2338 	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2339 	struct tcp_iter_state *s;
2340 	int err;
2341 
2342 	err = seq_open_net(inode, file, &afinfo->seq_ops,
2343 			  sizeof(struct tcp_iter_state));
2344 	if (err < 0)
2345 		return err;
2346 
2347 	s = ((struct seq_file *)file->private_data)->private;
2348 	s->family		= afinfo->family;
2349 	s->last_pos 		= 0;
2350 	return 0;
2351 }
2352 
2353 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2354 {
2355 	int rc = 0;
2356 	struct proc_dir_entry *p;
2357 
2358 	afinfo->seq_fops.open		= tcp_seq_open;
2359 	afinfo->seq_fops.read		= seq_read;
2360 	afinfo->seq_fops.llseek		= seq_lseek;
2361 	afinfo->seq_fops.release	= seq_release_net;
2362 
2363 	afinfo->seq_ops.start		= tcp_seq_start;
2364 	afinfo->seq_ops.next		= tcp_seq_next;
2365 	afinfo->seq_ops.stop		= tcp_seq_stop;
2366 
2367 	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2368 			     &afinfo->seq_fops, afinfo);
2369 	if (!p)
2370 		rc = -ENOMEM;
2371 	return rc;
2372 }
2373 EXPORT_SYMBOL(tcp_proc_register);
2374 
2375 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2376 {
2377 	proc_net_remove(net, afinfo->name);
2378 }
2379 EXPORT_SYMBOL(tcp_proc_unregister);
2380 
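/* The next three helpers each format one /proc/net/tcp row:
 * get_openreq4() for embryonic (SYN_RECV) requests, get_tcp4_sock() for
 * full sockets and get_timewait4_sock() for TIME_WAIT entries.
 * tcp4_seq_show() picks the right one from the iterator state and pads
 * every row to TMPSZ - 1 characters.
 */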
2381 static void get_openreq4(struct sock *sk, struct request_sock *req,
2382 			 struct seq_file *f, int i, int uid, int *len)
2383 {
2384 	const struct inet_request_sock *ireq = inet_rsk(req);
2385 	int ttd = req->expires - jiffies;
2386 
2387 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2388 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n",
2389 		i,
2390 		ireq->loc_addr,
2391 		ntohs(inet_sk(sk)->inet_sport),
2392 		ireq->rmt_addr,
2393 		ntohs(ireq->rmt_port),
2394 		TCP_SYN_RECV,
2395 		0, 0, /* could print option size, but that is af dependent. */
2396 		1,    /* timers active (only the expire timer) */
2397 		jiffies_to_clock_t(ttd),
2398 		req->retrans,
2399 		uid,
2400 		0,  /* non-standard timer */
2401 		0, /* open_requests have no inode */
2402 		atomic_read(&sk->sk_refcnt),
2403 		req,
2404 		len);
2405 }
2406 
2407 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2408 {
2409 	int timer_active;
2410 	unsigned long timer_expires;
2411 	struct tcp_sock *tp = tcp_sk(sk);
2412 	const struct inet_connection_sock *icsk = inet_csk(sk);
2413 	struct inet_sock *inet = inet_sk(sk);
2414 	__be32 dest = inet->inet_daddr;
2415 	__be32 src = inet->inet_rcv_saddr;
2416 	__u16 destp = ntohs(inet->inet_dport);
2417 	__u16 srcp = ntohs(inet->inet_sport);
2418 	int rx_queue;
2419 
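	/* timer_active encodes which timer is pending, as reported in the
	 * "tr" column of /proc/net/tcp: 1 retransmit, 2 keepalive (sk_timer),
	 * 4 zero-window probe, 0 none.  3 is reserved for TIME_WAIT sockets,
	 * see get_timewait4_sock().
	 */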
2420 	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2421 		timer_active	= 1;
2422 		timer_expires	= icsk->icsk_timeout;
2423 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2424 		timer_active	= 4;
2425 		timer_expires	= icsk->icsk_timeout;
2426 	} else if (timer_pending(&sk->sk_timer)) {
2427 		timer_active	= 2;
2428 		timer_expires	= sk->sk_timer.expires;
2429 	} else {
2430 		timer_active	= 0;
2431 		timer_expires = jiffies;
2432 	}
2433 
2434 	if (sk->sk_state == TCP_LISTEN)
2435 		rx_queue = sk->sk_ack_backlog;
2436 	else
2437 		/*
2438 		 * because we don't lock the socket, we might find a transient negative value
2439 		 */
2440 		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2441 
2442 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2443 			"%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n",
2444 		i, src, srcp, dest, destp, sk->sk_state,
2445 		tp->write_seq - tp->snd_una,
2446 		rx_queue,
2447 		timer_active,
2448 		jiffies_to_clock_t(timer_expires - jiffies),
2449 		icsk->icsk_retransmits,
2450 		sock_i_uid(sk),
2451 		icsk->icsk_probes_out,
2452 		sock_i_ino(sk),
2453 		atomic_read(&sk->sk_refcnt), sk,
2454 		jiffies_to_clock_t(icsk->icsk_rto),
2455 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2456 		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2457 		tp->snd_cwnd,
2458 		tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh,
2459 		len);
2460 }
2461 
2462 static void get_timewait4_sock(struct inet_timewait_sock *tw,
2463 			       struct seq_file *f, int i, int *len)
2464 {
2465 	__be32 dest, src;
2466 	__u16 destp, srcp;
2467 	int ttd = tw->tw_ttd - jiffies;
2468 
2469 	if (ttd < 0)
2470 		ttd = 0;
2471 
2472 	dest  = tw->tw_daddr;
2473 	src   = tw->tw_rcv_saddr;
2474 	destp = ntohs(tw->tw_dport);
2475 	srcp  = ntohs(tw->tw_sport);
2476 
2477 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2478 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n",
2479 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2480 		3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2481 		atomic_read(&tw->tw_refcnt), tw, len);
2482 }
2483 
2484 #define TMPSZ 150
2485 
2486 static int tcp4_seq_show(struct seq_file *seq, void *v)
2487 {
2488 	struct tcp_iter_state *st;
2489 	int len;
2490 
2491 	if (v == SEQ_START_TOKEN) {
2492 		seq_printf(seq, "%-*s\n", TMPSZ - 1,
2493 			   "  sl  local_address rem_address   st tx_queue "
2494 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2495 			   "inode");
2496 		goto out;
2497 	}
2498 	st = seq->private;
2499 
2500 	switch (st->state) {
2501 	case TCP_SEQ_STATE_LISTENING:
2502 	case TCP_SEQ_STATE_ESTABLISHED:
2503 		get_tcp4_sock(v, seq, st->num, &len);
2504 		break;
2505 	case TCP_SEQ_STATE_OPENREQ:
2506 		get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2507 		break;
2508 	case TCP_SEQ_STATE_TIME_WAIT:
2509 		get_timewait4_sock(v, seq, st->num, &len);
2510 		break;
2511 	}
2512 	seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2513 out:
2514 	return 0;
2515 }
2516 
2517 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2518 	.name		= "tcp",
2519 	.family		= AF_INET,
2520 	.seq_fops	= {
2521 		.owner		= THIS_MODULE,
2522 	},
2523 	.seq_ops	= {
2524 		.show		= tcp4_seq_show,
2525 	},
2526 };
2527 
2528 static int __net_init tcp4_proc_init_net(struct net *net)
2529 {
2530 	return tcp_proc_register(net, &tcp4_seq_afinfo);
2531 }
2532 
2533 static void __net_exit tcp4_proc_exit_net(struct net *net)
2534 {
2535 	tcp_proc_unregister(net, &tcp4_seq_afinfo);
2536 }
2537 
2538 static struct pernet_operations tcp4_net_ops = {
2539 	.init = tcp4_proc_init_net,
2540 	.exit = tcp4_proc_exit_net,
2541 };
2542 
2543 int __init tcp4_proc_init(void)
2544 {
2545 	return register_pernet_subsys(&tcp4_net_ops);
2546 }
2547 
2548 void tcp4_proc_exit(void)
2549 {
2550 	unregister_pernet_subsys(&tcp4_net_ops);
2551 }
2552 #endif /* CONFIG_PROC_FS */
2553 
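/* GRO hooks for IPv4 TCP.  On receive, a CHECKSUM_COMPLETE packet is
 * verified against the pseudo-header checksum over the GRO length before
 * being handed to tcp_gro_receive(); anything that fails, or arrives with
 * CHECKSUM_NONE, is flagged for flushing so it is never aggregated
 * unverified.  On completion the merged skb gets a fresh pseudo-header
 * checksum and the TCPv4 GSO type.
 */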
2554 struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2555 {
2556 	struct iphdr *iph = skb_gro_network_header(skb);
2557 
2558 	switch (skb->ip_summed) {
2559 	case CHECKSUM_COMPLETE:
2560 		if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2561 				  skb->csum)) {
2562 			skb->ip_summed = CHECKSUM_UNNECESSARY;
2563 			break;
2564 		}
2565 
2566 		/* fall through */
2567 	case CHECKSUM_NONE:
2568 		NAPI_GRO_CB(skb)->flush = 1;
2569 		return NULL;
2570 	}
2571 
2572 	return tcp_gro_receive(head, skb);
2573 }
2574 EXPORT_SYMBOL(tcp4_gro_receive);
2575 
2576 int tcp4_gro_complete(struct sk_buff *skb)
2577 {
2578 	struct iphdr *iph = ip_hdr(skb);
2579 	struct tcphdr *th = tcp_hdr(skb);
2580 
2581 	th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2582 				  iph->saddr, iph->daddr, 0);
2583 	skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2584 
2585 	return tcp_gro_complete(skb);
2586 }
2587 EXPORT_SYMBOL(tcp4_gro_complete);
2588 
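/* The struct proto instance for IPv4 TCP.  The socket layer dispatches
 * through sk->sk_prot, so connect(), accept(), sendmsg() and friends on a
 * TCP socket end up in the handlers below.
 */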
2589 struct proto tcp_prot = {
2590 	.name			= "TCP",
2591 	.owner			= THIS_MODULE,
2592 	.close			= tcp_close,
2593 	.connect		= tcp_v4_connect,
2594 	.disconnect		= tcp_disconnect,
2595 	.accept			= inet_csk_accept,
2596 	.ioctl			= tcp_ioctl,
2597 	.init			= tcp_v4_init_sock,
2598 	.destroy		= tcp_v4_destroy_sock,
2599 	.shutdown		= tcp_shutdown,
2600 	.setsockopt		= tcp_setsockopt,
2601 	.getsockopt		= tcp_getsockopt,
2602 	.recvmsg		= tcp_recvmsg,
2603 	.sendmsg		= tcp_sendmsg,
2604 	.sendpage		= tcp_sendpage,
2605 	.backlog_rcv		= tcp_v4_do_rcv,
2606 	.hash			= inet_hash,
2607 	.unhash			= inet_unhash,
2608 	.get_port		= inet_csk_get_port,
2609 	.enter_memory_pressure	= tcp_enter_memory_pressure,
2610 	.sockets_allocated	= &tcp_sockets_allocated,
2611 	.orphan_count		= &tcp_orphan_count,
2612 	.memory_allocated	= &tcp_memory_allocated,
2613 	.memory_pressure	= &tcp_memory_pressure,
2614 	.sysctl_mem		= sysctl_tcp_mem,
2615 	.sysctl_wmem		= sysctl_tcp_wmem,
2616 	.sysctl_rmem		= sysctl_tcp_rmem,
2617 	.max_header		= MAX_TCP_HEADER,
2618 	.obj_size		= sizeof(struct tcp_sock),
2619 	.slab_flags		= SLAB_DESTROY_BY_RCU,
2620 	.twsk_prot		= &tcp_timewait_sock_ops,
2621 	.rsk_prot		= &tcp_request_sock_ops,
2622 	.h.hashinfo		= &tcp_hashinfo,
2623 	.no_autobind		= true,
2624 #ifdef CONFIG_COMPAT
2625 	.compat_setsockopt	= compat_tcp_setsockopt,
2626 	.compat_getsockopt	= compat_tcp_getsockopt,
2627 #endif
2628 };
2629 EXPORT_SYMBOL(tcp_prot);
2630 
2631 
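/* Per-network-namespace setup: create the kernel control socket
 * (net->ipv4.tcp_sock) used for internally generated segments such as
 * resets, destroy it on namespace exit, and purge any TIME_WAIT sockets
 * left over when namespaces are torn down.
 */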
2632 static int __net_init tcp_sk_init(struct net *net)
2633 {
2634 	return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2635 				    PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2636 }
2637 
2638 static void __net_exit tcp_sk_exit(struct net *net)
2639 {
2640 	inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2641 }
2642 
2643 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2644 {
2645 	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2646 }
2647 
2648 static struct pernet_operations __net_initdata tcp_sk_ops = {
2649 	.init	   = tcp_sk_init,
2650 	.exit	   = tcp_sk_exit,
2651 	.exit_batch = tcp_sk_exit_batch,
2652 };
2653 
2654 void __init tcp_v4_init(void)
2655 {
2656 	inet_hashinfo_init(&tcp_hashinfo);
2657 	if (register_pernet_subsys(&tcp_sk_ops))
2658 		panic("Failed to create the TCP control socket.\n");
2659 }
2660