xref: /linux/net/ipv4/tcp_ipv4.c (revision 13abf8130139c2ccd4962a7e5a8902be5e6cb5a7)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:	$Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
9  *
10  *		IPv4 specific functions
11  *
12  *
13  *		code split from:
14  *		linux/ipv4/tcp.c
15  *		linux/ipv4/tcp_input.c
16  *		linux/ipv4/tcp_output.c
17  *
18  *		See tcp.c for author information
19  *
20  *	This program is free software; you can redistribute it and/or
21  *      modify it under the terms of the GNU General Public License
22  *      as published by the Free Software Foundation; either version
23  *      2 of the License, or (at your option) any later version.
24  */
25 
26 /*
27  * Changes:
28  *		David S. Miller	:	New socket lookup architecture.
29  *					This code is dedicated to John Dyson.
30  *		David S. Miller :	Change semantics of established hash,
31  *					half is devoted to TIME_WAIT sockets
32  *					and the rest go in the other half.
33  *		Andi Kleen :		Add support for syncookies and fixed
34  *					some bugs: ip options weren't passed to
35  *					the TCP layer, missed a check for an
36  *					ACK bit.
37  *		Andi Kleen :		Implemented fast path mtu discovery.
38  *	     				Fixed many serious bugs in the
39  *					request_sock handling and moved
40  *					most of it into the af independent code.
41  *					Added tail drop and some other bugfixes.
42  *					Added new listen semantics.
43  *		Mike McLagan	:	Routing by source
44  *	Juan Jose Ciarlante:		ip_dynaddr bits
45  *		Andi Kleen:		various fixes.
46  *	Vitaly E. Lavrov	:	Transparent proxy revived after a
47  *					year-long coma.
48  *	Andi Kleen		:	Fix new listen.
49  *	Andi Kleen		:	Fix accept error reporting.
50  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
51  *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
52  *					to a single port at the same time.
53  */
54 
55 #include <linux/config.h>
56 
57 #include <linux/types.h>
58 #include <linux/fcntl.h>
59 #include <linux/module.h>
60 #include <linux/random.h>
61 #include <linux/cache.h>
62 #include <linux/jhash.h>
63 #include <linux/init.h>
64 #include <linux/times.h>
65 
66 #include <net/icmp.h>
67 #include <net/inet_hashtables.h>
68 #include <net/tcp.h>
69 #include <net/transp_v6.h>
70 #include <net/ipv6.h>
71 #include <net/inet_common.h>
72 #include <net/xfrm.h>
73 
74 #include <linux/inet.h>
75 #include <linux/ipv6.h>
76 #include <linux/stddef.h>
77 #include <linux/proc_fs.h>
78 #include <linux/seq_file.h>
79 
80 int sysctl_tcp_tw_reuse;
81 int sysctl_tcp_low_latency;
82 
83 /* Check TCP sequence numbers in ICMP packets. */
84 #define ICMP_MIN_LENGTH 8
85 
86 /* Socket used for sending RSTs */
87 static struct socket *tcp_socket;
88 
89 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
90 		       struct sk_buff *skb);
91 
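/*
 * All TCP socket lookup state lives here: the established hash (whose
 * second half holds TIME-WAIT sockets), the bound-port hash and the
 * listening hash, shared with the generic inet_hashtables code.
 */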
92 struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
93 	.lhash_lock	= RW_LOCK_UNLOCKED,
94 	.lhash_users	= ATOMIC_INIT(0),
95 	.lhash_wait	= __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
96 	.portalloc_lock	= SPIN_LOCK_UNLOCKED,
97 	.port_rover	= 1024 - 1,
98 };
99 
100 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
101 {
102 	return inet_csk_get_port(&tcp_hashinfo, sk, snum);
103 }
104 
105 static void tcp_v4_hash(struct sock *sk)
106 {
107 	inet_hash(&tcp_hashinfo, sk);
108 }
109 
110 void tcp_unhash(struct sock *sk)
111 {
112 	inet_unhash(&tcp_hashinfo, sk);
113 }
114 
115 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
116 {
117 	return secure_tcp_sequence_number(skb->nh.iph->daddr,
118 					  skb->nh.iph->saddr,
119 					  skb->h.th->dest,
120 					  skb->h.th->source);
121 }
122 
123 /* called with local bh disabled */
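/*
 * Check whether the (saddr, lport, daddr, dport) identity the caller
 * wants is free in the established table.  A matching TIME-WAIT socket
 * may be recycled: it is handed back through *twp (or descheduled right
 * here when twp is NULL); any other collision means -EADDRNOTAVAIL.
 */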
124 static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
125 				      struct inet_timewait_sock **twp)
126 {
127 	struct inet_sock *inet = inet_sk(sk);
128 	u32 daddr = inet->rcv_saddr;
129 	u32 saddr = inet->daddr;
130 	int dif = sk->sk_bound_dev_if;
131 	INET_ADDR_COOKIE(acookie, saddr, daddr)
132 	const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
133 	const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport, tcp_hashinfo.ehash_size);
134 	struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[hash];
135 	struct sock *sk2;
136 	const struct hlist_node *node;
137 	struct inet_timewait_sock *tw;
138 
139 	write_lock(&head->lock);
140 
141 	/* Check TIME-WAIT sockets first. */
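	/* They occupy the twin bucket at head + ehash_size. */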
142 	sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) {
143 		tw = inet_twsk(sk2);
144 
145 		if (INET_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
146 			const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2);
147 			struct tcp_sock *tp = tcp_sk(sk);
148 
149 			/* With PAWS, it is safe from the viewpoint
150 			   of data integrity. Even without PAWS it
151 			   is safe provided sequence spaces do not
152 			   overlap, i.e. at data rates <= 80 Mbit/sec.
153 
154 			   Actually, the idea is close to VJ's: only
155 			   the timestamp cache is held not per host
156 			   but per port pair, and the TW bucket is
157 			   used as the state holder.
158 
159 			   If the TW bucket has already been destroyed
160 			   we fall back to VJ's scheme and use the
161 			   initial timestamp retrieved from the peer table.
162 			 */
163 			if (tcptw->tw_ts_recent_stamp &&
164 			    (!twp || (sysctl_tcp_tw_reuse &&
165 				      xtime.tv_sec -
166 				      tcptw->tw_ts_recent_stamp > 1))) {
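				/* Pick the new write_seq past the old
				 * connection's snd_nxt plus a full 64K
				 * window, so the sequence spaces should
				 * not overlap.
				 */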
167 				tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
168 				if (tp->write_seq == 0)
169 					tp->write_seq = 1;
170 				tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
171 				tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
172 				sock_hold(sk2);
173 				goto unique;
174 			} else
175 				goto not_unique;
176 		}
177 	}
178 	tw = NULL;
179 
180 	/* And established part... */
181 	sk_for_each(sk2, node, &head->chain) {
182 		if (INET_MATCH(sk2, acookie, saddr, daddr, ports, dif))
183 			goto not_unique;
184 	}
185 
186 unique:
187 	/* Must record num and sport now. Otherwise we will see
188 	 * in the hash table a socket with a funny identity. */
189 	inet->num = lport;
190 	inet->sport = htons(lport);
191 	sk->sk_hashent = hash;
192 	BUG_TRAP(sk_unhashed(sk));
193 	__sk_add_node(sk, &head->chain);
194 	sock_prot_inc_use(sk->sk_prot);
195 	write_unlock(&head->lock);
196 
197 	if (twp) {
198 		*twp = tw;
199 		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
200 	} else if (tw) {
201 		/* Silly. Should hash-dance instead... */
202 		inet_twsk_deschedule(tw, &tcp_death_row);
203 		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
204 
205 		inet_twsk_put(tw);
206 	}
207 
208 	return 0;
209 
210 not_unique:
211 	write_unlock(&head->lock);
212 	return -EADDRNOTAVAIL;
213 }
214 
215 static inline u32 connect_port_offset(const struct sock *sk)
216 {
217 	const struct inet_sock *inet = inet_sk(sk);
218 
219 	return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr,
220 					 inet->dport);
221 }
222 
223 /*
224  * Bind a port for a connect operation and hash it.
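 *
 * If no local port has been chosen yet, ephemeral ports are probed
 * starting from a per-destination offset (connect_port_offset() plus a
 * rolling hint), and a port occupied only by a recyclable TIME-WAIT
 * socket may be taken over via __tcp_v4_check_established().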
225  */
226 static inline int tcp_v4_hash_connect(struct sock *sk)
227 {
228 	const unsigned short snum = inet_sk(sk)->num;
229  	struct inet_bind_hashbucket *head;
230  	struct inet_bind_bucket *tb;
231 	int ret;
232 
233  	if (!snum) {
234  		int low = sysctl_local_port_range[0];
235  		int high = sysctl_local_port_range[1];
236 		int range = high - low;
237  		int i;
238 		int port;
239 		static u32 hint;
240 		u32 offset = hint + connect_port_offset(sk);
241 		struct hlist_node *node;
242  		struct inet_timewait_sock *tw = NULL;
243 
244  		local_bh_disable();
245 		for (i = 1; i <= range; i++) {
246 			port = low + (i + offset) % range;
247  			head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)];
248  			spin_lock(&head->lock);
249 
250  			/* Does not bother with rcv_saddr checks,
251  			 * because the established check is already
252  			 * unique enough.
253  			 */
254 			inet_bind_bucket_for_each(tb, node, &head->chain) {
255  				if (tb->port == port) {
256  					BUG_TRAP(!hlist_empty(&tb->owners));
257  					if (tb->fastreuse >= 0)
258  						goto next_port;
259  					if (!__tcp_v4_check_established(sk,
260 									port,
261 									&tw))
262  						goto ok;
263  					goto next_port;
264  				}
265  			}
266 
267  			tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port);
268  			if (!tb) {
269  				spin_unlock(&head->lock);
270  				break;
271  			}
272  			tb->fastreuse = -1;
273  			goto ok;
274 
275  		next_port:
276  			spin_unlock(&head->lock);
277  		}
278  		local_bh_enable();
279 
280  		return -EADDRNOTAVAIL;
281 
282 ok:
283 		hint += i;
284 
285  		/* Head lock still held and bh's disabled */
286  		inet_bind_hash(sk, tb, port);
287 		if (sk_unhashed(sk)) {
288  			inet_sk(sk)->sport = htons(port);
289  			__inet_hash(&tcp_hashinfo, sk, 0);
290  		}
291  		spin_unlock(&head->lock);
292 
293  		if (tw) {
294  			inet_twsk_deschedule(tw, &tcp_death_row);
295  			inet_twsk_put(tw);
296  		}
297 
298 		ret = 0;
299 		goto out;
300  	}
301 
302  	head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
303  	tb  = inet_csk(sk)->icsk_bind_hash;
304 	spin_lock_bh(&head->lock);
305 	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
306 		__inet_hash(&tcp_hashinfo, sk, 0);
307 		spin_unlock_bh(&head->lock);
308 		return 0;
309 	} else {
310 		spin_unlock(&head->lock);
311 		/* No definite answer... Walk to established hash table */
312 		ret = __tcp_v4_check_established(sk, snum, NULL);
313 out:
314 		local_bh_enable();
315 		return ret;
316 	}
317 }
318 
319 /* This will initiate an outgoing connection. */
320 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
321 {
322 	struct inet_sock *inet = inet_sk(sk);
323 	struct tcp_sock *tp = tcp_sk(sk);
324 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
325 	struct rtable *rt;
326 	u32 daddr, nexthop;
327 	int tmp;
328 	int err;
329 
330 	if (addr_len < sizeof(struct sockaddr_in))
331 		return -EINVAL;
332 
333 	if (usin->sin_family != AF_INET)
334 		return -EAFNOSUPPORT;
335 
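	/* With an IP source-route option the route is built towards the
	 * first hop (opt->faddr); daddr then keeps the caller's final
	 * destination instead of being replaced by rt_dst further down.
	 */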
336 	nexthop = daddr = usin->sin_addr.s_addr;
337 	if (inet->opt && inet->opt->srr) {
338 		if (!daddr)
339 			return -EINVAL;
340 		nexthop = inet->opt->faddr;
341 	}
342 
343 	tmp = ip_route_connect(&rt, nexthop, inet->saddr,
344 			       RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
345 			       IPPROTO_TCP,
346 			       inet->sport, usin->sin_port, sk);
347 	if (tmp < 0)
348 		return tmp;
349 
350 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
351 		ip_rt_put(rt);
352 		return -ENETUNREACH;
353 	}
354 
355 	if (!inet->opt || !inet->opt->srr)
356 		daddr = rt->rt_dst;
357 
358 	if (!inet->saddr)
359 		inet->saddr = rt->rt_src;
360 	inet->rcv_saddr = inet->saddr;
361 
362 	if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
363 		/* Reset inherited state */
364 		tp->rx_opt.ts_recent	   = 0;
365 		tp->rx_opt.ts_recent_stamp = 0;
366 		tp->write_seq		   = 0;
367 	}
368 
369 	if (tcp_death_row.sysctl_tw_recycle &&
370 	    !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
371 		struct inet_peer *peer = rt_get_peer(rt);
372 
373 		/* VJ's idea. We save last timestamp seen from
374 		 * the destination in peer table, when entering state TIME-WAIT
375 		 * and initialize rx_opt.ts_recent from it, when trying new connection.
376 		 */
377 
378 		if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
379 			tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
380 			tp->rx_opt.ts_recent = peer->tcp_ts;
381 		}
382 	}
383 
384 	inet->dport = usin->sin_port;
385 	inet->daddr = daddr;
386 
387 	tp->ext_header_len = 0;
388 	if (inet->opt)
389 		tp->ext_header_len = inet->opt->optlen;
390 
391 	tp->rx_opt.mss_clamp = 536;
392 
393 	/* Socket identity is still unknown (sport may be zero).
394 	 * However we set the state to SYN-SENT and, without releasing the
395 	 * socket lock, select a source port, enter ourselves into the hash
396 	 * tables and complete initialization after this.
397 	 */
398 	tcp_set_state(sk, TCP_SYN_SENT);
399 	err = tcp_v4_hash_connect(sk);
400 	if (err)
401 		goto failure;
402 
403 	err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
404 	if (err)
405 		goto failure;
406 
407 	/* OK, now commit destination to socket.  */
408 	sk_setup_caps(sk, &rt->u.dst);
409 
410 	if (!tp->write_seq)
411 		tp->write_seq = secure_tcp_sequence_number(inet->saddr,
412 							   inet->daddr,
413 							   inet->sport,
414 							   usin->sin_port);
415 
416 	inet->id = tp->write_seq ^ jiffies;
417 
418 	err = tcp_connect(sk);
419 	rt = NULL;
420 	if (err)
421 		goto failure;
422 
423 	return 0;
424 
425 failure:
426 	/* This unhashes the socket and releases the local port, if necessary. */
427 	tcp_set_state(sk, TCP_CLOSE);
428 	ip_rt_put(rt);
429 	sk->sk_route_caps = 0;
430 	inet->dport = 0;
431 	return err;
432 }
433 
434 /*
435  * This routine does path mtu discovery as defined in RFC1191.
436  */
437 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
438 				     u32 mtu)
439 {
440 	struct dst_entry *dst;
441 	struct inet_sock *inet = inet_sk(sk);
442 	struct tcp_sock *tp = tcp_sk(sk);
443 
444 	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
445 	 * sent out by Linux are always < 576 bytes, so they should go through
446 	 * unfragmented).
447 	 */
448 	if (sk->sk_state == TCP_LISTEN)
449 		return;
450 
451 	/* We don't check in the dst entry if pmtu discovery is forbidden
452 	 * on this route. We just assume that no packet-too-big packets
453 	 * are sent back when pmtu discovery is not active.
454 	 * There is a small race when the user changes this flag in the
455 	 * route, but I think that's acceptable.
456 	 */
457 	if ((dst = __sk_dst_check(sk, 0)) == NULL)
458 		return;
459 
460 	dst->ops->update_pmtu(dst, mtu);
461 
462 	/* Something is about to go wrong... Remember the soft error
463 	 * in case this connection is not able to recover.
464 	 */
465 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
466 		sk->sk_err_soft = EMSGSIZE;
467 
468 	mtu = dst_mtu(dst);
469 
470 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
471 	    tp->pmtu_cookie > mtu) {
472 		tcp_sync_mss(sk, mtu);
473 
474 		/* Resend the TCP packet because it's
475 		 * clear that the old packet has been
476 		 * dropped. This is the new "fast" path mtu
477 		 * discovery.
478 		 */
479 		tcp_simple_retransmit(sk);
480 	} /* else let the usual retransmit timer handle it */
481 }
482 
483 /*
484  * This routine is called by the ICMP module when it gets some
485  * sort of error condition.  If err < 0 then the socket should
486  * be closed and the error returned to the user.  If err > 0
487  * it's just the icmp type << 8 | icmp code.  After adjustment
488  * header points to the first 8 bytes of the tcp header.  We need
489  * to find the appropriate port.
490  *
491  * The locking strategy used here is very "optimistic". When
492  * someone else accesses the socket the ICMP is just dropped
493  * and for some paths there is no check at all.
494  * A more general error queue to queue errors for later handling
495  * is probably better.
496  *
497  */
498 
499 void tcp_v4_err(struct sk_buff *skb, u32 info)
500 {
501 	struct iphdr *iph = (struct iphdr *)skb->data;
502 	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
503 	struct tcp_sock *tp;
504 	struct inet_sock *inet;
505 	int type = skb->h.icmph->type;
506 	int code = skb->h.icmph->code;
507 	struct sock *sk;
508 	__u32 seq;
509 	int err;
510 
511 	if (skb->len < (iph->ihl << 2) + 8) {
512 		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
513 		return;
514 	}
515 
516 	sk = inet_lookup(&tcp_hashinfo, iph->daddr, th->dest, iph->saddr,
517 			 th->source, inet_iif(skb));
518 	if (!sk) {
519 		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
520 		return;
521 	}
522 	if (sk->sk_state == TCP_TIME_WAIT) {
523 		inet_twsk_put((struct inet_timewait_sock *)sk);
524 		return;
525 	}
526 
527 	bh_lock_sock(sk);
528 	/* If too many ICMPs get dropped on busy
529 	 * servers this needs to be solved differently.
530 	 */
531 	if (sock_owned_by_user(sk))
532 		NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
533 
534 	if (sk->sk_state == TCP_CLOSE)
535 		goto out;
536 
537 	tp = tcp_sk(sk);
538 	seq = ntohl(th->seq);
539 	if (sk->sk_state != TCP_LISTEN &&
540 	    !between(seq, tp->snd_una, tp->snd_nxt)) {
541 		NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
542 		goto out;
543 	}
544 
545 	switch (type) {
546 	case ICMP_SOURCE_QUENCH:
547 		/* Just silently ignore these. */
548 		goto out;
549 	case ICMP_PARAMETERPROB:
550 		err = EPROTO;
551 		break;
552 	case ICMP_DEST_UNREACH:
553 		if (code > NR_ICMP_UNREACH)
554 			goto out;
555 
556 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
557 			if (!sock_owned_by_user(sk))
558 				do_pmtu_discovery(sk, iph, info);
559 			goto out;
560 		}
561 
562 		err = icmp_err_convert[code].errno;
563 		break;
564 	case ICMP_TIME_EXCEEDED:
565 		err = EHOSTUNREACH;
566 		break;
567 	default:
568 		goto out;
569 	}
570 
571 	switch (sk->sk_state) {
572 		struct request_sock *req, **prev;
573 	case TCP_LISTEN:
574 		if (sock_owned_by_user(sk))
575 			goto out;
576 
577 		req = inet_csk_search_req(sk, &prev, th->dest,
578 					  iph->daddr, iph->saddr);
579 		if (!req)
580 			goto out;
581 
582 		/* ICMPs are not backlogged, hence we cannot get
583 		   an established socket here.
584 		 */
585 		BUG_TRAP(!req->sk);
586 
587 		if (seq != tcp_rsk(req)->snt_isn) {
588 			NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
589 			goto out;
590 		}
591 
592 		/*
593 		 * Still in SYN_RECV, just remove it silently.
594 		 * There is no good way to pass the error to the newly
595 		 * created socket, and POSIX does not want network
596 		 * errors returned from accept().
597 		 */
598 		inet_csk_reqsk_queue_drop(sk, req, prev);
599 		goto out;
600 
601 	case TCP_SYN_SENT:
602 	case TCP_SYN_RECV:  /* Normally cannot happen.
603 			       It can, for example, if SYNs crossed.
604 			     */
605 		if (!sock_owned_by_user(sk)) {
606 			TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
607 			sk->sk_err = err;
608 
609 			sk->sk_error_report(sk);
610 
611 			tcp_done(sk);
612 		} else {
613 			sk->sk_err_soft = err;
614 		}
615 		goto out;
616 	}
617 
618 	/* If we've already connected we will keep trying
619 	 * until we time out, or the user gives up.
620 	 *
621 	 * rfc1122 4.2.3.9 allows us to treat as hard errors
622 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
623 	 * but it is obsoleted by pmtu discovery).
624 	 *
625 	 * Note that in the modern internet, where routing is unreliable
626 	 * and broken firewalls sit in each dark corner sending random
627 	 * errors ordered by their masters, even these two messages finally
628 	 * lose their original sense (even Linux sends invalid PORT_UNREACHs).
629 	 *
630 	 * Now we are in compliance with RFCs.
631 	 *							--ANK (980905)
632 	 */
633 
634 	inet = inet_sk(sk);
635 	if (!sock_owned_by_user(sk) && inet->recverr) {
636 		sk->sk_err = err;
637 		sk->sk_error_report(sk);
638 	} else	{ /* Only an error on timeout */
639 		sk->sk_err_soft = err;
640 	}
641 
642 out:
643 	bh_unlock_sock(sk);
644 	sock_put(sk);
645 }
646 
647 /* This routine computes an IPv4 TCP checksum. */
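/* With CHECKSUM_HW the device completes the sum, so only the pseudo-header
 * contribution is placed in th->check and skb->csum records the offset of
 * the check field; otherwise the checksum is computed fully in software.
 */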
648 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
649 		       struct sk_buff *skb)
650 {
651 	struct inet_sock *inet = inet_sk(sk);
652 
653 	if (skb->ip_summed == CHECKSUM_HW) {
654 		th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
655 		skb->csum = offsetof(struct tcphdr, check);
656 	} else {
657 		th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
658 					 csum_partial((char *)th,
659 						      th->doff << 2,
660 						      skb->csum));
661 	}
662 }
663 
664 /*
665  *	This routine will send an RST to the other tcp.
666  *
667  *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
668  *		      for the reset?
669  *	Answer: if a packet caused an RST, it is not for a socket
670  *		existing in our system; if it is matched to a socket,
671  *		it is just a duplicate segment or a bug in the other side's TCP.
672  *		So we build the reply based only on the parameters
673  *		that arrived with the segment.
674  *	Exception: precedence violation. We do not implement it in any case.
675  */
676 
677 static void tcp_v4_send_reset(struct sk_buff *skb)
678 {
679 	struct tcphdr *th = skb->h.th;
680 	struct tcphdr rth;
681 	struct ip_reply_arg arg;
682 
683 	/* Never send a reset in response to a reset. */
684 	if (th->rst)
685 		return;
686 
687 	if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
688 		return;
689 
690 	/* Swap the send and the receive. */
691 	memset(&rth, 0, sizeof(struct tcphdr));
692 	rth.dest   = th->source;
693 	rth.source = th->dest;
694 	rth.doff   = sizeof(struct tcphdr) / 4;
695 	rth.rst    = 1;
696 
697 	if (th->ack) {
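	/* RFC 793 reset generation: if the offending segment carried an ACK,
	 * the RST takes its sequence number from that ACK field; otherwise
	 * the RST stays at sequence zero and instead ACKs everything the
	 * segment occupied (SYN and FIN each count for one).
	 */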
698 		rth.seq = th->ack_seq;
699 	} else {
700 		rth.ack = 1;
701 		rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
702 				    skb->len - (th->doff << 2));
703 	}
704 
705 	memset(&arg, 0, sizeof arg);
706 	arg.iov[0].iov_base = (unsigned char *)&rth;
707 	arg.iov[0].iov_len  = sizeof rth;
708 	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
709 				      skb->nh.iph->saddr, /*XXX*/
710 				      sizeof(struct tcphdr), IPPROTO_TCP, 0);
711 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
712 
713 	ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
714 
715 	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
716 	TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
717 }
718 
719 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
720    outside of socket context, is certainly ugly. What can I do?
721  */
722 
723 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
724 			    u32 win, u32 ts)
725 {
726 	struct tcphdr *th = skb->h.th;
727 	struct {
728 		struct tcphdr th;
729 		u32 tsopt[3];
730 	} rep;
731 	struct ip_reply_arg arg;
732 
733 	memset(&rep.th, 0, sizeof(struct tcphdr));
734 	memset(&arg, 0, sizeof arg);
735 
736 	arg.iov[0].iov_base = (unsigned char *)&rep;
737 	arg.iov[0].iov_len  = sizeof(rep.th);
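	/* When a timestamp echo is wanted, append a 12-byte NOP,NOP,TIMESTAMP
	 * option and grow the reply; rep.th.doff is derived from iov_len below.
	 */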
738 	if (ts) {
739 		rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
740 				     (TCPOPT_TIMESTAMP << 8) |
741 				     TCPOLEN_TIMESTAMP);
742 		rep.tsopt[1] = htonl(tcp_time_stamp);
743 		rep.tsopt[2] = htonl(ts);
744 		arg.iov[0].iov_len = sizeof(rep);
745 	}
746 
747 	/* Swap the send and the receive. */
748 	rep.th.dest    = th->source;
749 	rep.th.source  = th->dest;
750 	rep.th.doff    = arg.iov[0].iov_len / 4;
751 	rep.th.seq     = htonl(seq);
752 	rep.th.ack_seq = htonl(ack);
753 	rep.th.ack     = 1;
754 	rep.th.window  = htons(win);
755 
756 	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
757 				      skb->nh.iph->saddr, /*XXX*/
758 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
759 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
760 
761 	ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
762 
763 	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
764 }
765 
766 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
767 {
768 	struct inet_timewait_sock *tw = inet_twsk(sk);
769 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
770 
771 	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
772 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcptw->tw_ts_recent);
773 
774 	inet_twsk_put(tw);
775 }
776 
777 static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
778 {
779 	tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
780 			req->ts_recent);
781 }
782 
783 /*
784  *	Send a SYN-ACK after having received an ACK.
785  *	This still operates on a request_sock only, not on a big
786  *	socket.
787  */
788 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
789 			      struct dst_entry *dst)
790 {
791 	const struct inet_request_sock *ireq = inet_rsk(req);
792 	int err = -1;
793 	struct sk_buff * skb;
794 
795 	/* First, grab a route. */
796 	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
797 		goto out;
798 
799 	skb = tcp_make_synack(sk, dst, req);
800 
801 	if (skb) {
802 		struct tcphdr *th = skb->h.th;
803 
804 		th->check = tcp_v4_check(th, skb->len,
805 					 ireq->loc_addr,
806 					 ireq->rmt_addr,
807 					 csum_partial((char *)th, skb->len,
808 						      skb->csum));
809 
810 		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
811 					    ireq->rmt_addr,
812 					    ireq->opt);
813 		if (err == NET_XMIT_CN)
814 			err = 0;
815 	}
816 
817 out:
818 	dst_release(dst);
819 	return err;
820 }
821 
822 /*
823  *	IPv4 request_sock destructor.
824  */
825 static void tcp_v4_reqsk_destructor(struct request_sock *req)
826 {
827 	if (inet_rsk(req)->opt)
828 		kfree(inet_rsk(req)->opt);
829 }
830 
831 static inline void syn_flood_warning(struct sk_buff *skb)
832 {
833 	static unsigned long warntime;
834 
835 	if (time_after(jiffies, (warntime + HZ * 60))) {
836 		warntime = jiffies;
837 		printk(KERN_INFO
838 		       "possible SYN flooding on port %d. Sending cookies.\n",
839 		       ntohs(skb->h.th->dest));
840 	}
841 }
842 
843 /*
844  * Save and compile IPv4 options into the request_sock if needed.
845  */
846 static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
847 						     struct sk_buff *skb)
848 {
849 	struct ip_options *opt = &(IPCB(skb)->opt);
850 	struct ip_options *dopt = NULL;
851 
852 	if (opt && opt->optlen) {
853 		int opt_size = optlength(opt);
854 		dopt = kmalloc(opt_size, GFP_ATOMIC);
855 		if (dopt) {
856 			if (ip_options_echo(dopt, skb)) {
857 				kfree(dopt);
858 				dopt = NULL;
859 			}
860 		}
861 	}
862 	return dopt;
863 }
864 
865 struct request_sock_ops tcp_request_sock_ops = {
866 	.family		=	PF_INET,
867 	.obj_size	=	sizeof(struct tcp_request_sock),
868 	.rtx_syn_ack	=	tcp_v4_send_synack,
869 	.send_ack	=	tcp_v4_reqsk_send_ack,
870 	.destructor	=	tcp_v4_reqsk_destructor,
871 	.send_reset	=	tcp_v4_send_reset,
872 };
873 
874 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
875 {
876 	struct inet_request_sock *ireq;
877 	struct tcp_options_received tmp_opt;
878 	struct request_sock *req;
879 	__u32 saddr = skb->nh.iph->saddr;
880 	__u32 daddr = skb->nh.iph->daddr;
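	/* A nonzero TCP_SKB_CB(skb)->when means this SYN hit a live TIME-WAIT
	 * bucket and an ISN was already chosen there (see the comment further
	 * down); zero means we pick one below.
	 */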
881 	__u32 isn = TCP_SKB_CB(skb)->when;
882 	struct dst_entry *dst = NULL;
883 #ifdef CONFIG_SYN_COOKIES
884 	int want_cookie = 0;
885 #else
886 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
887 #endif
888 
889 	/* Never answer SYNs sent to broadcast or multicast */
890 	if (((struct rtable *)skb->dst)->rt_flags &
891 	    (RTCF_BROADCAST | RTCF_MULTICAST))
892 		goto drop;
893 
894 	/* TW buckets are converted to open requests without
895 	 * limitation; they conserve resources and the peer is
896 	 * evidently a real one.
897 	 */
898 	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
899 #ifdef CONFIG_SYN_COOKIES
900 		if (sysctl_tcp_syncookies) {
901 			want_cookie = 1;
902 		} else
903 #endif
904 		goto drop;
905 	}
906 
907 	/* The accept backlog is full. If we have already queued enough
908 	 * warm entries in the syn queue, drop the request. That is better
909 	 * than clogging the syn queue with openreqs whose timeout increases
910 	 * exponentially.
911 	 */
912 	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
913 		goto drop;
914 
915 	req = reqsk_alloc(&tcp_request_sock_ops);
916 	if (!req)
917 		goto drop;
918 
919 	tcp_clear_options(&tmp_opt);
920 	tmp_opt.mss_clamp = 536;
921 	tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
922 
923 	tcp_parse_options(skb, &tmp_opt, 0);
924 
925 	if (want_cookie) {
926 		tcp_clear_options(&tmp_opt);
927 		tmp_opt.saw_tstamp = 0;
928 	}
929 
930 	if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
931 		/* Some OSes (unknown ones, but I see them on a web server
932 		 * containing information of interest only to Windows
933 		 * users) do not send their timestamp in the SYN. It is an
934 		 * easy case: we simply do not advertise TS support.
935 		 */
936 		tmp_opt.saw_tstamp = 0;
937 		tmp_opt.tstamp_ok  = 0;
938 	}
939 	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
940 
941 	tcp_openreq_init(req, &tmp_opt, skb);
942 
943 	ireq = inet_rsk(req);
944 	ireq->loc_addr = daddr;
945 	ireq->rmt_addr = saddr;
946 	ireq->opt = tcp_v4_save_options(sk, skb);
947 	if (!want_cookie)
948 		TCP_ECN_create_request(req, skb->h.th);
949 
950 	if (want_cookie) {
951 #ifdef CONFIG_SYN_COOKIES
952 		syn_flood_warning(skb);
953 #endif
954 		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
955 	} else if (!isn) {
956 		struct inet_peer *peer = NULL;
957 
958 		/* VJ's idea. We save last timestamp seen
959 		 * from the destination in peer table, when entering
960 		 * state TIME-WAIT, and check against it before
961 		 * accepting new connection request.
962 		 *
963 		 * If "isn" is not zero, this request hit alive
964 		 * timewait bucket, so that all the necessary checks
965 		 * are made in the function processing timewait state.
966 		 */
967 		if (tmp_opt.saw_tstamp &&
968 		    tcp_death_row.sysctl_tw_recycle &&
969 		    (dst = inet_csk_route_req(sk, req)) != NULL &&
970 		    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
971 		    peer->v4daddr == saddr) {
972 			if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
973 			    (s32)(peer->tcp_ts - req->ts_recent) >
974 							TCP_PAWS_WINDOW) {
975 				NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
976 				dst_release(dst);
977 				goto drop_and_free;
978 			}
979 		}
980 		/* Kill the following clause, if you dislike this way. */
981 		else if (!sysctl_tcp_syncookies &&
982 			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
983 			  (sysctl_max_syn_backlog >> 2)) &&
984 			 (!peer || !peer->tcp_ts_stamp) &&
985 			 (!dst || !dst_metric(dst, RTAX_RTT))) {
986 			/* Without syncookies the last quarter of
987 			 * the backlog is filled only with destinations
988 			 * proven to be alive.
989 			 * It means that we continue to communicate
990 			 * with destinations already remembered at
991 			 * the moment the synflood started.
992 			 */
993 			LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
994 				       "request from %u.%u.%u.%u/%u\n",
995 				       NIPQUAD(saddr),
996 				       ntohs(skb->h.th->source));
997 			dst_release(dst);
998 			goto drop_and_free;
999 		}
1000 
1001 		isn = tcp_v4_init_sequence(sk, skb);
1002 	}
1003 	tcp_rsk(req)->snt_isn = isn;
1004 
1005 	if (tcp_v4_send_synack(sk, req, dst))
1006 		goto drop_and_free;
1007 
1008 	if (want_cookie) {
1009 		reqsk_free(req);
1010 	} else {
1011 		inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1012 	}
1013 	return 0;
1014 
1015 drop_and_free:
1016 	reqsk_free(req);
1017 drop:
1018 	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1019 	return 0;
1020 }
1021 
1022 
1023 /*
1024  * The three-way handshake has completed - we got a valid ACK -
1025  * now create the new socket.
1026  */
1027 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1028 				  struct request_sock *req,
1029 				  struct dst_entry *dst)
1030 {
1031 	struct inet_request_sock *ireq;
1032 	struct inet_sock *newinet;
1033 	struct tcp_sock *newtp;
1034 	struct sock *newsk;
1035 
1036 	if (sk_acceptq_is_full(sk))
1037 		goto exit_overflow;
1038 
1039 	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1040 		goto exit;
1041 
1042 	newsk = tcp_create_openreq_child(sk, req, skb);
1043 	if (!newsk)
1044 		goto exit;
1045 
1046 	sk_setup_caps(newsk, dst);
1047 
1048 	newtp		      = tcp_sk(newsk);
1049 	newinet		      = inet_sk(newsk);
1050 	ireq		      = inet_rsk(req);
1051 	newinet->daddr	      = ireq->rmt_addr;
1052 	newinet->rcv_saddr    = ireq->loc_addr;
1053 	newinet->saddr	      = ireq->loc_addr;
1054 	newinet->opt	      = ireq->opt;
1055 	ireq->opt	      = NULL;
1056 	newinet->mc_index     = inet_iif(skb);
1057 	newinet->mc_ttl	      = skb->nh.iph->ttl;
1058 	newtp->ext_header_len = 0;
1059 	if (newinet->opt)
1060 		newtp->ext_header_len = newinet->opt->optlen;
1061 	newinet->id = newtp->write_seq ^ jiffies;
1062 
1063 	tcp_sync_mss(newsk, dst_mtu(dst));
1064 	newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1065 	tcp_initialize_rcv_mss(newsk);
1066 
1067 	__inet_hash(&tcp_hashinfo, newsk, 0);
1068 	__inet_inherit_port(&tcp_hashinfo, sk, newsk);
1069 
1070 	return newsk;
1071 
1072 exit_overflow:
1073 	NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1074 exit:
1075 	NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1076 	dst_release(dst);
1077 	return NULL;
1078 }
1079 
1080 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1081 {
1082 	struct tcphdr *th = skb->h.th;
1083 	struct iphdr *iph = skb->nh.iph;
1084 	struct sock *nsk;
1085 	struct request_sock **prev;
1086 	/* Find possible connection requests. */
1087 	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1088 						       iph->saddr, iph->daddr);
1089 	if (req)
1090 		return tcp_check_req(sk, skb, req, prev);
1091 
1092 	nsk = __inet_lookup_established(&tcp_hashinfo, skb->nh.iph->saddr,
1093 					th->source, skb->nh.iph->daddr,
1094 					ntohs(th->dest), inet_iif(skb));
1095 
1096 	if (nsk) {
1097 		if (nsk->sk_state != TCP_TIME_WAIT) {
1098 			bh_lock_sock(nsk);
1099 			return nsk;
1100 		}
1101 		inet_twsk_put((struct inet_timewait_sock *)nsk);
1102 		return NULL;
1103 	}
1104 
1105 #ifdef CONFIG_SYN_COOKIES
1106 	if (!th->rst && !th->syn && th->ack)
1107 		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1108 #endif
1109 	return sk;
1110 }
1111 
1112 static int tcp_v4_checksum_init(struct sk_buff *skb)
1113 {
1114 	if (skb->ip_summed == CHECKSUM_HW) {
1115 		skb->ip_summed = CHECKSUM_UNNECESSARY;
1116 		if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1117 				  skb->nh.iph->daddr, skb->csum))
1118 			return 0;
1119 
1120 		LIMIT_NETDEBUG(KERN_DEBUG "hw tcp v4 csum failed\n");
1121 		skb->ip_summed = CHECKSUM_NONE;
1122 	}
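	/* Short segments are verified in full right away; for longer ones
	 * only the pseudo-header sum is primed into skb->csum so the final
	 * check can be deferred until the data is actually consumed.
	 */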
1123 	if (skb->len <= 76) {
1124 		if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1125 				 skb->nh.iph->daddr,
1126 				 skb_checksum(skb, 0, skb->len, 0)))
1127 			return -1;
1128 		skb->ip_summed = CHECKSUM_UNNECESSARY;
1129 	} else {
1130 		skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
1131 					  skb->nh.iph->saddr,
1132 					  skb->nh.iph->daddr, 0);
1133 	}
1134 	return 0;
1135 }
1136 
1137 
1138 /* The socket must have its spinlock held when we get
1139  * here.
1140  *
1141  * We have a potential double-lock case here, so even when
1142  * doing backlog processing we use the BH locking scheme.
1143  * This is because we cannot sleep with the original spinlock
1144  * held.
1145  */
1146 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1147 {
1148 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1149 		TCP_CHECK_TIMER(sk);
1150 		if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1151 			goto reset;
1152 		TCP_CHECK_TIMER(sk);
1153 		return 0;
1154 	}
1155 
1156 	if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1157 		goto csum_err;
1158 
1159 	if (sk->sk_state == TCP_LISTEN) {
1160 		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1161 		if (!nsk)
1162 			goto discard;
1163 
1164 		if (nsk != sk) {
1165 			if (tcp_child_process(sk, nsk, skb))
1166 				goto reset;
1167 			return 0;
1168 		}
1169 	}
1170 
1171 	TCP_CHECK_TIMER(sk);
1172 	if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1173 		goto reset;
1174 	TCP_CHECK_TIMER(sk);
1175 	return 0;
1176 
1177 reset:
1178 	tcp_v4_send_reset(skb);
1179 discard:
1180 	kfree_skb(skb);
1181 	/* Be careful here. If this function gets more complicated and
1182 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1183 	 * might be destroyed here. This current version compiles correctly,
1184 	 * but you have been warned.
1185 	 */
1186 	return 0;
1187 
1188 csum_err:
1189 	TCP_INC_STATS_BH(TCP_MIB_INERRS);
1190 	goto discard;
1191 }
1192 
1193 /*
1194  *	From tcp_input.c
1195  */
1196 
1197 int tcp_v4_rcv(struct sk_buff *skb)
1198 {
1199 	struct tcphdr *th;
1200 	struct sock *sk;
1201 	int ret;
1202 
1203 	if (skb->pkt_type != PACKET_HOST)
1204 		goto discard_it;
1205 
1206 	/* Count it even if it's bad */
1207 	TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1208 
1209 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1210 		goto discard_it;
1211 
1212 	th = skb->h.th;
1213 
1214 	if (th->doff < sizeof(struct tcphdr) / 4)
1215 		goto bad_packet;
1216 	if (!pskb_may_pull(skb, th->doff * 4))
1217 		goto discard_it;
1218 
1219 	/* An explanation is required here, I think.
1220 	 * Packet length and doff are validated by header prediction,
1221 	 * provided the case of th->doff == 0 is eliminated.
1222 	 * So, we defer the checks. */
1223 	if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1224 	     tcp_v4_checksum_init(skb) < 0))
1225 		goto bad_packet;
1226 
1227 	th = skb->h.th;
1228 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1229 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1230 				    skb->len - th->doff * 4);
1231 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1232 	TCP_SKB_CB(skb)->when	 = 0;
1233 	TCP_SKB_CB(skb)->flags	 = skb->nh.iph->tos;
1234 	TCP_SKB_CB(skb)->sacked	 = 0;
1235 
1236 	sk = __inet_lookup(&tcp_hashinfo, skb->nh.iph->saddr, th->source,
1237 			   skb->nh.iph->daddr, ntohs(th->dest),
1238 			   inet_iif(skb));
1239 
1240 	if (!sk)
1241 		goto no_tcp_socket;
1242 
1243 process:
1244 	if (sk->sk_state == TCP_TIME_WAIT)
1245 		goto do_time_wait;
1246 
1247 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1248 		goto discard_and_relse;
1249 
1250 	if (sk_filter(sk, skb, 0))
1251 		goto discard_and_relse;
1252 
1253 	skb->dev = NULL;
1254 
1255 	bh_lock_sock(sk);
1256 	ret = 0;
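	/* Three delivery paths: process the segment right here in softirq
	 * context, queue it on the prequeue for a sleeping reader, or, if a
	 * user context currently owns the socket, park it on the backlog to
	 * be drained when the lock is released.
	 */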
1257 	if (!sock_owned_by_user(sk)) {
1258 		if (!tcp_prequeue(sk, skb))
1259 			ret = tcp_v4_do_rcv(sk, skb);
1260 	} else
1261 		sk_add_backlog(sk, skb);
1262 	bh_unlock_sock(sk);
1263 
1264 	sock_put(sk);
1265 
1266 	return ret;
1267 
1268 no_tcp_socket:
1269 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1270 		goto discard_it;
1271 
1272 	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1273 bad_packet:
1274 		TCP_INC_STATS_BH(TCP_MIB_INERRS);
1275 	} else {
1276 		tcp_v4_send_reset(skb);
1277 	}
1278 
1279 discard_it:
1280 	/* Discard frame. */
1281 	kfree_skb(skb);
1282 	return 0;
1283 
1284 discard_and_relse:
1285 	sock_put(sk);
1286 	goto discard_it;
1287 
1288 do_time_wait:
1289 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1290 		inet_twsk_put((struct inet_timewait_sock *) sk);
1291 		goto discard_it;
1292 	}
1293 
1294 	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1295 		TCP_INC_STATS_BH(TCP_MIB_INERRS);
1296 		inet_twsk_put((struct inet_timewait_sock *) sk);
1297 		goto discard_it;
1298 	}
1299 	switch (tcp_timewait_state_process((struct inet_timewait_sock *)sk,
1300 					   skb, th)) {
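	/* TCP_TW_SYN: the TIME-WAIT code accepted a fresh SYN for this
	 * four-tuple; hand it to a matching listener (if one exists) and
	 * retire the old TIME-WAIT socket, otherwise fall through and ACK.
	 */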
1301 	case TCP_TW_SYN: {
1302 		struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo,
1303 							skb->nh.iph->daddr,
1304 							ntohs(th->dest),
1305 							inet_iif(skb));
1306 		if (sk2) {
1307 			inet_twsk_deschedule((struct inet_timewait_sock *)sk,
1308 					     &tcp_death_row);
1309 			inet_twsk_put((struct inet_timewait_sock *)sk);
1310 			sk = sk2;
1311 			goto process;
1312 		}
1313 		/* Fall through to ACK */
1314 	}
1315 	case TCP_TW_ACK:
1316 		tcp_v4_timewait_ack(sk, skb);
1317 		break;
1318 	case TCP_TW_RST:
1319 		goto no_tcp_socket;
1320 	case TCP_TW_SUCCESS:;
1321 	}
1322 	goto discard_it;
1323 }
1324 
1325 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1326 {
1327 	struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1328 	struct inet_sock *inet = inet_sk(sk);
1329 
1330 	sin->sin_family		= AF_INET;
1331 	sin->sin_addr.s_addr	= inet->daddr;
1332 	sin->sin_port		= inet->dport;
1333 }
1334 
1335 /* VJ's idea. Save the last timestamp seen from this destination
1336  * and hold it at least for the normal timewait interval, to use for duplicate
1337  * segment detection in subsequent connections before they enter the
1338  * synchronized state.
1339  */
1340 
1341 int tcp_v4_remember_stamp(struct sock *sk)
1342 {
1343 	struct inet_sock *inet = inet_sk(sk);
1344 	struct tcp_sock *tp = tcp_sk(sk);
1345 	struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1346 	struct inet_peer *peer = NULL;
1347 	int release_it = 0;
1348 
1349 	if (!rt || rt->rt_dst != inet->daddr) {
1350 		peer = inet_getpeer(inet->daddr, 1);
1351 		release_it = 1;
1352 	} else {
1353 		if (!rt->peer)
1354 			rt_bind_peer(rt, 1);
1355 		peer = rt->peer;
1356 	}
1357 
1358 	if (peer) {
1359 		if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1360 		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1361 		     peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1362 			peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1363 			peer->tcp_ts = tp->rx_opt.ts_recent;
1364 		}
1365 		if (release_it)
1366 			inet_putpeer(peer);
1367 		return 1;
1368 	}
1369 
1370 	return 0;
1371 }
1372 
1373 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1374 {
1375 	struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1376 
1377 	if (peer) {
1378 		const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1379 
1380 		if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1381 		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1382 		     peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1383 			peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1384 			peer->tcp_ts	   = tcptw->tw_ts_recent;
1385 		}
1386 		inet_putpeer(peer);
1387 		return 1;
1388 	}
1389 
1390 	return 0;
1391 }
1392 
1393 struct tcp_func ipv4_specific = {
1394 	.queue_xmit	=	ip_queue_xmit,
1395 	.send_check	=	tcp_v4_send_check,
1396 	.rebuild_header	=	inet_sk_rebuild_header,
1397 	.conn_request	=	tcp_v4_conn_request,
1398 	.syn_recv_sock	=	tcp_v4_syn_recv_sock,
1399 	.remember_stamp	=	tcp_v4_remember_stamp,
1400 	.net_header_len	=	sizeof(struct iphdr),
1401 	.setsockopt	=	ip_setsockopt,
1402 	.getsockopt	=	ip_getsockopt,
1403 	.addr2sockaddr	=	v4_addr2sockaddr,
1404 	.sockaddr_len	=	sizeof(struct sockaddr_in),
1405 };
1406 
1407 /* NOTE: A lot of things are set to zero explicitly by the call to
1408  *       sk_alloc(), so they need not be done here.
1409  */
1410 static int tcp_v4_init_sock(struct sock *sk)
1411 {
1412 	struct inet_connection_sock *icsk = inet_csk(sk);
1413 	struct tcp_sock *tp = tcp_sk(sk);
1414 
1415 	skb_queue_head_init(&tp->out_of_order_queue);
1416 	tcp_init_xmit_timers(sk);
1417 	tcp_prequeue_init(tp);
1418 
1419 	icsk->icsk_rto = TCP_TIMEOUT_INIT;
1420 	tp->mdev = TCP_TIMEOUT_INIT;
1421 
1422 	/* So many TCP implementations out there (incorrectly) count the
1423 	 * initial SYN frame in their delayed-ACK and congestion control
1424 	 * algorithms that we must have the following bandaid to talk
1425 	 * efficiently to them.  -DaveM
1426 	 */
1427 	tp->snd_cwnd = 2;
1428 
1429 	/* See draft-stevens-tcpca-spec-01 for discussion of the
1430 	 * initialization of these values.
1431 	 */
1432 	tp->snd_ssthresh = 0x7fffffff;	/* Infinity */
1433 	tp->snd_cwnd_clamp = ~0;
1434 	tp->mss_cache = 536;
1435 
1436 	tp->reordering = sysctl_tcp_reordering;
1437 	icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1438 
1439 	sk->sk_state = TCP_CLOSE;
1440 
1441 	sk->sk_write_space = sk_stream_write_space;
1442 	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1443 
1444 	tp->af_specific = &ipv4_specific;
1445 
1446 	sk->sk_sndbuf = sysctl_tcp_wmem[1];
1447 	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1448 
1449 	atomic_inc(&tcp_sockets_allocated);
1450 
1451 	return 0;
1452 }
1453 
1454 int tcp_v4_destroy_sock(struct sock *sk)
1455 {
1456 	struct tcp_sock *tp = tcp_sk(sk);
1457 
1458 	tcp_clear_xmit_timers(sk);
1459 
1460 	tcp_cleanup_congestion_control(sk);
1461 
1462 	/* Clean up the write buffer. */
1463   	sk_stream_writequeue_purge(sk);
1464 
1465 	/* Cleans up our, hopefully empty, out_of_order_queue. */
1466   	__skb_queue_purge(&tp->out_of_order_queue);
1467 
1468 	/* Clean the prequeue; it really must be empty. */
1469 	__skb_queue_purge(&tp->ucopy.prequeue);
1470 
1471 	/* Clean up a referenced TCP bind bucket. */
1472 	if (inet_csk(sk)->icsk_bind_hash)
1473 		inet_put_port(&tcp_hashinfo, sk);
1474 
1475 	/*
1476 	 * If sendmsg cached page exists, toss it.
1477 	 */
1478 	if (sk->sk_sndmsg_page) {
1479 		__free_page(sk->sk_sndmsg_page);
1480 		sk->sk_sndmsg_page = NULL;
1481 	}
1482 
1483 	atomic_dec(&tcp_sockets_allocated);
1484 
1485 	return 0;
1486 }
1487 
1488 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1489 
1490 #ifdef CONFIG_PROC_FS
1491 /* Proc filesystem TCP sock list dumping. */
1492 
1493 static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
1494 {
1495 	return hlist_empty(head) ? NULL :
1496 		list_entry(head->first, struct inet_timewait_sock, tw_node);
1497 }
1498 
1499 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1500 {
1501 	return tw->tw_node.next ?
1502 		hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1503 }
1504 
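/*
 * Walk the listening hash for /proc.  The iterator alternates between
 * TCP_SEQ_STATE_LISTENING (the listen sockets themselves) and
 * TCP_SEQ_STATE_OPENREQ (the SYN queue of the current listener), with
 * st->syn_wait_sk and st->sbucket recording the resume point.
 */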
1505 static void *listening_get_next(struct seq_file *seq, void *cur)
1506 {
1507 	struct inet_connection_sock *icsk;
1508 	struct hlist_node *node;
1509 	struct sock *sk = cur;
1510 	struct tcp_iter_state* st = seq->private;
1511 
1512 	if (!sk) {
1513 		st->bucket = 0;
1514 		sk = sk_head(&tcp_hashinfo.listening_hash[0]);
1515 		goto get_sk;
1516 	}
1517 
1518 	++st->num;
1519 
1520 	if (st->state == TCP_SEQ_STATE_OPENREQ) {
1521 		struct request_sock *req = cur;
1522 
1523 	       	icsk = inet_csk(st->syn_wait_sk);
1524 		req = req->dl_next;
1525 		while (1) {
1526 			while (req) {
1527 				if (req->rsk_ops->family == st->family) {
1528 					cur = req;
1529 					goto out;
1530 				}
1531 				req = req->dl_next;
1532 			}
1533 			if (++st->sbucket >= TCP_SYNQ_HSIZE)
1534 				break;
1535 get_req:
1536 			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1537 		}
1538 		sk	  = sk_next(st->syn_wait_sk);
1539 		st->state = TCP_SEQ_STATE_LISTENING;
1540 		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1541 	} else {
1542 	       	icsk = inet_csk(sk);
1543 		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1544 		if (reqsk_queue_len(&icsk->icsk_accept_queue))
1545 			goto start_req;
1546 		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1547 		sk = sk_next(sk);
1548 	}
1549 get_sk:
1550 	sk_for_each_from(sk, node) {
1551 		if (sk->sk_family == st->family) {
1552 			cur = sk;
1553 			goto out;
1554 		}
1555 	       	icsk = inet_csk(sk);
1556 		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1557 		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
1558 start_req:
1559 			st->uid		= sock_i_uid(sk);
1560 			st->syn_wait_sk = sk;
1561 			st->state	= TCP_SEQ_STATE_OPENREQ;
1562 			st->sbucket	= 0;
1563 			goto get_req;
1564 		}
1565 		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1566 	}
1567 	if (++st->bucket < INET_LHTABLE_SIZE) {
1568 		sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
1569 		goto get_sk;
1570 	}
1571 	cur = NULL;
1572 out:
1573 	return cur;
1574 }
1575 
1576 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1577 {
1578 	void *rc = listening_get_next(seq, NULL);
1579 
1580 	while (rc && *pos) {
1581 		rc = listening_get_next(seq, rc);
1582 		--*pos;
1583 	}
1584 	return rc;
1585 }
1586 
1587 static void *established_get_first(struct seq_file *seq)
1588 {
1589 	struct tcp_iter_state* st = seq->private;
1590 	void *rc = NULL;
1591 
1592 	for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
1593 		struct sock *sk;
1594 		struct hlist_node *node;
1595 		struct inet_timewait_sock *tw;
1596 
1597 		/* We can reschedule _before_ having picked the target: */
1598 		cond_resched_softirq();
1599 
1600 		read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
1601 		sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1602 			if (sk->sk_family != st->family) {
1603 				continue;
1604 			}
1605 			rc = sk;
1606 			goto out;
1607 		}
1608 		st->state = TCP_SEQ_STATE_TIME_WAIT;
1609 		inet_twsk_for_each(tw, node,
1610 				   &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) {
1611 			if (tw->tw_family != st->family) {
1612 				continue;
1613 			}
1614 			rc = tw;
1615 			goto out;
1616 		}
1617 		read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1618 		st->state = TCP_SEQ_STATE_ESTABLISHED;
1619 	}
1620 out:
1621 	return rc;
1622 }
1623 
1624 static void *established_get_next(struct seq_file *seq, void *cur)
1625 {
1626 	struct sock *sk = cur;
1627 	struct inet_timewait_sock *tw;
1628 	struct hlist_node *node;
1629 	struct tcp_iter_state* st = seq->private;
1630 
1631 	++st->num;
1632 
1633 	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
1634 		tw = cur;
1635 		tw = tw_next(tw);
1636 get_tw:
1637 		while (tw && tw->tw_family != st->family) {
1638 			tw = tw_next(tw);
1639 		}
1640 		if (tw) {
1641 			cur = tw;
1642 			goto out;
1643 		}
1644 		read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1645 		st->state = TCP_SEQ_STATE_ESTABLISHED;
1646 
1647 		/* We can reschedule between buckets: */
1648 		cond_resched_softirq();
1649 
1650 		if (++st->bucket < tcp_hashinfo.ehash_size) {
1651 			read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
1652 			sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
1653 		} else {
1654 			cur = NULL;
1655 			goto out;
1656 		}
1657 	} else
1658 		sk = sk_next(sk);
1659 
1660 	sk_for_each_from(sk, node) {
1661 		if (sk->sk_family == st->family)
1662 			goto found;
1663 	}
1664 
1665 	st->state = TCP_SEQ_STATE_TIME_WAIT;
1666 	tw = tw_head(&tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain);
1667 	goto get_tw;
1668 found:
1669 	cur = sk;
1670 out:
1671 	return cur;
1672 }
1673 
1674 static void *established_get_idx(struct seq_file *seq, loff_t pos)
1675 {
1676 	void *rc = established_get_first(seq);
1677 
1678 	while (rc && pos) {
1679 		rc = established_get_next(seq, rc);
1680 		--pos;
1681 	}
1682 	return rc;
1683 }
1684 
1685 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
1686 {
1687 	void *rc;
1688 	struct tcp_iter_state* st = seq->private;
1689 
1690 	inet_listen_lock(&tcp_hashinfo);
1691 	st->state = TCP_SEQ_STATE_LISTENING;
1692 	rc	  = listening_get_idx(seq, &pos);
1693 
1694 	if (!rc) {
1695 		inet_listen_unlock(&tcp_hashinfo);
1696 		local_bh_disable();
1697 		st->state = TCP_SEQ_STATE_ESTABLISHED;
1698 		rc	  = established_get_idx(seq, pos);
1699 	}
1700 
1701 	return rc;
1702 }
1703 
1704 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
1705 {
1706 	struct tcp_iter_state* st = seq->private;
1707 	st->state = TCP_SEQ_STATE_LISTENING;
1708 	st->num = 0;
1709 	return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
1710 }
1711 
1712 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1713 {
1714 	void *rc = NULL;
1715 	struct tcp_iter_state* st;
1716 
1717 	if (v == SEQ_START_TOKEN) {
1718 		rc = tcp_get_idx(seq, 0);
1719 		goto out;
1720 	}
1721 	st = seq->private;
1722 
1723 	switch (st->state) {
1724 	case TCP_SEQ_STATE_OPENREQ:
1725 	case TCP_SEQ_STATE_LISTENING:
1726 		rc = listening_get_next(seq, v);
1727 		if (!rc) {
1728 			inet_listen_unlock(&tcp_hashinfo);
1729 			local_bh_disable();
1730 			st->state = TCP_SEQ_STATE_ESTABLISHED;
1731 			rc	  = established_get_first(seq);
1732 		}
1733 		break;
1734 	case TCP_SEQ_STATE_ESTABLISHED:
1735 	case TCP_SEQ_STATE_TIME_WAIT:
1736 		rc = established_get_next(seq, v);
1737 		break;
1738 	}
1739 out:
1740 	++*pos;
1741 	return rc;
1742 }
1743 
1744 static void tcp_seq_stop(struct seq_file *seq, void *v)
1745 {
1746 	struct tcp_iter_state* st = seq->private;
1747 
1748 	switch (st->state) {
1749 	case TCP_SEQ_STATE_OPENREQ:
1750 		if (v) {
1751 			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
1752 			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1753 		}
1754 	case TCP_SEQ_STATE_LISTENING:
1755 		if (v != SEQ_START_TOKEN)
1756 			inet_listen_unlock(&tcp_hashinfo);
1757 		break;
1758 	case TCP_SEQ_STATE_TIME_WAIT:
1759 	case TCP_SEQ_STATE_ESTABLISHED:
1760 		if (v)
1761 			read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1762 		local_bh_enable();
1763 		break;
1764 	}
1765 }
1766 
1767 static int tcp_seq_open(struct inode *inode, struct file *file)
1768 {
1769 	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
1770 	struct seq_file *seq;
1771 	struct tcp_iter_state *s;
1772 	int rc;
1773 
1774 	if (unlikely(afinfo == NULL))
1775 		return -EINVAL;
1776 
1777 	s = kmalloc(sizeof(*s), GFP_KERNEL);
1778 	if (!s)
1779 		return -ENOMEM;
1780 	memset(s, 0, sizeof(*s));
1781 	s->family		= afinfo->family;
1782 	s->seq_ops.start	= tcp_seq_start;
1783 	s->seq_ops.next		= tcp_seq_next;
1784 	s->seq_ops.show		= afinfo->seq_show;
1785 	s->seq_ops.stop		= tcp_seq_stop;
1786 
1787 	rc = seq_open(file, &s->seq_ops);
1788 	if (rc)
1789 		goto out_kfree;
1790 	seq	     = file->private_data;
1791 	seq->private = s;
1792 out:
1793 	return rc;
1794 out_kfree:
1795 	kfree(s);
1796 	goto out;
1797 }
1798 
1799 int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
1800 {
1801 	int rc = 0;
1802 	struct proc_dir_entry *p;
1803 
1804 	if (!afinfo)
1805 		return -EINVAL;
1806 	afinfo->seq_fops->owner		= afinfo->owner;
1807 	afinfo->seq_fops->open		= tcp_seq_open;
1808 	afinfo->seq_fops->read		= seq_read;
1809 	afinfo->seq_fops->llseek	= seq_lseek;
1810 	afinfo->seq_fops->release	= seq_release_private;
1811 
1812 	p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
1813 	if (p)
1814 		p->data = afinfo;
1815 	else
1816 		rc = -ENOMEM;
1817 	return rc;
1818 }
1819 
1820 void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
1821 {
1822 	if (!afinfo)
1823 		return;
1824 	proc_net_remove(afinfo->name);
1825 	memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
1826 }
1827 
1828 static void get_openreq4(struct sock *sk, struct request_sock *req,
1829 			 char *tmpbuf, int i, int uid)
1830 {
1831 	const struct inet_request_sock *ireq = inet_rsk(req);
1832 	int ttd = req->expires - jiffies;
1833 
1834 	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
1835 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
1836 		i,
1837 		ireq->loc_addr,
1838 		ntohs(inet_sk(sk)->sport),
1839 		ireq->rmt_addr,
1840 		ntohs(ireq->rmt_port),
1841 		TCP_SYN_RECV,
1842 		0, 0, /* could print option size, but that is af dependent. */
1843 		1,    /* timers active (only the expire timer) */
1844 		jiffies_to_clock_t(ttd),
1845 		req->retrans,
1846 		uid,
1847 		0,  /* non standard timer */
1848 		0, /* open_requests have no inode */
1849 		atomic_read(&sk->sk_refcnt),
1850 		req);
1851 }
1852 
1853 static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
1854 {
1855 	int timer_active;
1856 	unsigned long timer_expires;
1857 	struct tcp_sock *tp = tcp_sk(sp);
1858 	const struct inet_connection_sock *icsk = inet_csk(sp);
1859 	struct inet_sock *inet = inet_sk(sp);
1860 	unsigned int dest = inet->daddr;
1861 	unsigned int src = inet->rcv_saddr;
1862 	__u16 destp = ntohs(inet->dport);
1863 	__u16 srcp = ntohs(inet->sport);
1864 
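	/* The "tr" column of /proc/net/tcp: 1 retransmit timer, 4 zero-window
	 * probe, 2 sk_timer (e.g. keepalive), 0 none; TIME-WAIT entries show
	 * 3 from get_timewait4_sock().
	 */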
1865 	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
1866 		timer_active	= 1;
1867 		timer_expires	= icsk->icsk_timeout;
1868 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
1869 		timer_active	= 4;
1870 		timer_expires	= icsk->icsk_timeout;
1871 	} else if (timer_pending(&sp->sk_timer)) {
1872 		timer_active	= 2;
1873 		timer_expires	= sp->sk_timer.expires;
1874 	} else {
1875 		timer_active	= 0;
1876 		timer_expires = jiffies;
1877 	}
1878 
1879 	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
1880 			"%08X %5d %8d %lu %d %p %u %u %u %u %d",
1881 		i, src, srcp, dest, destp, sp->sk_state,
1882 		tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
1883 		timer_active,
1884 		jiffies_to_clock_t(timer_expires - jiffies),
1885 		icsk->icsk_retransmits,
1886 		sock_i_uid(sp),
1887 		icsk->icsk_probes_out,
1888 		sock_i_ino(sp),
1889 		atomic_read(&sp->sk_refcnt), sp,
1890 		icsk->icsk_rto,
1891 		icsk->icsk_ack.ato,
1892 		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
1893 		tp->snd_cwnd,
1894 		tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
1895 }
1896 
1897 static void get_timewait4_sock(struct inet_timewait_sock *tw, char *tmpbuf, int i)
1898 {
1899 	unsigned int dest, src;
1900 	__u16 destp, srcp;
1901 	int ttd = tw->tw_ttd - jiffies;
1902 
1903 	if (ttd < 0)
1904 		ttd = 0;
1905 
1906 	dest  = tw->tw_daddr;
1907 	src   = tw->tw_rcv_saddr;
1908 	destp = ntohs(tw->tw_dport);
1909 	srcp  = ntohs(tw->tw_sport);
1910 
1911 	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
1912 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
1913 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
1914 		3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
1915 		atomic_read(&tw->tw_refcnt), tw);
1916 }
1917 
1918 #define TMPSZ 150
1919 
1920 static int tcp4_seq_show(struct seq_file *seq, void *v)
1921 {
1922 	struct tcp_iter_state* st;
1923 	char tmpbuf[TMPSZ + 1];
1924 
1925 	if (v == SEQ_START_TOKEN) {
1926 		seq_printf(seq, "%-*s\n", TMPSZ - 1,
1927 			   "  sl  local_address rem_address   st tx_queue "
1928 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
1929 			   "inode");
1930 		goto out;
1931 	}
1932 	st = seq->private;
1933 
1934 	switch (st->state) {
1935 	case TCP_SEQ_STATE_LISTENING:
1936 	case TCP_SEQ_STATE_ESTABLISHED:
1937 		get_tcp4_sock(v, tmpbuf, st->num);
1938 		break;
1939 	case TCP_SEQ_STATE_OPENREQ:
1940 		get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
1941 		break;
1942 	case TCP_SEQ_STATE_TIME_WAIT:
1943 		get_timewait4_sock(v, tmpbuf, st->num);
1944 		break;
1945 	}
1946 	seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
1947 out:
1948 	return 0;
1949 }
1950 
1951 static struct file_operations tcp4_seq_fops;
1952 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
1953 	.owner		= THIS_MODULE,
1954 	.name		= "tcp",
1955 	.family		= AF_INET,
1956 	.seq_show	= tcp4_seq_show,
1957 	.seq_fops	= &tcp4_seq_fops,
1958 };
1959 
1960 int __init tcp4_proc_init(void)
1961 {
1962 	return tcp_proc_register(&tcp4_seq_afinfo);
1963 }
1964 
1965 void tcp4_proc_exit(void)
1966 {
1967 	tcp_proc_unregister(&tcp4_seq_afinfo);
1968 }
1969 #endif /* CONFIG_PROC_FS */
1970 
1971 struct proto tcp_prot = {
1972 	.name			= "TCP",
1973 	.owner			= THIS_MODULE,
1974 	.close			= tcp_close,
1975 	.connect		= tcp_v4_connect,
1976 	.disconnect		= tcp_disconnect,
1977 	.accept			= inet_csk_accept,
1978 	.ioctl			= tcp_ioctl,
1979 	.init			= tcp_v4_init_sock,
1980 	.destroy		= tcp_v4_destroy_sock,
1981 	.shutdown		= tcp_shutdown,
1982 	.setsockopt		= tcp_setsockopt,
1983 	.getsockopt		= tcp_getsockopt,
1984 	.sendmsg		= tcp_sendmsg,
1985 	.recvmsg		= tcp_recvmsg,
1986 	.backlog_rcv		= tcp_v4_do_rcv,
1987 	.hash			= tcp_v4_hash,
1988 	.unhash			= tcp_unhash,
1989 	.get_port		= tcp_v4_get_port,
1990 	.enter_memory_pressure	= tcp_enter_memory_pressure,
1991 	.sockets_allocated	= &tcp_sockets_allocated,
1992 	.orphan_count		= &tcp_orphan_count,
1993 	.memory_allocated	= &tcp_memory_allocated,
1994 	.memory_pressure	= &tcp_memory_pressure,
1995 	.sysctl_mem		= sysctl_tcp_mem,
1996 	.sysctl_wmem		= sysctl_tcp_wmem,
1997 	.sysctl_rmem		= sysctl_tcp_rmem,
1998 	.max_header		= MAX_TCP_HEADER,
1999 	.obj_size		= sizeof(struct tcp_sock),
2000 	.twsk_obj_size		= sizeof(struct tcp_timewait_sock),
2001 	.rsk_prot		= &tcp_request_sock_ops,
2002 };
2003 
2004 
2005 
2006 void __init tcp_v4_init(struct net_proto_family *ops)
2007 {
2008 	int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
2009 	if (err < 0)
2010 		panic("Failed to create the TCP control socket.\n");
2011 	tcp_socket->sk->sk_allocation   = GFP_ATOMIC;
2012 	inet_sk(tcp_socket->sk)->uc_ttl = -1;
2013 
2014 	/* Unhash it so that IP input processing does not even
2015 	 * see it; we do not wish this socket to see incoming
2016 	 * packets.
2017 	 */
2018 	tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
2019 }
2020 
2021 EXPORT_SYMBOL(ipv4_specific);
2022 EXPORT_SYMBOL(inet_bind_bucket_create);
2023 EXPORT_SYMBOL(tcp_hashinfo);
2024 EXPORT_SYMBOL(tcp_prot);
2025 EXPORT_SYMBOL(tcp_unhash);
2026 EXPORT_SYMBOL(tcp_v4_conn_request);
2027 EXPORT_SYMBOL(tcp_v4_connect);
2028 EXPORT_SYMBOL(tcp_v4_do_rcv);
2029 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2030 EXPORT_SYMBOL(tcp_v4_send_check);
2031 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2032 
2033 #ifdef CONFIG_PROC_FS
2034 EXPORT_SYMBOL(tcp_proc_register);
2035 EXPORT_SYMBOL(tcp_proc_unregister);
2036 #endif
2037 EXPORT_SYMBOL(sysctl_local_port_range);
2038 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2039 EXPORT_SYMBOL(sysctl_tcp_tw_reuse);
2040 
2041