xref: /linux/net/ipv4/tcp_ipv4.c (revision 9ce7677cfd7cd871adb457c80bea3b581b839641)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:	$Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
9  *
10  *		IPv4 specific functions
11  *
12  *
13  *		code split from:
14  *		linux/ipv4/tcp.c
15  *		linux/ipv4/tcp_input.c
16  *		linux/ipv4/tcp_output.c
17  *
18  *		See tcp.c for author information
19  *
20  *	This program is free software; you can redistribute it and/or
21  *      modify it under the terms of the GNU General Public License
22  *      as published by the Free Software Foundation; either version
23  *      2 of the License, or (at your option) any later version.
24  */
25 
26 /*
27  * Changes:
28  *		David S. Miller	:	New socket lookup architecture.
29  *					This code is dedicated to John Dyson.
30  *		David S. Miller :	Change semantics of established hash,
31  *					half is devoted to TIME_WAIT sockets
32  *					and the rest go in the other half.
33  *		Andi Kleen :		Add support for syncookies and fixed
34  *					some bugs: ip options weren't passed to
35  *					the TCP layer, missed a check for an
36  *					ACK bit.
37  *		Andi Kleen :		Implemented fast path mtu discovery.
38  *	     				Fixed many serious bugs in the
39  *					request_sock handling and moved
40  *					most of it into the af independent code.
41  *					Added tail drop and some other bugfixes.
42  *					Added new listen semantics.
43  *		Mike McLagan	:	Routing by source
44  *	Juan Jose Ciarlante:		ip_dynaddr bits
45  *		Andi Kleen:		various fixes.
46  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
47  *					coma.
48  *	Andi Kleen		:	Fix new listen.
49  *	Andi Kleen		:	Fix accept error reporting.
50  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
51  *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
52  *					a single port at the same time.
53  */
54 
55 #include <linux/config.h>
56 
57 #include <linux/types.h>
58 #include <linux/fcntl.h>
59 #include <linux/module.h>
60 #include <linux/random.h>
61 #include <linux/cache.h>
62 #include <linux/jhash.h>
63 #include <linux/init.h>
64 #include <linux/times.h>
65 
66 #include <net/icmp.h>
67 #include <net/inet_hashtables.h>
68 #include <net/tcp.h>
69 #include <net/transp_v6.h>
70 #include <net/ipv6.h>
71 #include <net/inet_common.h>
72 #include <net/xfrm.h>
73 
74 #include <linux/inet.h>
75 #include <linux/ipv6.h>
76 #include <linux/stddef.h>
77 #include <linux/proc_fs.h>
78 #include <linux/seq_file.h>
79 
80 int sysctl_tcp_tw_reuse;
81 int sysctl_tcp_low_latency;
82 
83 /* Check TCP sequence numbers in ICMP packets. */
84 #define ICMP_MIN_LENGTH 8
85 
86 /* Socket used for sending RSTs */
87 static struct socket *tcp_socket;
88 
89 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
90 		       struct sk_buff *skb);
91 
92 struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
93 	.lhash_lock	= RW_LOCK_UNLOCKED,
94 	.lhash_users	= ATOMIC_INIT(0),
95 	.lhash_wait	= __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
96 };
97 
98 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
99 {
100 	return inet_csk_get_port(&tcp_hashinfo, sk, snum);
101 }
102 
103 static void tcp_v4_hash(struct sock *sk)
104 {
105 	inet_hash(&tcp_hashinfo, sk);
106 }
107 
108 void tcp_unhash(struct sock *sk)
109 {
110 	inet_unhash(&tcp_hashinfo, sk);
111 }
112 
113 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
114 {
115 	return secure_tcp_sequence_number(skb->nh.iph->daddr,
116 					  skb->nh.iph->saddr,
117 					  skb->h.th->dest,
118 					  skb->h.th->source);
119 }
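/* A rough sketch of what secure_tcp_sequence_number() provides here (its
 * implementation lives outside this file): the initial sequence number is
 * essentially a secret-keyed hash of the connection 4-tuple plus a slowly
 * advancing clock component, so different connections get unrelated,
 * hard-to-guess sequence spaces while repeats of the same 4-tuple still
 * move forward over time.
 */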
120 
121 /* called with local bh disabled */
122 static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
123 				      struct inet_timewait_sock **twp)
124 {
125 	struct inet_sock *inet = inet_sk(sk);
126 	u32 daddr = inet->rcv_saddr;
127 	u32 saddr = inet->daddr;
128 	int dif = sk->sk_bound_dev_if;
129 	INET_ADDR_COOKIE(acookie, saddr, daddr)
130 	const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
131 	unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport);
132 	struct inet_ehash_bucket *head = inet_ehash_bucket(&tcp_hashinfo, hash);
133 	struct sock *sk2;
134 	const struct hlist_node *node;
135 	struct inet_timewait_sock *tw;
136 
137 	prefetch(head->chain.first);
138 	write_lock(&head->lock);
139 
140 	/* Check TIME-WAIT sockets first. */
141 	sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) {
142 		tw = inet_twsk(sk2);
143 
144 		if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) {
145 			const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2);
146 			struct tcp_sock *tp = tcp_sk(sk);
147 
148 			/* With PAWS, it is safe from the viewpoint
149 			   of data integrity. Even without PAWS it
150 			   is safe provided sequence spaces do not
151 			   overlap, i.e. at data rates <= 80 Mbit/sec.
152 
153 			   Actually, the idea is close to VJ's; only the
154 			   timestamp cache is held not per host but
155 			   per port pair, and the TW bucket is used
156 			   as the state holder.
157 
158 			   If the TW bucket has already been destroyed we
159 			   fall back to VJ's scheme and use the initial
160 			   timestamp retrieved from the peer table.
161 			 */
162 			if (tcptw->tw_ts_recent_stamp &&
163 			    (!twp || (sysctl_tcp_tw_reuse &&
164 				      xtime.tv_sec -
165 				      tcptw->tw_ts_recent_stamp > 1))) {
166 				tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
167 				if (tp->write_seq == 0)
168 					tp->write_seq = 1;
169 				tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
170 				tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
171 				sock_hold(sk2);
172 				goto unique;
173 			} else
174 				goto not_unique;
175 		}
176 	}
177 	tw = NULL;
178 
179 	/* And established part... */
180 	sk_for_each(sk2, node, &head->chain) {
181 		if (INET_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif))
182 			goto not_unique;
183 	}
184 
185 unique:
186 	/* Must record num and sport now. Otherwise we will see
187 	 * a socket with a funny identity in the hash table. */
188 	inet->num = lport;
189 	inet->sport = htons(lport);
190 	sk->sk_hash = hash;
191 	BUG_TRAP(sk_unhashed(sk));
192 	__sk_add_node(sk, &head->chain);
193 	sock_prot_inc_use(sk->sk_prot);
194 	write_unlock(&head->lock);
195 
196 	if (twp) {
197 		*twp = tw;
198 		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
199 	} else if (tw) {
200 		/* Silly. Should hash-dance instead... */
201 		inet_twsk_deschedule(tw, &tcp_death_row);
202 		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
203 
204 		inet_twsk_put(tw);
205 	}
206 
207 	return 0;
208 
209 not_unique:
210 	write_unlock(&head->lock);
211 	return -EADDRNOTAVAIL;
212 }
213 
214 static inline u32 connect_port_offset(const struct sock *sk)
215 {
216 	const struct inet_sock *inet = inet_sk(sk);
217 
218 	return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr,
219 					 inet->dport);
220 }
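/* This offset is only a starting point: tcp_v4_hash_connect() below walks
 * candidate ports as low + (i + offset) % range, so the per-destination
 * hash spreads different peers across the ephemeral range while the static
 * 'hint' keeps successive connects from re-probing the same ports.
 */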
221 
222 /*
223  * Bind a port for a connect operation and hash it.
224  */
225 static inline int tcp_v4_hash_connect(struct sock *sk)
226 {
227 	const unsigned short snum = inet_sk(sk)->num;
228  	struct inet_bind_hashbucket *head;
229  	struct inet_bind_bucket *tb;
230 	int ret;
231 
232  	if (!snum) {
233  		int low = sysctl_local_port_range[0];
234  		int high = sysctl_local_port_range[1];
235 		int range = high - low;
236  		int i;
237 		int port;
238 		static u32 hint;
239 		u32 offset = hint + connect_port_offset(sk);
240 		struct hlist_node *node;
241  		struct inet_timewait_sock *tw = NULL;
242 
243  		local_bh_disable();
244 		for (i = 1; i <= range; i++) {
245 			port = low + (i + offset) % range;
246  			head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)];
247  			spin_lock(&head->lock);
248 
249  			/* Does not bother with rcv_saddr checks,
250  			 * because the established check is already
251  			 * unique enough.
252  			 */
253 			inet_bind_bucket_for_each(tb, node, &head->chain) {
254  				if (tb->port == port) {
255  					BUG_TRAP(!hlist_empty(&tb->owners));
256  					if (tb->fastreuse >= 0)
257  						goto next_port;
258  					if (!__tcp_v4_check_established(sk,
259 									port,
260 									&tw))
261  						goto ok;
262  					goto next_port;
263  				}
264  			}
265 
266  			tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port);
267  			if (!tb) {
268  				spin_unlock(&head->lock);
269  				break;
270  			}
271  			tb->fastreuse = -1;
272  			goto ok;
273 
274  		next_port:
275  			spin_unlock(&head->lock);
276  		}
277  		local_bh_enable();
278 
279  		return -EADDRNOTAVAIL;
280 
281 ok:
282 		hint += i;
283 
284  		/* Head lock still held and bh's disabled */
285  		inet_bind_hash(sk, tb, port);
286 		if (sk_unhashed(sk)) {
287  			inet_sk(sk)->sport = htons(port);
288  			__inet_hash(&tcp_hashinfo, sk, 0);
289  		}
290  		spin_unlock(&head->lock);
291 
292  		if (tw) {
293  			inet_twsk_deschedule(tw, &tcp_death_row);
294  			inet_twsk_put(tw);
295  		}
296 
297 		ret = 0;
298 		goto out;
299  	}
300 
301  	head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
302  	tb  = inet_csk(sk)->icsk_bind_hash;
303 	spin_lock_bh(&head->lock);
304 	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
305 		__inet_hash(&tcp_hashinfo, sk, 0);
306 		spin_unlock_bh(&head->lock);
307 		return 0;
308 	} else {
309 		spin_unlock(&head->lock);
310 		/* No definite answer... Walk to established hash table */
311 		ret = __tcp_v4_check_established(sk, snum, NULL);
312 out:
313 		local_bh_enable();
314 		return ret;
315 	}
316 }
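/* Worked example of the ephemeral-port search above, assuming a local port
 * range of 32768..61000 (range = 28232) and a combined hint+offset of O:
 * the candidates are 32768 + (1 + O) % 28232, 32768 + (2 + O) % 28232, ...
 * The search stops at the first port whose bind bucket either does not
 * exist yet (a fresh one is created with fastreuse = -1) or is already
 * connect-only (fastreuse < 0) with a 4-tuple that is still unique in the
 * established/TIME-WAIT hash.
 */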
317 
318 /* This will initiate an outgoing connection. */
319 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
320 {
321 	struct inet_sock *inet = inet_sk(sk);
322 	struct tcp_sock *tp = tcp_sk(sk);
323 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
324 	struct rtable *rt;
325 	u32 daddr, nexthop;
326 	int tmp;
327 	int err;
328 
329 	if (addr_len < sizeof(struct sockaddr_in))
330 		return -EINVAL;
331 
332 	if (usin->sin_family != AF_INET)
333 		return -EAFNOSUPPORT;
334 
335 	nexthop = daddr = usin->sin_addr.s_addr;
336 	if (inet->opt && inet->opt->srr) {
337 		if (!daddr)
338 			return -EINVAL;
339 		nexthop = inet->opt->faddr;
340 	}
341 
342 	tmp = ip_route_connect(&rt, nexthop, inet->saddr,
343 			       RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
344 			       IPPROTO_TCP,
345 			       inet->sport, usin->sin_port, sk);
346 	if (tmp < 0)
347 		return tmp;
348 
349 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
350 		ip_rt_put(rt);
351 		return -ENETUNREACH;
352 	}
353 
354 	if (!inet->opt || !inet->opt->srr)
355 		daddr = rt->rt_dst;
356 
357 	if (!inet->saddr)
358 		inet->saddr = rt->rt_src;
359 	inet->rcv_saddr = inet->saddr;
360 
361 	if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
362 		/* Reset inherited state */
363 		tp->rx_opt.ts_recent	   = 0;
364 		tp->rx_opt.ts_recent_stamp = 0;
365 		tp->write_seq		   = 0;
366 	}
367 
368 	if (tcp_death_row.sysctl_tw_recycle &&
369 	    !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
370 		struct inet_peer *peer = rt_get_peer(rt);
371 
372 		/* VJ's idea. We save the last timestamp seen from
373 		 * the destination in the peer table when entering TIME-WAIT state,
374 		 * and initialize rx_opt.ts_recent from it when trying a new connection.
375 		 */
376 
377 		if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
378 			tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
379 			tp->rx_opt.ts_recent = peer->tcp_ts;
380 		}
381 	}
382 
383 	inet->dport = usin->sin_port;
384 	inet->daddr = daddr;
385 
386 	tp->ext_header_len = 0;
387 	if (inet->opt)
388 		tp->ext_header_len = inet->opt->optlen;
389 
390 	tp->rx_opt.mss_clamp = 536;
391 
392 	/* Socket identity is still unknown (sport may be zero).
393 	 * However, we set the state to SYN-SENT and, without releasing the
394 	 * socket lock, select a source port, enter ourselves into the hash
395 	 * tables and complete the initialization after this.
396 	 */
397 	tcp_set_state(sk, TCP_SYN_SENT);
398 	err = tcp_v4_hash_connect(sk);
399 	if (err)
400 		goto failure;
401 
402 	err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
403 	if (err)
404 		goto failure;
405 
406 	/* OK, now commit destination to socket.  */
407 	sk_setup_caps(sk, &rt->u.dst);
408 
409 	if (!tp->write_seq)
410 		tp->write_seq = secure_tcp_sequence_number(inet->saddr,
411 							   inet->daddr,
412 							   inet->sport,
413 							   usin->sin_port);
414 
415 	inet->id = tp->write_seq ^ jiffies;
416 
417 	err = tcp_connect(sk);
418 	rt = NULL;
419 	if (err)
420 		goto failure;
421 
422 	return 0;
423 
424 failure:
425 	/* This unhashes the socket and releases the local port, if necessary. */
426 	tcp_set_state(sk, TCP_CLOSE);
427 	ip_rt_put(rt);
428 	sk->sk_route_caps = 0;
429 	inet->dport = 0;
430 	return err;
431 }
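/* This is the sk->sk_prot->connect handler wired up in tcp_prot below; a
 * user-level connect(2) on an AF_INET stream socket normally reaches it via
 * inet_stream_connect(), with the socket locked and still in TCP_CLOSE.
 */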
432 
433 /*
434  * This routine does path mtu discovery as defined in RFC1191.
435  */
436 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
437 				     u32 mtu)
438 {
439 	struct dst_entry *dst;
440 	struct inet_sock *inet = inet_sk(sk);
441 	struct tcp_sock *tp = tcp_sk(sk);
442 
443 	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
444 	 * sent out by Linux are always < 576 bytes so they should go through
445 	 * unfragmented).
446 	 */
447 	if (sk->sk_state == TCP_LISTEN)
448 		return;
449 
450 	/* We don't check in the dst entry whether pmtu discovery is forbidden
451 	 * on this route. We just assume that no packet-too-big packets
452 	 * are sent back when pmtu discovery is not active.
453 	 * There is a small race when the user changes this flag in the
454 	 * route, but I think that's acceptable.
455 	 */
456 	if ((dst = __sk_dst_check(sk, 0)) == NULL)
457 		return;
458 
459 	dst->ops->update_pmtu(dst, mtu);
460 
461 	/* Something is about to go wrong... Remember the soft error
462 	 * in case this connection is not able to recover.
463 	 */
464 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
465 		sk->sk_err_soft = EMSGSIZE;
466 
467 	mtu = dst_mtu(dst);
468 
469 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
470 	    tp->pmtu_cookie > mtu) {
471 		tcp_sync_mss(sk, mtu);
472 
473 		/* Resend the TCP packet because it's
474 		 * clear that the old packet has been
475 		 * dropped. This is the new "fast" path mtu
476 		 * discovery.
477 		 */
478 		tcp_simple_retransmit(sk);
479 	} /* else let the usual retransmit timer handle it */
480 }
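/* Numeric example of the path above: if the cached path MTU was 1500 and an
 * ICMP_FRAG_NEEDED arrives advertising 1400, update_pmtu() lowers the route
 * MTU, tcp_sync_mss() shrinks the MSS accordingly (roughly 1400 minus 40
 * bytes of IP + TCP headers when no options are in use, i.e. about 1360),
 * and tcp_simple_retransmit() resends the segments that were presumably
 * dropped for being too big.
 */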
481 
482 /*
483  * This routine is called by the ICMP module when it gets some
484  * sort of error condition.  If err < 0 then the socket should
485  * be closed and the error returned to the user.  If err > 0
486  * it's just the icmp type << 8 | icmp code.  After adjustment
487  * header points to the first 8 bytes of the tcp header.  We need
488  * to find the appropriate port.
489  *
490  * The locking strategy used here is very "optimistic". When
491  * someone else accesses the socket the ICMP is just dropped
492  * and for some paths there is no check at all.
493  * A more general error queue to queue errors for later handling
494  * is probably better.
495  *
496  */
497 
498 void tcp_v4_err(struct sk_buff *skb, u32 info)
499 {
500 	struct iphdr *iph = (struct iphdr *)skb->data;
501 	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
502 	struct tcp_sock *tp;
503 	struct inet_sock *inet;
504 	int type = skb->h.icmph->type;
505 	int code = skb->h.icmph->code;
506 	struct sock *sk;
507 	__u32 seq;
508 	int err;
509 
510 	if (skb->len < (iph->ihl << 2) + 8) {
511 		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
512 		return;
513 	}
514 
515 	sk = inet_lookup(&tcp_hashinfo, iph->daddr, th->dest, iph->saddr,
516 			 th->source, inet_iif(skb));
517 	if (!sk) {
518 		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
519 		return;
520 	}
521 	if (sk->sk_state == TCP_TIME_WAIT) {
522 		inet_twsk_put((struct inet_timewait_sock *)sk);
523 		return;
524 	}
525 
526 	bh_lock_sock(sk);
527 	/* If too many ICMPs get dropped on busy
528 	 * servers this needs to be solved differently.
529 	 */
530 	if (sock_owned_by_user(sk))
531 		NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
532 
533 	if (sk->sk_state == TCP_CLOSE)
534 		goto out;
535 
536 	tp = tcp_sk(sk);
537 	seq = ntohl(th->seq);
538 	if (sk->sk_state != TCP_LISTEN &&
539 	    !between(seq, tp->snd_una, tp->snd_nxt)) {
540 		NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
541 		goto out;
542 	}
543 
544 	switch (type) {
545 	case ICMP_SOURCE_QUENCH:
546 		/* Just silently ignore these. */
547 		goto out;
548 	case ICMP_PARAMETERPROB:
549 		err = EPROTO;
550 		break;
551 	case ICMP_DEST_UNREACH:
552 		if (code > NR_ICMP_UNREACH)
553 			goto out;
554 
555 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
556 			if (!sock_owned_by_user(sk))
557 				do_pmtu_discovery(sk, iph, info);
558 			goto out;
559 		}
560 
561 		err = icmp_err_convert[code].errno;
562 		break;
563 	case ICMP_TIME_EXCEEDED:
564 		err = EHOSTUNREACH;
565 		break;
566 	default:
567 		goto out;
568 	}
569 
570 	switch (sk->sk_state) {
571 		struct request_sock *req, **prev;
572 	case TCP_LISTEN:
573 		if (sock_owned_by_user(sk))
574 			goto out;
575 
576 		req = inet_csk_search_req(sk, &prev, th->dest,
577 					  iph->daddr, iph->saddr);
578 		if (!req)
579 			goto out;
580 
581 		/* ICMPs are not backlogged, hence we cannot get
582 		   an established socket here.
583 		 */
584 		BUG_TRAP(!req->sk);
585 
586 		if (seq != tcp_rsk(req)->snt_isn) {
587 			NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
588 			goto out;
589 		}
590 
591 		/*
592 		 * Still in SYN_RECV, just remove it silently.
593 		 * There is no good way to pass the error to the newly
594 		 * created socket, and POSIX does not want network
595 		 * errors returned from accept().
596 		 */
597 		inet_csk_reqsk_queue_drop(sk, req, prev);
598 		goto out;
599 
600 	case TCP_SYN_SENT:
601 	case TCP_SYN_RECV:  /* Cannot happen.
602 			       It could, e.g., if SYNs crossed.
603 			     */
604 		if (!sock_owned_by_user(sk)) {
605 			TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
606 			sk->sk_err = err;
607 
608 			sk->sk_error_report(sk);
609 
610 			tcp_done(sk);
611 		} else {
612 			sk->sk_err_soft = err;
613 		}
614 		goto out;
615 	}
616 
617 	/* If we've already connected we will keep trying
618 	 * until we time out, or the user gives up.
619 	 *
620 	 * RFC 1122 4.2.3.9 allows us to treat as hard errors
621 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
622 	 * but it is obsoleted by pmtu discovery).
623 	 *
624 	 * Note that on the modern Internet, where routing is unreliable
625 	 * and broken firewalls sit in every dark corner sending random
626 	 * errors as ordered by their masters, even these two messages finally
627 	 * lose their original sense (even Linux sends invalid PORT_UNREACHs).
628 	 *
629 	 * Now we are in compliance with RFCs.
630 	 *							--ANK (980905)
631 	 */
632 
633 	inet = inet_sk(sk);
634 	if (!sock_owned_by_user(sk) && inet->recverr) {
635 		sk->sk_err = err;
636 		sk->sk_error_report(sk);
637 	} else	{ /* Only an error on timeout */
638 		sk->sk_err_soft = err;
639 	}
640 
641 out:
642 	bh_unlock_sock(sk);
643 	sock_put(sk);
644 }
645 
646 /* This routine computes an IPv4 TCP checksum. */
647 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
648 		       struct sk_buff *skb)
649 {
650 	struct inet_sock *inet = inet_sk(sk);
651 
652 	if (skb->ip_summed == CHECKSUM_HW) {
653 		th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
654 		skb->csum = offsetof(struct tcphdr, check);
655 	} else {
656 		th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
657 					 csum_partial((char *)th,
658 						      th->doff << 2,
659 						      skb->csum));
660 	}
661 }
662 
663 /*
664  *	This routine will send an RST to the other TCP.
665  *
666  *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
667  *		      for the reset?
668  *	Answer: if a packet caused the RST, it is not for a socket
669  *		existing in our system; if it did match a socket,
670  *		it is just a duplicate segment or a bug in the other side's TCP.
671  *		So we build the reply based only on the parameters
672  *		that arrived with the segment.
673  *	Exception: precedence violation. We do not implement it in any case.
674  */
675 
676 static void tcp_v4_send_reset(struct sk_buff *skb)
677 {
678 	struct tcphdr *th = skb->h.th;
679 	struct tcphdr rth;
680 	struct ip_reply_arg arg;
681 
682 	/* Never send a reset in response to a reset. */
683 	if (th->rst)
684 		return;
685 
686 	if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
687 		return;
688 
689 	/* Swap the send and the receive. */
690 	memset(&rth, 0, sizeof(struct tcphdr));
691 	rth.dest   = th->source;
692 	rth.source = th->dest;
693 	rth.doff   = sizeof(struct tcphdr) / 4;
694 	rth.rst    = 1;
695 
696 	if (th->ack) {
697 		rth.seq = th->ack_seq;
698 	} else {
699 		rth.ack = 1;
700 		rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
701 				    skb->len - (th->doff << 2));
702 	}
703 
704 	memset(&arg, 0, sizeof arg);
705 	arg.iov[0].iov_base = (unsigned char *)&rth;
706 	arg.iov[0].iov_len  = sizeof rth;
707 	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
708 				      skb->nh.iph->saddr, /*XXX*/
709 				      sizeof(struct tcphdr), IPPROTO_TCP, 0);
710 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
711 
712 	ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
713 
714 	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
715 	TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
716 }
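/* Sequence number choice above, by example: for an incoming segment that
 * carried an ACK, the RST reuses that ack_seq as its own seq and needs no
 * ACK of its own; for a bare SYN with sequence number S and no payload, the
 * RST instead ACKs S + 1 (the SYN occupies one unit of sequence space) with
 * seq 0, which matches RFC 793's reset generation rules.
 */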
717 
718 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
719    outside of socket context, is certainly ugly. What can I do?
720  */
721 
722 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
723 			    u32 win, u32 ts)
724 {
725 	struct tcphdr *th = skb->h.th;
726 	struct {
727 		struct tcphdr th;
728 		u32 tsopt[3];
729 	} rep;
730 	struct ip_reply_arg arg;
731 
732 	memset(&rep.th, 0, sizeof(struct tcphdr));
733 	memset(&arg, 0, sizeof arg);
734 
735 	arg.iov[0].iov_base = (unsigned char *)&rep;
736 	arg.iov[0].iov_len  = sizeof(rep.th);
737 	if (ts) {
738 		rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
739 				     (TCPOPT_TIMESTAMP << 8) |
740 				     TCPOLEN_TIMESTAMP);
741 		rep.tsopt[1] = htonl(tcp_time_stamp);
742 		rep.tsopt[2] = htonl(ts);
743 		arg.iov[0].iov_len = sizeof(rep);
744 	}
745 
746 	/* Swap the send and the receive. */
747 	rep.th.dest    = th->source;
748 	rep.th.source  = th->dest;
749 	rep.th.doff    = arg.iov[0].iov_len / 4;
750 	rep.th.seq     = htonl(seq);
751 	rep.th.ack_seq = htonl(ack);
752 	rep.th.ack     = 1;
753 	rep.th.window  = htons(win);
754 
755 	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
756 				      skb->nh.iph->saddr, /*XXX*/
757 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
758 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
759 
760 	ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
761 
762 	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
763 }
764 
765 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
766 {
767 	struct inet_timewait_sock *tw = inet_twsk(sk);
768 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
769 
770 	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
771 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcptw->tw_ts_recent);
772 
773 	inet_twsk_put(tw);
774 }
775 
776 static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
777 {
778 	tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
779 			req->ts_recent);
780 }
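/* The "+ 1" on both ISNs above accounts for the SYN taking one unit of
 * sequence space in each direction: the ACK we send for a pending request
 * says "your SYN at rcv_isn was received" and "our SYN-ACK started at
 * snt_isn, so the next byte we would send is snt_isn + 1".
 */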
781 
782 /*
783  *	Send a SYN-ACK after having received a SYN.
784  *	This still operates on a request_sock only, not on a big
785  *	socket.
786  */
787 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
788 			      struct dst_entry *dst)
789 {
790 	const struct inet_request_sock *ireq = inet_rsk(req);
791 	int err = -1;
792 	struct sk_buff * skb;
793 
794 	/* First, grab a route. */
795 	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
796 		goto out;
797 
798 	skb = tcp_make_synack(sk, dst, req);
799 
800 	if (skb) {
801 		struct tcphdr *th = skb->h.th;
802 
803 		th->check = tcp_v4_check(th, skb->len,
804 					 ireq->loc_addr,
805 					 ireq->rmt_addr,
806 					 csum_partial((char *)th, skb->len,
807 						      skb->csum));
808 
809 		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
810 					    ireq->rmt_addr,
811 					    ireq->opt);
812 		if (err == NET_XMIT_CN)
813 			err = 0;
814 	}
815 
816 out:
817 	dst_release(dst);
818 	return err;
819 }
820 
821 /*
822  *	IPv4 request_sock destructor.
823  */
824 static void tcp_v4_reqsk_destructor(struct request_sock *req)
825 {
826 	kfree(inet_rsk(req)->opt);
827 }
828 
829 static inline void syn_flood_warning(struct sk_buff *skb)
830 {
831 	static unsigned long warntime;
832 
833 	if (time_after(jiffies, (warntime + HZ * 60))) {
834 		warntime = jiffies;
835 		printk(KERN_INFO
836 		       "possible SYN flooding on port %d. Sending cookies.\n",
837 		       ntohs(skb->h.th->dest));
838 	}
839 }
840 
841 /*
842  * Save and compile IPv4 options into the request_sock if needed.
843  */
844 static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
845 						     struct sk_buff *skb)
846 {
847 	struct ip_options *opt = &(IPCB(skb)->opt);
848 	struct ip_options *dopt = NULL;
849 
850 	if (opt && opt->optlen) {
851 		int opt_size = optlength(opt);
852 		dopt = kmalloc(opt_size, GFP_ATOMIC);
853 		if (dopt) {
854 			if (ip_options_echo(dopt, skb)) {
855 				kfree(dopt);
856 				dopt = NULL;
857 			}
858 		}
859 	}
860 	return dopt;
861 }
862 
863 struct request_sock_ops tcp_request_sock_ops = {
864 	.family		=	PF_INET,
865 	.obj_size	=	sizeof(struct tcp_request_sock),
866 	.rtx_syn_ack	=	tcp_v4_send_synack,
867 	.send_ack	=	tcp_v4_reqsk_send_ack,
868 	.destructor	=	tcp_v4_reqsk_destructor,
869 	.send_reset	=	tcp_v4_send_reset,
870 };
871 
872 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
873 {
874 	struct inet_request_sock *ireq;
875 	struct tcp_options_received tmp_opt;
876 	struct request_sock *req;
877 	__u32 saddr = skb->nh.iph->saddr;
878 	__u32 daddr = skb->nh.iph->daddr;
879 	__u32 isn = TCP_SKB_CB(skb)->when;
880 	struct dst_entry *dst = NULL;
881 #ifdef CONFIG_SYN_COOKIES
882 	int want_cookie = 0;
883 #else
884 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
885 #endif
886 
887 	/* Never answer SYNs sent to broadcast or multicast addresses. */
888 	if (((struct rtable *)skb->dst)->rt_flags &
889 	    (RTCF_BROADCAST | RTCF_MULTICAST))
890 		goto drop;
891 
892 	/* TW buckets are converted to open requests without
893 	 * limitation; they conserve resources and the peer is
894 	 * evidently a real one.
895 	 */
896 	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
897 #ifdef CONFIG_SYN_COOKIES
898 		if (sysctl_tcp_syncookies) {
899 			want_cookie = 1;
900 		} else
901 #endif
902 		goto drop;
903 	}
904 
905 	/* Accept backlog is full. If we have already queued enough
906 	 * warm entries in the SYN queue, drop the request. That is better than
907 	 * clogging the SYN queue with openreqs that have exponentially increasing
908 	 * timeouts.
909 	 */
910 	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
911 		goto drop;
912 
913 	req = reqsk_alloc(&tcp_request_sock_ops);
914 	if (!req)
915 		goto drop;
916 
917 	tcp_clear_options(&tmp_opt);
918 	tmp_opt.mss_clamp = 536;
919 	tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
920 
921 	tcp_parse_options(skb, &tmp_opt, 0);
922 
923 	if (want_cookie) {
924 		tcp_clear_options(&tmp_opt);
925 		tmp_opt.saw_tstamp = 0;
926 	}
927 
928 	if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
929 		/* Some OSes (unknown ones, but I see them on a web server which
930 		 * contains information interesting only for Windows
931 		 * users) do not send their timestamp in the SYN. It is an easy case:
932 		 * we simply do not advertise TS support.
933 		 */
934 		tmp_opt.saw_tstamp = 0;
935 		tmp_opt.tstamp_ok  = 0;
936 	}
937 	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
938 
939 	tcp_openreq_init(req, &tmp_opt, skb);
940 
941 	ireq = inet_rsk(req);
942 	ireq->loc_addr = daddr;
943 	ireq->rmt_addr = saddr;
944 	ireq->opt = tcp_v4_save_options(sk, skb);
945 	if (!want_cookie)
946 		TCP_ECN_create_request(req, skb->h.th);
947 
948 	if (want_cookie) {
949 #ifdef CONFIG_SYN_COOKIES
950 		syn_flood_warning(skb);
951 #endif
952 		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
953 	} else if (!isn) {
954 		struct inet_peer *peer = NULL;
955 
956 		/* VJ's idea. We save the last timestamp seen
957 		 * from the destination in the peer table when entering
958 		 * TIME-WAIT state, and check against it before
959 		 * accepting a new connection request.
960 		 *
961 		 * If "isn" is not zero, this request hit a live
962 		 * timewait bucket, so all the necessary checks
963 		 * are made in the function processing the timewait state.
964 		 */
965 		if (tmp_opt.saw_tstamp &&
966 		    tcp_death_row.sysctl_tw_recycle &&
967 		    (dst = inet_csk_route_req(sk, req)) != NULL &&
968 		    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
969 		    peer->v4daddr == saddr) {
970 			if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
971 			    (s32)(peer->tcp_ts - req->ts_recent) >
972 							TCP_PAWS_WINDOW) {
973 				NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
974 				dst_release(dst);
975 				goto drop_and_free;
976 			}
977 		}
978 		/* Kill the following clause if you dislike this approach. */
979 		else if (!sysctl_tcp_syncookies &&
980 			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
981 			  (sysctl_max_syn_backlog >> 2)) &&
982 			 (!peer || !peer->tcp_ts_stamp) &&
983 			 (!dst || !dst_metric(dst, RTAX_RTT))) {
984 			/* Without syncookies, the last quarter of the
985 			 * backlog is filled only with destinations
986 			 * proven to be alive.
987 			 * It means that we continue to communicate
988 			 * with destinations already remembered
989 			 * at the moment of the synflood.
990 			 */
991 			LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
992 				       "request from %u.%u.%u.%u/%u\n",
993 				       NIPQUAD(saddr),
994 				       ntohs(skb->h.th->source));
995 			dst_release(dst);
996 			goto drop_and_free;
997 		}
998 
999 		isn = tcp_v4_init_sequence(sk, skb);
1000 	}
1001 	tcp_rsk(req)->snt_isn = isn;
1002 
1003 	if (tcp_v4_send_synack(sk, req, dst))
1004 		goto drop_and_free;
1005 
1006 	if (want_cookie) {
1007 	   	reqsk_free(req);
1008 	} else {
1009 		inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1010 	}
1011 	return 0;
1012 
1013 drop_and_free:
1014 	reqsk_free(req);
1015 drop:
1016 	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1017 	return 0;
1018 }
1019 
1020 
1021 /*
1022  * The three way handshake has completed - we got a valid synack -
1023  * now create the new socket.
1024  */
1025 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1026 				  struct request_sock *req,
1027 				  struct dst_entry *dst)
1028 {
1029 	struct inet_request_sock *ireq;
1030 	struct inet_sock *newinet;
1031 	struct tcp_sock *newtp;
1032 	struct sock *newsk;
1033 
1034 	if (sk_acceptq_is_full(sk))
1035 		goto exit_overflow;
1036 
1037 	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1038 		goto exit;
1039 
1040 	newsk = tcp_create_openreq_child(sk, req, skb);
1041 	if (!newsk)
1042 		goto exit;
1043 
1044 	sk_setup_caps(newsk, dst);
1045 
1046 	newtp		      = tcp_sk(newsk);
1047 	newinet		      = inet_sk(newsk);
1048 	ireq		      = inet_rsk(req);
1049 	newinet->daddr	      = ireq->rmt_addr;
1050 	newinet->rcv_saddr    = ireq->loc_addr;
1051 	newinet->saddr	      = ireq->loc_addr;
1052 	newinet->opt	      = ireq->opt;
1053 	ireq->opt	      = NULL;
1054 	newinet->mc_index     = inet_iif(skb);
1055 	newinet->mc_ttl	      = skb->nh.iph->ttl;
1056 	newtp->ext_header_len = 0;
1057 	if (newinet->opt)
1058 		newtp->ext_header_len = newinet->opt->optlen;
1059 	newinet->id = newtp->write_seq ^ jiffies;
1060 
1061 	tcp_sync_mss(newsk, dst_mtu(dst));
1062 	newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1063 	tcp_initialize_rcv_mss(newsk);
1064 
1065 	__inet_hash(&tcp_hashinfo, newsk, 0);
1066 	__inet_inherit_port(&tcp_hashinfo, sk, newsk);
1067 
1068 	return newsk;
1069 
1070 exit_overflow:
1071 	NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1072 exit:
1073 	NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1074 	dst_release(dst);
1075 	return NULL;
1076 }
1077 
1078 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1079 {
1080 	struct tcphdr *th = skb->h.th;
1081 	struct iphdr *iph = skb->nh.iph;
1082 	struct sock *nsk;
1083 	struct request_sock **prev;
1084 	/* Find possible connection requests. */
1085 	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1086 						       iph->saddr, iph->daddr);
1087 	if (req)
1088 		return tcp_check_req(sk, skb, req, prev);
1089 
1090 	nsk = __inet_lookup_established(&tcp_hashinfo, skb->nh.iph->saddr,
1091 					th->source, skb->nh.iph->daddr,
1092 					ntohs(th->dest), inet_iif(skb));
1093 
1094 	if (nsk) {
1095 		if (nsk->sk_state != TCP_TIME_WAIT) {
1096 			bh_lock_sock(nsk);
1097 			return nsk;
1098 		}
1099 		inet_twsk_put((struct inet_timewait_sock *)nsk);
1100 		return NULL;
1101 	}
1102 
1103 #ifdef CONFIG_SYN_COOKIES
1104 	if (!th->rst && !th->syn && th->ack)
1105 		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1106 #endif
1107 	return sk;
1108 }
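/* Three possible outcomes above: a matching open request is handed to
 * tcp_check_req() (which may create the child socket), an already
 * established socket for the same 4-tuple is returned locked, or - with
 * syncookies compiled in - a bare ACK is checked against a previously
 * emitted cookie. A NULL return tells the caller to discard the segment.
 */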
1109 
1110 static int tcp_v4_checksum_init(struct sk_buff *skb)
1111 {
1112 	if (skb->ip_summed == CHECKSUM_HW) {
1113 		if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1114 				  skb->nh.iph->daddr, skb->csum)) {
1115 			skb->ip_summed = CHECKSUM_UNNECESSARY;
1116 			return 0;
1117 		}
1118 	}
1119 
1120 	skb->csum = csum_tcpudp_nofold(skb->nh.iph->saddr, skb->nh.iph->daddr,
1121 				       skb->len, IPPROTO_TCP, 0);
1122 
1123 	if (skb->len <= 76) {
1124 		return __skb_checksum_complete(skb);
1125 	}
1126 	return 0;
1127 }
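/* Checksum strategy above, in short: if the hardware already summed the
 * packet (CHECKSUM_HW) we only fold in the pseudo-header and accept the
 * segment when the result is zero; otherwise we precompute the
 * pseudo-header sum and, for small packets (<= 76 bytes), finish the
 * software checksum immediately, while larger packets are verified later
 * (typically while being copied).
 */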
1128 
1129 
1130 /* The socket must have its spinlock held when we get
1131  * here.
1132  *
1133  * We have a potential double-lock case here, so even when
1134  * doing backlog processing we use the BH locking scheme.
1135  * This is because we cannot sleep with the original spinlock
1136  * held.
1137  */
1138 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1139 {
1140 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1141 		TCP_CHECK_TIMER(sk);
1142 		if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1143 			goto reset;
1144 		TCP_CHECK_TIMER(sk);
1145 		return 0;
1146 	}
1147 
1148 	if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1149 		goto csum_err;
1150 
1151 	if (sk->sk_state == TCP_LISTEN) {
1152 		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1153 		if (!nsk)
1154 			goto discard;
1155 
1156 		if (nsk != sk) {
1157 			if (tcp_child_process(sk, nsk, skb))
1158 				goto reset;
1159 			return 0;
1160 		}
1161 	}
1162 
1163 	TCP_CHECK_TIMER(sk);
1164 	if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1165 		goto reset;
1166 	TCP_CHECK_TIMER(sk);
1167 	return 0;
1168 
1169 reset:
1170 	tcp_v4_send_reset(skb);
1171 discard:
1172 	kfree_skb(skb);
1173 	/* Be careful here. If this function gets more complicated and
1174 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1175 	 * might be destroyed here. This current version compiles correctly,
1176 	 * but you have been warned.
1177 	 */
1178 	return 0;
1179 
1180 csum_err:
1181 	TCP_INC_STATS_BH(TCP_MIB_INERRS);
1182 	goto discard;
1183 }
1184 
1185 /*
1186  *	From tcp_input.c
1187  */
1188 
1189 int tcp_v4_rcv(struct sk_buff *skb)
1190 {
1191 	struct tcphdr *th;
1192 	struct sock *sk;
1193 	int ret;
1194 
1195 	if (skb->pkt_type != PACKET_HOST)
1196 		goto discard_it;
1197 
1198 	/* Count it even if it's bad */
1199 	TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1200 
1201 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1202 		goto discard_it;
1203 
1204 	th = skb->h.th;
1205 
1206 	if (th->doff < sizeof(struct tcphdr) / 4)
1207 		goto bad_packet;
1208 	if (!pskb_may_pull(skb, th->doff * 4))
1209 		goto discard_it;
1210 
1211 	/* An explanation is required here, I think.
1212 	 * Packet length and doff are validated by header prediction,
1213 	 * provided the case of th->doff == 0 is eliminated.
1214 	 * So, we defer the checks. */
1215 	if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1216 	     tcp_v4_checksum_init(skb)))
1217 		goto bad_packet;
1218 
1219 	th = skb->h.th;
1220 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1221 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1222 				    skb->len - th->doff * 4);
1223 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1224 	TCP_SKB_CB(skb)->when	 = 0;
1225 	TCP_SKB_CB(skb)->flags	 = skb->nh.iph->tos;
1226 	TCP_SKB_CB(skb)->sacked	 = 0;
1227 
1228 	sk = __inet_lookup(&tcp_hashinfo, skb->nh.iph->saddr, th->source,
1229 			   skb->nh.iph->daddr, ntohs(th->dest),
1230 			   inet_iif(skb));
1231 
1232 	if (!sk)
1233 		goto no_tcp_socket;
1234 
1235 process:
1236 	if (sk->sk_state == TCP_TIME_WAIT)
1237 		goto do_time_wait;
1238 
1239 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1240 		goto discard_and_relse;
1241 
1242 	if (sk_filter(sk, skb, 0))
1243 		goto discard_and_relse;
1244 
1245 	skb->dev = NULL;
1246 
1247 	bh_lock_sock(sk);
1248 	ret = 0;
1249 	if (!sock_owned_by_user(sk)) {
1250 		if (!tcp_prequeue(sk, skb))
1251 			ret = tcp_v4_do_rcv(sk, skb);
1252 	} else
1253 		sk_add_backlog(sk, skb);
1254 	bh_unlock_sock(sk);
1255 
1256 	sock_put(sk);
1257 
1258 	return ret;
1259 
1260 no_tcp_socket:
1261 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1262 		goto discard_it;
1263 
1264 	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1265 bad_packet:
1266 		TCP_INC_STATS_BH(TCP_MIB_INERRS);
1267 	} else {
1268 		tcp_v4_send_reset(skb);
1269 	}
1270 
1271 discard_it:
1272 	/* Discard frame. */
1273 	kfree_skb(skb);
1274   	return 0;
1275 
1276 discard_and_relse:
1277 	sock_put(sk);
1278 	goto discard_it;
1279 
1280 do_time_wait:
1281 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1282 		inet_twsk_put((struct inet_timewait_sock *) sk);
1283 		goto discard_it;
1284 	}
1285 
1286 	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1287 		TCP_INC_STATS_BH(TCP_MIB_INERRS);
1288 		inet_twsk_put((struct inet_timewait_sock *) sk);
1289 		goto discard_it;
1290 	}
1291 	switch (tcp_timewait_state_process((struct inet_timewait_sock *)sk,
1292 					   skb, th)) {
1293 	case TCP_TW_SYN: {
1294 		struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo,
1295 							skb->nh.iph->daddr,
1296 							ntohs(th->dest),
1297 							inet_iif(skb));
1298 		if (sk2) {
1299 			inet_twsk_deschedule((struct inet_timewait_sock *)sk,
1300 					     &tcp_death_row);
1301 			inet_twsk_put((struct inet_timewait_sock *)sk);
1302 			sk = sk2;
1303 			goto process;
1304 		}
1305 		/* Fall through to ACK */
1306 	}
1307 	case TCP_TW_ACK:
1308 		tcp_v4_timewait_ack(sk, skb);
1309 		break;
1310 	case TCP_TW_RST:
1311 		goto no_tcp_socket;
1312 	case TCP_TW_SUCCESS:;
1313 	}
1314 	goto discard_it;
1315 }
1316 
1317 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1318 {
1319 	struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1320 	struct inet_sock *inet = inet_sk(sk);
1321 
1322 	sin->sin_family		= AF_INET;
1323 	sin->sin_addr.s_addr	= inet->daddr;
1324 	sin->sin_port		= inet->dport;
1325 }
1326 
1327 /* VJ's idea. Save the last timestamp seen from this destination
1328  * and hold it at least for the normal timewait interval, to use for duplicate
1329  * segment detection in subsequent connections before they enter the synchronized
1330  * state.
1331  */
1332 
1333 int tcp_v4_remember_stamp(struct sock *sk)
1334 {
1335 	struct inet_sock *inet = inet_sk(sk);
1336 	struct tcp_sock *tp = tcp_sk(sk);
1337 	struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1338 	struct inet_peer *peer = NULL;
1339 	int release_it = 0;
1340 
1341 	if (!rt || rt->rt_dst != inet->daddr) {
1342 		peer = inet_getpeer(inet->daddr, 1);
1343 		release_it = 1;
1344 	} else {
1345 		if (!rt->peer)
1346 			rt_bind_peer(rt, 1);
1347 		peer = rt->peer;
1348 	}
1349 
1350 	if (peer) {
1351 		if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1352 		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1353 		     peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1354 			peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1355 			peer->tcp_ts = tp->rx_opt.ts_recent;
1356 		}
1357 		if (release_it)
1358 			inet_putpeer(peer);
1359 		return 1;
1360 	}
1361 
1362 	return 0;
1363 }
1364 
1365 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1366 {
1367 	struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1368 
1369 	if (peer) {
1370 		const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1371 
1372 		if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1373 		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1374 		     peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1375 			peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1376 			peer->tcp_ts	   = tcptw->tw_ts_recent;
1377 		}
1378 		inet_putpeer(peer);
1379 		return 1;
1380 	}
1381 
1382 	return 0;
1383 }
1384 
1385 struct tcp_func ipv4_specific = {
1386 	.queue_xmit	=	ip_queue_xmit,
1387 	.send_check	=	tcp_v4_send_check,
1388 	.rebuild_header	=	inet_sk_rebuild_header,
1389 	.conn_request	=	tcp_v4_conn_request,
1390 	.syn_recv_sock	=	tcp_v4_syn_recv_sock,
1391 	.remember_stamp	=	tcp_v4_remember_stamp,
1392 	.net_header_len	=	sizeof(struct iphdr),
1393 	.setsockopt	=	ip_setsockopt,
1394 	.getsockopt	=	ip_getsockopt,
1395 	.addr2sockaddr	=	v4_addr2sockaddr,
1396 	.sockaddr_len	=	sizeof(struct sockaddr_in),
1397 };
1398 
1399 /* NOTE: A lot of things are set to zero explicitly by the call to
1400  *       sk_alloc(), so they need not be done here.
1401  */
1402 static int tcp_v4_init_sock(struct sock *sk)
1403 {
1404 	struct inet_connection_sock *icsk = inet_csk(sk);
1405 	struct tcp_sock *tp = tcp_sk(sk);
1406 
1407 	skb_queue_head_init(&tp->out_of_order_queue);
1408 	tcp_init_xmit_timers(sk);
1409 	tcp_prequeue_init(tp);
1410 
1411 	icsk->icsk_rto = TCP_TIMEOUT_INIT;
1412 	tp->mdev = TCP_TIMEOUT_INIT;
1413 
1414 	/* So many TCP implementations out there (incorrectly) count the
1415 	 * initial SYN frame in their delayed-ACK and congestion control
1416 	 * algorithms that we must have the following bandaid to talk
1417 	 * efficiently to them.  -DaveM
1418 	 */
1419 	tp->snd_cwnd = 2;
1420 
1421 	/* See draft-stevens-tcpca-spec-01 for discussion of the
1422 	 * initialization of these values.
1423 	 */
1424 	tp->snd_ssthresh = 0x7fffffff;	/* Infinity */
1425 	tp->snd_cwnd_clamp = ~0;
1426 	tp->mss_cache = 536;
1427 
1428 	tp->reordering = sysctl_tcp_reordering;
1429 	icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1430 
1431 	sk->sk_state = TCP_CLOSE;
1432 
1433 	sk->sk_write_space = sk_stream_write_space;
1434 	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1435 
1436 	tp->af_specific = &ipv4_specific;
1437 
1438 	sk->sk_sndbuf = sysctl_tcp_wmem[1];
1439 	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1440 
1441 	atomic_inc(&tcp_sockets_allocated);
1442 
1443 	return 0;
1444 }
1445 
1446 int tcp_v4_destroy_sock(struct sock *sk)
1447 {
1448 	struct tcp_sock *tp = tcp_sk(sk);
1449 
1450 	tcp_clear_xmit_timers(sk);
1451 
1452 	tcp_cleanup_congestion_control(sk);
1453 
1454 	/* Clean up the write buffer. */
1455   	sk_stream_writequeue_purge(sk);
1456 
1457 	/* Cleans up our, hopefully empty, out_of_order_queue. */
1458   	__skb_queue_purge(&tp->out_of_order_queue);
1459 
1460 	/* Clean the prequeue; it really must be empty. */
1461 	__skb_queue_purge(&tp->ucopy.prequeue);
1462 
1463 	/* Clean up a referenced TCP bind bucket. */
1464 	if (inet_csk(sk)->icsk_bind_hash)
1465 		inet_put_port(&tcp_hashinfo, sk);
1466 
1467 	/*
1468 	 * If sendmsg cached page exists, toss it.
1469 	 */
1470 	if (sk->sk_sndmsg_page) {
1471 		__free_page(sk->sk_sndmsg_page);
1472 		sk->sk_sndmsg_page = NULL;
1473 	}
1474 
1475 	atomic_dec(&tcp_sockets_allocated);
1476 
1477 	return 0;
1478 }
1479 
1480 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1481 
1482 #ifdef CONFIG_PROC_FS
1483 /* Proc filesystem TCP sock list dumping. */
1484 
1485 static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
1486 {
1487 	return hlist_empty(head) ? NULL :
1488 		list_entry(head->first, struct inet_timewait_sock, tw_node);
1489 }
1490 
1491 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1492 {
1493 	return tw->tw_node.next ?
1494 		hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1495 }
1496 
1497 static void *listening_get_next(struct seq_file *seq, void *cur)
1498 {
1499 	struct inet_connection_sock *icsk;
1500 	struct hlist_node *node;
1501 	struct sock *sk = cur;
1502 	struct tcp_iter_state* st = seq->private;
1503 
1504 	if (!sk) {
1505 		st->bucket = 0;
1506 		sk = sk_head(&tcp_hashinfo.listening_hash[0]);
1507 		goto get_sk;
1508 	}
1509 
1510 	++st->num;
1511 
1512 	if (st->state == TCP_SEQ_STATE_OPENREQ) {
1513 		struct request_sock *req = cur;
1514 
1515 	       	icsk = inet_csk(st->syn_wait_sk);
1516 		req = req->dl_next;
1517 		while (1) {
1518 			while (req) {
1519 				if (req->rsk_ops->family == st->family) {
1520 					cur = req;
1521 					goto out;
1522 				}
1523 				req = req->dl_next;
1524 			}
1525 			if (++st->sbucket >= TCP_SYNQ_HSIZE)
1526 				break;
1527 get_req:
1528 			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1529 		}
1530 		sk	  = sk_next(st->syn_wait_sk);
1531 		st->state = TCP_SEQ_STATE_LISTENING;
1532 		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1533 	} else {
1534 	       	icsk = inet_csk(sk);
1535 		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1536 		if (reqsk_queue_len(&icsk->icsk_accept_queue))
1537 			goto start_req;
1538 		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1539 		sk = sk_next(sk);
1540 	}
1541 get_sk:
1542 	sk_for_each_from(sk, node) {
1543 		if (sk->sk_family == st->family) {
1544 			cur = sk;
1545 			goto out;
1546 		}
1547 	       	icsk = inet_csk(sk);
1548 		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1549 		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
1550 start_req:
1551 			st->uid		= sock_i_uid(sk);
1552 			st->syn_wait_sk = sk;
1553 			st->state	= TCP_SEQ_STATE_OPENREQ;
1554 			st->sbucket	= 0;
1555 			goto get_req;
1556 		}
1557 		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1558 	}
1559 	if (++st->bucket < INET_LHTABLE_SIZE) {
1560 		sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
1561 		goto get_sk;
1562 	}
1563 	cur = NULL;
1564 out:
1565 	return cur;
1566 }
1567 
1568 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1569 {
1570 	void *rc = listening_get_next(seq, NULL);
1571 
1572 	while (rc && *pos) {
1573 		rc = listening_get_next(seq, rc);
1574 		--*pos;
1575 	}
1576 	return rc;
1577 }
1578 
1579 static void *established_get_first(struct seq_file *seq)
1580 {
1581 	struct tcp_iter_state* st = seq->private;
1582 	void *rc = NULL;
1583 
1584 	for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
1585 		struct sock *sk;
1586 		struct hlist_node *node;
1587 		struct inet_timewait_sock *tw;
1588 
1589 		/* We can reschedule _before_ having picked the target: */
1590 		cond_resched_softirq();
1591 
1592 		read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
1593 		sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1594 			if (sk->sk_family != st->family) {
1595 				continue;
1596 			}
1597 			rc = sk;
1598 			goto out;
1599 		}
1600 		st->state = TCP_SEQ_STATE_TIME_WAIT;
1601 		inet_twsk_for_each(tw, node,
1602 				   &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) {
1603 			if (tw->tw_family != st->family) {
1604 				continue;
1605 			}
1606 			rc = tw;
1607 			goto out;
1608 		}
1609 		read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1610 		st->state = TCP_SEQ_STATE_ESTABLISHED;
1611 	}
1612 out:
1613 	return rc;
1614 }
1615 
1616 static void *established_get_next(struct seq_file *seq, void *cur)
1617 {
1618 	struct sock *sk = cur;
1619 	struct inet_timewait_sock *tw;
1620 	struct hlist_node *node;
1621 	struct tcp_iter_state* st = seq->private;
1622 
1623 	++st->num;
1624 
1625 	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
1626 		tw = cur;
1627 		tw = tw_next(tw);
1628 get_tw:
1629 		while (tw && tw->tw_family != st->family) {
1630 			tw = tw_next(tw);
1631 		}
1632 		if (tw) {
1633 			cur = tw;
1634 			goto out;
1635 		}
1636 		read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1637 		st->state = TCP_SEQ_STATE_ESTABLISHED;
1638 
1639 		/* We can reschedule between buckets: */
1640 		cond_resched_softirq();
1641 
1642 		if (++st->bucket < tcp_hashinfo.ehash_size) {
1643 			read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
1644 			sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
1645 		} else {
1646 			cur = NULL;
1647 			goto out;
1648 		}
1649 	} else
1650 		sk = sk_next(sk);
1651 
1652 	sk_for_each_from(sk, node) {
1653 		if (sk->sk_family == st->family)
1654 			goto found;
1655 	}
1656 
1657 	st->state = TCP_SEQ_STATE_TIME_WAIT;
1658 	tw = tw_head(&tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain);
1659 	goto get_tw;
1660 found:
1661 	cur = sk;
1662 out:
1663 	return cur;
1664 }
1665 
1666 static void *established_get_idx(struct seq_file *seq, loff_t pos)
1667 {
1668 	void *rc = established_get_first(seq);
1669 
1670 	while (rc && pos) {
1671 		rc = established_get_next(seq, rc);
1672 		--pos;
1673 	}
1674 	return rc;
1675 }
1676 
1677 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
1678 {
1679 	void *rc;
1680 	struct tcp_iter_state* st = seq->private;
1681 
1682 	inet_listen_lock(&tcp_hashinfo);
1683 	st->state = TCP_SEQ_STATE_LISTENING;
1684 	rc	  = listening_get_idx(seq, &pos);
1685 
1686 	if (!rc) {
1687 		inet_listen_unlock(&tcp_hashinfo);
1688 		local_bh_disable();
1689 		st->state = TCP_SEQ_STATE_ESTABLISHED;
1690 		rc	  = established_get_idx(seq, pos);
1691 	}
1692 
1693 	return rc;
1694 }
1695 
1696 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
1697 {
1698 	struct tcp_iter_state* st = seq->private;
1699 	st->state = TCP_SEQ_STATE_LISTENING;
1700 	st->num = 0;
1701 	return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
1702 }
1703 
1704 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1705 {
1706 	void *rc = NULL;
1707 	struct tcp_iter_state* st;
1708 
1709 	if (v == SEQ_START_TOKEN) {
1710 		rc = tcp_get_idx(seq, 0);
1711 		goto out;
1712 	}
1713 	st = seq->private;
1714 
1715 	switch (st->state) {
1716 	case TCP_SEQ_STATE_OPENREQ:
1717 	case TCP_SEQ_STATE_LISTENING:
1718 		rc = listening_get_next(seq, v);
1719 		if (!rc) {
1720 			inet_listen_unlock(&tcp_hashinfo);
1721 			local_bh_disable();
1722 			st->state = TCP_SEQ_STATE_ESTABLISHED;
1723 			rc	  = established_get_first(seq);
1724 		}
1725 		break;
1726 	case TCP_SEQ_STATE_ESTABLISHED:
1727 	case TCP_SEQ_STATE_TIME_WAIT:
1728 		rc = established_get_next(seq, v);
1729 		break;
1730 	}
1731 out:
1732 	++*pos;
1733 	return rc;
1734 }
1735 
1736 static void tcp_seq_stop(struct seq_file *seq, void *v)
1737 {
1738 	struct tcp_iter_state* st = seq->private;
1739 
1740 	switch (st->state) {
1741 	case TCP_SEQ_STATE_OPENREQ:
1742 		if (v) {
1743 			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
1744 			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1745 		}
1746 	case TCP_SEQ_STATE_LISTENING:
1747 		if (v != SEQ_START_TOKEN)
1748 			inet_listen_unlock(&tcp_hashinfo);
1749 		break;
1750 	case TCP_SEQ_STATE_TIME_WAIT:
1751 	case TCP_SEQ_STATE_ESTABLISHED:
1752 		if (v)
1753 			read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1754 		local_bh_enable();
1755 		break;
1756 	}
1757 }
1758 
1759 static int tcp_seq_open(struct inode *inode, struct file *file)
1760 {
1761 	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
1762 	struct seq_file *seq;
1763 	struct tcp_iter_state *s;
1764 	int rc;
1765 
1766 	if (unlikely(afinfo == NULL))
1767 		return -EINVAL;
1768 
1769 	s = kmalloc(sizeof(*s), GFP_KERNEL);
1770 	if (!s)
1771 		return -ENOMEM;
1772 	memset(s, 0, sizeof(*s));
1773 	s->family		= afinfo->family;
1774 	s->seq_ops.start	= tcp_seq_start;
1775 	s->seq_ops.next		= tcp_seq_next;
1776 	s->seq_ops.show		= afinfo->seq_show;
1777 	s->seq_ops.stop		= tcp_seq_stop;
1778 
1779 	rc = seq_open(file, &s->seq_ops);
1780 	if (rc)
1781 		goto out_kfree;
1782 	seq	     = file->private_data;
1783 	seq->private = s;
1784 out:
1785 	return rc;
1786 out_kfree:
1787 	kfree(s);
1788 	goto out;
1789 }
1790 
1791 int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
1792 {
1793 	int rc = 0;
1794 	struct proc_dir_entry *p;
1795 
1796 	if (!afinfo)
1797 		return -EINVAL;
1798 	afinfo->seq_fops->owner		= afinfo->owner;
1799 	afinfo->seq_fops->open		= tcp_seq_open;
1800 	afinfo->seq_fops->read		= seq_read;
1801 	afinfo->seq_fops->llseek	= seq_lseek;
1802 	afinfo->seq_fops->release	= seq_release_private;
1803 
1804 	p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
1805 	if (p)
1806 		p->data = afinfo;
1807 	else
1808 		rc = -ENOMEM;
1809 	return rc;
1810 }
1811 
1812 void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
1813 {
1814 	if (!afinfo)
1815 		return;
1816 	proc_net_remove(afinfo->name);
1817 	memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
1818 }
1819 
1820 static void get_openreq4(struct sock *sk, struct request_sock *req,
1821 			 char *tmpbuf, int i, int uid)
1822 {
1823 	const struct inet_request_sock *ireq = inet_rsk(req);
1824 	int ttd = req->expires - jiffies;
1825 
1826 	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
1827 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
1828 		i,
1829 		ireq->loc_addr,
1830 		ntohs(inet_sk(sk)->sport),
1831 		ireq->rmt_addr,
1832 		ntohs(ireq->rmt_port),
1833 		TCP_SYN_RECV,
1834 		0, 0, /* could print option size, but that is af dependent. */
1835 		1,    /* timers active (only the expire timer) */
1836 		jiffies_to_clock_t(ttd),
1837 		req->retrans,
1838 		uid,
1839 		0,  /* non standard timer */
1840 		0, /* open_requests have no inode */
1841 		atomic_read(&sk->sk_refcnt),
1842 		req);
1843 }
1844 
1845 static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
1846 {
1847 	int timer_active;
1848 	unsigned long timer_expires;
1849 	struct tcp_sock *tp = tcp_sk(sp);
1850 	const struct inet_connection_sock *icsk = inet_csk(sp);
1851 	struct inet_sock *inet = inet_sk(sp);
1852 	unsigned int dest = inet->daddr;
1853 	unsigned int src = inet->rcv_saddr;
1854 	__u16 destp = ntohs(inet->dport);
1855 	__u16 srcp = ntohs(inet->sport);
1856 
1857 	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
1858 		timer_active	= 1;
1859 		timer_expires	= icsk->icsk_timeout;
1860 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
1861 		timer_active	= 4;
1862 		timer_expires	= icsk->icsk_timeout;
1863 	} else if (timer_pending(&sp->sk_timer)) {
1864 		timer_active	= 2;
1865 		timer_expires	= sp->sk_timer.expires;
1866 	} else {
1867 		timer_active	= 0;
1868 		timer_expires = jiffies;
1869 	}
1870 
1871 	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
1872 			"%08X %5d %8d %lu %d %p %u %u %u %u %d",
1873 		i, src, srcp, dest, destp, sp->sk_state,
1874 		tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
1875 		timer_active,
1876 		jiffies_to_clock_t(timer_expires - jiffies),
1877 		icsk->icsk_retransmits,
1878 		sock_i_uid(sp),
1879 		icsk->icsk_probes_out,
1880 		sock_i_ino(sp),
1881 		atomic_read(&sp->sk_refcnt), sp,
1882 		icsk->icsk_rto,
1883 		icsk->icsk_ack.ato,
1884 		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
1885 		tp->snd_cwnd,
1886 		tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
1887 }
1888 
1889 static void get_timewait4_sock(struct inet_timewait_sock *tw, char *tmpbuf, int i)
1890 {
1891 	unsigned int dest, src;
1892 	__u16 destp, srcp;
1893 	int ttd = tw->tw_ttd - jiffies;
1894 
1895 	if (ttd < 0)
1896 		ttd = 0;
1897 
1898 	dest  = tw->tw_daddr;
1899 	src   = tw->tw_rcv_saddr;
1900 	destp = ntohs(tw->tw_dport);
1901 	srcp  = ntohs(tw->tw_sport);
1902 
1903 	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
1904 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
1905 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
1906 		3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
1907 		atomic_read(&tw->tw_refcnt), tw);
1908 }
1909 
1910 #define TMPSZ 150
1911 
1912 static int tcp4_seq_show(struct seq_file *seq, void *v)
1913 {
1914 	struct tcp_iter_state* st;
1915 	char tmpbuf[TMPSZ + 1];
1916 
1917 	if (v == SEQ_START_TOKEN) {
1918 		seq_printf(seq, "%-*s\n", TMPSZ - 1,
1919 			   "  sl  local_address rem_address   st tx_queue "
1920 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
1921 			   "inode");
1922 		goto out;
1923 	}
1924 	st = seq->private;
1925 
1926 	switch (st->state) {
1927 	case TCP_SEQ_STATE_LISTENING:
1928 	case TCP_SEQ_STATE_ESTABLISHED:
1929 		get_tcp4_sock(v, tmpbuf, st->num);
1930 		break;
1931 	case TCP_SEQ_STATE_OPENREQ:
1932 		get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
1933 		break;
1934 	case TCP_SEQ_STATE_TIME_WAIT:
1935 		get_timewait4_sock(v, tmpbuf, st->num);
1936 		break;
1937 	}
1938 	seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
1939 out:
1940 	return 0;
1941 }
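/* Each row emitted above follows the classic /proc/net/tcp layout:
 * addresses and ports are printed as raw hex, so on a little-endian
 * machine 127.0.0.1:22 would appear as 0100007F:0016, followed by the
 * state, queue sizes, timer info, uid, inode and a few TCP-internal
 * counters.
 */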
1942 
1943 static struct file_operations tcp4_seq_fops;
1944 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
1945 	.owner		= THIS_MODULE,
1946 	.name		= "tcp",
1947 	.family		= AF_INET,
1948 	.seq_show	= tcp4_seq_show,
1949 	.seq_fops	= &tcp4_seq_fops,
1950 };
1951 
1952 int __init tcp4_proc_init(void)
1953 {
1954 	return tcp_proc_register(&tcp4_seq_afinfo);
1955 }
1956 
1957 void tcp4_proc_exit(void)
1958 {
1959 	tcp_proc_unregister(&tcp4_seq_afinfo);
1960 }
1961 #endif /* CONFIG_PROC_FS */
1962 
1963 struct proto tcp_prot = {
1964 	.name			= "TCP",
1965 	.owner			= THIS_MODULE,
1966 	.close			= tcp_close,
1967 	.connect		= tcp_v4_connect,
1968 	.disconnect		= tcp_disconnect,
1969 	.accept			= inet_csk_accept,
1970 	.ioctl			= tcp_ioctl,
1971 	.init			= tcp_v4_init_sock,
1972 	.destroy		= tcp_v4_destroy_sock,
1973 	.shutdown		= tcp_shutdown,
1974 	.setsockopt		= tcp_setsockopt,
1975 	.getsockopt		= tcp_getsockopt,
1976 	.sendmsg		= tcp_sendmsg,
1977 	.recvmsg		= tcp_recvmsg,
1978 	.backlog_rcv		= tcp_v4_do_rcv,
1979 	.hash			= tcp_v4_hash,
1980 	.unhash			= tcp_unhash,
1981 	.get_port		= tcp_v4_get_port,
1982 	.enter_memory_pressure	= tcp_enter_memory_pressure,
1983 	.sockets_allocated	= &tcp_sockets_allocated,
1984 	.orphan_count		= &tcp_orphan_count,
1985 	.memory_allocated	= &tcp_memory_allocated,
1986 	.memory_pressure	= &tcp_memory_pressure,
1987 	.sysctl_mem		= sysctl_tcp_mem,
1988 	.sysctl_wmem		= sysctl_tcp_wmem,
1989 	.sysctl_rmem		= sysctl_tcp_rmem,
1990 	.max_header		= MAX_TCP_HEADER,
1991 	.obj_size		= sizeof(struct tcp_sock),
1992 	.twsk_obj_size		= sizeof(struct tcp_timewait_sock),
1993 	.rsk_prot		= &tcp_request_sock_ops,
1994 };
1995 
1996 
1997 
1998 void __init tcp_v4_init(struct net_proto_family *ops)
1999 {
2000 	int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
2001 	if (err < 0)
2002 		panic("Failed to create the TCP control socket.\n");
2003 	tcp_socket->sk->sk_allocation   = GFP_ATOMIC;
2004 	inet_sk(tcp_socket->sk)->uc_ttl = -1;
2005 
2006 	/* Unhash it so that IP input processing does not even
2007 	 * see it; we do not wish this socket to see incoming
2008 	 * packets.
2009 	 */
2010 	tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
2011 }
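/* The control socket created above never carries user data: it only gives
 * tcp_v4_send_reset() and tcp_v4_send_ack() a socket to hand to
 * ip_send_reply() when answering segments that match no full socket of our
 * own (RSTs, and ACKs for TIME-WAIT or SYN-RECV mini-sockets).
 */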
2012 
2013 EXPORT_SYMBOL(ipv4_specific);
2014 EXPORT_SYMBOL(inet_bind_bucket_create);
2015 EXPORT_SYMBOL(tcp_hashinfo);
2016 EXPORT_SYMBOL(tcp_prot);
2017 EXPORT_SYMBOL(tcp_unhash);
2018 EXPORT_SYMBOL(tcp_v4_conn_request);
2019 EXPORT_SYMBOL(tcp_v4_connect);
2020 EXPORT_SYMBOL(tcp_v4_do_rcv);
2021 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2022 EXPORT_SYMBOL(tcp_v4_send_check);
2023 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2024 
2025 #ifdef CONFIG_PROC_FS
2026 EXPORT_SYMBOL(tcp_proc_register);
2027 EXPORT_SYMBOL(tcp_proc_unregister);
2028 #endif
2029 EXPORT_SYMBOL(sysctl_local_port_range);
2030 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2031 EXPORT_SYMBOL(sysctl_tcp_tw_reuse);
2032 
2033