xref: /linux/net/ipv4/tcp_ipv4.c (revision e9e8bcb8178e197d889ec31e79fa1ddc1732c8f9)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Implementation of the Transmission Control Protocol(TCP).
7  *
8  *		IPv4 specific functions
9  *
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  *
18  *	This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23 
24 /*
25  * Changes:
26  *		David S. Miller	:	New socket lookup architecture.
27  *					This code is dedicated to John Dyson.
28  *		David S. Miller :	Change semantics of established hash,
29  *					half is devoted to TIME_WAIT sockets
30  *					and the rest go in the other half.
31  *		Andi Kleen :		Add support for syncookies and fixed
32  *					some bugs: ip options weren't passed to
33  *					the TCP layer, missed a check for an
34  *					ACK bit.
35  *		Andi Kleen :		Implemented fast path mtu discovery.
36  *	     				Fixed many serious bugs in the
37  *					request_sock handling and moved
38  *					most of it into the af independent code.
39  *					Added tail drop and some other bugfixes.
40  *					Added new listen semantics.
41  *		Mike McLagan	:	Routing by source
42  *	Juan Jose Ciarlante:		ip_dynaddr bits
43  *		Andi Kleen:		various fixes.
44  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
45  *					coma.
46  *	Andi Kleen		:	Fix new listen.
47  *	Andi Kleen		:	Fix accept error reporting.
48  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
49  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
50  *					a single port at the same time.
51  */
52 
53 
54 #include <linux/bottom_half.h>
55 #include <linux/types.h>
56 #include <linux/fcntl.h>
57 #include <linux/module.h>
58 #include <linux/random.h>
59 #include <linux/cache.h>
60 #include <linux/jhash.h>
61 #include <linux/init.h>
62 #include <linux/times.h>
63 #include <linux/slab.h>
64 
65 #include <net/net_namespace.h>
66 #include <net/icmp.h>
67 #include <net/inet_hashtables.h>
68 #include <net/tcp.h>
69 #include <net/transp_v6.h>
70 #include <net/ipv6.h>
71 #include <net/inet_common.h>
72 #include <net/timewait_sock.h>
73 #include <net/xfrm.h>
74 #include <net/netdma.h>
75 
76 #include <linux/inet.h>
77 #include <linux/ipv6.h>
78 #include <linux/stddef.h>
79 #include <linux/proc_fs.h>
80 #include <linux/seq_file.h>
81 
82 #include <linux/crypto.h>
83 #include <linux/scatterlist.h>
84 
/*
 * Consulted by tcp_twsk_unique(): when a TIME-WAIT bucket pointer is
 * supplied, the bucket may only be taken over if this is non-zero.
 */
85 int sysctl_tcp_tw_reuse __read_mostly;
/*
 * Not read in this part of the file; exported for use elsewhere.
 * NOTE(review): semantics are not visible here - see the users.
 */
86 int sysctl_tcp_low_latency __read_mostly;
87 EXPORT_SYMBOL(sysctl_tcp_low_latency);
88 
89 
/*
 * The MD5 signature helpers are defined further down.  Without
 * CONFIG_TCP_MD5SIG the key lookup collapses to a stub that never finds
 * a key, so callers such as tcp_v4_reqsk_send_ack() need no #ifdef.
 */
90 #ifdef CONFIG_TCP_MD5SIG
91 static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
92 						   __be32 addr);
93 static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
94 			       __be32 daddr, __be32 saddr, struct tcphdr *th);
95 #else
96 static inline
97 struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
98 {
99 	return NULL;
100 }
101 #endif
102 
/* Socket hash tables; passed to inet_lookup() by tcp_v4_err(). */
103 struct inet_hashinfo tcp_hashinfo;
104 EXPORT_SYMBOL(tcp_hashinfo);
105 
106 static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
107 {
108 	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
109 					  ip_hdr(skb)->saddr,
110 					  tcp_hdr(skb)->dest,
111 					  tcp_hdr(skb)->source);
112 }
113 
114 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
115 {
116 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
117 	struct tcp_sock *tp = tcp_sk(sk);
118 
119 	/* With PAWS, it is safe from the viewpoint
120 	   of data integrity. Even without PAWS it is safe provided sequence
121 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
122 
123 	   Actually, the idea is close to VJ's one, only timestamp cache is
124 	   held not per host, but per port pair and TW bucket is used as state
125 	   holder.
126 
127 	   If TW bucket has been already destroyed we fall back to VJ's scheme
128 	   and use initial timestamp retrieved from peer table.
129 	 */
130 	if (tcptw->tw_ts_recent_stamp &&
131 	    (twp == NULL || (sysctl_tcp_tw_reuse &&
132 			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
133 		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
134 		if (tp->write_seq == 0)
135 			tp->write_seq = 1;
136 		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
137 		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
138 		sock_hold(sktw);
139 		return 1;
140 	}
141 
142 	return 0;
143 }
144 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
145 
146 /* This will initiate an outgoing connection. */
/*
 * tcp_v4_connect - start an active open on an IPv4 TCP socket
 * @sk:       socket to connect
 * @uaddr:    destination, interpreted as a struct sockaddr_in
 * @addr_len: size of the buffer behind @uaddr
 *
 * Resolves a route to the destination (or to the first hop of the socket's
 * source-route option), fills in the socket's addresses and ports, moves
 * the socket to TCP_SYN_SENT, hashes it with inet_hash_connect(), picks an
 * initial sequence number if none was inherited and calls tcp_connect().
 *
 * Returns 0 on success.  Returns -EINVAL for a too-short address or for a
 * zero destination combined with a source-route option, -EAFNOSUPPORT for
 * a family other than AF_INET, -ENETUNREACH for a multicast or broadcast
 * route, or the error reported by ip_route_connect(), inet_hash_connect(),
 * ip_route_newports() or tcp_connect().
 *
 * NOTE(review): the rcu_dereference_protected(..., sock_owned_by_user(sk))
 * below suggests the caller is expected to own the socket - confirm.
 */
147 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
148 {
149 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
150 	struct inet_sock *inet = inet_sk(sk);
151 	struct tcp_sock *tp = tcp_sk(sk);
152 	__be16 orig_sport, orig_dport;
153 	__be32 daddr, nexthop;
154 	struct flowi4 *fl4;
155 	struct rtable *rt;
156 	int err;
157 	struct ip_options_rcu *inet_opt;
158 
159 	if (addr_len < sizeof(struct sockaddr_in))
160 		return -EINVAL;
161 
162 	if (usin->sin_family != AF_INET)
163 		return -EAFNOSUPPORT;
164 
165 	nexthop = daddr = usin->sin_addr.s_addr;
166 	inet_opt = rcu_dereference_protected(inet->inet_opt,
167 					     sock_owned_by_user(sk));
	/* With a source-route option, route towards its first hop instead. */
168 	if (inet_opt && inet_opt->opt.srr) {
169 		if (!daddr)
170 			return -EINVAL;
171 		nexthop = inet_opt->opt.faddr;
172 	}
173 
174 	orig_sport = inet->inet_sport;
175 	orig_dport = usin->sin_port;
176 	fl4 = &inet->cork.fl.u.ip4;
177 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
178 			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
179 			      IPPROTO_TCP,
180 			      orig_sport, orig_dport, sk, true);
181 	if (IS_ERR(rt)) {
182 		err = PTR_ERR(rt);
183 		if (err == -ENETUNREACH)
184 			IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
185 		return err;
186 	}
187 
188 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
189 		ip_rt_put(rt);
190 		return -ENETUNREACH;
191 	}
192 
	/* Without source routing, use the destination the lookup settled on. */
193 	if (!inet_opt || !inet_opt->opt.srr)
194 		daddr = fl4->daddr;
195 
	/* Adopt the source address from the flow if the socket has none yet. */
196 	if (!inet->inet_saddr)
197 		inet->inet_saddr = fl4->saddr;
198 	inet->inet_rcv_saddr = inet->inet_saddr;
199 
200 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
201 		/* Reset inherited state */
202 		tp->rx_opt.ts_recent	   = 0;
203 		tp->rx_opt.ts_recent_stamp = 0;
204 		tp->write_seq		   = 0;
205 	}
206 
207 	if (tcp_death_row.sysctl_tw_recycle &&
208 	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) {
209 		struct inet_peer *peer = rt_get_peer(rt, fl4->daddr);
210 		/*
211 		 * VJ's idea. We save last timestamp seen from
212 		 * the destination in peer table, when entering state
213 		 * TIME-WAIT * and initialize rx_opt.ts_recent from it,
214 		 * when trying new connection.
215 		 */
216 		if (peer) {
217 			inet_peer_refcheck(peer);
218 			if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
219 				tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
220 				tp->rx_opt.ts_recent = peer->tcp_ts;
221 			}
222 		}
223 	}
224 
225 	inet->inet_dport = usin->sin_port;
226 	inet->inet_daddr = daddr;
227 
228 	inet_csk(sk)->icsk_ext_hdr_len = 0;
229 	if (inet_opt)
230 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
231 
232 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
233 
234 	/* Socket identity is still unknown (sport may be zero).
235 	 * However we set state to SYN-SENT and not releasing socket
236 	 * lock select source port, enter ourselves into the hash tables and
237 	 * complete initialization after this.
238 	 */
239 	tcp_set_state(sk, TCP_SYN_SENT);
240 	err = inet_hash_connect(&tcp_death_row, sk);
241 	if (err)
242 		goto failure;
243 
	/* Hand the original and the final ports to ip_route_newports(). */
244 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
245 			       inet->inet_sport, inet->inet_dport, sk);
246 	if (IS_ERR(rt)) {
247 		err = PTR_ERR(rt);
		/* Nothing valid to put on the failure path. */
248 		rt = NULL;
249 		goto failure;
250 	}
251 	/* OK, now commit destination to socket.  */
252 	sk->sk_gso_type = SKB_GSO_TCPV4;
253 	sk_setup_caps(sk, &rt->dst);
254 
	/*
	 * A non-zero write_seq was inherited (tcp_twsk_unique() sets one);
	 * only generate a fresh sequence number when there is none.
	 */
255 	if (!tp->write_seq)
256 		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
257 							   inet->inet_daddr,
258 							   inet->inet_sport,
259 							   usin->sin_port);
260 
261 	inet->inet_id = tp->write_seq ^ jiffies;
262 
263 	err = tcp_connect(sk);
	/*
	 * NOTE(review): presumably the route now belongs to the socket (see
	 * sk_setup_caps() above), so the local pointer is cleared to keep the
	 * failure path's ip_rt_put() from dropping it - confirm.
	 */
264 	rt = NULL;
265 	if (err)
266 		goto failure;
267 
268 	return 0;
269 
270 failure:
271 	/*
272 	 * This unhashes the socket and releases the local port,
273 	 * if necessary.
274 	 */
275 	tcp_set_state(sk, TCP_CLOSE);
276 	ip_rt_put(rt);
277 	sk->sk_route_caps = 0;
278 	inet->inet_dport = 0;
279 	return err;
280 }
281 EXPORT_SYMBOL(tcp_v4_connect);
282 
283 /*
284  * This routine does path mtu discovery as defined in RFC1191.
285  */
286 static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu)
287 {
288 	struct dst_entry *dst;
289 	struct inet_sock *inet = inet_sk(sk);
290 
291 	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
292 	 * send out by Linux are always <576bytes so they should go through
293 	 * unfragmented).
294 	 */
295 	if (sk->sk_state == TCP_LISTEN)
296 		return;
297 
298 	/* We don't check in the destentry if pmtu discovery is forbidden
299 	 * on this route. We just assume that no packet_to_big packets
300 	 * are send back when pmtu discovery is not active.
301 	 * There is a small race when the user changes this flag in the
302 	 * route, but I think that's acceptable.
303 	 */
304 	if ((dst = __sk_dst_check(sk, 0)) == NULL)
305 		return;
306 
307 	dst->ops->update_pmtu(dst, mtu);
308 
309 	/* Something is about to be wrong... Remember soft error
310 	 * for the case, if this connection will not able to recover.
311 	 */
312 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
313 		sk->sk_err_soft = EMSGSIZE;
314 
315 	mtu = dst_mtu(dst);
316 
317 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
318 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
319 		tcp_sync_mss(sk, mtu);
320 
321 		/* Resend the TCP packet because it's
322 		 * clear that the old packet has been
323 		 * dropped. This is the new "fast" path mtu
324 		 * discovery.
325 		 */
326 		tcp_simple_retransmit(sk);
327 	} /* else let the usual retransmit timer handle it */
328 }
329 
330 /*
331  * This routine is called by the ICMP module when it gets some
332  * sort of error condition.  If err < 0 then the socket should
333  * be closed and the error returned to the user.  If err > 0
334  * it's just the icmp type << 8 | icmp code.  After adjustment
335  * header points to the first 8 bytes of the tcp header.  We need
336  * to find the appropriate port.
337  *
338  * The locking strategy used here is very "optimistic". When
339  * someone else accesses the socket the ICMP is just dropped
340  * and for some paths there is no check at all.
341  * A more general error queue to queue errors for later handling
342  * is probably better.
343  *
344  */
345 
/*
 * tcp_v4_err - process an ICMP error that quotes a TCP segment
 * @icmp_skb: the ICMP packet; its data starts at the quoted IP header,
 *            followed by the beginning of the quoted TCP header
 * @info:     handed to do_pmtu_discovery() as the MTU for ICMP_FRAG_NEEDED
 *
 * Looks up the socket from the quoted addresses and ports, validates the
 * quoted sequence number, then either adjusts the path MTU, reverts one
 * step of RTO backoff, drops a pending connection request, or records the
 * error on the socket (hard or soft depending on state, socket ownership
 * and the recverr setting).  The socket reference taken by the lookup is
 * released before returning.
 */
346 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
347 {
348 	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
349 	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
350 	struct inet_connection_sock *icsk;
351 	struct tcp_sock *tp;
352 	struct inet_sock *inet;
353 	const int type = icmp_hdr(icmp_skb)->type;
354 	const int code = icmp_hdr(icmp_skb)->code;
355 	struct sock *sk;
356 	struct sk_buff *skb;
357 	__u32 seq;
358 	__u32 remaining;
359 	int err;
360 	struct net *net = dev_net(icmp_skb->dev);
361 
	/* Need the quoted IP header plus 8 bytes of the quoted TCP header. */
362 	if (icmp_skb->len < (iph->ihl << 2) + 8) {
363 		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
364 		return;
365 	}
366 
	/* The quoted packet was sent by us, so its destination is the peer. */
367 	sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
368 			iph->saddr, th->source, inet_iif(icmp_skb));
369 	if (!sk) {
370 		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
371 		return;
372 	}
	/* TIME_WAIT entries only need their reference dropped. */
373 	if (sk->sk_state == TCP_TIME_WAIT) {
374 		inet_twsk_put(inet_twsk(sk));
375 		return;
376 	}
377 
378 	bh_lock_sock(sk);
379 	/* If too many ICMPs get dropped on busy
380 	 * servers this needs to be solved differently.
381 	 */
	/* Only counted here; each path below re-checks ownership itself. */
382 	if (sock_owned_by_user(sk))
383 		NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
384 
385 	if (sk->sk_state == TCP_CLOSE)
386 		goto out;
387 
388 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
389 		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
390 		goto out;
391 	}
392 
393 	icsk = inet_csk(sk);
394 	tp = tcp_sk(sk);
395 	seq = ntohl(th->seq);
	/*
	 * The quoted sequence number must fall between snd_una and snd_nxt.
	 * Listeners are exempt here; they compare against the request's
	 * snt_isn further down.
	 */
396 	if (sk->sk_state != TCP_LISTEN &&
397 	    !between(seq, tp->snd_una, tp->snd_nxt)) {
398 		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
399 		goto out;
400 	}
401 
	/* Map the ICMP type/code to an errno, or handle it fully and leave. */
402 	switch (type) {
403 	case ICMP_SOURCE_QUENCH:
404 		/* Just silently ignore these. */
405 		goto out;
406 	case ICMP_PARAMETERPROB:
407 		err = EPROTO;
408 		break;
409 	case ICMP_DEST_UNREACH:
410 		if (code > NR_ICMP_UNREACH)
411 			goto out;
412 
413 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
414 			if (!sock_owned_by_user(sk))
415 				do_pmtu_discovery(sk, iph, info);
416 			goto out;
417 		}
418 
419 		err = icmp_err_convert[code].errno;
420 		/* check if icmp_skb allows revert of backoff
421 		 * (see draft-zimmermann-tcp-lcd) */
422 		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
423 			break;
		/*
		 * Revert only when the error quotes the oldest unacknowledged
		 * segment and a backed-off retransmission is in progress.
		 */
424 		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
425 		    !icsk->icsk_backoff)
426 			break;
427 
428 		if (sock_owned_by_user(sk))
429 			break;
430 
		/* Undo one backoff step and recompute the RTO from it. */
431 		icsk->icsk_backoff--;
432 		inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) <<
433 					 icsk->icsk_backoff;
434 		tcp_bound_rto(sk);
435 
436 		skb = tcp_write_queue_head(sk);
437 		BUG_ON(!skb);
438 
		/*
		 * Time left on the new RTO, measured from when the head of
		 * the write queue was sent; zero means it has already elapsed.
		 */
439 		remaining = icsk->icsk_rto - min(icsk->icsk_rto,
440 				tcp_time_stamp - TCP_SKB_CB(skb)->when);
441 
442 		if (remaining) {
443 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
444 						  remaining, TCP_RTO_MAX);
445 		} else {
446 			/* RTO revert clocked out retransmission.
447 			 * Will retransmit now */
448 			tcp_retransmit_timer(sk);
449 		}
450 
451 		break;
452 	case ICMP_TIME_EXCEEDED:
453 		err = EHOSTUNREACH;
454 		break;
455 	default:
456 		goto out;
457 	}
458 
	/* States that are not yet established are handled and leave here. */
459 	switch (sk->sk_state) {
460 		struct request_sock *req, **prev;
461 	case TCP_LISTEN:
462 		if (sock_owned_by_user(sk))
463 			goto out;
464 
465 		req = inet_csk_search_req(sk, &prev, th->dest,
466 					  iph->daddr, iph->saddr);
467 		if (!req)
468 			goto out;
469 
470 		/* ICMPs are not backlogged, hence we cannot get
471 		   an established socket here.
472 		 */
473 		WARN_ON(req->sk);
474 
475 		if (seq != tcp_rsk(req)->snt_isn) {
476 			NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
477 			goto out;
478 		}
479 
480 		/*
481 		 * Still in SYN_RECV, just remove it silently.
482 		 * There is no good way to pass the error to the newly
483 		 * created socket, and POSIX does not want network
484 		 * errors returned from accept().
485 		 */
486 		inet_csk_reqsk_queue_drop(sk, req, prev);
487 		goto out;
488 
489 	case TCP_SYN_SENT:
490 	case TCP_SYN_RECV:  /* Cannot happen.
491 			       It can f.e. if SYNs crossed.
492 			     */
		/*
		 * Hard error while connecting: report it and finish the
		 * socket, unless the user owns it, in which case only the
		 * soft error is recorded.
		 */
493 		if (!sock_owned_by_user(sk)) {
494 			sk->sk_err = err;
495 
496 			sk->sk_error_report(sk);
497 
498 			tcp_done(sk);
499 		} else {
500 			sk->sk_err_soft = err;
501 		}
502 		goto out;
503 	}
504 
505 	/* If we've already connected we will keep trying
506 	 * until we time out, or the user gives up.
507 	 *
508 	 * rfc1122 4.2.3.9 allows to consider as hard errors
509 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
510 	 * but it is obsoleted by pmtu discovery).
511 	 *
512 	 * Note, that in modern internet, where routing is unreliable
513 	 * and in each dark corner broken firewalls sit, sending random
514 	 * errors ordered by their masters even this two messages finally lose
515 	 * their original sense (even Linux sends invalid PORT_UNREACHs)
516 	 *
517 	 * Now we are in compliance with RFCs.
518 	 *							--ANK (980905)
519 	 */
520 
521 	inet = inet_sk(sk);
522 	if (!sock_owned_by_user(sk) && inet->recverr) {
523 		sk->sk_err = err;
524 		sk->sk_error_report(sk);
525 	} else	{ /* Only an error on timeout */
526 		sk->sk_err_soft = err;
527 	}
528 
529 out:
530 	bh_unlock_sock(sk);
531 	sock_put(sk);
532 }
533 
534 static void __tcp_v4_send_check(struct sk_buff *skb,
535 				__be32 saddr, __be32 daddr)
536 {
537 	struct tcphdr *th = tcp_hdr(skb);
538 
539 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
540 		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
541 		skb->csum_start = skb_transport_header(skb) - skb->head;
542 		skb->csum_offset = offsetof(struct tcphdr, check);
543 	} else {
544 		th->check = tcp_v4_check(skb->len, saddr, daddr,
545 					 csum_partial(th,
546 						      th->doff << 2,
547 						      skb->csum));
548 	}
549 }
550 
551 /* This routine computes an IPv4 TCP checksum. */
552 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
553 {
554 	struct inet_sock *inet = inet_sk(sk);
555 
556 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
557 }
558 EXPORT_SYMBOL(tcp_v4_send_check);
559 
560 int tcp_v4_gso_send_check(struct sk_buff *skb)
561 {
562 	const struct iphdr *iph;
563 	struct tcphdr *th;
564 
565 	if (!pskb_may_pull(skb, sizeof(*th)))
566 		return -EINVAL;
567 
568 	iph = ip_hdr(skb);
569 	th = tcp_hdr(skb);
570 
571 	th->check = 0;
572 	skb->ip_summed = CHECKSUM_PARTIAL;
573 	__tcp_v4_send_check(skb, iph->saddr, iph->daddr);
574 	return 0;
575 }
576 
577 /*
578  *	This routine will send an RST to the other tcp.
579  *
580  *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
581  *		      for reset.
582  *	Answer: if a packet caused RST, it is not for a socket
583  *		existing in our system, if it is matched to a socket,
584  *		it is just duplicate segment or bug in other side's TCP.
585  *		So that we build reply only basing on parameters
586  *		arrived with segment.
587  *	Exception: precedence violation. We do not implement it in any case.
588  */
589 
590 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
591 {
592 	struct tcphdr *th = tcp_hdr(skb);
593 	struct {
594 		struct tcphdr th;
595 #ifdef CONFIG_TCP_MD5SIG
596 		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
597 #endif
598 	} rep;
599 	struct ip_reply_arg arg;
600 #ifdef CONFIG_TCP_MD5SIG
601 	struct tcp_md5sig_key *key;
602 #endif
603 	struct net *net;
604 
605 	/* Never send a reset in response to a reset. */
606 	if (th->rst)
607 		return;
608 
609 	if (skb_rtable(skb)->rt_type != RTN_LOCAL)
610 		return;
611 
612 	/* Swap the send and the receive. */
613 	memset(&rep, 0, sizeof(rep));
614 	rep.th.dest   = th->source;
615 	rep.th.source = th->dest;
616 	rep.th.doff   = sizeof(struct tcphdr) / 4;
617 	rep.th.rst    = 1;
618 
619 	if (th->ack) {
620 		rep.th.seq = th->ack_seq;
621 	} else {
622 		rep.th.ack = 1;
623 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
624 				       skb->len - (th->doff << 2));
625 	}
626 
627 	memset(&arg, 0, sizeof(arg));
628 	arg.iov[0].iov_base = (unsigned char *)&rep;
629 	arg.iov[0].iov_len  = sizeof(rep.th);
630 
631 #ifdef CONFIG_TCP_MD5SIG
632 	key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
633 	if (key) {
634 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
635 				   (TCPOPT_NOP << 16) |
636 				   (TCPOPT_MD5SIG << 8) |
637 				   TCPOLEN_MD5SIG);
638 		/* Update length and the length the header thinks exists */
639 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
640 		rep.th.doff = arg.iov[0].iov_len / 4;
641 
642 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
643 				     key, ip_hdr(skb)->saddr,
644 				     ip_hdr(skb)->daddr, &rep.th);
645 	}
646 #endif
647 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
648 				      ip_hdr(skb)->saddr, /* XXX */
649 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
650 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
651 	arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
652 
653 	net = dev_net(skb_dst(skb)->dev);
654 	ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
655 		      &arg, arg.iov[0].iov_len);
656 
657 	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
658 	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
659 }
660 
661 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
662    outside socket context is ugly, certainly. What can I do?
663  */
664 
665 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
666 			    u32 win, u32 ts, int oif,
667 			    struct tcp_md5sig_key *key,
668 			    int reply_flags)
669 {
670 	struct tcphdr *th = tcp_hdr(skb);
671 	struct {
672 		struct tcphdr th;
673 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
674 #ifdef CONFIG_TCP_MD5SIG
675 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
676 #endif
677 			];
678 	} rep;
679 	struct ip_reply_arg arg;
680 	struct net *net = dev_net(skb_dst(skb)->dev);
681 
682 	memset(&rep.th, 0, sizeof(struct tcphdr));
683 	memset(&arg, 0, sizeof(arg));
684 
685 	arg.iov[0].iov_base = (unsigned char *)&rep;
686 	arg.iov[0].iov_len  = sizeof(rep.th);
687 	if (ts) {
688 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
689 				   (TCPOPT_TIMESTAMP << 8) |
690 				   TCPOLEN_TIMESTAMP);
691 		rep.opt[1] = htonl(tcp_time_stamp);
692 		rep.opt[2] = htonl(ts);
693 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
694 	}
695 
696 	/* Swap the send and the receive. */
697 	rep.th.dest    = th->source;
698 	rep.th.source  = th->dest;
699 	rep.th.doff    = arg.iov[0].iov_len / 4;
700 	rep.th.seq     = htonl(seq);
701 	rep.th.ack_seq = htonl(ack);
702 	rep.th.ack     = 1;
703 	rep.th.window  = htons(win);
704 
705 #ifdef CONFIG_TCP_MD5SIG
706 	if (key) {
707 		int offset = (ts) ? 3 : 0;
708 
709 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
710 					  (TCPOPT_NOP << 16) |
711 					  (TCPOPT_MD5SIG << 8) |
712 					  TCPOLEN_MD5SIG);
713 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
714 		rep.th.doff = arg.iov[0].iov_len/4;
715 
716 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
717 				    key, ip_hdr(skb)->saddr,
718 				    ip_hdr(skb)->daddr, &rep.th);
719 	}
720 #endif
721 	arg.flags = reply_flags;
722 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
723 				      ip_hdr(skb)->saddr, /* XXX */
724 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
725 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
726 	if (oif)
727 		arg.bound_dev_if = oif;
728 
729 	ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
730 		      &arg, arg.iov[0].iov_len);
731 
732 	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
733 }
734 
735 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
736 {
737 	struct inet_timewait_sock *tw = inet_twsk(sk);
738 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
739 
740 	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
741 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
742 			tcptw->tw_ts_recent,
743 			tw->tw_bound_dev_if,
744 			tcp_twsk_md5_key(tcptw),
745 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0
746 			);
747 
748 	inet_twsk_put(tw);
749 }
750 
751 static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
752 				  struct request_sock *req)
753 {
754 	tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
755 			tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
756 			req->ts_recent,
757 			0,
758 			tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr),
759 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0);
760 }
761 
762 /*
763  *	Send a SYN-ACK after having received a SYN.
764  *	This still operates on a request_sock only, not on a big
765  *	socket.
766  */
767 static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
768 			      struct request_sock *req,
769 			      struct request_values *rvp)
770 {
771 	const struct inet_request_sock *ireq = inet_rsk(req);
772 	struct flowi4 fl4;
773 	int err = -1;
774 	struct sk_buff * skb;
775 
776 	/* First, grab a route. */
777 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
778 		return -1;
779 
780 	skb = tcp_make_synack(sk, dst, req, rvp);
781 
782 	if (skb) {
783 		__tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
784 
785 		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
786 					    ireq->rmt_addr,
787 					    ireq->opt);
788 		err = net_xmit_eval(err);
789 	}
790 
791 	dst_release(dst);
792 	return err;
793 }
794 
795 static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
796 			      struct request_values *rvp)
797 {
798 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
799 	return tcp_v4_send_synack(sk, NULL, req, rvp);
800 }
801 
802 /*
803  *	IPv4 request_sock destructor.
804  */
805 static void tcp_v4_reqsk_destructor(struct request_sock *req)
806 {
807 	kfree(inet_rsk(req)->opt);
808 }
809 
810 static void syn_flood_warning(const struct sk_buff *skb)
811 {
812 	const char *msg;
813 
814 #ifdef CONFIG_SYN_COOKIES
815 	if (sysctl_tcp_syncookies)
816 		msg = "Sending cookies";
817 	else
818 #endif
819 		msg = "Dropping request";
820 
821 	pr_info("TCP: Possible SYN flooding on port %d. %s.\n",
822 				ntohs(tcp_hdr(skb)->dest), msg);
823 }
824 
825 /*
826  * Save and compile IPv4 options into the request_sock if needed.
827  */
828 static struct ip_options_rcu *tcp_v4_save_options(struct sock *sk,
829 						  struct sk_buff *skb)
830 {
831 	const struct ip_options *opt = &(IPCB(skb)->opt);
832 	struct ip_options_rcu *dopt = NULL;
833 
834 	if (opt && opt->optlen) {
835 		int opt_size = sizeof(*dopt) + opt->optlen;
836 
837 		dopt = kmalloc(opt_size, GFP_ATOMIC);
838 		if (dopt) {
839 			if (ip_options_echo(&dopt->opt, skb)) {
840 				kfree(dopt);
841 				dopt = NULL;
842 			}
843 		}
844 	}
845 	return dopt;
846 }
847 
848 #ifdef CONFIG_TCP_MD5SIG
849 /*
850  * RFC2385 MD5 checksumming requires a mapping of
851  * IP address->MD5 Key.
852  * We need to maintain these in the sk structure.
853  */
854 
855 /* Find the Key structure for an address.  */
856 static struct tcp_md5sig_key *
857 			tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
858 {
859 	struct tcp_sock *tp = tcp_sk(sk);
860 	int i;
861 
862 	if (!tp->md5sig_info || !tp->md5sig_info->entries4)
863 		return NULL;
864 	for (i = 0; i < tp->md5sig_info->entries4; i++) {
865 		if (tp->md5sig_info->keys4[i].addr == addr)
866 			return &tp->md5sig_info->keys4[i].base;
867 	}
868 	return NULL;
869 }
870 
871 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
872 					 struct sock *addr_sk)
873 {
874 	return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr);
875 }
876 EXPORT_SYMBOL(tcp_v4_md5_lookup);
877 
878 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
879 						      struct request_sock *req)
880 {
881 	return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
882 }
883 
884 /* This can be called on a newly created socket, from other files */
885 int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
886 		      u8 *newkey, u8 newkeylen)
887 {
888 	/* Add Key to the list */
889 	struct tcp_md5sig_key *key;
890 	struct tcp_sock *tp = tcp_sk(sk);
891 	struct tcp4_md5sig_key *keys;
892 
893 	key = tcp_v4_md5_do_lookup(sk, addr);
894 	if (key) {
895 		/* Pre-existing entry - just update that one. */
896 		kfree(key->key);
897 		key->key = newkey;
898 		key->keylen = newkeylen;
899 	} else {
900 		struct tcp_md5sig_info *md5sig;
901 
902 		if (!tp->md5sig_info) {
903 			tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
904 						  GFP_ATOMIC);
905 			if (!tp->md5sig_info) {
906 				kfree(newkey);
907 				return -ENOMEM;
908 			}
909 			sk_nocaps_add(sk, NETIF_F_GSO_MASK);
910 		}
911 		if (tcp_alloc_md5sig_pool(sk) == NULL) {
912 			kfree(newkey);
913 			return -ENOMEM;
914 		}
915 		md5sig = tp->md5sig_info;
916 
917 		if (md5sig->alloced4 == md5sig->entries4) {
918 			keys = kmalloc((sizeof(*keys) *
919 					(md5sig->entries4 + 1)), GFP_ATOMIC);
920 			if (!keys) {
921 				kfree(newkey);
922 				tcp_free_md5sig_pool();
923 				return -ENOMEM;
924 			}
925 
926 			if (md5sig->entries4)
927 				memcpy(keys, md5sig->keys4,
928 				       sizeof(*keys) * md5sig->entries4);
929 
930 			/* Free old key list, and reference new one */
931 			kfree(md5sig->keys4);
932 			md5sig->keys4 = keys;
933 			md5sig->alloced4++;
934 		}
935 		md5sig->entries4++;
936 		md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
937 		md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
938 		md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
939 	}
940 	return 0;
941 }
942 EXPORT_SYMBOL(tcp_v4_md5_do_add);
943 
944 static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
945 			       u8 *newkey, u8 newkeylen)
946 {
947 	return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->inet_daddr,
948 				 newkey, newkeylen);
949 }
950 
951 int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
952 {
953 	struct tcp_sock *tp = tcp_sk(sk);
954 	int i;
955 
956 	for (i = 0; i < tp->md5sig_info->entries4; i++) {
957 		if (tp->md5sig_info->keys4[i].addr == addr) {
958 			/* Free the key */
959 			kfree(tp->md5sig_info->keys4[i].base.key);
960 			tp->md5sig_info->entries4--;
961 
962 			if (tp->md5sig_info->entries4 == 0) {
963 				kfree(tp->md5sig_info->keys4);
964 				tp->md5sig_info->keys4 = NULL;
965 				tp->md5sig_info->alloced4 = 0;
966 			} else if (tp->md5sig_info->entries4 != i) {
967 				/* Need to do some manipulation */
968 				memmove(&tp->md5sig_info->keys4[i],
969 					&tp->md5sig_info->keys4[i+1],
970 					(tp->md5sig_info->entries4 - i) *
971 					 sizeof(struct tcp4_md5sig_key));
972 			}
973 			tcp_free_md5sig_pool();
974 			return 0;
975 		}
976 	}
977 	return -ENOENT;
978 }
979 EXPORT_SYMBOL(tcp_v4_md5_do_del);
980 
981 static void tcp_v4_clear_md5_list(struct sock *sk)
982 {
983 	struct tcp_sock *tp = tcp_sk(sk);
984 
985 	/* Free each key, then the set of key keys,
986 	 * the crypto element, and then decrement our
987 	 * hold on the last resort crypto.
988 	 */
989 	if (tp->md5sig_info->entries4) {
990 		int i;
991 		for (i = 0; i < tp->md5sig_info->entries4; i++)
992 			kfree(tp->md5sig_info->keys4[i].base.key);
993 		tp->md5sig_info->entries4 = 0;
994 		tcp_free_md5sig_pool();
995 	}
996 	if (tp->md5sig_info->keys4) {
997 		kfree(tp->md5sig_info->keys4);
998 		tp->md5sig_info->keys4 = NULL;
999 		tp->md5sig_info->alloced4  = 0;
1000 	}
1001 }
1002 
1003 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1004 				 int optlen)
1005 {
1006 	struct tcp_md5sig cmd;
1007 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1008 	u8 *newkey;
1009 
1010 	if (optlen < sizeof(cmd))
1011 		return -EINVAL;
1012 
1013 	if (copy_from_user(&cmd, optval, sizeof(cmd)))
1014 		return -EFAULT;
1015 
1016 	if (sin->sin_family != AF_INET)
1017 		return -EINVAL;
1018 
1019 	if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
1020 		if (!tcp_sk(sk)->md5sig_info)
1021 			return -ENOENT;
1022 		return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
1023 	}
1024 
1025 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1026 		return -EINVAL;
1027 
1028 	if (!tcp_sk(sk)->md5sig_info) {
1029 		struct tcp_sock *tp = tcp_sk(sk);
1030 		struct tcp_md5sig_info *p;
1031 
1032 		p = kzalloc(sizeof(*p), sk->sk_allocation);
1033 		if (!p)
1034 			return -EINVAL;
1035 
1036 		tp->md5sig_info = p;
1037 		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1038 	}
1039 
1040 	newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, sk->sk_allocation);
1041 	if (!newkey)
1042 		return -ENOMEM;
1043 	return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
1044 				 newkey, cmd.tcpm_keylen);
1045 }
1046 
1047 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1048 					__be32 daddr, __be32 saddr, int nbytes)
1049 {
1050 	struct tcp4_pseudohdr *bp;
1051 	struct scatterlist sg;
1052 
1053 	bp = &hp->md5_blk.ip4;
1054 
1055 	/*
1056 	 * 1. the TCP pseudo-header (in the order: source IP address,
1057 	 * destination IP address, zero-padded protocol number, and
1058 	 * segment length)
1059 	 */
1060 	bp->saddr = saddr;
1061 	bp->daddr = daddr;
1062 	bp->pad = 0;
1063 	bp->protocol = IPPROTO_TCP;
1064 	bp->len = cpu_to_be16(nbytes);
1065 
1066 	sg_init_one(&sg, bp, sizeof(*bp));
1067 	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1068 }
1069 
1070 static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
1071 			       __be32 daddr, __be32 saddr, struct tcphdr *th)
1072 {
1073 	struct tcp_md5sig_pool *hp;
1074 	struct hash_desc *desc;
1075 
1076 	hp = tcp_get_md5sig_pool();
1077 	if (!hp)
1078 		goto clear_hash_noput;
1079 	desc = &hp->md5_desc;
1080 
1081 	if (crypto_hash_init(desc))
1082 		goto clear_hash;
1083 	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1084 		goto clear_hash;
1085 	if (tcp_md5_hash_header(hp, th))
1086 		goto clear_hash;
1087 	if (tcp_md5_hash_key(hp, key))
1088 		goto clear_hash;
1089 	if (crypto_hash_final(desc, md5_hash))
1090 		goto clear_hash;
1091 
1092 	tcp_put_md5sig_pool();
1093 	return 0;
1094 
1095 clear_hash:
1096 	tcp_put_md5sig_pool();
1097 clear_hash_noput:
1098 	memset(md5_hash, 0, 16);
1099 	return 1;
1100 }
1101 
1102 int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1103 			struct sock *sk, struct request_sock *req,
1104 			struct sk_buff *skb)
1105 {
1106 	struct tcp_md5sig_pool *hp;
1107 	struct hash_desc *desc;
1108 	struct tcphdr *th = tcp_hdr(skb);
1109 	__be32 saddr, daddr;
1110 
1111 	if (sk) {
1112 		saddr = inet_sk(sk)->inet_saddr;
1113 		daddr = inet_sk(sk)->inet_daddr;
1114 	} else if (req) {
1115 		saddr = inet_rsk(req)->loc_addr;
1116 		daddr = inet_rsk(req)->rmt_addr;
1117 	} else {
1118 		const struct iphdr *iph = ip_hdr(skb);
1119 		saddr = iph->saddr;
1120 		daddr = iph->daddr;
1121 	}
1122 
1123 	hp = tcp_get_md5sig_pool();
1124 	if (!hp)
1125 		goto clear_hash_noput;
1126 	desc = &hp->md5_desc;
1127 
1128 	if (crypto_hash_init(desc))
1129 		goto clear_hash;
1130 
1131 	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1132 		goto clear_hash;
1133 	if (tcp_md5_hash_header(hp, th))
1134 		goto clear_hash;
1135 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1136 		goto clear_hash;
1137 	if (tcp_md5_hash_key(hp, key))
1138 		goto clear_hash;
1139 	if (crypto_hash_final(desc, md5_hash))
1140 		goto clear_hash;
1141 
1142 	tcp_put_md5sig_pool();
1143 	return 0;
1144 
1145 clear_hash:
1146 	tcp_put_md5sig_pool();
1147 clear_hash_noput:
1148 	memset(md5_hash, 0, 16);
1149 	return 1;
1150 }
1151 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1152 
1153 static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1154 {
1155 	/*
1156 	 * This gets called for each TCP segment that arrives
1157 	 * so we want to be efficient.
1158 	 * We have 3 drop cases:
1159 	 * o No MD5 hash and one expected.
1160 	 * o MD5 hash and we're not expecting one.
1161 	 * o MD5 hash and its wrong.
1162 	 */
1163 	__u8 *hash_location = NULL;
1164 	struct tcp_md5sig_key *hash_expected;
1165 	const struct iphdr *iph = ip_hdr(skb);
1166 	struct tcphdr *th = tcp_hdr(skb);
1167 	int genhash;
1168 	unsigned char newhash[16];
1169 
1170 	hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1171 	hash_location = tcp_parse_md5sig_option(th);
1172 
1173 	/* We've parsed the options - do we have a hash? */
1174 	if (!hash_expected && !hash_location)
1175 		return 0;
1176 
1177 	if (hash_expected && !hash_location) {
1178 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1179 		return 1;
1180 	}
1181 
1182 	if (!hash_expected && hash_location) {
1183 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1184 		return 1;
1185 	}
1186 
1187 	/* Okay, so this is hash_expected and hash_location -
1188 	 * so we need to calculate the checksum.
1189 	 */
1190 	genhash = tcp_v4_md5_hash_skb(newhash,
1191 				      hash_expected,
1192 				      NULL, NULL, skb);
1193 
1194 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1195 		if (net_ratelimit()) {
1196 			printk(KERN_INFO "MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1197 			       &iph->saddr, ntohs(th->source),
1198 			       &iph->daddr, ntohs(th->dest),
1199 			       genhash ? " tcp_v4_calc_md5_hash failed" : "");
1200 		}
1201 		return 1;
1202 	}
1203 	return 0;
1204 }
1205 
1206 #endif
1207 
1208 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1209 	.family		=	PF_INET,
1210 	.obj_size	=	sizeof(struct tcp_request_sock),
1211 	.rtx_syn_ack	=	tcp_v4_rtx_synack,
1212 	.send_ack	=	tcp_v4_reqsk_send_ack,
1213 	.destructor	=	tcp_v4_reqsk_destructor,
1214 	.send_reset	=	tcp_v4_send_reset,
1215 	.syn_ack_timeout = 	tcp_syn_ack_timeout,
1216 };
1217 
#ifdef CONFIG_TCP_MD5SIG
/*
 * MD5 signature hooks for IPv4 request socks; tcp_v4_conn_request()
 * installs this table as the request's af_specific pointer.
 */
static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.calc_md5_hash = tcp_v4_md5_hash_skb,
	.md5_lookup = tcp_v4_reqsk_md5_lookup,
};
#endif
1224 
/*
 * Process a connection request segment @skb that arrived on socket @sk.
 *
 * A request_sock is allocated, the TCP options of the segment are parsed
 * into it, an initial sequence number is chosen, a SYN-ACK is sent and,
 * unless syncookies are in use, the request is added to the socket's
 * request queue.
 *
 * Always returns 0.  On every drop path the request_sock (if one was
 * allocated) is freed; drop_and_release additionally releases @dst.
 */
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_extend_values tmp_ext;
	struct tcp_options_received tmp_opt;
	u8 *hash_location;
	struct request_sock *req;
	struct inet_request_sock *ireq;
	struct tcp_sock *tp = tcp_sk(sk);
	struct dst_entry *dst = NULL;
	__be32 saddr = ip_hdr(skb)->saddr;
	__be32 daddr = ip_hdr(skb)->daddr;
	__u32 isn = TCP_SKB_CB(skb)->when;
#ifdef CONFIG_SYN_COOKIES
	int want_cookie = 0;
#else
#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
#endif

	/* Never answer SYNs sent to broadcast or multicast addresses */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	/* TW buckets are converted to open requests without
	 * limitations, they conserve resources and peer is
	 * evidently real one.
	 */
	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
		if (net_ratelimit())
			syn_flood_warning(skb);
#ifdef CONFIG_SYN_COOKIES
		if (sysctl_tcp_syncookies) {
			want_cookie = 1;
		} else
#endif
		goto drop;
	}

	/* Accept backlog is full. If we have already queued enough
	 * of warm entries in syn queue, drop request. It is better than
	 * clogging syn queue with openreqs with exponentially increasing
	 * timeout.
	 */
	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
		goto drop;

	req = inet_reqsk_alloc(&tcp_request_sock_ops);
	if (!req)
		goto drop;

#ifdef CONFIG_TCP_MD5SIG
	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
#endif

	tcp_clear_options(&tmp_opt);
	tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
	tmp_opt.user_mss  = tp->rx_opt.user_mss;
	tcp_parse_options(skb, &tmp_opt, &hash_location, 0);

	/* TCP Cookie Transactions: three outcomes -
	 *  - a cookie option was received and cookies are enabled: build
	 *    the cookie material in tmp_ext,
	 *  - otherwise, if the socket does not insist on cookies: mark
	 *    tmp_ext as "no cookie",
	 *  - otherwise: drop the request.
	 */
	if (tmp_opt.cookie_plus > 0 &&
	    tmp_opt.saw_tstamp &&
	    !tp->rx_opt.cookie_out_never &&
	    (sysctl_tcp_cookie_size > 0 ||
	     (tp->cookie_values != NULL &&
	      tp->cookie_values->cookie_desired > 0))) {
		u8 *c;
		u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
		int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;

		if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
			goto drop_and_release;

		/* Secret recipe starts with IP addresses */
		*mess++ ^= (__force u32)daddr;
		*mess++ ^= (__force u32)saddr;

		/* plus variable length Initiator Cookie */
		c = (u8 *)mess;
		while (l-- > 0)
			*c++ ^= *hash_location++;

#ifdef CONFIG_SYN_COOKIES
		want_cookie = 0;	/* not our kind of cookie */
#endif
		tmp_ext.cookie_out_never = 0; /* false */
		tmp_ext.cookie_plus = tmp_opt.cookie_plus;
	} else if (!tp->rx_opt.cookie_in_always) {
		/* redundant indications, but ensure initialization. */
		tmp_ext.cookie_out_never = 1; /* true */
		tmp_ext.cookie_plus = 0;
	} else {
		goto drop_and_release;
	}
	tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;

	/* With syncookies and no timestamp, forget the parsed options. */
	if (want_cookie && !tmp_opt.saw_tstamp)
		tcp_clear_options(&tmp_opt);

	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
	tcp_openreq_init(req, &tmp_opt, skb);

	ireq = inet_rsk(req);
	ireq->loc_addr = daddr;
	ireq->rmt_addr = saddr;
	ireq->no_srccheck = inet_sk(sk)->transparent;
	ireq->opt = tcp_v4_save_options(sk, skb);

	if (security_inet_conn_request(sk, skb, req))
		goto drop_and_free;

	if (!want_cookie || tmp_opt.tstamp_ok)
		TCP_ECN_create_request(req, tcp_hdr(skb));

	if (want_cookie) {
		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
		req->cookie_ts = tmp_opt.tstamp_ok;
	} else if (!isn) {
		struct inet_peer *peer = NULL;
		struct flowi4 fl4;

		/* VJ's idea. We save last timestamp seen
		 * from the destination in peer table, when entering
		 * state TIME-WAIT, and check against it before
		 * accepting new connection request.
		 *
		 * If "isn" is not zero, this request hit alive
		 * timewait bucket, so that all the necessary checks
		 * are made in the function processing timewait state.
		 */
		if (tmp_opt.saw_tstamp &&
		    tcp_death_row.sysctl_tw_recycle &&
		    (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
		    fl4.daddr == saddr &&
		    (peer = rt_get_peer((struct rtable *)dst, fl4.daddr)) != NULL) {
			inet_peer_refcheck(peer);
			if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
			    (s32)(peer->tcp_ts - req->ts_recent) >
							TCP_PAWS_WINDOW) {
				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
				goto drop_and_release;
			}
		}
		/* Kill the following clause, if you dislike this way. */
		else if (!sysctl_tcp_syncookies &&
			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
			  (sysctl_max_syn_backlog >> 2)) &&
			 (!peer || !peer->tcp_ts_stamp) &&
			 (!dst || !dst_metric(dst, RTAX_RTT))) {
			/* Without syncookies last quarter of
			 * backlog is filled with destinations,
			 * proven to be alive.
			 * It means that we continue to communicate
			 * to destinations, already remembered
			 * to the moment of synflood.
			 */
			LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n",
				       &saddr, ntohs(tcp_hdr(skb)->source));
			goto drop_and_release;
		}

		isn = tcp_v4_init_sequence(skb);
	}
	tcp_rsk(req)->snt_isn = isn;

	/* When a syncookie was used the request is never queued: it is
	 * freed right after the SYN-ACK has been sent.
	 */
	if (tcp_v4_send_synack(sk, dst, req,
			       (struct request_values *)&tmp_ext) ||
	    want_cookie)
		goto drop_and_free;

	inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
	return 0;

drop_and_release:
	dst_release(dst);
drop_and_free:
	reqsk_free(req);
drop:
	return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);
1404 
1405 
/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 *
 * @sk:  the socket the request @req belongs to
 * @skb: the segment that completed the handshake
 * @req: the pending connection request
 * @dst: route for the new connection, or NULL to have one looked up
 *
 * Returns the newly created child socket, or NULL on failure.  Every
 * failure path increments LINUX_MIB_LISTENDROPS; a full accept queue
 * additionally increments LINUX_MIB_LISTENOVERFLOWS.
 */
struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	newinet->inet_daddr   = ireq->rmt_addr;
	newinet->inet_rcv_saddr = ireq->loc_addr;
	newinet->inet_saddr	      = ireq->loc_addr;
	/* The saved IP options move from the request to the child. */
	inet_opt	      = ireq->opt;
	rcu_assign_pointer(newinet->inet_opt, inet_opt);
	ireq->opt	      = NULL;
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	newinet->inet_id = newtp->write_seq ^ jiffies;

	/* Look up a route only if the caller did not supply one. */
	if (!dst && (dst = inet_csk_route_child_sock(sk, newsk, req)) == NULL)
		goto put_and_exit;

	sk_setup_caps(newsk, dst);

	tcp_mtup_init(newsk);
	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = dst_metric_advmss(dst);
	/* A user-set MSS on the parent lowers the advertised MSS. */
	if (tcp_sk(sk)->rx_opt.user_mss &&
	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	/* Copy over the MD5 key from the original socket */
	key = tcp_v4_md5_do_lookup(sk, newinet->inet_daddr);
	if (key != NULL) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
		if (newkey != NULL)
			tcp_v4_md5_do_add(newsk, newinet->inet_daddr,
					  newkey, key->keylen);
		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	__inet_hash_nolisten(newsk, NULL);

	return newsk;

exit_overflow:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
	return NULL;
put_and_exit:
	/* NOTE(review): only the reference on newsk is dropped here.
	 * Confirm that nothing set up by tcp_create_openreq_child()
	 * (e.g. a lock or timers on the child) needs undoing first.
	 */
	sock_put(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1498 
1499 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1500 {
1501 	struct tcphdr *th = tcp_hdr(skb);
1502 	const struct iphdr *iph = ip_hdr(skb);
1503 	struct sock *nsk;
1504 	struct request_sock **prev;
1505 	/* Find possible connection requests. */
1506 	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1507 						       iph->saddr, iph->daddr);
1508 	if (req)
1509 		return tcp_check_req(sk, skb, req, prev);
1510 
1511 	nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1512 			th->source, iph->daddr, th->dest, inet_iif(skb));
1513 
1514 	if (nsk) {
1515 		if (nsk->sk_state != TCP_TIME_WAIT) {
1516 			bh_lock_sock(nsk);
1517 			return nsk;
1518 		}
1519 		inet_twsk_put(inet_twsk(nsk));
1520 		return NULL;
1521 	}
1522 
1523 #ifdef CONFIG_SYN_COOKIES
1524 	if (!th->syn)
1525 		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1526 #endif
1527 	return sk;
1528 }
1529 
1530 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1531 {
1532 	const struct iphdr *iph = ip_hdr(skb);
1533 
1534 	if (skb->ip_summed == CHECKSUM_COMPLETE) {
1535 		if (!tcp_v4_check(skb->len, iph->saddr,
1536 				  iph->daddr, skb->csum)) {
1537 			skb->ip_summed = CHECKSUM_UNNECESSARY;
1538 			return 0;
1539 		}
1540 	}
1541 
1542 	skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1543 				       skb->len, IPPROTO_TCP, 0);
1544 
1545 	if (skb->len <= 76) {
1546 		return __skb_checksum_complete(skb);
1547 	}
1548 	return 0;
1549 }
1550 
1551 
1552 /* The socket must have it's spinlock held when we get
1553  * here.
1554  *
1555  * We have a potential double-lock case here, so even when
1556  * doing backlog processing we use the BH locking scheme.
1557  * This is because we cannot sleep with the original spinlock
1558  * held.
1559  */
1560 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1561 {
1562 	struct sock *rsk;
1563 #ifdef CONFIG_TCP_MD5SIG
1564 	/*
1565 	 * We really want to reject the packet as early as possible
1566 	 * if:
1567 	 *  o We're expecting an MD5'd packet and this is no MD5 tcp option
1568 	 *  o There is an MD5 option and we're not expecting one
1569 	 */
1570 	if (tcp_v4_inbound_md5_hash(sk, skb))
1571 		goto discard;
1572 #endif
1573 
1574 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1575 		sock_rps_save_rxhash(sk, skb->rxhash);
1576 		if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1577 			rsk = sk;
1578 			goto reset;
1579 		}
1580 		return 0;
1581 	}
1582 
1583 	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1584 		goto csum_err;
1585 
1586 	if (sk->sk_state == TCP_LISTEN) {
1587 		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1588 		if (!nsk)
1589 			goto discard;
1590 
1591 		if (nsk != sk) {
1592 			if (tcp_child_process(sk, nsk, skb)) {
1593 				rsk = nsk;
1594 				goto reset;
1595 			}
1596 			return 0;
1597 		}
1598 	} else
1599 		sock_rps_save_rxhash(sk, skb->rxhash);
1600 
1601 	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1602 		rsk = sk;
1603 		goto reset;
1604 	}
1605 	return 0;
1606 
1607 reset:
1608 	tcp_v4_send_reset(rsk, skb);
1609 discard:
1610 	kfree_skb(skb);
1611 	/* Be careful here. If this function gets more complicated and
1612 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1613 	 * might be destroyed here. This current version compiles correctly,
1614 	 * but you have been warned.
1615 	 */
1616 	return 0;
1617 
1618 csum_err:
1619 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1620 	goto discard;
1621 }
1622 EXPORT_SYMBOL(tcp_v4_do_rcv);
1623 
/*
 *	From tcp_input.c
 *
 * Receive one TCP segment @skb: validate the header and checksum, fill
 * in TCP_SKB_CB(skb), look up the owning socket and pass the segment to
 * it - directly through tcp_v4_do_rcv(), through the prequeue, or onto
 * the socket backlog when the socket is owned by user context.
 *
 * Returns the result of tcp_v4_do_rcv() when that is called, 0
 * otherwise.  Every discard path frees @skb, and every path that found
 * a socket drops the reference obtained by the lookup.
 */

int tcp_v4_rcv(struct sk_buff *skb)
{
	const struct iphdr *iph;
	struct tcphdr *th;
	struct sock *sk;
	int ret;
	struct net *net = dev_net(skb->dev);

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = tcp_hdr(skb);

	/* The data offset must cover at least the fixed TCP header. */
	if (th->doff < sizeof(struct tcphdr) / 4)
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */
	if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
		goto bad_packet;

	/* Header pointers are fetched again after the pulls above. */
	th = tcp_hdr(skb);
	iph = ip_hdr(skb);
	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->when	 = 0;
	TCP_SKB_CB(skb)->flags	 = iph->tos;
	TCP_SKB_CB(skb)->sacked	 = 0;

	sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
		goto discard_and_relse;
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;
	nf_reset(skb);

	if (sk_filter(sk, skb))
		goto discard_and_relse;

	skb->dev = NULL;

	bh_lock_sock_nested(sk);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
#ifdef CONFIG_NET_DMA
		struct tcp_sock *tp = tcp_sk(sk);
		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
			tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
		if (tp->ucopy.dma_chan)
			ret = tcp_v4_do_rcv(sk, skb);
		else
#endif
		{
			/* Process now unless the prequeue took the skb. */
			if (!tcp_prequeue(sk, skb))
				ret = tcp_v4_do_rcv(sk, skb);
		}
	} else if (unlikely(sk_add_backlog(sk, skb))) {
		/* Backlog refused the skb: unlock, count and drop. */
		bh_unlock_sock(sk);
		NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
		goto discard_and_relse;
	}
	bh_unlock_sock(sk);

	sock_put(sk);

	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	/* No socket: count malformed segments, answer the rest with a
	 * reset.  bad_packet is also reached from the header checks above.
	 */
	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
bad_packet:
		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sock_put(sk);
	goto discard_it;

do_time_wait:
	/* sk is a timewait socket here; its reference is dropped with
	 * inet_twsk_put() on the error paths below.
	 */
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		/* If a listener exists for the destination, retire the
		 * timewait socket and restart processing on the listener.
		 */
		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
							&tcp_hashinfo,
							iph->daddr, th->dest,
							inet_iif(skb));
		if (sk2) {
			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
			inet_twsk_put(inet_twsk(sk));
			sk = sk2;
			goto process;
		}
		/* Fall through to ACK */
	}
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		goto no_tcp_socket;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}
1771 
1772 struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it)
1773 {
1774 	struct rtable *rt = (struct rtable *) __sk_dst_get(sk);
1775 	struct inet_sock *inet = inet_sk(sk);
1776 	struct inet_peer *peer;
1777 
1778 	if (!rt ||
1779 	    inet->cork.fl.u.ip4.daddr != inet->inet_daddr) {
1780 		peer = inet_getpeer_v4(inet->inet_daddr, 1);
1781 		*release_it = true;
1782 	} else {
1783 		if (!rt->peer)
1784 			rt_bind_peer(rt, inet->inet_daddr, 1);
1785 		peer = rt->peer;
1786 		*release_it = false;
1787 	}
1788 
1789 	return peer;
1790 }
1791 EXPORT_SYMBOL(tcp_v4_get_peer);
1792 
1793 void *tcp_v4_tw_get_peer(struct sock *sk)
1794 {
1795 	struct inet_timewait_sock *tw = inet_twsk(sk);
1796 
1797 	return inet_getpeer_v4(tw->tw_daddr, 1);
1798 }
1799 EXPORT_SYMBOL(tcp_v4_tw_get_peer);
1800 
1801 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1802 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
1803 	.twsk_unique	= tcp_twsk_unique,
1804 	.twsk_destructor= tcp_twsk_destructor,
1805 	.twsk_getpeer	= tcp_v4_tw_get_peer,
1806 };
1807 
1808 const struct inet_connection_sock_af_ops ipv4_specific = {
1809 	.queue_xmit	   = ip_queue_xmit,
1810 	.send_check	   = tcp_v4_send_check,
1811 	.rebuild_header	   = inet_sk_rebuild_header,
1812 	.conn_request	   = tcp_v4_conn_request,
1813 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
1814 	.get_peer	   = tcp_v4_get_peer,
1815 	.net_header_len	   = sizeof(struct iphdr),
1816 	.setsockopt	   = ip_setsockopt,
1817 	.getsockopt	   = ip_getsockopt,
1818 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
1819 	.sockaddr_len	   = sizeof(struct sockaddr_in),
1820 	.bind_conflict	   = inet_csk_bind_conflict,
1821 #ifdef CONFIG_COMPAT
1822 	.compat_setsockopt = compat_ip_setsockopt,
1823 	.compat_getsockopt = compat_ip_getsockopt,
1824 #endif
1825 };
1826 EXPORT_SYMBOL(ipv4_specific);
1827 
#ifdef CONFIG_TCP_MD5SIG
/*
 * MD5 signature operations for IPv4 TCP sockets; installed as
 * tp->af_specific by tcp_v4_init_sock().
 */
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_parse = tcp_v4_parse_md5_keys,
	.md5_add = tcp_v4_md5_add_func,
	.md5_lookup = tcp_v4_md5_lookup,
	.calc_md5_hash = tcp_v4_md5_hash_skb,
};
#endif
1836 
1837 /* NOTE: A lot of things set to zero explicitly by call to
1838  *       sk_alloc() so need not be done here.
1839  */
1840 static int tcp_v4_init_sock(struct sock *sk)
1841 {
1842 	struct inet_connection_sock *icsk = inet_csk(sk);
1843 	struct tcp_sock *tp = tcp_sk(sk);
1844 
1845 	skb_queue_head_init(&tp->out_of_order_queue);
1846 	tcp_init_xmit_timers(sk);
1847 	tcp_prequeue_init(tp);
1848 
1849 	icsk->icsk_rto = TCP_TIMEOUT_INIT;
1850 	tp->mdev = TCP_TIMEOUT_INIT;
1851 
1852 	/* So many TCP implementations out there (incorrectly) count the
1853 	 * initial SYN frame in their delayed-ACK and congestion control
1854 	 * algorithms that we must have the following bandaid to talk
1855 	 * efficiently to them.  -DaveM
1856 	 */
1857 	tp->snd_cwnd = 2;
1858 
1859 	/* See draft-stevens-tcpca-spec-01 for discussion of the
1860 	 * initialization of these values.
1861 	 */
1862 	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
1863 	tp->snd_cwnd_clamp = ~0;
1864 	tp->mss_cache = TCP_MSS_DEFAULT;
1865 
1866 	tp->reordering = sysctl_tcp_reordering;
1867 	icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1868 
1869 	sk->sk_state = TCP_CLOSE;
1870 
1871 	sk->sk_write_space = sk_stream_write_space;
1872 	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1873 
1874 	icsk->icsk_af_ops = &ipv4_specific;
1875 	icsk->icsk_sync_mss = tcp_sync_mss;
1876 #ifdef CONFIG_TCP_MD5SIG
1877 	tp->af_specific = &tcp_sock_ipv4_specific;
1878 #endif
1879 
1880 	/* TCP Cookie Transactions */
1881 	if (sysctl_tcp_cookie_size > 0) {
1882 		/* Default, cookies without s_data_payload. */
1883 		tp->cookie_values =
1884 			kzalloc(sizeof(*tp->cookie_values),
1885 				sk->sk_allocation);
1886 		if (tp->cookie_values != NULL)
1887 			kref_init(&tp->cookie_values->kref);
1888 	}
1889 	/* Presumed zeroed, in order of appearance:
1890 	 *	cookie_in_always, cookie_out_never,
1891 	 *	s_data_constant, s_data_in, s_data_out
1892 	 */
1893 	sk->sk_sndbuf = sysctl_tcp_wmem[1];
1894 	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1895 
1896 	local_bh_disable();
1897 	percpu_counter_inc(&tcp_sockets_allocated);
1898 	local_bh_enable();
1899 
1900 	return 0;
1901 }
1902 
1903 void tcp_v4_destroy_sock(struct sock *sk)
1904 {
1905 	struct tcp_sock *tp = tcp_sk(sk);
1906 
1907 	tcp_clear_xmit_timers(sk);
1908 
1909 	tcp_cleanup_congestion_control(sk);
1910 
1911 	/* Cleanup up the write buffer. */
1912 	tcp_write_queue_purge(sk);
1913 
1914 	/* Cleans up our, hopefully empty, out_of_order_queue. */
1915 	__skb_queue_purge(&tp->out_of_order_queue);
1916 
1917 #ifdef CONFIG_TCP_MD5SIG
1918 	/* Clean up the MD5 key list, if any */
1919 	if (tp->md5sig_info) {
1920 		tcp_v4_clear_md5_list(sk);
1921 		kfree(tp->md5sig_info);
1922 		tp->md5sig_info = NULL;
1923 	}
1924 #endif
1925 
1926 #ifdef CONFIG_NET_DMA
1927 	/* Cleans up our sk_async_wait_queue */
1928 	__skb_queue_purge(&sk->sk_async_wait_queue);
1929 #endif
1930 
1931 	/* Clean prequeue, it must be empty really */
1932 	__skb_queue_purge(&tp->ucopy.prequeue);
1933 
1934 	/* Clean up a referenced TCP bind bucket. */
1935 	if (inet_csk(sk)->icsk_bind_hash)
1936 		inet_put_port(sk);
1937 
1938 	/*
1939 	 * If sendmsg cached page exists, toss it.
1940 	 */
1941 	if (sk->sk_sndmsg_page) {
1942 		__free_page(sk->sk_sndmsg_page);
1943 		sk->sk_sndmsg_page = NULL;
1944 	}
1945 
1946 	/* TCP Cookie Transactions */
1947 	if (tp->cookie_values != NULL) {
1948 		kref_put(&tp->cookie_values->kref,
1949 			 tcp_cookie_values_release);
1950 		tp->cookie_values = NULL;
1951 	}
1952 
1953 	percpu_counter_dec(&tcp_sockets_allocated);
1954 }
1955 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1956 
1957 #ifdef CONFIG_PROC_FS
1958 /* Proc filesystem TCP sock list dumping. */
1959 
1960 static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
1961 {
1962 	return hlist_nulls_empty(head) ? NULL :
1963 		list_entry(head->first, struct inet_timewait_sock, tw_node);
1964 }
1965 
1966 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1967 {
1968 	return !is_a_nulls(tw->tw_node.next) ?
1969 		hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1970 }
1971 
/*
 * Get the next listener socket following cur.  If cur is NULL, get the
 * first socket starting from the bucket given in st->bucket; when
 * st->bucket is zero the very first socket in the hash table is returned.
 *
 * The iterator also walks the pending connection requests of each
 * listener: while st->state is TCP_SEQ_STATE_OPENREQ, cur and the return
 * value are request_socks of st->syn_wait_sk rather than sockets.
 *
 * Locking, as visible in this function:
 *  - the lock of the current listening bucket is taken before a bucket
 *    is walked and is still held when an entry is returned; it is
 *    released only when the bucket is exhausted;
 *  - when a request_sock is returned, the syn_wait_lock of its listener
 *    is additionally held for reading.
 * Returns NULL, with no lock taken here left held, once every bucket has
 * been walked.
 */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct inet_connection_sock *icsk;
	struct hlist_nulls_node *node;
	struct sock *sk = cur;
	struct inet_listen_hashbucket *ilb;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	/* Fresh start: lock the bucket and begin at its head. */
	if (!sk) {
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock_bh(&ilb->lock);
		sk = sk_nulls_head(&ilb->head);
		st->offset = 0;
		goto get_sk;
	}
	ilb = &tcp_hashinfo.listening_hash[st->bucket];
	++st->num;
	++st->offset;

	if (st->state == TCP_SEQ_STATE_OPENREQ) {
		/* cur is a request_sock: continue in the request table. */
		struct request_sock *req = cur;

		icsk = inet_csk(st->syn_wait_sk);
		req = req->dl_next;
		while (1) {
			while (req) {
				if (req->rsk_ops->family == st->family) {
					cur = req;
					goto out;
				}
				req = req->dl_next;
			}
			if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
				break;
get_req:
			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
		}
		/* Request table exhausted: back to walking listeners. */
		sk	  = sk_nulls_next(st->syn_wait_sk);
		st->state = TCP_SEQ_STATE_LISTENING;
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
	} else {
		/* cur is a listener: visit its pending requests, if any,
		 * before moving on to the next socket.
		 */
		icsk = inet_csk(sk);
		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		if (reqsk_queue_len(&icsk->icsk_accept_queue))
			goto start_req;
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		sk = sk_nulls_next(sk);
	}
get_sk:
	sk_nulls_for_each_from(sk, node) {
		if (!net_eq(sock_net(sk), net))
			continue;
		if (sk->sk_family == st->family) {
			cur = sk;
			goto out;
		}
		/* Different family: the socket itself is skipped, but its
		 * pending requests are still inspected.
		 */
		icsk = inet_csk(sk);
		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
start_req:
			st->uid		= sock_i_uid(sk);
			st->syn_wait_sk = sk;
			st->state	= TCP_SEQ_STATE_OPENREQ;
			st->sbucket	= 0;
			goto get_req;
		}
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
	}
	/* Bucket exhausted: unlock it and advance to the next one. */
	spin_unlock_bh(&ilb->lock);
	st->offset = 0;
	if (++st->bucket < INET_LHTABLE_SIZE) {
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock_bh(&ilb->lock);
		sk = sk_nulls_head(&ilb->head);
		goto get_sk;
	}
	cur = NULL;
out:
	return cur;
}
2058 
2059 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2060 {
2061 	struct tcp_iter_state *st = seq->private;
2062 	void *rc;
2063 
2064 	st->bucket = 0;
2065 	st->offset = 0;
2066 	rc = listening_get_next(seq, NULL);
2067 
2068 	while (rc && *pos) {
2069 		rc = listening_get_next(seq, rc);
2070 		--*pos;
2071 	}
2072 	return rc;
2073 }
2074 
2075 static inline int empty_bucket(struct tcp_iter_state *st)
2076 {
2077 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2078 		hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
2079 }
2080 
2081 /*
2082  * Get first established socket starting from bucket given in st->bucket.
2083  * If st->bucket is zero, the very first socket in the hash is returned.
2084  */
2085 static void *established_get_first(struct seq_file *seq)
2086 {
2087 	struct tcp_iter_state *st = seq->private;
2088 	struct net *net = seq_file_net(seq);
2089 	void *rc = NULL;
2090 
2091 	st->offset = 0;
2092 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2093 		struct sock *sk;
2094 		struct hlist_nulls_node *node;
2095 		struct inet_timewait_sock *tw;
2096 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2097 
2098 		/* Lockless fast path for the common case of empty buckets */
2099 		if (empty_bucket(st))
2100 			continue;
2101 
2102 		spin_lock_bh(lock);
2103 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2104 			if (sk->sk_family != st->family ||
2105 			    !net_eq(sock_net(sk), net)) {
2106 				continue;
2107 			}
2108 			rc = sk;
2109 			goto out;
2110 		}
2111 		st->state = TCP_SEQ_STATE_TIME_WAIT;
2112 		inet_twsk_for_each(tw, node,
2113 				   &tcp_hashinfo.ehash[st->bucket].twchain) {
2114 			if (tw->tw_family != st->family ||
2115 			    !net_eq(twsk_net(tw), net)) {
2116 				continue;
2117 			}
2118 			rc = tw;
2119 			goto out;
2120 		}
2121 		spin_unlock_bh(lock);
2122 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2123 	}
2124 out:
2125 	return rc;
2126 }
2127 
2128 static void *established_get_next(struct seq_file *seq, void *cur)
2129 {
2130 	struct sock *sk = cur;
2131 	struct inet_timewait_sock *tw;
2132 	struct hlist_nulls_node *node;
2133 	struct tcp_iter_state *st = seq->private;
2134 	struct net *net = seq_file_net(seq);
2135 
2136 	++st->num;
2137 	++st->offset;
2138 
2139 	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2140 		tw = cur;
2141 		tw = tw_next(tw);
2142 get_tw:
2143 		while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2144 			tw = tw_next(tw);
2145 		}
2146 		if (tw) {
2147 			cur = tw;
2148 			goto out;
2149 		}
2150 		spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2151 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2152 
2153 		/* Look for next non empty bucket */
2154 		st->offset = 0;
2155 		while (++st->bucket <= tcp_hashinfo.ehash_mask &&
2156 				empty_bucket(st))
2157 			;
2158 		if (st->bucket > tcp_hashinfo.ehash_mask)
2159 			return NULL;
2160 
2161 		spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2162 		sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2163 	} else
2164 		sk = sk_nulls_next(sk);
2165 
2166 	sk_nulls_for_each_from(sk, node) {
2167 		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2168 			goto found;
2169 	}
2170 
2171 	st->state = TCP_SEQ_STATE_TIME_WAIT;
2172 	tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2173 	goto get_tw;
2174 found:
2175 	cur = sk;
2176 out:
2177 	return cur;
2178 }
2179 
2180 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2181 {
2182 	struct tcp_iter_state *st = seq->private;
2183 	void *rc;
2184 
2185 	st->bucket = 0;
2186 	rc = established_get_first(seq);
2187 
2188 	while (rc && pos) {
2189 		rc = established_get_next(seq, rc);
2190 		--pos;
2191 	}
2192 	return rc;
2193 }
2194 
2195 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2196 {
2197 	void *rc;
2198 	struct tcp_iter_state *st = seq->private;
2199 
2200 	st->state = TCP_SEQ_STATE_LISTENING;
2201 	rc	  = listening_get_idx(seq, &pos);
2202 
2203 	if (!rc) {
2204 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2205 		rc	  = established_get_idx(seq, pos);
2206 	}
2207 
2208 	return rc;
2209 }
2210 
2211 static void *tcp_seek_last_pos(struct seq_file *seq)
2212 {
2213 	struct tcp_iter_state *st = seq->private;
2214 	int offset = st->offset;
2215 	int orig_num = st->num;
2216 	void *rc = NULL;
2217 
2218 	switch (st->state) {
2219 	case TCP_SEQ_STATE_OPENREQ:
2220 	case TCP_SEQ_STATE_LISTENING:
2221 		if (st->bucket >= INET_LHTABLE_SIZE)
2222 			break;
2223 		st->state = TCP_SEQ_STATE_LISTENING;
2224 		rc = listening_get_next(seq, NULL);
2225 		while (offset-- && rc)
2226 			rc = listening_get_next(seq, rc);
2227 		if (rc)
2228 			break;
2229 		st->bucket = 0;
2230 		/* Fallthrough */
2231 	case TCP_SEQ_STATE_ESTABLISHED:
2232 	case TCP_SEQ_STATE_TIME_WAIT:
2233 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2234 		if (st->bucket > tcp_hashinfo.ehash_mask)
2235 			break;
2236 		rc = established_get_first(seq);
2237 		while (offset-- && rc)
2238 			rc = established_get_next(seq, rc);
2239 	}
2240 
2241 	st->num = orig_num;
2242 
2243 	return rc;
2244 }
2245 
2246 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2247 {
2248 	struct tcp_iter_state *st = seq->private;
2249 	void *rc;
2250 
2251 	if (*pos && *pos == st->last_pos) {
2252 		rc = tcp_seek_last_pos(seq);
2253 		if (rc)
2254 			goto out;
2255 	}
2256 
2257 	st->state = TCP_SEQ_STATE_LISTENING;
2258 	st->num = 0;
2259 	st->bucket = 0;
2260 	st->offset = 0;
2261 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2262 
2263 out:
2264 	st->last_pos = *pos;
2265 	return rc;
2266 }
2267 
2268 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2269 {
2270 	struct tcp_iter_state *st = seq->private;
2271 	void *rc = NULL;
2272 
2273 	if (v == SEQ_START_TOKEN) {
2274 		rc = tcp_get_idx(seq, 0);
2275 		goto out;
2276 	}
2277 
2278 	switch (st->state) {
2279 	case TCP_SEQ_STATE_OPENREQ:
2280 	case TCP_SEQ_STATE_LISTENING:
2281 		rc = listening_get_next(seq, v);
2282 		if (!rc) {
2283 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2284 			st->bucket = 0;
2285 			st->offset = 0;
2286 			rc	  = established_get_first(seq);
2287 		}
2288 		break;
2289 	case TCP_SEQ_STATE_ESTABLISHED:
2290 	case TCP_SEQ_STATE_TIME_WAIT:
2291 		rc = established_get_next(seq, v);
2292 		break;
2293 	}
2294 out:
2295 	++*pos;
2296 	st->last_pos = *pos;
2297 	return rc;
2298 }
2299 
2300 static void tcp_seq_stop(struct seq_file *seq, void *v)
2301 {
2302 	struct tcp_iter_state *st = seq->private;
2303 
2304 	switch (st->state) {
2305 	case TCP_SEQ_STATE_OPENREQ:
2306 		if (v) {
2307 			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2308 			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2309 		}
2310 	case TCP_SEQ_STATE_LISTENING:
2311 		if (v != SEQ_START_TOKEN)
2312 			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2313 		break;
2314 	case TCP_SEQ_STATE_TIME_WAIT:
2315 	case TCP_SEQ_STATE_ESTABLISHED:
2316 		if (v)
2317 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2318 		break;
2319 	}
2320 }
2321 
2322 static int tcp_seq_open(struct inode *inode, struct file *file)
2323 {
2324 	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2325 	struct tcp_iter_state *s;
2326 	int err;
2327 
2328 	err = seq_open_net(inode, file, &afinfo->seq_ops,
2329 			  sizeof(struct tcp_iter_state));
2330 	if (err < 0)
2331 		return err;
2332 
2333 	s = ((struct seq_file *)file->private_data)->private;
2334 	s->family		= afinfo->family;
2335 	s->last_pos 		= 0;
2336 	return 0;
2337 }
2338 
2339 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2340 {
2341 	int rc = 0;
2342 	struct proc_dir_entry *p;
2343 
2344 	afinfo->seq_fops.open		= tcp_seq_open;
2345 	afinfo->seq_fops.read		= seq_read;
2346 	afinfo->seq_fops.llseek		= seq_lseek;
2347 	afinfo->seq_fops.release	= seq_release_net;
2348 
2349 	afinfo->seq_ops.start		= tcp_seq_start;
2350 	afinfo->seq_ops.next		= tcp_seq_next;
2351 	afinfo->seq_ops.stop		= tcp_seq_stop;
2352 
2353 	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2354 			     &afinfo->seq_fops, afinfo);
2355 	if (!p)
2356 		rc = -ENOMEM;
2357 	return rc;
2358 }
2359 EXPORT_SYMBOL(tcp_proc_register);
2360 
2361 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2362 {
2363 	proc_net_remove(net, afinfo->name);
2364 }
2365 EXPORT_SYMBOL(tcp_proc_unregister);
2366 
2367 static void get_openreq4(struct sock *sk, struct request_sock *req,
2368 			 struct seq_file *f, int i, int uid, int *len)
2369 {
2370 	const struct inet_request_sock *ireq = inet_rsk(req);
2371 	int ttd = req->expires - jiffies;
2372 
2373 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2374 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n",
2375 		i,
2376 		ireq->loc_addr,
2377 		ntohs(inet_sk(sk)->inet_sport),
2378 		ireq->rmt_addr,
2379 		ntohs(ireq->rmt_port),
2380 		TCP_SYN_RECV,
2381 		0, 0, /* could print option size, but that is af dependent. */
2382 		1,    /* timers active (only the expire timer) */
2383 		jiffies_to_clock_t(ttd),
2384 		req->retrans,
2385 		uid,
2386 		0,  /* non standard timer */
2387 		0, /* open_requests have no inode */
2388 		atomic_read(&sk->sk_refcnt),
2389 		req,
2390 		len);
2391 }
2392 
2393 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2394 {
2395 	int timer_active;
2396 	unsigned long timer_expires;
2397 	struct tcp_sock *tp = tcp_sk(sk);
2398 	const struct inet_connection_sock *icsk = inet_csk(sk);
2399 	struct inet_sock *inet = inet_sk(sk);
2400 	__be32 dest = inet->inet_daddr;
2401 	__be32 src = inet->inet_rcv_saddr;
2402 	__u16 destp = ntohs(inet->inet_dport);
2403 	__u16 srcp = ntohs(inet->inet_sport);
2404 	int rx_queue;
2405 
2406 	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2407 		timer_active	= 1;
2408 		timer_expires	= icsk->icsk_timeout;
2409 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2410 		timer_active	= 4;
2411 		timer_expires	= icsk->icsk_timeout;
2412 	} else if (timer_pending(&sk->sk_timer)) {
2413 		timer_active	= 2;
2414 		timer_expires	= sk->sk_timer.expires;
2415 	} else {
2416 		timer_active	= 0;
2417 		timer_expires = jiffies;
2418 	}
2419 
2420 	if (sk->sk_state == TCP_LISTEN)
2421 		rx_queue = sk->sk_ack_backlog;
2422 	else
2423 		/*
2424 		 * because we dont lock socket, we might find a transient negative value
2425 		 */
2426 		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2427 
2428 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2429 			"%08X %5d %8d %lu %d %pK %lu %lu %u %u %d%n",
2430 		i, src, srcp, dest, destp, sk->sk_state,
2431 		tp->write_seq - tp->snd_una,
2432 		rx_queue,
2433 		timer_active,
2434 		jiffies_to_clock_t(timer_expires - jiffies),
2435 		icsk->icsk_retransmits,
2436 		sock_i_uid(sk),
2437 		icsk->icsk_probes_out,
2438 		sock_i_ino(sk),
2439 		atomic_read(&sk->sk_refcnt), sk,
2440 		jiffies_to_clock_t(icsk->icsk_rto),
2441 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2442 		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2443 		tp->snd_cwnd,
2444 		tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh,
2445 		len);
2446 }
2447 
2448 static void get_timewait4_sock(struct inet_timewait_sock *tw,
2449 			       struct seq_file *f, int i, int *len)
2450 {
2451 	__be32 dest, src;
2452 	__u16 destp, srcp;
2453 	int ttd = tw->tw_ttd - jiffies;
2454 
2455 	if (ttd < 0)
2456 		ttd = 0;
2457 
2458 	dest  = tw->tw_daddr;
2459 	src   = tw->tw_rcv_saddr;
2460 	destp = ntohs(tw->tw_dport);
2461 	srcp  = ntohs(tw->tw_sport);
2462 
2463 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2464 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n",
2465 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2466 		3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2467 		atomic_read(&tw->tw_refcnt), tw, len);
2468 }
2469 
2470 #define TMPSZ 150
2471 
2472 static int tcp4_seq_show(struct seq_file *seq, void *v)
2473 {
2474 	struct tcp_iter_state *st;
2475 	int len;
2476 
2477 	if (v == SEQ_START_TOKEN) {
2478 		seq_printf(seq, "%-*s\n", TMPSZ - 1,
2479 			   "  sl  local_address rem_address   st tx_queue "
2480 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2481 			   "inode");
2482 		goto out;
2483 	}
2484 	st = seq->private;
2485 
2486 	switch (st->state) {
2487 	case TCP_SEQ_STATE_LISTENING:
2488 	case TCP_SEQ_STATE_ESTABLISHED:
2489 		get_tcp4_sock(v, seq, st->num, &len);
2490 		break;
2491 	case TCP_SEQ_STATE_OPENREQ:
2492 		get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2493 		break;
2494 	case TCP_SEQ_STATE_TIME_WAIT:
2495 		get_timewait4_sock(v, seq, st->num, &len);
2496 		break;
2497 	}
2498 	seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2499 out:
2500 	return 0;
2501 }
2502 
2503 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2504 	.name		= "tcp",
2505 	.family		= AF_INET,
2506 	.seq_fops	= {
2507 		.owner		= THIS_MODULE,
2508 	},
2509 	.seq_ops	= {
2510 		.show		= tcp4_seq_show,
2511 	},
2512 };
2513 
2514 static int __net_init tcp4_proc_init_net(struct net *net)
2515 {
2516 	return tcp_proc_register(net, &tcp4_seq_afinfo);
2517 }
2518 
2519 static void __net_exit tcp4_proc_exit_net(struct net *net)
2520 {
2521 	tcp_proc_unregister(net, &tcp4_seq_afinfo);
2522 }
2523 
2524 static struct pernet_operations tcp4_net_ops = {
2525 	.init = tcp4_proc_init_net,
2526 	.exit = tcp4_proc_exit_net,
2527 };
2528 
2529 int __init tcp4_proc_init(void)
2530 {
2531 	return register_pernet_subsys(&tcp4_net_ops);
2532 }
2533 
2534 void tcp4_proc_exit(void)
2535 {
2536 	unregister_pernet_subsys(&tcp4_net_ops);
2537 }
2538 #endif /* CONFIG_PROC_FS */
2539 
2540 struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2541 {
2542 	const struct iphdr *iph = skb_gro_network_header(skb);
2543 
2544 	switch (skb->ip_summed) {
2545 	case CHECKSUM_COMPLETE:
2546 		if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2547 				  skb->csum)) {
2548 			skb->ip_summed = CHECKSUM_UNNECESSARY;
2549 			break;
2550 		}
2551 
2552 		/* fall through */
2553 	case CHECKSUM_NONE:
2554 		NAPI_GRO_CB(skb)->flush = 1;
2555 		return NULL;
2556 	}
2557 
2558 	return tcp_gro_receive(head, skb);
2559 }
2560 
2561 int tcp4_gro_complete(struct sk_buff *skb)
2562 {
2563 	const struct iphdr *iph = ip_hdr(skb);
2564 	struct tcphdr *th = tcp_hdr(skb);
2565 
2566 	th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2567 				  iph->saddr, iph->daddr, 0);
2568 	skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2569 
2570 	return tcp_gro_complete(skb);
2571 }
2572 
2573 struct proto tcp_prot = {
2574 	.name			= "TCP",
2575 	.owner			= THIS_MODULE,
2576 	.close			= tcp_close,
2577 	.connect		= tcp_v4_connect,
2578 	.disconnect		= tcp_disconnect,
2579 	.accept			= inet_csk_accept,
2580 	.ioctl			= tcp_ioctl,
2581 	.init			= tcp_v4_init_sock,
2582 	.destroy		= tcp_v4_destroy_sock,
2583 	.shutdown		= tcp_shutdown,
2584 	.setsockopt		= tcp_setsockopt,
2585 	.getsockopt		= tcp_getsockopt,
2586 	.recvmsg		= tcp_recvmsg,
2587 	.sendmsg		= tcp_sendmsg,
2588 	.sendpage		= tcp_sendpage,
2589 	.backlog_rcv		= tcp_v4_do_rcv,
2590 	.hash			= inet_hash,
2591 	.unhash			= inet_unhash,
2592 	.get_port		= inet_csk_get_port,
2593 	.enter_memory_pressure	= tcp_enter_memory_pressure,
2594 	.sockets_allocated	= &tcp_sockets_allocated,
2595 	.orphan_count		= &tcp_orphan_count,
2596 	.memory_allocated	= &tcp_memory_allocated,
2597 	.memory_pressure	= &tcp_memory_pressure,
2598 	.sysctl_mem		= sysctl_tcp_mem,
2599 	.sysctl_wmem		= sysctl_tcp_wmem,
2600 	.sysctl_rmem		= sysctl_tcp_rmem,
2601 	.max_header		= MAX_TCP_HEADER,
2602 	.obj_size		= sizeof(struct tcp_sock),
2603 	.slab_flags		= SLAB_DESTROY_BY_RCU,
2604 	.twsk_prot		= &tcp_timewait_sock_ops,
2605 	.rsk_prot		= &tcp_request_sock_ops,
2606 	.h.hashinfo		= &tcp_hashinfo,
2607 	.no_autobind		= true,
2608 #ifdef CONFIG_COMPAT
2609 	.compat_setsockopt	= compat_tcp_setsockopt,
2610 	.compat_getsockopt	= compat_tcp_getsockopt,
2611 #endif
2612 };
2613 EXPORT_SYMBOL(tcp_prot);
2614 
2615 
2616 static int __net_init tcp_sk_init(struct net *net)
2617 {
2618 	return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2619 				    PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2620 }
2621 
2622 static void __net_exit tcp_sk_exit(struct net *net)
2623 {
2624 	inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2625 }
2626 
2627 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2628 {
2629 	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2630 }
2631 
2632 static struct pernet_operations __net_initdata tcp_sk_ops = {
2633        .init	   = tcp_sk_init,
2634        .exit	   = tcp_sk_exit,
2635        .exit_batch = tcp_sk_exit_batch,
2636 };
2637 
2638 void __init tcp_v4_init(void)
2639 {
2640 	inet_hashinfo_init(&tcp_hashinfo);
2641 	if (register_pernet_subsys(&tcp_sk_ops))
2642 		panic("Failed to create the TCP control socket.\n");
2643 }
2644