xref: /linux/net/ipv4/tcp_ipv4.c (revision 80d443e8876602be2c130f79c4de81e12e2a700d)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Implementation of the Transmission Control Protocol(TCP).
7  *
8  *		IPv4 specific functions
9  *
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  *
18  *	This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23 
24 /*
25  * Changes:
26  *		David S. Miller	:	New socket lookup architecture.
27  *					This code is dedicated to John Dyson.
28  *		David S. Miller :	Change semantics of established hash,
29  *					half is devoted to TIME_WAIT sockets
30  *					and the rest go in the other half.
31  *		Andi Kleen :		Add support for syncookies and fixed
32  *					some bugs: ip options weren't passed to
33  *					the TCP layer, missed a check for an
34  *					ACK bit.
35  *		Andi Kleen :		Implemented fast path mtu discovery.
36  *	     				Fixed many serious bugs in the
37  *					request_sock handling and moved
38  *					most of it into the af independent code.
39  *					Added tail drop and some other bugfixes.
40  *					Added new listen semantics.
41  *		Mike McLagan	:	Routing by source
42  *	Juan Jose Ciarlante:		ip_dynaddr bits
43  *		Andi Kleen:		various fixes.
44  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
45  *					coma.
46  *	Andi Kleen		:	Fix new listen.
47  *	Andi Kleen		:	Fix accept error reporting.
48  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
49  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
50  *					a single port at the same time.
51  */
52 
53 #define pr_fmt(fmt) "TCP: " fmt
54 
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65 
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/secure_seq.h>
76 #include <net/busy_poll.h>
77 
78 #include <linux/inet.h>
79 #include <linux/ipv6.h>
80 #include <linux/stddef.h>
81 #include <linux/proc_fs.h>
82 #include <linux/seq_file.h>
83 
84 #include <crypto/hash.h>
85 #include <linux/scatterlist.h>
86 
87 int sysctl_tcp_tw_reuse __read_mostly;
88 int sysctl_tcp_low_latency __read_mostly;
89 
90 #ifdef CONFIG_TCP_MD5SIG
91 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
92 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
93 #endif
94 
95 struct inet_hashinfo tcp_hashinfo;
96 EXPORT_SYMBOL(tcp_hashinfo);
97 
98 static u32 tcp_v4_init_sequence(const struct sk_buff *skb, u32 *tsoff)
99 {
100 	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
101 					  ip_hdr(skb)->saddr,
102 					  tcp_hdr(skb)->dest,
103 					  tcp_hdr(skb)->source, tsoff);
104 }
105 
106 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
107 {
108 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
109 	struct tcp_sock *tp = tcp_sk(sk);
110 
111 	/* With PAWS, it is safe from the viewpoint
112 	   of data integrity. Even without PAWS it is safe provided sequence
113 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
114 
115 	   Actually, the idea is close to VJ's, only the timestamp cache is
116 	   held not per host but per port pair, and the TW bucket is used as
117 	   the state holder.
118 
119 	   If the TW bucket has already been destroyed we fall back to VJ's
120 	   scheme and use the initial timestamp retrieved from the peer table.
121 	 */
122 	if (tcptw->tw_ts_recent_stamp &&
123 	    (!twp || (sysctl_tcp_tw_reuse &&
124 			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
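		/* Start the new connection's write_seq far enough above the old
		 * incarnation's snd_nxt that the two sequence spaces cannot
		 * overlap, so stray old segments are never taken as new data.
		 */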
125 		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
126 		if (tp->write_seq == 0)
127 			tp->write_seq = 1;
128 		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
129 		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
130 		sock_hold(sktw);
131 		return 1;
132 	}
133 
134 	return 0;
135 }
136 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
137 
138 /* This will initiate an outgoing connection. */
139 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
140 {
141 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
142 	struct inet_sock *inet = inet_sk(sk);
143 	struct tcp_sock *tp = tcp_sk(sk);
144 	__be16 orig_sport, orig_dport;
145 	__be32 daddr, nexthop;
146 	struct flowi4 *fl4;
147 	struct rtable *rt;
148 	int err;
149 	struct ip_options_rcu *inet_opt;
150 
151 	if (addr_len < sizeof(struct sockaddr_in))
152 		return -EINVAL;
153 
154 	if (usin->sin_family != AF_INET)
155 		return -EAFNOSUPPORT;
156 
157 	nexthop = daddr = usin->sin_addr.s_addr;
158 	inet_opt = rcu_dereference_protected(inet->inet_opt,
159 					     lockdep_sock_is_held(sk));
160 	if (inet_opt && inet_opt->opt.srr) {
161 		if (!daddr)
162 			return -EINVAL;
163 		nexthop = inet_opt->opt.faddr;
164 	}
165 
166 	orig_sport = inet->inet_sport;
167 	orig_dport = usin->sin_port;
168 	fl4 = &inet->cork.fl.u.ip4;
169 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
170 			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
171 			      IPPROTO_TCP,
172 			      orig_sport, orig_dport, sk);
173 	if (IS_ERR(rt)) {
174 		err = PTR_ERR(rt);
175 		if (err == -ENETUNREACH)
176 			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
177 		return err;
178 	}
179 
180 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
181 		ip_rt_put(rt);
182 		return -ENETUNREACH;
183 	}
184 
185 	if (!inet_opt || !inet_opt->opt.srr)
186 		daddr = fl4->daddr;
187 
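	/* If no source address was bound, adopt the one chosen by the route. */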
188 	if (!inet->inet_saddr)
189 		inet->inet_saddr = fl4->saddr;
190 	sk_rcv_saddr_set(sk, inet->inet_saddr);
191 
192 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
193 		/* Reset inherited state */
194 		tp->rx_opt.ts_recent	   = 0;
195 		tp->rx_opt.ts_recent_stamp = 0;
196 		if (likely(!tp->repair))
197 			tp->write_seq	   = 0;
198 	}
199 
200 	if (tcp_death_row.sysctl_tw_recycle &&
201 	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
202 		tcp_fetch_timewait_stamp(sk, &rt->dst);
203 
204 	inet->inet_dport = usin->sin_port;
205 	sk_daddr_set(sk, daddr);
206 
207 	inet_csk(sk)->icsk_ext_hdr_len = 0;
208 	if (inet_opt)
209 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
210 
211 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
212 
213 	/* Socket identity is still unknown (sport may be zero).
214 	 * However we set the state to SYN-SENT and, without releasing the
215 	 * socket lock, select a source port, enter ourselves into the hash
216 	 * tables and complete initialization after this.
217 	 */
218 	tcp_set_state(sk, TCP_SYN_SENT);
219 	err = inet_hash_connect(&tcp_death_row, sk);
220 	if (err)
221 		goto failure;
222 
223 	sk_set_txhash(sk);
224 
225 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
226 			       inet->inet_sport, inet->inet_dport, sk);
227 	if (IS_ERR(rt)) {
228 		err = PTR_ERR(rt);
229 		rt = NULL;
230 		goto failure;
231 	}
232 	/* OK, now commit destination to socket.  */
233 	sk->sk_gso_type = SKB_GSO_TCPV4;
234 	sk_setup_caps(sk, &rt->dst);
235 
236 	if (!tp->write_seq && likely(!tp->repair))
237 		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
238 							   inet->inet_daddr,
239 							   inet->inet_sport,
240 							   usin->sin_port,
241 							   &tp->tsoffset);
242 
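	/* Seed the IP ID counter from the initial sequence number and jiffies
	 * so consecutive connections do not start from the same value.
	 */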
243 	inet->inet_id = tp->write_seq ^ jiffies;
244 
245 	err = tcp_connect(sk);
246 
247 	rt = NULL;
248 	if (err)
249 		goto failure;
250 
251 	return 0;
252 
253 failure:
254 	/*
255 	 * This unhashes the socket and releases the local port,
256 	 * if necessary.
257 	 */
258 	tcp_set_state(sk, TCP_CLOSE);
259 	ip_rt_put(rt);
260 	sk->sk_route_caps = 0;
261 	inet->inet_dport = 0;
262 	return err;
263 }
264 EXPORT_SYMBOL(tcp_v4_connect);
265 
266 /*
267  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
268  * It can be called through tcp_release_cb() if socket was owned by user
269  * at the time tcp_v4_err() was called to handle ICMP message.
270  */
271 void tcp_v4_mtu_reduced(struct sock *sk)
272 {
273 	struct dst_entry *dst;
274 	struct inet_sock *inet = inet_sk(sk);
275 	u32 mtu = tcp_sk(sk)->mtu_info;
276 
277 	dst = inet_csk_update_pmtu(sk, mtu);
278 	if (!dst)
279 		return;
280 
281 	/* Something is about to go wrong... Remember the soft error
282 	 * in case this connection is not able to recover.
283 	 */
284 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
285 		sk->sk_err_soft = EMSGSIZE;
286 
287 	mtu = dst_mtu(dst);
288 
289 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
290 	    ip_sk_accept_pmtu(sk) &&
291 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
292 		tcp_sync_mss(sk, mtu);
293 
294 		/* Resend the TCP packet because it's
295 		 * clear that the old packet has been
296 		 * dropped. This is the new "fast" path mtu
297 		 * discovery.
298 		 */
299 		tcp_simple_retransmit(sk);
300 	} /* else let the usual retransmit timer handle it */
301 }
302 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
303 
304 static void do_redirect(struct sk_buff *skb, struct sock *sk)
305 {
306 	struct dst_entry *dst = __sk_dst_check(sk, 0);
307 
308 	if (dst)
309 		dst->ops->redirect(dst, sk, skb);
310 }
311 
312 
313 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
314 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
315 {
316 	struct request_sock *req = inet_reqsk(sk);
317 	struct net *net = sock_net(sk);
318 
319 	/* ICMPs are not backlogged, hence we cannot get
320 	 * an established socket here.
321 	 */
322 	if (seq != tcp_rsk(req)->snt_isn) {
323 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
324 	} else if (abort) {
325 		/*
326 		 * Still in SYN_RECV, just remove it silently.
327 		 * There is no good way to pass the error to the newly
328 		 * created socket, and POSIX does not want network
329 		 * errors returned from accept().
330 		 */
331 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
332 		tcp_listendrop(req->rsk_listener);
333 	}
334 	reqsk_put(req);
335 }
336 EXPORT_SYMBOL(tcp_req_err);
337 
338 /*
339  * This routine is called by the ICMP module when it gets some
340  * sort of error condition.  If err < 0 then the socket should
341  * be closed and the error returned to the user.  If err > 0
342  * it's just the icmp type << 8 | icmp code.  After adjustment
343  * header points to the first 8 bytes of the tcp header.  We need
344  * to find the appropriate port.
345  *
346  * The locking strategy used here is very "optimistic". When
347  * someone else accesses the socket the ICMP is just dropped
348  * and for some paths there is no check at all.
349  * A more general error queue to queue errors for later handling
350  * is probably better.
351  *
352  */
353 
354 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
355 {
356 	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
357 	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
358 	struct inet_connection_sock *icsk;
359 	struct tcp_sock *tp;
360 	struct inet_sock *inet;
361 	const int type = icmp_hdr(icmp_skb)->type;
362 	const int code = icmp_hdr(icmp_skb)->code;
363 	struct sock *sk;
364 	struct sk_buff *skb;
365 	struct request_sock *fastopen;
366 	__u32 seq, snd_una;
367 	__u32 remaining;
368 	int err;
369 	struct net *net = dev_net(icmp_skb->dev);
370 
371 	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
372 				       th->dest, iph->saddr, ntohs(th->source),
373 				       inet_iif(icmp_skb));
374 	if (!sk) {
375 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
376 		return;
377 	}
378 	if (sk->sk_state == TCP_TIME_WAIT) {
379 		inet_twsk_put(inet_twsk(sk));
380 		return;
381 	}
382 	seq = ntohl(th->seq);
383 	if (sk->sk_state == TCP_NEW_SYN_RECV)
384 		return tcp_req_err(sk, seq,
385 				  type == ICMP_PARAMETERPROB ||
386 				  type == ICMP_TIME_EXCEEDED ||
387 				  (type == ICMP_DEST_UNREACH &&
388 				   (code == ICMP_NET_UNREACH ||
389 				    code == ICMP_HOST_UNREACH)));
390 
391 	bh_lock_sock(sk);
392 	/* If too many ICMPs get dropped on busy
393 	 * servers this needs to be solved differently.
394 	 * We do take care of the PMTU discovery (RFC1191) special case:
395 	 * we can receive locally generated ICMP messages while socket is held.
396 	 */
397 	if (sock_owned_by_user(sk)) {
398 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
399 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
400 	}
401 	if (sk->sk_state == TCP_CLOSE)
402 		goto out;
403 
404 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
405 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
406 		goto out;
407 	}
408 
409 	icsk = inet_csk(sk);
410 	tp = tcp_sk(sk);
411 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
412 	fastopen = tp->fastopen_rsk;
413 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
414 	if (sk->sk_state != TCP_LISTEN &&
415 	    !between(seq, snd_una, tp->snd_nxt)) {
416 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
417 		goto out;
418 	}
419 
420 	switch (type) {
421 	case ICMP_REDIRECT:
422 		do_redirect(icmp_skb, sk);
423 		goto out;
424 	case ICMP_SOURCE_QUENCH:
425 		/* Just silently ignore these. */
426 		goto out;
427 	case ICMP_PARAMETERPROB:
428 		err = EPROTO;
429 		break;
430 	case ICMP_DEST_UNREACH:
431 		if (code > NR_ICMP_UNREACH)
432 			goto out;
433 
434 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
435 			/* We are not interested in TCP_LISTEN and open_requests
436 			 * (SYN-ACKs sent out by Linux are always <576 bytes so
437 			 * they should go through unfragmented).
438 			 */
439 			if (sk->sk_state == TCP_LISTEN)
440 				goto out;
441 
442 			tp->mtu_info = info;
443 			if (!sock_owned_by_user(sk)) {
444 				tcp_v4_mtu_reduced(sk);
445 			} else {
446 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
447 					sock_hold(sk);
448 			}
449 			goto out;
450 		}
451 
452 		err = icmp_err_convert[code].errno;
453 		/* check if icmp_skb allows revert of backoff
454 		 * (see draft-zimmermann-tcp-lcd) */
455 		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
456 			break;
457 		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
458 		    !icsk->icsk_backoff || fastopen)
459 			break;
460 
461 		if (sock_owned_by_user(sk))
462 			break;
463 
464 		icsk->icsk_backoff--;
465 		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
466 					       TCP_TIMEOUT_INIT;
467 		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
468 
469 		skb = tcp_write_queue_head(sk);
470 		BUG_ON(!skb);
471 
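		/* How much of the reverted RTO is still outstanding, counted
		 * from when the head of the write queue was sent.
		 */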
472 		remaining = icsk->icsk_rto -
473 			    min(icsk->icsk_rto,
474 				tcp_time_stamp - tcp_skb_timestamp(skb));
475 
476 		if (remaining) {
477 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
478 						  remaining, TCP_RTO_MAX);
479 		} else {
480 			/* The reverted RTO has already expired, so
481 			 * retransmit now. */
482 			tcp_retransmit_timer(sk);
483 		}
484 
485 		break;
486 	case ICMP_TIME_EXCEEDED:
487 		err = EHOSTUNREACH;
488 		break;
489 	default:
490 		goto out;
491 	}
492 
493 	switch (sk->sk_state) {
494 	case TCP_SYN_SENT:
495 	case TCP_SYN_RECV:
496 		/* Only in fast or simultaneous open. If a fast open socket
497 		 * is already accepted it is treated as a connected one below.
498 		 */
499 		if (fastopen && !fastopen->sk)
500 			break;
501 
502 		if (!sock_owned_by_user(sk)) {
503 			sk->sk_err = err;
504 
505 			sk->sk_error_report(sk);
506 
507 			tcp_done(sk);
508 		} else {
509 			sk->sk_err_soft = err;
510 		}
511 		goto out;
512 	}
513 
514 	/* If we've already connected we will keep trying
515 	 * until we time out, or the user gives up.
516 	 *
517 	 * rfc1122 4.2.3.9 allows us to consider as hard errors
518 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
519 	 * but it is obsoleted by pmtu discovery).
520 	 *
521 	 * Note that in the modern internet, where routing is unreliable
522 	 * and broken firewalls sit in every dark corner sending random
523 	 * errors ordered by their masters, even these two messages finally lose
524 	 * their original sense (even Linux sends invalid PORT_UNREACHs).
525 	 *
526 	 * Now we are in compliance with RFCs.
527 	 *							--ANK (980905)
528 	 */
529 
530 	inet = inet_sk(sk);
531 	if (!sock_owned_by_user(sk) && inet->recverr) {
532 		sk->sk_err = err;
533 		sk->sk_error_report(sk);
534 	} else	{ /* Only an error on timeout */
535 		sk->sk_err_soft = err;
536 	}
537 
538 out:
539 	bh_unlock_sock(sk);
540 	sock_put(sk);
541 }
542 
543 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
544 {
545 	struct tcphdr *th = tcp_hdr(skb);
546 
547 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
548 		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
549 		skb->csum_start = skb_transport_header(skb) - skb->head;
550 		skb->csum_offset = offsetof(struct tcphdr, check);
551 	} else {
552 		th->check = tcp_v4_check(skb->len, saddr, daddr,
553 					 csum_partial(th,
554 						      th->doff << 2,
555 						      skb->csum));
556 	}
557 }
558 
559 /* This routine computes an IPv4 TCP checksum. */
560 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
561 {
562 	const struct inet_sock *inet = inet_sk(sk);
563 
564 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
565 }
566 EXPORT_SYMBOL(tcp_v4_send_check);
567 
568 /*
569  *	This routine will send an RST to the other tcp.
570  *
571  *	Someone asks: why do we NEVER use socket parameters (TOS, TTL etc.)
572  *		      for the reset?
573  *	Answer: if a packet caused an RST, it is not for a socket
574  *		existing in our system; if it is matched to a socket,
575  *		it is just a duplicate segment or a bug in the other side's TCP.
576  *		So we build the reply based only on parameters
577  *		that arrived with the segment.
578  *	Exception: precedence violation. We do not implement it in any case.
579  */
580 
581 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
582 {
583 	const struct tcphdr *th = tcp_hdr(skb);
584 	struct {
585 		struct tcphdr th;
586 #ifdef CONFIG_TCP_MD5SIG
587 		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
588 #endif
589 	} rep;
590 	struct ip_reply_arg arg;
591 #ifdef CONFIG_TCP_MD5SIG
592 	struct tcp_md5sig_key *key = NULL;
593 	const __u8 *hash_location = NULL;
594 	unsigned char newhash[16];
595 	int genhash;
596 	struct sock *sk1 = NULL;
597 #endif
598 	struct net *net;
599 
600 	/* Never send a reset in response to a reset. */
601 	if (th->rst)
602 		return;
603 
604 	/* If sk is not NULL, it means we did a successful lookup and the
605 	 * incoming route had to be correct. prequeue might have dropped our dst.
606 	 */
607 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
608 		return;
609 
610 	/* Swap the send and the receive. */
611 	memset(&rep, 0, sizeof(rep));
612 	rep.th.dest   = th->source;
613 	rep.th.source = th->dest;
614 	rep.th.doff   = sizeof(struct tcphdr) / 4;
615 	rep.th.rst    = 1;
616 
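	/* RFC 793: if the offending segment carried an ACK, the RST takes its
	 * sequence number from that ACK; otherwise the RST carries an ACK
	 * covering everything the segment occupied in sequence space.
	 */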
617 	if (th->ack) {
618 		rep.th.seq = th->ack_seq;
619 	} else {
620 		rep.th.ack = 1;
621 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
622 				       skb->len - (th->doff << 2));
623 	}
624 
625 	memset(&arg, 0, sizeof(arg));
626 	arg.iov[0].iov_base = (unsigned char *)&rep;
627 	arg.iov[0].iov_len  = sizeof(rep.th);
628 
629 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
630 #ifdef CONFIG_TCP_MD5SIG
631 	rcu_read_lock();
632 	hash_location = tcp_parse_md5sig_option(th);
633 	if (sk && sk_fullsock(sk)) {
634 		key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
635 					&ip_hdr(skb)->saddr, AF_INET);
636 	} else if (hash_location) {
637 		/*
638 		 * The active side is gone. Try to find the listening socket
639 		 * through the source port, and then the md5 key through that
640 		 * listening socket. We do not lose security here:
641 		 * the incoming packet is checked against the md5 hash of the
642 		 * key we find, and no RST is generated if the hash doesn't match.
643 		 */
644 		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
645 					     ip_hdr(skb)->saddr,
646 					     th->source, ip_hdr(skb)->daddr,
647 					     ntohs(th->source), inet_iif(skb));
648 		/* don't send an rst if we can't find a key */
649 		if (!sk1)
650 			goto out;
651 
652 		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
653 					&ip_hdr(skb)->saddr, AF_INET);
654 		if (!key)
655 			goto out;
656 
657 
658 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
659 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
660 			goto out;
661 
662 	}
663 
664 	if (key) {
665 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
666 				   (TCPOPT_NOP << 16) |
667 				   (TCPOPT_MD5SIG << 8) |
668 				   TCPOLEN_MD5SIG);
669 		/* Update length and the length the header thinks exists */
670 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
671 		rep.th.doff = arg.iov[0].iov_len / 4;
672 
673 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
674 				     key, ip_hdr(skb)->saddr,
675 				     ip_hdr(skb)->daddr, &rep.th);
676 	}
677 #endif
678 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
679 				      ip_hdr(skb)->saddr, /* XXX */
680 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
681 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
682 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
683 
684 	/* When the socket is gone, all binding information is lost and
685 	 * routing might fail in this case. No choice here: if we force the
686 	 * input interface, we will misroute in case of an asymmetric route.
687 	 */
688 	if (sk)
689 		arg.bound_dev_if = sk->sk_bound_dev_if;
690 
691 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
692 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
693 
694 	arg.tos = ip_hdr(skb)->tos;
695 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
696 	local_bh_disable();
697 	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
698 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
699 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
700 			      &arg, arg.iov[0].iov_len);
701 
702 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
703 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
704 	local_bh_enable();
705 
706 #ifdef CONFIG_TCP_MD5SIG
707 out:
708 	rcu_read_unlock();
709 #endif
710 }
711 
712 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
713    outside of socket context, is certainly ugly. What can I do?
714  */
715 
716 static void tcp_v4_send_ack(const struct sock *sk,
717 			    struct sk_buff *skb, u32 seq, u32 ack,
718 			    u32 win, u32 tsval, u32 tsecr, int oif,
719 			    struct tcp_md5sig_key *key,
720 			    int reply_flags, u8 tos)
721 {
722 	const struct tcphdr *th = tcp_hdr(skb);
723 	struct {
724 		struct tcphdr th;
725 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
726 #ifdef CONFIG_TCP_MD5SIG
727 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
728 #endif
729 			];
730 	} rep;
731 	struct net *net = sock_net(sk);
732 	struct ip_reply_arg arg;
733 
734 	memset(&rep.th, 0, sizeof(struct tcphdr));
735 	memset(&arg, 0, sizeof(arg));
736 
737 	arg.iov[0].iov_base = (unsigned char *)&rep;
738 	arg.iov[0].iov_len  = sizeof(rep.th);
739 	if (tsecr) {
740 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
741 				   (TCPOPT_TIMESTAMP << 8) |
742 				   TCPOLEN_TIMESTAMP);
743 		rep.opt[1] = htonl(tsval);
744 		rep.opt[2] = htonl(tsecr);
745 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
746 	}
747 
748 	/* Swap the send and the receive. */
749 	rep.th.dest    = th->source;
750 	rep.th.source  = th->dest;
751 	rep.th.doff    = arg.iov[0].iov_len / 4;
752 	rep.th.seq     = htonl(seq);
753 	rep.th.ack_seq = htonl(ack);
754 	rep.th.ack     = 1;
755 	rep.th.window  = htons(win);
756 
757 #ifdef CONFIG_TCP_MD5SIG
758 	if (key) {
759 		int offset = (tsecr) ? 3 : 0;
760 
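		/* A timestamp option written above fills rep.opt[0..2], so the
		 * MD5 option starts at index 3 in that case.
		 */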
761 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
762 					  (TCPOPT_NOP << 16) |
763 					  (TCPOPT_MD5SIG << 8) |
764 					  TCPOLEN_MD5SIG);
765 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
766 		rep.th.doff = arg.iov[0].iov_len/4;
767 
768 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
769 				    key, ip_hdr(skb)->saddr,
770 				    ip_hdr(skb)->daddr, &rep.th);
771 	}
772 #endif
773 	arg.flags = reply_flags;
774 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
775 				      ip_hdr(skb)->saddr, /* XXX */
776 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
777 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
778 	if (oif)
779 		arg.bound_dev_if = oif;
780 	arg.tos = tos;
781 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
782 	local_bh_disable();
783 	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
784 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
785 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
786 			      &arg, arg.iov[0].iov_len);
787 
788 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
789 	local_bh_enable();
790 }
791 
792 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
793 {
794 	struct inet_timewait_sock *tw = inet_twsk(sk);
795 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
796 
797 	tcp_v4_send_ack(sk, skb,
798 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
799 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
800 			tcp_time_stamp + tcptw->tw_ts_offset,
801 			tcptw->tw_ts_recent,
802 			tw->tw_bound_dev_if,
803 			tcp_twsk_md5_key(tcptw),
804 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
805 			tw->tw_tos
806 			);
807 
808 	inet_twsk_put(tw);
809 }
810 
811 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
812 				  struct request_sock *req)
813 {
814 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
815 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
816 	 */
817 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
818 					     tcp_sk(sk)->snd_nxt;
819 
820 	/* RFC 7323 2.3
821 	 * The window field (SEG.WND) of every outgoing segment, with the
822 	 * exception of <SYN> segments, MUST be right-shifted by
823 	 * Rcv.Wind.Shift bits:
824 	 */
825 	tcp_v4_send_ack(sk, skb, seq,
826 			tcp_rsk(req)->rcv_nxt,
827 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
828 			tcp_time_stamp + tcp_rsk(req)->ts_off,
829 			req->ts_recent,
830 			0,
831 			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
832 					  AF_INET),
833 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
834 			ip_hdr(skb)->tos);
835 }
836 
837 /*
838  *	Send a SYN-ACK after having received a SYN.
839  *	This still operates on a request_sock only, not on a big
840  *	socket.
841  */
842 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
843 			      struct flowi *fl,
844 			      struct request_sock *req,
845 			      struct tcp_fastopen_cookie *foc,
846 			      enum tcp_synack_type synack_type)
847 {
848 	const struct inet_request_sock *ireq = inet_rsk(req);
849 	struct flowi4 fl4;
850 	int err = -1;
851 	struct sk_buff *skb;
852 
853 	/* First, grab a route. */
854 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
855 		return -1;
856 
857 	skb = tcp_make_synack(sk, dst, req, foc, synack_type);
858 
859 	if (skb) {
860 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
861 
862 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
863 					    ireq->ir_rmt_addr,
864 					    ireq->opt);
865 		err = net_xmit_eval(err);
866 	}
867 
868 	return err;
869 }
870 
871 /*
872  *	IPv4 request_sock destructor.
873  */
874 static void tcp_v4_reqsk_destructor(struct request_sock *req)
875 {
876 	kfree(inet_rsk(req)->opt);
877 }
878 
879 #ifdef CONFIG_TCP_MD5SIG
880 /*
881  * RFC2385 MD5 checksumming requires a mapping of
882  * IP address->MD5 Key.
883  * We need to maintain these in the sk structure.
884  */
885 
886 /* Find the Key structure for an address.  */
887 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
888 					 const union tcp_md5_addr *addr,
889 					 int family)
890 {
891 	const struct tcp_sock *tp = tcp_sk(sk);
892 	struct tcp_md5sig_key *key;
893 	unsigned int size = sizeof(struct in_addr);
894 	const struct tcp_md5sig_info *md5sig;
895 
896 	/* caller either holds rcu_read_lock() or socket lock */
897 	md5sig = rcu_dereference_check(tp->md5sig_info,
898 				       lockdep_sock_is_held(sk));
899 	if (!md5sig)
900 		return NULL;
901 #if IS_ENABLED(CONFIG_IPV6)
902 	if (family == AF_INET6)
903 		size = sizeof(struct in6_addr);
904 #endif
905 	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
906 		if (key->family != family)
907 			continue;
908 		if (!memcmp(&key->addr, addr, size))
909 			return key;
910 	}
911 	return NULL;
912 }
913 EXPORT_SYMBOL(tcp_md5_do_lookup);
914 
915 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
916 					 const struct sock *addr_sk)
917 {
918 	const union tcp_md5_addr *addr;
919 
920 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
921 	return tcp_md5_do_lookup(sk, addr, AF_INET);
922 }
923 EXPORT_SYMBOL(tcp_v4_md5_lookup);
924 
925 /* This can be called on a newly created socket, from other files */
926 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
927 		   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
928 {
929 	/* Add Key to the list */
930 	struct tcp_md5sig_key *key;
931 	struct tcp_sock *tp = tcp_sk(sk);
932 	struct tcp_md5sig_info *md5sig;
933 
934 	key = tcp_md5_do_lookup(sk, addr, family);
935 	if (key) {
936 		/* Pre-existing entry - just update that one. */
937 		memcpy(key->key, newkey, newkeylen);
938 		key->keylen = newkeylen;
939 		return 0;
940 	}
941 
942 	md5sig = rcu_dereference_protected(tp->md5sig_info,
943 					   lockdep_sock_is_held(sk));
944 	if (!md5sig) {
945 		md5sig = kmalloc(sizeof(*md5sig), gfp);
946 		if (!md5sig)
947 			return -ENOMEM;
948 
949 		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
950 		INIT_HLIST_HEAD(&md5sig->head);
951 		rcu_assign_pointer(tp->md5sig_info, md5sig);
952 	}
953 
954 	key = sock_kmalloc(sk, sizeof(*key), gfp);
955 	if (!key)
956 		return -ENOMEM;
957 	if (!tcp_alloc_md5sig_pool()) {
958 		sock_kfree_s(sk, key, sizeof(*key));
959 		return -ENOMEM;
960 	}
961 
962 	memcpy(key->key, newkey, newkeylen);
963 	key->keylen = newkeylen;
964 	key->family = family;
965 	memcpy(&key->addr, addr,
966 	       (family == AF_INET6) ? sizeof(struct in6_addr) :
967 				      sizeof(struct in_addr));
968 	hlist_add_head_rcu(&key->node, &md5sig->head);
969 	return 0;
970 }
971 EXPORT_SYMBOL(tcp_md5_do_add);
972 
973 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
974 {
975 	struct tcp_md5sig_key *key;
976 
977 	key = tcp_md5_do_lookup(sk, addr, family);
978 	if (!key)
979 		return -ENOENT;
980 	hlist_del_rcu(&key->node);
981 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
982 	kfree_rcu(key, rcu);
983 	return 0;
984 }
985 EXPORT_SYMBOL(tcp_md5_do_del);
986 
987 static void tcp_clear_md5_list(struct sock *sk)
988 {
989 	struct tcp_sock *tp = tcp_sk(sk);
990 	struct tcp_md5sig_key *key;
991 	struct hlist_node *n;
992 	struct tcp_md5sig_info *md5sig;
993 
994 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
995 
996 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
997 		hlist_del_rcu(&key->node);
998 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
999 		kfree_rcu(key, rcu);
1000 	}
1001 }
1002 
1003 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1004 				 int optlen)
1005 {
1006 	struct tcp_md5sig cmd;
1007 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1008 
1009 	if (optlen < sizeof(cmd))
1010 		return -EINVAL;
1011 
1012 	if (copy_from_user(&cmd, optval, sizeof(cmd)))
1013 		return -EFAULT;
1014 
1015 	if (sin->sin_family != AF_INET)
1016 		return -EINVAL;
1017 
1018 	if (!cmd.tcpm_keylen)
1019 		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1020 				      AF_INET);
1021 
1022 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1023 		return -EINVAL;
1024 
1025 	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1026 			      AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1027 			      GFP_KERNEL);
1028 }
1029 
1030 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1031 				   __be32 daddr, __be32 saddr,
1032 				   const struct tcphdr *th, int nbytes)
1033 {
1034 	struct tcp4_pseudohdr *bp;
1035 	struct scatterlist sg;
1036 	struct tcphdr *_th;
1037 
1038 	bp = hp->scratch;
1039 	bp->saddr = saddr;
1040 	bp->daddr = daddr;
1041 	bp->pad = 0;
1042 	bp->protocol = IPPROTO_TCP;
1043 	bp->len = cpu_to_be16(nbytes);
1044 
1045 	_th = (struct tcphdr *)(bp + 1);
1046 	memcpy(_th, th, sizeof(*th));
1047 	_th->check = 0;
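	/* RFC 2385: the digest covers the TCP header with its checksum field
	 * set to zero.
	 */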
1048 
1049 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1050 	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1051 				sizeof(*bp) + sizeof(*th));
1052 	return crypto_ahash_update(hp->md5_req);
1053 }
1054 
1055 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1056 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1057 {
1058 	struct tcp_md5sig_pool *hp;
1059 	struct ahash_request *req;
1060 
1061 	hp = tcp_get_md5sig_pool();
1062 	if (!hp)
1063 		goto clear_hash_noput;
1064 	req = hp->md5_req;
1065 
1066 	if (crypto_ahash_init(req))
1067 		goto clear_hash;
1068 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1069 		goto clear_hash;
1070 	if (tcp_md5_hash_key(hp, key))
1071 		goto clear_hash;
1072 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1073 	if (crypto_ahash_final(req))
1074 		goto clear_hash;
1075 
1076 	tcp_put_md5sig_pool();
1077 	return 0;
1078 
1079 clear_hash:
1080 	tcp_put_md5sig_pool();
1081 clear_hash_noput:
1082 	memset(md5_hash, 0, 16);
1083 	return 1;
1084 }
1085 
1086 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1087 			const struct sock *sk,
1088 			const struct sk_buff *skb)
1089 {
1090 	struct tcp_md5sig_pool *hp;
1091 	struct ahash_request *req;
1092 	const struct tcphdr *th = tcp_hdr(skb);
1093 	__be32 saddr, daddr;
1094 
1095 	if (sk) { /* valid for establish/request sockets */
1096 		saddr = sk->sk_rcv_saddr;
1097 		daddr = sk->sk_daddr;
1098 	} else {
1099 		const struct iphdr *iph = ip_hdr(skb);
1100 		saddr = iph->saddr;
1101 		daddr = iph->daddr;
1102 	}
1103 
1104 	hp = tcp_get_md5sig_pool();
1105 	if (!hp)
1106 		goto clear_hash_noput;
1107 	req = hp->md5_req;
1108 
1109 	if (crypto_ahash_init(req))
1110 		goto clear_hash;
1111 
1112 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1113 		goto clear_hash;
1114 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1115 		goto clear_hash;
1116 	if (tcp_md5_hash_key(hp, key))
1117 		goto clear_hash;
1118 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1119 	if (crypto_ahash_final(req))
1120 		goto clear_hash;
1121 
1122 	tcp_put_md5sig_pool();
1123 	return 0;
1124 
1125 clear_hash:
1126 	tcp_put_md5sig_pool();
1127 clear_hash_noput:
1128 	memset(md5_hash, 0, 16);
1129 	return 1;
1130 }
1131 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1132 
1133 #endif
1134 
1135 /* Called with rcu_read_lock() */
1136 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1137 				    const struct sk_buff *skb)
1138 {
1139 #ifdef CONFIG_TCP_MD5SIG
1140 	/*
1141 	 * This gets called for each TCP segment that arrives
1142 	 * so we want to be efficient.
1143 	 * We have 3 drop cases:
1144 	 * o No MD5 hash and one expected.
1145 	 * o MD5 hash and we're not expecting one.
1146 	 * o MD5 hash and it's wrong.
1147 	 */
1148 	const __u8 *hash_location = NULL;
1149 	struct tcp_md5sig_key *hash_expected;
1150 	const struct iphdr *iph = ip_hdr(skb);
1151 	const struct tcphdr *th = tcp_hdr(skb);
1152 	int genhash;
1153 	unsigned char newhash[16];
1154 
1155 	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1156 					  AF_INET);
1157 	hash_location = tcp_parse_md5sig_option(th);
1158 
1159 	/* We've parsed the options - do we have a hash? */
1160 	if (!hash_expected && !hash_location)
1161 		return false;
1162 
1163 	if (hash_expected && !hash_location) {
1164 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1165 		return true;
1166 	}
1167 
1168 	if (!hash_expected && hash_location) {
1169 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1170 		return true;
1171 	}
1172 
1173 	/* Okay, so this is hash_expected and hash_location -
1174 	 * so we need to calculate the checksum.
1175 	 */
1176 	genhash = tcp_v4_md5_hash_skb(newhash,
1177 				      hash_expected,
1178 				      NULL, skb);
1179 
1180 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1181 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1182 		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1183 				     &iph->saddr, ntohs(th->source),
1184 				     &iph->daddr, ntohs(th->dest),
1185 				     genhash ? " tcp_v4_calc_md5_hash failed"
1186 				     : "");
1187 		return true;
1188 	}
1189 	return false;
1190 #endif
1191 	return false;
1192 }
1193 
1194 static void tcp_v4_init_req(struct request_sock *req,
1195 			    const struct sock *sk_listener,
1196 			    struct sk_buff *skb)
1197 {
1198 	struct inet_request_sock *ireq = inet_rsk(req);
1199 
1200 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1201 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1202 	ireq->opt = tcp_v4_save_options(skb);
1203 }
1204 
1205 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1206 					  struct flowi *fl,
1207 					  const struct request_sock *req,
1208 					  bool *strict)
1209 {
1210 	struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);
1211 
1212 	if (strict) {
1213 		if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
1214 			*strict = true;
1215 		else
1216 			*strict = false;
1217 	}
1218 
1219 	return dst;
1220 }
1221 
1222 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1223 	.family		=	PF_INET,
1224 	.obj_size	=	sizeof(struct tcp_request_sock),
1225 	.rtx_syn_ack	=	tcp_rtx_synack,
1226 	.send_ack	=	tcp_v4_reqsk_send_ack,
1227 	.destructor	=	tcp_v4_reqsk_destructor,
1228 	.send_reset	=	tcp_v4_send_reset,
1229 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1230 };
1231 
1232 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1233 	.mss_clamp	=	TCP_MSS_DEFAULT,
1234 #ifdef CONFIG_TCP_MD5SIG
1235 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1236 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1237 #endif
1238 	.init_req	=	tcp_v4_init_req,
1239 #ifdef CONFIG_SYN_COOKIES
1240 	.cookie_init_seq =	cookie_v4_init_sequence,
1241 #endif
1242 	.route_req	=	tcp_v4_route_req,
1243 	.init_seq	=	tcp_v4_init_sequence,
1244 	.send_synack	=	tcp_v4_send_synack,
1245 };
1246 
1247 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1248 {
1249 	/* Never answer SYNs sent to broadcast or multicast addresses */
1250 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1251 		goto drop;
1252 
1253 	return tcp_conn_request(&tcp_request_sock_ops,
1254 				&tcp_request_sock_ipv4_ops, sk, skb);
1255 
1256 drop:
1257 	tcp_listendrop(sk);
1258 	return 0;
1259 }
1260 EXPORT_SYMBOL(tcp_v4_conn_request);
1261 
1262 
1263 /*
1264  * The three way handshake has completed - we got a valid synack -
1265  * now create the new socket.
1266  */
1267 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1268 				  struct request_sock *req,
1269 				  struct dst_entry *dst,
1270 				  struct request_sock *req_unhash,
1271 				  bool *own_req)
1272 {
1273 	struct inet_request_sock *ireq;
1274 	struct inet_sock *newinet;
1275 	struct tcp_sock *newtp;
1276 	struct sock *newsk;
1277 #ifdef CONFIG_TCP_MD5SIG
1278 	struct tcp_md5sig_key *key;
1279 #endif
1280 	struct ip_options_rcu *inet_opt;
1281 
1282 	if (sk_acceptq_is_full(sk))
1283 		goto exit_overflow;
1284 
1285 	newsk = tcp_create_openreq_child(sk, req, skb);
1286 	if (!newsk)
1287 		goto exit_nonewsk;
1288 
1289 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1290 	inet_sk_rx_dst_set(newsk, skb);
1291 
1292 	newtp		      = tcp_sk(newsk);
1293 	newinet		      = inet_sk(newsk);
1294 	ireq		      = inet_rsk(req);
1295 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1296 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1297 	newsk->sk_bound_dev_if = ireq->ir_iif;
1298 	newinet->inet_saddr	      = ireq->ir_loc_addr;
1299 	inet_opt	      = ireq->opt;
1300 	rcu_assign_pointer(newinet->inet_opt, inet_opt);
1301 	ireq->opt	      = NULL;
1302 	newinet->mc_index     = inet_iif(skb);
1303 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1304 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1305 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1306 	if (inet_opt)
1307 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1308 	newinet->inet_id = newtp->write_seq ^ jiffies;
1309 
1310 	if (!dst) {
1311 		dst = inet_csk_route_child_sock(sk, newsk, req);
1312 		if (!dst)
1313 			goto put_and_exit;
1314 	} else {
1315 		/* syncookie case : see end of cookie_v4_check() */
1316 	}
1317 	sk_setup_caps(newsk, dst);
1318 
1319 	tcp_ca_openreq_child(newsk, dst);
1320 
1321 	tcp_sync_mss(newsk, dst_mtu(dst));
1322 	newtp->advmss = dst_metric_advmss(dst);
1323 	if (tcp_sk(sk)->rx_opt.user_mss &&
1324 	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1325 		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1326 
1327 	tcp_initialize_rcv_mss(newsk);
1328 
1329 #ifdef CONFIG_TCP_MD5SIG
1330 	/* Copy over the MD5 key from the original socket */
1331 	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1332 				AF_INET);
1333 	if (key) {
1334 		/*
1335 		 * We're using one, so create a matching key
1336 		 * on the newsk structure. If we fail to get
1337 		 * memory, then we end up not copying the key
1338 		 * across. Shucks.
1339 		 */
1340 		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1341 			       AF_INET, key->key, key->keylen, GFP_ATOMIC);
1342 		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1343 	}
1344 #endif
1345 
1346 	if (__inet_inherit_port(sk, newsk) < 0)
1347 		goto put_and_exit;
1348 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1349 	if (*own_req)
1350 		tcp_move_syn(newtp, req);
1351 
1352 	return newsk;
1353 
1354 exit_overflow:
1355 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1356 exit_nonewsk:
1357 	dst_release(dst);
1358 exit:
1359 	tcp_listendrop(sk);
1360 	return NULL;
1361 put_and_exit:
1362 	inet_csk_prepare_forced_close(newsk);
1363 	tcp_done(newsk);
1364 	goto exit;
1365 }
1366 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1367 
1368 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1369 {
1370 #ifdef CONFIG_SYN_COOKIES
1371 	const struct tcphdr *th = tcp_hdr(skb);
1372 
1373 	if (!th->syn)
1374 		sk = cookie_v4_check(sk, skb);
1375 #endif
1376 	return sk;
1377 }
1378 
1379 /* The socket must have its spinlock held when we get
1380  * here, unless it is a TCP_LISTEN socket.
1381  *
1382  * We have a potential double-lock case here, so even when
1383  * doing backlog processing we use the BH locking scheme.
1384  * This is because we cannot sleep with the original spinlock
1385  * held.
1386  */
1387 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1388 {
1389 	struct sock *rsk;
1390 
1391 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1392 		struct dst_entry *dst = sk->sk_rx_dst;
1393 
1394 		sock_rps_save_rxhash(sk, skb);
1395 		sk_mark_napi_id(sk, skb);
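		/* Validate the cached input route; drop it if it went stale or
		 * was learned on a different interface.
		 */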
1396 		if (dst) {
1397 			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1398 			    !dst->ops->check(dst, 0)) {
1399 				dst_release(dst);
1400 				sk->sk_rx_dst = NULL;
1401 			}
1402 		}
1403 		tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
1404 		return 0;
1405 	}
1406 
1407 	if (tcp_checksum_complete(skb))
1408 		goto csum_err;
1409 
1410 	if (sk->sk_state == TCP_LISTEN) {
1411 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1412 
1413 		if (!nsk)
1414 			goto discard;
1415 		if (nsk != sk) {
1416 			sock_rps_save_rxhash(nsk, skb);
1417 			sk_mark_napi_id(nsk, skb);
1418 			if (tcp_child_process(sk, nsk, skb)) {
1419 				rsk = nsk;
1420 				goto reset;
1421 			}
1422 			return 0;
1423 		}
1424 	} else
1425 		sock_rps_save_rxhash(sk, skb);
1426 
1427 	if (tcp_rcv_state_process(sk, skb)) {
1428 		rsk = sk;
1429 		goto reset;
1430 	}
1431 	return 0;
1432 
1433 reset:
1434 	tcp_v4_send_reset(rsk, skb);
1435 discard:
1436 	kfree_skb(skb);
1437 	/* Be careful here. If this function gets more complicated and
1438 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1439 	 * might be destroyed here. This current version compiles correctly,
1440 	 * but you have been warned.
1441 	 */
1442 	return 0;
1443 
1444 csum_err:
1445 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1446 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1447 	goto discard;
1448 }
1449 EXPORT_SYMBOL(tcp_v4_do_rcv);
1450 
1451 void tcp_v4_early_demux(struct sk_buff *skb)
1452 {
1453 	const struct iphdr *iph;
1454 	const struct tcphdr *th;
1455 	struct sock *sk;
1456 
1457 	if (skb->pkt_type != PACKET_HOST)
1458 		return;
1459 
1460 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1461 		return;
1462 
1463 	iph = ip_hdr(skb);
1464 	th = tcp_hdr(skb);
1465 
1466 	if (th->doff < sizeof(struct tcphdr) / 4)
1467 		return;
1468 
1469 	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1470 				       iph->saddr, th->source,
1471 				       iph->daddr, ntohs(th->dest),
1472 				       skb->skb_iif);
1473 	if (sk) {
1474 		skb->sk = sk;
1475 		skb->destructor = sock_edemux;
1476 		if (sk_fullsock(sk)) {
1477 			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1478 
1479 			if (dst)
1480 				dst = dst_check(dst, 0);
1481 			if (dst &&
1482 			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1483 				skb_dst_set_noref(skb, dst);
1484 		}
1485 	}
1486 }
1487 
1488 /* Packet is added to VJ-style prequeue for processing in process
1489  * context, if a reader task is waiting. Apparently, this exciting
1490  * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1491  * failed somewhere. Latency? Burstiness? Well, at least now we will
1492  * see why it failed. 8)8)				  --ANK
1493  *
1494  */
1495 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1496 {
1497 	struct tcp_sock *tp = tcp_sk(sk);
1498 
1499 	if (sysctl_tcp_low_latency || !tp->ucopy.task)
1500 		return false;
1501 
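	/* A segment with no payload is processed directly unless the prequeue
	 * already holds data that it must stay ordered behind.
	 */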
1502 	if (skb->len <= tcp_hdrlen(skb) &&
1503 	    skb_queue_len(&tp->ucopy.prequeue) == 0)
1504 		return false;
1505 
1506 	/* Before escaping RCU protected region, we need to take care of skb
1507 	 * dst. Prequeue is only enabled for established sockets.
1508 	 * For such sockets, we might need the skb dst only to set sk->sk_rx_dst.
1509 	 * Instead of doing a full sk_rx_dst validity check here, let's perform
1510 	 * an optimistic check.
1511 	 */
1512 	if (likely(sk->sk_rx_dst))
1513 		skb_dst_drop(skb);
1514 	else
1515 		skb_dst_force_safe(skb);
1516 
1517 	__skb_queue_tail(&tp->ucopy.prequeue, skb);
1518 	tp->ucopy.memory += skb->truesize;
1519 	if (skb_queue_len(&tp->ucopy.prequeue) >= 32 ||
1520 	    tp->ucopy.memory + atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) {
1521 		struct sk_buff *skb1;
1522 
1523 		BUG_ON(sock_owned_by_user(sk));
1524 		__NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUEDROPPED,
1525 				skb_queue_len(&tp->ucopy.prequeue));
1526 
1527 		while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1528 			sk_backlog_rcv(sk, skb1);
1529 
1530 		tp->ucopy.memory = 0;
1531 	} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
1532 		wake_up_interruptible_sync_poll(sk_sleep(sk),
1533 					   POLLIN | POLLRDNORM | POLLRDBAND);
1534 		if (!inet_csk_ack_scheduled(sk))
1535 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1536 						  (3 * tcp_rto_min(sk)) / 4,
1537 						  TCP_RTO_MAX);
1538 	}
1539 	return true;
1540 }
1541 EXPORT_SYMBOL(tcp_prequeue);
1542 
1543 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1544 {
1545 	u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
1546 
1547 	/* Only the socket owner can try to collapse/prune rx queues
1548 	 * to reduce memory overhead, so add a little headroom here.
1549 	 * Only a few socket backlogs are likely to be non-empty at once.
1550 	 */
1551 	limit += 64*1024;
1552 
1553 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1554 	 * we can fix skb->truesize to its real value to avoid future drops.
1555 	 * This is valid because skb is not yet charged to the socket.
1556 	 * It has been noticed that pure SACK packets were sometimes dropped
1557 	 * (if cooked by drivers without copybreak feature).
1558 	 */
1559 	if (!skb->data_len)
1560 		skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
1561 
1562 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
1563 		bh_unlock_sock(sk);
1564 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1565 		return true;
1566 	}
1567 	return false;
1568 }
1569 EXPORT_SYMBOL(tcp_add_backlog);
1570 
1571 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1572 {
1573 	struct tcphdr *th = (struct tcphdr *)skb->data;
1574 	unsigned int eaten = skb->len;
1575 	int err;
1576 
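	/* A socket filter may trim the skb, but never below the TCP header
	 * (hence the th->doff * 4 cap); any trimmed payload must also be
	 * removed from end_seq below.
	 */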
1577 	err = sk_filter_trim_cap(sk, skb, th->doff * 4);
1578 	if (!err) {
1579 		eaten -= skb->len;
1580 		TCP_SKB_CB(skb)->end_seq -= eaten;
1581 	}
1582 	return err;
1583 }
1584 EXPORT_SYMBOL(tcp_filter);
1585 
1586 /*
1587  *	From tcp_input.c
1588  */
1589 
1590 int tcp_v4_rcv(struct sk_buff *skb)
1591 {
1592 	struct net *net = dev_net(skb->dev);
1593 	const struct iphdr *iph;
1594 	const struct tcphdr *th;
1595 	bool refcounted;
1596 	struct sock *sk;
1597 	int ret;
1598 
1599 	if (skb->pkt_type != PACKET_HOST)
1600 		goto discard_it;
1601 
1602 	/* Count it even if it's bad */
1603 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
1604 
1605 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1606 		goto discard_it;
1607 
1608 	th = (const struct tcphdr *)skb->data;
1609 
1610 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1611 		goto bad_packet;
1612 	if (!pskb_may_pull(skb, th->doff * 4))
1613 		goto discard_it;
1614 
1615 	/* An explanation is required here, I think.
1616 	 * Packet length and doff are validated by header prediction,
1617 	 * provided the case of th->doff==0 is eliminated.
1618 	 * So, we defer the checks. */
1619 
1620 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1621 		goto csum_error;
1622 
1623 	th = (const struct tcphdr *)skb->data;
1624 	iph = ip_hdr(skb);
1625 	/* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB();
1626 	 * barrier() makes sure the compiler won't play fool^Waliasing games.
1627 	 */
1628 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1629 		sizeof(struct inet_skb_parm));
1630 	barrier();
1631 
1632 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
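	/* end_seq accounts for the payload plus one sequence number each for
	 * the SYN and FIN flags.
	 */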
1633 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1634 				    skb->len - th->doff * 4);
1635 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1636 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1637 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1638 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1639 	TCP_SKB_CB(skb)->sacked	 = 0;
1640 
1641 lookup:
1642 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1643 			       th->dest, &refcounted);
1644 	if (!sk)
1645 		goto no_tcp_socket;
1646 
1647 process:
1648 	if (sk->sk_state == TCP_TIME_WAIT)
1649 		goto do_time_wait;
1650 
1651 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
1652 		struct request_sock *req = inet_reqsk(sk);
1653 		struct sock *nsk;
1654 
1655 		sk = req->rsk_listener;
1656 		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1657 			sk_drops_add(sk, skb);
1658 			reqsk_put(req);
1659 			goto discard_it;
1660 		}
1661 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
1662 			inet_csk_reqsk_queue_drop_and_put(sk, req);
1663 			goto lookup;
1664 		}
1665 		/* We own a reference on the listener, increase it again
1666 		 * as we might lose it too soon.
1667 		 */
1668 		sock_hold(sk);
1669 		refcounted = true;
1670 		nsk = tcp_check_req(sk, skb, req, false);
1671 		if (!nsk) {
1672 			reqsk_put(req);
1673 			goto discard_and_relse;
1674 		}
1675 		if (nsk == sk) {
1676 			reqsk_put(req);
1677 		} else if (tcp_child_process(sk, nsk, skb)) {
1678 			tcp_v4_send_reset(nsk, skb);
1679 			goto discard_and_relse;
1680 		} else {
1681 			sock_put(sk);
1682 			return 0;
1683 		}
1684 	}
1685 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1686 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1687 		goto discard_and_relse;
1688 	}
1689 
1690 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1691 		goto discard_and_relse;
1692 
1693 	if (tcp_v4_inbound_md5_hash(sk, skb))
1694 		goto discard_and_relse;
1695 
1696 	nf_reset(skb);
1697 
1698 	if (tcp_filter(sk, skb))
1699 		goto discard_and_relse;
1700 	th = (const struct tcphdr *)skb->data;
1701 	iph = ip_hdr(skb);
1702 
1703 	skb->dev = NULL;
1704 
1705 	if (sk->sk_state == TCP_LISTEN) {
1706 		ret = tcp_v4_do_rcv(sk, skb);
1707 		goto put_and_return;
1708 	}
1709 
1710 	sk_incoming_cpu_update(sk);
1711 
1712 	bh_lock_sock_nested(sk);
1713 	tcp_segs_in(tcp_sk(sk), skb);
1714 	ret = 0;
1715 	if (!sock_owned_by_user(sk)) {
1716 		if (!tcp_prequeue(sk, skb))
1717 			ret = tcp_v4_do_rcv(sk, skb);
1718 	} else if (tcp_add_backlog(sk, skb)) {
1719 		goto discard_and_relse;
1720 	}
1721 	bh_unlock_sock(sk);
1722 
1723 put_and_return:
1724 	if (refcounted)
1725 		sock_put(sk);
1726 
1727 	return ret;
1728 
1729 no_tcp_socket:
1730 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1731 		goto discard_it;
1732 
1733 	if (tcp_checksum_complete(skb)) {
1734 csum_error:
1735 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1736 bad_packet:
1737 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
1738 	} else {
1739 		tcp_v4_send_reset(NULL, skb);
1740 	}
1741 
1742 discard_it:
1743 	/* Discard frame. */
1744 	kfree_skb(skb);
1745 	return 0;
1746 
1747 discard_and_relse:
1748 	sk_drops_add(sk, skb);
1749 	if (refcounted)
1750 		sock_put(sk);
1751 	goto discard_it;
1752 
1753 do_time_wait:
1754 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1755 		inet_twsk_put(inet_twsk(sk));
1756 		goto discard_it;
1757 	}
1758 
1759 	if (tcp_checksum_complete(skb)) {
1760 		inet_twsk_put(inet_twsk(sk));
1761 		goto csum_error;
1762 	}
1763 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1764 	case TCP_TW_SYN: {
1765 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1766 							&tcp_hashinfo, skb,
1767 							__tcp_hdrlen(th),
1768 							iph->saddr, th->source,
1769 							iph->daddr, th->dest,
1770 							inet_iif(skb));
1771 		if (sk2) {
1772 			inet_twsk_deschedule_put(inet_twsk(sk));
1773 			sk = sk2;
1774 			refcounted = false;
1775 			goto process;
1776 		}
1777 		/* Fall through to ACK */
1778 	}
1779 	case TCP_TW_ACK:
1780 		tcp_v4_timewait_ack(sk, skb);
1781 		break;
1782 	case TCP_TW_RST:
1783 		tcp_v4_send_reset(sk, skb);
1784 		inet_twsk_deschedule_put(inet_twsk(sk));
1785 		goto discard_it;
1786 	case TCP_TW_SUCCESS:;
1787 	}
1788 	goto discard_it;
1789 }
1790 
1791 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1792 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
1793 	.twsk_unique	= tcp_twsk_unique,
1794 	.twsk_destructor= tcp_twsk_destructor,
1795 };
1796 
1797 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1798 {
1799 	struct dst_entry *dst = skb_dst(skb);
1800 
1801 	if (dst && dst_hold_safe(dst)) {
1802 		sk->sk_rx_dst = dst;
1803 		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1804 	}
1805 }
1806 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1807 
1808 const struct inet_connection_sock_af_ops ipv4_specific = {
1809 	.queue_xmit	   = ip_queue_xmit,
1810 	.send_check	   = tcp_v4_send_check,
1811 	.rebuild_header	   = inet_sk_rebuild_header,
1812 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
1813 	.conn_request	   = tcp_v4_conn_request,
1814 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
1815 	.net_header_len	   = sizeof(struct iphdr),
1816 	.setsockopt	   = ip_setsockopt,
1817 	.getsockopt	   = ip_getsockopt,
1818 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
1819 	.sockaddr_len	   = sizeof(struct sockaddr_in),
1820 	.bind_conflict	   = inet_csk_bind_conflict,
1821 #ifdef CONFIG_COMPAT
1822 	.compat_setsockopt = compat_ip_setsockopt,
1823 	.compat_getsockopt = compat_ip_getsockopt,
1824 #endif
1825 	.mtu_reduced	   = tcp_v4_mtu_reduced,
1826 };
1827 EXPORT_SYMBOL(ipv4_specific);
1828 
1829 #ifdef CONFIG_TCP_MD5SIG
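/* TCP-MD5 signature (RFC 2385) operations for IPv4 sockets. */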
1830 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1831 	.md5_lookup		= tcp_v4_md5_lookup,
1832 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
1833 	.md5_parse		= tcp_v4_parse_md5_keys,
1834 };
1835 #endif
1836 
1837 /* NOTE: Many fields are set to zero explicitly by the call to
1838  *       sk_alloc(), so they need not be initialized here.
1839  */
1840 static int tcp_v4_init_sock(struct sock *sk)
1841 {
1842 	struct inet_connection_sock *icsk = inet_csk(sk);
1843 
1844 	tcp_init_sock(sk);
1845 
1846 	icsk->icsk_af_ops = &ipv4_specific;
1847 
1848 #ifdef CONFIG_TCP_MD5SIG
1849 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1850 #endif
1851 
1852 	return 0;
1853 }
1854 
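/*
 * Final per-socket cleanup for an IPv4 TCP socket: stop timers, release
 * congestion control state, purge queued skbs, drop MD5 keys and the bound
 * port, and free Fast Open and saved-SYN resources.
 */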
1855 void tcp_v4_destroy_sock(struct sock *sk)
1856 {
1857 	struct tcp_sock *tp = tcp_sk(sk);
1858 
1859 	tcp_clear_xmit_timers(sk);
1860 
1861 	tcp_cleanup_congestion_control(sk);
1862 
1863 	/* Clean up the write buffer. */
1864 	tcp_write_queue_purge(sk);
1865 
1866 	/* Clean up our, hopefully empty, out_of_order_queue. */
1867 	skb_rbtree_purge(&tp->out_of_order_queue);
1868 
1869 #ifdef CONFIG_TCP_MD5SIG
1870 	/* Clean up the MD5 key list, if any */
1871 	if (tp->md5sig_info) {
1872 		tcp_clear_md5_list(sk);
1873 		kfree_rcu(tp->md5sig_info, rcu);
1874 		tp->md5sig_info = NULL;
1875 	}
1876 #endif
1877 
1878 	/* Clean up the prequeue; it should already be empty. */
1879 	__skb_queue_purge(&tp->ucopy.prequeue);
1880 
1881 	/* Clean up a referenced TCP bind bucket. */
1882 	if (inet_csk(sk)->icsk_bind_hash)
1883 		inet_put_port(sk);
1884 
1885 	BUG_ON(tp->fastopen_rsk);
1886 
1887 	/* If the socket was aborted during a connect operation */
1888 	tcp_free_fastopen_req(tp);
1889 	tcp_saved_syn_free(tp);
1890 
1891 	local_bh_disable();
1892 	sk_sockets_allocated_dec(sk);
1893 	local_bh_enable();
1894 }
1895 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1896 
1897 #ifdef CONFIG_PROC_FS
1898 /* Proc filesystem TCP sock list dumping. */
1899 
1900 /*
1901  * Get the next listener socket following cur.  If cur is NULL, get the
1902  * first socket starting from the bucket given in st->bucket; when
1903  * st->bucket is zero the very first socket in the hash table is returned.
1904  */
1905 static void *listening_get_next(struct seq_file *seq, void *cur)
1906 {
1907 	struct tcp_iter_state *st = seq->private;
1908 	struct net *net = seq_file_net(seq);
1909 	struct inet_listen_hashbucket *ilb;
1910 	struct sock *sk = cur;
1911 
1912 	if (!sk) {
1913 get_head:
1914 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
1915 		spin_lock(&ilb->lock);
1916 		sk = sk_head(&ilb->head);
1917 		st->offset = 0;
1918 		goto get_sk;
1919 	}
1920 	ilb = &tcp_hashinfo.listening_hash[st->bucket];
1921 	++st->num;
1922 	++st->offset;
1923 
1924 	sk = sk_next(sk);
1925 get_sk:
1926 	sk_for_each_from(sk) {
1927 		if (!net_eq(sock_net(sk), net))
1928 			continue;
1929 		if (sk->sk_family == st->family)
1930 			return sk;
1931 	}
1932 	spin_unlock(&ilb->lock);
1933 	st->offset = 0;
1934 	if (++st->bucket < INET_LHTABLE_SIZE)
1935 		goto get_head;
1936 	return NULL;
1937 }
1938 
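/* Return the listening socket at position *pos in the listening-hash walk. */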
1939 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1940 {
1941 	struct tcp_iter_state *st = seq->private;
1942 	void *rc;
1943 
1944 	st->bucket = 0;
1945 	st->offset = 0;
1946 	rc = listening_get_next(seq, NULL);
1947 
1948 	while (rc && *pos) {
1949 		rc = listening_get_next(seq, rc);
1950 		--*pos;
1951 	}
1952 	return rc;
1953 }
1954 
1955 static inline bool empty_bucket(const struct tcp_iter_state *st)
1956 {
1957 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1958 }
1959 
1960 /*
1961  * Get the first established socket, starting from the bucket given in st->bucket.
1962  * If st->bucket is zero, the very first socket in the hash table is returned.
1963  */
1964 static void *established_get_first(struct seq_file *seq)
1965 {
1966 	struct tcp_iter_state *st = seq->private;
1967 	struct net *net = seq_file_net(seq);
1968 	void *rc = NULL;
1969 
1970 	st->offset = 0;
1971 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1972 		struct sock *sk;
1973 		struct hlist_nulls_node *node;
1974 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1975 
1976 		/* Lockless fast path for the common case of empty buckets */
1977 		if (empty_bucket(st))
1978 			continue;
1979 
1980 		spin_lock_bh(lock);
1981 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1982 			if (sk->sk_family != st->family ||
1983 			    !net_eq(sock_net(sk), net)) {
1984 				continue;
1985 			}
1986 			rc = sk;
1987 			goto out;
1988 		}
1989 		spin_unlock_bh(lock);
1990 	}
1991 out:
1992 	return rc;
1993 }
1994 
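/*
 * Advance to the next established socket after cur, moving on to the next
 * ehash bucket (and dropping the current bucket lock) when the chain ends.
 */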
1995 static void *established_get_next(struct seq_file *seq, void *cur)
1996 {
1997 	struct sock *sk = cur;
1998 	struct hlist_nulls_node *node;
1999 	struct tcp_iter_state *st = seq->private;
2000 	struct net *net = seq_file_net(seq);
2001 
2002 	++st->num;
2003 	++st->offset;
2004 
2005 	sk = sk_nulls_next(sk);
2006 
2007 	sk_nulls_for_each_from(sk, node) {
2008 		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2009 			return sk;
2010 	}
2011 
2012 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2013 	++st->bucket;
2014 	return established_get_first(seq);
2015 }
2016 
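/* Return the established socket at position pos in the ehash walk. */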
2017 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2018 {
2019 	struct tcp_iter_state *st = seq->private;
2020 	void *rc;
2021 
2022 	st->bucket = 0;
2023 	rc = established_get_first(seq);
2024 
2025 	while (rc && pos) {
2026 		rc = established_get_next(seq, rc);
2027 		--pos;
2028 	}
2029 	return rc;
2030 }
2031 
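/*
 * Return the socket at position pos, walking the listening hash first and
 * then the established hash.
 */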
2032 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2033 {
2034 	void *rc;
2035 	struct tcp_iter_state *st = seq->private;
2036 
2037 	st->state = TCP_SEQ_STATE_LISTENING;
2038 	rc	  = listening_get_idx(seq, &pos);
2039 
2040 	if (!rc) {
2041 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2042 		rc	  = established_get_idx(seq, pos);
2043 	}
2044 
2045 	return rc;
2046 }
2047 
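/*
 * Resume the walk from the bucket and offset recorded in the iterator state,
 * so that a subsequent read does not have to rescan from the beginning.
 */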
2048 static void *tcp_seek_last_pos(struct seq_file *seq)
2049 {
2050 	struct tcp_iter_state *st = seq->private;
2051 	int offset = st->offset;
2052 	int orig_num = st->num;
2053 	void *rc = NULL;
2054 
2055 	switch (st->state) {
2056 	case TCP_SEQ_STATE_LISTENING:
2057 		if (st->bucket >= INET_LHTABLE_SIZE)
2058 			break;
2059 		st->state = TCP_SEQ_STATE_LISTENING;
2060 		rc = listening_get_next(seq, NULL);
2061 		while (offset-- && rc)
2062 			rc = listening_get_next(seq, rc);
2063 		if (rc)
2064 			break;
2065 		st->bucket = 0;
2066 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2067 		/* Fallthrough */
2068 	case TCP_SEQ_STATE_ESTABLISHED:
2069 		if (st->bucket > tcp_hashinfo.ehash_mask)
2070 			break;
2071 		rc = established_get_first(seq);
2072 		while (offset-- && rc)
2073 			rc = established_get_next(seq, rc);
2074 	}
2075 
2076 	st->num = orig_num;
2077 
2078 	return rc;
2079 }
2080 
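/*
 * seq_file ->start(): try to resume from the cached last position; otherwise
 * restart the walk (returning SEQ_START_TOKEN for the header line).
 */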
2081 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2082 {
2083 	struct tcp_iter_state *st = seq->private;
2084 	void *rc;
2085 
2086 	if (*pos && *pos == st->last_pos) {
2087 		rc = tcp_seek_last_pos(seq);
2088 		if (rc)
2089 			goto out;
2090 	}
2091 
2092 	st->state = TCP_SEQ_STATE_LISTENING;
2093 	st->num = 0;
2094 	st->bucket = 0;
2095 	st->offset = 0;
2096 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2097 
2098 out:
2099 	st->last_pos = *pos;
2100 	return rc;
2101 }
2102 
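/*
 * seq_file ->next(): advance by one socket, switching from the listening
 * table to the established table once the listeners are exhausted.
 */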
2103 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2104 {
2105 	struct tcp_iter_state *st = seq->private;
2106 	void *rc = NULL;
2107 
2108 	if (v == SEQ_START_TOKEN) {
2109 		rc = tcp_get_idx(seq, 0);
2110 		goto out;
2111 	}
2112 
2113 	switch (st->state) {
2114 	case TCP_SEQ_STATE_LISTENING:
2115 		rc = listening_get_next(seq, v);
2116 		if (!rc) {
2117 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2118 			st->bucket = 0;
2119 			st->offset = 0;
2120 			rc	  = established_get_first(seq);
2121 		}
2122 		break;
2123 	case TCP_SEQ_STATE_ESTABLISHED:
2124 		rc = established_get_next(seq, v);
2125 		break;
2126 	}
2127 out:
2128 	++*pos;
2129 	st->last_pos = *pos;
2130 	return rc;
2131 }
2132 
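/* seq_file ->stop(): drop whichever hash-bucket lock the walk still holds. */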
2133 static void tcp_seq_stop(struct seq_file *seq, void *v)
2134 {
2135 	struct tcp_iter_state *st = seq->private;
2136 
2137 	switch (st->state) {
2138 	case TCP_SEQ_STATE_LISTENING:
2139 		if (v != SEQ_START_TOKEN)
2140 			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2141 		break;
2142 	case TCP_SEQ_STATE_ESTABLISHED:
2143 		if (v)
2144 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2145 		break;
2146 	}
2147 }
2148 
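/*
 * Open a /proc/net/tcp-style file and initialise its iterator state with the
 * address family from the registered afinfo.
 */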
2149 int tcp_seq_open(struct inode *inode, struct file *file)
2150 {
2151 	struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2152 	struct tcp_iter_state *s;
2153 	int err;
2154 
2155 	err = seq_open_net(inode, file, &afinfo->seq_ops,
2156 			  sizeof(struct tcp_iter_state));
2157 	if (err < 0)
2158 		return err;
2159 
2160 	s = ((struct seq_file *)file->private_data)->private;
2161 	s->family		= afinfo->family;
2162 	s->last_pos		= 0;
2163 	return 0;
2164 }
2165 EXPORT_SYMBOL(tcp_seq_open);
2166 
2167 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2168 {
2169 	int rc = 0;
2170 	struct proc_dir_entry *p;
2171 
2172 	afinfo->seq_ops.start		= tcp_seq_start;
2173 	afinfo->seq_ops.next		= tcp_seq_next;
2174 	afinfo->seq_ops.stop		= tcp_seq_stop;
2175 
2176 	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2177 			     afinfo->seq_fops, afinfo);
2178 	if (!p)
2179 		rc = -ENOMEM;
2180 	return rc;
2181 }
2182 EXPORT_SYMBOL(tcp_proc_register);
2183 
2184 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2185 {
2186 	remove_proc_entry(afinfo->name, net->proc_net);
2187 }
2188 EXPORT_SYMBOL(tcp_proc_unregister);
2189 
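/* Print one SYN_RECV request socket in /proc/net/tcp format. */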
2190 static void get_openreq4(const struct request_sock *req,
2191 			 struct seq_file *f, int i)
2192 {
2193 	const struct inet_request_sock *ireq = inet_rsk(req);
2194 	long delta = req->rsk_timer.expires - jiffies;
2195 
2196 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2197 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2198 		i,
2199 		ireq->ir_loc_addr,
2200 		ireq->ir_num,
2201 		ireq->ir_rmt_addr,
2202 		ntohs(ireq->ir_rmt_port),
2203 		TCP_SYN_RECV,
2204 		0, 0, /* could print option size, but that is af dependent. */
2205 		1,    /* timers active (only the expire timer) */
2206 		jiffies_delta_to_clock_t(delta),
2207 		req->num_timeout,
2208 		from_kuid_munged(seq_user_ns(f),
2209 				 sock_i_uid(req->rsk_listener)),
2210 		0,  /* non-standard timer */
2211 		0, /* open_requests have no inode */
2212 		0,
2213 		req);
2214 }
2215 
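/*
 * Print one full TCP socket in /proc/net/tcp format, including timer, queue
 * and congestion information.
 */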
2216 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2217 {
2218 	int timer_active;
2219 	unsigned long timer_expires;
2220 	const struct tcp_sock *tp = tcp_sk(sk);
2221 	const struct inet_connection_sock *icsk = inet_csk(sk);
2222 	const struct inet_sock *inet = inet_sk(sk);
2223 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2224 	__be32 dest = inet->inet_daddr;
2225 	__be32 src = inet->inet_rcv_saddr;
2226 	__u16 destp = ntohs(inet->inet_dport);
2227 	__u16 srcp = ntohs(inet->inet_sport);
2228 	int rx_queue;
2229 	int state;
2230 
2231 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2232 	    icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2233 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2234 		timer_active	= 1;
2235 		timer_expires	= icsk->icsk_timeout;
2236 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2237 		timer_active	= 4;
2238 		timer_expires	= icsk->icsk_timeout;
2239 	} else if (timer_pending(&sk->sk_timer)) {
2240 		timer_active	= 2;
2241 		timer_expires	= sk->sk_timer.expires;
2242 	} else {
2243 		timer_active	= 0;
2244 		timer_expires = jiffies;
2245 	}
2246 
2247 	state = sk_state_load(sk);
2248 	if (state == TCP_LISTEN)
2249 		rx_queue = sk->sk_ack_backlog;
2250 	else
2251 		/* Because we don't lock the socket,
2252 		 * we might find a transient negative value.
2253 		 */
2254 		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2255 
2256 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2257 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2258 		i, src, srcp, dest, destp, state,
2259 		tp->write_seq - tp->snd_una,
2260 		rx_queue,
2261 		timer_active,
2262 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2263 		icsk->icsk_retransmits,
2264 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2265 		icsk->icsk_probes_out,
2266 		sock_i_ino(sk),
2267 		atomic_read(&sk->sk_refcnt), sk,
2268 		jiffies_to_clock_t(icsk->icsk_rto),
2269 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2270 		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2271 		tp->snd_cwnd,
2272 		state == TCP_LISTEN ?
2273 		    fastopenq->max_qlen :
2274 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2275 }
2276 
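/* Print one TIME_WAIT socket in /proc/net/tcp format. */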
2277 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2278 			       struct seq_file *f, int i)
2279 {
2280 	long delta = tw->tw_timer.expires - jiffies;
2281 	__be32 dest, src;
2282 	__u16 destp, srcp;
2283 
2284 	dest  = tw->tw_daddr;
2285 	src   = tw->tw_rcv_saddr;
2286 	destp = ntohs(tw->tw_dport);
2287 	srcp  = ntohs(tw->tw_sport);
2288 
2289 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2290 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2291 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2292 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2293 		atomic_read(&tw->tw_refcnt), tw);
2294 }
2295 
2296 #define TMPSZ 150
2297 
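/*
 * Emit one line of /proc/net/tcp: the column header for SEQ_START_TOKEN,
 * otherwise the formatting helper matching the socket's state.
 */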
2298 static int tcp4_seq_show(struct seq_file *seq, void *v)
2299 {
2300 	struct tcp_iter_state *st;
2301 	struct sock *sk = v;
2302 
2303 	seq_setwidth(seq, TMPSZ - 1);
2304 	if (v == SEQ_START_TOKEN) {
2305 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2306 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2307 			   "inode");
2308 		goto out;
2309 	}
2310 	st = seq->private;
2311 
2312 	if (sk->sk_state == TCP_TIME_WAIT)
2313 		get_timewait4_sock(v, seq, st->num);
2314 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2315 		get_openreq4(v, seq, st->num);
2316 	else
2317 		get_tcp4_sock(v, seq, st->num);
2318 out:
2319 	seq_pad(seq, '\n');
2320 	return 0;
2321 }
2322 
2323 static const struct file_operations tcp_afinfo_seq_fops = {
2324 	.owner   = THIS_MODULE,
2325 	.open    = tcp_seq_open,
2326 	.read    = seq_read,
2327 	.llseek  = seq_lseek,
2328 	.release = seq_release_net
2329 };
2330 
2331 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2332 	.name		= "tcp",
2333 	.family		= AF_INET,
2334 	.seq_fops	= &tcp_afinfo_seq_fops,
2335 	.seq_ops	= {
2336 		.show		= tcp4_seq_show,
2337 	},
2338 };
2339 
2340 static int __net_init tcp4_proc_init_net(struct net *net)
2341 {
2342 	return tcp_proc_register(net, &tcp4_seq_afinfo);
2343 }
2344 
2345 static void __net_exit tcp4_proc_exit_net(struct net *net)
2346 {
2347 	tcp_proc_unregister(net, &tcp4_seq_afinfo);
2348 }
2349 
2350 static struct pernet_operations tcp4_net_ops = {
2351 	.init = tcp4_proc_init_net,
2352 	.exit = tcp4_proc_exit_net,
2353 };
2354 
2355 int __init tcp4_proc_init(void)
2356 {
2357 	return register_pernet_subsys(&tcp4_net_ops);
2358 }
2359 
2360 void tcp4_proc_exit(void)
2361 {
2362 	unregister_pernet_subsys(&tcp4_net_ops);
2363 }
2364 #endif /* CONFIG_PROC_FS */
2365 
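/* Protocol operations used by the socket layer for IPv4 SOCK_STREAM/TCP. */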
2366 struct proto tcp_prot = {
2367 	.name			= "TCP",
2368 	.owner			= THIS_MODULE,
2369 	.close			= tcp_close,
2370 	.connect		= tcp_v4_connect,
2371 	.disconnect		= tcp_disconnect,
2372 	.accept			= inet_csk_accept,
2373 	.ioctl			= tcp_ioctl,
2374 	.init			= tcp_v4_init_sock,
2375 	.destroy		= tcp_v4_destroy_sock,
2376 	.shutdown		= tcp_shutdown,
2377 	.setsockopt		= tcp_setsockopt,
2378 	.getsockopt		= tcp_getsockopt,
2379 	.recvmsg		= tcp_recvmsg,
2380 	.sendmsg		= tcp_sendmsg,
2381 	.sendpage		= tcp_sendpage,
2382 	.backlog_rcv		= tcp_v4_do_rcv,
2383 	.release_cb		= tcp_release_cb,
2384 	.hash			= inet_hash,
2385 	.unhash			= inet_unhash,
2386 	.get_port		= inet_csk_get_port,
2387 	.enter_memory_pressure	= tcp_enter_memory_pressure,
2388 	.stream_memory_free	= tcp_stream_memory_free,
2389 	.sockets_allocated	= &tcp_sockets_allocated,
2390 	.orphan_count		= &tcp_orphan_count,
2391 	.memory_allocated	= &tcp_memory_allocated,
2392 	.memory_pressure	= &tcp_memory_pressure,
2393 	.sysctl_mem		= sysctl_tcp_mem,
2394 	.sysctl_wmem		= sysctl_tcp_wmem,
2395 	.sysctl_rmem		= sysctl_tcp_rmem,
2396 	.max_header		= MAX_TCP_HEADER,
2397 	.obj_size		= sizeof(struct tcp_sock),
2398 	.slab_flags		= SLAB_DESTROY_BY_RCU,
2399 	.twsk_prot		= &tcp_timewait_sock_ops,
2400 	.rsk_prot		= &tcp_request_sock_ops,
2401 	.h.hashinfo		= &tcp_hashinfo,
2402 	.no_autobind		= true,
2403 #ifdef CONFIG_COMPAT
2404 	.compat_setsockopt	= compat_tcp_setsockopt,
2405 	.compat_getsockopt	= compat_tcp_getsockopt,
2406 #endif
2407 	.diag_destroy		= tcp_abort,
2408 };
2409 EXPORT_SYMBOL(tcp_prot);
2410 
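/* Tear down the per-cpu TCP control sockets of an exiting network namespace. */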
2411 static void __net_exit tcp_sk_exit(struct net *net)
2412 {
2413 	int cpu;
2414 
2415 	for_each_possible_cpu(cpu)
2416 		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2417 	free_percpu(net->ipv4.tcp_sk);
2418 }
2419 
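/*
 * Per-namespace initialisation: create one TCP control socket per possible
 * CPU (used for sending resets and ACKs) and set the namespace's TCP sysctl
 * defaults.
 */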
2420 static int __net_init tcp_sk_init(struct net *net)
2421 {
2422 	int res, cpu;
2423 
2424 	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2425 	if (!net->ipv4.tcp_sk)
2426 		return -ENOMEM;
2427 
2428 	for_each_possible_cpu(cpu) {
2429 		struct sock *sk;
2430 
2431 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2432 					   IPPROTO_TCP, net);
2433 		if (res)
2434 			goto fail;
2435 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2436 		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2437 	}
2438 
2439 	net->ipv4.sysctl_tcp_ecn = 2;
2440 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
2441 
2442 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2443 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2444 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2445 
2446 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2447 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2448 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2449 
2450 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2451 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2452 	net->ipv4.sysctl_tcp_syncookies = 1;
2453 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2454 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2455 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2456 	net->ipv4.sysctl_tcp_orphan_retries = 0;
2457 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2458 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2459 
2460 	return 0;
2461 fail:
2462 	tcp_sk_exit(net);
2463 
2464 	return res;
2465 }
2466 
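/* Purge any remaining TIME_WAIT sockets when a batch of namespaces exits. */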
2467 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2468 {
2469 	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2470 }
2471 
2472 static struct pernet_operations __net_initdata tcp_sk_ops = {
2473 	.init		= tcp_sk_init,
2474 	.exit		= tcp_sk_exit,
2475 	.exit_batch	= tcp_sk_exit_batch,
2476 };
2477 
2478 void __init tcp_v4_init(void)
2479 {
2480 	inet_hashinfo_init(&tcp_hashinfo);
2481 	if (register_pernet_subsys(&tcp_sk_ops))
2482 		panic("Failed to create the TCP control socket.\n");
2483 }
2484