xref: /linux/net/ipv4/tcp_ipv4.c (revision 4e0385dd7469d933c4adf84a617f872ca547aa07)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Implementation of the Transmission Control Protocol(TCP).
7  *
8  *		IPv4 specific functions
9  *
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  *
18  *	This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23 
24 /*
25  * Changes:
26  *		David S. Miller	:	New socket lookup architecture.
27  *					This code is dedicated to John Dyson.
28  *		David S. Miller :	Change semantics of established hash,
29  *					half is devoted to TIME_WAIT sockets
30  *					and the rest go in the other half.
31  *		Andi Kleen :		Add support for syncookies and fixed
32  *					some bugs: ip options weren't passed to
33  *					the TCP layer, missed a check for an
34  *					ACK bit.
35  *		Andi Kleen :		Implemented fast path mtu discovery.
36  *	     				Fixed many serious bugs in the
37  *					request_sock handling and moved
38  *					most of it into the af independent code.
39  *					Added tail drop and some other bugfixes.
40  *					Added new listen semantics.
41  *		Mike McLagan	:	Routing by source
42  *	Juan Jose Ciarlante:		ip_dynaddr bits
43  *		Andi Kleen:		various fixes.
44  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
45  *					coma.
46  *	Andi Kleen		:	Fix new listen.
47  *	Andi Kleen		:	Fix accept error reporting.
48  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
49  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
50  *					a single port at the same time.
51  */
52 
53 #define pr_fmt(fmt) "TCP: " fmt
54 
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65 
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/secure_seq.h>
76 #include <net/busy_poll.h>
77 
78 #include <linux/inet.h>
79 #include <linux/ipv6.h>
80 #include <linux/stddef.h>
81 #include <linux/proc_fs.h>
82 #include <linux/seq_file.h>
83 
84 #include <crypto/hash.h>
85 #include <linux/scatterlist.h>
86 
87 int sysctl_tcp_low_latency __read_mostly;
88 
89 #ifdef CONFIG_TCP_MD5SIG
90 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
91 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
92 #endif
93 
94 struct inet_hashinfo tcp_hashinfo;
95 EXPORT_SYMBOL(tcp_hashinfo);
96 
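/* Generate a secure initial sequence number (and timestamp offset) for a
 * connection, keyed on the address/port 4-tuple of the incoming skb.
 */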
97 static u32 tcp_v4_init_sequence(const struct sk_buff *skb, u32 *tsoff)
98 {
99 	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
100 					  ip_hdr(skb)->saddr,
101 					  tcp_hdr(skb)->dest,
102 					  tcp_hdr(skb)->source, tsoff);
103 }
104 
105 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
106 {
107 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
108 	struct tcp_sock *tp = tcp_sk(sk);
109 
110 	/* With PAWS, this is safe from the viewpoint
111 	   of data integrity. Even without PAWS it is safe provided the sequence
112 	   spaces do not overlap, i.e. at data rates <= 80Mbit/sec.
113 
114 	   Actually, the idea is close to VJ's, except that the timestamp cache
115 	   is held not per host but per port pair, and the TW bucket is used as
116 	   the state holder.
117 
118 	   If the TW bucket has already been destroyed we fall back to VJ's
119 	   scheme and use the initial timestamp retrieved from the peer table.
120 	 */
121 	if (tcptw->tw_ts_recent_stamp &&
122 	    (!twp || (sock_net(sk)->ipv4.sysctl_tcp_tw_reuse &&
123 			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
124 		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
125 		if (tp->write_seq == 0)
126 			tp->write_seq = 1;
127 		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
128 		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
129 		sock_hold(sktw);
130 		return 1;
131 	}
132 
133 	return 0;
134 }
135 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
136 
137 /* This will initiate an outgoing connection. */
138 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
139 {
140 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
141 	struct inet_sock *inet = inet_sk(sk);
142 	struct tcp_sock *tp = tcp_sk(sk);
143 	__be16 orig_sport, orig_dport;
144 	__be32 daddr, nexthop;
145 	struct flowi4 *fl4;
146 	struct rtable *rt;
147 	int err;
148 	u32 seq;
149 	struct ip_options_rcu *inet_opt;
150 	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
151 
152 	if (addr_len < sizeof(struct sockaddr_in))
153 		return -EINVAL;
154 
155 	if (usin->sin_family != AF_INET)
156 		return -EAFNOSUPPORT;
157 
158 	nexthop = daddr = usin->sin_addr.s_addr;
159 	inet_opt = rcu_dereference_protected(inet->inet_opt,
160 					     lockdep_sock_is_held(sk));
161 	if (inet_opt && inet_opt->opt.srr) {
162 		if (!daddr)
163 			return -EINVAL;
164 		nexthop = inet_opt->opt.faddr;
165 	}
166 
167 	orig_sport = inet->inet_sport;
168 	orig_dport = usin->sin_port;
169 	fl4 = &inet->cork.fl.u.ip4;
170 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
171 			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
172 			      IPPROTO_TCP,
173 			      orig_sport, orig_dport, sk);
174 	if (IS_ERR(rt)) {
175 		err = PTR_ERR(rt);
176 		if (err == -ENETUNREACH)
177 			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
178 		return err;
179 	}
180 
181 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
182 		ip_rt_put(rt);
183 		return -ENETUNREACH;
184 	}
185 
186 	if (!inet_opt || !inet_opt->opt.srr)
187 		daddr = fl4->daddr;
188 
189 	if (!inet->inet_saddr)
190 		inet->inet_saddr = fl4->saddr;
191 	sk_rcv_saddr_set(sk, inet->inet_saddr);
192 
193 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
194 		/* Reset inherited state */
195 		tp->rx_opt.ts_recent	   = 0;
196 		tp->rx_opt.ts_recent_stamp = 0;
197 		if (likely(!tp->repair))
198 			tp->write_seq	   = 0;
199 	}
200 
201 	if (tcp_death_row->sysctl_tw_recycle &&
202 	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
203 		tcp_fetch_timewait_stamp(sk, &rt->dst);
204 
205 	inet->inet_dport = usin->sin_port;
206 	sk_daddr_set(sk, daddr);
207 
208 	inet_csk(sk)->icsk_ext_hdr_len = 0;
209 	if (inet_opt)
210 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
211 
212 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
213 
214 	/* Socket identity is still unknown (sport may be zero).
215 	 * However we set the state to SYN-SENT and, without releasing the
216 	 * socket lock, select a source port, enter ourselves into the hash
217 	 * tables and complete initialization after this.
218 	 */
219 	tcp_set_state(sk, TCP_SYN_SENT);
220 	err = inet_hash_connect(tcp_death_row, sk);
221 	if (err)
222 		goto failure;
223 
224 	sk_set_txhash(sk);
225 
226 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
227 			       inet->inet_sport, inet->inet_dport, sk);
228 	if (IS_ERR(rt)) {
229 		err = PTR_ERR(rt);
230 		rt = NULL;
231 		goto failure;
232 	}
233 	/* OK, now commit destination to socket.  */
234 	sk->sk_gso_type = SKB_GSO_TCPV4;
235 	sk_setup_caps(sk, &rt->dst);
236 	rt = NULL;
237 
238 	if (likely(!tp->repair)) {
239 		seq = secure_tcp_sequence_number(inet->inet_saddr,
240 						 inet->inet_daddr,
241 						 inet->inet_sport,
242 						 usin->sin_port,
243 						 &tp->tsoffset);
244 		if (!tp->write_seq)
245 			tp->write_seq = seq;
246 	}
247 
248 	inet->inet_id = tp->write_seq ^ jiffies;
249 
250 	if (tcp_fastopen_defer_connect(sk, &err))
251 		return err;
252 	if (err)
253 		goto failure;
254 
255 	err = tcp_connect(sk);
256 
257 	if (err)
258 		goto failure;
259 
260 	return 0;
261 
262 failure:
263 	/*
264 	 * This unhashes the socket and releases the local port,
265 	 * if necessary.
266 	 */
267 	tcp_set_state(sk, TCP_CLOSE);
268 	ip_rt_put(rt);
269 	sk->sk_route_caps = 0;
270 	inet->inet_dport = 0;
271 	return err;
272 }
273 EXPORT_SYMBOL(tcp_v4_connect);
274 
275 /*
276  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
277  * It can be called through tcp_release_cb() if the socket was owned by the
278  * user at the time tcp_v4_err() was called to handle the ICMP message.
279  */
280 void tcp_v4_mtu_reduced(struct sock *sk)
281 {
282 	struct dst_entry *dst;
283 	struct inet_sock *inet = inet_sk(sk);
284 	u32 mtu = tcp_sk(sk)->mtu_info;
285 
286 	dst = inet_csk_update_pmtu(sk, mtu);
287 	if (!dst)
288 		return;
289 
290 	/* Something is about to go wrong... Remember the soft error
291 	 * in case this connection is not able to recover.
292 	 */
293 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
294 		sk->sk_err_soft = EMSGSIZE;
295 
296 	mtu = dst_mtu(dst);
297 
298 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
299 	    ip_sk_accept_pmtu(sk) &&
300 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
301 		tcp_sync_mss(sk, mtu);
302 
303 		/* Resend the TCP packet because it's
304 		 * clear that the old packet has been
305 		 * dropped. This is the new "fast" path mtu
306 		 * discovery.
307 		 */
308 		tcp_simple_retransmit(sk);
309 	} /* else let the usual retransmit timer handle it */
310 }
311 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
312 
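/* Hand an ICMP redirect to the socket's cached route, if that route is
 * still valid.
 */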
313 static void do_redirect(struct sk_buff *skb, struct sock *sk)
314 {
315 	struct dst_entry *dst = __sk_dst_check(sk, 0);
316 
317 	if (dst)
318 		dst->ops->redirect(dst, sk, skb);
319 }
320 
321 
322 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
323 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
324 {
325 	struct request_sock *req = inet_reqsk(sk);
326 	struct net *net = sock_net(sk);
327 
328 	/* ICMPs are not backlogged, hence we cannot get
329 	 * an established socket here.
330 	 */
331 	if (seq != tcp_rsk(req)->snt_isn) {
332 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
333 	} else if (abort) {
334 		/*
335 		 * Still in SYN_RECV, just remove it silently.
336 		 * There is no good way to pass the error to the newly
337 		 * created socket, and POSIX does not want network
338 		 * errors returned from accept().
339 		 */
340 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
341 		tcp_listendrop(req->rsk_listener);
342 	}
343 	reqsk_put(req);
344 }
345 EXPORT_SYMBOL(tcp_req_err);
346 
347 /*
348  * This routine is called by the ICMP module when it gets some
349  * sort of error condition.  If err < 0 then the socket should
350  * be closed and the error returned to the user.  If err > 0
351  * it's just the icmp type << 8 | icmp code.  After adjustment
352  * header points to the first 8 bytes of the tcp header.  We need
353  * to find the appropriate port.
354  *
355  * The locking strategy used here is very "optimistic". When
356  * someone else accesses the socket the ICMP is just dropped
357  * and for some paths there is no check at all.
358  * A more general error queue to queue errors for later handling
359  * is probably better.
360  *
361  */
362 
363 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
364 {
365 	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
366 	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
367 	struct inet_connection_sock *icsk;
368 	struct tcp_sock *tp;
369 	struct inet_sock *inet;
370 	const int type = icmp_hdr(icmp_skb)->type;
371 	const int code = icmp_hdr(icmp_skb)->code;
372 	struct sock *sk;
373 	struct sk_buff *skb;
374 	struct request_sock *fastopen;
375 	__u32 seq, snd_una;
376 	__u32 remaining;
377 	int err;
378 	struct net *net = dev_net(icmp_skb->dev);
379 
380 	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
381 				       th->dest, iph->saddr, ntohs(th->source),
382 				       inet_iif(icmp_skb));
383 	if (!sk) {
384 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
385 		return;
386 	}
387 	if (sk->sk_state == TCP_TIME_WAIT) {
388 		inet_twsk_put(inet_twsk(sk));
389 		return;
390 	}
391 	seq = ntohl(th->seq);
392 	if (sk->sk_state == TCP_NEW_SYN_RECV)
393 		return tcp_req_err(sk, seq,
394 				  type == ICMP_PARAMETERPROB ||
395 				  type == ICMP_TIME_EXCEEDED ||
396 				  (type == ICMP_DEST_UNREACH &&
397 				   (code == ICMP_NET_UNREACH ||
398 				    code == ICMP_HOST_UNREACH)));
399 
400 	bh_lock_sock(sk);
401 	/* If too many ICMPs get dropped on busy
402 	 * servers this needs to be solved differently.
403 	 * We do take care of the PMTU discovery (RFC1191) special case:
404 	 * we can receive locally generated ICMP messages while the socket is held.
405 	 */
406 	if (sock_owned_by_user(sk)) {
407 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
408 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
409 	}
410 	if (sk->sk_state == TCP_CLOSE)
411 		goto out;
412 
413 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
414 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
415 		goto out;
416 	}
417 
418 	icsk = inet_csk(sk);
419 	tp = tcp_sk(sk);
420 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
421 	fastopen = tp->fastopen_rsk;
422 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
423 	if (sk->sk_state != TCP_LISTEN &&
424 	    !between(seq, snd_una, tp->snd_nxt)) {
425 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
426 		goto out;
427 	}
428 
429 	switch (type) {
430 	case ICMP_REDIRECT:
431 		do_redirect(icmp_skb, sk);
432 		goto out;
433 	case ICMP_SOURCE_QUENCH:
434 		/* Just silently ignore these. */
435 		goto out;
436 	case ICMP_PARAMETERPROB:
437 		err = EPROTO;
438 		break;
439 	case ICMP_DEST_UNREACH:
440 		if (code > NR_ICMP_UNREACH)
441 			goto out;
442 
443 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
444 			/* We are not interested in TCP_LISTEN and open_requests
445 			 * (SYN-ACKs sent out by Linux are always < 576 bytes so
446 			 * they should go through unfragmented).
447 			 */
448 			if (sk->sk_state == TCP_LISTEN)
449 				goto out;
450 
451 			tp->mtu_info = info;
452 			if (!sock_owned_by_user(sk)) {
453 				tcp_v4_mtu_reduced(sk);
454 			} else {
455 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
456 					sock_hold(sk);
457 			}
458 			goto out;
459 		}
460 
461 		err = icmp_err_convert[code].errno;
462 		/* check if icmp_skb allows revert of backoff
463 		 * (see draft-zimmermann-tcp-lcd) */
464 		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
465 			break;
466 		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
467 		    !icsk->icsk_backoff || fastopen)
468 			break;
469 
470 		if (sock_owned_by_user(sk))
471 			break;
472 
473 		icsk->icsk_backoff--;
474 		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
475 					       TCP_TIMEOUT_INIT;
476 		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
477 
478 		skb = tcp_write_queue_head(sk);
479 		BUG_ON(!skb);
480 
481 		remaining = icsk->icsk_rto -
482 			    min(icsk->icsk_rto,
483 				tcp_time_stamp - tcp_skb_timestamp(skb));
484 
485 		if (remaining) {
486 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
487 						  remaining, TCP_RTO_MAX);
488 		} else {
489 			/* RTO revert clocked out retransmission.
490 			 * Will retransmit now */
491 			tcp_retransmit_timer(sk);
492 		}
493 
494 		break;
495 	case ICMP_TIME_EXCEEDED:
496 		err = EHOSTUNREACH;
497 		break;
498 	default:
499 		goto out;
500 	}
501 
502 	switch (sk->sk_state) {
503 	case TCP_SYN_SENT:
504 	case TCP_SYN_RECV:
505 		/* Only in fast or simultaneous open. If a fast open socket is
506 		 * already accepted it is treated as a connected one below.
507 		 */
508 		if (fastopen && !fastopen->sk)
509 			break;
510 
511 		if (!sock_owned_by_user(sk)) {
512 			sk->sk_err = err;
513 
514 			sk->sk_error_report(sk);
515 
516 			tcp_done(sk);
517 		} else {
518 			sk->sk_err_soft = err;
519 		}
520 		goto out;
521 	}
522 
523 	/* If we've already connected we will keep trying
524 	 * until we time out, or the user gives up.
525 	 *
526 	 * rfc1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH to be
527 	 * considered hard errors (well, FRAG_FAILED too,
528 	 * but it is obsoleted by pmtu discovery).
529 	 *
530 	 * Note that in the modern internet, where routing is unreliable
531 	 * and broken firewalls sit in every dark corner sending random
532 	 * errors ordered by their masters, even these two messages finally lose
533 	 * their original sense (even Linux sends invalid PORT_UNREACHs).
534 	 *
535 	 * Now we are in compliance with RFCs.
536 	 *							--ANK (980905)
537 	 */
538 
539 	inet = inet_sk(sk);
540 	if (!sock_owned_by_user(sk) && inet->recverr) {
541 		sk->sk_err = err;
542 		sk->sk_error_report(sk);
543 	} else	{ /* Only an error on timeout */
544 		sk->sk_err_soft = err;
545 	}
546 
547 out:
548 	bh_unlock_sock(sk);
549 	sock_put(sk);
550 }
551 
552 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
553 {
554 	struct tcphdr *th = tcp_hdr(skb);
555 
556 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
557 		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
558 		skb->csum_start = skb_transport_header(skb) - skb->head;
559 		skb->csum_offset = offsetof(struct tcphdr, check);
560 	} else {
561 		th->check = tcp_v4_check(skb->len, saddr, daddr,
562 					 csum_partial(th,
563 						      th->doff << 2,
564 						      skb->csum));
565 	}
566 }
567 
568 /* This routine computes an IPv4 TCP checksum. */
569 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
570 {
571 	const struct inet_sock *inet = inet_sk(sk);
572 
573 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
574 }
575 EXPORT_SYMBOL(tcp_v4_send_check);
576 
577 /*
578  *	This routine will send an RST to the other tcp.
579  *
580  *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
581  *		      for the reset?
582  *	Answer: if a packet caused an RST, it is not for a socket
583  *		existing in our system; if it is matched to a socket,
584  *		it is just a duplicate segment or a bug in the other side's TCP.
585  *		So we build the reply based only on the parameters that
586  *		arrived with the segment.
587  *	Exception: precedence violation. We do not implement it in any case.
588  */
589 
590 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
591 {
592 	const struct tcphdr *th = tcp_hdr(skb);
593 	struct {
594 		struct tcphdr th;
595 #ifdef CONFIG_TCP_MD5SIG
596 		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
597 #endif
598 	} rep;
599 	struct ip_reply_arg arg;
600 #ifdef CONFIG_TCP_MD5SIG
601 	struct tcp_md5sig_key *key = NULL;
602 	const __u8 *hash_location = NULL;
603 	unsigned char newhash[16];
604 	int genhash;
605 	struct sock *sk1 = NULL;
606 #endif
607 	struct net *net;
608 
609 	/* Never send a reset in response to a reset. */
610 	if (th->rst)
611 		return;
612 
613 	/* If sk is not NULL, it means we did a successful lookup and the
614 	 * incoming route had to be correct. prequeue might have dropped our dst.
615 	 */
616 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
617 		return;
618 
619 	/* Swap the send and the receive. */
620 	memset(&rep, 0, sizeof(rep));
621 	rep.th.dest   = th->source;
622 	rep.th.source = th->dest;
623 	rep.th.doff   = sizeof(struct tcphdr) / 4;
624 	rep.th.rst    = 1;
625 
626 	if (th->ack) {
627 		rep.th.seq = th->ack_seq;
628 	} else {
629 		rep.th.ack = 1;
630 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
631 				       skb->len - (th->doff << 2));
632 	}
633 
634 	memset(&arg, 0, sizeof(arg));
635 	arg.iov[0].iov_base = (unsigned char *)&rep;
636 	arg.iov[0].iov_len  = sizeof(rep.th);
637 
638 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
639 #ifdef CONFIG_TCP_MD5SIG
640 	rcu_read_lock();
641 	hash_location = tcp_parse_md5sig_option(th);
642 	if (sk && sk_fullsock(sk)) {
643 		key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
644 					&ip_hdr(skb)->saddr, AF_INET);
645 	} else if (hash_location) {
646 		/*
647 		 * The active side is lost. Try to find the listening socket via
648 		 * the source port, and then find the md5 key via that socket.
649 		 * We do not loosen security here:
650 		 * the incoming packet is checked against the md5 hash of the key
651 		 * we find, and no RST is generated if the md5 hash doesn't match.
652 		 */
653 		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
654 					     ip_hdr(skb)->saddr,
655 					     th->source, ip_hdr(skb)->daddr,
656 					     ntohs(th->source), inet_iif(skb));
657 		/* don't send rst if it can't find key */
658 		if (!sk1)
659 			goto out;
660 
661 		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
662 					&ip_hdr(skb)->saddr, AF_INET);
663 		if (!key)
664 			goto out;
665 
666 
667 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
668 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
669 			goto out;
670 
671 	}
672 
673 	if (key) {
674 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
675 				   (TCPOPT_NOP << 16) |
676 				   (TCPOPT_MD5SIG << 8) |
677 				   TCPOLEN_MD5SIG);
678 		/* Update length and the length the header thinks exists */
679 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
680 		rep.th.doff = arg.iov[0].iov_len / 4;
681 
682 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
683 				     key, ip_hdr(skb)->saddr,
684 				     ip_hdr(skb)->daddr, &rep.th);
685 	}
686 #endif
687 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
688 				      ip_hdr(skb)->saddr, /* XXX */
689 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
690 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
691 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
692 
693 	/* When the socket is gone, all binding information is lost and
694 	 * routing might fail in this case. No choice here: if we choose to force
695 	 * the input interface, we will misroute in case of an asymmetric route.
696 	 */
697 	if (sk)
698 		arg.bound_dev_if = sk->sk_bound_dev_if;
699 
700 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
701 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
702 
703 	arg.tos = ip_hdr(skb)->tos;
704 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
705 	local_bh_disable();
706 	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
707 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
708 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
709 			      &arg, arg.iov[0].iov_len);
710 
711 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
712 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
713 	local_bh_enable();
714 
715 #ifdef CONFIG_TCP_MD5SIG
716 out:
717 	rcu_read_unlock();
718 #endif
719 }
720 
721 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
722    outside socket context, is certainly ugly. What can I do?
723  */
724 
725 static void tcp_v4_send_ack(const struct sock *sk,
726 			    struct sk_buff *skb, u32 seq, u32 ack,
727 			    u32 win, u32 tsval, u32 tsecr, int oif,
728 			    struct tcp_md5sig_key *key,
729 			    int reply_flags, u8 tos)
730 {
731 	const struct tcphdr *th = tcp_hdr(skb);
732 	struct {
733 		struct tcphdr th;
734 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
735 #ifdef CONFIG_TCP_MD5SIG
736 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
737 #endif
738 			];
739 	} rep;
740 	struct net *net = sock_net(sk);
741 	struct ip_reply_arg arg;
742 
743 	memset(&rep.th, 0, sizeof(struct tcphdr));
744 	memset(&arg, 0, sizeof(arg));
745 
746 	arg.iov[0].iov_base = (unsigned char *)&rep;
747 	arg.iov[0].iov_len  = sizeof(rep.th);
748 	if (tsecr) {
749 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
750 				   (TCPOPT_TIMESTAMP << 8) |
751 				   TCPOLEN_TIMESTAMP);
752 		rep.opt[1] = htonl(tsval);
753 		rep.opt[2] = htonl(tsecr);
754 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
755 	}
756 
757 	/* Swap the send and the receive. */
758 	rep.th.dest    = th->source;
759 	rep.th.source  = th->dest;
760 	rep.th.doff    = arg.iov[0].iov_len / 4;
761 	rep.th.seq     = htonl(seq);
762 	rep.th.ack_seq = htonl(ack);
763 	rep.th.ack     = 1;
764 	rep.th.window  = htons(win);
765 
766 #ifdef CONFIG_TCP_MD5SIG
767 	if (key) {
768 		int offset = (tsecr) ? 3 : 0;
769 
770 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
771 					  (TCPOPT_NOP << 16) |
772 					  (TCPOPT_MD5SIG << 8) |
773 					  TCPOLEN_MD5SIG);
774 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
775 		rep.th.doff = arg.iov[0].iov_len/4;
776 
777 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
778 				    key, ip_hdr(skb)->saddr,
779 				    ip_hdr(skb)->daddr, &rep.th);
780 	}
781 #endif
782 	arg.flags = reply_flags;
783 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
784 				      ip_hdr(skb)->saddr, /* XXX */
785 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
786 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
787 	if (oif)
788 		arg.bound_dev_if = oif;
789 	arg.tos = tos;
790 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
791 	local_bh_disable();
792 	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
793 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
794 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
795 			      &arg, arg.iov[0].iov_len);
796 
797 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
798 	local_bh_enable();
799 }
800 
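/* Answer a segment received on a TIME_WAIT socket with an ACK built from
 * the saved timewait state, then drop the timewait reference.
 */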
801 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
802 {
803 	struct inet_timewait_sock *tw = inet_twsk(sk);
804 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
805 
806 	tcp_v4_send_ack(sk, skb,
807 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
808 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
809 			tcp_time_stamp + tcptw->tw_ts_offset,
810 			tcptw->tw_ts_recent,
811 			tw->tw_bound_dev_if,
812 			tcp_twsk_md5_key(tcptw),
813 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
814 			tw->tw_tos
815 			);
816 
817 	inet_twsk_put(tw);
818 }
819 
820 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
821 				  struct request_sock *req)
822 {
823 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
824 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
825 	 */
826 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
827 					     tcp_sk(sk)->snd_nxt;
828 
829 	/* RFC 7323 2.3
830 	 * The window field (SEG.WND) of every outgoing segment, with the
831 	 * exception of <SYN> segments, MUST be right-shifted by
832 	 * Rcv.Wind.Shift bits:
833 	 */
834 	tcp_v4_send_ack(sk, skb, seq,
835 			tcp_rsk(req)->rcv_nxt,
836 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
837 			tcp_time_stamp + tcp_rsk(req)->ts_off,
838 			req->ts_recent,
839 			0,
840 			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
841 					  AF_INET),
842 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
843 			ip_hdr(skb)->tos);
844 }
845 
846 /*
847  *	Send a SYN-ACK after having received a SYN.
848  *	This still operates on a request_sock only, not on a big
849  *	socket.
850  */
851 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
852 			      struct flowi *fl,
853 			      struct request_sock *req,
854 			      struct tcp_fastopen_cookie *foc,
855 			      enum tcp_synack_type synack_type)
856 {
857 	const struct inet_request_sock *ireq = inet_rsk(req);
858 	struct flowi4 fl4;
859 	int err = -1;
860 	struct sk_buff *skb;
861 
862 	/* First, grab a route. */
863 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
864 		return -1;
865 
866 	skb = tcp_make_synack(sk, dst, req, foc, synack_type);
867 
868 	if (skb) {
869 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
870 
871 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
872 					    ireq->ir_rmt_addr,
873 					    ireq->opt);
874 		err = net_xmit_eval(err);
875 	}
876 
877 	return err;
878 }
879 
880 /*
881  *	IPv4 request_sock destructor.
882  */
883 static void tcp_v4_reqsk_destructor(struct request_sock *req)
884 {
885 	kfree(inet_rsk(req)->opt);
886 }
887 
888 #ifdef CONFIG_TCP_MD5SIG
889 /*
890  * RFC2385 MD5 checksumming requires a mapping of
891  * IP address->MD5 Key.
892  * We need to maintain these in the sk structure.
893  */
894 
895 /* Find the Key structure for an address.  */
896 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
897 					 const union tcp_md5_addr *addr,
898 					 int family)
899 {
900 	const struct tcp_sock *tp = tcp_sk(sk);
901 	struct tcp_md5sig_key *key;
902 	unsigned int size = sizeof(struct in_addr);
903 	const struct tcp_md5sig_info *md5sig;
904 
905 	/* caller either holds rcu_read_lock() or socket lock */
906 	md5sig = rcu_dereference_check(tp->md5sig_info,
907 				       lockdep_sock_is_held(sk));
908 	if (!md5sig)
909 		return NULL;
910 #if IS_ENABLED(CONFIG_IPV6)
911 	if (family == AF_INET6)
912 		size = sizeof(struct in6_addr);
913 #endif
914 	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
915 		if (key->family != family)
916 			continue;
917 		if (!memcmp(&key->addr, addr, size))
918 			return key;
919 	}
920 	return NULL;
921 }
922 EXPORT_SYMBOL(tcp_md5_do_lookup);
923 
924 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
925 					 const struct sock *addr_sk)
926 {
927 	const union tcp_md5_addr *addr;
928 
929 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
930 	return tcp_md5_do_lookup(sk, addr, AF_INET);
931 }
932 EXPORT_SYMBOL(tcp_v4_md5_lookup);
933 
934 /* This can be called on a newly created socket, from other files */
935 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
936 		   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
937 {
938 	/* Add Key to the list */
939 	struct tcp_md5sig_key *key;
940 	struct tcp_sock *tp = tcp_sk(sk);
941 	struct tcp_md5sig_info *md5sig;
942 
943 	key = tcp_md5_do_lookup(sk, addr, family);
944 	if (key) {
945 		/* Pre-existing entry - just update that one. */
946 		memcpy(key->key, newkey, newkeylen);
947 		key->keylen = newkeylen;
948 		return 0;
949 	}
950 
951 	md5sig = rcu_dereference_protected(tp->md5sig_info,
952 					   lockdep_sock_is_held(sk));
953 	if (!md5sig) {
954 		md5sig = kmalloc(sizeof(*md5sig), gfp);
955 		if (!md5sig)
956 			return -ENOMEM;
957 
958 		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
959 		INIT_HLIST_HEAD(&md5sig->head);
960 		rcu_assign_pointer(tp->md5sig_info, md5sig);
961 	}
962 
963 	key = sock_kmalloc(sk, sizeof(*key), gfp);
964 	if (!key)
965 		return -ENOMEM;
966 	if (!tcp_alloc_md5sig_pool()) {
967 		sock_kfree_s(sk, key, sizeof(*key));
968 		return -ENOMEM;
969 	}
970 
971 	memcpy(key->key, newkey, newkeylen);
972 	key->keylen = newkeylen;
973 	key->family = family;
974 	memcpy(&key->addr, addr,
975 	       (family == AF_INET6) ? sizeof(struct in6_addr) :
976 				      sizeof(struct in_addr));
977 	hlist_add_head_rcu(&key->node, &md5sig->head);
978 	return 0;
979 }
980 EXPORT_SYMBOL(tcp_md5_do_add);
981 
982 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
983 {
984 	struct tcp_md5sig_key *key;
985 
986 	key = tcp_md5_do_lookup(sk, addr, family);
987 	if (!key)
988 		return -ENOENT;
989 	hlist_del_rcu(&key->node);
990 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
991 	kfree_rcu(key, rcu);
992 	return 0;
993 }
994 EXPORT_SYMBOL(tcp_md5_do_del);
995 
996 static void tcp_clear_md5_list(struct sock *sk)
997 {
998 	struct tcp_sock *tp = tcp_sk(sk);
999 	struct tcp_md5sig_key *key;
1000 	struct hlist_node *n;
1001 	struct tcp_md5sig_info *md5sig;
1002 
1003 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1004 
1005 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1006 		hlist_del_rcu(&key->node);
1007 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1008 		kfree_rcu(key, rcu);
1009 	}
1010 }
1011 
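/* TCP_MD5SIG setsockopt handler: copy the tcp_md5sig request from user
 * space and add, replace or (for a zero tcpm_keylen) delete the key for
 * the given peer address.
 */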
1012 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1013 				 int optlen)
1014 {
1015 	struct tcp_md5sig cmd;
1016 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1017 
1018 	if (optlen < sizeof(cmd))
1019 		return -EINVAL;
1020 
1021 	if (copy_from_user(&cmd, optval, sizeof(cmd)))
1022 		return -EFAULT;
1023 
1024 	if (sin->sin_family != AF_INET)
1025 		return -EINVAL;
1026 
1027 	if (!cmd.tcpm_keylen)
1028 		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1029 				      AF_INET);
1030 
1031 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1032 		return -EINVAL;
1033 
1034 	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1035 			      AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1036 			      GFP_KERNEL);
1037 }
1038 
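/* Mix the IPv4 pseudo-header and the TCP header (with its checksum field
 * zeroed) into the MD5 hash state.
 */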
1039 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1040 				   __be32 daddr, __be32 saddr,
1041 				   const struct tcphdr *th, int nbytes)
1042 {
1043 	struct tcp4_pseudohdr *bp;
1044 	struct scatterlist sg;
1045 	struct tcphdr *_th;
1046 
1047 	bp = hp->scratch;
1048 	bp->saddr = saddr;
1049 	bp->daddr = daddr;
1050 	bp->pad = 0;
1051 	bp->protocol = IPPROTO_TCP;
1052 	bp->len = cpu_to_be16(nbytes);
1053 
1054 	_th = (struct tcphdr *)(bp + 1);
1055 	memcpy(_th, th, sizeof(*th));
1056 	_th->check = 0;
1057 
1058 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1059 	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1060 				sizeof(*bp) + sizeof(*th));
1061 	return crypto_ahash_update(hp->md5_req);
1062 }
1063 
1064 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1065 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1066 {
1067 	struct tcp_md5sig_pool *hp;
1068 	struct ahash_request *req;
1069 
1070 	hp = tcp_get_md5sig_pool();
1071 	if (!hp)
1072 		goto clear_hash_noput;
1073 	req = hp->md5_req;
1074 
1075 	if (crypto_ahash_init(req))
1076 		goto clear_hash;
1077 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1078 		goto clear_hash;
1079 	if (tcp_md5_hash_key(hp, key))
1080 		goto clear_hash;
1081 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1082 	if (crypto_ahash_final(req))
1083 		goto clear_hash;
1084 
1085 	tcp_put_md5sig_pool();
1086 	return 0;
1087 
1088 clear_hash:
1089 	tcp_put_md5sig_pool();
1090 clear_hash_noput:
1091 	memset(md5_hash, 0, 16);
1092 	return 1;
1093 }
1094 
1095 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1096 			const struct sock *sk,
1097 			const struct sk_buff *skb)
1098 {
1099 	struct tcp_md5sig_pool *hp;
1100 	struct ahash_request *req;
1101 	const struct tcphdr *th = tcp_hdr(skb);
1102 	__be32 saddr, daddr;
1103 
1104 	if (sk) { /* valid for establish/request sockets */
1105 		saddr = sk->sk_rcv_saddr;
1106 		daddr = sk->sk_daddr;
1107 	} else {
1108 		const struct iphdr *iph = ip_hdr(skb);
1109 		saddr = iph->saddr;
1110 		daddr = iph->daddr;
1111 	}
1112 
1113 	hp = tcp_get_md5sig_pool();
1114 	if (!hp)
1115 		goto clear_hash_noput;
1116 	req = hp->md5_req;
1117 
1118 	if (crypto_ahash_init(req))
1119 		goto clear_hash;
1120 
1121 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1122 		goto clear_hash;
1123 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1124 		goto clear_hash;
1125 	if (tcp_md5_hash_key(hp, key))
1126 		goto clear_hash;
1127 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1128 	if (crypto_ahash_final(req))
1129 		goto clear_hash;
1130 
1131 	tcp_put_md5sig_pool();
1132 	return 0;
1133 
1134 clear_hash:
1135 	tcp_put_md5sig_pool();
1136 clear_hash_noput:
1137 	memset(md5_hash, 0, 16);
1138 	return 1;
1139 }
1140 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1141 
1142 #endif
1143 
1144 /* Called with rcu_read_lock() */
1145 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1146 				    const struct sk_buff *skb)
1147 {
1148 #ifdef CONFIG_TCP_MD5SIG
1149 	/*
1150 	 * This gets called for each TCP segment that arrives
1151 	 * so we want to be efficient.
1152 	 * We have 3 drop cases:
1153 	 * o No MD5 hash and one expected.
1154 	 * o MD5 hash and we're not expecting one.
1155 	 * o MD5 hash and it's wrong.
1156 	 */
1157 	const __u8 *hash_location = NULL;
1158 	struct tcp_md5sig_key *hash_expected;
1159 	const struct iphdr *iph = ip_hdr(skb);
1160 	const struct tcphdr *th = tcp_hdr(skb);
1161 	int genhash;
1162 	unsigned char newhash[16];
1163 
1164 	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1165 					  AF_INET);
1166 	hash_location = tcp_parse_md5sig_option(th);
1167 
1168 	/* We've parsed the options - do we have a hash? */
1169 	if (!hash_expected && !hash_location)
1170 		return false;
1171 
1172 	if (hash_expected && !hash_location) {
1173 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1174 		return true;
1175 	}
1176 
1177 	if (!hash_expected && hash_location) {
1178 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1179 		return true;
1180 	}
1181 
1182 	/* Okay, we have both hash_expected and hash_location,
1183 	 * so we need to calculate the hash.
1184 	 */
1185 	genhash = tcp_v4_md5_hash_skb(newhash,
1186 				      hash_expected,
1187 				      NULL, skb);
1188 
1189 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1190 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1191 		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1192 				     &iph->saddr, ntohs(th->source),
1193 				     &iph->daddr, ntohs(th->dest),
1194 				     genhash ? " tcp_v4_calc_md5_hash failed"
1195 				     : "");
1196 		return true;
1197 	}
1198 	return false;
1199 #endif
1200 	return false;
1201 }
1202 
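/* Initialize the IPv4-specific part of a request sock from the incoming
 * SYN: addresses and saved IP options.
 */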
1203 static void tcp_v4_init_req(struct request_sock *req,
1204 			    const struct sock *sk_listener,
1205 			    struct sk_buff *skb)
1206 {
1207 	struct inet_request_sock *ireq = inet_rsk(req);
1208 
1209 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1210 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1211 	ireq->opt = tcp_v4_save_options(skb);
1212 }
1213 
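/* Route the reply for a request sock; when asked, report via *strict
 * whether the chosen route still points at the peer address.
 */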
1214 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1215 					  struct flowi *fl,
1216 					  const struct request_sock *req,
1217 					  bool *strict)
1218 {
1219 	struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);
1220 
1221 	if (strict) {
1222 		if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
1223 			*strict = true;
1224 		else
1225 			*strict = false;
1226 	}
1227 
1228 	return dst;
1229 }
1230 
1231 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1232 	.family		=	PF_INET,
1233 	.obj_size	=	sizeof(struct tcp_request_sock),
1234 	.rtx_syn_ack	=	tcp_rtx_synack,
1235 	.send_ack	=	tcp_v4_reqsk_send_ack,
1236 	.destructor	=	tcp_v4_reqsk_destructor,
1237 	.send_reset	=	tcp_v4_send_reset,
1238 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1239 };
1240 
1241 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1242 	.mss_clamp	=	TCP_MSS_DEFAULT,
1243 #ifdef CONFIG_TCP_MD5SIG
1244 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1245 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1246 #endif
1247 	.init_req	=	tcp_v4_init_req,
1248 #ifdef CONFIG_SYN_COOKIES
1249 	.cookie_init_seq =	cookie_v4_init_sequence,
1250 #endif
1251 	.route_req	=	tcp_v4_route_req,
1252 	.init_seq	=	tcp_v4_init_sequence,
1253 	.send_synack	=	tcp_v4_send_synack,
1254 };
1255 
1256 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1257 {
1258 	/* Never answer SYNs sent to broadcast or multicast */
1259 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1260 		goto drop;
1261 
1262 	return tcp_conn_request(&tcp_request_sock_ops,
1263 				&tcp_request_sock_ipv4_ops, sk, skb);
1264 
1265 drop:
1266 	tcp_listendrop(sk);
1267 	return 0;
1268 }
1269 EXPORT_SYMBOL(tcp_v4_conn_request);
1270 
1271 
1272 /*
1273  * The three way handshake has completed - we got a valid synack -
1274  * now create the new socket.
1275  */
1276 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1277 				  struct request_sock *req,
1278 				  struct dst_entry *dst,
1279 				  struct request_sock *req_unhash,
1280 				  bool *own_req)
1281 {
1282 	struct inet_request_sock *ireq;
1283 	struct inet_sock *newinet;
1284 	struct tcp_sock *newtp;
1285 	struct sock *newsk;
1286 #ifdef CONFIG_TCP_MD5SIG
1287 	struct tcp_md5sig_key *key;
1288 #endif
1289 	struct ip_options_rcu *inet_opt;
1290 
1291 	if (sk_acceptq_is_full(sk))
1292 		goto exit_overflow;
1293 
1294 	newsk = tcp_create_openreq_child(sk, req, skb);
1295 	if (!newsk)
1296 		goto exit_nonewsk;
1297 
1298 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1299 	inet_sk_rx_dst_set(newsk, skb);
1300 
1301 	newtp		      = tcp_sk(newsk);
1302 	newinet		      = inet_sk(newsk);
1303 	ireq		      = inet_rsk(req);
1304 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1305 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1306 	newsk->sk_bound_dev_if = ireq->ir_iif;
1307 	newinet->inet_saddr	      = ireq->ir_loc_addr;
1308 	inet_opt	      = ireq->opt;
1309 	rcu_assign_pointer(newinet->inet_opt, inet_opt);
1310 	ireq->opt	      = NULL;
1311 	newinet->mc_index     = inet_iif(skb);
1312 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1313 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1314 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1315 	if (inet_opt)
1316 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1317 	newinet->inet_id = newtp->write_seq ^ jiffies;
1318 
1319 	if (!dst) {
1320 		dst = inet_csk_route_child_sock(sk, newsk, req);
1321 		if (!dst)
1322 			goto put_and_exit;
1323 	} else {
1324 		/* syncookie case : see end of cookie_v4_check() */
1325 	}
1326 	sk_setup_caps(newsk, dst);
1327 
1328 	tcp_ca_openreq_child(newsk, dst);
1329 
1330 	tcp_sync_mss(newsk, dst_mtu(dst));
1331 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1332 
1333 	tcp_initialize_rcv_mss(newsk);
1334 
1335 #ifdef CONFIG_TCP_MD5SIG
1336 	/* Copy over the MD5 key from the original socket */
1337 	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1338 				AF_INET);
1339 	if (key) {
1340 		/*
1341 		 * We're using one, so create a matching key
1342 		 * on the newsk structure. If we fail to get
1343 		 * memory, then we end up not copying the key
1344 		 * across. Shucks.
1345 		 */
1346 		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1347 			       AF_INET, key->key, key->keylen, GFP_ATOMIC);
1348 		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1349 	}
1350 #endif
1351 
1352 	if (__inet_inherit_port(sk, newsk) < 0)
1353 		goto put_and_exit;
1354 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1355 	if (*own_req)
1356 		tcp_move_syn(newtp, req);
1357 
1358 	return newsk;
1359 
1360 exit_overflow:
1361 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1362 exit_nonewsk:
1363 	dst_release(dst);
1364 exit:
1365 	tcp_listendrop(sk);
1366 	return NULL;
1367 put_and_exit:
1368 	inet_csk_prepare_forced_close(newsk);
1369 	tcp_done(newsk);
1370 	goto exit;
1371 }
1372 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1373 
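/* With syncookies compiled in, a non-SYN segment arriving on a listener may
 * carry a valid cookie: let cookie_v4_check() validate it and create the
 * child socket if it does.
 */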
1374 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1375 {
1376 #ifdef CONFIG_SYN_COOKIES
1377 	const struct tcphdr *th = tcp_hdr(skb);
1378 
1379 	if (!th->syn)
1380 		sk = cookie_v4_check(sk, skb);
1381 #endif
1382 	return sk;
1383 }
1384 
1385 /* The socket must have its spinlock held when we get
1386  * here, unless it is a TCP_LISTEN socket.
1387  *
1388  * We have a potential double-lock case here, so even when
1389  * doing backlog processing we use the BH locking scheme.
1390  * This is because we cannot sleep with the original spinlock
1391  * held.
1392  */
1393 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1394 {
1395 	struct sock *rsk;
1396 
1397 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1398 		struct dst_entry *dst = sk->sk_rx_dst;
1399 
1400 		sock_rps_save_rxhash(sk, skb);
1401 		sk_mark_napi_id(sk, skb);
1402 		if (dst) {
1403 			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1404 			    !dst->ops->check(dst, 0)) {
1405 				dst_release(dst);
1406 				sk->sk_rx_dst = NULL;
1407 			}
1408 		}
1409 		tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
1410 		return 0;
1411 	}
1412 
1413 	if (tcp_checksum_complete(skb))
1414 		goto csum_err;
1415 
1416 	if (sk->sk_state == TCP_LISTEN) {
1417 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1418 
1419 		if (!nsk)
1420 			goto discard;
1421 		if (nsk != sk) {
1422 			sock_rps_save_rxhash(nsk, skb);
1423 			sk_mark_napi_id(nsk, skb);
1424 			if (tcp_child_process(sk, nsk, skb)) {
1425 				rsk = nsk;
1426 				goto reset;
1427 			}
1428 			return 0;
1429 		}
1430 	} else
1431 		sock_rps_save_rxhash(sk, skb);
1432 
1433 	if (tcp_rcv_state_process(sk, skb)) {
1434 		rsk = sk;
1435 		goto reset;
1436 	}
1437 	return 0;
1438 
1439 reset:
1440 	tcp_v4_send_reset(rsk, skb);
1441 discard:
1442 	kfree_skb(skb);
1443 	/* Be careful here. If this function gets more complicated and
1444 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1445 	 * might be destroyed here. This current version compiles correctly,
1446 	 * but you have been warned.
1447 	 */
1448 	return 0;
1449 
1450 csum_err:
1451 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1452 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1453 	goto discard;
1454 }
1455 EXPORT_SYMBOL(tcp_v4_do_rcv);
1456 
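/* Early demux: look up an established socket for the incoming segment
 * before the routing decision, and reuse its cached rx dst when it still
 * matches the input interface.
 */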
1457 void tcp_v4_early_demux(struct sk_buff *skb)
1458 {
1459 	const struct iphdr *iph;
1460 	const struct tcphdr *th;
1461 	struct sock *sk;
1462 
1463 	if (skb->pkt_type != PACKET_HOST)
1464 		return;
1465 
1466 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1467 		return;
1468 
1469 	iph = ip_hdr(skb);
1470 	th = tcp_hdr(skb);
1471 
1472 	if (th->doff < sizeof(struct tcphdr) / 4)
1473 		return;
1474 
1475 	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1476 				       iph->saddr, th->source,
1477 				       iph->daddr, ntohs(th->dest),
1478 				       skb->skb_iif);
1479 	if (sk) {
1480 		skb->sk = sk;
1481 		skb->destructor = sock_edemux;
1482 		if (sk_fullsock(sk)) {
1483 			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1484 
1485 			if (dst)
1486 				dst = dst_check(dst, 0);
1487 			if (dst &&
1488 			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1489 				skb_dst_set_noref(skb, dst);
1490 		}
1491 	}
1492 }
1493 
1494 /* Packet is added to VJ-style prequeue for processing in process
1495  * context, if a reader task is waiting. Apparently, this exciting
1496  * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1497  * failed somewhere. Latency? Burstiness? Well, at least now we will
1498  * see why it failed. 8)8)				  --ANK
1499  *
1500  */
1501 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1502 {
1503 	struct tcp_sock *tp = tcp_sk(sk);
1504 
1505 	if (sysctl_tcp_low_latency || !tp->ucopy.task)
1506 		return false;
1507 
1508 	if (skb->len <= tcp_hdrlen(skb) &&
1509 	    skb_queue_len(&tp->ucopy.prequeue) == 0)
1510 		return false;
1511 
1512 	/* Before escaping RCU protected region, we need to take care of skb
1513 	 * dst. Prequeue is only enabled for established sockets.
1514 	 * For such sockets, we might need the skb dst only to set sk->sk_rx_dst.
1515 	 * Instead of doing a full sk_rx_dst validity check here, let's perform
1516 	 * an optimistic check.
1517 	 */
1518 	if (likely(sk->sk_rx_dst))
1519 		skb_dst_drop(skb);
1520 	else
1521 		skb_dst_force_safe(skb);
1522 
1523 	__skb_queue_tail(&tp->ucopy.prequeue, skb);
1524 	tp->ucopy.memory += skb->truesize;
1525 	if (skb_queue_len(&tp->ucopy.prequeue) >= 32 ||
1526 	    tp->ucopy.memory + atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) {
1527 		struct sk_buff *skb1;
1528 
1529 		BUG_ON(sock_owned_by_user(sk));
1530 		__NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUEDROPPED,
1531 				skb_queue_len(&tp->ucopy.prequeue));
1532 
1533 		while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1534 			sk_backlog_rcv(sk, skb1);
1535 
1536 		tp->ucopy.memory = 0;
1537 	} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
1538 		wake_up_interruptible_sync_poll(sk_sleep(sk),
1539 					   POLLIN | POLLRDNORM | POLLRDBAND);
1540 		if (!inet_csk_ack_scheduled(sk))
1541 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1542 						  (3 * tcp_rto_min(sk)) / 4,
1543 						  TCP_RTO_MAX);
1544 	}
1545 	return true;
1546 }
1547 EXPORT_SYMBOL(tcp_prequeue);
1548 
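/* Queue a segment on the backlog of a socket currently owned by user
 * context.  Returns true if the skb could not be queued because the
 * backlog limit was exceeded.
 */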
1549 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1550 {
1551 	u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
1552 
1553 	/* Only the socket owner can try to collapse/prune rx queues
1554 	 * to reduce memory overhead, so add a little headroom here.
1555 	 * Only a few socket backlogs are likely to be non-empty concurrently.
1556 	 */
1557 	limit += 64*1024;
1558 
1559 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1560 	 * we can fix skb->truesize to its real value to avoid future drops.
1561 	 * This is valid because skb is not yet charged to the socket.
1562 	 * It has been noticed pure SACK packets were sometimes dropped
1563 	 * It has been noticed that pure SACK packets were sometimes dropped
1564 	 * (if cooked by drivers without the copybreak feature).
1565 	skb_condense(skb);
1566 
1567 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
1568 		bh_unlock_sock(sk);
1569 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1570 		return true;
1571 	}
1572 	return false;
1573 }
1574 EXPORT_SYMBOL(tcp_add_backlog);
1575 
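/* Run the socket filter over the segment, never trimming below the TCP
 * header, and adjust end_seq for any payload the filter trimmed off.
 */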
1576 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1577 {
1578 	struct tcphdr *th = (struct tcphdr *)skb->data;
1579 	unsigned int eaten = skb->len;
1580 	int err;
1581 
1582 	err = sk_filter_trim_cap(sk, skb, th->doff * 4);
1583 	if (!err) {
1584 		eaten -= skb->len;
1585 		TCP_SKB_CB(skb)->end_seq -= eaten;
1586 	}
1587 	return err;
1588 }
1589 EXPORT_SYMBOL(tcp_filter);
1590 
1591 /*
1592  *	From tcp_input.c
1593  */
1594 
1595 int tcp_v4_rcv(struct sk_buff *skb)
1596 {
1597 	struct net *net = dev_net(skb->dev);
1598 	const struct iphdr *iph;
1599 	const struct tcphdr *th;
1600 	bool refcounted;
1601 	struct sock *sk;
1602 	int ret;
1603 
1604 	if (skb->pkt_type != PACKET_HOST)
1605 		goto discard_it;
1606 
1607 	/* Count it even if it's bad */
1608 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
1609 
1610 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1611 		goto discard_it;
1612 
1613 	th = (const struct tcphdr *)skb->data;
1614 
1615 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1616 		goto bad_packet;
1617 	if (!pskb_may_pull(skb, th->doff * 4))
1618 		goto discard_it;
1619 
1620 	/* An explanation is required here, I think.
1621 	 * Packet length and doff are validated by header prediction,
1622 	 * provided the case of th->doff==0 is eliminated.
1623 	 * So, we defer the checks. */
1624 
1625 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1626 		goto csum_error;
1627 
1628 	th = (const struct tcphdr *)skb->data;
1629 	iph = ip_hdr(skb);
1630 	/* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1631 	 * barrier() makes sure the compiler won't play fool^Waliasing games.
1632 	 */
1633 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1634 		sizeof(struct inet_skb_parm));
1635 	barrier();
1636 
1637 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1638 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1639 				    skb->len - th->doff * 4);
1640 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1641 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1642 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1643 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1644 	TCP_SKB_CB(skb)->sacked	 = 0;
1645 
1646 lookup:
1647 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1648 			       th->dest, &refcounted);
1649 	if (!sk)
1650 		goto no_tcp_socket;
1651 
1652 process:
1653 	if (sk->sk_state == TCP_TIME_WAIT)
1654 		goto do_time_wait;
1655 
1656 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
1657 		struct request_sock *req = inet_reqsk(sk);
1658 		struct sock *nsk;
1659 
1660 		sk = req->rsk_listener;
1661 		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1662 			sk_drops_add(sk, skb);
1663 			reqsk_put(req);
1664 			goto discard_it;
1665 		}
1666 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
1667 			inet_csk_reqsk_queue_drop_and_put(sk, req);
1668 			goto lookup;
1669 		}
1670 		/* We own a reference on the listener, increase it again
1671 		 * as we might lose it too soon.
1672 		 */
1673 		sock_hold(sk);
1674 		refcounted = true;
1675 		nsk = tcp_check_req(sk, skb, req, false);
1676 		if (!nsk) {
1677 			reqsk_put(req);
1678 			goto discard_and_relse;
1679 		}
1680 		if (nsk == sk) {
1681 			reqsk_put(req);
1682 		} else if (tcp_child_process(sk, nsk, skb)) {
1683 			tcp_v4_send_reset(nsk, skb);
1684 			goto discard_and_relse;
1685 		} else {
1686 			sock_put(sk);
1687 			return 0;
1688 		}
1689 	}
1690 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1691 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1692 		goto discard_and_relse;
1693 	}
1694 
1695 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1696 		goto discard_and_relse;
1697 
1698 	if (tcp_v4_inbound_md5_hash(sk, skb))
1699 		goto discard_and_relse;
1700 
1701 	nf_reset(skb);
1702 
1703 	if (tcp_filter(sk, skb))
1704 		goto discard_and_relse;
1705 	th = (const struct tcphdr *)skb->data;
1706 	iph = ip_hdr(skb);
1707 
1708 	skb->dev = NULL;
1709 
1710 	if (sk->sk_state == TCP_LISTEN) {
1711 		ret = tcp_v4_do_rcv(sk, skb);
1712 		goto put_and_return;
1713 	}
1714 
1715 	sk_incoming_cpu_update(sk);
1716 
1717 	bh_lock_sock_nested(sk);
1718 	tcp_segs_in(tcp_sk(sk), skb);
1719 	ret = 0;
1720 	if (!sock_owned_by_user(sk)) {
1721 		if (!tcp_prequeue(sk, skb))
1722 			ret = tcp_v4_do_rcv(sk, skb);
1723 	} else if (tcp_add_backlog(sk, skb)) {
1724 		goto discard_and_relse;
1725 	}
1726 	bh_unlock_sock(sk);
1727 
1728 put_and_return:
1729 	if (refcounted)
1730 		sock_put(sk);
1731 
1732 	return ret;
1733 
1734 no_tcp_socket:
1735 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1736 		goto discard_it;
1737 
1738 	if (tcp_checksum_complete(skb)) {
1739 csum_error:
1740 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1741 bad_packet:
1742 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
1743 	} else {
1744 		tcp_v4_send_reset(NULL, skb);
1745 	}
1746 
1747 discard_it:
1748 	/* Discard frame. */
1749 	kfree_skb(skb);
1750 	return 0;
1751 
1752 discard_and_relse:
1753 	sk_drops_add(sk, skb);
1754 	if (refcounted)
1755 		sock_put(sk);
1756 	goto discard_it;
1757 
1758 do_time_wait:
1759 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1760 		inet_twsk_put(inet_twsk(sk));
1761 		goto discard_it;
1762 	}
1763 
1764 	if (tcp_checksum_complete(skb)) {
1765 		inet_twsk_put(inet_twsk(sk));
1766 		goto csum_error;
1767 	}
1768 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1769 	case TCP_TW_SYN: {
1770 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1771 							&tcp_hashinfo, skb,
1772 							__tcp_hdrlen(th),
1773 							iph->saddr, th->source,
1774 							iph->daddr, th->dest,
1775 							inet_iif(skb));
1776 		if (sk2) {
1777 			inet_twsk_deschedule_put(inet_twsk(sk));
1778 			sk = sk2;
1779 			refcounted = false;
1780 			goto process;
1781 		}
1782 		/* Fall through to ACK */
1783 	}
1784 	case TCP_TW_ACK:
1785 		tcp_v4_timewait_ack(sk, skb);
1786 		break;
1787 	case TCP_TW_RST:
1788 		tcp_v4_send_reset(sk, skb);
1789 		inet_twsk_deschedule_put(inet_twsk(sk));
1790 		goto discard_it;
1791 	case TCP_TW_SUCCESS:;
1792 	}
1793 	goto discard_it;
1794 }
1795 
1796 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1797 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
1798 	.twsk_unique	= tcp_twsk_unique,
1799 	.twsk_destructor= tcp_twsk_destructor,
1800 };
1801 
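/* Cache the skb's dst and input interface on the socket for the
 * established-state receive fast path.
 */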
1802 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1803 {
1804 	struct dst_entry *dst = skb_dst(skb);
1805 
1806 	if (dst && dst_hold_safe(dst)) {
1807 		sk->sk_rx_dst = dst;
1808 		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1809 	}
1810 }
1811 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1812 
1813 const struct inet_connection_sock_af_ops ipv4_specific = {
1814 	.queue_xmit	   = ip_queue_xmit,
1815 	.send_check	   = tcp_v4_send_check,
1816 	.rebuild_header	   = inet_sk_rebuild_header,
1817 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
1818 	.conn_request	   = tcp_v4_conn_request,
1819 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
1820 	.net_header_len	   = sizeof(struct iphdr),
1821 	.setsockopt	   = ip_setsockopt,
1822 	.getsockopt	   = ip_getsockopt,
1823 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
1824 	.sockaddr_len	   = sizeof(struct sockaddr_in),
1825 #ifdef CONFIG_COMPAT
1826 	.compat_setsockopt = compat_ip_setsockopt,
1827 	.compat_getsockopt = compat_ip_getsockopt,
1828 #endif
1829 	.mtu_reduced	   = tcp_v4_mtu_reduced,
1830 };
1831 EXPORT_SYMBOL(ipv4_specific);
1832 
1833 #ifdef CONFIG_TCP_MD5SIG
1834 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1835 	.md5_lookup		= tcp_v4_md5_lookup,
1836 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
1837 	.md5_parse		= tcp_v4_parse_md5_keys,
1838 };
1839 #endif
1840 
1841 /* NOTE: A lot of things are set to zero explicitly by the call to
1842  *       sk_alloc(), so they need not be done here.
1843  */
1844 static int tcp_v4_init_sock(struct sock *sk)
1845 {
1846 	struct inet_connection_sock *icsk = inet_csk(sk);
1847 
1848 	tcp_init_sock(sk);
1849 
1850 	icsk->icsk_af_ops = &ipv4_specific;
1851 
1852 #ifdef CONFIG_TCP_MD5SIG
1853 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1854 #endif
1855 
1856 	return 0;
1857 }
1858 
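/*
 * Per-socket teardown, invoked through sk->sk_prot->destroy(): stop the
 * retransmission machinery and release everything still attached to the
 * socket (write queue, out-of-order queue, MD5 keys, prequeue, bind
 * bucket, fastopen request and saved SYN).
 */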
1859 void tcp_v4_destroy_sock(struct sock *sk)
1860 {
1861 	struct tcp_sock *tp = tcp_sk(sk);
1862 
1863 	tcp_clear_xmit_timers(sk);
1864 
1865 	tcp_cleanup_congestion_control(sk);
1866 
1867 	/* Clean up the write buffer. */
1868 	tcp_write_queue_purge(sk);
1869 
1870 	/* Cleans up our, hopefully empty, out_of_order_queue. */
1871 	skb_rbtree_purge(&tp->out_of_order_queue);
1872 
1873 #ifdef CONFIG_TCP_MD5SIG
1874 	/* Clean up the MD5 key list, if any */
1875 	if (tp->md5sig_info) {
1876 		tcp_clear_md5_list(sk);
1877 		kfree_rcu(tp->md5sig_info, rcu);
1878 		tp->md5sig_info = NULL;
1879 	}
1880 #endif
1881 
1882 	/* Clean up the prequeue; it should really be empty by now. */
1883 	__skb_queue_purge(&tp->ucopy.prequeue);
1884 
1885 	/* Clean up a referenced TCP bind bucket. */
1886 	if (inet_csk(sk)->icsk_bind_hash)
1887 		inet_put_port(sk);
1888 
1889 	BUG_ON(tp->fastopen_rsk);
1890 
1891 	/* In case the socket was aborted during the connect operation. */
1892 	tcp_free_fastopen_req(tp);
1893 	tcp_saved_syn_free(tp);
1894 
1895 	sk_sockets_allocated_dec(sk);
1896 }
1897 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1898 
1899 #ifdef CONFIG_PROC_FS
1900 /* Proc filesystem TCP sock list dumping. */
1901 
1902 /*
1903  * Get the next listener socket after cur.  If cur is NULL, get the first
1904  * socket starting from the bucket given in st->bucket; when st->bucket is
1905  * zero, the very first socket in the hash table is returned.
1906  */
1907 static void *listening_get_next(struct seq_file *seq, void *cur)
1908 {
1909 	struct tcp_iter_state *st = seq->private;
1910 	struct net *net = seq_file_net(seq);
1911 	struct inet_listen_hashbucket *ilb;
1912 	struct sock *sk = cur;
1913 
1914 	if (!sk) {
1915 get_head:
1916 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
1917 		spin_lock(&ilb->lock);
1918 		sk = sk_head(&ilb->head);
1919 		st->offset = 0;
1920 		goto get_sk;
1921 	}
1922 	ilb = &tcp_hashinfo.listening_hash[st->bucket];
1923 	++st->num;
1924 	++st->offset;
1925 
1926 	sk = sk_next(sk);
1927 get_sk:
1928 	sk_for_each_from(sk) {
1929 		if (!net_eq(sock_net(sk), net))
1930 			continue;
1931 		if (sk->sk_family == st->family)
1932 			return sk;
1933 	}
1934 	spin_unlock(&ilb->lock);
1935 	st->offset = 0;
1936 	if (++st->bucket < INET_LHTABLE_SIZE)
1937 		goto get_head;
1938 	return NULL;
1939 }
1940 
1941 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1942 {
1943 	struct tcp_iter_state *st = seq->private;
1944 	void *rc;
1945 
1946 	st->bucket = 0;
1947 	st->offset = 0;
1948 	rc = listening_get_next(seq, NULL);
1949 
1950 	while (rc && *pos) {
1951 		rc = listening_get_next(seq, rc);
1952 		--*pos;
1953 	}
1954 	return rc;
1955 }
1956 
1957 static inline bool empty_bucket(const struct tcp_iter_state *st)
1958 {
1959 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1960 }
1961 
1962 /*
1963  * Get the first established socket, starting from the bucket given in st->bucket.
1964  * If st->bucket is zero, the very first socket in the hash is returned.
1965  */
1966 static void *established_get_first(struct seq_file *seq)
1967 {
1968 	struct tcp_iter_state *st = seq->private;
1969 	struct net *net = seq_file_net(seq);
1970 	void *rc = NULL;
1971 
1972 	st->offset = 0;
1973 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1974 		struct sock *sk;
1975 		struct hlist_nulls_node *node;
1976 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1977 
1978 		/* Lockless fast path for the common case of empty buckets */
1979 		if (empty_bucket(st))
1980 			continue;
1981 
1982 		spin_lock_bh(lock);
1983 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1984 			if (sk->sk_family != st->family ||
1985 			    !net_eq(sock_net(sk), net)) {
1986 				continue;
1987 			}
1988 			rc = sk;
1989 			goto out;
1990 		}
1991 		spin_unlock_bh(lock);
1992 	}
1993 out:
1994 	return rc;
1995 }
1996 
1997 static void *established_get_next(struct seq_file *seq, void *cur)
1998 {
1999 	struct sock *sk = cur;
2000 	struct hlist_nulls_node *node;
2001 	struct tcp_iter_state *st = seq->private;
2002 	struct net *net = seq_file_net(seq);
2003 
2004 	++st->num;
2005 	++st->offset;
2006 
2007 	sk = sk_nulls_next(sk);
2008 
2009 	sk_nulls_for_each_from(sk, node) {
2010 		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2011 			return sk;
2012 	}
2013 
2014 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2015 	++st->bucket;
2016 	return established_get_first(seq);
2017 }
2018 
2019 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2020 {
2021 	struct tcp_iter_state *st = seq->private;
2022 	void *rc;
2023 
2024 	st->bucket = 0;
2025 	rc = established_get_first(seq);
2026 
2027 	while (rc && pos) {
2028 		rc = established_get_next(seq, rc);
2029 		--pos;
2030 	}
2031 	return rc;
2032 }
2033 
2034 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2035 {
2036 	void *rc;
2037 	struct tcp_iter_state *st = seq->private;
2038 
2039 	st->state = TCP_SEQ_STATE_LISTENING;
2040 	rc	  = listening_get_idx(seq, &pos);
2041 
2042 	if (!rc) {
2043 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2044 		rc	  = established_get_idx(seq, pos);
2045 	}
2046 
2047 	return rc;
2048 }
2049 
2050 static void *tcp_seek_last_pos(struct seq_file *seq)
2051 {
2052 	struct tcp_iter_state *st = seq->private;
2053 	int offset = st->offset;
2054 	int orig_num = st->num;
2055 	void *rc = NULL;
2056 
2057 	switch (st->state) {
2058 	case TCP_SEQ_STATE_LISTENING:
2059 		if (st->bucket >= INET_LHTABLE_SIZE)
2060 			break;
2061 		st->state = TCP_SEQ_STATE_LISTENING;
2062 		rc = listening_get_next(seq, NULL);
2063 		while (offset-- && rc)
2064 			rc = listening_get_next(seq, rc);
2065 		if (rc)
2066 			break;
2067 		st->bucket = 0;
2068 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2069 		/* Fallthrough */
2070 	case TCP_SEQ_STATE_ESTABLISHED:
2071 		if (st->bucket > tcp_hashinfo.ehash_mask)
2072 			break;
2073 		rc = established_get_first(seq);
2074 		while (offset-- && rc)
2075 			rc = established_get_next(seq, rc);
2076 	}
2077 
2078 	st->num = orig_num;
2079 
2080 	return rc;
2081 }
2082 
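/*
 * seq_file reads arrive in chunks.  st->last_pos remembers the position
 * handed out last time so that the next read can resume from the cached
 * (state, bucket, offset) via tcp_seek_last_pos() instead of walking
 * both hash tables from the beginning for every chunk.
 */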
2083 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2084 {
2085 	struct tcp_iter_state *st = seq->private;
2086 	void *rc;
2087 
2088 	if (*pos && *pos == st->last_pos) {
2089 		rc = tcp_seek_last_pos(seq);
2090 		if (rc)
2091 			goto out;
2092 	}
2093 
2094 	st->state = TCP_SEQ_STATE_LISTENING;
2095 	st->num = 0;
2096 	st->bucket = 0;
2097 	st->offset = 0;
2098 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2099 
2100 out:
2101 	st->last_pos = *pos;
2102 	return rc;
2103 }
2104 
2105 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2106 {
2107 	struct tcp_iter_state *st = seq->private;
2108 	void *rc = NULL;
2109 
2110 	if (v == SEQ_START_TOKEN) {
2111 		rc = tcp_get_idx(seq, 0);
2112 		goto out;
2113 	}
2114 
2115 	switch (st->state) {
2116 	case TCP_SEQ_STATE_LISTENING:
2117 		rc = listening_get_next(seq, v);
2118 		if (!rc) {
2119 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2120 			st->bucket = 0;
2121 			st->offset = 0;
2122 			rc	  = established_get_first(seq);
2123 		}
2124 		break;
2125 	case TCP_SEQ_STATE_ESTABLISHED:
2126 		rc = established_get_next(seq, v);
2127 		break;
2128 	}
2129 out:
2130 	++*pos;
2131 	st->last_pos = *pos;
2132 	return rc;
2133 }
2134 
2135 static void tcp_seq_stop(struct seq_file *seq, void *v)
2136 {
2137 	struct tcp_iter_state *st = seq->private;
2138 
2139 	switch (st->state) {
2140 	case TCP_SEQ_STATE_LISTENING:
2141 		if (v != SEQ_START_TOKEN)
2142 			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2143 		break;
2144 	case TCP_SEQ_STATE_ESTABLISHED:
2145 		if (v)
2146 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2147 		break;
2148 	}
2149 }
2150 
2151 int tcp_seq_open(struct inode *inode, struct file *file)
2152 {
2153 	struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2154 	struct tcp_iter_state *s;
2155 	int err;
2156 
2157 	err = seq_open_net(inode, file, &afinfo->seq_ops,
2158 			  sizeof(struct tcp_iter_state));
2159 	if (err < 0)
2160 		return err;
2161 
2162 	s = ((struct seq_file *)file->private_data)->private;
2163 	s->family		= afinfo->family;
2164 	s->last_pos		= 0;
2165 	return 0;
2166 }
2167 EXPORT_SYMBOL(tcp_seq_open);
2168 
2169 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2170 {
2171 	int rc = 0;
2172 	struct proc_dir_entry *p;
2173 
2174 	afinfo->seq_ops.start		= tcp_seq_start;
2175 	afinfo->seq_ops.next		= tcp_seq_next;
2176 	afinfo->seq_ops.stop		= tcp_seq_stop;
2177 
2178 	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2179 			     afinfo->seq_fops, afinfo);
2180 	if (!p)
2181 		rc = -ENOMEM;
2182 	return rc;
2183 }
2184 EXPORT_SYMBOL(tcp_proc_register);
2185 
2186 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2187 {
2188 	remove_proc_entry(afinfo->name, net->proc_net);
2189 }
2190 EXPORT_SYMBOL(tcp_proc_unregister);
2191 
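/*
 * Emit one /proc/net/tcp record for a SYN_RECV request socket.
 * Addresses and ports are printed in hexadecimal; columns that have no
 * meaning for a request socket (inode, for instance) are printed as 0
 * so that the line layout matches that of full sockets.
 */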
2192 static void get_openreq4(const struct request_sock *req,
2193 			 struct seq_file *f, int i)
2194 {
2195 	const struct inet_request_sock *ireq = inet_rsk(req);
2196 	long delta = req->rsk_timer.expires - jiffies;
2197 
2198 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2199 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2200 		i,
2201 		ireq->ir_loc_addr,
2202 		ireq->ir_num,
2203 		ireq->ir_rmt_addr,
2204 		ntohs(ireq->ir_rmt_port),
2205 		TCP_SYN_RECV,
2206 		0, 0, /* could print option size, but that is af dependent. */
2207 		1,    /* timers active (only the expire timer) */
2208 		jiffies_delta_to_clock_t(delta),
2209 		req->num_timeout,
2210 		from_kuid_munged(seq_user_ns(f),
2211 				 sock_i_uid(req->rsk_listener)),
2212 		0,  /* non-standard timer */
2213 		0, /* open_requests have no inode */
2214 		0,
2215 		req);
2216 }
2217 
2218 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2219 {
2220 	int timer_active;
2221 	unsigned long timer_expires;
2222 	const struct tcp_sock *tp = tcp_sk(sk);
2223 	const struct inet_connection_sock *icsk = inet_csk(sk);
2224 	const struct inet_sock *inet = inet_sk(sk);
2225 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2226 	__be32 dest = inet->inet_daddr;
2227 	__be32 src = inet->inet_rcv_saddr;
2228 	__u16 destp = ntohs(inet->inet_dport);
2229 	__u16 srcp = ntohs(inet->inet_sport);
2230 	int rx_queue;
2231 	int state;
2232 
2233 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2234 	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2235 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2236 		timer_active	= 1;
2237 		timer_expires	= icsk->icsk_timeout;
2238 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2239 		timer_active	= 4;
2240 		timer_expires	= icsk->icsk_timeout;
2241 	} else if (timer_pending(&sk->sk_timer)) {
2242 		timer_active	= 2;
2243 		timer_expires	= sk->sk_timer.expires;
2244 	} else {
2245 		timer_active	= 0;
2246 		timer_expires = jiffies;
2247 	}
2248 
2249 	state = sk_state_load(sk);
2250 	if (state == TCP_LISTEN)
2251 		rx_queue = sk->sk_ack_backlog;
2252 	else
2253 		/* Because we don't lock the socket,
2254 		 * we might find a transient negative value.
2255 		 */
2256 		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2257 
2258 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2259 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2260 		i, src, srcp, dest, destp, state,
2261 		tp->write_seq - tp->snd_una,
2262 		rx_queue,
2263 		timer_active,
2264 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2265 		icsk->icsk_retransmits,
2266 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2267 		icsk->icsk_probes_out,
2268 		sock_i_ino(sk),
2269 		atomic_read(&sk->sk_refcnt), sk,
2270 		jiffies_to_clock_t(icsk->icsk_rto),
2271 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2272 		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2273 		tp->snd_cwnd,
2274 		state == TCP_LISTEN ?
2275 		    fastopenq->max_qlen :
2276 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2277 }
2278 
2279 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2280 			       struct seq_file *f, int i)
2281 {
2282 	long delta = tw->tw_timer.expires - jiffies;
2283 	__be32 dest, src;
2284 	__u16 destp, srcp;
2285 
2286 	dest  = tw->tw_daddr;
2287 	src   = tw->tw_rcv_saddr;
2288 	destp = ntohs(tw->tw_dport);
2289 	srcp  = ntohs(tw->tw_sport);
2290 
2291 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2292 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2293 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2294 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2295 		atomic_read(&tw->tw_refcnt), tw);
2296 }
2297 
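/*
 * Each /proc/net/tcp record is padded to a fixed width: tcp4_seq_show()
 * sets the width to TMPSZ - 1 characters and seq_pad() appends the
 * padding plus the trailing newline, so userspace can rely on
 * fixed-length records.
 */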
2298 #define TMPSZ 150
2299 
2300 static int tcp4_seq_show(struct seq_file *seq, void *v)
2301 {
2302 	struct tcp_iter_state *st;
2303 	struct sock *sk = v;
2304 
2305 	seq_setwidth(seq, TMPSZ - 1);
2306 	if (v == SEQ_START_TOKEN) {
2307 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2308 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2309 			   "inode");
2310 		goto out;
2311 	}
2312 	st = seq->private;
2313 
2314 	if (sk->sk_state == TCP_TIME_WAIT)
2315 		get_timewait4_sock(v, seq, st->num);
2316 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2317 		get_openreq4(v, seq, st->num);
2318 	else
2319 		get_tcp4_sock(v, seq, st->num);
2320 out:
2321 	seq_pad(seq, '\n');
2322 	return 0;
2323 }
2324 
2325 static const struct file_operations tcp_afinfo_seq_fops = {
2326 	.owner   = THIS_MODULE,
2327 	.open    = tcp_seq_open,
2328 	.read    = seq_read,
2329 	.llseek  = seq_lseek,
2330 	.release = seq_release_net
2331 };
2332 
2333 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2334 	.name		= "tcp",
2335 	.family		= AF_INET,
2336 	.seq_fops	= &tcp_afinfo_seq_fops,
2337 	.seq_ops	= {
2338 		.show		= tcp4_seq_show,
2339 	},
2340 };
2341 
2342 static int __net_init tcp4_proc_init_net(struct net *net)
2343 {
2344 	return tcp_proc_register(net, &tcp4_seq_afinfo);
2345 }
2346 
2347 static void __net_exit tcp4_proc_exit_net(struct net *net)
2348 {
2349 	tcp_proc_unregister(net, &tcp4_seq_afinfo);
2350 }
2351 
2352 static struct pernet_operations tcp4_net_ops = {
2353 	.init = tcp4_proc_init_net,
2354 	.exit = tcp4_proc_exit_net,
2355 };
2356 
2357 int __init tcp4_proc_init(void)
2358 {
2359 	return register_pernet_subsys(&tcp4_net_ops);
2360 }
2361 
2362 void tcp4_proc_exit(void)
2363 {
2364 	unregister_pernet_subsys(&tcp4_net_ops);
2365 }
2366 #endif /* CONFIG_PROC_FS */
2367 
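/*
 * The IPv4 TCP protocol descriptor.  inet_create() attaches it to
 * SOCK_STREAM/IPPROTO_TCP sockets, turning the generic socket calls
 * (connect, sendmsg, setsockopt, ...) into the TCP implementation above.
 */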
2368 struct proto tcp_prot = {
2369 	.name			= "TCP",
2370 	.owner			= THIS_MODULE,
2371 	.close			= tcp_close,
2372 	.connect		= tcp_v4_connect,
2373 	.disconnect		= tcp_disconnect,
2374 	.accept			= inet_csk_accept,
2375 	.ioctl			= tcp_ioctl,
2376 	.init			= tcp_v4_init_sock,
2377 	.destroy		= tcp_v4_destroy_sock,
2378 	.shutdown		= tcp_shutdown,
2379 	.setsockopt		= tcp_setsockopt,
2380 	.getsockopt		= tcp_getsockopt,
2381 	.keepalive		= tcp_set_keepalive,
2382 	.recvmsg		= tcp_recvmsg,
2383 	.sendmsg		= tcp_sendmsg,
2384 	.sendpage		= tcp_sendpage,
2385 	.backlog_rcv		= tcp_v4_do_rcv,
2386 	.release_cb		= tcp_release_cb,
2387 	.hash			= inet_hash,
2388 	.unhash			= inet_unhash,
2389 	.get_port		= inet_csk_get_port,
2390 	.enter_memory_pressure	= tcp_enter_memory_pressure,
2391 	.stream_memory_free	= tcp_stream_memory_free,
2392 	.sockets_allocated	= &tcp_sockets_allocated,
2393 	.orphan_count		= &tcp_orphan_count,
2394 	.memory_allocated	= &tcp_memory_allocated,
2395 	.memory_pressure	= &tcp_memory_pressure,
2396 	.sysctl_mem		= sysctl_tcp_mem,
2397 	.sysctl_wmem		= sysctl_tcp_wmem,
2398 	.sysctl_rmem		= sysctl_tcp_rmem,
2399 	.max_header		= MAX_TCP_HEADER,
2400 	.obj_size		= sizeof(struct tcp_sock),
2401 	.slab_flags		= SLAB_DESTROY_BY_RCU,
2402 	.twsk_prot		= &tcp_timewait_sock_ops,
2403 	.rsk_prot		= &tcp_request_sock_ops,
2404 	.h.hashinfo		= &tcp_hashinfo,
2405 	.no_autobind		= true,
2406 #ifdef CONFIG_COMPAT
2407 	.compat_setsockopt	= compat_tcp_setsockopt,
2408 	.compat_getsockopt	= compat_tcp_getsockopt,
2409 #endif
2410 	.diag_destroy		= tcp_abort,
2411 };
2412 EXPORT_SYMBOL(tcp_prot);
2413 
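/*
 * Each namespace owns one control socket per possible CPU
 * (net->ipv4.tcp_sk).  These kernel-internal sockets are what
 * tcp_v4_send_reset() and tcp_v4_send_ack() transmit through when a RST
 * or ACK must be generated without a full socket to send it.
 */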
2414 static void __net_exit tcp_sk_exit(struct net *net)
2415 {
2416 	int cpu;
2417 
2418 	for_each_possible_cpu(cpu)
2419 		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2420 	free_percpu(net->ipv4.tcp_sk);
2421 }
2422 
2423 static int __net_init tcp_sk_init(struct net *net)
2424 {
2425 	int res, cpu, cnt;
2426 
2427 	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2428 	if (!net->ipv4.tcp_sk)
2429 		return -ENOMEM;
2430 
2431 	for_each_possible_cpu(cpu) {
2432 		struct sock *sk;
2433 
2434 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2435 					   IPPROTO_TCP, net);
2436 		if (res)
2437 			goto fail;
2438 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2439 		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2440 	}
2441 
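	/*
	 * Per-namespace sysctl defaults.  tcp_ecn == 2 means ECN is
	 * accepted when the peer asks for it but is not requested on
	 * outgoing connections; the keepalive, retry and TIME_WAIT limits
	 * below simply mirror the long-standing global defaults.
	 */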
2442 	net->ipv4.sysctl_tcp_ecn = 2;
2443 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
2444 
2445 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2446 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2447 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2448 
2449 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2450 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2451 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2452 
2453 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2454 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2455 	net->ipv4.sysctl_tcp_syncookies = 1;
2456 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2457 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2458 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2459 	net->ipv4.sysctl_tcp_orphan_retries = 0;
2460 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2461 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2462 	net->ipv4.sysctl_tcp_tw_reuse = 0;
2463 
2464 	cnt = tcp_hashinfo.ehash_mask + 1;
2465 	net->ipv4.tcp_death_row.sysctl_tw_recycle = 0;
2466 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
2467 	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2468 
2469 	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
2470 
2471 	return 0;
2472 fail:
2473 	tcp_sk_exit(net);
2474 
2475 	return res;
2476 }
2477 
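/*
 * Called once per batch of exiting namespaces: make sure no IPv4
 * TIME_WAIT socket outlives the namespaces being destroyed.
 */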
2478 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2479 {
2480 	inet_twsk_purge(&tcp_hashinfo, AF_INET);
2481 }
2482 
2483 static struct pernet_operations __net_initdata tcp_sk_ops = {
2484 	.init	   = tcp_sk_init,
2485 	.exit	   = tcp_sk_exit,
2486 	.exit_batch = tcp_sk_exit_batch,
2487 };
2488 
2489 void __init tcp_v4_init(void)
2490 {
2491 	if (register_pernet_subsys(&tcp_sk_ops))
2492 		panic("Failed to create the TCP control socket.\n");
2493 }
2494