xref: /linux/net/ipv4/tcp_ipv4.c (revision 4cb584e0ee7df70fd0376aee60cf701855ea8c81)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Implementation of the Transmission Control Protocol(TCP).
7  *
8  *		IPv4 specific functions
9  *
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  *
18  *	This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23 
24 /*
25  * Changes:
26  *		David S. Miller	:	New socket lookup architecture.
27  *					This code is dedicated to John Dyson.
28  *		David S. Miller :	Change semantics of established hash,
29  *					half is devoted to TIME_WAIT sockets
30  *					and the rest go in the other half.
31  *		Andi Kleen :		Add support for syncookies and fixed
32  *					some bugs: ip options weren't passed to
33  *					the TCP layer, missed a check for an
34  *					ACK bit.
35  *		Andi Kleen :		Implemented fast path mtu discovery.
36  *	     				Fixed many serious bugs in the
37  *					request_sock handling and moved
38  *					most of it into the af independent code.
39  *					Added tail drop and some other bugfixes.
40  *					Added new listen semantics.
41  *		Mike McLagan	:	Routing by source
42  *	Juan Jose Ciarlante:		ip_dynaddr bits
43  *		Andi Kleen:		various fixes.
44  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
45  *					coma.
46  *	Andi Kleen		:	Fix new listen.
47  *	Andi Kleen		:	Fix accept error reporting.
48  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
49  *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
50  *					to a single port at the same time.
51  */
52 
53 #define pr_fmt(fmt) "TCP: " fmt
54 
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65 
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/secure_seq.h>
76 #include <net/busy_poll.h>
77 
78 #include <linux/inet.h>
79 #include <linux/ipv6.h>
80 #include <linux/stddef.h>
81 #include <linux/proc_fs.h>
82 #include <linux/seq_file.h>
83 
84 #include <crypto/hash.h>
85 #include <linux/scatterlist.h>
86 
87 int sysctl_tcp_low_latency __read_mostly;
88 
89 #ifdef CONFIG_TCP_MD5SIG
90 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
91 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
92 #endif
93 
94 struct inet_hashinfo tcp_hashinfo;
95 EXPORT_SYMBOL(tcp_hashinfo);
96 
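/*
 * Pick the initial sequence number for the connection described by @skb:
 * secure_tcp_sequence_number() derives it from the 4-tuple mixed with a
 * secret so that it is hard to predict, and also reports the per-connection
 * timestamp offset through @tsoff.
 */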
97 static u32 tcp_v4_init_sequence(const struct sk_buff *skb, u32 *tsoff)
98 {
99 	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
100 					  ip_hdr(skb)->saddr,
101 					  tcp_hdr(skb)->dest,
102 					  tcp_hdr(skb)->source, tsoff);
103 }
104 
105 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
106 {
107 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
108 	struct tcp_sock *tp = tcp_sk(sk);
109 
110 	/* With PAWS, it is safe from the viewpoint
111 	   of data integrity. Even without PAWS it is safe provided sequence
112 	   spaces do not overlap, i.e. at data rates <= 80Mbit/sec.
113 
114 	   Actually, the idea is close to VJ's: the timestamp cache is held
115 	   not per host but per port pair, and the TW bucket is used as the
116 	   state holder.
117 
118 	   If the TW bucket has already been destroyed we fall back to VJ's
119 	   scheme and use the initial timestamp retrieved from the peer table.
120 	 */
121 	if (tcptw->tw_ts_recent_stamp &&
122 	    (!twp || (sock_net(sk)->ipv4.sysctl_tcp_tw_reuse &&
123 			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
124 		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
125 		if (tp->write_seq == 0)
126 			tp->write_seq = 1;
127 		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
128 		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
129 		sock_hold(sktw);
130 		return 1;
131 	}
132 
133 	return 0;
134 }
135 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
136 
137 /* This will initiate an outgoing connection. */
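/*
 * In outline: resolve a route to the destination (honouring a source
 * route in the IP options, if any), pick the source address, bind a
 * local port and hash the socket via inet_hash_connect(), choose the
 * initial sequence number, and finally send the SYN via tcp_connect().
 */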
138 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
139 {
140 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
141 	struct inet_sock *inet = inet_sk(sk);
142 	struct tcp_sock *tp = tcp_sk(sk);
143 	__be16 orig_sport, orig_dport;
144 	__be32 daddr, nexthop;
145 	struct flowi4 *fl4;
146 	struct rtable *rt;
147 	int err;
148 	struct ip_options_rcu *inet_opt;
149 	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
150 
151 	if (addr_len < sizeof(struct sockaddr_in))
152 		return -EINVAL;
153 
154 	if (usin->sin_family != AF_INET)
155 		return -EAFNOSUPPORT;
156 
157 	nexthop = daddr = usin->sin_addr.s_addr;
158 	inet_opt = rcu_dereference_protected(inet->inet_opt,
159 					     lockdep_sock_is_held(sk));
160 	if (inet_opt && inet_opt->opt.srr) {
161 		if (!daddr)
162 			return -EINVAL;
163 		nexthop = inet_opt->opt.faddr;
164 	}
165 
166 	orig_sport = inet->inet_sport;
167 	orig_dport = usin->sin_port;
168 	fl4 = &inet->cork.fl.u.ip4;
169 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
170 			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
171 			      IPPROTO_TCP,
172 			      orig_sport, orig_dport, sk);
173 	if (IS_ERR(rt)) {
174 		err = PTR_ERR(rt);
175 		if (err == -ENETUNREACH)
176 			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
177 		return err;
178 	}
179 
180 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
181 		ip_rt_put(rt);
182 		return -ENETUNREACH;
183 	}
184 
185 	if (!inet_opt || !inet_opt->opt.srr)
186 		daddr = fl4->daddr;
187 
188 	if (!inet->inet_saddr)
189 		inet->inet_saddr = fl4->saddr;
190 	sk_rcv_saddr_set(sk, inet->inet_saddr);
191 
192 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
193 		/* Reset inherited state */
194 		tp->rx_opt.ts_recent	   = 0;
195 		tp->rx_opt.ts_recent_stamp = 0;
196 		if (likely(!tp->repair))
197 			tp->write_seq	   = 0;
198 	}
199 
200 	if (tcp_death_row->sysctl_tw_recycle &&
201 	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
202 		tcp_fetch_timewait_stamp(sk, &rt->dst);
203 
204 	inet->inet_dport = usin->sin_port;
205 	sk_daddr_set(sk, daddr);
206 
207 	inet_csk(sk)->icsk_ext_hdr_len = 0;
208 	if (inet_opt)
209 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
210 
211 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
212 
213 	/* Socket identity is still unknown (sport may be zero).
214 	 * However we set the state to SYN-SENT and, without releasing the
215 	 * socket lock, select a source port, enter ourselves into the hash
216 	 * tables and complete initialization after this.
217 	 */
218 	tcp_set_state(sk, TCP_SYN_SENT);
219 	err = inet_hash_connect(tcp_death_row, sk);
220 	if (err)
221 		goto failure;
222 
223 	sk_set_txhash(sk);
224 
225 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
226 			       inet->inet_sport, inet->inet_dport, sk);
227 	if (IS_ERR(rt)) {
228 		err = PTR_ERR(rt);
229 		rt = NULL;
230 		goto failure;
231 	}
232 	/* OK, now commit destination to socket.  */
233 	sk->sk_gso_type = SKB_GSO_TCPV4;
234 	sk_setup_caps(sk, &rt->dst);
235 	rt = NULL;
236 
237 	if (!tp->write_seq && likely(!tp->repair))
238 		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
239 							   inet->inet_daddr,
240 							   inet->inet_sport,
241 							   usin->sin_port,
242 							   &tp->tsoffset);
243 
244 	inet->inet_id = tp->write_seq ^ jiffies;
245 
246 	if (tcp_fastopen_defer_connect(sk, &err))
247 		return err;
248 	if (err)
249 		goto failure;
250 
251 	err = tcp_connect(sk);
252 
253 	if (err)
254 		goto failure;
255 
256 	return 0;
257 
258 failure:
259 	/*
260 	 * This unhashes the socket and releases the local port,
261 	 * if necessary.
262 	 */
263 	tcp_set_state(sk, TCP_CLOSE);
264 	ip_rt_put(rt);
265 	sk->sk_route_caps = 0;
266 	inet->inet_dport = 0;
267 	return err;
268 }
269 EXPORT_SYMBOL(tcp_v4_connect);
270 
271 /*
272  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
273  * It can be called through tcp_release_cb() if socket was owned by user
274  * at the time tcp_v4_err() was called to handle ICMP message.
275  */
276 void tcp_v4_mtu_reduced(struct sock *sk)
277 {
278 	struct dst_entry *dst;
279 	struct inet_sock *inet = inet_sk(sk);
280 	u32 mtu = tcp_sk(sk)->mtu_info;
281 
282 	dst = inet_csk_update_pmtu(sk, mtu);
283 	if (!dst)
284 		return;
285 
286 	/* Something is about to go wrong... Remember the soft error
287 	 * in case this connection is not able to recover.
288 	 */
289 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
290 		sk->sk_err_soft = EMSGSIZE;
291 
292 	mtu = dst_mtu(dst);
293 
294 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
295 	    ip_sk_accept_pmtu(sk) &&
296 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
297 		tcp_sync_mss(sk, mtu);
298 
299 		/* Resend the TCP packet because it's
300 		 * clear that the old packet has been
301 		 * dropped. This is the new "fast" path mtu
302 		 * discovery.
303 		 */
304 		tcp_simple_retransmit(sk);
305 	} /* else let the usual retransmit timer handle it */
306 }
307 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
308 
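/* ICMP redirect: revalidate the cached route and, if it is still in
 * place, let its ->redirect() handler update the next hop.
 */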
309 static void do_redirect(struct sk_buff *skb, struct sock *sk)
310 {
311 	struct dst_entry *dst = __sk_dst_check(sk, 0);
312 
313 	if (dst)
314 		dst->ops->redirect(dst, sk, skb);
315 }
316 
317 
318 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
319 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
320 {
321 	struct request_sock *req = inet_reqsk(sk);
322 	struct net *net = sock_net(sk);
323 
324 	/* ICMPs are not backlogged, hence we cannot get
325 	 * an established socket here.
326 	 */
327 	if (seq != tcp_rsk(req)->snt_isn) {
328 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
329 	} else if (abort) {
330 		/*
331 		 * Still in SYN_RECV, just remove it silently.
332 		 * There is no good way to pass the error to the newly
333 		 * created socket, and POSIX does not want network
334 		 * errors returned from accept().
335 		 */
336 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
337 		tcp_listendrop(req->rsk_listener);
338 	}
339 	reqsk_put(req);
340 }
341 EXPORT_SYMBOL(tcp_req_err);
342 
343 /*
344  * This routine is called by the ICMP module when it gets some
345  * sort of error condition.  If err < 0 then the socket should
346  * be closed and the error returned to the user.  If err > 0
347  * it's just the icmp type << 8 | icmp code.  After adjustment,
348  * the header points to the first 8 bytes of the TCP header.  We need
349  * to find the appropriate port.
350  *
351  * The locking strategy used here is very "optimistic". When
352  * someone else accesses the socket the ICMP is just dropped
353  * and for some paths there is no check at all.
354  * A more general error queue to queue errors for later handling
355  * is probably better.
356  *
357  */
358 
359 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
360 {
361 	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
362 	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
363 	struct inet_connection_sock *icsk;
364 	struct tcp_sock *tp;
365 	struct inet_sock *inet;
366 	const int type = icmp_hdr(icmp_skb)->type;
367 	const int code = icmp_hdr(icmp_skb)->code;
368 	struct sock *sk;
369 	struct sk_buff *skb;
370 	struct request_sock *fastopen;
371 	__u32 seq, snd_una;
372 	__u32 remaining;
373 	int err;
374 	struct net *net = dev_net(icmp_skb->dev);
375 
376 	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
377 				       th->dest, iph->saddr, ntohs(th->source),
378 				       inet_iif(icmp_skb));
379 	if (!sk) {
380 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
381 		return;
382 	}
383 	if (sk->sk_state == TCP_TIME_WAIT) {
384 		inet_twsk_put(inet_twsk(sk));
385 		return;
386 	}
387 	seq = ntohl(th->seq);
388 	if (sk->sk_state == TCP_NEW_SYN_RECV)
389 		return tcp_req_err(sk, seq,
390 				  type == ICMP_PARAMETERPROB ||
391 				  type == ICMP_TIME_EXCEEDED ||
392 				  (type == ICMP_DEST_UNREACH &&
393 				   (code == ICMP_NET_UNREACH ||
394 				    code == ICMP_HOST_UNREACH)));
395 
396 	bh_lock_sock(sk);
397 	/* If too many ICMPs get dropped on busy
398 	 * servers this needs to be solved differently.
399 	 * We do take care of PMTU discovery (RFC1191) special case :
400 	 * we can receive locally generated ICMP messages while socket is held.
401 	 */
402 	if (sock_owned_by_user(sk)) {
403 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
404 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
405 	}
406 	if (sk->sk_state == TCP_CLOSE)
407 		goto out;
408 
409 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
410 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
411 		goto out;
412 	}
413 
414 	icsk = inet_csk(sk);
415 	tp = tcp_sk(sk);
416 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
417 	fastopen = tp->fastopen_rsk;
418 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
419 	if (sk->sk_state != TCP_LISTEN &&
420 	    !between(seq, snd_una, tp->snd_nxt)) {
421 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
422 		goto out;
423 	}
424 
425 	switch (type) {
426 	case ICMP_REDIRECT:
427 		do_redirect(icmp_skb, sk);
428 		goto out;
429 	case ICMP_SOURCE_QUENCH:
430 		/* Just silently ignore these. */
431 		goto out;
432 	case ICMP_PARAMETERPROB:
433 		err = EPROTO;
434 		break;
435 	case ICMP_DEST_UNREACH:
436 		if (code > NR_ICMP_UNREACH)
437 			goto out;
438 
439 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
440 			/* We are not interested in TCP_LISTEN and open_requests
441 			 * (SYN-ACKs sent out by Linux are always < 576 bytes so
442 			 * they should go through unfragmented).
443 			 */
444 			if (sk->sk_state == TCP_LISTEN)
445 				goto out;
446 
447 			tp->mtu_info = info;
448 			if (!sock_owned_by_user(sk)) {
449 				tcp_v4_mtu_reduced(sk);
450 			} else {
451 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
452 					sock_hold(sk);
453 			}
454 			goto out;
455 		}
456 
457 		err = icmp_err_convert[code].errno;
458 		/* check if icmp_skb allows revert of backoff
459 		 * (see draft-zimmermann-tcp-lcd) */
460 		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
461 			break;
462 		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
463 		    !icsk->icsk_backoff || fastopen)
464 			break;
465 
466 		if (sock_owned_by_user(sk))
467 			break;
468 
469 		icsk->icsk_backoff--;
470 		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
471 					       TCP_TIMEOUT_INIT;
472 		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
473 
474 		skb = tcp_write_queue_head(sk);
475 		BUG_ON(!skb);
476 
477 		remaining = icsk->icsk_rto -
478 			    min(icsk->icsk_rto,
479 				tcp_time_stamp - tcp_skb_timestamp(skb));
480 
481 		if (remaining) {
482 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
483 						  remaining, TCP_RTO_MAX);
484 		} else {
485 			/* RTO revert clocked out retransmission.
486 			 * Will retransmit now */
487 			tcp_retransmit_timer(sk);
488 		}
489 
490 		break;
491 	case ICMP_TIME_EXCEEDED:
492 		err = EHOSTUNREACH;
493 		break;
494 	default:
495 		goto out;
496 	}
497 
498 	switch (sk->sk_state) {
499 	case TCP_SYN_SENT:
500 	case TCP_SYN_RECV:
501 		/* Only in fast or simultaneous open. If a fast open socket
502 		 * is already accepted it is treated as a connected one below.
503 		 */
504 		if (fastopen && !fastopen->sk)
505 			break;
506 
507 		if (!sock_owned_by_user(sk)) {
508 			sk->sk_err = err;
509 
510 			sk->sk_error_report(sk);
511 
512 			tcp_done(sk);
513 		} else {
514 			sk->sk_err_soft = err;
515 		}
516 		goto out;
517 	}
518 
519 	/* If we've already connected we will keep trying
520 	 * until we time out, or the user gives up.
521 	 *
522 	 * RFC 1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH to be
523 	 * considered hard errors (well, FRAG_FAILED too, but that is
524 	 * obsoleted by PMTU discovery).
525 	 *
526 	 * Note that in the modern internet, where routing is unreliable and
527 	 * broken firewalls sit in every dark corner sending random errors
528 	 * ordered by their masters, even these two messages have lost their
529 	 * original meaning (even Linux sends invalid PORT_UNREACHs).
530 	 *
531 	 * Now we are in compliance with RFCs.
532 	 *							--ANK (980905)
533 	 */
534 
535 	inet = inet_sk(sk);
536 	if (!sock_owned_by_user(sk) && inet->recverr) {
537 		sk->sk_err = err;
538 		sk->sk_error_report(sk);
539 	} else	{ /* Only an error on timeout */
540 		sk->sk_err_soft = err;
541 	}
542 
543 out:
544 	bh_unlock_sock(sk);
545 	sock_put(sk);
546 }
547 
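/* Fill in the TCP checksum of an outgoing segment. For CHECKSUM_PARTIAL
 * only the pseudo-header sum is stored, and csum_start/csum_offset tell
 * the device (or the GSO path) where to finish the job; otherwise the
 * complete checksum is computed here in software.
 */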
548 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
549 {
550 	struct tcphdr *th = tcp_hdr(skb);
551 
552 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
553 		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
554 		skb->csum_start = skb_transport_header(skb) - skb->head;
555 		skb->csum_offset = offsetof(struct tcphdr, check);
556 	} else {
557 		th->check = tcp_v4_check(skb->len, saddr, daddr,
558 					 csum_partial(th,
559 						      th->doff << 2,
560 						      skb->csum));
561 	}
562 }
563 
564 /* This routine computes an IPv4 TCP checksum. */
565 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
566 {
567 	const struct inet_sock *inet = inet_sk(sk);
568 
569 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
570 }
571 EXPORT_SYMBOL(tcp_v4_send_check);
572 
573 /*
574  *	This routine will send an RST to the other tcp.
575  *
576  *	Someone asks: why do we NEVER use socket parameters (TOS, TTL etc.)
577  *		      for the reset?
578  *	Answer: if a packet caused the RST, it is not for a socket
579  *		existing in our system; if it does match a socket,
580  *		it is just a duplicate segment or a bug in the other
581  *		side's TCP. So we build the reply based only on the
582  *		parameters that arrived with the segment.
583  *	Exception: precedence violation. We do not implement it in any case.
584  */
585 
586 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
587 {
588 	const struct tcphdr *th = tcp_hdr(skb);
589 	struct {
590 		struct tcphdr th;
591 #ifdef CONFIG_TCP_MD5SIG
592 		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
593 #endif
594 	} rep;
595 	struct ip_reply_arg arg;
596 #ifdef CONFIG_TCP_MD5SIG
597 	struct tcp_md5sig_key *key = NULL;
598 	const __u8 *hash_location = NULL;
599 	unsigned char newhash[16];
600 	int genhash;
601 	struct sock *sk1 = NULL;
602 #endif
603 	struct net *net;
604 
605 	/* Never send a reset in response to a reset. */
606 	if (th->rst)
607 		return;
608 
609 	/* If sk is not NULL, it means we did a successful lookup and the
610 	 * incoming route had to be correct. prequeue might have dropped our dst.
611 	 */
612 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
613 		return;
614 
615 	/* Swap the send and the receive. */
616 	memset(&rep, 0, sizeof(rep));
617 	rep.th.dest   = th->source;
618 	rep.th.source = th->dest;
619 	rep.th.doff   = sizeof(struct tcphdr) / 4;
620 	rep.th.rst    = 1;
621 
622 	if (th->ack) {
623 		rep.th.seq = th->ack_seq;
624 	} else {
625 		rep.th.ack = 1;
626 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
627 				       skb->len - (th->doff << 2));
628 	}
629 
630 	memset(&arg, 0, sizeof(arg));
631 	arg.iov[0].iov_base = (unsigned char *)&rep;
632 	arg.iov[0].iov_len  = sizeof(rep.th);
633 
634 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
635 #ifdef CONFIG_TCP_MD5SIG
636 	rcu_read_lock();
637 	hash_location = tcp_parse_md5sig_option(th);
638 	if (sk && sk_fullsock(sk)) {
639 		key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
640 					&ip_hdr(skb)->saddr, AF_INET);
641 	} else if (hash_location) {
642 		/*
643 		 * The active side is gone. Try to find the listening socket
644 		 * through the source port, and then find the md5 key through
645 		 * that listening socket. We do not lose security here:
646 		 * the incoming packet is checked against the md5 hash with
647 		 * the key we find; no RST is generated if the hash doesn't match.
648 		 */
649 		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
650 					     ip_hdr(skb)->saddr,
651 					     th->source, ip_hdr(skb)->daddr,
652 					     ntohs(th->source), inet_iif(skb));
653 		/* don't send an RST if we can't find the key */
654 		if (!sk1)
655 			goto out;
656 
657 		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
658 					&ip_hdr(skb)->saddr, AF_INET);
659 		if (!key)
660 			goto out;
661 
662 
663 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
664 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
665 			goto out;
666 
667 	}
668 
669 	if (key) {
670 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
671 				   (TCPOPT_NOP << 16) |
672 				   (TCPOPT_MD5SIG << 8) |
673 				   TCPOLEN_MD5SIG);
674 		/* Update length and the length the header thinks exists */
675 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
676 		rep.th.doff = arg.iov[0].iov_len / 4;
677 
678 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
679 				     key, ip_hdr(skb)->saddr,
680 				     ip_hdr(skb)->daddr, &rep.th);
681 	}
682 #endif
683 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
684 				      ip_hdr(skb)->saddr, /* XXX */
685 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
686 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
687 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
688 
689 	/* When the socket is gone, all binding information is lost and
690 	 * routing might fail. No choice here: if we chose to force the input
691 	 * interface, we would misroute in the case of an asymmetric route.
692 	 */
693 	if (sk)
694 		arg.bound_dev_if = sk->sk_bound_dev_if;
695 
696 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
697 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
698 
699 	arg.tos = ip_hdr(skb)->tos;
700 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
701 	local_bh_disable();
702 	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
703 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
704 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
705 			      &arg, arg.iov[0].iov_len);
706 
707 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
708 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
709 	local_bh_enable();
710 
711 #ifdef CONFIG_TCP_MD5SIG
712 out:
713 	rcu_read_unlock();
714 #endif
715 }
716 
717 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
718    outside of socket context, is certainly ugly. What can I do?
719  */
720 
721 static void tcp_v4_send_ack(const struct sock *sk,
722 			    struct sk_buff *skb, u32 seq, u32 ack,
723 			    u32 win, u32 tsval, u32 tsecr, int oif,
724 			    struct tcp_md5sig_key *key,
725 			    int reply_flags, u8 tos)
726 {
727 	const struct tcphdr *th = tcp_hdr(skb);
728 	struct {
729 		struct tcphdr th;
730 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
731 #ifdef CONFIG_TCP_MD5SIG
732 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
733 #endif
734 			];
735 	} rep;
736 	struct net *net = sock_net(sk);
737 	struct ip_reply_arg arg;
738 
739 	memset(&rep.th, 0, sizeof(struct tcphdr));
740 	memset(&arg, 0, sizeof(arg));
741 
742 	arg.iov[0].iov_base = (unsigned char *)&rep;
743 	arg.iov[0].iov_len  = sizeof(rep.th);
744 	if (tsecr) {
745 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
746 				   (TCPOPT_TIMESTAMP << 8) |
747 				   TCPOLEN_TIMESTAMP);
748 		rep.opt[1] = htonl(tsval);
749 		rep.opt[2] = htonl(tsecr);
750 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
751 	}
752 
753 	/* Swap the send and the receive. */
754 	rep.th.dest    = th->source;
755 	rep.th.source  = th->dest;
756 	rep.th.doff    = arg.iov[0].iov_len / 4;
757 	rep.th.seq     = htonl(seq);
758 	rep.th.ack_seq = htonl(ack);
759 	rep.th.ack     = 1;
760 	rep.th.window  = htons(win);
761 
762 #ifdef CONFIG_TCP_MD5SIG
763 	if (key) {
764 		int offset = (tsecr) ? 3 : 0;
765 
766 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
767 					  (TCPOPT_NOP << 16) |
768 					  (TCPOPT_MD5SIG << 8) |
769 					  TCPOLEN_MD5SIG);
770 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
771 		rep.th.doff = arg.iov[0].iov_len/4;
772 
773 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
774 				    key, ip_hdr(skb)->saddr,
775 				    ip_hdr(skb)->daddr, &rep.th);
776 	}
777 #endif
778 	arg.flags = reply_flags;
779 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
780 				      ip_hdr(skb)->saddr, /* XXX */
781 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
782 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
783 	if (oif)
784 		arg.bound_dev_if = oif;
785 	arg.tos = tos;
786 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
787 	local_bh_disable();
788 	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
789 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
790 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
791 			      &arg, arg.iov[0].iov_len);
792 
793 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
794 	local_bh_enable();
795 }
796 
797 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
798 {
799 	struct inet_timewait_sock *tw = inet_twsk(sk);
800 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
801 
802 	tcp_v4_send_ack(sk, skb,
803 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
804 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
805 			tcp_time_stamp + tcptw->tw_ts_offset,
806 			tcptw->tw_ts_recent,
807 			tw->tw_bound_dev_if,
808 			tcp_twsk_md5_key(tcptw),
809 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
810 			tw->tw_tos
811 			);
812 
813 	inet_twsk_put(tw);
814 }
815 
816 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
817 				  struct request_sock *req)
818 {
819 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
820 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
821 	 */
822 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
823 					     tcp_sk(sk)->snd_nxt;
824 
825 	/* RFC 7323 2.3
826 	 * The window field (SEG.WND) of every outgoing segment, with the
827 	 * exception of <SYN> segments, MUST be right-shifted by
828 	 * Rcv.Wind.Shift bits:
829 	 */
830 	tcp_v4_send_ack(sk, skb, seq,
831 			tcp_rsk(req)->rcv_nxt,
832 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
833 			tcp_time_stamp + tcp_rsk(req)->ts_off,
834 			req->ts_recent,
835 			0,
836 			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
837 					  AF_INET),
838 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
839 			ip_hdr(skb)->tos);
840 }
841 
842 /*
843  *	Send a SYN-ACK after having received a SYN.
844  *	This still operates on a request_sock only, not on a big
845  *	socket.
846  */
847 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
848 			      struct flowi *fl,
849 			      struct request_sock *req,
850 			      struct tcp_fastopen_cookie *foc,
851 			      enum tcp_synack_type synack_type)
852 {
853 	const struct inet_request_sock *ireq = inet_rsk(req);
854 	struct flowi4 fl4;
855 	int err = -1;
856 	struct sk_buff *skb;
857 
858 	/* First, grab a route. */
859 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
860 		return -1;
861 
862 	skb = tcp_make_synack(sk, dst, req, foc, synack_type);
863 
864 	if (skb) {
865 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
866 
867 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
868 					    ireq->ir_rmt_addr,
869 					    ireq->opt);
870 		err = net_xmit_eval(err);
871 	}
872 
873 	return err;
874 }
875 
876 /*
877  *	IPv4 request_sock destructor.
878  */
879 static void tcp_v4_reqsk_destructor(struct request_sock *req)
880 {
881 	kfree(inet_rsk(req)->opt);
882 }
883 
884 #ifdef CONFIG_TCP_MD5SIG
885 /*
886  * RFC2385 MD5 checksumming requires a mapping of
887  * IP address->MD5 Key.
888  * We need to maintain these in the sk structure.
889  */
890 
891 /* Find the Key structure for an address.  */
892 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
893 					 const union tcp_md5_addr *addr,
894 					 int family)
895 {
896 	const struct tcp_sock *tp = tcp_sk(sk);
897 	struct tcp_md5sig_key *key;
898 	unsigned int size = sizeof(struct in_addr);
899 	const struct tcp_md5sig_info *md5sig;
900 
901 	/* caller either holds rcu_read_lock() or socket lock */
902 	md5sig = rcu_dereference_check(tp->md5sig_info,
903 				       lockdep_sock_is_held(sk));
904 	if (!md5sig)
905 		return NULL;
906 #if IS_ENABLED(CONFIG_IPV6)
907 	if (family == AF_INET6)
908 		size = sizeof(struct in6_addr);
909 #endif
910 	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
911 		if (key->family != family)
912 			continue;
913 		if (!memcmp(&key->addr, addr, size))
914 			return key;
915 	}
916 	return NULL;
917 }
918 EXPORT_SYMBOL(tcp_md5_do_lookup);
919 
920 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
921 					 const struct sock *addr_sk)
922 {
923 	const union tcp_md5_addr *addr;
924 
925 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
926 	return tcp_md5_do_lookup(sk, addr, AF_INET);
927 }
928 EXPORT_SYMBOL(tcp_v4_md5_lookup);
929 
930 /* This can be called on a newly created socket, from other files */
931 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
932 		   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
933 {
934 	/* Add Key to the list */
935 	struct tcp_md5sig_key *key;
936 	struct tcp_sock *tp = tcp_sk(sk);
937 	struct tcp_md5sig_info *md5sig;
938 
939 	key = tcp_md5_do_lookup(sk, addr, family);
940 	if (key) {
941 		/* Pre-existing entry - just update that one. */
942 		memcpy(key->key, newkey, newkeylen);
943 		key->keylen = newkeylen;
944 		return 0;
945 	}
946 
947 	md5sig = rcu_dereference_protected(tp->md5sig_info,
948 					   lockdep_sock_is_held(sk));
949 	if (!md5sig) {
950 		md5sig = kmalloc(sizeof(*md5sig), gfp);
951 		if (!md5sig)
952 			return -ENOMEM;
953 
954 		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
955 		INIT_HLIST_HEAD(&md5sig->head);
956 		rcu_assign_pointer(tp->md5sig_info, md5sig);
957 	}
958 
959 	key = sock_kmalloc(sk, sizeof(*key), gfp);
960 	if (!key)
961 		return -ENOMEM;
962 	if (!tcp_alloc_md5sig_pool()) {
963 		sock_kfree_s(sk, key, sizeof(*key));
964 		return -ENOMEM;
965 	}
966 
967 	memcpy(key->key, newkey, newkeylen);
968 	key->keylen = newkeylen;
969 	key->family = family;
970 	memcpy(&key->addr, addr,
971 	       (family == AF_INET6) ? sizeof(struct in6_addr) :
972 				      sizeof(struct in_addr));
973 	hlist_add_head_rcu(&key->node, &md5sig->head);
974 	return 0;
975 }
976 EXPORT_SYMBOL(tcp_md5_do_add);
977 
978 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
979 {
980 	struct tcp_md5sig_key *key;
981 
982 	key = tcp_md5_do_lookup(sk, addr, family);
983 	if (!key)
984 		return -ENOENT;
985 	hlist_del_rcu(&key->node);
986 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
987 	kfree_rcu(key, rcu);
988 	return 0;
989 }
990 EXPORT_SYMBOL(tcp_md5_do_del);
991 
992 static void tcp_clear_md5_list(struct sock *sk)
993 {
994 	struct tcp_sock *tp = tcp_sk(sk);
995 	struct tcp_md5sig_key *key;
996 	struct hlist_node *n;
997 	struct tcp_md5sig_info *md5sig;
998 
999 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1000 
1001 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1002 		hlist_del_rcu(&key->node);
1003 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1004 		kfree_rcu(key, rcu);
1005 	}
1006 }
1007 
1008 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1009 				 int optlen)
1010 {
1011 	struct tcp_md5sig cmd;
1012 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1013 
1014 	if (optlen < sizeof(cmd))
1015 		return -EINVAL;
1016 
1017 	if (copy_from_user(&cmd, optval, sizeof(cmd)))
1018 		return -EFAULT;
1019 
1020 	if (sin->sin_family != AF_INET)
1021 		return -EINVAL;
1022 
1023 	if (!cmd.tcpm_keylen)
1024 		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1025 				      AF_INET);
1026 
1027 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1028 		return -EINVAL;
1029 
1030 	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1031 			      AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1032 			      GFP_KERNEL);
1033 }
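
/*
 * Illustrative userspace sketch (not part of this file): the parser above
 * services the TCP_MD5SIG socket option, which an application might set
 * on a socket roughly like this before connect()/listen():
 *
 *	struct tcp_md5sig md5sig;
 *	struct sockaddr_in *peer = (struct sockaddr_in *)&md5sig.tcpm_addr;
 *
 *	memset(&md5sig, 0, sizeof(md5sig));
 *	peer->sin_family = AF_INET;
 *	peer->sin_addr.s_addr = inet_addr("192.0.2.1");
 *	md5sig.tcpm_keylen = 6;
 *	memcpy(md5sig.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5sig, sizeof(md5sig));
 *
 * Passing tcpm_keylen == 0 for an address removes that key (see above).
 */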
1034 
1035 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1036 				   __be32 daddr, __be32 saddr,
1037 				   const struct tcphdr *th, int nbytes)
1038 {
1039 	struct tcp4_pseudohdr *bp;
1040 	struct scatterlist sg;
1041 	struct tcphdr *_th;
1042 
1043 	bp = hp->scratch;
1044 	bp->saddr = saddr;
1045 	bp->daddr = daddr;
1046 	bp->pad = 0;
1047 	bp->protocol = IPPROTO_TCP;
1048 	bp->len = cpu_to_be16(nbytes);
1049 
1050 	_th = (struct tcphdr *)(bp + 1);
1051 	memcpy(_th, th, sizeof(*th));
1052 	_th->check = 0;
1053 
1054 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1055 	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1056 				sizeof(*bp) + sizeof(*th));
1057 	return crypto_ahash_update(hp->md5_req);
1058 }
1059 
1060 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1061 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1062 {
1063 	struct tcp_md5sig_pool *hp;
1064 	struct ahash_request *req;
1065 
1066 	hp = tcp_get_md5sig_pool();
1067 	if (!hp)
1068 		goto clear_hash_noput;
1069 	req = hp->md5_req;
1070 
1071 	if (crypto_ahash_init(req))
1072 		goto clear_hash;
1073 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1074 		goto clear_hash;
1075 	if (tcp_md5_hash_key(hp, key))
1076 		goto clear_hash;
1077 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1078 	if (crypto_ahash_final(req))
1079 		goto clear_hash;
1080 
1081 	tcp_put_md5sig_pool();
1082 	return 0;
1083 
1084 clear_hash:
1085 	tcp_put_md5sig_pool();
1086 clear_hash_noput:
1087 	memset(md5_hash, 0, 16);
1088 	return 1;
1089 }
1090 
1091 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1092 			const struct sock *sk,
1093 			const struct sk_buff *skb)
1094 {
1095 	struct tcp_md5sig_pool *hp;
1096 	struct ahash_request *req;
1097 	const struct tcphdr *th = tcp_hdr(skb);
1098 	__be32 saddr, daddr;
1099 
1100 	if (sk) { /* valid for establish/request sockets */
1101 		saddr = sk->sk_rcv_saddr;
1102 		daddr = sk->sk_daddr;
1103 	} else {
1104 		const struct iphdr *iph = ip_hdr(skb);
1105 		saddr = iph->saddr;
1106 		daddr = iph->daddr;
1107 	}
1108 
1109 	hp = tcp_get_md5sig_pool();
1110 	if (!hp)
1111 		goto clear_hash_noput;
1112 	req = hp->md5_req;
1113 
1114 	if (crypto_ahash_init(req))
1115 		goto clear_hash;
1116 
1117 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1118 		goto clear_hash;
1119 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1120 		goto clear_hash;
1121 	if (tcp_md5_hash_key(hp, key))
1122 		goto clear_hash;
1123 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1124 	if (crypto_ahash_final(req))
1125 		goto clear_hash;
1126 
1127 	tcp_put_md5sig_pool();
1128 	return 0;
1129 
1130 clear_hash:
1131 	tcp_put_md5sig_pool();
1132 clear_hash_noput:
1133 	memset(md5_hash, 0, 16);
1134 	return 1;
1135 }
1136 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1137 
1138 #endif
1139 
1140 /* Called with rcu_read_lock() */
1141 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1142 				    const struct sk_buff *skb)
1143 {
1144 #ifdef CONFIG_TCP_MD5SIG
1145 	/*
1146 	 * This gets called for each TCP segment that arrives
1147 	 * so we want to be efficient.
1148 	 * We have 3 drop cases:
1149 	 * o No MD5 hash and one expected.
1150 	 * o MD5 hash and we're not expecting one.
1151 	 * o MD5 hash and it's wrong.
1152 	 */
1153 	const __u8 *hash_location = NULL;
1154 	struct tcp_md5sig_key *hash_expected;
1155 	const struct iphdr *iph = ip_hdr(skb);
1156 	const struct tcphdr *th = tcp_hdr(skb);
1157 	int genhash;
1158 	unsigned char newhash[16];
1159 
1160 	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1161 					  AF_INET);
1162 	hash_location = tcp_parse_md5sig_option(th);
1163 
1164 	/* We've parsed the options - do we have a hash? */
1165 	if (!hash_expected && !hash_location)
1166 		return false;
1167 
1168 	if (hash_expected && !hash_location) {
1169 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1170 		return true;
1171 	}
1172 
1173 	if (!hash_expected && hash_location) {
1174 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1175 		return true;
1176 	}
1177 
1178 	/* Okay, so this is hash_expected and hash_location -
1179 	 * so we need to calculate the checksum.
1180 	 */
1181 	genhash = tcp_v4_md5_hash_skb(newhash,
1182 				      hash_expected,
1183 				      NULL, skb);
1184 
1185 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1186 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1187 		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1188 				     &iph->saddr, ntohs(th->source),
1189 				     &iph->daddr, ntohs(th->dest),
1190 				     genhash ? " tcp_v4_calc_md5_hash failed"
1191 				     : "");
1192 		return true;
1193 	}
1194 	return false;
1195 #endif
1196 	return false;
1197 }
1198 
1199 static void tcp_v4_init_req(struct request_sock *req,
1200 			    const struct sock *sk_listener,
1201 			    struct sk_buff *skb)
1202 {
1203 	struct inet_request_sock *ireq = inet_rsk(req);
1204 
1205 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1206 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1207 	ireq->opt = tcp_v4_save_options(skb);
1208 }
1209 
1210 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1211 					  struct flowi *fl,
1212 					  const struct request_sock *req,
1213 					  bool *strict)
1214 {
1215 	struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);
1216 
1217 	if (strict) {
1218 		if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
1219 			*strict = true;
1220 		else
1221 			*strict = false;
1222 	}
1223 
1224 	return dst;
1225 }
1226 
1227 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1228 	.family		=	PF_INET,
1229 	.obj_size	=	sizeof(struct tcp_request_sock),
1230 	.rtx_syn_ack	=	tcp_rtx_synack,
1231 	.send_ack	=	tcp_v4_reqsk_send_ack,
1232 	.destructor	=	tcp_v4_reqsk_destructor,
1233 	.send_reset	=	tcp_v4_send_reset,
1234 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1235 };
1236 
1237 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1238 	.mss_clamp	=	TCP_MSS_DEFAULT,
1239 #ifdef CONFIG_TCP_MD5SIG
1240 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1241 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1242 #endif
1243 	.init_req	=	tcp_v4_init_req,
1244 #ifdef CONFIG_SYN_COOKIES
1245 	.cookie_init_seq =	cookie_v4_init_sequence,
1246 #endif
1247 	.route_req	=	tcp_v4_route_req,
1248 	.init_seq	=	tcp_v4_init_sequence,
1249 	.send_synack	=	tcp_v4_send_synack,
1250 };
1251 
1252 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1253 {
1254 	/* Never answer SYNs sent to broadcast or multicast */
1255 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1256 		goto drop;
1257 
1258 	return tcp_conn_request(&tcp_request_sock_ops,
1259 				&tcp_request_sock_ipv4_ops, sk, skb);
1260 
1261 drop:
1262 	tcp_listendrop(sk);
1263 	return 0;
1264 }
1265 EXPORT_SYMBOL(tcp_v4_conn_request);
1266 
1267 
1268 /*
1269  * The three-way handshake has completed - we got a valid ACK -
1270  * now create the new socket.
1271  */
1272 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1273 				  struct request_sock *req,
1274 				  struct dst_entry *dst,
1275 				  struct request_sock *req_unhash,
1276 				  bool *own_req)
1277 {
1278 	struct inet_request_sock *ireq;
1279 	struct inet_sock *newinet;
1280 	struct tcp_sock *newtp;
1281 	struct sock *newsk;
1282 #ifdef CONFIG_TCP_MD5SIG
1283 	struct tcp_md5sig_key *key;
1284 #endif
1285 	struct ip_options_rcu *inet_opt;
1286 
1287 	if (sk_acceptq_is_full(sk))
1288 		goto exit_overflow;
1289 
1290 	newsk = tcp_create_openreq_child(sk, req, skb);
1291 	if (!newsk)
1292 		goto exit_nonewsk;
1293 
1294 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1295 	inet_sk_rx_dst_set(newsk, skb);
1296 
1297 	newtp		      = tcp_sk(newsk);
1298 	newinet		      = inet_sk(newsk);
1299 	ireq		      = inet_rsk(req);
1300 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1301 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1302 	newsk->sk_bound_dev_if = ireq->ir_iif;
1303 	newinet->inet_saddr	      = ireq->ir_loc_addr;
1304 	inet_opt	      = ireq->opt;
1305 	rcu_assign_pointer(newinet->inet_opt, inet_opt);
1306 	ireq->opt	      = NULL;
1307 	newinet->mc_index     = inet_iif(skb);
1308 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1309 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1310 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1311 	if (inet_opt)
1312 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1313 	newinet->inet_id = newtp->write_seq ^ jiffies;
1314 
1315 	if (!dst) {
1316 		dst = inet_csk_route_child_sock(sk, newsk, req);
1317 		if (!dst)
1318 			goto put_and_exit;
1319 	} else {
1320 		/* syncookie case : see end of cookie_v4_check() */
1321 	}
1322 	sk_setup_caps(newsk, dst);
1323 
1324 	tcp_ca_openreq_child(newsk, dst);
1325 
1326 	tcp_sync_mss(newsk, dst_mtu(dst));
1327 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1328 
1329 	tcp_initialize_rcv_mss(newsk);
1330 
1331 #ifdef CONFIG_TCP_MD5SIG
1332 	/* Copy over the MD5 key from the original socket */
1333 	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1334 				AF_INET);
1335 	if (key) {
1336 		/*
1337 		 * We're using one, so create a matching key
1338 		 * on the newsk structure. If we fail to get
1339 		 * memory, then we end up not copying the key
1340 		 * across. Shucks.
1341 		 */
1342 		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1343 			       AF_INET, key->key, key->keylen, GFP_ATOMIC);
1344 		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1345 	}
1346 #endif
1347 
1348 	if (__inet_inherit_port(sk, newsk) < 0)
1349 		goto put_and_exit;
1350 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1351 	if (*own_req)
1352 		tcp_move_syn(newtp, req);
1353 
1354 	return newsk;
1355 
1356 exit_overflow:
1357 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1358 exit_nonewsk:
1359 	dst_release(dst);
1360 exit:
1361 	tcp_listendrop(sk);
1362 	return NULL;
1363 put_and_exit:
1364 	inet_csk_prepare_forced_close(newsk);
1365 	tcp_done(newsk);
1366 	goto exit;
1367 }
1368 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1369 
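/* On a listener, a non-SYN segment may be the ACK completing a handshake
 * we answered with a SYN cookie; cookie_v4_check() validates it and, if
 * it checks out, hands back a freshly created child socket in place of
 * the listener.
 */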
1370 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1371 {
1372 #ifdef CONFIG_SYN_COOKIES
1373 	const struct tcphdr *th = tcp_hdr(skb);
1374 
1375 	if (!th->syn)
1376 		sk = cookie_v4_check(sk, skb);
1377 #endif
1378 	return sk;
1379 }
1380 
1381 /* The socket must have its spinlock held when we get
1382  * here, unless it is a TCP_LISTEN socket.
1383  *
1384  * We have a potential double-lock case here, so even when
1385  * doing backlog processing we use the BH locking scheme.
1386  * This is because we cannot sleep with the original spinlock
1387  * held.
1388  */
1389 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1390 {
1391 	struct sock *rsk;
1392 
1393 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1394 		struct dst_entry *dst = sk->sk_rx_dst;
1395 
1396 		sock_rps_save_rxhash(sk, skb);
1397 		sk_mark_napi_id(sk, skb);
1398 		if (dst) {
1399 			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1400 			    !dst->ops->check(dst, 0)) {
1401 				dst_release(dst);
1402 				sk->sk_rx_dst = NULL;
1403 			}
1404 		}
1405 		tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
1406 		return 0;
1407 	}
1408 
1409 	if (tcp_checksum_complete(skb))
1410 		goto csum_err;
1411 
1412 	if (sk->sk_state == TCP_LISTEN) {
1413 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1414 
1415 		if (!nsk)
1416 			goto discard;
1417 		if (nsk != sk) {
1418 			sock_rps_save_rxhash(nsk, skb);
1419 			sk_mark_napi_id(nsk, skb);
1420 			if (tcp_child_process(sk, nsk, skb)) {
1421 				rsk = nsk;
1422 				goto reset;
1423 			}
1424 			return 0;
1425 		}
1426 	} else
1427 		sock_rps_save_rxhash(sk, skb);
1428 
1429 	if (tcp_rcv_state_process(sk, skb)) {
1430 		rsk = sk;
1431 		goto reset;
1432 	}
1433 	return 0;
1434 
1435 reset:
1436 	tcp_v4_send_reset(rsk, skb);
1437 discard:
1438 	kfree_skb(skb);
1439 	/* Be careful here. If this function gets more complicated and
1440 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1441 	 * might be destroyed here. This current version compiles correctly,
1442 	 * but you have been warned.
1443 	 */
1444 	return 0;
1445 
1446 csum_err:
1447 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1448 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1449 	goto discard;
1450 }
1451 EXPORT_SYMBOL(tcp_v4_do_rcv);
1452 
1453 void tcp_v4_early_demux(struct sk_buff *skb)
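/* Early demux, run from the IP receive path before routing: look up an
 * established socket by the packet's 4-tuple and, if one is found, steer
 * the skb to it and reuse the socket's cached input route (sk_rx_dst) so
 * the regular route lookup can be skipped.
 */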
1454 {
1455 	const struct iphdr *iph;
1456 	const struct tcphdr *th;
1457 	struct sock *sk;
1458 
1459 	if (skb->pkt_type != PACKET_HOST)
1460 		return;
1461 
1462 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1463 		return;
1464 
1465 	iph = ip_hdr(skb);
1466 	th = tcp_hdr(skb);
1467 
1468 	if (th->doff < sizeof(struct tcphdr) / 4)
1469 		return;
1470 
1471 	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1472 				       iph->saddr, th->source,
1473 				       iph->daddr, ntohs(th->dest),
1474 				       skb->skb_iif);
1475 	if (sk) {
1476 		skb->sk = sk;
1477 		skb->destructor = sock_edemux;
1478 		if (sk_fullsock(sk)) {
1479 			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1480 
1481 			if (dst)
1482 				dst = dst_check(dst, 0);
1483 			if (dst &&
1484 			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1485 				skb_dst_set_noref(skb, dst);
1486 		}
1487 	}
1488 }
1489 
1490 /* Packet is added to VJ-style prequeue for processing in process
1491  * context, if a reader task is waiting. Apparently, this exciting
1492  * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1493  * failed somewhere. Latency? Burstiness? Well, at least now we will
1494  * see why it failed. 8)8)				  --ANK
1495  *
1496  */
1497 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1498 {
1499 	struct tcp_sock *tp = tcp_sk(sk);
1500 
1501 	if (sysctl_tcp_low_latency || !tp->ucopy.task)
1502 		return false;
1503 
1504 	if (skb->len <= tcp_hdrlen(skb) &&
1505 	    skb_queue_len(&tp->ucopy.prequeue) == 0)
1506 		return false;
1507 
1508 	/* Before escaping the RCU protected region, we need to take care of
1509 	 * the skb dst. Prequeue is only enabled for established sockets.
1510 	 * For such sockets, we might need the skb dst only to set sk->sk_rx_dst.
1511 	 * Instead of doing a full sk_rx_dst validity check here, let's perform
1512 	 * an optimistic one.
1513 	 */
1514 	if (likely(sk->sk_rx_dst))
1515 		skb_dst_drop(skb);
1516 	else
1517 		skb_dst_force_safe(skb);
1518 
1519 	__skb_queue_tail(&tp->ucopy.prequeue, skb);
1520 	tp->ucopy.memory += skb->truesize;
1521 	if (skb_queue_len(&tp->ucopy.prequeue) >= 32 ||
1522 	    tp->ucopy.memory + atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) {
1523 		struct sk_buff *skb1;
1524 
1525 		BUG_ON(sock_owned_by_user(sk));
1526 		__NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUEDROPPED,
1527 				skb_queue_len(&tp->ucopy.prequeue));
1528 
1529 		while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1530 			sk_backlog_rcv(sk, skb1);
1531 
1532 		tp->ucopy.memory = 0;
1533 	} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
1534 		wake_up_interruptible_sync_poll(sk_sleep(sk),
1535 					   POLLIN | POLLRDNORM | POLLRDBAND);
1536 		if (!inet_csk_ack_scheduled(sk))
1537 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1538 						  (3 * tcp_rto_min(sk)) / 4,
1539 						  TCP_RTO_MAX);
1540 	}
1541 	return true;
1542 }
1543 EXPORT_SYMBOL(tcp_prequeue);
1544 
1545 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1546 {
1547 	u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
1548 
1549 	/* Only the socket owner can try to collapse/prune the rx queues
1550 	 * to reduce memory overhead, so add a little headroom here.
1551 	 * Only a few socket backlogs are likely to be non-empty concurrently.
1552 	 */
1553 	limit += 64*1024;
1554 
1555 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1556 	 * we can fix skb->truesize to its real value to avoid future drops.
1557 	 * This is valid because skb is not yet charged to the socket.
1558 	 * It has been noticed that pure SACK packets were sometimes dropped
1559 	 * (if cooked by drivers without copybreak feature).
1560 	 */
1561 	skb_condense(skb);
1562 
1563 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
1564 		bh_unlock_sock(sk);
1565 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1566 		return true;
1567 	}
1568 	return false;
1569 }
1570 EXPORT_SYMBOL(tcp_add_backlog);
1571 
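/* Run the socket's filter (if any) over the segment. sk_filter_trim_cap()
 * may trim the payload but never below the TCP header; if it does, shrink
 * end_seq so the sequence space still matches what is left of the skb.
 */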
1572 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1573 {
1574 	struct tcphdr *th = (struct tcphdr *)skb->data;
1575 	unsigned int eaten = skb->len;
1576 	int err;
1577 
1578 	err = sk_filter_trim_cap(sk, skb, th->doff * 4);
1579 	if (!err) {
1580 		eaten -= skb->len;
1581 		TCP_SKB_CB(skb)->end_seq -= eaten;
1582 	}
1583 	return err;
1584 }
1585 EXPORT_SYMBOL(tcp_filter);
1586 
1587 /*
1588  *	From tcp_input.c
1589  */
1590 
1591 int tcp_v4_rcv(struct sk_buff *skb)
1592 {
1593 	struct net *net = dev_net(skb->dev);
1594 	const struct iphdr *iph;
1595 	const struct tcphdr *th;
1596 	bool refcounted;
1597 	struct sock *sk;
1598 	int ret;
1599 
1600 	if (skb->pkt_type != PACKET_HOST)
1601 		goto discard_it;
1602 
1603 	/* Count it even if it's bad */
1604 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
1605 
1606 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1607 		goto discard_it;
1608 
1609 	th = (const struct tcphdr *)skb->data;
1610 
1611 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1612 		goto bad_packet;
1613 	if (!pskb_may_pull(skb, th->doff * 4))
1614 		goto discard_it;
1615 
1616 	/* An explanation is required here, I think.
1617 	 * Packet length and doff are validated by header prediction,
1618 	 * provided the case of th->doff==0 is eliminated.
1619 	 * So, we defer the checks. */
1620 
1621 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1622 		goto csum_error;
1623 
1624 	th = (const struct tcphdr *)skb->data;
1625 	iph = ip_hdr(skb);
1626 	/* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1627 	 * barrier() makes sure the compiler won't play fool^Waliasing games.
1628 	 */
1629 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1630 		sizeof(struct inet_skb_parm));
1631 	barrier();
1632 
1633 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1634 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1635 				    skb->len - th->doff * 4);
1636 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1637 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1638 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1639 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1640 	TCP_SKB_CB(skb)->sacked	 = 0;
1641 
1642 lookup:
1643 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1644 			       th->dest, &refcounted);
1645 	if (!sk)
1646 		goto no_tcp_socket;
1647 
1648 process:
1649 	if (sk->sk_state == TCP_TIME_WAIT)
1650 		goto do_time_wait;
1651 
1652 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
1653 		struct request_sock *req = inet_reqsk(sk);
1654 		struct sock *nsk;
1655 
1656 		sk = req->rsk_listener;
1657 		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1658 			sk_drops_add(sk, skb);
1659 			reqsk_put(req);
1660 			goto discard_it;
1661 		}
1662 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
1663 			inet_csk_reqsk_queue_drop_and_put(sk, req);
1664 			goto lookup;
1665 		}
1666 		/* We own a reference on the listener, increase it again
1667 		 * as we might lose it too soon.
1668 		 */
1669 		sock_hold(sk);
1670 		refcounted = true;
1671 		nsk = tcp_check_req(sk, skb, req, false);
1672 		if (!nsk) {
1673 			reqsk_put(req);
1674 			goto discard_and_relse;
1675 		}
1676 		if (nsk == sk) {
1677 			reqsk_put(req);
1678 		} else if (tcp_child_process(sk, nsk, skb)) {
1679 			tcp_v4_send_reset(nsk, skb);
1680 			goto discard_and_relse;
1681 		} else {
1682 			sock_put(sk);
1683 			return 0;
1684 		}
1685 	}
1686 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1687 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1688 		goto discard_and_relse;
1689 	}
1690 
1691 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1692 		goto discard_and_relse;
1693 
1694 	if (tcp_v4_inbound_md5_hash(sk, skb))
1695 		goto discard_and_relse;
1696 
1697 	nf_reset(skb);
1698 
1699 	if (tcp_filter(sk, skb))
1700 		goto discard_and_relse;
1701 	th = (const struct tcphdr *)skb->data;
1702 	iph = ip_hdr(skb);
1703 
1704 	skb->dev = NULL;
1705 
1706 	if (sk->sk_state == TCP_LISTEN) {
1707 		ret = tcp_v4_do_rcv(sk, skb);
1708 		goto put_and_return;
1709 	}
1710 
1711 	sk_incoming_cpu_update(sk);
1712 
1713 	bh_lock_sock_nested(sk);
1714 	tcp_segs_in(tcp_sk(sk), skb);
1715 	ret = 0;
1716 	if (!sock_owned_by_user(sk)) {
1717 		if (!tcp_prequeue(sk, skb))
1718 			ret = tcp_v4_do_rcv(sk, skb);
1719 	} else if (tcp_add_backlog(sk, skb)) {
1720 		goto discard_and_relse;
1721 	}
1722 	bh_unlock_sock(sk);
1723 
1724 put_and_return:
1725 	if (refcounted)
1726 		sock_put(sk);
1727 
1728 	return ret;
1729 
1730 no_tcp_socket:
1731 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1732 		goto discard_it;
1733 
1734 	if (tcp_checksum_complete(skb)) {
1735 csum_error:
1736 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1737 bad_packet:
1738 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
1739 	} else {
1740 		tcp_v4_send_reset(NULL, skb);
1741 	}
1742 
1743 discard_it:
1744 	/* Discard frame. */
1745 	kfree_skb(skb);
1746 	return 0;
1747 
1748 discard_and_relse:
1749 	sk_drops_add(sk, skb);
1750 	if (refcounted)
1751 		sock_put(sk);
1752 	goto discard_it;
1753 
1754 do_time_wait:
1755 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1756 		inet_twsk_put(inet_twsk(sk));
1757 		goto discard_it;
1758 	}
1759 
1760 	if (tcp_checksum_complete(skb)) {
1761 		inet_twsk_put(inet_twsk(sk));
1762 		goto csum_error;
1763 	}
1764 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1765 	case TCP_TW_SYN: {
1766 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1767 							&tcp_hashinfo, skb,
1768 							__tcp_hdrlen(th),
1769 							iph->saddr, th->source,
1770 							iph->daddr, th->dest,
1771 							inet_iif(skb));
1772 		if (sk2) {
1773 			inet_twsk_deschedule_put(inet_twsk(sk));
1774 			sk = sk2;
1775 			refcounted = false;
1776 			goto process;
1777 		}
1778 		/* Fall through to ACK */
1779 	}
1780 	case TCP_TW_ACK:
1781 		tcp_v4_timewait_ack(sk, skb);
1782 		break;
1783 	case TCP_TW_RST:
1784 		tcp_v4_send_reset(sk, skb);
1785 		inet_twsk_deschedule_put(inet_twsk(sk));
1786 		goto discard_it;
1787 	case TCP_TW_SUCCESS:;
1788 	}
1789 	goto discard_it;
1790 }
1791 
1792 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1793 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
1794 	.twsk_unique	= tcp_twsk_unique,
1795 	.twsk_destructor= tcp_twsk_destructor,
1796 };
1797 
1798 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1799 {
1800 	struct dst_entry *dst = skb_dst(skb);
1801 
1802 	if (dst && dst_hold_safe(dst)) {
1803 		sk->sk_rx_dst = dst;
1804 		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1805 	}
1806 }
1807 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1808 
1809 const struct inet_connection_sock_af_ops ipv4_specific = {
1810 	.queue_xmit	   = ip_queue_xmit,
1811 	.send_check	   = tcp_v4_send_check,
1812 	.rebuild_header	   = inet_sk_rebuild_header,
1813 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
1814 	.conn_request	   = tcp_v4_conn_request,
1815 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
1816 	.net_header_len	   = sizeof(struct iphdr),
1817 	.setsockopt	   = ip_setsockopt,
1818 	.getsockopt	   = ip_getsockopt,
1819 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
1820 	.sockaddr_len	   = sizeof(struct sockaddr_in),
1821 #ifdef CONFIG_COMPAT
1822 	.compat_setsockopt = compat_ip_setsockopt,
1823 	.compat_getsockopt = compat_ip_getsockopt,
1824 #endif
1825 	.mtu_reduced	   = tcp_v4_mtu_reduced,
1826 };
1827 EXPORT_SYMBOL(ipv4_specific);
1828 
1829 #ifdef CONFIG_TCP_MD5SIG
1830 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1831 	.md5_lookup		= tcp_v4_md5_lookup,
1832 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
1833 	.md5_parse		= tcp_v4_parse_md5_keys,
1834 };
1835 #endif
1836 
1837 /* NOTE: Many fields are already zeroed by the call to sk_alloc(),
1838  *       so they need not be initialised again here.
1839  */
1840 static int tcp_v4_init_sock(struct sock *sk)
1841 {
1842 	struct inet_connection_sock *icsk = inet_csk(sk);
1843 
1844 	tcp_init_sock(sk);
1845 
1846 	icsk->icsk_af_ops = &ipv4_specific;
1847 
1848 #ifdef CONFIG_TCP_MD5SIG
1849 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1850 #endif
1851 
1852 	return 0;
1853 }
1854 
1855 void tcp_v4_destroy_sock(struct sock *sk)
1856 {
1857 	struct tcp_sock *tp = tcp_sk(sk);
1858 
1859 	tcp_clear_xmit_timers(sk);
1860 
1861 	tcp_cleanup_congestion_control(sk);
1862 
1863 	/* Clean up the write buffer. */
1864 	tcp_write_queue_purge(sk);
1865 
1866 	/* Cleans up our, hopefully empty, out_of_order_queue. */
1867 	skb_rbtree_purge(&tp->out_of_order_queue);
1868 
1869 #ifdef CONFIG_TCP_MD5SIG
1870 	/* Clean up the MD5 key list, if any */
1871 	if (tp->md5sig_info) {
1872 		tcp_clear_md5_list(sk);
1873 		kfree_rcu(tp->md5sig_info, rcu);
1874 		tp->md5sig_info = NULL;
1875 	}
1876 #endif
1877 
1878 	/* Clean prequeue, it must be empty really */
1879 	/* Clean up the prequeue; it should already be empty. */
1880 
1881 	/* Clean up a referenced TCP bind bucket. */
1882 	if (inet_csk(sk)->icsk_bind_hash)
1883 		inet_put_port(sk);
1884 
1885 	BUG_ON(tp->fastopen_rsk);
1886 
1887 	/* If socket is aborted during connect operation */
1888 	/* If the socket was aborted during the connect operation */
1889 	tcp_saved_syn_free(tp);
1890 
1891 	sk_sockets_allocated_dec(sk);
1892 }
1893 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1894 
1895 #ifdef CONFIG_PROC_FS
1896 /* Proc filesystem TCP sock list dumping. */
1897 
1898 /*
1899  * Get the next listening socket following cur.  If cur is NULL, get the
1900  * first socket starting from the bucket given in st->bucket; when
1901  * st->bucket is zero, the very first socket in the hash table is returned.
1902  */
1903 static void *listening_get_next(struct seq_file *seq, void *cur)
1904 {
1905 	struct tcp_iter_state *st = seq->private;
1906 	struct net *net = seq_file_net(seq);
1907 	struct inet_listen_hashbucket *ilb;
1908 	struct sock *sk = cur;
1909 
1910 	if (!sk) {
1911 get_head:
1912 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
1913 		spin_lock(&ilb->lock);
1914 		sk = sk_head(&ilb->head);
1915 		st->offset = 0;
1916 		goto get_sk;
1917 	}
1918 	ilb = &tcp_hashinfo.listening_hash[st->bucket];
1919 	++st->num;
1920 	++st->offset;
1921 
1922 	sk = sk_next(sk);
1923 get_sk:
1924 	sk_for_each_from(sk) {
1925 		if (!net_eq(sock_net(sk), net))
1926 			continue;
1927 		if (sk->sk_family == st->family)
1928 			return sk;
1929 	}
1930 	spin_unlock(&ilb->lock);
1931 	st->offset = 0;
1932 	if (++st->bucket < INET_LHTABLE_SIZE)
1933 		goto get_head;
1934 	return NULL;
1935 }
1936 
1937 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1938 {
1939 	struct tcp_iter_state *st = seq->private;
1940 	void *rc;
1941 
1942 	st->bucket = 0;
1943 	st->offset = 0;
1944 	rc = listening_get_next(seq, NULL);
1945 
1946 	while (rc && *pos) {
1947 		rc = listening_get_next(seq, rc);
1948 		--*pos;
1949 	}
1950 	return rc;
1951 }
1952 
1953 static inline bool empty_bucket(const struct tcp_iter_state *st)
1954 {
1955 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1956 }
1957 
1958 /*
1959  * Get first established socket starting from bucket given in st->bucket.
1960  * If st->bucket is zero, the very first socket in the hash is returned.
1961  */
1962 static void *established_get_first(struct seq_file *seq)
1963 {
1964 	struct tcp_iter_state *st = seq->private;
1965 	struct net *net = seq_file_net(seq);
1966 	void *rc = NULL;
1967 
1968 	st->offset = 0;
1969 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1970 		struct sock *sk;
1971 		struct hlist_nulls_node *node;
1972 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1973 
1974 		/* Lockless fast path for the common case of empty buckets */
1975 		if (empty_bucket(st))
1976 			continue;
1977 
1978 		spin_lock_bh(lock);
1979 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1980 			if (sk->sk_family != st->family ||
1981 			    !net_eq(sock_net(sk), net)) {
1982 				continue;
1983 			}
1984 			rc = sk;
1985 			goto out;
1986 		}
1987 		spin_unlock_bh(lock);
1988 	}
1989 out:
1990 	return rc;
1991 }
1992 
1993 static void *established_get_next(struct seq_file *seq, void *cur)
1994 {
1995 	struct sock *sk = cur;
1996 	struct hlist_nulls_node *node;
1997 	struct tcp_iter_state *st = seq->private;
1998 	struct net *net = seq_file_net(seq);
1999 
2000 	++st->num;
2001 	++st->offset;
2002 
2003 	sk = sk_nulls_next(sk);
2004 
2005 	sk_nulls_for_each_from(sk, node) {
2006 		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2007 			return sk;
2008 	}
2009 
2010 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2011 	++st->bucket;
2012 	return established_get_first(seq);
2013 }
2014 
2015 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2016 {
2017 	struct tcp_iter_state *st = seq->private;
2018 	void *rc;
2019 
2020 	st->bucket = 0;
2021 	rc = established_get_first(seq);
2022 
2023 	while (rc && pos) {
2024 		rc = established_get_next(seq, rc);
2025 		--pos;
2026 	}
2027 	return rc;
2028 }
2029 
2030 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2031 {
2032 	void *rc;
2033 	struct tcp_iter_state *st = seq->private;
2034 
2035 	st->state = TCP_SEQ_STATE_LISTENING;
2036 	rc	  = listening_get_idx(seq, &pos);
2037 
2038 	if (!rc) {
2039 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2040 		rc	  = established_get_idx(seq, pos);
2041 	}
2042 
2043 	return rc;
2044 }
2045 
2046 static void *tcp_seek_last_pos(struct seq_file *seq)
2047 {
2048 	struct tcp_iter_state *st = seq->private;
2049 	int offset = st->offset;
2050 	int orig_num = st->num;
2051 	void *rc = NULL;
2052 
2053 	switch (st->state) {
2054 	case TCP_SEQ_STATE_LISTENING:
2055 		if (st->bucket >= INET_LHTABLE_SIZE)
2056 			break;
2057 		st->state = TCP_SEQ_STATE_LISTENING;
2058 		rc = listening_get_next(seq, NULL);
2059 		while (offset-- && rc)
2060 			rc = listening_get_next(seq, rc);
2061 		if (rc)
2062 			break;
2063 		st->bucket = 0;
2064 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2065 		/* Fallthrough */
2066 	case TCP_SEQ_STATE_ESTABLISHED:
2067 		if (st->bucket > tcp_hashinfo.ehash_mask)
2068 			break;
2069 		rc = established_get_first(seq);
2070 		while (offset-- && rc)
2071 			rc = established_get_next(seq, rc);
2072 	}
2073 
2074 	st->num = orig_num;
2075 
2076 	return rc;
2077 }
2078 
2079 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2080 {
2081 	struct tcp_iter_state *st = seq->private;
2082 	void *rc;
2083 
2084 	if (*pos && *pos == st->last_pos) {
2085 		rc = tcp_seek_last_pos(seq);
2086 		if (rc)
2087 			goto out;
2088 	}
2089 
2090 	st->state = TCP_SEQ_STATE_LISTENING;
2091 	st->num = 0;
2092 	st->bucket = 0;
2093 	st->offset = 0;
2094 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2095 
2096 out:
2097 	st->last_pos = *pos;
2098 	return rc;
2099 }
2100 
2101 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2102 {
2103 	struct tcp_iter_state *st = seq->private;
2104 	void *rc = NULL;
2105 
2106 	if (v == SEQ_START_TOKEN) {
2107 		rc = tcp_get_idx(seq, 0);
2108 		goto out;
2109 	}
2110 
2111 	switch (st->state) {
2112 	case TCP_SEQ_STATE_LISTENING:
2113 		rc = listening_get_next(seq, v);
2114 		if (!rc) {
2115 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2116 			st->bucket = 0;
2117 			st->offset = 0;
2118 			rc	  = established_get_first(seq);
2119 		}
2120 		break;
2121 	case TCP_SEQ_STATE_ESTABLISHED:
2122 		rc = established_get_next(seq, v);
2123 		break;
2124 	}
2125 out:
2126 	++*pos;
2127 	st->last_pos = *pos;
2128 	return rc;
2129 }
2130 
2131 static void tcp_seq_stop(struct seq_file *seq, void *v)
2132 {
2133 	struct tcp_iter_state *st = seq->private;
2134 
2135 	switch (st->state) {
2136 	case TCP_SEQ_STATE_LISTENING:
2137 		if (v != SEQ_START_TOKEN)
2138 			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2139 		break;
2140 	case TCP_SEQ_STATE_ESTABLISHED:
2141 		if (v)
2142 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2143 		break;
2144 	}
2145 }
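
/*
 * Illustrative sketch, an assumption about call order rather than code
 * from this file: one read() of /proc/net/tcp drives the helpers above
 * through the seq_file core roughly as
 *
 *	tcp_seq_start()	-> tcp_seek_last_pos() or tcp_get_idx()
 *	tcp_seq_next()	-> listening_get_next() / established_get_next()
 *	tcp_seq_stop()	-> drops whichever bucket lock is still held
 *
 * The bucket lock taken in listening_get_next()/established_get_first()
 * is deliberately kept across iterations within a bucket and is only
 * released when the walk moves on or in tcp_seq_stop().
 */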
2146 
2147 int tcp_seq_open(struct inode *inode, struct file *file)
2148 {
2149 	struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2150 	struct tcp_iter_state *s;
2151 	int err;
2152 
2153 	err = seq_open_net(inode, file, &afinfo->seq_ops,
2154 			  sizeof(struct tcp_iter_state));
2155 	if (err < 0)
2156 		return err;
2157 
2158 	s = ((struct seq_file *)file->private_data)->private;
2159 	s->family		= afinfo->family;
2160 	s->last_pos		= 0;
2161 	return 0;
2162 }
2163 EXPORT_SYMBOL(tcp_seq_open);
2164 
2165 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2166 {
2167 	int rc = 0;
2168 	struct proc_dir_entry *p;
2169 
2170 	afinfo->seq_ops.start		= tcp_seq_start;
2171 	afinfo->seq_ops.next		= tcp_seq_next;
2172 	afinfo->seq_ops.stop		= tcp_seq_stop;
2173 
2174 	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2175 			     afinfo->seq_fops, afinfo);
2176 	if (!p)
2177 		rc = -ENOMEM;
2178 	return rc;
2179 }
2180 EXPORT_SYMBOL(tcp_proc_register);
2181 
2182 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2183 {
2184 	remove_proc_entry(afinfo->name, net->proc_net);
2185 }
2186 EXPORT_SYMBOL(tcp_proc_unregister);
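
/*
 * Illustrative sketch, with names assumed from the usual pattern for a
 * second address family (compare tcp4_seq_afinfo below): another
 * protocol instance registers its own table through the same helpers,
 * e.g.
 *
 *	static struct tcp_seq_afinfo tcp6_seq_afinfo = {
 *		.name		= "tcp6",
 *		.family		= AF_INET6,
 *		.seq_fops	= &tcp6_afinfo_seq_fops,
 *		.seq_ops	= { .show = tcp6_seq_show },
 *	};
 *
 *	tcp_proc_register(net, &tcp6_seq_afinfo);
 */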
2187 
2188 static void get_openreq4(const struct request_sock *req,
2189 			 struct seq_file *f, int i)
2190 {
2191 	const struct inet_request_sock *ireq = inet_rsk(req);
2192 	long delta = req->rsk_timer.expires - jiffies;
2193 
2194 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2195 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2196 		i,
2197 		ireq->ir_loc_addr,
2198 		ireq->ir_num,
2199 		ireq->ir_rmt_addr,
2200 		ntohs(ireq->ir_rmt_port),
2201 		TCP_SYN_RECV,
2202 		0, 0, /* could print option size, but that is af dependent. */
2203 		1,    /* timers active (only the expire timer) */
2204 		jiffies_delta_to_clock_t(delta),
2205 		req->num_timeout,
2206 		from_kuid_munged(seq_user_ns(f),
2207 				 sock_i_uid(req->rsk_listener)),
2208 		0,  /* non standard timer */
2209 		0, /* open_requests have no inode */
2210 		0,
2211 		req);
2212 }
2213 
2214 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2215 {
2216 	int timer_active;
2217 	unsigned long timer_expires;
2218 	const struct tcp_sock *tp = tcp_sk(sk);
2219 	const struct inet_connection_sock *icsk = inet_csk(sk);
2220 	const struct inet_sock *inet = inet_sk(sk);
2221 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2222 	__be32 dest = inet->inet_daddr;
2223 	__be32 src = inet->inet_rcv_saddr;
2224 	__u16 destp = ntohs(inet->inet_dport);
2225 	__u16 srcp = ntohs(inet->inet_sport);
2226 	int rx_queue;
2227 	int state;
2228 
2229 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2230 	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2231 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2232 		timer_active	= 1;
2233 		timer_expires	= icsk->icsk_timeout;
2234 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2235 		timer_active	= 4;
2236 		timer_expires	= icsk->icsk_timeout;
2237 	} else if (timer_pending(&sk->sk_timer)) {
2238 		timer_active	= 2;
2239 		timer_expires	= sk->sk_timer.expires;
2240 	} else {
2241 		timer_active	= 0;
2242 		timer_expires = jiffies;
2243 	}
2244 
2245 	state = sk_state_load(sk);
2246 	if (state == TCP_LISTEN)
2247 		rx_queue = sk->sk_ack_backlog;
2248 	else
2249 		/* Because we don't lock the socket,
2250 		 * we might find a transient negative value.
2251 		 */
2252 		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2253 
2254 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2255 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2256 		i, src, srcp, dest, destp, state,
2257 		tp->write_seq - tp->snd_una,
2258 		rx_queue,
2259 		timer_active,
2260 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2261 		icsk->icsk_retransmits,
2262 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2263 		icsk->icsk_probes_out,
2264 		sock_i_ino(sk),
2265 		atomic_read(&sk->sk_refcnt), sk,
2266 		jiffies_to_clock_t(icsk->icsk_rto),
2267 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2268 		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2269 		tp->snd_cwnd,
2270 		state == TCP_LISTEN ?
2271 		    fastopenq->max_qlen :
2272 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2273 }
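
/*
 * Illustrative example, with made-up values: given the format above, an
 * established connection 192.168.1.10:22 <-> 192.168.1.20:51000 shows up
 * in /proc/net/tcp roughly as
 *
 *	0: 0A01A8C0:0016 1401A8C0:C738 01 00000000:00000000 00:00000000 ...
 *
 * Addresses and ports are hexadecimal; each address is the raw __be32
 * printed with %08X, so it reads byte-swapped on little-endian hosts.
 * "01" is the socket state (TCP_ESTABLISHED); the remaining columns are
 * the tx/rx queue sizes, timer information, uid, inode and the internal
 * socket state printed above.
 */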
2274 
2275 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2276 			       struct seq_file *f, int i)
2277 {
2278 	long delta = tw->tw_timer.expires - jiffies;
2279 	__be32 dest, src;
2280 	__u16 destp, srcp;
2281 
2282 	dest  = tw->tw_daddr;
2283 	src   = tw->tw_rcv_saddr;
2284 	destp = ntohs(tw->tw_dport);
2285 	srcp  = ntohs(tw->tw_sport);
2286 
2287 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2288 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2289 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2290 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2291 		atomic_read(&tw->tw_refcnt), tw);
2292 }
2293 
2294 #define TMPSZ 150
2295 
2296 static int tcp4_seq_show(struct seq_file *seq, void *v)
2297 {
2298 	struct tcp_iter_state *st;
2299 	struct sock *sk = v;
2300 
2301 	seq_setwidth(seq, TMPSZ - 1);
2302 	if (v == SEQ_START_TOKEN) {
2303 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2304 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2305 			   "inode");
2306 		goto out;
2307 	}
2308 	st = seq->private;
2309 
2310 	if (sk->sk_state == TCP_TIME_WAIT)
2311 		get_timewait4_sock(v, seq, st->num);
2312 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2313 		get_openreq4(v, seq, st->num);
2314 	else
2315 		get_tcp4_sock(v, seq, st->num);
2316 out:
2317 	seq_pad(seq, '\n');
2318 	return 0;
2319 }
2320 
2321 static const struct file_operations tcp_afinfo_seq_fops = {
2322 	.owner   = THIS_MODULE,
2323 	.open    = tcp_seq_open,
2324 	.read    = seq_read,
2325 	.llseek  = seq_lseek,
2326 	.release = seq_release_net
2327 };
2328 
2329 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2330 	.name		= "tcp",
2331 	.family		= AF_INET,
2332 	.seq_fops	= &tcp_afinfo_seq_fops,
2333 	.seq_ops	= {
2334 		.show		= tcp4_seq_show,
2335 	},
2336 };
2337 
2338 static int __net_init tcp4_proc_init_net(struct net *net)
2339 {
2340 	return tcp_proc_register(net, &tcp4_seq_afinfo);
2341 }
2342 
2343 static void __net_exit tcp4_proc_exit_net(struct net *net)
2344 {
2345 	tcp_proc_unregister(net, &tcp4_seq_afinfo);
2346 }
2347 
2348 static struct pernet_operations tcp4_net_ops = {
2349 	.init = tcp4_proc_init_net,
2350 	.exit = tcp4_proc_exit_net,
2351 };
2352 
2353 int __init tcp4_proc_init(void)
2354 {
2355 	return register_pernet_subsys(&tcp4_net_ops);
2356 }
2357 
2358 void tcp4_proc_exit(void)
2359 {
2360 	unregister_pernet_subsys(&tcp4_net_ops);
2361 }
2362 #endif /* CONFIG_PROC_FS */
2363 
2364 struct proto tcp_prot = {
2365 	.name			= "TCP",
2366 	.owner			= THIS_MODULE,
2367 	.close			= tcp_close,
2368 	.connect		= tcp_v4_connect,
2369 	.disconnect		= tcp_disconnect,
2370 	.accept			= inet_csk_accept,
2371 	.ioctl			= tcp_ioctl,
2372 	.init			= tcp_v4_init_sock,
2373 	.destroy		= tcp_v4_destroy_sock,
2374 	.shutdown		= tcp_shutdown,
2375 	.setsockopt		= tcp_setsockopt,
2376 	.getsockopt		= tcp_getsockopt,
2377 	.keepalive		= tcp_set_keepalive,
2378 	.recvmsg		= tcp_recvmsg,
2379 	.sendmsg		= tcp_sendmsg,
2380 	.sendpage		= tcp_sendpage,
2381 	.backlog_rcv		= tcp_v4_do_rcv,
2382 	.release_cb		= tcp_release_cb,
2383 	.hash			= inet_hash,
2384 	.unhash			= inet_unhash,
2385 	.get_port		= inet_csk_get_port,
2386 	.enter_memory_pressure	= tcp_enter_memory_pressure,
2387 	.stream_memory_free	= tcp_stream_memory_free,
2388 	.sockets_allocated	= &tcp_sockets_allocated,
2389 	.orphan_count		= &tcp_orphan_count,
2390 	.memory_allocated	= &tcp_memory_allocated,
2391 	.memory_pressure	= &tcp_memory_pressure,
2392 	.sysctl_mem		= sysctl_tcp_mem,
2393 	.sysctl_wmem		= sysctl_tcp_wmem,
2394 	.sysctl_rmem		= sysctl_tcp_rmem,
2395 	.max_header		= MAX_TCP_HEADER,
2396 	.obj_size		= sizeof(struct tcp_sock),
2397 	.slab_flags		= SLAB_DESTROY_BY_RCU,
2398 	.twsk_prot		= &tcp_timewait_sock_ops,
2399 	.rsk_prot		= &tcp_request_sock_ops,
2400 	.h.hashinfo		= &tcp_hashinfo,
2401 	.no_autobind		= true,
2402 #ifdef CONFIG_COMPAT
2403 	.compat_setsockopt	= compat_tcp_setsockopt,
2404 	.compat_getsockopt	= compat_tcp_getsockopt,
2405 #endif
2406 	.diag_destroy		= tcp_abort,
2407 };
2408 EXPORT_SYMBOL(tcp_prot);
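
/*
 * Illustrative sketch, an assumption about the wiring done elsewhere
 * (af_inet.c): a SOCK_STREAM/IPPROTO_TCP socket is bound to tcp_prot
 * through an inet_protosw entry roughly of the form
 *
 *	{
 *		.type     = SOCK_STREAM,
 *		.protocol = IPPROTO_TCP,
 *		.prot     = &tcp_prot,
 *		.ops      = &inet_stream_ops,
 *		.flags    = INET_PROTOSW_PERMANENT | INET_PROTOSW_ICSK,
 *	}
 *
 * so that socket(AF_INET, SOCK_STREAM, 0) ends up dispatching into the
 * handlers listed in this table.
 */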
2409 
2410 static void __net_exit tcp_sk_exit(struct net *net)
2411 {
2412 	int cpu;
2413 
2414 	for_each_possible_cpu(cpu)
2415 		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2416 	free_percpu(net->ipv4.tcp_sk);
2417 }
2418 
2419 static int __net_init tcp_sk_init(struct net *net)
2420 {
2421 	int res, cpu, cnt;
2422 
2423 	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2424 	if (!net->ipv4.tcp_sk)
2425 		return -ENOMEM;
2426 
2427 	for_each_possible_cpu(cpu) {
2428 		struct sock *sk;
2429 
2430 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2431 					   IPPROTO_TCP, net);
2432 		if (res)
2433 			goto fail;
2434 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2435 		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2436 	}
2437 
2438 	net->ipv4.sysctl_tcp_ecn = 2;
2439 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
2440 
2441 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2442 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2443 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2444 
2445 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2446 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2447 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2448 
2449 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2450 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2451 	net->ipv4.sysctl_tcp_syncookies = 1;
2452 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2453 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2454 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2455 	net->ipv4.sysctl_tcp_orphan_retries = 0;
2456 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2457 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2458 	net->ipv4.sysctl_tcp_tw_reuse = 0;
2459 
2460 	cnt = tcp_hashinfo.ehash_mask + 1;
2461 	net->ipv4.tcp_death_row.sysctl_tw_recycle = 0;
2462 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
2463 	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2464 
2465 	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
2466 
2467 	return 0;
2468 fail:
2469 	tcp_sk_exit(net);
2470 
2471 	return res;
2472 }
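
/*
 * Illustrative sketch, an assumption about consumers rather than code
 * from this function: the per-netns defaults initialised above are read
 * back through the socket's namespace, e.g. when deciding how many SYNs
 * to retransmit:
 *
 *	struct net *net = sock_net(sk);
 *	int max_retries = inet_csk(sk)->icsk_syn_retries ? :
 *			  net->ipv4.sysctl_tcp_syn_retries;
 *
 * Each value can also be overridden per network namespace through the
 * corresponding /proc/sys/net/ipv4/ entry (tcp_syn_retries and friends).
 */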
2473 
2474 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2475 {
2476 	inet_twsk_purge(&tcp_hashinfo, AF_INET);
2477 }
2478 
2479 static struct pernet_operations __net_initdata tcp_sk_ops = {
2480        .init	   = tcp_sk_init,
2481        .exit	   = tcp_sk_exit,
2482        .exit_batch = tcp_sk_exit_batch,
2483 };
2484 
2485 void __init tcp_v4_init(void)
2486 {
2487 	if (register_pernet_subsys(&tcp_sk_ops))
2488 		panic("Failed to create the TCP control socket.\n");
2489 }
2490