xref: /linux/net/ipv4/tcp_ipv4.c (revision 3b812ecce736432e6b55e77028ea387eb1517d24)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Implementation of the Transmission Control Protocol(TCP).
7  *
8  *		IPv4 specific functions
9  *
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  *
18  *	This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23 
24 /*
25  * Changes:
26  *		David S. Miller	:	New socket lookup architecture.
27  *					This code is dedicated to John Dyson.
28  *		David S. Miller :	Change semantics of established hash,
29  *					half is devoted to TIME_WAIT sockets
30  *					and the rest go in the other half.
31  *		Andi Kleen :		Add support for syncookies and fixed
32  *					some bugs: ip options weren't passed to
33  *					the TCP layer, missed a check for an
34  *					ACK bit.
35  *		Andi Kleen :		Implemented fast path mtu discovery.
36  *	     				Fixed many serious bugs in the
37  *					request_sock handling and moved
38  *					most of it into the af independent code.
39  *					Added tail drop and some other bugfixes.
40  *					Added new listen semantics.
41  *		Mike McLagan	:	Routing by source
42  *	Juan Jose Ciarlante:		ip_dynaddr bits
43  *		Andi Kleen:		various fixes.
44  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
45  *					coma.
46  *	Andi Kleen		:	Fix new listen.
47  *	Andi Kleen		:	Fix accept error reporting.
48  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
49  *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
50  *					a single port at the same time.
51  */
52 
53 #define pr_fmt(fmt) "TCP: " fmt
54 
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65 
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/secure_seq.h>
76 #include <net/busy_poll.h>
77 
78 #include <linux/inet.h>
79 #include <linux/ipv6.h>
80 #include <linux/stddef.h>
81 #include <linux/proc_fs.h>
82 #include <linux/seq_file.h>
83 
84 #include <crypto/hash.h>
85 #include <linux/scatterlist.h>
86 
87 int sysctl_tcp_tw_reuse __read_mostly;
88 int sysctl_tcp_low_latency __read_mostly;
89 EXPORT_SYMBOL(sysctl_tcp_low_latency);
90 
91 #ifdef CONFIG_TCP_MD5SIG
92 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
93 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
94 #endif
95 
96 struct inet_hashinfo tcp_hashinfo;
97 EXPORT_SYMBOL(tcp_hashinfo);
98 
99 static  __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
100 {
101 	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
102 					  ip_hdr(skb)->saddr,
103 					  tcp_hdr(skb)->dest,
104 					  tcp_hdr(skb)->source);
105 }
106 
107 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
108 {
109 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
110 	struct tcp_sock *tp = tcp_sk(sk);
111 
112 	/* With PAWS, it is safe from the viewpoint
113 	   of data integrity. Even without PAWS it is safe provided sequence
114 	   spaces do not overlap, i.e. at data rates <= 80Mbit/sec.
115 
116 	   Actually, the idea is close to VJ's: the timestamp cache is held
117 	   not per host but per port pair, and the TW bucket is used as the
118 	   state holder.
119 
120 	   If the TW bucket has already been destroyed, we fall back to VJ's
121 	   scheme and use the initial timestamp retrieved from the peer table.
122 	 */
123 	if (tcptw->tw_ts_recent_stamp &&
124 	    (!twp || (sysctl_tcp_tw_reuse &&
125 			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
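		/* Step the new connection's sequence space well past anything
		 * the old incarnation could have sent.
		 */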
126 		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
127 		if (tp->write_seq == 0)
128 			tp->write_seq = 1;
129 		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
130 		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
131 		sock_hold(sktw);
132 		return 1;
133 	}
134 
135 	return 0;
136 }
137 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
138 
139 /* This will initiate an outgoing connection. */
140 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
141 {
142 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
143 	struct inet_sock *inet = inet_sk(sk);
144 	struct tcp_sock *tp = tcp_sk(sk);
145 	__be16 orig_sport, orig_dport;
146 	__be32 daddr, nexthop;
147 	struct flowi4 *fl4;
148 	struct rtable *rt;
149 	int err;
150 	struct ip_options_rcu *inet_opt;
151 
152 	if (addr_len < sizeof(struct sockaddr_in))
153 		return -EINVAL;
154 
155 	if (usin->sin_family != AF_INET)
156 		return -EAFNOSUPPORT;
157 
158 	nexthop = daddr = usin->sin_addr.s_addr;
159 	inet_opt = rcu_dereference_protected(inet->inet_opt,
160 					     sock_owned_by_user(sk));
161 	if (inet_opt && inet_opt->opt.srr) {
162 		if (!daddr)
163 			return -EINVAL;
164 		nexthop = inet_opt->opt.faddr;
165 	}
166 
167 	orig_sport = inet->inet_sport;
168 	orig_dport = usin->sin_port;
169 	fl4 = &inet->cork.fl.u.ip4;
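	/* Resolve a route to the next hop (which differs from the final
	 * destination when IP source routing is in use); this also selects
	 * our source address if the socket is not yet bound.
	 */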
170 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
171 			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
172 			      IPPROTO_TCP,
173 			      orig_sport, orig_dport, sk);
174 	if (IS_ERR(rt)) {
175 		err = PTR_ERR(rt);
176 		if (err == -ENETUNREACH)
177 			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
178 		return err;
179 	}
180 
181 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
182 		ip_rt_put(rt);
183 		return -ENETUNREACH;
184 	}
185 
186 	if (!inet_opt || !inet_opt->opt.srr)
187 		daddr = fl4->daddr;
188 
189 	if (!inet->inet_saddr)
190 		inet->inet_saddr = fl4->saddr;
191 	sk_rcv_saddr_set(sk, inet->inet_saddr);
192 
193 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
194 		/* Reset inherited state */
195 		tp->rx_opt.ts_recent	   = 0;
196 		tp->rx_opt.ts_recent_stamp = 0;
197 		if (likely(!tp->repair))
198 			tp->write_seq	   = 0;
199 	}
200 
201 	if (tcp_death_row.sysctl_tw_recycle &&
202 	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
203 		tcp_fetch_timewait_stamp(sk, &rt->dst);
204 
205 	inet->inet_dport = usin->sin_port;
206 	sk_daddr_set(sk, daddr);
207 
208 	inet_csk(sk)->icsk_ext_hdr_len = 0;
209 	if (inet_opt)
210 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
211 
212 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
213 
214 	/* Socket identity is still unknown (sport may be zero).
215 	 * However we set the state to SYN-SENT and, without releasing the
216 	 * socket lock, select a source port, enter ourselves into the hash
217 	 * tables and complete initialization after this.
218 	 */
219 	tcp_set_state(sk, TCP_SYN_SENT);
220 	err = inet_hash_connect(&tcp_death_row, sk);
221 	if (err)
222 		goto failure;
223 
224 	sk_set_txhash(sk);
225 
226 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
227 			       inet->inet_sport, inet->inet_dport, sk);
228 	if (IS_ERR(rt)) {
229 		err = PTR_ERR(rt);
230 		rt = NULL;
231 		goto failure;
232 	}
233 	/* OK, now commit destination to socket.  */
234 	sk->sk_gso_type = SKB_GSO_TCPV4;
235 	sk_setup_caps(sk, &rt->dst);
236 
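	/* Pick the initial send sequence from the connection 4-tuple unless
	 * repair mode has already preloaded one.
	 */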
237 	if (!tp->write_seq && likely(!tp->repair))
238 		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
239 							   inet->inet_daddr,
240 							   inet->inet_sport,
241 							   usin->sin_port);
242 
243 	inet->inet_id = tp->write_seq ^ jiffies;
244 
245 	err = tcp_connect(sk);
246 
247 	rt = NULL;
248 	if (err)
249 		goto failure;
250 
251 	return 0;
252 
253 failure:
254 	/*
255 	 * This unhashes the socket and releases the local port,
256 	 * if necessary.
257 	 */
258 	tcp_set_state(sk, TCP_CLOSE);
259 	ip_rt_put(rt);
260 	sk->sk_route_caps = 0;
261 	inet->inet_dport = 0;
262 	return err;
263 }
264 EXPORT_SYMBOL(tcp_v4_connect);
265 
266 /*
267  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
268  * It can be called through tcp_release_cb() if socket was owned by user
269  * at the time tcp_v4_err() was called to handle ICMP message.
270  */
271 void tcp_v4_mtu_reduced(struct sock *sk)
272 {
273 	struct dst_entry *dst;
274 	struct inet_sock *inet = inet_sk(sk);
275 	u32 mtu = tcp_sk(sk)->mtu_info;
276 
277 	dst = inet_csk_update_pmtu(sk, mtu);
278 	if (!dst)
279 		return;
280 
281 	/* Something is about to go wrong... Remember the soft error
282 	 * in case this connection is not able to recover.
283 	 */
284 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
285 		sk->sk_err_soft = EMSGSIZE;
286 
287 	mtu = dst_mtu(dst);
288 
289 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
290 	    ip_sk_accept_pmtu(sk) &&
291 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
292 		tcp_sync_mss(sk, mtu);
293 
294 		/* Resend the TCP packet because it's
295 		 * clear that the old packet has been
296 		 * dropped. This is the new "fast" path mtu
297 		 * discovery.
298 		 */
299 		tcp_simple_retransmit(sk);
300 	} /* else let the usual retransmit timer handle it */
301 }
302 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
303 
304 static void do_redirect(struct sk_buff *skb, struct sock *sk)
305 {
306 	struct dst_entry *dst = __sk_dst_check(sk, 0);
307 
308 	if (dst)
309 		dst->ops->redirect(dst, sk, skb);
310 }
311 
312 
313 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
314 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
315 {
316 	struct request_sock *req = inet_reqsk(sk);
317 	struct net *net = sock_net(sk);
318 
319 	/* ICMPs are not backlogged, hence we cannot get
320 	 * an established socket here.
321 	 */
322 	WARN_ON(req->sk);
323 
324 	if (seq != tcp_rsk(req)->snt_isn) {
325 		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
326 	} else if (abort) {
327 		/*
328 		 * Still in SYN_RECV, just remove it silently.
329 		 * There is no good way to pass the error to the newly
330 		 * created socket, and POSIX does not want network
331 		 * errors returned from accept().
332 		 */
333 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
334 		NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS);
335 	}
336 	reqsk_put(req);
337 }
338 EXPORT_SYMBOL(tcp_req_err);
339 
340 /*
341  * This routine is called by the ICMP module when it gets some
342  * sort of error condition.  If err < 0 then the socket should
343  * be closed and the error returned to the user.  If err > 0
344  * it's just the icmp type << 8 | icmp code.  After adjustment,
345  * header points to the first 8 bytes of the tcp header.  We need
346  * to find the appropriate port.
347  *
348  * The locking strategy used here is very "optimistic". When
349  * someone else accesses the socket the ICMP is just dropped
350  * and for some paths there is no check at all.
351  * A more general error queue to queue errors for later handling
352  * is probably better.
353  *
354  */
355 
356 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
357 {
358 	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
359 	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
360 	struct inet_connection_sock *icsk;
361 	struct tcp_sock *tp;
362 	struct inet_sock *inet;
363 	const int type = icmp_hdr(icmp_skb)->type;
364 	const int code = icmp_hdr(icmp_skb)->code;
365 	struct sock *sk;
366 	struct sk_buff *skb;
367 	struct request_sock *fastopen;
368 	__u32 seq, snd_una;
369 	__u32 remaining;
370 	int err;
371 	struct net *net = dev_net(icmp_skb->dev);
372 
373 	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
374 				       th->dest, iph->saddr, ntohs(th->source),
375 				       inet_iif(icmp_skb));
376 	if (!sk) {
377 		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
378 		return;
379 	}
380 	if (sk->sk_state == TCP_TIME_WAIT) {
381 		inet_twsk_put(inet_twsk(sk));
382 		return;
383 	}
384 	seq = ntohl(th->seq);
385 	if (sk->sk_state == TCP_NEW_SYN_RECV)
386 		return tcp_req_err(sk, seq,
387 				  type == ICMP_PARAMETERPROB ||
388 				  type == ICMP_TIME_EXCEEDED ||
389 				  (type == ICMP_DEST_UNREACH &&
390 				   (code == ICMP_NET_UNREACH ||
391 				    code == ICMP_HOST_UNREACH)));
392 
393 	bh_lock_sock(sk);
394 	/* If too many ICMPs get dropped on busy
395 	 * servers this needs to be solved differently.
396 	 * We do take care of the PMTU discovery (RFC1191) special case:
397 	 * we can receive locally generated ICMP messages while the socket is held.
398 	 */
399 	if (sock_owned_by_user(sk)) {
400 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
401 			NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
402 	}
403 	if (sk->sk_state == TCP_CLOSE)
404 		goto out;
405 
406 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
407 		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
408 		goto out;
409 	}
410 
411 	icsk = inet_csk(sk);
412 	tp = tcp_sk(sk);
413 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
414 	fastopen = tp->fastopen_rsk;
415 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
416 	if (sk->sk_state != TCP_LISTEN &&
417 	    !between(seq, snd_una, tp->snd_nxt)) {
418 		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
419 		goto out;
420 	}
421 
422 	switch (type) {
423 	case ICMP_REDIRECT:
424 		do_redirect(icmp_skb, sk);
425 		goto out;
426 	case ICMP_SOURCE_QUENCH:
427 		/* Just silently ignore these. */
428 		goto out;
429 	case ICMP_PARAMETERPROB:
430 		err = EPROTO;
431 		break;
432 	case ICMP_DEST_UNREACH:
433 		if (code > NR_ICMP_UNREACH)
434 			goto out;
435 
436 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
437 			/* We are not interested in TCP_LISTEN and open_requests
438 			 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
439 			 * they should go through unfragmented).
440 			 */
441 			if (sk->sk_state == TCP_LISTEN)
442 				goto out;
443 
444 			tp->mtu_info = info;
445 			if (!sock_owned_by_user(sk)) {
446 				tcp_v4_mtu_reduced(sk);
447 			} else {
448 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
449 					sock_hold(sk);
450 			}
451 			goto out;
452 		}
453 
454 		err = icmp_err_convert[code].errno;
455 		/* check if icmp_skb allows revert of backoff
456 		 * (see draft-zimmermann-tcp-lcd) */
457 		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
458 			break;
459 		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
460 		    !icsk->icsk_backoff || fastopen)
461 			break;
462 
463 		if (sock_owned_by_user(sk))
464 			break;
465 
466 		icsk->icsk_backoff--;
467 		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
468 					       TCP_TIMEOUT_INIT;
469 		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
470 
471 		skb = tcp_write_queue_head(sk);
472 		BUG_ON(!skb);
473 
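		/* Re-arm the retransmit timer with whatever is left of the
		 * reduced RTO; if it has already elapsed, retransmit at once.
		 */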
474 		remaining = icsk->icsk_rto -
475 			    min(icsk->icsk_rto,
476 				tcp_time_stamp - tcp_skb_timestamp(skb));
477 
478 		if (remaining) {
479 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
480 						  remaining, TCP_RTO_MAX);
481 		} else {
482 			/* The reverted RTO has already expired, so
483 			 * retransmit now. */
484 			tcp_retransmit_timer(sk);
485 		}
486 
487 		break;
488 	case ICMP_TIME_EXCEEDED:
489 		err = EHOSTUNREACH;
490 		break;
491 	default:
492 		goto out;
493 	}
494 
495 	switch (sk->sk_state) {
496 	case TCP_SYN_SENT:
497 	case TCP_SYN_RECV:
498 		/* Only in fast or simultaneous open. If a fast open socket
499 		 * is already accepted it is treated as a connected one below.
500 		 */
501 		if (fastopen && !fastopen->sk)
502 			break;
503 
504 		if (!sock_owned_by_user(sk)) {
505 			sk->sk_err = err;
506 
507 			sk->sk_error_report(sk);
508 
509 			tcp_done(sk);
510 		} else {
511 			sk->sk_err_soft = err;
512 		}
513 		goto out;
514 	}
515 
516 	/* If we've already connected we will keep trying
517 	 * until we time out, or the user gives up.
518 	 *
519 	 * rfc1122 4.2.3.9 allows us to consider as hard errors
520 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
521 	 * but it is obsoleted by pmtu discovery).
522 	 *
523 	 * Note that in the modern internet, where routing is unreliable and
524 	 * broken firewalls sit in every dark corner sending random errors
525 	 * ordered by their masters, even these two messages finally lose
526 	 * their original sense (even Linux sends invalid PORT_UNREACHs).
527 	 *
528 	 * Now we are in compliance with RFCs.
529 	 *							--ANK (980905)
530 	 */
531 
532 	inet = inet_sk(sk);
533 	if (!sock_owned_by_user(sk) && inet->recverr) {
534 		sk->sk_err = err;
535 		sk->sk_error_report(sk);
536 	} else	{ /* Only an error on timeout */
537 		sk->sk_err_soft = err;
538 	}
539 
540 out:
541 	bh_unlock_sock(sk);
542 	sock_put(sk);
543 }
544 
545 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
546 {
547 	struct tcphdr *th = tcp_hdr(skb);
548 
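	/* With checksum offload we only seed the pseudo-header checksum
	 * and let the device finish the job; otherwise compute the whole
	 * checksum in software here.
	 */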
549 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
550 		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
551 		skb->csum_start = skb_transport_header(skb) - skb->head;
552 		skb->csum_offset = offsetof(struct tcphdr, check);
553 	} else {
554 		th->check = tcp_v4_check(skb->len, saddr, daddr,
555 					 csum_partial(th,
556 						      th->doff << 2,
557 						      skb->csum));
558 	}
559 }
560 
561 /* This routine computes an IPv4 TCP checksum. */
562 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
563 {
564 	const struct inet_sock *inet = inet_sk(sk);
565 
566 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
567 }
568 EXPORT_SYMBOL(tcp_v4_send_check);
569 
570 /*
571  *	This routine will send an RST to the other tcp.
572  *
573  *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
574  *		      for reset.
575  *	Answer: if a packet caused RST, it is not for a socket
576  *		existing in our system; if it is matched to a socket,
577  *		it is just a duplicate segment or a bug in the other
578  *		side's TCP. So we build the reply based only on the
579  *		parameters that arrived with the segment.
580  *	Exception: precedence violation. We do not implement it in any case.
581  */
582 
583 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
584 {
585 	const struct tcphdr *th = tcp_hdr(skb);
586 	struct {
587 		struct tcphdr th;
588 #ifdef CONFIG_TCP_MD5SIG
589 		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
590 #endif
591 	} rep;
592 	struct ip_reply_arg arg;
593 #ifdef CONFIG_TCP_MD5SIG
594 	struct tcp_md5sig_key *key = NULL;
595 	const __u8 *hash_location = NULL;
596 	unsigned char newhash[16];
597 	int genhash;
598 	struct sock *sk1 = NULL;
599 #endif
600 	struct net *net;
601 
602 	/* Never send a reset in response to a reset. */
603 	if (th->rst)
604 		return;
605 
606 	/* If sk is not NULL, it means we did a successful lookup and the
607 	 * incoming route had to be correct. prequeue might have dropped our dst.
608 	 */
609 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
610 		return;
611 
612 	/* Swap the send and the receive. */
613 	memset(&rep, 0, sizeof(rep));
614 	rep.th.dest   = th->source;
615 	rep.th.source = th->dest;
616 	rep.th.doff   = sizeof(struct tcphdr) / 4;
617 	rep.th.rst    = 1;
618 
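	/* Per RFC 793: if the offending segment carried an ACK, the RST
	 * takes its sequence number from that ACK; otherwise the RST
	 * itself ACKs everything the segment occupied in sequence space.
	 */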
619 	if (th->ack) {
620 		rep.th.seq = th->ack_seq;
621 	} else {
622 		rep.th.ack = 1;
623 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
624 				       skb->len - (th->doff << 2));
625 	}
626 
627 	memset(&arg, 0, sizeof(arg));
628 	arg.iov[0].iov_base = (unsigned char *)&rep;
629 	arg.iov[0].iov_len  = sizeof(rep.th);
630 
631 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
632 #ifdef CONFIG_TCP_MD5SIG
633 	hash_location = tcp_parse_md5sig_option(th);
634 	if (sk && sk_fullsock(sk)) {
635 		key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
636 					&ip_hdr(skb)->saddr, AF_INET);
637 	} else if (hash_location) {
638 		/*
639 		 * The active side is lost. Try to find the listening socket
640 		 * through the source port, and then find the md5 key through
641 		 * the listening socket. We do not lose security here:
642 		 * the incoming packet is checked against the md5 hash of the
643 		 * key we found; no RST is generated if the hash doesn't match.
644 		 */
645 		sk1 = __inet_lookup_listener(net,
646 					     &tcp_hashinfo, ip_hdr(skb)->saddr,
647 					     th->source, ip_hdr(skb)->daddr,
648 					     ntohs(th->source), inet_iif(skb));
649 		/* don't send an rst if we can't find the key */
650 		if (!sk1)
651 			return;
652 		rcu_read_lock();
653 		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
654 					&ip_hdr(skb)->saddr, AF_INET);
655 		if (!key)
656 			goto release_sk1;
657 
658 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
659 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
660 			goto release_sk1;
661 	}
662 
663 	if (key) {
664 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
665 				   (TCPOPT_NOP << 16) |
666 				   (TCPOPT_MD5SIG << 8) |
667 				   TCPOLEN_MD5SIG);
668 		/* Update length and the length the header thinks exists */
669 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
670 		rep.th.doff = arg.iov[0].iov_len / 4;
671 
672 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
673 				     key, ip_hdr(skb)->saddr,
674 				     ip_hdr(skb)->daddr, &rep.th);
675 	}
676 #endif
677 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
678 				      ip_hdr(skb)->saddr, /* XXX */
679 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
680 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
681 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
682 
683 	/* When the socket is gone, all binding information is lost and
684 	 * routing might fail. No choice here: if we force the input
685 	 * interface, we will misroute in the case of an asymmetric route.
686 	 */
687 	if (sk)
688 		arg.bound_dev_if = sk->sk_bound_dev_if;
689 
690 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
691 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
692 
693 	arg.tos = ip_hdr(skb)->tos;
694 	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
695 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
696 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
697 			      &arg, arg.iov[0].iov_len);
698 
699 	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
700 	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
701 
702 #ifdef CONFIG_TCP_MD5SIG
703 release_sk1:
704 	if (sk1) {
705 		rcu_read_unlock();
706 		sock_put(sk1);
707 	}
708 #endif
709 }
710 
711 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
712    outside socket context, is certainly ugly. What can I do?
713  */
714 
715 static void tcp_v4_send_ack(struct net *net,
716 			    struct sk_buff *skb, u32 seq, u32 ack,
717 			    u32 win, u32 tsval, u32 tsecr, int oif,
718 			    struct tcp_md5sig_key *key,
719 			    int reply_flags, u8 tos)
720 {
721 	const struct tcphdr *th = tcp_hdr(skb);
722 	struct {
723 		struct tcphdr th;
724 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
725 #ifdef CONFIG_TCP_MD5SIG
726 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
727 #endif
728 			];
729 	} rep;
730 	struct ip_reply_arg arg;
731 
732 	memset(&rep.th, 0, sizeof(struct tcphdr));
733 	memset(&arg, 0, sizeof(arg));
734 
735 	arg.iov[0].iov_base = (unsigned char *)&rep;
736 	arg.iov[0].iov_len  = sizeof(rep.th);
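	/* Encode a TCP timestamp option, padded with NOPs to 12 bytes:
	 * NOP, NOP, kind (8), length (10), TSval, TSecr.
	 */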
737 	if (tsecr) {
738 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
739 				   (TCPOPT_TIMESTAMP << 8) |
740 				   TCPOLEN_TIMESTAMP);
741 		rep.opt[1] = htonl(tsval);
742 		rep.opt[2] = htonl(tsecr);
743 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
744 	}
745 
746 	/* Swap the send and the receive. */
747 	rep.th.dest    = th->source;
748 	rep.th.source  = th->dest;
749 	rep.th.doff    = arg.iov[0].iov_len / 4;
750 	rep.th.seq     = htonl(seq);
751 	rep.th.ack_seq = htonl(ack);
752 	rep.th.ack     = 1;
753 	rep.th.window  = htons(win);
754 
755 #ifdef CONFIG_TCP_MD5SIG
756 	if (key) {
757 		int offset = (tsecr) ? 3 : 0;
758 
759 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
760 					  (TCPOPT_NOP << 16) |
761 					  (TCPOPT_MD5SIG << 8) |
762 					  TCPOLEN_MD5SIG);
763 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
764 		rep.th.doff = arg.iov[0].iov_len/4;
765 
766 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
767 				    key, ip_hdr(skb)->saddr,
768 				    ip_hdr(skb)->daddr, &rep.th);
769 	}
770 #endif
771 	arg.flags = reply_flags;
772 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
773 				      ip_hdr(skb)->saddr, /* XXX */
774 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
775 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
776 	if (oif)
777 		arg.bound_dev_if = oif;
778 	arg.tos = tos;
779 	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
780 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
781 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
782 			      &arg, arg.iov[0].iov_len);
783 
784 	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
785 }
786 
787 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
788 {
789 	struct inet_timewait_sock *tw = inet_twsk(sk);
790 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
791 
792 	tcp_v4_send_ack(sock_net(sk), skb,
793 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
794 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
795 			tcp_time_stamp + tcptw->tw_ts_offset,
796 			tcptw->tw_ts_recent,
797 			tw->tw_bound_dev_if,
798 			tcp_twsk_md5_key(tcptw),
799 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
800 			tw->tw_tos
801 			);
802 
803 	inet_twsk_put(tw);
804 }
805 
806 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
807 				  struct request_sock *req)
808 {
809 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
810 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
811 	 */
812 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
813 					     tcp_sk(sk)->snd_nxt;
814 
815 	tcp_v4_send_ack(sock_net(sk), skb, seq,
816 			tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd,
817 			tcp_time_stamp,
818 			req->ts_recent,
819 			0,
820 			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
821 					  AF_INET),
822 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
823 			ip_hdr(skb)->tos);
824 }
825 
826 /*
827  *	Send a SYN-ACK after having received a SYN.
828  *	This still operates on a request_sock only, not on a big
829  *	socket.
830  */
831 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
832 			      struct flowi *fl,
833 			      struct request_sock *req,
834 			      struct tcp_fastopen_cookie *foc,
835 				  bool attach_req)
836 {
837 	const struct inet_request_sock *ireq = inet_rsk(req);
838 	struct flowi4 fl4;
839 	int err = -1;
840 	struct sk_buff *skb;
841 
842 	/* First, grab a route. */
843 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
844 		return -1;
845 
846 	skb = tcp_make_synack(sk, dst, req, foc, attach_req);
847 
848 	if (skb) {
849 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
850 
851 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
852 					    ireq->ir_rmt_addr,
853 					    ireq->opt);
854 		err = net_xmit_eval(err);
855 	}
856 
857 	return err;
858 }
859 
860 /*
861  *	IPv4 request_sock destructor.
862  */
863 static void tcp_v4_reqsk_destructor(struct request_sock *req)
864 {
865 	kfree(inet_rsk(req)->opt);
866 }
867 
868 
869 #ifdef CONFIG_TCP_MD5SIG
870 /*
871  * RFC2385 MD5 checksumming requires a mapping of
872  * IP address->MD5 Key.
873  * We need to maintain these in the sk structure.
874  */
875 
876 /* Find the Key structure for an address.  */
877 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
878 					 const union tcp_md5_addr *addr,
879 					 int family)
880 {
881 	const struct tcp_sock *tp = tcp_sk(sk);
882 	struct tcp_md5sig_key *key;
883 	unsigned int size = sizeof(struct in_addr);
884 	const struct tcp_md5sig_info *md5sig;
885 
886 	/* caller either holds rcu_read_lock() or socket lock */
887 	md5sig = rcu_dereference_check(tp->md5sig_info,
888 				       sock_owned_by_user(sk) ||
889 				       lockdep_is_held((spinlock_t *)&sk->sk_lock.slock));
890 	if (!md5sig)
891 		return NULL;
892 #if IS_ENABLED(CONFIG_IPV6)
893 	if (family == AF_INET6)
894 		size = sizeof(struct in6_addr);
895 #endif
896 	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
897 		if (key->family != family)
898 			continue;
899 		if (!memcmp(&key->addr, addr, size))
900 			return key;
901 	}
902 	return NULL;
903 }
904 EXPORT_SYMBOL(tcp_md5_do_lookup);
905 
906 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
907 					 const struct sock *addr_sk)
908 {
909 	const union tcp_md5_addr *addr;
910 
911 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
912 	return tcp_md5_do_lookup(sk, addr, AF_INET);
913 }
914 EXPORT_SYMBOL(tcp_v4_md5_lookup);
915 
916 /* This can be called on a newly created socket, from other files */
917 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
918 		   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
919 {
920 	/* Add Key to the list */
921 	struct tcp_md5sig_key *key;
922 	struct tcp_sock *tp = tcp_sk(sk);
923 	struct tcp_md5sig_info *md5sig;
924 
925 	key = tcp_md5_do_lookup(sk, addr, family);
926 	if (key) {
927 		/* Pre-existing entry - just update that one. */
928 		memcpy(key->key, newkey, newkeylen);
929 		key->keylen = newkeylen;
930 		return 0;
931 	}
932 
933 	md5sig = rcu_dereference_protected(tp->md5sig_info,
934 					   sock_owned_by_user(sk) ||
935 					   lockdep_is_held(&sk->sk_lock.slock));
936 	if (!md5sig) {
937 		md5sig = kmalloc(sizeof(*md5sig), gfp);
938 		if (!md5sig)
939 			return -ENOMEM;
940 
941 		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
942 		INIT_HLIST_HEAD(&md5sig->head);
943 		rcu_assign_pointer(tp->md5sig_info, md5sig);
944 	}
945 
946 	key = sock_kmalloc(sk, sizeof(*key), gfp);
947 	if (!key)
948 		return -ENOMEM;
949 	if (!tcp_alloc_md5sig_pool()) {
950 		sock_kfree_s(sk, key, sizeof(*key));
951 		return -ENOMEM;
952 	}
953 
954 	memcpy(key->key, newkey, newkeylen);
955 	key->keylen = newkeylen;
956 	key->family = family;
957 	memcpy(&key->addr, addr,
958 	       (family == AF_INET6) ? sizeof(struct in6_addr) :
959 				      sizeof(struct in_addr));
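	/* Publish the fully initialized key; readers walk this list under
	 * rcu_read_lock() without taking the socket lock.
	 */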
960 	hlist_add_head_rcu(&key->node, &md5sig->head);
961 	return 0;
962 }
963 EXPORT_SYMBOL(tcp_md5_do_add);
964 
965 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
966 {
967 	struct tcp_md5sig_key *key;
968 
969 	key = tcp_md5_do_lookup(sk, addr, family);
970 	if (!key)
971 		return -ENOENT;
972 	hlist_del_rcu(&key->node);
973 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
974 	kfree_rcu(key, rcu);
975 	return 0;
976 }
977 EXPORT_SYMBOL(tcp_md5_do_del);
978 
979 static void tcp_clear_md5_list(struct sock *sk)
980 {
981 	struct tcp_sock *tp = tcp_sk(sk);
982 	struct tcp_md5sig_key *key;
983 	struct hlist_node *n;
984 	struct tcp_md5sig_info *md5sig;
985 
986 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
987 
988 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
989 		hlist_del_rcu(&key->node);
990 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
991 		kfree_rcu(key, rcu);
992 	}
993 }
994 
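/* This is the setsockopt(TCP_MD5SIG) handler. For illustration only, a
 * userspace sketch of installing a key for a peer (not part of this file;
 * the address 192.0.2.1 and the key are made up):
 *
 *	struct tcp_md5sig md5 = { 0 };
 *	struct sockaddr_in *a = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	a->sin_family = AF_INET;
 *	a->sin_addr.s_addr = inet_addr("192.0.2.1");
 *	md5.tcpm_keylen = 6;
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * which arrives here via tp->af_specific->md5_parse().
 */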
995 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
996 				 int optlen)
997 {
998 	struct tcp_md5sig cmd;
999 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1000 
1001 	if (optlen < sizeof(cmd))
1002 		return -EINVAL;
1003 
1004 	if (copy_from_user(&cmd, optval, sizeof(cmd)))
1005 		return -EFAULT;
1006 
1007 	if (sin->sin_family != AF_INET)
1008 		return -EINVAL;
1009 
1010 	if (!cmd.tcpm_keylen)
1011 		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1012 				      AF_INET);
1013 
1014 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1015 		return -EINVAL;
1016 
1017 	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1018 			      AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1019 			      GFP_KERNEL);
1020 }
1021 
1022 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1023 					__be32 daddr, __be32 saddr, int nbytes)
1024 {
1025 	struct tcp4_pseudohdr *bp;
1026 	struct scatterlist sg;
1027 
1028 	bp = &hp->md5_blk.ip4;
1029 
1030 	/*
1031 	 * 1. the TCP pseudo-header (in the order: source IP address,
1032 	 * destination IP address, zero-padded protocol number, and
1033 	 * segment length)
1034 	 */
1035 	bp->saddr = saddr;
1036 	bp->daddr = daddr;
1037 	bp->pad = 0;
1038 	bp->protocol = IPPROTO_TCP;
1039 	bp->len = cpu_to_be16(nbytes);
1040 
1041 	sg_init_one(&sg, bp, sizeof(*bp));
1042 	ahash_request_set_crypt(hp->md5_req, &sg, NULL, sizeof(*bp));
1043 	return crypto_ahash_update(hp->md5_req);
1044 }
1045 
1046 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1047 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1048 {
1049 	struct tcp_md5sig_pool *hp;
1050 	struct ahash_request *req;
1051 
1052 	hp = tcp_get_md5sig_pool();
1053 	if (!hp)
1054 		goto clear_hash_noput;
1055 	req = hp->md5_req;
1056 
1057 	if (crypto_ahash_init(req))
1058 		goto clear_hash;
1059 	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1060 		goto clear_hash;
1061 	if (tcp_md5_hash_header(hp, th))
1062 		goto clear_hash;
1063 	if (tcp_md5_hash_key(hp, key))
1064 		goto clear_hash;
1065 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1066 	if (crypto_ahash_final(req))
1067 		goto clear_hash;
1068 
1069 	tcp_put_md5sig_pool();
1070 	return 0;
1071 
1072 clear_hash:
1073 	tcp_put_md5sig_pool();
1074 clear_hash_noput:
1075 	memset(md5_hash, 0, 16);
1076 	return 1;
1077 }
1078 
1079 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1080 			const struct sock *sk,
1081 			const struct sk_buff *skb)
1082 {
1083 	struct tcp_md5sig_pool *hp;
1084 	struct ahash_request *req;
1085 	const struct tcphdr *th = tcp_hdr(skb);
1086 	__be32 saddr, daddr;
1087 
1088 	if (sk) { /* valid for establish/request sockets */
1089 		saddr = sk->sk_rcv_saddr;
1090 		daddr = sk->sk_daddr;
1091 	} else {
1092 		const struct iphdr *iph = ip_hdr(skb);
1093 		saddr = iph->saddr;
1094 		daddr = iph->daddr;
1095 	}
1096 
1097 	hp = tcp_get_md5sig_pool();
1098 	if (!hp)
1099 		goto clear_hash_noput;
1100 	req = hp->md5_req;
1101 
1102 	if (crypto_ahash_init(req))
1103 		goto clear_hash;
1104 
1105 	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1106 		goto clear_hash;
1107 	if (tcp_md5_hash_header(hp, th))
1108 		goto clear_hash;
1109 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1110 		goto clear_hash;
1111 	if (tcp_md5_hash_key(hp, key))
1112 		goto clear_hash;
1113 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1114 	if (crypto_ahash_final(req))
1115 		goto clear_hash;
1116 
1117 	tcp_put_md5sig_pool();
1118 	return 0;
1119 
1120 clear_hash:
1121 	tcp_put_md5sig_pool();
1122 clear_hash_noput:
1123 	memset(md5_hash, 0, 16);
1124 	return 1;
1125 }
1126 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1127 
1128 #endif
1129 
1130 /* Called with rcu_read_lock() */
1131 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1132 				    const struct sk_buff *skb)
1133 {
1134 #ifdef CONFIG_TCP_MD5SIG
1135 	/*
1136 	 * This gets called for each TCP segment that arrives
1137 	 * so we want to be efficient.
1138 	 * We have 3 drop cases:
1139 	 * o No MD5 hash and one expected.
1140 	 * o MD5 hash and we're not expecting one.
1141 	 * o MD5 hash and it's wrong.
1142 	 */
1143 	const __u8 *hash_location = NULL;
1144 	struct tcp_md5sig_key *hash_expected;
1145 	const struct iphdr *iph = ip_hdr(skb);
1146 	const struct tcphdr *th = tcp_hdr(skb);
1147 	int genhash;
1148 	unsigned char newhash[16];
1149 
1150 	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1151 					  AF_INET);
1152 	hash_location = tcp_parse_md5sig_option(th);
1153 
1154 	/* We've parsed the options - do we have a hash? */
1155 	if (!hash_expected && !hash_location)
1156 		return false;
1157 
1158 	if (hash_expected && !hash_location) {
1159 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1160 		return true;
1161 	}
1162 
1163 	if (!hash_expected && hash_location) {
1164 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1165 		return true;
1166 	}
1167 
1168 	/* Okay, so this is hash_expected and hash_location -
1169 	 * so we need to calculate the checksum.
1170 	 */
1171 	genhash = tcp_v4_md5_hash_skb(newhash,
1172 				      hash_expected,
1173 				      NULL, skb);
1174 
1175 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1176 		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1177 				     &iph->saddr, ntohs(th->source),
1178 				     &iph->daddr, ntohs(th->dest),
1179 				     genhash ? " tcp_v4_calc_md5_hash failed"
1180 				     : "");
1181 		return true;
1182 	}
1183 	return false;
1184 #endif
1185 	return false;
1186 }
1187 
1188 static void tcp_v4_init_req(struct request_sock *req,
1189 			    const struct sock *sk_listener,
1190 			    struct sk_buff *skb)
1191 {
1192 	struct inet_request_sock *ireq = inet_rsk(req);
1193 
1194 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1195 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1196 	ireq->no_srccheck = inet_sk(sk_listener)->transparent;
1197 	ireq->opt = tcp_v4_save_options(skb);
1198 }
1199 
1200 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1201 					  struct flowi *fl,
1202 					  const struct request_sock *req,
1203 					  bool *strict)
1204 {
1205 	struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);
1206 
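	/* Tell the caller whether the route's destination is exactly the
	 * peer address; with IP source routing the two can differ.
	 */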
1207 	if (strict) {
1208 		if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
1209 			*strict = true;
1210 		else
1211 			*strict = false;
1212 	}
1213 
1214 	return dst;
1215 }
1216 
1217 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1218 	.family		=	PF_INET,
1219 	.obj_size	=	sizeof(struct tcp_request_sock),
1220 	.rtx_syn_ack	=	tcp_rtx_synack,
1221 	.send_ack	=	tcp_v4_reqsk_send_ack,
1222 	.destructor	=	tcp_v4_reqsk_destructor,
1223 	.send_reset	=	tcp_v4_send_reset,
1224 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1225 };
1226 
1227 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1228 	.mss_clamp	=	TCP_MSS_DEFAULT,
1229 #ifdef CONFIG_TCP_MD5SIG
1230 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1231 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1232 #endif
1233 	.init_req	=	tcp_v4_init_req,
1234 #ifdef CONFIG_SYN_COOKIES
1235 	.cookie_init_seq =	cookie_v4_init_sequence,
1236 #endif
1237 	.route_req	=	tcp_v4_route_req,
1238 	.init_seq	=	tcp_v4_init_sequence,
1239 	.send_synack	=	tcp_v4_send_synack,
1240 };
1241 
1242 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1243 {
1244 	/* Never answer SYNs sent to broadcast or multicast addresses */
1245 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1246 		goto drop;
1247 
1248 	return tcp_conn_request(&tcp_request_sock_ops,
1249 				&tcp_request_sock_ipv4_ops, sk, skb);
1250 
1251 drop:
1252 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1253 	return 0;
1254 }
1255 EXPORT_SYMBOL(tcp_v4_conn_request);
1256 
1257 
1258 /*
1259  * The three way handshake has completed - we got a valid synack -
1260  * now create the new socket.
1261  */
1262 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1263 				  struct request_sock *req,
1264 				  struct dst_entry *dst,
1265 				  struct request_sock *req_unhash,
1266 				  bool *own_req)
1267 {
1268 	struct inet_request_sock *ireq;
1269 	struct inet_sock *newinet;
1270 	struct tcp_sock *newtp;
1271 	struct sock *newsk;
1272 #ifdef CONFIG_TCP_MD5SIG
1273 	struct tcp_md5sig_key *key;
1274 #endif
1275 	struct ip_options_rcu *inet_opt;
1276 
1277 	if (sk_acceptq_is_full(sk))
1278 		goto exit_overflow;
1279 
1280 	newsk = tcp_create_openreq_child(sk, req, skb);
1281 	if (!newsk)
1282 		goto exit_nonewsk;
1283 
1284 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1285 	inet_sk_rx_dst_set(newsk, skb);
1286 
1287 	newtp		      = tcp_sk(newsk);
1288 	newinet		      = inet_sk(newsk);
1289 	ireq		      = inet_rsk(req);
1290 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1291 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1292 	newsk->sk_bound_dev_if = ireq->ir_iif;
1293 	newinet->inet_saddr	      = ireq->ir_loc_addr;
1294 	inet_opt	      = ireq->opt;
1295 	rcu_assign_pointer(newinet->inet_opt, inet_opt);
1296 	ireq->opt	      = NULL;
1297 	newinet->mc_index     = inet_iif(skb);
1298 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1299 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1300 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1301 	if (inet_opt)
1302 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1303 	newinet->inet_id = newtp->write_seq ^ jiffies;
1304 
1305 	if (!dst) {
1306 		dst = inet_csk_route_child_sock(sk, newsk, req);
1307 		if (!dst)
1308 			goto put_and_exit;
1309 	} else {
1310 		/* syncookie case : see end of cookie_v4_check() */
1311 	}
1312 	sk_setup_caps(newsk, dst);
1313 
1314 	tcp_ca_openreq_child(newsk, dst);
1315 
1316 	tcp_sync_mss(newsk, dst_mtu(dst));
1317 	newtp->advmss = dst_metric_advmss(dst);
1318 	if (tcp_sk(sk)->rx_opt.user_mss &&
1319 	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1320 		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1321 
1322 	tcp_initialize_rcv_mss(newsk);
1323 
1324 #ifdef CONFIG_TCP_MD5SIG
1325 	/* Copy over the MD5 key from the original socket */
1326 	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1327 				AF_INET);
1328 	if (key) {
1329 		/*
1330 		 * We're using one, so create a matching key
1331 		 * on the newsk structure. If we fail to get
1332 		 * memory, then we end up not copying the key
1333 		 * across. Shucks.
1334 		 */
1335 		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1336 			       AF_INET, key->key, key->keylen, GFP_ATOMIC);
1337 		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1338 	}
1339 #endif
1340 
1341 	if (__inet_inherit_port(sk, newsk) < 0)
1342 		goto put_and_exit;
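	/* Hash the child into the established table; *own_req tells the
	 * caller whether we won the race to own the request socket.
	 */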
1343 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1344 	if (*own_req)
1345 		tcp_move_syn(newtp, req);
1346 
1347 	return newsk;
1348 
1349 exit_overflow:
1350 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1351 exit_nonewsk:
1352 	dst_release(dst);
1353 exit:
1354 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1355 	return NULL;
1356 put_and_exit:
1357 	inet_csk_prepare_forced_close(newsk);
1358 	tcp_done(newsk);
1359 	goto exit;
1360 }
1361 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1362 
1363 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1364 {
1365 #ifdef CONFIG_SYN_COOKIES
1366 	const struct tcphdr *th = tcp_hdr(skb);
1367 
1368 	if (!th->syn)
1369 		sk = cookie_v4_check(sk, skb);
1370 #endif
1371 	return sk;
1372 }
1373 
1374 /* The socket must have its spinlock held when we get
1375  * here, unless it is a TCP_LISTEN socket.
1376  *
1377  * We have a potential double-lock case here, so even when
1378  * doing backlog processing we use the BH locking scheme.
1379  * This is because we cannot sleep with the original spinlock
1380  * held.
1381  */
1382 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1383 {
1384 	struct sock *rsk;
1385 
1386 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1387 		struct dst_entry *dst = sk->sk_rx_dst;
1388 
1389 		sock_rps_save_rxhash(sk, skb);
1390 		sk_mark_napi_id(sk, skb);
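		/* Validate the cached input route: drop it if this packet
		 * arrived on a different interface or the dst is stale.
		 */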
1391 		if (dst) {
1392 			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1393 			    !dst->ops->check(dst, 0)) {
1394 				dst_release(dst);
1395 				sk->sk_rx_dst = NULL;
1396 			}
1397 		}
1398 		tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
1399 		return 0;
1400 	}
1401 
1402 	if (tcp_checksum_complete(skb))
1403 		goto csum_err;
1404 
1405 	if (sk->sk_state == TCP_LISTEN) {
1406 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1407 
1408 		if (!nsk)
1409 			goto discard;
1410 		if (nsk != sk) {
1411 			sock_rps_save_rxhash(nsk, skb);
1412 			sk_mark_napi_id(nsk, skb);
1413 			if (tcp_child_process(sk, nsk, skb)) {
1414 				rsk = nsk;
1415 				goto reset;
1416 			}
1417 			return 0;
1418 		}
1419 	} else
1420 		sock_rps_save_rxhash(sk, skb);
1421 
1422 	if (tcp_rcv_state_process(sk, skb)) {
1423 		rsk = sk;
1424 		goto reset;
1425 	}
1426 	return 0;
1427 
1428 reset:
1429 	tcp_v4_send_reset(rsk, skb);
1430 discard:
1431 	kfree_skb(skb);
1432 	/* Be careful here. If this function gets more complicated and
1433 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1434 	 * might be destroyed here. This current version compiles correctly,
1435 	 * but you have been warned.
1436 	 */
1437 	return 0;
1438 
1439 csum_err:
1440 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
1441 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1442 	goto discard;
1443 }
1444 EXPORT_SYMBOL(tcp_v4_do_rcv);
1445 
1446 void tcp_v4_early_demux(struct sk_buff *skb)
1447 {
1448 	const struct iphdr *iph;
1449 	const struct tcphdr *th;
1450 	struct sock *sk;
1451 
1452 	if (skb->pkt_type != PACKET_HOST)
1453 		return;
1454 
1455 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1456 		return;
1457 
1458 	iph = ip_hdr(skb);
1459 	th = tcp_hdr(skb);
1460 
1461 	if (th->doff < sizeof(struct tcphdr) / 4)
1462 		return;
1463 
1464 	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1465 				       iph->saddr, th->source,
1466 				       iph->daddr, ntohs(th->dest),
1467 				       skb->skb_iif);
1468 	if (sk) {
1469 		skb->sk = sk;
1470 		skb->destructor = sock_edemux;
1471 		if (sk_fullsock(sk)) {
1472 			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1473 
1474 			if (dst)
1475 				dst = dst_check(dst, 0);
1476 			if (dst &&
1477 			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1478 				skb_dst_set_noref(skb, dst);
1479 		}
1480 	}
1481 }
1482 
1483 /* Packet is added to VJ-style prequeue for processing in process
1484  * context, if a reader task is waiting. Apparently, this exciting
1485  * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1486  * failed somewhere. Latency? Burstiness? Well, at least now we will
1487  * see, why it failed. 8)8)				  --ANK
1488  * see why it failed. 8)8)				  --ANK
1489  */
1490 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1491 {
1492 	struct tcp_sock *tp = tcp_sk(sk);
1493 
1494 	if (sysctl_tcp_low_latency || !tp->ucopy.task)
1495 		return false;
1496 
1497 	if (skb->len <= tcp_hdrlen(skb) &&
1498 	    skb_queue_len(&tp->ucopy.prequeue) == 0)
1499 		return false;
1500 
1501 	/* Before escaping RCU protected region, we need to take care of skb
1502 	 * dst. Prequeue is only enabled for established sockets.
1503 	 * For such sockets, we might need the skb dst only to set sk->sk_rx_dst.
1504 	 * Instead of doing full sk_rx_dst validity here, let's perform
1505 	 * an optimistic check.
1506 	 */
1507 	if (likely(sk->sk_rx_dst))
1508 		skb_dst_drop(skb);
1509 	else
1510 		skb_dst_force_safe(skb);
1511 
1512 	__skb_queue_tail(&tp->ucopy.prequeue, skb);
1513 	tp->ucopy.memory += skb->truesize;
1514 	if (tp->ucopy.memory > sk->sk_rcvbuf) {
1515 		struct sk_buff *skb1;
1516 
1517 		BUG_ON(sock_owned_by_user(sk));
1518 
1519 		while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
1520 			sk_backlog_rcv(sk, skb1);
1521 			NET_INC_STATS_BH(sock_net(sk),
1522 					 LINUX_MIB_TCPPREQUEUEDROPPED);
1523 		}
1524 
1525 		tp->ucopy.memory = 0;
1526 	} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
1527 		wake_up_interruptible_sync_poll(sk_sleep(sk),
1528 					   POLLIN | POLLRDNORM | POLLRDBAND);
1529 		if (!inet_csk_ack_scheduled(sk))
1530 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1531 						  (3 * tcp_rto_min(sk)) / 4,
1532 						  TCP_RTO_MAX);
1533 	}
1534 	return true;
1535 }
1536 EXPORT_SYMBOL(tcp_prequeue);
1537 
1538 /*
1539  *	From tcp_input.c
1540  */
1541 
1542 int tcp_v4_rcv(struct sk_buff *skb)
1543 {
1544 	const struct iphdr *iph;
1545 	const struct tcphdr *th;
1546 	struct sock *sk;
1547 	int ret;
1548 	struct net *net = dev_net(skb->dev);
1549 
1550 	if (skb->pkt_type != PACKET_HOST)
1551 		goto discard_it;
1552 
1553 	/* Count it even if it's bad */
1554 	TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1555 
1556 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1557 		goto discard_it;
1558 
1559 	th = tcp_hdr(skb);
1560 
1561 	if (th->doff < sizeof(struct tcphdr) / 4)
1562 		goto bad_packet;
1563 	if (!pskb_may_pull(skb, th->doff * 4))
1564 		goto discard_it;
1565 
1566 	/* An explanation is required here, I think.
1567 	 * Packet length and doff are validated by header prediction,
1568 	 * provided the case of th->doff==0 is eliminated.
1569 	 * So, we defer the checks. */
1570 
1571 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1572 		goto csum_error;
1573 
1574 	th = tcp_hdr(skb);
1575 	iph = ip_hdr(skb);
1576 	/* This is tricky: we move the IPCB to its correct location inside TCP_SKB_CB();
1577 	 * barrier() makes sure the compiler won't play fool^Waliasing games.
1578 	 */
1579 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1580 		sizeof(struct inet_skb_parm));
1581 	barrier();
1582 
1583 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1584 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1585 				    skb->len - th->doff * 4);
1586 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1587 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1588 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1589 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1590 	TCP_SKB_CB(skb)->sacked	 = 0;
1591 
1592 lookup:
1593 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1594 	if (!sk)
1595 		goto no_tcp_socket;
1596 
1597 process:
1598 	if (sk->sk_state == TCP_TIME_WAIT)
1599 		goto do_time_wait;
1600 
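	/* A request socket matched: process the final ACK of the handshake
	 * (or a retransmitted SYN) on behalf of its listener.
	 */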
1601 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
1602 		struct request_sock *req = inet_reqsk(sk);
1603 		struct sock *nsk;
1604 
1605 		sk = req->rsk_listener;
1606 		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1607 			reqsk_put(req);
1608 			goto discard_it;
1609 		}
1610 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
1611 			inet_csk_reqsk_queue_drop_and_put(sk, req);
1612 			goto lookup;
1613 		}
1614 		sock_hold(sk);
1615 		nsk = tcp_check_req(sk, skb, req, false);
1616 		if (!nsk) {
1617 			reqsk_put(req);
1618 			goto discard_and_relse;
1619 		}
1620 		if (nsk == sk) {
1621 			reqsk_put(req);
1622 		} else if (tcp_child_process(sk, nsk, skb)) {
1623 			tcp_v4_send_reset(nsk, skb);
1624 			goto discard_and_relse;
1625 		} else {
1626 			sock_put(sk);
1627 			return 0;
1628 		}
1629 	}
1630 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1631 		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1632 		goto discard_and_relse;
1633 	}
1634 
1635 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1636 		goto discard_and_relse;
1637 
1638 	if (tcp_v4_inbound_md5_hash(sk, skb))
1639 		goto discard_and_relse;
1640 
1641 	nf_reset(skb);
1642 
1643 	if (sk_filter(sk, skb))
1644 		goto discard_and_relse;
1645 
1646 	skb->dev = NULL;
1647 
1648 	if (sk->sk_state == TCP_LISTEN) {
1649 		ret = tcp_v4_do_rcv(sk, skb);
1650 		goto put_and_return;
1651 	}
1652 
1653 	sk_incoming_cpu_update(sk);
1654 
1655 	bh_lock_sock_nested(sk);
1656 	tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
1657 	ret = 0;
1658 	if (!sock_owned_by_user(sk)) {
1659 		if (!tcp_prequeue(sk, skb))
1660 			ret = tcp_v4_do_rcv(sk, skb);
1661 	} else if (unlikely(sk_add_backlog(sk, skb,
1662 					   sk->sk_rcvbuf + sk->sk_sndbuf))) {
1663 		bh_unlock_sock(sk);
1664 		NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1665 		goto discard_and_relse;
1666 	}
1667 	bh_unlock_sock(sk);
1668 
1669 put_and_return:
1670 	sock_put(sk);
1671 
1672 	return ret;
1673 
1674 no_tcp_socket:
1675 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1676 		goto discard_it;
1677 
1678 	if (tcp_checksum_complete(skb)) {
1679 csum_error:
1680 		TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
1681 bad_packet:
1682 		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1683 	} else {
1684 		tcp_v4_send_reset(NULL, skb);
1685 	}
1686 
1687 discard_it:
1688 	/* Discard frame. */
1689 	kfree_skb(skb);
1690 	return 0;
1691 
1692 discard_and_relse:
1693 	sock_put(sk);
1694 	goto discard_it;
1695 
1696 do_time_wait:
1697 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1698 		inet_twsk_put(inet_twsk(sk));
1699 		goto discard_it;
1700 	}
1701 
1702 	if (tcp_checksum_complete(skb)) {
1703 		inet_twsk_put(inet_twsk(sk));
1704 		goto csum_error;
1705 	}
1706 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1707 	case TCP_TW_SYN: {
1708 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1709 							&tcp_hashinfo,
1710 							iph->saddr, th->source,
1711 							iph->daddr, th->dest,
1712 							inet_iif(skb));
1713 		if (sk2) {
1714 			inet_twsk_deschedule_put(inet_twsk(sk));
1715 			sk = sk2;
1716 			goto process;
1717 		}
1718 		/* Fall through to ACK */
1719 	}
1720 	case TCP_TW_ACK:
1721 		tcp_v4_timewait_ack(sk, skb);
1722 		break;
1723 	case TCP_TW_RST:
1724 		tcp_v4_send_reset(sk, skb);
1725 		inet_twsk_deschedule_put(inet_twsk(sk));
1726 		goto discard_it;
1727 	case TCP_TW_SUCCESS:;
1728 	}
1729 	goto discard_it;
1730 }
1731 
1732 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1733 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
1734 	.twsk_unique	= tcp_twsk_unique,
1735 	.twsk_destructor= tcp_twsk_destructor,
1736 };
1737 
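/* Cache the input route and the interface it arrived on so the receive
 * fast path and early demux can reuse them.
 */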
1738 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1739 {
1740 	struct dst_entry *dst = skb_dst(skb);
1741 
1742 	if (dst && dst_hold_safe(dst)) {
1743 		sk->sk_rx_dst = dst;
1744 		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1745 	}
1746 }
1747 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1748 
1749 const struct inet_connection_sock_af_ops ipv4_specific = {
1750 	.queue_xmit	   = ip_queue_xmit,
1751 	.send_check	   = tcp_v4_send_check,
1752 	.rebuild_header	   = inet_sk_rebuild_header,
1753 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
1754 	.conn_request	   = tcp_v4_conn_request,
1755 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
1756 	.net_header_len	   = sizeof(struct iphdr),
1757 	.setsockopt	   = ip_setsockopt,
1758 	.getsockopt	   = ip_getsockopt,
1759 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
1760 	.sockaddr_len	   = sizeof(struct sockaddr_in),
1761 	.bind_conflict	   = inet_csk_bind_conflict,
1762 #ifdef CONFIG_COMPAT
1763 	.compat_setsockopt = compat_ip_setsockopt,
1764 	.compat_getsockopt = compat_ip_getsockopt,
1765 #endif
1766 	.mtu_reduced	   = tcp_v4_mtu_reduced,
1767 };
1768 EXPORT_SYMBOL(ipv4_specific);
1769 
1770 #ifdef CONFIG_TCP_MD5SIG
1771 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1772 	.md5_lookup		= tcp_v4_md5_lookup,
1773 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
1774 	.md5_parse		= tcp_v4_parse_md5_keys,
1775 };
1776 #endif
1777 
1778 /* NOTE: A lot of things are set to zero explicitly by the call to
1779  *       sk_alloc(), so they need not be done here.
1780  */
1781 static int tcp_v4_init_sock(struct sock *sk)
1782 {
1783 	struct inet_connection_sock *icsk = inet_csk(sk);
1784 
1785 	tcp_init_sock(sk);
1786 
1787 	icsk->icsk_af_ops = &ipv4_specific;
1788 
1789 #ifdef CONFIG_TCP_MD5SIG
1790 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1791 #endif
1792 
1793 	return 0;
1794 }
1795 
1796 void tcp_v4_destroy_sock(struct sock *sk)
1797 {
1798 	struct tcp_sock *tp = tcp_sk(sk);
1799 
1800 	tcp_clear_xmit_timers(sk);
1801 
1802 	tcp_cleanup_congestion_control(sk);
1803 
1804 	/* Clean up the write buffer. */
1805 	tcp_write_queue_purge(sk);
1806 
1807 	/* Clean up our, hopefully empty, out_of_order_queue. */
1808 	__skb_queue_purge(&tp->out_of_order_queue);
1809 
1810 #ifdef CONFIG_TCP_MD5SIG
1811 	/* Clean up the MD5 key list, if any */
1812 	if (tp->md5sig_info) {
1813 		tcp_clear_md5_list(sk);
1814 		kfree_rcu(tp->md5sig_info, rcu);
1815 		tp->md5sig_info = NULL;
1816 	}
1817 #endif
1818 
1819 	/* Clean prequeue, it must be empty really */
1820 	__skb_queue_purge(&tp->ucopy.prequeue);
1821 
1822 	/* Clean up a referenced TCP bind bucket. */
1823 	if (inet_csk(sk)->icsk_bind_hash)
1824 		inet_put_port(sk);
1825 
1826 	BUG_ON(tp->fastopen_rsk);
1827 
1828 	/* If socket is aborted during connect operation */
1829 	tcp_free_fastopen_req(tp);
1830 	tcp_saved_syn_free(tp);
1831 
1832 	sk_sockets_allocated_dec(sk);
1833 
1834 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
1835 		sock_release_memcg(sk);
1836 }
1837 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1838 
1839 #ifdef CONFIG_PROC_FS
1840 /* Proc filesystem TCP sock list dumping. */
1841 
1842 /*
1843  * Get next listener socket following cur.  If cur is NULL, get the first
1844  * socket starting from the bucket given in st->bucket; when st->bucket is
1845  * zero the very first socket in the hash table is returned.
1846  */
1847 static void *listening_get_next(struct seq_file *seq, void *cur)
1848 {
1850 	struct hlist_nulls_node *node;
1851 	struct sock *sk = cur;
1852 	struct inet_listen_hashbucket *ilb;
1853 	struct tcp_iter_state *st = seq->private;
1854 	struct net *net = seq_file_net(seq);
1855 
1856 	if (!sk) {
1857 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
1858 		spin_lock_bh(&ilb->lock);
1859 		sk = sk_nulls_head(&ilb->head);
1860 		st->offset = 0;
1861 		goto get_sk;
1862 	}
1863 	ilb = &tcp_hashinfo.listening_hash[st->bucket];
1864 	++st->num;
1865 	++st->offset;
1866 
1867 	sk = sk_nulls_next(sk);
1868 get_sk:
1869 	sk_nulls_for_each_from(sk, node) {
1870 		if (!net_eq(sock_net(sk), net))
1871 			continue;
1872 		if (sk->sk_family == st->family) {
1873 			cur = sk;
1874 			goto out;
1875 		}
1877 	}
1878 	spin_unlock_bh(&ilb->lock);
1879 	st->offset = 0;
1880 	if (++st->bucket < INET_LHTABLE_SIZE) {
1881 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
1882 		spin_lock_bh(&ilb->lock);
1883 		sk = sk_nulls_head(&ilb->head);
1884 		goto get_sk;
1885 	}
1886 	cur = NULL;
1887 out:
1888 	return cur;
1889 }
1890 
1891 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1892 {
1893 	struct tcp_iter_state *st = seq->private;
1894 	void *rc;
1895 
1896 	st->bucket = 0;
1897 	st->offset = 0;
1898 	rc = listening_get_next(seq, NULL);
1899 
1900 	while (rc && *pos) {
1901 		rc = listening_get_next(seq, rc);
1902 		--*pos;
1903 	}
1904 	return rc;
1905 }
1906 
1907 static inline bool empty_bucket(const struct tcp_iter_state *st)
1908 {
1909 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1910 }
1911 
1912 /*
1913  * Get first established socket starting from bucket given in st->bucket.
1914  * If st->bucket is zero, the very first socket in the hash is returned.
1915  */
1916 static void *established_get_first(struct seq_file *seq)
1917 {
1918 	struct tcp_iter_state *st = seq->private;
1919 	struct net *net = seq_file_net(seq);
1920 	void *rc = NULL;
1921 
1922 	st->offset = 0;
1923 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1924 		struct sock *sk;
1925 		struct hlist_nulls_node *node;
1926 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1927 
1928 		/* Lockless fast path for the common case of empty buckets */
1929 		if (empty_bucket(st))
1930 			continue;
1931 
1932 		spin_lock_bh(lock);
1933 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1934 			if (sk->sk_family != st->family ||
1935 			    !net_eq(sock_net(sk), net)) {
1936 				continue;
1937 			}
1938 			rc = sk;
1939 			goto out;
1940 		}
1941 		spin_unlock_bh(lock);
1942 	}
1943 out:
1944 	return rc;
1945 }
1946 
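/* Get the next established socket after cur in the current ehash chain;
 * when the chain is exhausted, drop its lock and continue with the first
 * socket of the next non-empty bucket.
 */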
1947 static void *established_get_next(struct seq_file *seq, void *cur)
1948 {
1949 	struct sock *sk = cur;
1950 	struct hlist_nulls_node *node;
1951 	struct tcp_iter_state *st = seq->private;
1952 	struct net *net = seq_file_net(seq);
1953 
1954 	++st->num;
1955 	++st->offset;
1956 
1957 	sk = sk_nulls_next(sk);
1958 
1959 	sk_nulls_for_each_from(sk, node) {
1960 		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
1961 			return sk;
1962 	}
1963 
1964 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1965 	++st->bucket;
1966 	return established_get_first(seq);
1967 }
1968 
1969 static void *established_get_idx(struct seq_file *seq, loff_t pos)
1970 {
1971 	struct tcp_iter_state *st = seq->private;
1972 	void *rc;
1973 
1974 	st->bucket = 0;
1975 	rc = established_get_first(seq);
1976 
1977 	while (rc && pos) {
1978 		rc = established_get_next(seq, rc);
1979 		--pos;
1980 	}
1981 	return rc;
1982 }
1983 
1984 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
1985 {
1986 	void *rc;
1987 	struct tcp_iter_state *st = seq->private;
1988 
1989 	st->state = TCP_SEQ_STATE_LISTENING;
1990 	rc	  = listening_get_idx(seq, &pos);
1991 
1992 	if (!rc) {
1993 		st->state = TCP_SEQ_STATE_ESTABLISHED;
1994 		rc	  = established_get_idx(seq, pos);
1995 	}
1996 
1997 	return rc;
1998 }
1999 
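/* Resume iteration at the bucket and in-bucket offset remembered in st,
 * so that a read() continuing where the previous one stopped does not
 * rescan the hash tables from the start.
 */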
2000 static void *tcp_seek_last_pos(struct seq_file *seq)
2001 {
2002 	struct tcp_iter_state *st = seq->private;
2003 	int offset = st->offset;
2004 	int orig_num = st->num;
2005 	void *rc = NULL;
2006 
2007 	switch (st->state) {
2008 	case TCP_SEQ_STATE_LISTENING:
2009 		if (st->bucket >= INET_LHTABLE_SIZE)
2010 			break;
2012 		rc = listening_get_next(seq, NULL);
2013 		while (offset-- && rc)
2014 			rc = listening_get_next(seq, rc);
2015 		if (rc)
2016 			break;
2017 		st->bucket = 0;
2018 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2019 		/* Fallthrough */
2020 	case TCP_SEQ_STATE_ESTABLISHED:
2021 		if (st->bucket > tcp_hashinfo.ehash_mask)
2022 			break;
2023 		rc = established_get_first(seq);
2024 		while (offset-- && rc)
2025 			rc = established_get_next(seq, rc);
2026 	}
2027 
2028 	st->num = orig_num;
2029 
2030 	return rc;
2031 }
2032 
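/* seq_file ->start(): if *pos matches where we stopped last time, resume
 * there via tcp_seek_last_pos(); otherwise restart from the beginning.
 * Position 0 yields SEQ_START_TOKEN so ->show() can emit the header.
 */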
2033 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2034 {
2035 	struct tcp_iter_state *st = seq->private;
2036 	void *rc;
2037 
2038 	if (*pos && *pos == st->last_pos) {
2039 		rc = tcp_seek_last_pos(seq);
2040 		if (rc)
2041 			goto out;
2042 	}
2043 
2044 	st->state = TCP_SEQ_STATE_LISTENING;
2045 	st->num = 0;
2046 	st->bucket = 0;
2047 	st->offset = 0;
2048 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2049 
2050 out:
2051 	st->last_pos = *pos;
2052 	return rc;
2053 }
2054 
2055 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2056 {
2057 	struct tcp_iter_state *st = seq->private;
2058 	void *rc = NULL;
2059 
2060 	if (v == SEQ_START_TOKEN) {
2061 		rc = tcp_get_idx(seq, 0);
2062 		goto out;
2063 	}
2064 
2065 	switch (st->state) {
2066 	case TCP_SEQ_STATE_LISTENING:
2067 		rc = listening_get_next(seq, v);
2068 		if (!rc) {
2069 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2070 			st->bucket = 0;
2071 			st->offset = 0;
2072 			rc	  = established_get_first(seq);
2073 		}
2074 		break;
2075 	case TCP_SEQ_STATE_ESTABLISHED:
2076 		rc = established_get_next(seq, v);
2077 		break;
2078 	}
2079 out:
2080 	++*pos;
2081 	st->last_pos = *pos;
2082 	return rc;
2083 }
2084 
2085 static void tcp_seq_stop(struct seq_file *seq, void *v)
2086 {
2087 	struct tcp_iter_state *st = seq->private;
2088 
2089 	switch (st->state) {
2090 	case TCP_SEQ_STATE_LISTENING:
2091 		if (v != SEQ_START_TOKEN)
2092 			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2093 		break;
2094 	case TCP_SEQ_STATE_ESTABLISHED:
2095 		if (v)
2096 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2097 		break;
2098 	}
2099 }
2100 
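/* seq_file open hook shared by the IPv4 and IPv6 /proc entries; the
 * per-entry afinfo selects which address family the iterator reports.
 */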
2101 int tcp_seq_open(struct inode *inode, struct file *file)
2102 {
2103 	struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2104 	struct tcp_iter_state *s;
2105 	int err;
2106 
2107 	err = seq_open_net(inode, file, &afinfo->seq_ops,
2108 			  sizeof(struct tcp_iter_state));
2109 	if (err < 0)
2110 		return err;
2111 
2112 	s = ((struct seq_file *)file->private_data)->private;
2113 	s->family		= afinfo->family;
2114 	s->last_pos		= 0;
2115 	return 0;
2116 }
2117 EXPORT_SYMBOL(tcp_seq_open);
2118 
2119 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2120 {
2121 	int rc = 0;
2122 	struct proc_dir_entry *p;
2123 
2124 	afinfo->seq_ops.start		= tcp_seq_start;
2125 	afinfo->seq_ops.next		= tcp_seq_next;
2126 	afinfo->seq_ops.stop		= tcp_seq_stop;
2127 
2128 	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2129 			     afinfo->seq_fops, afinfo);
2130 	if (!p)
2131 		rc = -ENOMEM;
2132 	return rc;
2133 }
2134 EXPORT_SYMBOL(tcp_proc_register);
2135 
2136 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2137 {
2138 	remove_proc_entry(afinfo->name, net->proc_net);
2139 }
2140 EXPORT_SYMBOL(tcp_proc_unregister);
2141 
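/* Format one /proc/net/tcp line for a request socket (SYN_RECV state). */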
2142 static void get_openreq4(const struct request_sock *req,
2143 			 struct seq_file *f, int i)
2144 {
2145 	const struct inet_request_sock *ireq = inet_rsk(req);
2146 	long delta = req->rsk_timer.expires - jiffies;
2147 
2148 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2149 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2150 		i,
2151 		ireq->ir_loc_addr,
2152 		ireq->ir_num,
2153 		ireq->ir_rmt_addr,
2154 		ntohs(ireq->ir_rmt_port),
2155 		TCP_SYN_RECV,
2156 		0, 0, /* could print option size, but that is af-dependent. */
2157 		1,    /* timers active (only the expire timer) */
2158 		jiffies_delta_to_clock_t(delta),
2159 		req->num_timeout,
2160 		from_kuid_munged(seq_user_ns(f),
2161 				 sock_i_uid(req->rsk_listener)),
2162 		0,  /* non-standard timer */
2163 		0, /* open_requests have no inode */
2164 		0, /* ref count, not tracked for request sockets */
2165 		req);
2166 }
2167 
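/* Format one /proc/net/tcp line for a full socket: queue sizes, the
 * pending timer, RTO/ATO and congestion window state.
 */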
2168 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2169 {
2170 	int timer_active;
2171 	unsigned long timer_expires;
2172 	const struct tcp_sock *tp = tcp_sk(sk);
2173 	const struct inet_connection_sock *icsk = inet_csk(sk);
2174 	const struct inet_sock *inet = inet_sk(sk);
2175 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2176 	__be32 dest = inet->inet_daddr;
2177 	__be32 src = inet->inet_rcv_saddr;
2178 	__u16 destp = ntohs(inet->inet_dport);
2179 	__u16 srcp = ntohs(inet->inet_sport);
2180 	int rx_queue;
2181 	int state;
2182 
2183 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2184 	    icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2185 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2186 		timer_active	= 1;
2187 		timer_expires	= icsk->icsk_timeout;
2188 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2189 		timer_active	= 4;
2190 		timer_expires	= icsk->icsk_timeout;
2191 	} else if (timer_pending(&sk->sk_timer)) {
2192 		timer_active	= 2;
2193 		timer_expires	= sk->sk_timer.expires;
2194 	} else {
2195 		timer_active	= 0;
2196 		timer_expires	= jiffies;
2197 	}
2198 
2199 	state = sk_state_load(sk);
2200 	if (state == TCP_LISTEN)
2201 		rx_queue = sk->sk_ack_backlog;
2202 	else
2203 		/* Because we don't lock the socket,
2204 		 * we might find a transient negative value.
2205 		 */
2206 		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2207 
2208 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2209 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2210 		i, src, srcp, dest, destp, state,
2211 		tp->write_seq - tp->snd_una,
2212 		rx_queue,
2213 		timer_active,
2214 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2215 		icsk->icsk_retransmits,
2216 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2217 		icsk->icsk_probes_out,
2218 		sock_i_ino(sk),
2219 		atomic_read(&sk->sk_refcnt), sk,
2220 		jiffies_to_clock_t(icsk->icsk_rto),
2221 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2222 		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2223 		tp->snd_cwnd,
2224 		state == TCP_LISTEN ?
2225 		    fastopenq->max_qlen :
2226 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2227 }
2228 
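/* Format one /proc/net/tcp line for a TIME_WAIT socket. */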
2229 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2230 			       struct seq_file *f, int i)
2231 {
2232 	long delta = tw->tw_timer.expires - jiffies;
2233 	__be32 dest, src;
2234 	__u16 destp, srcp;
2235 
2236 	dest  = tw->tw_daddr;
2237 	src   = tw->tw_rcv_saddr;
2238 	destp = ntohs(tw->tw_dport);
2239 	srcp  = ntohs(tw->tw_sport);
2240 
2241 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2242 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2243 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2244 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2245 		atomic_read(&tw->tw_refcnt), tw);
2246 }
2247 
2248 #define TMPSZ 150
2249 
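/* Emit one padded line per entry, dispatching on socket state:
 * TIME_WAIT and NEW_SYN_RECV entries are "mini" sockets with their own
 * formatters; everything else is a full tcp_sock.  A listening-socket
 * line looks roughly like this (values illustrative only):
 *
 *   0: 0100007F:0CEA 00000000:0000 0A 00000000:00000000 00:00000000 00000000  1000        0 12345 1 ffff880036e01c00 100 0 0 10 0
 */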
2250 static int tcp4_seq_show(struct seq_file *seq, void *v)
2251 {
2252 	struct tcp_iter_state *st;
2253 	struct sock *sk = v;
2254 
2255 	seq_setwidth(seq, TMPSZ - 1);
2256 	if (v == SEQ_START_TOKEN) {
2257 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2258 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2259 			   "inode");
2260 		goto out;
2261 	}
2262 	st = seq->private;
2263 
2264 	if (sk->sk_state == TCP_TIME_WAIT)
2265 		get_timewait4_sock(v, seq, st->num);
2266 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2267 		get_openreq4(v, seq, st->num);
2268 	else
2269 		get_tcp4_sock(v, seq, st->num);
2270 out:
2271 	seq_pad(seq, '\n');
2272 	return 0;
2273 }
2274 
2275 static const struct file_operations tcp_afinfo_seq_fops = {
2276 	.owner   = THIS_MODULE,
2277 	.open    = tcp_seq_open,
2278 	.read    = seq_read,
2279 	.llseek  = seq_lseek,
2280 	.release = seq_release_net
2281 };
2282 
2283 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2284 	.name		= "tcp",
2285 	.family		= AF_INET,
2286 	.seq_fops	= &tcp_afinfo_seq_fops,
2287 	.seq_ops	= {
2288 		.show		= tcp4_seq_show,
2289 	},
2290 };
2291 
2292 static int __net_init tcp4_proc_init_net(struct net *net)
2293 {
2294 	return tcp_proc_register(net, &tcp4_seq_afinfo);
2295 }
2296 
2297 static void __net_exit tcp4_proc_exit_net(struct net *net)
2298 {
2299 	tcp_proc_unregister(net, &tcp4_seq_afinfo);
2300 }
2301 
2302 static struct pernet_operations tcp4_net_ops = {
2303 	.init = tcp4_proc_init_net,
2304 	.exit = tcp4_proc_exit_net,
2305 };
2306 
2307 int __init tcp4_proc_init(void)
2308 {
2309 	return register_pernet_subsys(&tcp4_net_ops);
2310 }
2311 
2312 void tcp4_proc_exit(void)
2313 {
2314 	unregister_pernet_subsys(&tcp4_net_ops);
2315 }
2316 #endif /* CONFIG_PROC_FS */
2317 
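/* The IPv4 TCP protocol operations registered with the socket layer;
 * the IPv6 side provides its own tcpv6_prot in net/ipv6/tcp_ipv6.c.
 */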
2318 struct proto tcp_prot = {
2319 	.name			= "TCP",
2320 	.owner			= THIS_MODULE,
2321 	.close			= tcp_close,
2322 	.connect		= tcp_v4_connect,
2323 	.disconnect		= tcp_disconnect,
2324 	.accept			= inet_csk_accept,
2325 	.ioctl			= tcp_ioctl,
2326 	.init			= tcp_v4_init_sock,
2327 	.destroy		= tcp_v4_destroy_sock,
2328 	.shutdown		= tcp_shutdown,
2329 	.setsockopt		= tcp_setsockopt,
2330 	.getsockopt		= tcp_getsockopt,
2331 	.recvmsg		= tcp_recvmsg,
2332 	.sendmsg		= tcp_sendmsg,
2333 	.sendpage		= tcp_sendpage,
2334 	.backlog_rcv		= tcp_v4_do_rcv,
2335 	.release_cb		= tcp_release_cb,
2336 	.hash			= inet_hash,
2337 	.unhash			= inet_unhash,
2338 	.get_port		= inet_csk_get_port,
2339 	.enter_memory_pressure	= tcp_enter_memory_pressure,
2340 	.stream_memory_free	= tcp_stream_memory_free,
2341 	.sockets_allocated	= &tcp_sockets_allocated,
2342 	.orphan_count		= &tcp_orphan_count,
2343 	.memory_allocated	= &tcp_memory_allocated,
2344 	.memory_pressure	= &tcp_memory_pressure,
2345 	.sysctl_mem		= sysctl_tcp_mem,
2346 	.sysctl_wmem		= sysctl_tcp_wmem,
2347 	.sysctl_rmem		= sysctl_tcp_rmem,
2348 	.max_header		= MAX_TCP_HEADER,
2349 	.obj_size		= sizeof(struct tcp_sock),
2350 	.slab_flags		= SLAB_DESTROY_BY_RCU,
2351 	.twsk_prot		= &tcp_timewait_sock_ops,
2352 	.rsk_prot		= &tcp_request_sock_ops,
2353 	.h.hashinfo		= &tcp_hashinfo,
2354 	.no_autobind		= true,
2355 #ifdef CONFIG_COMPAT
2356 	.compat_setsockopt	= compat_tcp_setsockopt,
2357 	.compat_getsockopt	= compat_tcp_getsockopt,
2358 #endif
2359 	.diag_destroy		= tcp_abort,
2360 };
2361 EXPORT_SYMBOL(tcp_prot);
2362 
2363 static void __net_exit tcp_sk_exit(struct net *net)
2364 {
2365 	int cpu;
2366 
2367 	for_each_possible_cpu(cpu)
2368 		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2369 	free_percpu(net->ipv4.tcp_sk);
2370 }
2371 
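/* Per-namespace init: create one control socket per possible CPU (used
 * to send RSTs and ACKs when no full socket exists) and initialize the
 * namespace-local sysctl defaults.
 */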
2372 static int __net_init tcp_sk_init(struct net *net)
2373 {
2374 	int res, cpu;
2375 
2376 	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2377 	if (!net->ipv4.tcp_sk)
2378 		return -ENOMEM;
2379 
2380 	for_each_possible_cpu(cpu) {
2381 		struct sock *sk;
2382 
2383 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2384 					   IPPROTO_TCP, net);
2385 		if (res)
2386 			goto fail;
2387 		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2388 	}
2389 
2390 	net->ipv4.sysctl_tcp_ecn = 2;
2391 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
2392 
2393 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2394 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2395 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2396 
2397 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2398 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2399 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2400 
2401 	return 0;
2402 fail:
2403 	tcp_sk_exit(net);
2404 
2405 	return res;
2406 }
2407 
2408 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2409 {
2410 	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2411 }
2412 
2413 static struct pernet_operations __net_initdata tcp_sk_ops = {
2414 	.init	   = tcp_sk_init,
2415 	.exit	   = tcp_sk_exit,
2416 	.exit_batch = tcp_sk_exit_batch,
2417 };
2418 
2419 void __init tcp_v4_init(void)
2420 {
2421 	inet_hashinfo_init(&tcp_hashinfo);
2422 	if (register_pernet_subsys(&tcp_sk_ops))
2423 		panic("Failed to create the TCP control socket.\n");
2424 }
2425