xref: /linux/net/ipv4/tcp_ipv4.c (revision 0883c2c06fb5bcf5b9e008270827e63c09a88c1e)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Implementation of the Transmission Control Protocol(TCP).
7  *
8  *		IPv4 specific functions
9  *
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  *
18  *	This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23 
24 /*
25  * Changes:
26  *		David S. Miller	:	New socket lookup architecture.
27  *					This code is dedicated to John Dyson.
28  *		David S. Miller :	Change semantics of established hash,
29  *					half is devoted to TIME_WAIT sockets
30  *					and the rest go in the other half.
31  *		Andi Kleen :		Add support for syncookies and fixed
32  *					some bugs: ip options weren't passed to
33  *					the TCP layer, missed a check for an
34  *					ACK bit.
35  *		Andi Kleen :		Implemented fast path mtu discovery.
36  *	     				Fixed many serious bugs in the
37  *					request_sock handling and moved
38  *					most of it into the af independent code.
39  *					Added tail drop and some other bugfixes.
40  *					Added new listen semantics.
41  *		Mike McLagan	:	Routing by source
42  *	Juan Jose Ciarlante:		ip_dynaddr bits
43  *		Andi Kleen:		various fixes.
44  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
45  *					coma.
46  *	Andi Kleen		:	Fix new listen.
47  *	Andi Kleen		:	Fix accept error reporting.
48  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
49  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
50  *					a single port at the same time.
51  */
52 
53 #define pr_fmt(fmt) "TCP: " fmt
54 
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65 
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/secure_seq.h>
76 #include <net/busy_poll.h>
77 
78 #include <linux/inet.h>
79 #include <linux/ipv6.h>
80 #include <linux/stddef.h>
81 #include <linux/proc_fs.h>
82 #include <linux/seq_file.h>
83 
84 #include <crypto/hash.h>
85 #include <linux/scatterlist.h>
86 
/* Sysctl switch read by tcp_twsk_unique() when deciding whether a TIME-WAIT
 * socket may be reused for a new outgoing connection.
 */
87 int sysctl_tcp_tw_reuse __read_mostly;
/* Sysctl variable exported to other code; not read in this part of the file. */
88 int sysctl_tcp_low_latency __read_mostly;
89 EXPORT_SYMBOL(sysctl_tcp_low_latency);
90 
91 #ifdef CONFIG_TCP_MD5SIG
/* Forward declaration: tcp_v4_send_reset() and tcp_v4_send_ack() call this
 * helper before its definition further down.
 */
92 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
93 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
94 #endif
95 
/* Socket hash tables handed to the __inet_lookup_*() helpers in this file. */
96 struct inet_hashinfo tcp_hashinfo;
97 EXPORT_SYMBOL(tcp_hashinfo);
98 
99 static  __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
100 {
101 	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
102 					  ip_hdr(skb)->saddr,
103 					  tcp_hdr(skb)->dest,
104 					  tcp_hdr(skb)->source);
105 }
106 
107 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
108 {
109 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
110 	struct tcp_sock *tp = tcp_sk(sk);
111 
112 	/* With PAWS, it is safe from the viewpoint
113 	   of data integrity. Even without PAWS it is safe provided sequence
114 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
115 
116 	   Actually, the idea is close to VJ's one, only timestamp cache is
117 	   held not per host, but per port pair and TW bucket is used as state
118 	   holder.
119 
120 	   If TW bucket has been already destroyed we fall back to VJ's scheme
121 	   and use initial timestamp retrieved from peer table.
122 	 */
123 	if (tcptw->tw_ts_recent_stamp &&
124 	    (!twp || (sysctl_tcp_tw_reuse &&
125 			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
126 		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
127 		if (tp->write_seq == 0)
128 			tp->write_seq = 1;
129 		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
130 		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
131 		sock_hold(sktw);
132 		return 1;
133 	}
134 
135 	return 0;
136 }
137 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
138 
139 /* This will initiate an outgoing connection. */
140 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
141 {
142 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
143 	struct inet_sock *inet = inet_sk(sk);
144 	struct tcp_sock *tp = tcp_sk(sk);
145 	__be16 orig_sport, orig_dport;
146 	__be32 daddr, nexthop;
147 	struct flowi4 *fl4;
148 	struct rtable *rt;
149 	int err;
150 	struct ip_options_rcu *inet_opt;
151 
152 	if (addr_len < sizeof(struct sockaddr_in))
153 		return -EINVAL;
154 
155 	if (usin->sin_family != AF_INET)
156 		return -EAFNOSUPPORT;
157 
158 	nexthop = daddr = usin->sin_addr.s_addr;
159 	inet_opt = rcu_dereference_protected(inet->inet_opt,
160 					     lockdep_sock_is_held(sk));
161 	if (inet_opt && inet_opt->opt.srr) {
162 		if (!daddr)
163 			return -EINVAL;
164 		nexthop = inet_opt->opt.faddr;
165 	}
166 
167 	orig_sport = inet->inet_sport;
168 	orig_dport = usin->sin_port;
169 	fl4 = &inet->cork.fl.u.ip4;
170 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
171 			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
172 			      IPPROTO_TCP,
173 			      orig_sport, orig_dport, sk);
174 	if (IS_ERR(rt)) {
175 		err = PTR_ERR(rt);
176 		if (err == -ENETUNREACH)
177 			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
178 		return err;
179 	}
180 
181 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
182 		ip_rt_put(rt);
183 		return -ENETUNREACH;
184 	}
185 
186 	if (!inet_opt || !inet_opt->opt.srr)
187 		daddr = fl4->daddr;
188 
189 	if (!inet->inet_saddr)
190 		inet->inet_saddr = fl4->saddr;
191 	sk_rcv_saddr_set(sk, inet->inet_saddr);
192 
193 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
194 		/* Reset inherited state */
195 		tp->rx_opt.ts_recent	   = 0;
196 		tp->rx_opt.ts_recent_stamp = 0;
197 		if (likely(!tp->repair))
198 			tp->write_seq	   = 0;
199 	}
200 
201 	if (tcp_death_row.sysctl_tw_recycle &&
202 	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
203 		tcp_fetch_timewait_stamp(sk, &rt->dst);
204 
205 	inet->inet_dport = usin->sin_port;
206 	sk_daddr_set(sk, daddr);
207 
208 	inet_csk(sk)->icsk_ext_hdr_len = 0;
209 	if (inet_opt)
210 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
211 
212 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
213 
214 	/* Socket identity is still unknown (sport may be zero).
215 	 * However we set state to SYN-SENT and not releasing socket
216 	 * lock select source port, enter ourselves into the hash tables and
217 	 * complete initialization after this.
218 	 */
219 	tcp_set_state(sk, TCP_SYN_SENT);
220 	err = inet_hash_connect(&tcp_death_row, sk);
221 	if (err)
222 		goto failure;
223 
224 	sk_set_txhash(sk);
225 
226 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
227 			       inet->inet_sport, inet->inet_dport, sk);
228 	if (IS_ERR(rt)) {
229 		err = PTR_ERR(rt);
230 		rt = NULL;
231 		goto failure;
232 	}
233 	/* OK, now commit destination to socket.  */
234 	sk->sk_gso_type = SKB_GSO_TCPV4;
235 	sk_setup_caps(sk, &rt->dst);
236 
237 	if (!tp->write_seq && likely(!tp->repair))
238 		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
239 							   inet->inet_daddr,
240 							   inet->inet_sport,
241 							   usin->sin_port);
242 
243 	inet->inet_id = tp->write_seq ^ jiffies;
244 
245 	err = tcp_connect(sk);
246 
247 	rt = NULL;
248 	if (err)
249 		goto failure;
250 
251 	return 0;
252 
253 failure:
254 	/*
255 	 * This unhashes the socket and releases the local port,
256 	 * if necessary.
257 	 */
258 	tcp_set_state(sk, TCP_CLOSE);
259 	ip_rt_put(rt);
260 	sk->sk_route_caps = 0;
261 	inet->inet_dport = 0;
262 	return err;
263 }
264 EXPORT_SYMBOL(tcp_v4_connect);
265 
266 /*
267  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
268  * It can be called through tcp_release_cb() if socket was owned by user
269  * at the time tcp_v4_err() was called to handle ICMP message.
270  */
271 void tcp_v4_mtu_reduced(struct sock *sk)
272 {
273 	struct dst_entry *dst;
274 	struct inet_sock *inet = inet_sk(sk);
275 	u32 mtu = tcp_sk(sk)->mtu_info;
276 
277 	dst = inet_csk_update_pmtu(sk, mtu);
278 	if (!dst)
279 		return;
280 
281 	/* Something is about to be wrong... Remember soft error
282 	 * for the case, if this connection will not able to recover.
283 	 */
284 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
285 		sk->sk_err_soft = EMSGSIZE;
286 
287 	mtu = dst_mtu(dst);
288 
289 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
290 	    ip_sk_accept_pmtu(sk) &&
291 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
292 		tcp_sync_mss(sk, mtu);
293 
294 		/* Resend the TCP packet because it's
295 		 * clear that the old packet has been
296 		 * dropped. This is the new "fast" path mtu
297 		 * discovery.
298 		 */
299 		tcp_simple_retransmit(sk);
300 	} /* else let the usual retransmit timer handle it */
301 }
302 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
303 
304 static void do_redirect(struct sk_buff *skb, struct sock *sk)
305 {
306 	struct dst_entry *dst = __sk_dst_check(sk, 0);
307 
308 	if (dst)
309 		dst->ops->redirect(dst, sk, skb);
310 }
311 
312 
313 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
314 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
315 {
316 	struct request_sock *req = inet_reqsk(sk);
317 	struct net *net = sock_net(sk);
318 
319 	/* ICMPs are not backlogged, hence we cannot get
320 	 * an established socket here.
321 	 */
322 	if (seq != tcp_rsk(req)->snt_isn) {
323 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
324 	} else if (abort) {
325 		/*
326 		 * Still in SYN_RECV, just remove it silently.
327 		 * There is no good way to pass the error to the newly
328 		 * created socket, and POSIX does not want network
329 		 * errors returned from accept().
330 		 */
331 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
332 		tcp_listendrop(req->rsk_listener);
333 	}
334 	reqsk_put(req);
335 }
336 EXPORT_SYMBOL(tcp_req_err);
337 
338 /*
339  * This routine is called by the ICMP module when it gets some
340  * sort of error condition.  If err < 0 then the socket should
341  * be closed and the error returned to the user.  If err > 0
342  * it's just the icmp type << 8 | icmp code.  After adjustment
343  * header points to the first 8 bytes of the tcp header.  We need
344  * to find the appropriate port.
345  *
346  * The locking strategy used here is very "optimistic". When
347  * someone else accesses the socket the ICMP is just dropped
348  * and for some paths there is no check at all.
349  * A more general error queue to queue errors for later handling
350  * is probably better.
351  *
352  */
353 
354 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
355 {
356 	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
357 	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
358 	struct inet_connection_sock *icsk;
359 	struct tcp_sock *tp;
360 	struct inet_sock *inet;
361 	const int type = icmp_hdr(icmp_skb)->type;
362 	const int code = icmp_hdr(icmp_skb)->code;
363 	struct sock *sk;
364 	struct sk_buff *skb;
365 	struct request_sock *fastopen;
366 	__u32 seq, snd_una;
367 	__u32 remaining;
368 	int err;
369 	struct net *net = dev_net(icmp_skb->dev);
370 
371 	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
372 				       th->dest, iph->saddr, ntohs(th->source),
373 				       inet_iif(icmp_skb));
374 	if (!sk) {
375 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
376 		return;
377 	}
378 	if (sk->sk_state == TCP_TIME_WAIT) {
379 		inet_twsk_put(inet_twsk(sk));
380 		return;
381 	}
382 	seq = ntohl(th->seq);
383 	if (sk->sk_state == TCP_NEW_SYN_RECV)
384 		return tcp_req_err(sk, seq,
385 				  type == ICMP_PARAMETERPROB ||
386 				  type == ICMP_TIME_EXCEEDED ||
387 				  (type == ICMP_DEST_UNREACH &&
388 				   (code == ICMP_NET_UNREACH ||
389 				    code == ICMP_HOST_UNREACH)));
390 
391 	bh_lock_sock(sk);
392 	/* If too many ICMPs get dropped on busy
393 	 * servers this needs to be solved differently.
394 	 * We do take care of PMTU discovery (RFC1191) special case :
395 	 * we can receive locally generated ICMP messages while socket is held.
396 	 */
397 	if (sock_owned_by_user(sk)) {
398 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
399 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
400 	}
401 	if (sk->sk_state == TCP_CLOSE)
402 		goto out;
403 
404 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
405 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
406 		goto out;
407 	}
408 
409 	icsk = inet_csk(sk);
410 	tp = tcp_sk(sk);
411 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
412 	fastopen = tp->fastopen_rsk;
413 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
414 	if (sk->sk_state != TCP_LISTEN &&
415 	    !between(seq, snd_una, tp->snd_nxt)) {
416 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
417 		goto out;
418 	}
419 
420 	switch (type) {
421 	case ICMP_REDIRECT:
422 		do_redirect(icmp_skb, sk);
423 		goto out;
424 	case ICMP_SOURCE_QUENCH:
425 		/* Just silently ignore these. */
426 		goto out;
427 	case ICMP_PARAMETERPROB:
428 		err = EPROTO;
429 		break;
430 	case ICMP_DEST_UNREACH:
431 		if (code > NR_ICMP_UNREACH)
432 			goto out;
433 
434 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
435 			/* We are not interested in TCP_LISTEN and open_requests
436 			 * (SYN-ACKs send out by Linux are always <576bytes so
437 			 * they should go through unfragmented).
438 			 */
439 			if (sk->sk_state == TCP_LISTEN)
440 				goto out;
441 
442 			tp->mtu_info = info;
443 			if (!sock_owned_by_user(sk)) {
444 				tcp_v4_mtu_reduced(sk);
445 			} else {
446 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
447 					sock_hold(sk);
448 			}
449 			goto out;
450 		}
451 
452 		err = icmp_err_convert[code].errno;
453 		/* check if icmp_skb allows revert of backoff
454 		 * (see draft-zimmermann-tcp-lcd) */
455 		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
456 			break;
457 		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
458 		    !icsk->icsk_backoff || fastopen)
459 			break;
460 
461 		if (sock_owned_by_user(sk))
462 			break;
463 
464 		icsk->icsk_backoff--;
465 		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
466 					       TCP_TIMEOUT_INIT;
467 		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
468 
469 		skb = tcp_write_queue_head(sk);
470 		BUG_ON(!skb);
471 
472 		remaining = icsk->icsk_rto -
473 			    min(icsk->icsk_rto,
474 				tcp_time_stamp - tcp_skb_timestamp(skb));
475 
476 		if (remaining) {
477 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
478 						  remaining, TCP_RTO_MAX);
479 		} else {
480 			/* RTO revert clocked out retransmission.
481 			 * Will retransmit now */
482 			tcp_retransmit_timer(sk);
483 		}
484 
485 		break;
486 	case ICMP_TIME_EXCEEDED:
487 		err = EHOSTUNREACH;
488 		break;
489 	default:
490 		goto out;
491 	}
492 
493 	switch (sk->sk_state) {
494 	case TCP_SYN_SENT:
495 	case TCP_SYN_RECV:
496 		/* Only in fast or simultaneous open. If a fast open socket is
497 		 * is already accepted it is treated as a connected one below.
498 		 */
499 		if (fastopen && !fastopen->sk)
500 			break;
501 
502 		if (!sock_owned_by_user(sk)) {
503 			sk->sk_err = err;
504 
505 			sk->sk_error_report(sk);
506 
507 			tcp_done(sk);
508 		} else {
509 			sk->sk_err_soft = err;
510 		}
511 		goto out;
512 	}
513 
514 	/* If we've already connected we will keep trying
515 	 * until we time out, or the user gives up.
516 	 *
517 	 * rfc1122 4.2.3.9 allows to consider as hard errors
518 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
519 	 * but it is obsoleted by pmtu discovery).
520 	 *
521 	 * Note, that in modern internet, where routing is unreliable
522 	 * and in each dark corner broken firewalls sit, sending random
523 	 * errors ordered by their masters even this two messages finally lose
524 	 * their original sense (even Linux sends invalid PORT_UNREACHs)
525 	 *
526 	 * Now we are in compliance with RFCs.
527 	 *							--ANK (980905)
528 	 */
529 
530 	inet = inet_sk(sk);
531 	if (!sock_owned_by_user(sk) && inet->recverr) {
532 		sk->sk_err = err;
533 		sk->sk_error_report(sk);
534 	} else	{ /* Only an error on timeout */
535 		sk->sk_err_soft = err;
536 	}
537 
538 out:
539 	bh_unlock_sock(sk);
540 	sock_put(sk);
541 }
542 
543 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
544 {
545 	struct tcphdr *th = tcp_hdr(skb);
546 
547 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
548 		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
549 		skb->csum_start = skb_transport_header(skb) - skb->head;
550 		skb->csum_offset = offsetof(struct tcphdr, check);
551 	} else {
552 		th->check = tcp_v4_check(skb->len, saddr, daddr,
553 					 csum_partial(th,
554 						      th->doff << 2,
555 						      skb->csum));
556 	}
557 }
558 
559 /* This routine computes an IPv4 TCP checksum. */
560 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
561 {
562 	const struct inet_sock *inet = inet_sk(sk);
563 
564 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
565 }
566 EXPORT_SYMBOL(tcp_v4_send_check);
567 
568 /*
569  *	This routine will send an RST to the other tcp.
570  *
571  *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
572  *		      for reset.
573  *	Answer: if a packet caused RST, it is not for a socket
574  *		existing in our system, if it is matched to a socket,
575  *		it is just duplicate segment or bug in other side's TCP.
576  *		So that we build reply only basing on parameters
577  *		arrived with segment.
578  *	Exception: precedence violation. We do not implement it in any case.
579  */
580 
581 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
582 {
583 	const struct tcphdr *th = tcp_hdr(skb);
584 	struct {
585 		struct tcphdr th;
586 #ifdef CONFIG_TCP_MD5SIG
587 		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
588 #endif
589 	} rep;
590 	struct ip_reply_arg arg;
591 #ifdef CONFIG_TCP_MD5SIG
592 	struct tcp_md5sig_key *key = NULL;
593 	const __u8 *hash_location = NULL;
594 	unsigned char newhash[16];
595 	int genhash;
596 	struct sock *sk1 = NULL;
597 #endif
598 	struct net *net;
599 
600 	/* Never send a reset in response to a reset. */
601 	if (th->rst)
602 		return;
603 
604 	/* If sk not NULL, it means we did a successful lookup and incoming
605 	 * route had to be correct. prequeue might have dropped our dst.
606 	 */
607 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
608 		return;
609 
610 	/* Swap the send and the receive. */
611 	memset(&rep, 0, sizeof(rep));
612 	rep.th.dest   = th->source;
613 	rep.th.source = th->dest;
614 	rep.th.doff   = sizeof(struct tcphdr) / 4;
615 	rep.th.rst    = 1;
616 
617 	if (th->ack) {
618 		rep.th.seq = th->ack_seq;
619 	} else {
620 		rep.th.ack = 1;
621 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
622 				       skb->len - (th->doff << 2));
623 	}
624 
625 	memset(&arg, 0, sizeof(arg));
626 	arg.iov[0].iov_base = (unsigned char *)&rep;
627 	arg.iov[0].iov_len  = sizeof(rep.th);
628 
629 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
630 #ifdef CONFIG_TCP_MD5SIG
631 	rcu_read_lock();
632 	hash_location = tcp_parse_md5sig_option(th);
633 	if (sk && sk_fullsock(sk)) {
634 		key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
635 					&ip_hdr(skb)->saddr, AF_INET);
636 	} else if (hash_location) {
637 		/*
638 		 * active side is lost. Try to find listening socket through
639 		 * source port, and then find md5 key through listening socket.
640 		 * we are not loose security here:
641 		 * Incoming packet is checked with md5 hash with finding key,
642 		 * no RST generated if md5 hash doesn't match.
643 		 */
644 		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
645 					     ip_hdr(skb)->saddr,
646 					     th->source, ip_hdr(skb)->daddr,
647 					     ntohs(th->source), inet_iif(skb));
648 		/* don't send rst if it can't find key */
649 		if (!sk1)
650 			goto out;
651 
652 		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
653 					&ip_hdr(skb)->saddr, AF_INET);
654 		if (!key)
655 			goto out;
656 
657 
658 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
659 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
660 			goto out;
661 
662 	}
663 
664 	if (key) {
665 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
666 				   (TCPOPT_NOP << 16) |
667 				   (TCPOPT_MD5SIG << 8) |
668 				   TCPOLEN_MD5SIG);
669 		/* Update length and the length the header thinks exists */
670 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
671 		rep.th.doff = arg.iov[0].iov_len / 4;
672 
673 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
674 				     key, ip_hdr(skb)->saddr,
675 				     ip_hdr(skb)->daddr, &rep.th);
676 	}
677 #endif
678 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
679 				      ip_hdr(skb)->saddr, /* XXX */
680 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
681 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
682 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
683 
684 	/* When socket is gone, all binding information is lost.
685 	 * routing might fail in this case. No choice here, if we choose to force
686 	 * input interface, we will misroute in case of asymmetric route.
687 	 */
688 	if (sk)
689 		arg.bound_dev_if = sk->sk_bound_dev_if;
690 
691 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
692 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
693 
694 	arg.tos = ip_hdr(skb)->tos;
695 	local_bh_disable();
696 	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
697 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
698 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
699 			      &arg, arg.iov[0].iov_len);
700 
701 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
702 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
703 	local_bh_enable();
704 
705 #ifdef CONFIG_TCP_MD5SIG
706 out:
707 	rcu_read_unlock();
708 #endif
709 }
710 
711 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
712    outside socket context is ugly, certainly. What can I do?
713  */
714 
715 static void tcp_v4_send_ack(struct net *net,
716 			    struct sk_buff *skb, u32 seq, u32 ack,
717 			    u32 win, u32 tsval, u32 tsecr, int oif,
718 			    struct tcp_md5sig_key *key,
719 			    int reply_flags, u8 tos)
720 {
721 	const struct tcphdr *th = tcp_hdr(skb);
722 	struct {
723 		struct tcphdr th;
724 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
725 #ifdef CONFIG_TCP_MD5SIG
726 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
727 #endif
728 			];
729 	} rep;
730 	struct ip_reply_arg arg;
731 
732 	memset(&rep.th, 0, sizeof(struct tcphdr));
733 	memset(&arg, 0, sizeof(arg));
734 
735 	arg.iov[0].iov_base = (unsigned char *)&rep;
736 	arg.iov[0].iov_len  = sizeof(rep.th);
737 	if (tsecr) {
738 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
739 				   (TCPOPT_TIMESTAMP << 8) |
740 				   TCPOLEN_TIMESTAMP);
741 		rep.opt[1] = htonl(tsval);
742 		rep.opt[2] = htonl(tsecr);
743 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
744 	}
745 
746 	/* Swap the send and the receive. */
747 	rep.th.dest    = th->source;
748 	rep.th.source  = th->dest;
749 	rep.th.doff    = arg.iov[0].iov_len / 4;
750 	rep.th.seq     = htonl(seq);
751 	rep.th.ack_seq = htonl(ack);
752 	rep.th.ack     = 1;
753 	rep.th.window  = htons(win);
754 
755 #ifdef CONFIG_TCP_MD5SIG
756 	if (key) {
757 		int offset = (tsecr) ? 3 : 0;
758 
759 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
760 					  (TCPOPT_NOP << 16) |
761 					  (TCPOPT_MD5SIG << 8) |
762 					  TCPOLEN_MD5SIG);
763 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
764 		rep.th.doff = arg.iov[0].iov_len/4;
765 
766 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
767 				    key, ip_hdr(skb)->saddr,
768 				    ip_hdr(skb)->daddr, &rep.th);
769 	}
770 #endif
771 	arg.flags = reply_flags;
772 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
773 				      ip_hdr(skb)->saddr, /* XXX */
774 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
775 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
776 	if (oif)
777 		arg.bound_dev_if = oif;
778 	arg.tos = tos;
779 	local_bh_disable();
780 	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
781 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
782 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
783 			      &arg, arg.iov[0].iov_len);
784 
785 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
786 	local_bh_enable();
787 }
788 
789 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
790 {
791 	struct inet_timewait_sock *tw = inet_twsk(sk);
792 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
793 
794 	tcp_v4_send_ack(sock_net(sk), skb,
795 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
796 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
797 			tcp_time_stamp + tcptw->tw_ts_offset,
798 			tcptw->tw_ts_recent,
799 			tw->tw_bound_dev_if,
800 			tcp_twsk_md5_key(tcptw),
801 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
802 			tw->tw_tos
803 			);
804 
805 	inet_twsk_put(tw);
806 }
807 
808 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
809 				  struct request_sock *req)
810 {
811 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
812 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
813 	 */
814 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
815 					     tcp_sk(sk)->snd_nxt;
816 
817 	tcp_v4_send_ack(sock_net(sk), skb, seq,
818 			tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd,
819 			tcp_time_stamp,
820 			req->ts_recent,
821 			0,
822 			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
823 					  AF_INET),
824 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
825 			ip_hdr(skb)->tos);
826 }
827 
828 /*
829  *	Send a SYN-ACK after having received a SYN.
830  *	This still operates on a request_sock only, not on a big
831  *	socket.
832  */
833 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
834 			      struct flowi *fl,
835 			      struct request_sock *req,
836 			      struct tcp_fastopen_cookie *foc,
837 			      enum tcp_synack_type synack_type)
838 {
839 	const struct inet_request_sock *ireq = inet_rsk(req);
840 	struct flowi4 fl4;
841 	int err = -1;
842 	struct sk_buff *skb;
843 
844 	/* First, grab a route. */
845 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
846 		return -1;
847 
848 	skb = tcp_make_synack(sk, dst, req, foc, synack_type);
849 
850 	if (skb) {
851 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
852 
853 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
854 					    ireq->ir_rmt_addr,
855 					    ireq->opt);
856 		err = net_xmit_eval(err);
857 	}
858 
859 	return err;
860 }
861 
862 /*
863  *	IPv4 request_sock destructor.
864  */
865 static void tcp_v4_reqsk_destructor(struct request_sock *req)
866 {
867 	kfree(inet_rsk(req)->opt);
868 }
869 
870 #ifdef CONFIG_TCP_MD5SIG
871 /*
872  * RFC2385 MD5 checksumming requires a mapping of
873  * IP address->MD5 Key.
874  * We need to maintain these in the sk structure.
875  */
876 
877 /* Find the Key structure for an address.  */
878 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
879 					 const union tcp_md5_addr *addr,
880 					 int family)
881 {
882 	const struct tcp_sock *tp = tcp_sk(sk);
883 	struct tcp_md5sig_key *key;
884 	unsigned int size = sizeof(struct in_addr);
885 	const struct tcp_md5sig_info *md5sig;
886 
887 	/* caller either holds rcu_read_lock() or socket lock */
888 	md5sig = rcu_dereference_check(tp->md5sig_info,
889 				       lockdep_sock_is_held(sk));
890 	if (!md5sig)
891 		return NULL;
892 #if IS_ENABLED(CONFIG_IPV6)
893 	if (family == AF_INET6)
894 		size = sizeof(struct in6_addr);
895 #endif
896 	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
897 		if (key->family != family)
898 			continue;
899 		if (!memcmp(&key->addr, addr, size))
900 			return key;
901 	}
902 	return NULL;
903 }
904 EXPORT_SYMBOL(tcp_md5_do_lookup);
905 
906 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
907 					 const struct sock *addr_sk)
908 {
909 	const union tcp_md5_addr *addr;
910 
911 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
912 	return tcp_md5_do_lookup(sk, addr, AF_INET);
913 }
914 EXPORT_SYMBOL(tcp_v4_md5_lookup);
915 
916 /* This can be called on a newly created socket, from other files */
917 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
918 		   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
919 {
920 	/* Add Key to the list */
921 	struct tcp_md5sig_key *key;
922 	struct tcp_sock *tp = tcp_sk(sk);
923 	struct tcp_md5sig_info *md5sig;
924 
925 	key = tcp_md5_do_lookup(sk, addr, family);
926 	if (key) {
927 		/* Pre-existing entry - just update that one. */
928 		memcpy(key->key, newkey, newkeylen);
929 		key->keylen = newkeylen;
930 		return 0;
931 	}
932 
933 	md5sig = rcu_dereference_protected(tp->md5sig_info,
934 					   lockdep_sock_is_held(sk));
935 	if (!md5sig) {
936 		md5sig = kmalloc(sizeof(*md5sig), gfp);
937 		if (!md5sig)
938 			return -ENOMEM;
939 
940 		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
941 		INIT_HLIST_HEAD(&md5sig->head);
942 		rcu_assign_pointer(tp->md5sig_info, md5sig);
943 	}
944 
945 	key = sock_kmalloc(sk, sizeof(*key), gfp);
946 	if (!key)
947 		return -ENOMEM;
948 	if (!tcp_alloc_md5sig_pool()) {
949 		sock_kfree_s(sk, key, sizeof(*key));
950 		return -ENOMEM;
951 	}
952 
953 	memcpy(key->key, newkey, newkeylen);
954 	key->keylen = newkeylen;
955 	key->family = family;
956 	memcpy(&key->addr, addr,
957 	       (family == AF_INET6) ? sizeof(struct in6_addr) :
958 				      sizeof(struct in_addr));
959 	hlist_add_head_rcu(&key->node, &md5sig->head);
960 	return 0;
961 }
962 EXPORT_SYMBOL(tcp_md5_do_add);
963 
964 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
965 {
966 	struct tcp_md5sig_key *key;
967 
968 	key = tcp_md5_do_lookup(sk, addr, family);
969 	if (!key)
970 		return -ENOENT;
971 	hlist_del_rcu(&key->node);
972 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
973 	kfree_rcu(key, rcu);
974 	return 0;
975 }
976 EXPORT_SYMBOL(tcp_md5_do_del);
977 
978 static void tcp_clear_md5_list(struct sock *sk)
979 {
980 	struct tcp_sock *tp = tcp_sk(sk);
981 	struct tcp_md5sig_key *key;
982 	struct hlist_node *n;
983 	struct tcp_md5sig_info *md5sig;
984 
985 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
986 
987 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
988 		hlist_del_rcu(&key->node);
989 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
990 		kfree_rcu(key, rcu);
991 	}
992 }
993 
994 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
995 				 int optlen)
996 {
997 	struct tcp_md5sig cmd;
998 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
999 
1000 	if (optlen < sizeof(cmd))
1001 		return -EINVAL;
1002 
1003 	if (copy_from_user(&cmd, optval, sizeof(cmd)))
1004 		return -EFAULT;
1005 
1006 	if (sin->sin_family != AF_INET)
1007 		return -EINVAL;
1008 
1009 	if (!cmd.tcpm_keylen)
1010 		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1011 				      AF_INET);
1012 
1013 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1014 		return -EINVAL;
1015 
1016 	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1017 			      AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1018 			      GFP_KERNEL);
1019 }
1020 
1021 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1022 					__be32 daddr, __be32 saddr, int nbytes)
1023 {
1024 	struct tcp4_pseudohdr *bp;
1025 	struct scatterlist sg;
1026 
1027 	bp = &hp->md5_blk.ip4;
1028 
1029 	/*
1030 	 * 1. the TCP pseudo-header (in the order: source IP address,
1031 	 * destination IP address, zero-padded protocol number, and
1032 	 * segment length)
1033 	 */
1034 	bp->saddr = saddr;
1035 	bp->daddr = daddr;
1036 	bp->pad = 0;
1037 	bp->protocol = IPPROTO_TCP;
1038 	bp->len = cpu_to_be16(nbytes);
1039 
1040 	sg_init_one(&sg, bp, sizeof(*bp));
1041 	ahash_request_set_crypt(hp->md5_req, &sg, NULL, sizeof(*bp));
1042 	return crypto_ahash_update(hp->md5_req);
1043 }
1044 
1045 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1046 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1047 {
1048 	struct tcp_md5sig_pool *hp;
1049 	struct ahash_request *req;
1050 
1051 	hp = tcp_get_md5sig_pool();
1052 	if (!hp)
1053 		goto clear_hash_noput;
1054 	req = hp->md5_req;
1055 
1056 	if (crypto_ahash_init(req))
1057 		goto clear_hash;
1058 	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1059 		goto clear_hash;
1060 	if (tcp_md5_hash_header(hp, th))
1061 		goto clear_hash;
1062 	if (tcp_md5_hash_key(hp, key))
1063 		goto clear_hash;
1064 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1065 	if (crypto_ahash_final(req))
1066 		goto clear_hash;
1067 
1068 	tcp_put_md5sig_pool();
1069 	return 0;
1070 
1071 clear_hash:
1072 	tcp_put_md5sig_pool();
1073 clear_hash_noput:
1074 	memset(md5_hash, 0, 16);
1075 	return 1;
1076 }
1077 
1078 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1079 			const struct sock *sk,
1080 			const struct sk_buff *skb)
1081 {
1082 	struct tcp_md5sig_pool *hp;
1083 	struct ahash_request *req;
1084 	const struct tcphdr *th = tcp_hdr(skb);
1085 	__be32 saddr, daddr;
1086 
1087 	if (sk) { /* valid for establish/request sockets */
1088 		saddr = sk->sk_rcv_saddr;
1089 		daddr = sk->sk_daddr;
1090 	} else {
1091 		const struct iphdr *iph = ip_hdr(skb);
1092 		saddr = iph->saddr;
1093 		daddr = iph->daddr;
1094 	}
1095 
1096 	hp = tcp_get_md5sig_pool();
1097 	if (!hp)
1098 		goto clear_hash_noput;
1099 	req = hp->md5_req;
1100 
1101 	if (crypto_ahash_init(req))
1102 		goto clear_hash;
1103 
1104 	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1105 		goto clear_hash;
1106 	if (tcp_md5_hash_header(hp, th))
1107 		goto clear_hash;
1108 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1109 		goto clear_hash;
1110 	if (tcp_md5_hash_key(hp, key))
1111 		goto clear_hash;
1112 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1113 	if (crypto_ahash_final(req))
1114 		goto clear_hash;
1115 
1116 	tcp_put_md5sig_pool();
1117 	return 0;
1118 
1119 clear_hash:
1120 	tcp_put_md5sig_pool();
1121 clear_hash_noput:
1122 	memset(md5_hash, 0, 16);
1123 	return 1;
1124 }
1125 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1126 
1127 #endif
1128 
1129 /* Called with rcu_read_lock() */
/*
 * Check the TCP-MD5 signature option of the incoming segment @skb against
 * the key configured on @sk for the segment's IPv4 source address.
 *
 * Returns true when the segment must be dropped, false when it may be
 * processed further.  Without CONFIG_TCP_MD5SIG it always returns false.
 */
1130 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1131 				    const struct sk_buff *skb)
1132 {
1133 #ifdef CONFIG_TCP_MD5SIG
1134 	/*
1135 	 * This gets called for each TCP segment that arrives
1136 	 * so we want to be efficient.
1137 	 * We have 3 drop cases:
1138 	 * o No MD5 hash and one expected.
1139 	 * o MD5 hash and we're not expecting one.
1140 	 * o MD5 hash and its wrong.
1141 	 */
1142 	const __u8 *hash_location = NULL;
1143 	struct tcp_md5sig_key *hash_expected;
1144 	const struct iphdr *iph = ip_hdr(skb);
1145 	const struct tcphdr *th = tcp_hdr(skb);
1146 	int genhash;
1147 	unsigned char newhash[16];
1148 
	/* Key lookup is keyed on the sender's address. */
1149 	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1150 					  AF_INET);
1151 	hash_location = tcp_parse_md5sig_option(th);
1152 
1153 	/* We've parsed the options - do we have a hash? */
1154 	if (!hash_expected && !hash_location)
1155 		return false;
1156 
1157 	if (hash_expected && !hash_location) {
1158 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1159 		return true;
1160 	}
1161 
1162 	if (!hash_expected && hash_location) {
1163 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1164 		return true;
1165 	}
1166 
1167 	/* Okay, so this is hash_expected and hash_location -
1168 	 * so we need to calculate the checksum.
1169 	 */
	/* sk is passed as NULL so the addresses are taken from the IP header. */
1170 	genhash = tcp_v4_md5_hash_skb(newhash,
1171 				      hash_expected,
1172 				      NULL, skb);
1173 
	/* Drop on either a computation failure or a digest mismatch. */
1174 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1175 		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1176 				     &iph->saddr, ntohs(th->source),
1177 				     &iph->daddr, ntohs(th->dest),
1178 				     genhash ? " tcp_v4_calc_md5_hash failed"
1179 				     : "");
1180 		return true;
1181 	}
1182 	return false;
1183 #endif
1184 	return false;
1185 }
1186 
1187 static void tcp_v4_init_req(struct request_sock *req,
1188 			    const struct sock *sk_listener,
1189 			    struct sk_buff *skb)
1190 {
1191 	struct inet_request_sock *ireq = inet_rsk(req);
1192 
1193 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1194 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1195 	ireq->no_srccheck = inet_sk(sk_listener)->transparent;
1196 	ireq->opt = tcp_v4_save_options(skb);
1197 }
1198 
1199 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1200 					  struct flowi *fl,
1201 					  const struct request_sock *req,
1202 					  bool *strict)
1203 {
1204 	struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);
1205 
1206 	if (strict) {
1207 		if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
1208 			*strict = true;
1209 		else
1210 			*strict = false;
1211 	}
1212 
1213 	return dst;
1214 }
1215 
/*
 * request_sock operations for IPv4 TCP: object size plus the callbacks
 * for SYN-ACK retransmission, ACK/reset transmission, destruction and
 * SYN-ACK timeout handling.
 */
1216 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1217 	.family		=	PF_INET,
1218 	.obj_size	=	sizeof(struct tcp_request_sock),
1219 	.rtx_syn_ack	=	tcp_rtx_synack,
1220 	.send_ack	=	tcp_v4_reqsk_send_ack,
1221 	.destructor	=	tcp_v4_reqsk_destructor,
1222 	.send_reset	=	tcp_v4_send_reset,
1223 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1224 };
1225 
/*
 * IPv4 hooks handed to tcp_conn_request() together with
 * tcp_request_sock_ops.  The MD5 and syncookie entries exist only when
 * the corresponding config options are enabled.
 */
1226 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1227 	.mss_clamp	=	TCP_MSS_DEFAULT,
1228 #ifdef CONFIG_TCP_MD5SIG
1229 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1230 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1231 #endif
1232 	.init_req	=	tcp_v4_init_req,
1233 #ifdef CONFIG_SYN_COOKIES
1234 	.cookie_init_seq =	cookie_v4_init_sequence,
1235 #endif
1236 	.route_req	=	tcp_v4_route_req,
1237 	.init_seq	=	tcp_v4_init_sequence,
1238 	.send_synack	=	tcp_v4_send_synack,
1239 };
1240 
1241 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1242 {
1243 	/* Never answer to SYNs send to broadcast or multicast */
1244 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1245 		goto drop;
1246 
1247 	return tcp_conn_request(&tcp_request_sock_ops,
1248 				&tcp_request_sock_ipv4_ops, sk, skb);
1249 
1250 drop:
1251 	tcp_listendrop(sk);
1252 	return 0;
1253 }
1254 EXPORT_SYMBOL(tcp_v4_conn_request);
1255 
1256 
1257 /*
1258  * The three way handshake has completed - we got a valid ACK of our SYN-ACK -
1259  * now create the new socket.
1260  */
/*
 * Build the child socket for connection request @req on listener @sk.
 *
 * @skb:        segment being processed
 * @dst:        route to use, or NULL to look one up here (a non-NULL
 *              value is the syncookie case)
 * @req_unhash: request passed to inet_ehash_nolisten() when the child is
 *              inserted into the established hash
 * @own_req:    set to the result of inet_ehash_nolisten()
 *
 * Returns the new socket, or NULL on failure (accept queue full, child
 * allocation failure, no route, port inheritance failure).  Every failure
 * path accounts a listener drop via tcp_listendrop().
 */
1261 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1262 				  struct request_sock *req,
1263 				  struct dst_entry *dst,
1264 				  struct request_sock *req_unhash,
1265 				  bool *own_req)
1266 {
1267 	struct inet_request_sock *ireq;
1268 	struct inet_sock *newinet;
1269 	struct tcp_sock *newtp;
1270 	struct sock *newsk;
1271 #ifdef CONFIG_TCP_MD5SIG
1272 	struct tcp_md5sig_key *key;
1273 #endif
1274 	struct ip_options_rcu *inet_opt;
1275 
1276 	if (sk_acceptq_is_full(sk))
1277 		goto exit_overflow;
1278 
1279 	newsk = tcp_create_openreq_child(sk, req, skb);
1280 	if (!newsk)
1281 		goto exit_nonewsk;
1282 
1283 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1284 	inet_sk_rx_dst_set(newsk, skb);
1285 
	/* Copy addressing and IP options from the request into the child. */
1286 	newtp		      = tcp_sk(newsk);
1287 	newinet		      = inet_sk(newsk);
1288 	ireq		      = inet_rsk(req);
1289 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1290 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1291 	newsk->sk_bound_dev_if = ireq->ir_iif;
1292 	newinet->inet_saddr	      = ireq->ir_loc_addr;
1293 	inet_opt	      = ireq->opt;
1294 	rcu_assign_pointer(newinet->inet_opt, inet_opt);
	/* The options pointer now belongs to the child socket. */
1295 	ireq->opt	      = NULL;
1296 	newinet->mc_index     = inet_iif(skb);
1297 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1298 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1299 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1300 	if (inet_opt)
1301 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1302 	newinet->inet_id = newtp->write_seq ^ jiffies;
1303 
1304 	if (!dst) {
1305 		dst = inet_csk_route_child_sock(sk, newsk, req);
1306 		if (!dst)
1307 			goto put_and_exit;
1308 	} else {
1309 		/* syncookie case : see end of cookie_v4_check() */
1310 	}
1311 	sk_setup_caps(newsk, dst);
1312 
1313 	tcp_ca_openreq_child(newsk, dst);
1314 
1315 	tcp_sync_mss(newsk, dst_mtu(dst));
1316 	newtp->advmss = dst_metric_advmss(dst);
	/* A smaller user-configured MSS on the listener caps the advertised MSS. */
1317 	if (tcp_sk(sk)->rx_opt.user_mss &&
1318 	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1319 		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1320 
1321 	tcp_initialize_rcv_mss(newsk);
1322 
1323 #ifdef CONFIG_TCP_MD5SIG
1324 	/* Copy over the MD5 key from the original socket */
1325 	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1326 				AF_INET);
1327 	if (key) {
1328 		/*
1329 		 * We're using one, so create a matching key
1330 		 * on the newsk structure. If we fail to get
1331 		 * memory, then we end up not copying the key
1332 		 * across. Shucks.
1333 		 */
1334 		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1335 			       AF_INET, key->key, key->keylen, GFP_ATOMIC);
1336 		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1337 	}
1338 #endif
1339 
1340 	if (__inet_inherit_port(sk, newsk) < 0)
1341 		goto put_and_exit;
1342 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1343 	if (*own_req)
1344 		tcp_move_syn(newtp, req);
1345 
1346 	return newsk;
1347 
	/* Failure before a child exists: count the overflow, drop the route. */
1348 exit_overflow:
1349 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1350 exit_nonewsk:
1351 	dst_release(dst);
1352 exit:
1353 	tcp_listendrop(sk);
1354 	return NULL;
	/* Failure after the child was created: tear the child down first. */
1355 put_and_exit:
1356 	inet_csk_prepare_forced_close(newsk);
1357 	tcp_done(newsk);
1358 	goto exit;
1359 }
1360 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1361 
/*
 * With CONFIG_SYN_COOKIES, segments without the SYN flag are passed to
 * cookie_v4_check() and its result is returned.  SYN segments, and all
 * segments when syncookies are compiled out, return @sk unchanged.
 */
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	if (!tcp_hdr(skb)->syn)
		return cookie_v4_check(sk, skb);
#endif
	return sk;
}
1372 
1373 /* The socket must have its spinlock held when we get
1374  * here, unless it is a TCP_LISTEN socket.
1375  *
1376  * We have a potential double-lock case here, so even when
1377  * doing backlog processing we use the BH locking scheme.
1378  * This is because we cannot sleep with the original spinlock
1379  * held.
1380  */
/*
 * Process one received segment @skb for socket @sk.
 *
 * ESTABLISHED sockets take the fast path through tcp_rcv_established().
 * Otherwise the checksum is completed; LISTEN sockets go through the
 * cookie check (possibly handing the segment to a child socket) and the
 * rest is fed to tcp_rcv_state_process().
 *
 * Always returns 0.  On error paths a reset may be sent and the skb is
 * freed here.
 */
1381 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1382 {
1383 	struct sock *rsk;
1384 
1385 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1386 		struct dst_entry *dst = sk->sk_rx_dst;
1387 
1388 		sock_rps_save_rxhash(sk, skb);
1389 		sk_mark_napi_id(sk, skb);
		/* Drop the cached input route if the interface changed or
		 * the route's check() callback rejects it.
		 */
1390 		if (dst) {
1391 			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1392 			    !dst->ops->check(dst, 0)) {
1393 				dst_release(dst);
1394 				sk->sk_rx_dst = NULL;
1395 			}
1396 		}
1397 		tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
1398 		return 0;
1399 	}
1400 
1401 	if (tcp_checksum_complete(skb))
1402 		goto csum_err;
1403 
1404 	if (sk->sk_state == TCP_LISTEN) {
1405 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1406 
1407 		if (!nsk)
1408 			goto discard;
		/* A different socket came back: process the segment on it. */
1409 		if (nsk != sk) {
1410 			sock_rps_save_rxhash(nsk, skb);
1411 			sk_mark_napi_id(nsk, skb);
1412 			if (tcp_child_process(sk, nsk, skb)) {
1413 				rsk = nsk;
1414 				goto reset;
1415 			}
1416 			return 0;
1417 		}
1418 	} else
1419 		sock_rps_save_rxhash(sk, skb);
1420 
1421 	if (tcp_rcv_state_process(sk, skb)) {
1422 		rsk = sk;
1423 		goto reset;
1424 	}
1425 	return 0;
1426 
	/* reset falls through to discard: send the RST, then free the skb. */
1427 reset:
1428 	tcp_v4_send_reset(rsk, skb);
1429 discard:
1430 	kfree_skb(skb);
1431 	/* Be careful here. If this function gets more complicated and
1432 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1433 	 * might be destroyed here. This current version compiles correctly,
1434 	 * but you have been warned.
1435 	 */
1436 	return 0;
1437 
1438 csum_err:
1439 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1440 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1441 	goto discard;
1442 }
1443 EXPORT_SYMBOL(tcp_v4_do_rcv);
1444 
/*
 * Early socket lookup for an incoming IPv4 TCP packet.
 *
 * If an established-hash entry matches, it is attached to @skb (with
 * sock_edemux as destructor).  For full sockets whose cached input route
 * passes dst_check() and whose recorded ifindex matches, that route is
 * also attached to the skb without taking a reference.
 *
 * Returns silently when the packet is not PACKET_HOST, the TCP header
 * cannot be pulled, or the data offset is smaller than a TCP header.
 */
1445 void tcp_v4_early_demux(struct sk_buff *skb)
1446 {
1447 	const struct iphdr *iph;
1448 	const struct tcphdr *th;
1449 	struct sock *sk;
1450 
1451 	if (skb->pkt_type != PACKET_HOST)
1452 		return;
1453 
1454 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1455 		return;
1456 
1457 	iph = ip_hdr(skb);
1458 	th = tcp_hdr(skb);
1459 
	/* doff counts 32-bit words; reject headers shorter than the minimum. */
1460 	if (th->doff < sizeof(struct tcphdr) / 4)
1461 		return;
1462 
1463 	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1464 				       iph->saddr, th->source,
1465 				       iph->daddr, ntohs(th->dest),
1466 				       skb->skb_iif);
1467 	if (sk) {
1468 		skb->sk = sk;
1469 		skb->destructor = sock_edemux;
1470 		if (sk_fullsock(sk)) {
1471 			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1472 
1473 			if (dst)
1474 				dst = dst_check(dst, 0);
1475 			if (dst &&
1476 			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1477 				skb_dst_set_noref(skb, dst);
1478 		}
1479 	}
1480 }
1481 
1482 /* Packet is added to VJ-style prequeue for processing in process
1483  * context, if a reader task is waiting. Apparently, this exciting
1484  * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1485  * failed somewhere. Latency? Burstiness? Well, at least now we will
1486  * see, why it failed. 8)8)				  --ANK
1487  *
1488  */
/*
 * Try to put @skb on the prequeue of @sk.
 *
 * Returns false (caller must process the segment itself) when
 * sysctl_tcp_low_latency is set, no reader task is registered in
 * tp->ucopy.task, or the segment carries no payload and the prequeue is
 * empty.  Returns true once the skb has been queued; the queue may have
 * been flushed through sk_backlog_rcv() before returning.
 */
1489 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1490 {
1491 	struct tcp_sock *tp = tcp_sk(sk);
1492 
1493 	if (sysctl_tcp_low_latency || !tp->ucopy.task)
1494 		return false;
1495 
1496 	if (skb->len <= tcp_hdrlen(skb) &&
1497 	    skb_queue_len(&tp->ucopy.prequeue) == 0)
1498 		return false;
1499 
1500 	/* Before escaping RCU protected region, we need to take care of skb
1501 	 * dst. Prequeue is only enabled for established sockets.
1502 	 * For such sockets, we might need the skb dst only to set sk->sk_rx_dst
1503 	 * Instead of doing full sk_rx_dst validity here, let's perform
1504 	 * an optimistic check.
1505 	 */
1506 	if (likely(sk->sk_rx_dst))
1507 		skb_dst_drop(skb);
1508 	else
1509 		skb_dst_force_safe(skb);
1510 
1511 	__skb_queue_tail(&tp->ucopy.prequeue, skb);
1512 	tp->ucopy.memory += skb->truesize;
	/* Flush when 32 or more skbs are queued, or when prequeue memory plus
	 * receive memory exceeds sk_rcvbuf.
	 */
1513 	if (skb_queue_len(&tp->ucopy.prequeue) >= 32 ||
1514 	    tp->ucopy.memory + atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) {
1515 		struct sk_buff *skb1;
1516 
1517 		BUG_ON(sock_owned_by_user(sk));
1518 		__NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUEDROPPED,
1519 				skb_queue_len(&tp->ucopy.prequeue));
1520 
1521 		while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1522 			sk_backlog_rcv(sk, skb1);
1523 
1524 		tp->ucopy.memory = 0;
1525 	} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
		/* First skb queued: wake the reader and, if no ACK is already
		 * scheduled, arm the delayed-ACK timer.
		 */
1526 		wake_up_interruptible_sync_poll(sk_sleep(sk),
1527 					   POLLIN | POLLRDNORM | POLLRDBAND);
1528 		if (!inet_csk_ack_scheduled(sk))
1529 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1530 						  (3 * tcp_rto_min(sk)) / 4,
1531 						  TCP_RTO_MAX);
1532 	}
1533 	return true;
1534 }
1535 EXPORT_SYMBOL(tcp_prequeue);
1536 
1537 /*
1538  *	From tcp_input.c
1539  */
1540 
/*
 * Receive entry point for an IPv4 TCP segment.
 *
 * Validates the TCP header and checksum, fills TCP_SKB_CB(), looks up the
 * owning socket and dispatches on its state:
 *   - TCP_TIME_WAIT:    handled under do_time_wait
 *   - TCP_NEW_SYN_RECV: the request is checked against its listener via
 *                       tcp_check_req()
 *   - TCP_LISTEN:       processed directly by tcp_v4_do_rcv()
 *   - otherwise:        processed under the socket BH lock, or put on the
 *                       prequeue / backlog
 *
 * Returns the value of tcp_v4_do_rcv() when it was called on the main
 * path, 0 otherwise.  Dropped segments are freed here.
 */
1541 int tcp_v4_rcv(struct sk_buff *skb)
1542 {
1543 	struct net *net = dev_net(skb->dev);
1544 	const struct iphdr *iph;
1545 	const struct tcphdr *th;
1546 	bool refcounted;
1547 	struct sock *sk;
1548 	int ret;
1549 
1550 	if (skb->pkt_type != PACKET_HOST)
1551 		goto discard_it;
1552 
1553 	/* Count it even if it's bad */
1554 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
1555 
1556 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1557 		goto discard_it;
1558 
1559 	th = (const struct tcphdr *)skb->data;
1560 
1561 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1562 		goto bad_packet;
1563 	if (!pskb_may_pull(skb, th->doff * 4))
1564 		goto discard_it;
1565 
1566 	/* An explanation is required here, I think.
1567 	 * Packet length and doff are validated by header prediction,
1568 	 * provided case of th->doff==0 is eliminated.
1569 	 * So, we defer the checks. */
1570 
1571 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1572 		goto csum_error;
1573 
	/* Header pointers are (re)loaded after the pull calls above. */
1574 	th = (const struct tcphdr *)skb->data;
1575 	iph = ip_hdr(skb);
1576 	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
1577 	 * barrier() makes sure compiler wont play fool^Waliasing games.
1578 	 */
1579 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1580 		sizeof(struct inet_skb_parm));
1581 	barrier();
1582 
1583 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	/* SYN and FIN each add one to the sequence range, on top of the payload. */
1584 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1585 				    skb->len - th->doff * 4);
1586 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1587 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1588 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1589 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1590 	TCP_SKB_CB(skb)->sacked	 = 0;
1591 
1592 lookup:
1593 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1594 			       th->dest, &refcounted);
1595 	if (!sk)
1596 		goto no_tcp_socket;
1597 
1598 process:
1599 	if (sk->sk_state == TCP_TIME_WAIT)
1600 		goto do_time_wait;
1601 
1602 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
1603 		struct request_sock *req = inet_reqsk(sk);
1604 		struct sock *nsk;
1605 
		/* From here on sk refers to the request's listener. */
1606 		sk = req->rsk_listener;
1607 		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1608 			reqsk_put(req);
1609 			goto discard_it;
1610 		}
		/* Listener no longer listening: drop the request and redo the lookup. */
1611 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
1612 			inet_csk_reqsk_queue_drop_and_put(sk, req);
1613 			goto lookup;
1614 		}
1615 		/* We own a reference on the listener, increase it again
1616 		 * as we might lose it too soon.
1617 		 */
1618 		sock_hold(sk);
1619 		refcounted = true;
1620 		nsk = tcp_check_req(sk, skb, req, false);
1621 		if (!nsk) {
1622 			reqsk_put(req);
1623 			goto discard_and_relse;
1624 		}
1625 		if (nsk == sk) {
			/* Listener returned: continue below with sk as the listener. */
1626 			reqsk_put(req);
1627 		} else if (tcp_child_process(sk, nsk, skb)) {
1628 			tcp_v4_send_reset(nsk, skb);
1629 			goto discard_and_relse;
1630 		} else {
1631 			sock_put(sk);
1632 			return 0;
1633 		}
1634 	}
1635 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1636 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1637 		goto discard_and_relse;
1638 	}
1639 
1640 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1641 		goto discard_and_relse;
1642 
1643 	if (tcp_v4_inbound_md5_hash(sk, skb))
1644 		goto discard_and_relse;
1645 
1646 	nf_reset(skb);
1647 
1648 	if (sk_filter(sk, skb))
1649 		goto discard_and_relse;
1650 
1651 	skb->dev = NULL;
1652 
	/* Listeners are handled without taking the socket BH lock. */
1653 	if (sk->sk_state == TCP_LISTEN) {
1654 		ret = tcp_v4_do_rcv(sk, skb);
1655 		goto put_and_return;
1656 	}
1657 
1658 	sk_incoming_cpu_update(sk);
1659 
1660 	bh_lock_sock_nested(sk);
1661 	tcp_segs_in(tcp_sk(sk), skb);
1662 	ret = 0;
	/* Not owned by user context: prequeue or process now.
	 * Owned: add to the backlog, limited to rcvbuf + sndbuf.
	 */
1663 	if (!sock_owned_by_user(sk)) {
1664 		if (!tcp_prequeue(sk, skb))
1665 			ret = tcp_v4_do_rcv(sk, skb);
1666 	} else if (unlikely(sk_add_backlog(sk, skb,
1667 					   sk->sk_rcvbuf + sk->sk_sndbuf))) {
1668 		bh_unlock_sock(sk);
1669 		__NET_INC_STATS(net, LINUX_MIB_TCPBACKLOGDROP);
1670 		goto discard_and_relse;
1671 	}
1672 	bh_unlock_sock(sk);
1673 
1674 put_and_return:
1675 	if (refcounted)
1676 		sock_put(sk);
1677 
1678 	return ret;
1679 
1680 no_tcp_socket:
1681 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1682 		goto discard_it;
1683 
	/* The csum_error and bad_packet labels sit inside this if-body so
	 * earlier failures can jump straight to the matching counters.
	 */
1684 	if (tcp_checksum_complete(skb)) {
1685 csum_error:
1686 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1687 bad_packet:
1688 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
1689 	} else {
1690 		tcp_v4_send_reset(NULL, skb);
1691 	}
1692 
1693 discard_it:
1694 	/* Discard frame. */
1695 	kfree_skb(skb);
1696 	return 0;
1697 
1698 discard_and_relse:
1699 	sk_drops_add(sk, skb);
1700 	if (refcounted)
1701 		sock_put(sk);
1702 	goto discard_it;
1703 
1704 do_time_wait:
1705 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1706 		inet_twsk_put(inet_twsk(sk));
1707 		goto discard_it;
1708 	}
1709 
1710 	if (tcp_checksum_complete(skb)) {
1711 		inet_twsk_put(inet_twsk(sk));
1712 		goto csum_error;
1713 	}
1714 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1715 	case TCP_TW_SYN: {
		/* A listener may take over this SYN from the timewait socket. */
1716 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1717 							&tcp_hashinfo, skb,
1718 							__tcp_hdrlen(th),
1719 							iph->saddr, th->source,
1720 							iph->daddr, th->dest,
1721 							inet_iif(skb));
1722 		if (sk2) {
1723 			inet_twsk_deschedule_put(inet_twsk(sk));
1724 			sk = sk2;
1725 			refcounted = false;
1726 			goto process;
1727 		}
1728 		/* Fall through to ACK */
1729 	}
1730 	case TCP_TW_ACK:
1731 		tcp_v4_timewait_ack(sk, skb);
1732 		break;
1733 	case TCP_TW_RST:
1734 		tcp_v4_send_reset(sk, skb);
1735 		inet_twsk_deschedule_put(inet_twsk(sk));
1736 		goto discard_it;
1737 	case TCP_TW_SUCCESS:;
1738 	}
1739 	goto discard_it;
1740 }
1741 
/* Timewait-socket operations for TCP: object size plus the uniqueness
 * and destructor callbacks.
 */
1742 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1743 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
1744 	.twsk_unique	= tcp_twsk_unique,
1745 	.twsk_destructor= tcp_twsk_destructor,
1746 };
1747 
1748 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1749 {
1750 	struct dst_entry *dst = skb_dst(skb);
1751 
1752 	if (dst && dst_hold_safe(dst)) {
1753 		sk->sk_rx_dst = dst;
1754 		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1755 	}
1756 }
1757 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1758 
/*
 * Address-family operations for IPv4 connection sockets.  Installed as
 * icsk_af_ops by tcp_v4_init_sock().  The compat socket-option handlers
 * are present only with CONFIG_COMPAT.
 */
1759 const struct inet_connection_sock_af_ops ipv4_specific = {
1760 	.queue_xmit	   = ip_queue_xmit,
1761 	.send_check	   = tcp_v4_send_check,
1762 	.rebuild_header	   = inet_sk_rebuild_header,
1763 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
1764 	.conn_request	   = tcp_v4_conn_request,
1765 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
1766 	.net_header_len	   = sizeof(struct iphdr),
1767 	.setsockopt	   = ip_setsockopt,
1768 	.getsockopt	   = ip_getsockopt,
1769 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
1770 	.sockaddr_len	   = sizeof(struct sockaddr_in),
1771 	.bind_conflict	   = inet_csk_bind_conflict,
1772 #ifdef CONFIG_COMPAT
1773 	.compat_setsockopt = compat_ip_setsockopt,
1774 	.compat_getsockopt = compat_ip_getsockopt,
1775 #endif
1776 	.mtu_reduced	   = tcp_v4_mtu_reduced,
1777 };
1778 EXPORT_SYMBOL(ipv4_specific);
1779 
1780 #ifdef CONFIG_TCP_MD5SIG
/* IPv4 MD5 signature hooks (key lookup, hash computation, option parsing);
 * installed as af_specific by tcp_v4_init_sock().
 */
1781 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1782 	.md5_lookup		= tcp_v4_md5_lookup,
1783 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
1784 	.md5_parse		= tcp_v4_parse_md5_keys,
1785 };
1786 #endif
1787 
1788 /* NOTE: A lot of things set to zero explicitly by call to
1789  *       sk_alloc() so need not be done here.
1790  */
1791 static int tcp_v4_init_sock(struct sock *sk)
1792 {
1793 	struct inet_connection_sock *icsk = inet_csk(sk);
1794 
1795 	tcp_init_sock(sk);
1796 
1797 	icsk->icsk_af_ops = &ipv4_specific;
1798 
1799 #ifdef CONFIG_TCP_MD5SIG
1800 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1801 #endif
1802 
1803 	return 0;
1804 }
1805 
/*
 * Release the TCP-level resources of @sk: timers, congestion control
 * state, write / out-of-order / prequeue buffers, MD5 keys, the bind
 * bucket, fastopen request and saved SYN, then the allocation accounting.
 */
1806 void tcp_v4_destroy_sock(struct sock *sk)
1807 {
1808 	struct tcp_sock *tp = tcp_sk(sk);
1809 
1810 	tcp_clear_xmit_timers(sk);
1811 
1812 	tcp_cleanup_congestion_control(sk);
1813 
1814 	/* Cleanup up the write buffer. */
1815 	tcp_write_queue_purge(sk);
1816 
1817 	/* Cleans up our, hopefully empty, out_of_order_queue. */
1818 	__skb_queue_purge(&tp->out_of_order_queue);
1819 
1820 #ifdef CONFIG_TCP_MD5SIG
1821 	/* Clean up the MD5 key list, if any */
1822 	if (tp->md5sig_info) {
		/* Keys first, then the container holding the list head. */
1823 		tcp_clear_md5_list(sk);
1824 		kfree_rcu(tp->md5sig_info, rcu);
1825 		tp->md5sig_info = NULL;
1826 	}
1827 #endif
1828 
1829 	/* Clean prequeue, it must be empty really */
1830 	__skb_queue_purge(&tp->ucopy.prequeue);
1831 
1832 	/* Clean up a referenced TCP bind bucket. */
1833 	if (inet_csk(sk)->icsk_bind_hash)
1834 		inet_put_port(sk);
1835 
1836 	BUG_ON(tp->fastopen_rsk);
1837 
1838 	/* If socket is aborted during connect operation */
1839 	tcp_free_fastopen_req(tp);
1840 	tcp_saved_syn_free(tp);
1841 
	/* The socket counter is decremented with bottom halves disabled. */
1842 	local_bh_disable();
1843 	sk_sockets_allocated_dec(sk);
1844 	local_bh_enable();
1845 
1846 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
1847 		sock_release_memcg(sk);
1848 }
1849 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1850 
1851 #ifdef CONFIG_PROC_FS
1852 /* Proc filesystem TCP sock list dumping. */
1853 
1854 /*
1855  * Get next listener socket following cur.  If cur is NULL, get first socket
1856  * starting from bucket given in st->bucket; when st->bucket is zero the
1857  * very first socket in the hash table is returned.
1858  */
1859 static void *listening_get_next(struct seq_file *seq, void *cur)
1860 {
1861 	struct tcp_iter_state *st = seq->private;
1862 	struct net *net = seq_file_net(seq);
1863 	struct inet_listen_hashbucket *ilb;
1864 	struct inet_connection_sock *icsk;
1865 	struct sock *sk = cur;
1866 
1867 	if (!sk) {
1868 get_head:
1869 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
1870 		spin_lock_bh(&ilb->lock);
1871 		sk = sk_head(&ilb->head);
1872 		st->offset = 0;
1873 		goto get_sk;
1874 	}
1875 	ilb = &tcp_hashinfo.listening_hash[st->bucket];
1876 	++st->num;
1877 	++st->offset;
1878 
1879 	sk = sk_next(sk);
1880 get_sk:
1881 	sk_for_each_from(sk) {
1882 		if (!net_eq(sock_net(sk), net))
1883 			continue;
1884 		if (sk->sk_family == st->family)
1885 			return sk;
1886 		icsk = inet_csk(sk);
1887 	}
1888 	spin_unlock_bh(&ilb->lock);
1889 	st->offset = 0;
1890 	if (++st->bucket < INET_LHTABLE_SIZE)
1891 		goto get_head;
1892 	return NULL;
1893 }
1894 
1895 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1896 {
1897 	struct tcp_iter_state *st = seq->private;
1898 	void *rc;
1899 
1900 	st->bucket = 0;
1901 	st->offset = 0;
1902 	rc = listening_get_next(seq, NULL);
1903 
1904 	while (rc && *pos) {
1905 		rc = listening_get_next(seq, rc);
1906 		--*pos;
1907 	}
1908 	return rc;
1909 }
1910 
1911 static inline bool empty_bucket(const struct tcp_iter_state *st)
1912 {
1913 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1914 }
1915 
1916 /*
1917  * Get first established socket starting from bucket given in st->bucket.
1918  * If st->bucket is zero, the very first socket in the hash is returned.
1919  */
/*
 * Locking: when a socket is returned, the lock of bucket st->bucket
 * (taken with spin_lock_bh()) is still held.  A NULL return holds no lock.
 * Only sockets matching st->family and the seq_file's network namespace
 * are returned.  st->offset is reset to 0 on entry.
 */
1920 static void *established_get_first(struct seq_file *seq)
1921 {
1922 	struct tcp_iter_state *st = seq->private;
1923 	struct net *net = seq_file_net(seq);
1924 	void *rc = NULL;
1925 
1926 	st->offset = 0;
1927 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1928 		struct sock *sk;
1929 		struct hlist_nulls_node *node;
1930 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1931 
1932 		/* Lockless fast path for the common case of empty buckets */
1933 		if (empty_bucket(st))
1934 			continue;
1935 
1936 		spin_lock_bh(lock);
1937 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1938 			if (sk->sk_family != st->family ||
1939 			    !net_eq(sock_net(sk), net)) {
1940 				continue;
1941 			}
1942 			rc = sk;
			/* Leave with the bucket lock held. */
1943 			goto out;
1944 		}
1945 		spin_unlock_bh(lock);
1946 	}
1947 out:
1948 	return rc;
1949 }
1950 
/*
 * Return the next matching established-hash socket after @cur.
 *
 * Relies on the lock of bucket st->bucket being held on entry (as left by
 * the call that returned @cur).  If the current chain has no further
 * match, that lock is released and the search continues from the next
 * bucket through established_get_first().
 */
1951 static void *established_get_next(struct seq_file *seq, void *cur)
1952 {
1953 	struct sock *sk = cur;
1954 	struct hlist_nulls_node *node;
1955 	struct tcp_iter_state *st = seq->private;
1956 	struct net *net = seq_file_net(seq);
1957 
1958 	++st->num;
1959 	++st->offset;
1960 
1961 	sk = sk_nulls_next(sk);
1962 
1963 	sk_nulls_for_each_from(sk, node) {
1964 		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
1965 			return sk;
1966 	}
1967 
1968 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1969 	++st->bucket;
1970 	return established_get_first(seq);
1971 }
1972 
1973 static void *established_get_idx(struct seq_file *seq, loff_t pos)
1974 {
1975 	struct tcp_iter_state *st = seq->private;
1976 	void *rc;
1977 
1978 	st->bucket = 0;
1979 	rc = established_get_first(seq);
1980 
1981 	while (rc && pos) {
1982 		rc = established_get_next(seq, rc);
1983 		--pos;
1984 	}
1985 	return rc;
1986 }
1987 
1988 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
1989 {
1990 	void *rc;
1991 	struct tcp_iter_state *st = seq->private;
1992 
1993 	st->state = TCP_SEQ_STATE_LISTENING;
1994 	rc	  = listening_get_idx(seq, &pos);
1995 
1996 	if (!rc) {
1997 		st->state = TCP_SEQ_STATE_ESTABLISHED;
1998 		rc	  = established_get_idx(seq, pos);
1999 	}
2000 
2001 	return rc;
2002 }
2003 
/*
 * Try to re-position the iterator at the entry recorded in st->state,
 * st->bucket and st->offset, by walking st->offset entries from the
 * start of that bucket.
 *
 * st->num is restored to its value on entry, since the get_next helpers
 * increment it while stepping.  Returns the entry found or NULL.
 */
2004 static void *tcp_seek_last_pos(struct seq_file *seq)
2005 {
2006 	struct tcp_iter_state *st = seq->private;
2007 	int offset = st->offset;
2008 	int orig_num = st->num;
2009 	void *rc = NULL;
2010 
2011 	switch (st->state) {
2012 	case TCP_SEQ_STATE_LISTENING:
2013 		if (st->bucket >= INET_LHTABLE_SIZE)
2014 			break;
2015 		st->state = TCP_SEQ_STATE_LISTENING;
2016 		rc = listening_get_next(seq, NULL);
2017 		while (offset-- && rc)
2018 			rc = listening_get_next(seq, rc);
2019 		if (rc)
2020 			break;
		/* Listening table exhausted: continue in the established hash
		 * with whatever remains of the local offset counter.
		 */
2021 		st->bucket = 0;
2022 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2023 		/* Fallthrough */
2024 	case TCP_SEQ_STATE_ESTABLISHED:
2025 		if (st->bucket > tcp_hashinfo.ehash_mask)
2026 			break;
2027 		rc = established_get_first(seq);
2028 		while (offset-- && rc)
2029 			rc = established_get_next(seq, rc);
2030 	}
2031 
2032 	st->num = orig_num;
2033 
2034 	return rc;
2035 }
2036 
/*
 * seq_file ->start callback.
 *
 * When *@pos is non-zero and equals st->last_pos, resuming through
 * tcp_seek_last_pos() is tried first.  Otherwise, or if that finds
 * nothing, the iterator state is reset and the walk restarts from the
 * beginning.  Position 0 yields SEQ_START_TOKEN; position N > 0 yields
 * the entry at index N - 1.  st->last_pos is updated to *@pos.
 */
2037 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2038 {
2039 	struct tcp_iter_state *st = seq->private;
2040 	void *rc;
2041 
2042 	if (*pos && *pos == st->last_pos) {
2043 		rc = tcp_seek_last_pos(seq);
2044 		if (rc)
2045 			goto out;
2046 	}
2047 
2048 	st->state = TCP_SEQ_STATE_LISTENING;
2049 	st->num = 0;
2050 	st->bucket = 0;
2051 	st->offset = 0;
2052 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2053 
2054 out:
2055 	st->last_pos = *pos;
2056 	return rc;
2057 }
2058 
/*
 * seq_file ->next callback: advance from entry @v to the following one.
 *
 * After SEQ_START_TOKEN the first real entry is fetched.  When the
 * listening table is exhausted the iterator switches to the established
 * hash, starting at bucket 0.  *@pos is always incremented and mirrored
 * into st->last_pos.
 */
2059 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2060 {
2061 	struct tcp_iter_state *st = seq->private;
2062 	void *rc = NULL;
2063 
2064 	if (v == SEQ_START_TOKEN) {
2065 		rc = tcp_get_idx(seq, 0);
2066 		goto out;
2067 	}
2068 
2069 	switch (st->state) {
2070 	case TCP_SEQ_STATE_LISTENING:
2071 		rc = listening_get_next(seq, v);
2072 		if (!rc) {
2073 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2074 			st->bucket = 0;
2075 			st->offset = 0;
2076 			rc	  = established_get_first(seq);
2077 		}
2078 		break;
2079 	case TCP_SEQ_STATE_ESTABLISHED:
2080 		rc = established_get_next(seq, v);
2081 		break;
2082 	}
2083 out:
2084 	++*pos;
2085 	st->last_pos = *pos;
2086 	return rc;
2087 }
2088 
2089 static void tcp_seq_stop(struct seq_file *seq, void *v)
2090 {
2091 	struct tcp_iter_state *st = seq->private;
2092 
2093 	switch (st->state) {
2094 	case TCP_SEQ_STATE_LISTENING:
2095 		if (v != SEQ_START_TOKEN)
2096 			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2097 		break;
2098 	case TCP_SEQ_STATE_ESTABLISHED:
2099 		if (v)
2100 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2101 		break;
2102 	}
2103 }
2104 
2105 int tcp_seq_open(struct inode *inode, struct file *file)
2106 {
2107 	struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2108 	struct tcp_iter_state *s;
2109 	int err;
2110 
2111 	err = seq_open_net(inode, file, &afinfo->seq_ops,
2112 			  sizeof(struct tcp_iter_state));
2113 	if (err < 0)
2114 		return err;
2115 
2116 	s = ((struct seq_file *)file->private_data)->private;
2117 	s->family		= afinfo->family;
2118 	s->last_pos		= 0;
2119 	return 0;
2120 }
2121 EXPORT_SYMBOL(tcp_seq_open);
2122 
2123 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2124 {
2125 	int rc = 0;
2126 	struct proc_dir_entry *p;
2127 
2128 	afinfo->seq_ops.start		= tcp_seq_start;
2129 	afinfo->seq_ops.next		= tcp_seq_next;
2130 	afinfo->seq_ops.stop		= tcp_seq_stop;
2131 
2132 	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2133 			     afinfo->seq_fops, afinfo);
2134 	if (!p)
2135 		rc = -ENOMEM;
2136 	return rc;
2137 }
2138 EXPORT_SYMBOL(tcp_proc_register);
2139 
2140 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2141 {
2142 	remove_proc_entry(afinfo->name, net->proc_net);
2143 }
2144 EXPORT_SYMBOL(tcp_proc_unregister);
2145 
2146 static void get_openreq4(const struct request_sock *req,
2147 			 struct seq_file *f, int i)
2148 {
2149 	const struct inet_request_sock *ireq = inet_rsk(req);
2150 	long delta = req->rsk_timer.expires - jiffies;
2151 
2152 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2153 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2154 		i,
2155 		ireq->ir_loc_addr,
2156 		ireq->ir_num,
2157 		ireq->ir_rmt_addr,
2158 		ntohs(ireq->ir_rmt_port),
2159 		TCP_SYN_RECV,
2160 		0, 0, /* could print option size, but that is af dependent. */
2161 		1,    /* timers active (only the expire timer) */
2162 		jiffies_delta_to_clock_t(delta),
2163 		req->num_timeout,
2164 		from_kuid_munged(seq_user_ns(f),
2165 				 sock_i_uid(req->rsk_listener)),
2166 		0,  /* non standard timer */
2167 		0, /* open_requests have no inode */
2168 		0,
2169 		req);
2170 }
2171 
2172 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2173 {
2174 	int timer_active;
2175 	unsigned long timer_expires;
2176 	const struct tcp_sock *tp = tcp_sk(sk);
2177 	const struct inet_connection_sock *icsk = inet_csk(sk);
2178 	const struct inet_sock *inet = inet_sk(sk);
2179 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2180 	__be32 dest = inet->inet_daddr;
2181 	__be32 src = inet->inet_rcv_saddr;
2182 	__u16 destp = ntohs(inet->inet_dport);
2183 	__u16 srcp = ntohs(inet->inet_sport);
2184 	int rx_queue;
2185 	int state;
2186 
2187 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2188 	    icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2189 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2190 		timer_active	= 1;
2191 		timer_expires	= icsk->icsk_timeout;
2192 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2193 		timer_active	= 4;
2194 		timer_expires	= icsk->icsk_timeout;
2195 	} else if (timer_pending(&sk->sk_timer)) {
2196 		timer_active	= 2;
2197 		timer_expires	= sk->sk_timer.expires;
2198 	} else {
2199 		timer_active	= 0;
2200 		timer_expires = jiffies;
2201 	}
2202 
2203 	state = sk_state_load(sk);
2204 	if (state == TCP_LISTEN)
2205 		rx_queue = sk->sk_ack_backlog;
2206 	else
2207 		/* Because we don't lock the socket,
2208 		 * we might find a transient negative value.
2209 		 */
2210 		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2211 
2212 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2213 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2214 		i, src, srcp, dest, destp, state,
2215 		tp->write_seq - tp->snd_una,
2216 		rx_queue,
2217 		timer_active,
2218 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2219 		icsk->icsk_retransmits,
2220 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2221 		icsk->icsk_probes_out,
2222 		sock_i_ino(sk),
2223 		atomic_read(&sk->sk_refcnt), sk,
2224 		jiffies_to_clock_t(icsk->icsk_rto),
2225 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2226 		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2227 		tp->snd_cwnd,
2228 		state == TCP_LISTEN ?
2229 		    fastopenq->max_qlen :
2230 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2231 }
2232 
2233 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2234 			       struct seq_file *f, int i)
2235 {
2236 	long delta = tw->tw_timer.expires - jiffies;
2237 	__be32 dest, src;
2238 	__u16 destp, srcp;
2239 
2240 	dest  = tw->tw_daddr;
2241 	src   = tw->tw_rcv_saddr;
2242 	destp = ntohs(tw->tw_dport);
2243 	srcp  = ntohs(tw->tw_sport);
2244 
2245 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2246 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2247 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2248 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2249 		atomic_read(&tw->tw_refcnt), tw);
2250 }
2251 
2252 #define TMPSZ 150
2253 
2254 static int tcp4_seq_show(struct seq_file *seq, void *v)
2255 {
2256 	struct tcp_iter_state *st;
2257 	struct sock *sk = v;
2258 
2259 	seq_setwidth(seq, TMPSZ - 1);
2260 	if (v == SEQ_START_TOKEN) {
2261 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2262 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2263 			   "inode");
2264 		goto out;
2265 	}
2266 	st = seq->private;
2267 
2268 	if (sk->sk_state == TCP_TIME_WAIT)
2269 		get_timewait4_sock(v, seq, st->num);
2270 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2271 		get_openreq4(v, seq, st->num);
2272 	else
2273 		get_tcp4_sock(v, seq, st->num);
2274 out:
2275 	seq_pad(seq, '\n');
2276 	return 0;
2277 }
2278 
2279 static const struct file_operations tcp_afinfo_seq_fops = {
2280 	.owner   = THIS_MODULE,
2281 	.open    = tcp_seq_open,
2282 	.read    = seq_read,
2283 	.llseek  = seq_lseek,
2284 	.release = seq_release_net
2285 };
2286 
2287 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2288 	.name		= "tcp",
2289 	.family		= AF_INET,
2290 	.seq_fops	= &tcp_afinfo_seq_fops,
2291 	.seq_ops	= {
2292 		.show		= tcp4_seq_show,
2293 	},
2294 };
2295 
2296 static int __net_init tcp4_proc_init_net(struct net *net)
2297 {
2298 	return tcp_proc_register(net, &tcp4_seq_afinfo);
2299 }
2300 
2301 static void __net_exit tcp4_proc_exit_net(struct net *net)
2302 {
2303 	tcp_proc_unregister(net, &tcp4_seq_afinfo);
2304 }
2305 
2306 static struct pernet_operations tcp4_net_ops = {
2307 	.init = tcp4_proc_init_net,
2308 	.exit = tcp4_proc_exit_net,
2309 };
2310 
2311 int __init tcp4_proc_init(void)
2312 {
2313 	return register_pernet_subsys(&tcp4_net_ops);
2314 }
2315 
2316 void tcp4_proc_exit(void)
2317 {
2318 	unregister_pernet_subsys(&tcp4_net_ops);
2319 }
2320 #endif /* CONFIG_PROC_FS */
2321 
2322 struct proto tcp_prot = {
2323 	.name			= "TCP",
2324 	.owner			= THIS_MODULE,
2325 	.close			= tcp_close,
2326 	.connect		= tcp_v4_connect,
2327 	.disconnect		= tcp_disconnect,
2328 	.accept			= inet_csk_accept,
2329 	.ioctl			= tcp_ioctl,
2330 	.init			= tcp_v4_init_sock,
2331 	.destroy		= tcp_v4_destroy_sock,
2332 	.shutdown		= tcp_shutdown,
2333 	.setsockopt		= tcp_setsockopt,
2334 	.getsockopt		= tcp_getsockopt,
2335 	.recvmsg		= tcp_recvmsg,
2336 	.sendmsg		= tcp_sendmsg,
2337 	.sendpage		= tcp_sendpage,
2338 	.backlog_rcv		= tcp_v4_do_rcv,
2339 	.release_cb		= tcp_release_cb,
2340 	.hash			= inet_hash,
2341 	.unhash			= inet_unhash,
2342 	.get_port		= inet_csk_get_port,
2343 	.enter_memory_pressure	= tcp_enter_memory_pressure,
2344 	.stream_memory_free	= tcp_stream_memory_free,
2345 	.sockets_allocated	= &tcp_sockets_allocated,
2346 	.orphan_count		= &tcp_orphan_count,
2347 	.memory_allocated	= &tcp_memory_allocated,
2348 	.memory_pressure	= &tcp_memory_pressure,
2349 	.sysctl_mem		= sysctl_tcp_mem,
2350 	.sysctl_wmem		= sysctl_tcp_wmem,
2351 	.sysctl_rmem		= sysctl_tcp_rmem,
2352 	.max_header		= MAX_TCP_HEADER,
2353 	.obj_size		= sizeof(struct tcp_sock),
2354 	.slab_flags		= SLAB_DESTROY_BY_RCU,
2355 	.twsk_prot		= &tcp_timewait_sock_ops,
2356 	.rsk_prot		= &tcp_request_sock_ops,
2357 	.h.hashinfo		= &tcp_hashinfo,
2358 	.no_autobind		= true,
2359 #ifdef CONFIG_COMPAT
2360 	.compat_setsockopt	= compat_tcp_setsockopt,
2361 	.compat_getsockopt	= compat_tcp_getsockopt,
2362 #endif
2363 	.diag_destroy		= tcp_abort,
2364 };
2365 EXPORT_SYMBOL(tcp_prot);
2366 
2367 static void __net_exit tcp_sk_exit(struct net *net)
2368 {
2369 	int cpu;
2370 
2371 	for_each_possible_cpu(cpu)
2372 		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2373 	free_percpu(net->ipv4.tcp_sk);
2374 }
2375 
2376 static int __net_init tcp_sk_init(struct net *net)
2377 {
2378 	int res, cpu;
2379 
2380 	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2381 	if (!net->ipv4.tcp_sk)
2382 		return -ENOMEM;
2383 
2384 	for_each_possible_cpu(cpu) {
2385 		struct sock *sk;
2386 
2387 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2388 					   IPPROTO_TCP, net);
2389 		if (res)
2390 			goto fail;
2391 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2392 		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2393 	}
2394 
2395 	net->ipv4.sysctl_tcp_ecn = 2;
2396 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
2397 
2398 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2399 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2400 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2401 
2402 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2403 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2404 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2405 
2406 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2407 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2408 	net->ipv4.sysctl_tcp_syncookies = 1;
2409 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2410 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2411 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2412 	net->ipv4.sysctl_tcp_orphan_retries = 0;
2413 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2414 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2415 
2416 	return 0;
2417 fail:
2418 	tcp_sk_exit(net);
2419 
2420 	return res;
2421 }
2422 
2423 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2424 {
2425 	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2426 }
2427 
2428 static struct pernet_operations __net_initdata tcp_sk_ops = {
2429        .init	   = tcp_sk_init,
2430        .exit	   = tcp_sk_exit,
2431        .exit_batch = tcp_sk_exit_batch,
2432 };
2433 
2434 void __init tcp_v4_init(void)
2435 {
2436 	inet_hashinfo_init(&tcp_hashinfo);
2437 	if (register_pernet_subsys(&tcp_sk_ops))
2438 		panic("Failed to create the TCP control socket.\n");
2439 }
2440