xref: /linux/net/ipv4/tcp_ipv4.c (revision 4949009eb8d40a441dcddcd96e101e77d31cf1b2)
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol (TCP).
 *
 *		IPv4 specific functions
 *
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 */
23 
/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *	     				Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after a year
 *					in a coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */
52 
53 #define pr_fmt(fmt) "TCP: " fmt
54 
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65 
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/secure_seq.h>
76 #include <net/tcp_memcontrol.h>
77 #include <net/busy_poll.h>
78 
79 #include <linux/inet.h>
80 #include <linux/ipv6.h>
81 #include <linux/stddef.h>
82 #include <linux/proc_fs.h>
83 #include <linux/seq_file.h>
84 
85 #include <linux/crypto.h>
86 #include <linux/scatterlist.h>
87 
88 int sysctl_tcp_tw_reuse __read_mostly;
89 int sysctl_tcp_low_latency __read_mostly;
90 EXPORT_SYMBOL(sysctl_tcp_low_latency);
91 
92 #ifdef CONFIG_TCP_MD5SIG
93 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
94 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
95 #endif
96 
97 struct inet_hashinfo tcp_hashinfo;
98 EXPORT_SYMBOL(tcp_hashinfo);
99 
100 static  __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
101 {
102 	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
103 					  ip_hdr(skb)->saddr,
104 					  tcp_hdr(skb)->dest,
105 					  tcp_hdr(skb)->source);
106 }
107 
108 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
109 {
110 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
111 	struct tcp_sock *tp = tcp_sk(sk);
112 
113 	/* With PAWS, it is safe from the viewpoint
114 	   of data integrity. Even without PAWS it is safe provided sequence
115 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
116 
117 	   Actually, the idea is close to VJ's one, only timestamp cache is
118 	   held not per host, but per port pair and TW bucket is used as state
119 	   holder.
120 
121 	   If TW bucket has been already destroyed we fall back to VJ's scheme
122 	   and use initial timestamp retrieved from peer table.
123 	 */
124 	if (tcptw->tw_ts_recent_stamp &&
125 	    (twp == NULL || (sysctl_tcp_tw_reuse &&
126 			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
127 		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
128 		if (tp->write_seq == 0)
129 			tp->write_seq = 1;
130 		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
131 		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
132 		sock_hold(sktw);
133 		return 1;
134 	}
135 
136 	return 0;
137 }
138 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
139 
140 /* This will initiate an outgoing connection. */
141 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
142 {
143 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
144 	struct inet_sock *inet = inet_sk(sk);
145 	struct tcp_sock *tp = tcp_sk(sk);
146 	__be16 orig_sport, orig_dport;
147 	__be32 daddr, nexthop;
148 	struct flowi4 *fl4;
149 	struct rtable *rt;
150 	int err;
151 	struct ip_options_rcu *inet_opt;
152 
153 	if (addr_len < sizeof(struct sockaddr_in))
154 		return -EINVAL;
155 
156 	if (usin->sin_family != AF_INET)
157 		return -EAFNOSUPPORT;
158 
159 	nexthop = daddr = usin->sin_addr.s_addr;
160 	inet_opt = rcu_dereference_protected(inet->inet_opt,
161 					     sock_owned_by_user(sk));
162 	if (inet_opt && inet_opt->opt.srr) {
163 		if (!daddr)
164 			return -EINVAL;
165 		nexthop = inet_opt->opt.faddr;
166 	}
167 
168 	orig_sport = inet->inet_sport;
169 	orig_dport = usin->sin_port;
170 	fl4 = &inet->cork.fl.u.ip4;
171 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
172 			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
173 			      IPPROTO_TCP,
174 			      orig_sport, orig_dport, sk);
175 	if (IS_ERR(rt)) {
176 		err = PTR_ERR(rt);
177 		if (err == -ENETUNREACH)
178 			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
179 		return err;
180 	}
181 
182 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
183 		ip_rt_put(rt);
184 		return -ENETUNREACH;
185 	}
186 
187 	if (!inet_opt || !inet_opt->opt.srr)
188 		daddr = fl4->daddr;
189 
190 	if (!inet->inet_saddr)
191 		inet->inet_saddr = fl4->saddr;
192 	inet->inet_rcv_saddr = inet->inet_saddr;
193 
194 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
195 		/* Reset inherited state */
196 		tp->rx_opt.ts_recent	   = 0;
197 		tp->rx_opt.ts_recent_stamp = 0;
198 		if (likely(!tp->repair))
199 			tp->write_seq	   = 0;
200 	}
201 
202 	if (tcp_death_row.sysctl_tw_recycle &&
203 	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
204 		tcp_fetch_timewait_stamp(sk, &rt->dst);
205 
206 	inet->inet_dport = usin->sin_port;
207 	inet->inet_daddr = daddr;
208 
209 	inet_csk(sk)->icsk_ext_hdr_len = 0;
210 	if (inet_opt)
211 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
212 
213 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
214 
215 	/* Socket identity is still unknown (sport may be zero).
216 	 * However we set state to SYN-SENT and not releasing socket
217 	 * lock select source port, enter ourselves into the hash tables and
218 	 * complete initialization after this.
219 	 */
220 	tcp_set_state(sk, TCP_SYN_SENT);
221 	err = inet_hash_connect(&tcp_death_row, sk);
222 	if (err)
223 		goto failure;
224 
225 	inet_set_txhash(sk);
226 
227 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
228 			       inet->inet_sport, inet->inet_dport, sk);
229 	if (IS_ERR(rt)) {
230 		err = PTR_ERR(rt);
231 		rt = NULL;
232 		goto failure;
233 	}
234 	/* OK, now commit destination to socket.  */
235 	sk->sk_gso_type = SKB_GSO_TCPV4;
236 	sk_setup_caps(sk, &rt->dst);
237 
238 	if (!tp->write_seq && likely(!tp->repair))
239 		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
240 							   inet->inet_daddr,
241 							   inet->inet_sport,
242 							   usin->sin_port);
243 
244 	inet->inet_id = tp->write_seq ^ jiffies;
245 
246 	err = tcp_connect(sk);
247 
248 	rt = NULL;
249 	if (err)
250 		goto failure;
251 
252 	return 0;
253 
254 failure:
255 	/*
256 	 * This unhashes the socket and releases the local port,
257 	 * if necessary.
258 	 */
259 	tcp_set_state(sk, TCP_CLOSE);
260 	ip_rt_put(rt);
261 	sk->sk_route_caps = 0;
262 	inet->inet_dport = 0;
263 	return err;
264 }
265 EXPORT_SYMBOL(tcp_v4_connect);
266 
267 /*
268  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
269  * It can be called through tcp_release_cb() if socket was owned by user
270  * at the time tcp_v4_err() was called to handle ICMP message.
271  */
272 void tcp_v4_mtu_reduced(struct sock *sk)
273 {
274 	struct dst_entry *dst;
275 	struct inet_sock *inet = inet_sk(sk);
276 	u32 mtu = tcp_sk(sk)->mtu_info;
277 
278 	dst = inet_csk_update_pmtu(sk, mtu);
279 	if (!dst)
280 		return;
281 
282 	/* Something is about to be wrong... Remember soft error
283 	 * for the case, if this connection will not able to recover.
284 	 */
285 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
286 		sk->sk_err_soft = EMSGSIZE;
287 
288 	mtu = dst_mtu(dst);
289 
290 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
291 	    ip_sk_accept_pmtu(sk) &&
292 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
293 		tcp_sync_mss(sk, mtu);
294 
295 		/* Resend the TCP packet because it's
296 		 * clear that the old packet has been
297 		 * dropped. This is the new "fast" path mtu
298 		 * discovery.
299 		 */
300 		tcp_simple_retransmit(sk);
301 	} /* else let the usual retransmit timer handle it */
302 }
303 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
304 
305 static void do_redirect(struct sk_buff *skb, struct sock *sk)
306 {
307 	struct dst_entry *dst = __sk_dst_check(sk, 0);
308 
309 	if (dst)
310 		dst->ops->redirect(dst, sk, skb);
311 }
312 
313 /*
314  * This routine is called by the ICMP module when it gets some
315  * sort of error condition.  If err < 0 then the socket should
316  * be closed and the error returned to the user.  If err > 0
317  * it's just the icmp type << 8 | icmp code.  After adjustment
318  * header points to the first 8 bytes of the tcp header.  We need
319  * to find the appropriate port.
320  *
321  * The locking strategy used here is very "optimistic". When
322  * someone else accesses the socket the ICMP is just dropped
323  * and for some paths there is no check at all.
324  * A more general error queue to queue errors for later handling
325  * is probably better.
326  *
327  */
328 
329 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
330 {
331 	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
332 	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
333 	struct inet_connection_sock *icsk;
334 	struct tcp_sock *tp;
335 	struct inet_sock *inet;
336 	const int type = icmp_hdr(icmp_skb)->type;
337 	const int code = icmp_hdr(icmp_skb)->code;
338 	struct sock *sk;
339 	struct sk_buff *skb;
340 	struct request_sock *fastopen;
341 	__u32 seq, snd_una;
342 	__u32 remaining;
343 	int err;
344 	struct net *net = dev_net(icmp_skb->dev);
345 
346 	sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
347 			iph->saddr, th->source, inet_iif(icmp_skb));
348 	if (!sk) {
349 		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
350 		return;
351 	}
352 	if (sk->sk_state == TCP_TIME_WAIT) {
353 		inet_twsk_put(inet_twsk(sk));
354 		return;
355 	}
356 
357 	bh_lock_sock(sk);
358 	/* If too many ICMPs get dropped on busy
359 	 * servers this needs to be solved differently.
360 	 * We do take care of PMTU discovery (RFC1191) special case :
361 	 * we can receive locally generated ICMP messages while socket is held.
362 	 */
363 	if (sock_owned_by_user(sk)) {
364 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
365 			NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
366 	}
367 	if (sk->sk_state == TCP_CLOSE)
368 		goto out;
369 
370 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
371 		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
372 		goto out;
373 	}
374 
375 	icsk = inet_csk(sk);
376 	tp = tcp_sk(sk);
377 	seq = ntohl(th->seq);
378 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
379 	fastopen = tp->fastopen_rsk;
380 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
381 	if (sk->sk_state != TCP_LISTEN &&
382 	    !between(seq, snd_una, tp->snd_nxt)) {
383 		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
384 		goto out;
385 	}
386 
387 	switch (type) {
388 	case ICMP_REDIRECT:
389 		do_redirect(icmp_skb, sk);
390 		goto out;
391 	case ICMP_SOURCE_QUENCH:
392 		/* Just silently ignore these. */
393 		goto out;
394 	case ICMP_PARAMETERPROB:
395 		err = EPROTO;
396 		break;
397 	case ICMP_DEST_UNREACH:
398 		if (code > NR_ICMP_UNREACH)
399 			goto out;
400 
401 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
402 			/* We are not interested in TCP_LISTEN and open_requests
403 			 * (SYN-ACKs send out by Linux are always <576bytes so
404 			 * they should go through unfragmented).
405 			 */
406 			if (sk->sk_state == TCP_LISTEN)
407 				goto out;
408 
409 			tp->mtu_info = info;
410 			if (!sock_owned_by_user(sk)) {
411 				tcp_v4_mtu_reduced(sk);
412 			} else {
413 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
414 					sock_hold(sk);
415 			}
416 			goto out;
417 		}
418 
419 		err = icmp_err_convert[code].errno;
420 		/* check if icmp_skb allows revert of backoff
421 		 * (see draft-zimmermann-tcp-lcd) */
422 		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
423 			break;
424 		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
425 		    !icsk->icsk_backoff || fastopen)
426 			break;
427 
428 		if (sock_owned_by_user(sk))
429 			break;
430 
431 		icsk->icsk_backoff--;
432 		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
433 					       TCP_TIMEOUT_INIT;
434 		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
435 
436 		skb = tcp_write_queue_head(sk);
437 		BUG_ON(!skb);
438 
439 		remaining = icsk->icsk_rto -
440 			    min(icsk->icsk_rto,
441 				tcp_time_stamp - tcp_skb_timestamp(skb));
442 
443 		if (remaining) {
444 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
445 						  remaining, TCP_RTO_MAX);
446 		} else {
447 			/* RTO revert clocked out retransmission.
448 			 * Will retransmit now */
449 			tcp_retransmit_timer(sk);
450 		}
451 
452 		break;
453 	case ICMP_TIME_EXCEEDED:
454 		err = EHOSTUNREACH;
455 		break;
456 	default:
457 		goto out;
458 	}
459 
460 	switch (sk->sk_state) {
461 		struct request_sock *req, **prev;
462 	case TCP_LISTEN:
463 		if (sock_owned_by_user(sk))
464 			goto out;
465 
466 		req = inet_csk_search_req(sk, &prev, th->dest,
467 					  iph->daddr, iph->saddr);
468 		if (!req)
469 			goto out;
470 
471 		/* ICMPs are not backlogged, hence we cannot get
472 		   an established socket here.
473 		 */
474 		WARN_ON(req->sk);
475 
476 		if (seq != tcp_rsk(req)->snt_isn) {
477 			NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
478 			goto out;
479 		}
480 
481 		/*
482 		 * Still in SYN_RECV, just remove it silently.
483 		 * There is no good way to pass the error to the newly
484 		 * created socket, and POSIX does not want network
485 		 * errors returned from accept().
486 		 */
487 		inet_csk_reqsk_queue_drop(sk, req, prev);
488 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
489 		goto out;
490 
491 	case TCP_SYN_SENT:
492 	case TCP_SYN_RECV:
493 		/* Only in fast or simultaneous open. If a fast open socket is
494 		 * is already accepted it is treated as a connected one below.
495 		 */
496 		if (fastopen && fastopen->sk == NULL)
497 			break;
498 
499 		if (!sock_owned_by_user(sk)) {
500 			sk->sk_err = err;
501 
502 			sk->sk_error_report(sk);
503 
504 			tcp_done(sk);
505 		} else {
506 			sk->sk_err_soft = err;
507 		}
508 		goto out;
509 	}
510 
511 	/* If we've already connected we will keep trying
512 	 * until we time out, or the user gives up.
513 	 *
514 	 * rfc1122 4.2.3.9 allows to consider as hard errors
515 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
516 	 * but it is obsoleted by pmtu discovery).
517 	 *
518 	 * Note, that in modern internet, where routing is unreliable
519 	 * and in each dark corner broken firewalls sit, sending random
520 	 * errors ordered by their masters even this two messages finally lose
521 	 * their original sense (even Linux sends invalid PORT_UNREACHs)
522 	 *
523 	 * Now we are in compliance with RFCs.
524 	 *							--ANK (980905)
525 	 */
526 
527 	inet = inet_sk(sk);
528 	if (!sock_owned_by_user(sk) && inet->recverr) {
529 		sk->sk_err = err;
530 		sk->sk_error_report(sk);
531 	} else	{ /* Only an error on timeout */
532 		sk->sk_err_soft = err;
533 	}
534 
535 out:
536 	bh_unlock_sock(sk);
537 	sock_put(sk);
538 }
539 
540 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
541 {
542 	struct tcphdr *th = tcp_hdr(skb);
543 
544 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
545 		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
546 		skb->csum_start = skb_transport_header(skb) - skb->head;
547 		skb->csum_offset = offsetof(struct tcphdr, check);
548 	} else {
549 		th->check = tcp_v4_check(skb->len, saddr, daddr,
550 					 csum_partial(th,
551 						      th->doff << 2,
552 						      skb->csum));
553 	}
554 }
555 
556 /* This routine computes an IPv4 TCP checksum. */
557 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
558 {
559 	const struct inet_sock *inet = inet_sk(sk);
560 
561 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
562 }
563 EXPORT_SYMBOL(tcp_v4_send_check);
564 
565 /*
566  *	This routine will send an RST to the other tcp.
567  *
568  *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
569  *		      for reset.
570  *	Answer: if a packet caused RST, it is not for a socket
571  *		existing in our system, if it is matched to a socket,
572  *		it is just duplicate segment or bug in other side's TCP.
573  *		So that we build reply only basing on parameters
574  *		arrived with segment.
575  *	Exception: precedence violation. We do not implement it in any case.
576  */
577 
578 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
579 {
580 	const struct tcphdr *th = tcp_hdr(skb);
581 	struct {
582 		struct tcphdr th;
583 #ifdef CONFIG_TCP_MD5SIG
584 		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
585 #endif
586 	} rep;
587 	struct ip_reply_arg arg;
588 #ifdef CONFIG_TCP_MD5SIG
589 	struct tcp_md5sig_key *key;
590 	const __u8 *hash_location = NULL;
591 	unsigned char newhash[16];
592 	int genhash;
593 	struct sock *sk1 = NULL;
594 #endif
595 	struct net *net;
596 
597 	/* Never send a reset in response to a reset. */
598 	if (th->rst)
599 		return;
600 
601 	/* If sk not NULL, it means we did a successful lookup and incoming
602 	 * route had to be correct. prequeue might have dropped our dst.
603 	 */
604 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
605 		return;
606 
607 	/* Swap the send and the receive. */
608 	memset(&rep, 0, sizeof(rep));
609 	rep.th.dest   = th->source;
610 	rep.th.source = th->dest;
611 	rep.th.doff   = sizeof(struct tcphdr) / 4;
612 	rep.th.rst    = 1;
613 
614 	if (th->ack) {
615 		rep.th.seq = th->ack_seq;
616 	} else {
617 		rep.th.ack = 1;
618 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
619 				       skb->len - (th->doff << 2));
620 	}
621 
622 	memset(&arg, 0, sizeof(arg));
623 	arg.iov[0].iov_base = (unsigned char *)&rep;
624 	arg.iov[0].iov_len  = sizeof(rep.th);
625 
626 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
627 #ifdef CONFIG_TCP_MD5SIG
628 	hash_location = tcp_parse_md5sig_option(th);
629 	if (!sk && hash_location) {
630 		/*
631 		 * active side is lost. Try to find listening socket through
632 		 * source port, and then find md5 key through listening socket.
633 		 * we are not loose security here:
634 		 * Incoming packet is checked with md5 hash with finding key,
635 		 * no RST generated if md5 hash doesn't match.
636 		 */
637 		sk1 = __inet_lookup_listener(net,
638 					     &tcp_hashinfo, ip_hdr(skb)->saddr,
639 					     th->source, ip_hdr(skb)->daddr,
640 					     ntohs(th->source), inet_iif(skb));
641 		/* don't send rst if it can't find key */
642 		if (!sk1)
643 			return;
644 		rcu_read_lock();
645 		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
646 					&ip_hdr(skb)->saddr, AF_INET);
647 		if (!key)
648 			goto release_sk1;
649 
650 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, NULL, skb);
651 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
652 			goto release_sk1;
653 	} else {
654 		key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
655 					     &ip_hdr(skb)->saddr,
656 					     AF_INET) : NULL;
657 	}
658 
659 	if (key) {
660 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
661 				   (TCPOPT_NOP << 16) |
662 				   (TCPOPT_MD5SIG << 8) |
663 				   TCPOLEN_MD5SIG);
664 		/* Update length and the length the header thinks exists */
665 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
666 		rep.th.doff = arg.iov[0].iov_len / 4;
667 
668 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
669 				     key, ip_hdr(skb)->saddr,
670 				     ip_hdr(skb)->daddr, &rep.th);
671 	}
672 #endif
673 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
674 				      ip_hdr(skb)->saddr, /* XXX */
675 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
676 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
677 	arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
678 	/* When socket is gone, all binding information is lost.
679 	 * routing might fail in this case. No choice here, if we choose to force
680 	 * input interface, we will misroute in case of asymmetric route.
681 	 */
682 	if (sk)
683 		arg.bound_dev_if = sk->sk_bound_dev_if;
684 
685 	arg.tos = ip_hdr(skb)->tos;
686 	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
687 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
688 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
689 			      &arg, arg.iov[0].iov_len);
690 
691 	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
692 	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
693 
694 #ifdef CONFIG_TCP_MD5SIG
695 release_sk1:
696 	if (sk1) {
697 		rcu_read_unlock();
698 		sock_put(sk1);
699 	}
700 #endif
701 }
702 
703 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
704    outside socket context is ugly, certainly. What can I do?
705  */
706 
707 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
708 			    u32 win, u32 tsval, u32 tsecr, int oif,
709 			    struct tcp_md5sig_key *key,
710 			    int reply_flags, u8 tos)
711 {
712 	const struct tcphdr *th = tcp_hdr(skb);
713 	struct {
714 		struct tcphdr th;
715 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
716 #ifdef CONFIG_TCP_MD5SIG
717 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
718 #endif
719 			];
720 	} rep;
721 	struct ip_reply_arg arg;
722 	struct net *net = dev_net(skb_dst(skb)->dev);
723 
724 	memset(&rep.th, 0, sizeof(struct tcphdr));
725 	memset(&arg, 0, sizeof(arg));
726 
727 	arg.iov[0].iov_base = (unsigned char *)&rep;
728 	arg.iov[0].iov_len  = sizeof(rep.th);
729 	if (tsecr) {
730 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
731 				   (TCPOPT_TIMESTAMP << 8) |
732 				   TCPOLEN_TIMESTAMP);
733 		rep.opt[1] = htonl(tsval);
734 		rep.opt[2] = htonl(tsecr);
735 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
736 	}
737 
738 	/* Swap the send and the receive. */
739 	rep.th.dest    = th->source;
740 	rep.th.source  = th->dest;
741 	rep.th.doff    = arg.iov[0].iov_len / 4;
742 	rep.th.seq     = htonl(seq);
743 	rep.th.ack_seq = htonl(ack);
744 	rep.th.ack     = 1;
745 	rep.th.window  = htons(win);
746 
747 #ifdef CONFIG_TCP_MD5SIG
748 	if (key) {
749 		int offset = (tsecr) ? 3 : 0;
750 
751 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
752 					  (TCPOPT_NOP << 16) |
753 					  (TCPOPT_MD5SIG << 8) |
754 					  TCPOLEN_MD5SIG);
755 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
756 		rep.th.doff = arg.iov[0].iov_len/4;
757 
758 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
759 				    key, ip_hdr(skb)->saddr,
760 				    ip_hdr(skb)->daddr, &rep.th);
761 	}
762 #endif
763 	arg.flags = reply_flags;
764 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
765 				      ip_hdr(skb)->saddr, /* XXX */
766 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
767 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
768 	if (oif)
769 		arg.bound_dev_if = oif;
770 	arg.tos = tos;
771 	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
772 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
773 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
774 			      &arg, arg.iov[0].iov_len);
775 
776 	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
777 }
778 
779 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
780 {
781 	struct inet_timewait_sock *tw = inet_twsk(sk);
782 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
783 
784 	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
785 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
786 			tcp_time_stamp + tcptw->tw_ts_offset,
787 			tcptw->tw_ts_recent,
788 			tw->tw_bound_dev_if,
789 			tcp_twsk_md5_key(tcptw),
790 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
791 			tw->tw_tos
792 			);
793 
794 	inet_twsk_put(tw);
795 }
796 
797 static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
798 				  struct request_sock *req)
799 {
800 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
801 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
802 	 */
803 	tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
804 			tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
805 			tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
806 			tcp_time_stamp,
807 			req->ts_recent,
808 			0,
809 			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
810 					  AF_INET),
811 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
812 			ip_hdr(skb)->tos);
813 }
814 
815 /*
816  *	Send a SYN-ACK after having received a SYN.
817  *	This still operates on a request_sock only, not on a big
818  *	socket.
819  */
820 static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
821 			      struct flowi *fl,
822 			      struct request_sock *req,
823 			      u16 queue_mapping,
824 			      struct tcp_fastopen_cookie *foc)
825 {
826 	const struct inet_request_sock *ireq = inet_rsk(req);
827 	struct flowi4 fl4;
828 	int err = -1;
829 	struct sk_buff *skb;
830 
831 	/* First, grab a route. */
832 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
833 		return -1;
834 
835 	skb = tcp_make_synack(sk, dst, req, foc);
836 
837 	if (skb) {
838 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
839 
840 		skb_set_queue_mapping(skb, queue_mapping);
841 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
842 					    ireq->ir_rmt_addr,
843 					    ireq->opt);
844 		err = net_xmit_eval(err);
845 	}
846 
847 	return err;
848 }
849 
850 /*
851  *	IPv4 request_sock destructor.
852  */
853 static void tcp_v4_reqsk_destructor(struct request_sock *req)
854 {
855 	kfree(inet_rsk(req)->opt);
856 }
857 
858 /*
859  * Return true if a syncookie should be sent
860  */
861 bool tcp_syn_flood_action(struct sock *sk,
862 			 const struct sk_buff *skb,
863 			 const char *proto)
864 {
865 	const char *msg = "Dropping request";
866 	bool want_cookie = false;
867 	struct listen_sock *lopt;
868 
869 #ifdef CONFIG_SYN_COOKIES
870 	if (sysctl_tcp_syncookies) {
871 		msg = "Sending cookies";
872 		want_cookie = true;
873 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
874 	} else
875 #endif
876 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
877 
878 	lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
879 	if (!lopt->synflood_warned && sysctl_tcp_syncookies != 2) {
880 		lopt->synflood_warned = 1;
881 		pr_info("%s: Possible SYN flooding on port %d. %s.  Check SNMP counters.\n",
882 			proto, ntohs(tcp_hdr(skb)->dest), msg);
883 	}
884 	return want_cookie;
885 }
886 EXPORT_SYMBOL(tcp_syn_flood_action);
887 
888 #ifdef CONFIG_TCP_MD5SIG
889 /*
890  * RFC2385 MD5 checksumming requires a mapping of
891  * IP address->MD5 Key.
892  * We need to maintain these in the sk structure.
893  */
894 
895 /* Find the Key structure for an address.  */
896 struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
897 					 const union tcp_md5_addr *addr,
898 					 int family)
899 {
900 	struct tcp_sock *tp = tcp_sk(sk);
901 	struct tcp_md5sig_key *key;
902 	unsigned int size = sizeof(struct in_addr);
903 	struct tcp_md5sig_info *md5sig;
904 
905 	/* caller either holds rcu_read_lock() or socket lock */
906 	md5sig = rcu_dereference_check(tp->md5sig_info,
907 				       sock_owned_by_user(sk) ||
908 				       lockdep_is_held(&sk->sk_lock.slock));
909 	if (!md5sig)
910 		return NULL;
911 #if IS_ENABLED(CONFIG_IPV6)
912 	if (family == AF_INET6)
913 		size = sizeof(struct in6_addr);
914 #endif
915 	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
916 		if (key->family != family)
917 			continue;
918 		if (!memcmp(&key->addr, addr, size))
919 			return key;
920 	}
921 	return NULL;
922 }
923 EXPORT_SYMBOL(tcp_md5_do_lookup);
924 
925 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
926 					 struct sock *addr_sk)
927 {
928 	union tcp_md5_addr *addr;
929 
930 	addr = (union tcp_md5_addr *)&inet_sk(addr_sk)->inet_daddr;
931 	return tcp_md5_do_lookup(sk, addr, AF_INET);
932 }
933 EXPORT_SYMBOL(tcp_v4_md5_lookup);
934 
935 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
936 						      struct request_sock *req)
937 {
938 	union tcp_md5_addr *addr;
939 
940 	addr = (union tcp_md5_addr *)&inet_rsk(req)->ir_rmt_addr;
941 	return tcp_md5_do_lookup(sk, addr, AF_INET);
942 }
943 
944 /* This can be called on a newly created socket, from other files */
945 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
946 		   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
947 {
948 	/* Add Key to the list */
949 	struct tcp_md5sig_key *key;
950 	struct tcp_sock *tp = tcp_sk(sk);
951 	struct tcp_md5sig_info *md5sig;
952 
953 	key = tcp_md5_do_lookup(sk, addr, family);
954 	if (key) {
955 		/* Pre-existing entry - just update that one. */
956 		memcpy(key->key, newkey, newkeylen);
957 		key->keylen = newkeylen;
958 		return 0;
959 	}
960 
961 	md5sig = rcu_dereference_protected(tp->md5sig_info,
962 					   sock_owned_by_user(sk));
963 	if (!md5sig) {
964 		md5sig = kmalloc(sizeof(*md5sig), gfp);
965 		if (!md5sig)
966 			return -ENOMEM;
967 
968 		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
969 		INIT_HLIST_HEAD(&md5sig->head);
970 		rcu_assign_pointer(tp->md5sig_info, md5sig);
971 	}
972 
973 	key = sock_kmalloc(sk, sizeof(*key), gfp);
974 	if (!key)
975 		return -ENOMEM;
976 	if (!tcp_alloc_md5sig_pool()) {
977 		sock_kfree_s(sk, key, sizeof(*key));
978 		return -ENOMEM;
979 	}
980 
981 	memcpy(key->key, newkey, newkeylen);
982 	key->keylen = newkeylen;
983 	key->family = family;
984 	memcpy(&key->addr, addr,
985 	       (family == AF_INET6) ? sizeof(struct in6_addr) :
986 				      sizeof(struct in_addr));
987 	hlist_add_head_rcu(&key->node, &md5sig->head);
988 	return 0;
989 }
990 EXPORT_SYMBOL(tcp_md5_do_add);
991 
992 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
993 {
994 	struct tcp_md5sig_key *key;
995 
996 	key = tcp_md5_do_lookup(sk, addr, family);
997 	if (!key)
998 		return -ENOENT;
999 	hlist_del_rcu(&key->node);
1000 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1001 	kfree_rcu(key, rcu);
1002 	return 0;
1003 }
1004 EXPORT_SYMBOL(tcp_md5_do_del);
1005 
1006 static void tcp_clear_md5_list(struct sock *sk)
1007 {
1008 	struct tcp_sock *tp = tcp_sk(sk);
1009 	struct tcp_md5sig_key *key;
1010 	struct hlist_node *n;
1011 	struct tcp_md5sig_info *md5sig;
1012 
1013 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1014 
1015 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1016 		hlist_del_rcu(&key->node);
1017 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1018 		kfree_rcu(key, rcu);
1019 	}
1020 }
1021 
1022 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1023 				 int optlen)
1024 {
1025 	struct tcp_md5sig cmd;
1026 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1027 
1028 	if (optlen < sizeof(cmd))
1029 		return -EINVAL;
1030 
1031 	if (copy_from_user(&cmd, optval, sizeof(cmd)))
1032 		return -EFAULT;
1033 
1034 	if (sin->sin_family != AF_INET)
1035 		return -EINVAL;
1036 
1037 	if (!cmd.tcpm_keylen)
1038 		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1039 				      AF_INET);
1040 
1041 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1042 		return -EINVAL;
1043 
1044 	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1045 			      AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1046 			      GFP_KERNEL);
1047 }
1048 
1049 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1050 					__be32 daddr, __be32 saddr, int nbytes)
1051 {
1052 	struct tcp4_pseudohdr *bp;
1053 	struct scatterlist sg;
1054 
1055 	bp = &hp->md5_blk.ip4;
1056 
1057 	/*
1058 	 * 1. the TCP pseudo-header (in the order: source IP address,
1059 	 * destination IP address, zero-padded protocol number, and
1060 	 * segment length)
1061 	 */
1062 	bp->saddr = saddr;
1063 	bp->daddr = daddr;
1064 	bp->pad = 0;
1065 	bp->protocol = IPPROTO_TCP;
1066 	bp->len = cpu_to_be16(nbytes);
1067 
1068 	sg_init_one(&sg, bp, sizeof(*bp));
1069 	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1070 }
1071 
1072 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1073 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1074 {
1075 	struct tcp_md5sig_pool *hp;
1076 	struct hash_desc *desc;
1077 
1078 	hp = tcp_get_md5sig_pool();
1079 	if (!hp)
1080 		goto clear_hash_noput;
1081 	desc = &hp->md5_desc;
1082 
1083 	if (crypto_hash_init(desc))
1084 		goto clear_hash;
1085 	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1086 		goto clear_hash;
1087 	if (tcp_md5_hash_header(hp, th))
1088 		goto clear_hash;
1089 	if (tcp_md5_hash_key(hp, key))
1090 		goto clear_hash;
1091 	if (crypto_hash_final(desc, md5_hash))
1092 		goto clear_hash;
1093 
1094 	tcp_put_md5sig_pool();
1095 	return 0;
1096 
1097 clear_hash:
1098 	tcp_put_md5sig_pool();
1099 clear_hash_noput:
1100 	memset(md5_hash, 0, 16);
1101 	return 1;
1102 }
1103 
1104 int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1105 			const struct sock *sk, const struct request_sock *req,
1106 			const struct sk_buff *skb)
1107 {
1108 	struct tcp_md5sig_pool *hp;
1109 	struct hash_desc *desc;
1110 	const struct tcphdr *th = tcp_hdr(skb);
1111 	__be32 saddr, daddr;
1112 
1113 	if (sk) {
1114 		saddr = inet_sk(sk)->inet_saddr;
1115 		daddr = inet_sk(sk)->inet_daddr;
1116 	} else if (req) {
1117 		saddr = inet_rsk(req)->ir_loc_addr;
1118 		daddr = inet_rsk(req)->ir_rmt_addr;
1119 	} else {
1120 		const struct iphdr *iph = ip_hdr(skb);
1121 		saddr = iph->saddr;
1122 		daddr = iph->daddr;
1123 	}
1124 
1125 	hp = tcp_get_md5sig_pool();
1126 	if (!hp)
1127 		goto clear_hash_noput;
1128 	desc = &hp->md5_desc;
1129 
1130 	if (crypto_hash_init(desc))
1131 		goto clear_hash;
1132 
1133 	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1134 		goto clear_hash;
1135 	if (tcp_md5_hash_header(hp, th))
1136 		goto clear_hash;
1137 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1138 		goto clear_hash;
1139 	if (tcp_md5_hash_key(hp, key))
1140 		goto clear_hash;
1141 	if (crypto_hash_final(desc, md5_hash))
1142 		goto clear_hash;
1143 
1144 	tcp_put_md5sig_pool();
1145 	return 0;
1146 
1147 clear_hash:
1148 	tcp_put_md5sig_pool();
1149 clear_hash_noput:
1150 	memset(md5_hash, 0, 16);
1151 	return 1;
1152 }
1153 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1154 
1155 static bool __tcp_v4_inbound_md5_hash(struct sock *sk,
1156 				      const struct sk_buff *skb)
1157 {
1158 	/*
1159 	 * This gets called for each TCP segment that arrives
1160 	 * so we want to be efficient.
1161 	 * We have 3 drop cases:
1162 	 * o No MD5 hash and one expected.
1163 	 * o MD5 hash and we're not expecting one.
1164 	 * o MD5 hash and its wrong.
1165 	 */
1166 	const __u8 *hash_location = NULL;
1167 	struct tcp_md5sig_key *hash_expected;
1168 	const struct iphdr *iph = ip_hdr(skb);
1169 	const struct tcphdr *th = tcp_hdr(skb);
1170 	int genhash;
1171 	unsigned char newhash[16];
1172 
1173 	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1174 					  AF_INET);
1175 	hash_location = tcp_parse_md5sig_option(th);
1176 
1177 	/* We've parsed the options - do we have a hash? */
1178 	if (!hash_expected && !hash_location)
1179 		return false;
1180 
1181 	if (hash_expected && !hash_location) {
1182 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1183 		return true;
1184 	}
1185 
1186 	if (!hash_expected && hash_location) {
1187 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1188 		return true;
1189 	}
1190 
1191 	/* Okay, so this is hash_expected and hash_location -
1192 	 * so we need to calculate the checksum.
1193 	 */
1194 	genhash = tcp_v4_md5_hash_skb(newhash,
1195 				      hash_expected,
1196 				      NULL, NULL, skb);
1197 
1198 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1199 		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1200 				     &iph->saddr, ntohs(th->source),
1201 				     &iph->daddr, ntohs(th->dest),
1202 				     genhash ? " tcp_v4_calc_md5_hash failed"
1203 				     : "");
1204 		return true;
1205 	}
1206 	return false;
1207 }
1208 
1209 static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
1210 {
1211 	bool ret;
1212 
1213 	rcu_read_lock();
1214 	ret = __tcp_v4_inbound_md5_hash(sk, skb);
1215 	rcu_read_unlock();
1216 
1217 	return ret;
1218 }
1219 
1220 #endif
1221 
1222 static void tcp_v4_init_req(struct request_sock *req, struct sock *sk,
1223 			    struct sk_buff *skb)
1224 {
1225 	struct inet_request_sock *ireq = inet_rsk(req);
1226 
1227 	ireq->ir_loc_addr = ip_hdr(skb)->daddr;
1228 	ireq->ir_rmt_addr = ip_hdr(skb)->saddr;
1229 	ireq->no_srccheck = inet_sk(sk)->transparent;
1230 	ireq->opt = tcp_v4_save_options(skb);
1231 }
1232 
1233 static struct dst_entry *tcp_v4_route_req(struct sock *sk, struct flowi *fl,
1234 					  const struct request_sock *req,
1235 					  bool *strict)
1236 {
1237 	struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);
1238 
1239 	if (strict) {
1240 		if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
1241 			*strict = true;
1242 		else
1243 			*strict = false;
1244 	}
1245 
1246 	return dst;
1247 }
1248 
/*
 * Generic request-sock operations for TCP over IPv4 (family PF_INET).
 * Request objects are sized as struct tcp_request_sock.  This table is
 * handed to tcp_conn_request() by tcp_v4_conn_request().
 */
struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};
1258 
/*
 * IPv4 specific hooks used while processing a connection request.
 * The MD5 hooks exist only with CONFIG_TCP_MD5SIG and the cookie sequence
 * hook only with CONFIG_SYN_COOKIES.  This table is handed to
 * tcp_conn_request() by tcp_v4_conn_request().
 */
static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.md5_lookup	=	tcp_v4_reqsk_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#endif
	.init_req	=	tcp_v4_init_req,
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
#endif
	.route_req	=	tcp_v4_route_req,
	.init_seq	=	tcp_v4_init_sequence,
	.send_synack	=	tcp_v4_send_synack,
	.queue_hash_add =	inet_csk_reqsk_queue_hash_add,
};
1274 
1275 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1276 {
1277 	/* Never answer to SYNs send to broadcast or multicast */
1278 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1279 		goto drop;
1280 
1281 	return tcp_conn_request(&tcp_request_sock_ops,
1282 				&tcp_request_sock_ipv4_ops, sk, skb);
1283 
1284 drop:
1285 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1286 	return 0;
1287 }
1288 EXPORT_SYMBOL(tcp_v4_conn_request);
1289 
1290 
/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 *
 * sk:  the listening socket
 * skb: the segment that completed the handshake
 * req: the request sock describing the pending connection
 * dst: route to use, or NULL to have one looked up here (a non-NULL
 *      value is the syncookie case, see below)
 *
 * Returns the new child socket, or NULL when the accept queue is full,
 * the child could not be created, no route was found or the port could
 * not be inherited.  Every NULL return bumps LINUX_MIB_LISTENDROPS.
 */
struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	/* Copy the addressing and IP level state from the request. */
	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	newinet->inet_daddr   = ireq->ir_rmt_addr;
	newinet->inet_rcv_saddr = ireq->ir_loc_addr;
	newinet->inet_saddr	      = ireq->ir_loc_addr;
	inet_opt	      = ireq->opt;
	rcu_assign_pointer(newinet->inet_opt, inet_opt);
	/* The options now belong to the child; detach them from the request. */
	ireq->opt	      = NULL;
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	newinet->rcv_tos      = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	inet_set_txhash(newsk);
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	newinet->inet_id = newtp->write_seq ^ jiffies;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = dst_metric_advmss(dst);
	/* A user configured MSS on the listener caps the advertised MSS. */
	if (tcp_sk(sk)->rx_opt.user_mss &&
	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	/* Copy over the MD5 key from the original socket */
	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
				AF_INET);
	if (key != NULL) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
			       AF_INET, key->key, key->keylen, GFP_ATOMIC);
		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	__inet_hash_nolisten(newsk, NULL);

	return newsk;

	/*
	 * Error exits.  exit_overflow and exit_nonewsk fall through into
	 * each other and finally into exit; put_and_exit tears down the
	 * half built child first and then jumps to exit.
	 */
exit_overflow:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
	return NULL;
put_and_exit:
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1389 
1390 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1391 {
1392 	struct tcphdr *th = tcp_hdr(skb);
1393 	const struct iphdr *iph = ip_hdr(skb);
1394 	struct sock *nsk;
1395 	struct request_sock **prev;
1396 	/* Find possible connection requests. */
1397 	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1398 						       iph->saddr, iph->daddr);
1399 	if (req)
1400 		return tcp_check_req(sk, skb, req, prev, false);
1401 
1402 	nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1403 			th->source, iph->daddr, th->dest, inet_iif(skb));
1404 
1405 	if (nsk) {
1406 		if (nsk->sk_state != TCP_TIME_WAIT) {
1407 			bh_lock_sock(nsk);
1408 			return nsk;
1409 		}
1410 		inet_twsk_put(inet_twsk(nsk));
1411 		return NULL;
1412 	}
1413 
1414 #ifdef CONFIG_SYN_COOKIES
1415 	if (!th->syn)
1416 		sk = cookie_v4_check(sk, skb);
1417 #endif
1418 	return sk;
1419 }
1420 
1421 /* The socket must have it's spinlock held when we get
1422  * here.
1423  *
1424  * We have a potential double-lock case here, so even when
1425  * doing backlog processing we use the BH locking scheme.
1426  * This is because we cannot sleep with the original spinlock
1427  * held.
1428  */
1429 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1430 {
1431 	struct sock *rsk;
1432 
1433 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1434 		struct dst_entry *dst = sk->sk_rx_dst;
1435 
1436 		sock_rps_save_rxhash(sk, skb);
1437 		sk_mark_napi_id(sk, skb);
1438 		if (dst) {
1439 			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1440 			    dst->ops->check(dst, 0) == NULL) {
1441 				dst_release(dst);
1442 				sk->sk_rx_dst = NULL;
1443 			}
1444 		}
1445 		tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
1446 		return 0;
1447 	}
1448 
1449 	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1450 		goto csum_err;
1451 
1452 	if (sk->sk_state == TCP_LISTEN) {
1453 		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1454 		if (!nsk)
1455 			goto discard;
1456 
1457 		if (nsk != sk) {
1458 			sock_rps_save_rxhash(nsk, skb);
1459 			sk_mark_napi_id(sk, skb);
1460 			if (tcp_child_process(sk, nsk, skb)) {
1461 				rsk = nsk;
1462 				goto reset;
1463 			}
1464 			return 0;
1465 		}
1466 	} else
1467 		sock_rps_save_rxhash(sk, skb);
1468 
1469 	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1470 		rsk = sk;
1471 		goto reset;
1472 	}
1473 	return 0;
1474 
1475 reset:
1476 	tcp_v4_send_reset(rsk, skb);
1477 discard:
1478 	kfree_skb(skb);
1479 	/* Be careful here. If this function gets more complicated and
1480 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1481 	 * might be destroyed here. This current version compiles correctly,
1482 	 * but you have been warned.
1483 	 */
1484 	return 0;
1485 
1486 csum_err:
1487 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
1488 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1489 	goto discard;
1490 }
1491 EXPORT_SYMBOL(tcp_v4_do_rcv);
1492 
1493 void tcp_v4_early_demux(struct sk_buff *skb)
1494 {
1495 	const struct iphdr *iph;
1496 	const struct tcphdr *th;
1497 	struct sock *sk;
1498 
1499 	if (skb->pkt_type != PACKET_HOST)
1500 		return;
1501 
1502 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1503 		return;
1504 
1505 	iph = ip_hdr(skb);
1506 	th = tcp_hdr(skb);
1507 
1508 	if (th->doff < sizeof(struct tcphdr) / 4)
1509 		return;
1510 
1511 	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1512 				       iph->saddr, th->source,
1513 				       iph->daddr, ntohs(th->dest),
1514 				       skb->skb_iif);
1515 	if (sk) {
1516 		skb->sk = sk;
1517 		skb->destructor = sock_edemux;
1518 		if (sk->sk_state != TCP_TIME_WAIT) {
1519 			struct dst_entry *dst = sk->sk_rx_dst;
1520 
1521 			if (dst)
1522 				dst = dst_check(dst, 0);
1523 			if (dst &&
1524 			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1525 				skb_dst_set_noref(skb, dst);
1526 		}
1527 	}
1528 }
1529 
1530 /* Packet is added to VJ-style prequeue for processing in process
1531  * context, if a reader task is waiting. Apparently, this exciting
1532  * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1533  * failed somewhere. Latency? Burstiness? Well, at least now we will
1534  * see, why it failed. 8)8)				  --ANK
1535  *
1536  */
1537 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1538 {
1539 	struct tcp_sock *tp = tcp_sk(sk);
1540 
1541 	if (sysctl_tcp_low_latency || !tp->ucopy.task)
1542 		return false;
1543 
1544 	if (skb->len <= tcp_hdrlen(skb) &&
1545 	    skb_queue_len(&tp->ucopy.prequeue) == 0)
1546 		return false;
1547 
1548 	/* Before escaping RCU protected region, we need to take care of skb
1549 	 * dst. Prequeue is only enabled for established sockets.
1550 	 * For such sockets, we might need the skb dst only to set sk->sk_rx_dst
1551 	 * Instead of doing full sk_rx_dst validity here, let's perform
1552 	 * an optimistic check.
1553 	 */
1554 	if (likely(sk->sk_rx_dst))
1555 		skb_dst_drop(skb);
1556 	else
1557 		skb_dst_force(skb);
1558 
1559 	__skb_queue_tail(&tp->ucopy.prequeue, skb);
1560 	tp->ucopy.memory += skb->truesize;
1561 	if (tp->ucopy.memory > sk->sk_rcvbuf) {
1562 		struct sk_buff *skb1;
1563 
1564 		BUG_ON(sock_owned_by_user(sk));
1565 
1566 		while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
1567 			sk_backlog_rcv(sk, skb1);
1568 			NET_INC_STATS_BH(sock_net(sk),
1569 					 LINUX_MIB_TCPPREQUEUEDROPPED);
1570 		}
1571 
1572 		tp->ucopy.memory = 0;
1573 	} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
1574 		wake_up_interruptible_sync_poll(sk_sleep(sk),
1575 					   POLLIN | POLLRDNORM | POLLRDBAND);
1576 		if (!inet_csk_ack_scheduled(sk))
1577 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1578 						  (3 * tcp_rto_min(sk)) / 4,
1579 						  TCP_RTO_MAX);
1580 	}
1581 	return true;
1582 }
1583 EXPORT_SYMBOL(tcp_prequeue);
1584 
/*
 *	From tcp_input.c
 */

/*
 * Receive one TCP segment for IPv4.
 *
 * Validates the header, fills TCP_SKB_CB(skb), looks up the owning socket
 * and then either processes the segment directly, puts it on the prequeue
 * or adds it to the socket backlog.  Segments without a socket are
 * answered with a reset unless they are malformed; TIME_WAIT sockets are
 * handled at do_time_wait.
 *
 * Returns the value of tcp_v4_do_rcv() when that was called, 0 otherwise.
 */
int tcp_v4_rcv(struct sk_buff *skb)
{
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;
	int ret;
	struct net *net = dev_net(skb->dev);

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = tcp_hdr(skb);

	if (th->doff < sizeof(struct tcphdr) / 4)
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */

	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
		goto csum_error;

	/* Header pointers are fetched again after the second pull above
	 * (presumably because pskb_may_pull() may move the data).
	 */
	th = tcp_hdr(skb);
	iph = ip_hdr(skb);
	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
	 * barrier() makes sure compiler wont play fool^Waliasing games.
	 */
	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
		sizeof(struct inet_skb_parm));
	barrier();

	/* SYN and FIN each take one unit of sequence space. */
	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
	TCP_SKB_CB(skb)->sacked	 = 0;

	sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
	if (!sk)
		goto no_tcp_socket;

	/* Also entered from do_time_wait with sk replaced by a listener. */
process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
		goto discard_and_relse;
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;

#ifdef CONFIG_TCP_MD5SIG
	/*
	 * We really want to reject the packet as early as possible
	 * if:
	 *  o We're expecting an MD5'd packet and this is no MD5 tcp option
	 *  o There is an MD5 option and we're not expecting one
	 */
	if (tcp_v4_inbound_md5_hash(sk, skb))
		goto discard_and_relse;
#endif

	nf_reset(skb);

	if (sk_filter(sk, skb))
		goto discard_and_relse;

	sk_incoming_cpu_update(sk);
	skb->dev = NULL;

	/*
	 * If no user context owns the socket, process now (prequeue first,
	 * else directly); otherwise defer to the backlog, dropping the
	 * segment when the backlog limit is exceeded.
	 */
	bh_lock_sock_nested(sk);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
		if (!tcp_prequeue(sk, skb))
			ret = tcp_v4_do_rcv(sk, skb);
	} else if (unlikely(sk_add_backlog(sk, skb,
					   sk->sk_rcvbuf + sk->sk_sndbuf))) {
		bh_unlock_sock(sk);
		NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
		goto discard_and_relse;
	}
	bh_unlock_sock(sk);

	sock_put(sk);

	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	/*
	 * csum_error and bad_packet are jump targets inside this branch:
	 * a checksum error counts both CSUMERRORS and INERRS, a bad packet
	 * only INERRS.  A well formed segment gets a reset instead.
	 */
	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
csum_error:
		TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
bad_packet:
		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sock_put(sk);
	goto discard_it;

do_time_wait:
	/* Every early exit below drops the timewait socket reference first. */
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	if (skb->len < (th->doff << 2)) {
		inet_twsk_put(inet_twsk(sk));
		goto bad_packet;
	}
	if (tcp_checksum_complete(skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto csum_error;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		/* A listener may take over: retire the timewait socket and
		 * restart processing with the listener.
		 */
		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
							&tcp_hashinfo,
							iph->saddr, th->source,
							iph->daddr, th->dest,
							inet_iif(skb));
		if (sk2) {
			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
			inet_twsk_put(inet_twsk(sk));
			sk = sk2;
			goto process;
		}
		/* Fall through to ACK */
	}
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		goto no_tcp_socket;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}
1751 
/*
 * Timewait socket operations for TCP; timewait objects are sized as
 * struct tcp_timewait_sock.
 */
static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
	.twsk_unique	= tcp_twsk_unique,
	.twsk_destructor= tcp_twsk_destructor,
};
1757 
1758 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1759 {
1760 	struct dst_entry *dst = skb_dst(skb);
1761 
1762 	if (dst) {
1763 		dst_hold(dst);
1764 		sk->sk_rx_dst = dst;
1765 		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1766 	}
1767 }
1768 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1769 
/*
 * Address family operations of a TCP/IPv4 connection socket.  Installed
 * as icsk_af_ops by tcp_v4_init_sock().  The compat socket option
 * handlers exist only with CONFIG_COMPAT.
 */
const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.addr2sockaddr	   = inet_csk_addr2sockaddr,
	.sockaddr_len	   = sizeof(struct sockaddr_in),
	.bind_conflict	   = inet_csk_bind_conflict,
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_ip_setsockopt,
	.compat_getsockopt = compat_ip_getsockopt,
#endif
	.mtu_reduced	   = tcp_v4_mtu_reduced,
};
EXPORT_SYMBOL(ipv4_specific);
1790 
1791 #ifdef CONFIG_TCP_MD5SIG
/*
 * IPv4 MD5 signature hooks of a TCP socket: key lookup, hash computation
 * and parsing of the user supplied key.  Installed as af_specific by
 * tcp_v4_init_sock().
 */
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup		= tcp_v4_md5_lookup,
	.calc_md5_hash		= tcp_v4_md5_hash_skb,
	.md5_parse		= tcp_v4_parse_md5_keys,
};
1797 #endif
1798 
1799 /* NOTE: A lot of things set to zero explicitly by call to
1800  *       sk_alloc() so need not be done here.
1801  */
1802 static int tcp_v4_init_sock(struct sock *sk)
1803 {
1804 	struct inet_connection_sock *icsk = inet_csk(sk);
1805 
1806 	tcp_init_sock(sk);
1807 
1808 	icsk->icsk_af_ops = &ipv4_specific;
1809 
1810 #ifdef CONFIG_TCP_MD5SIG
1811 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1812 #endif
1813 
1814 	return 0;
1815 }
1816 
/*
 * Release the TCP state attached to sk: timers, congestion control,
 * queued skbs, MD5 keys, the bound port and a pending fastopen request.
 * Finally the allocated-sockets accounting is decremented and the memcg
 * association is released.
 */
void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	/* Cleanup up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	__skb_queue_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
	/* Clean up the MD5 key list, if any */
	if (tp->md5sig_info) {
		/* Keys first, then the list head itself after a grace period. */
		tcp_clear_md5_list(sk);
		kfree_rcu(tp->md5sig_info, rcu);
		tp->md5sig_info = NULL;
	}
#endif

	/* Clean prequeue, it must be empty really */
	__skb_queue_purge(&tp->ucopy.prequeue);

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	/* A fastopen request sock must not be attached any more. */
	BUG_ON(tp->fastopen_rsk != NULL);

	/* If socket is aborted during connect operation */
	tcp_free_fastopen_req(tp);

	sk_sockets_allocated_dec(sk);
	sock_release_memcg(sk);
}
EXPORT_SYMBOL(tcp_v4_destroy_sock);
1856 
1857 #ifdef CONFIG_PROC_FS
1858 /* Proc filesystem TCP sock list dumping. */
1859 
/*
 * Get next listener socket follow cur.  If cur is NULL, get first socket
 * starting from bucket given in st->bucket; when st->bucket is zero the
 * very first socket in the hash table is returned.
 *
 * The iterator walks listening sockets of st->family in the seq file's
 * network namespace and, for listeners with pending requests, their
 * request socks (st->state == TCP_SEQ_STATE_OPENREQ).  cur is therefore
 * either a struct sock or a struct request_sock, depending on st->state.
 *
 * Locking: a non-NULL return leaves the listening bucket lock held, and
 * in OPENREQ state also the syn_wait_lock of st->syn_wait_sk read-held.
 * They are released by a later call or by tcp_seq_stop().  A NULL return
 * means the table is exhausted and no lock is held.
 */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct inet_connection_sock *icsk;
	struct hlist_nulls_node *node;
	struct sock *sk = cur;
	struct inet_listen_hashbucket *ilb;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	if (!sk) {
		/* Fresh start: lock the bucket and begin at its head. */
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock_bh(&ilb->lock);
		sk = sk_nulls_head(&ilb->head);
		st->offset = 0;
		goto get_sk;
	}
	ilb = &tcp_hashinfo.listening_hash[st->bucket];
	++st->num;
	++st->offset;

	if (st->state == TCP_SEQ_STATE_OPENREQ) {
		struct request_sock *req = cur;

		/* Continue in the request table of the current listener. */
		icsk = inet_csk(st->syn_wait_sk);
		req = req->dl_next;
		while (1) {
			while (req) {
				if (req->rsk_ops->family == st->family) {
					cur = req;
					goto out;
				}
				req = req->dl_next;
			}
			if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
				break;
			/* Also entered from start_req with st->sbucket == 0. */
get_req:
			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
		}
		/* Requests exhausted: move on to the next listener. */
		sk	  = sk_nulls_next(st->syn_wait_sk);
		st->state = TCP_SEQ_STATE_LISTENING;
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
	} else {
		/* The listener was just reported; look at its requests next. */
		icsk = inet_csk(sk);
		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		if (reqsk_queue_len(&icsk->icsk_accept_queue))
			goto start_req;
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		sk = sk_nulls_next(sk);
	}
get_sk:
	sk_nulls_for_each_from(sk, node) {
		if (!net_eq(sock_net(sk), net))
			continue;
		if (sk->sk_family == st->family) {
			cur = sk;
			goto out;
		}
		/* Other family: the listener is skipped, its requests are not. */
		icsk = inet_csk(sk);
		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
start_req:
			st->uid		= sock_i_uid(sk);
			st->syn_wait_sk = sk;
			st->state	= TCP_SEQ_STATE_OPENREQ;
			st->sbucket	= 0;
			goto get_req;
		}
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
	}
	/* End of this bucket: unlock it and try the next one. */
	spin_unlock_bh(&ilb->lock);
	st->offset = 0;
	if (++st->bucket < INET_LHTABLE_SIZE) {
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock_bh(&ilb->lock);
		sk = sk_nulls_head(&ilb->head);
		goto get_sk;
	}
	cur = NULL;
out:
	return cur;
}
1946 
1947 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1948 {
1949 	struct tcp_iter_state *st = seq->private;
1950 	void *rc;
1951 
1952 	st->bucket = 0;
1953 	st->offset = 0;
1954 	rc = listening_get_next(seq, NULL);
1955 
1956 	while (rc && *pos) {
1957 		rc = listening_get_next(seq, rc);
1958 		--*pos;
1959 	}
1960 	return rc;
1961 }
1962 
1963 static inline bool empty_bucket(const struct tcp_iter_state *st)
1964 {
1965 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1966 }
1967 
1968 /*
1969  * Get first established socket starting from bucket given in st->bucket.
1970  * If st->bucket is zero, the very first socket in the hash is returned.
1971  */
1972 static void *established_get_first(struct seq_file *seq)
1973 {
1974 	struct tcp_iter_state *st = seq->private;
1975 	struct net *net = seq_file_net(seq);
1976 	void *rc = NULL;
1977 
1978 	st->offset = 0;
1979 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1980 		struct sock *sk;
1981 		struct hlist_nulls_node *node;
1982 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1983 
1984 		/* Lockless fast path for the common case of empty buckets */
1985 		if (empty_bucket(st))
1986 			continue;
1987 
1988 		spin_lock_bh(lock);
1989 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1990 			if (sk->sk_family != st->family ||
1991 			    !net_eq(sock_net(sk), net)) {
1992 				continue;
1993 			}
1994 			rc = sk;
1995 			goto out;
1996 		}
1997 		spin_unlock_bh(lock);
1998 	}
1999 out:
2000 	return rc;
2001 }
2002 
2003 static void *established_get_next(struct seq_file *seq, void *cur)
2004 {
2005 	struct sock *sk = cur;
2006 	struct hlist_nulls_node *node;
2007 	struct tcp_iter_state *st = seq->private;
2008 	struct net *net = seq_file_net(seq);
2009 
2010 	++st->num;
2011 	++st->offset;
2012 
2013 	sk = sk_nulls_next(sk);
2014 
2015 	sk_nulls_for_each_from(sk, node) {
2016 		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2017 			return sk;
2018 	}
2019 
2020 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2021 	++st->bucket;
2022 	return established_get_first(seq);
2023 }
2024 
2025 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2026 {
2027 	struct tcp_iter_state *st = seq->private;
2028 	void *rc;
2029 
2030 	st->bucket = 0;
2031 	rc = established_get_first(seq);
2032 
2033 	while (rc && pos) {
2034 		rc = established_get_next(seq, rc);
2035 		--pos;
2036 	}
2037 	return rc;
2038 }
2039 
2040 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2041 {
2042 	void *rc;
2043 	struct tcp_iter_state *st = seq->private;
2044 
2045 	st->state = TCP_SEQ_STATE_LISTENING;
2046 	rc	  = listening_get_idx(seq, &pos);
2047 
2048 	if (!rc) {
2049 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2050 		rc	  = established_get_idx(seq, pos);
2051 	}
2052 
2053 	return rc;
2054 }
2055 
/*
 * Try to resume the walk where the previous read stopped, using the
 * bucket and in-bucket offset saved in the iterator state instead of
 * starting over from the first bucket.
 *
 * Returns the entry found, or NULL when the saved position is no longer
 * usable.  st->num is restored to its value on entry, because the
 * get_next helpers increment it while stepping.
 */
static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket >= INET_LHTABLE_SIZE)
			break;
		/* Restart at the head of the saved bucket and step forward. */
		st->state = TCP_SEQ_STATE_LISTENING;
		rc = listening_get_next(seq, NULL);
		while (offset-- && rc)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		/* Listening part exhausted: continue with established sockets. */
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		/* Fallthrough */
	case TCP_SEQ_STATE_ESTABLISHED:
		if (st->bucket > tcp_hashinfo.ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc)
			rc = established_get_next(seq, rc);
	}

	st->num = orig_num;

	return rc;
}
2089 
2090 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2091 {
2092 	struct tcp_iter_state *st = seq->private;
2093 	void *rc;
2094 
2095 	if (*pos && *pos == st->last_pos) {
2096 		rc = tcp_seek_last_pos(seq);
2097 		if (rc)
2098 			goto out;
2099 	}
2100 
2101 	st->state = TCP_SEQ_STATE_LISTENING;
2102 	st->num = 0;
2103 	st->bucket = 0;
2104 	st->offset = 0;
2105 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2106 
2107 out:
2108 	st->last_pos = *pos;
2109 	return rc;
2110 }
2111 
2112 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2113 {
2114 	struct tcp_iter_state *st = seq->private;
2115 	void *rc = NULL;
2116 
2117 	if (v == SEQ_START_TOKEN) {
2118 		rc = tcp_get_idx(seq, 0);
2119 		goto out;
2120 	}
2121 
2122 	switch (st->state) {
2123 	case TCP_SEQ_STATE_OPENREQ:
2124 	case TCP_SEQ_STATE_LISTENING:
2125 		rc = listening_get_next(seq, v);
2126 		if (!rc) {
2127 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2128 			st->bucket = 0;
2129 			st->offset = 0;
2130 			rc	  = established_get_first(seq);
2131 		}
2132 		break;
2133 	case TCP_SEQ_STATE_ESTABLISHED:
2134 		rc = established_get_next(seq, v);
2135 		break;
2136 	}
2137 out:
2138 	++*pos;
2139 	st->last_pos = *pos;
2140 	return rc;
2141 }
2142 
2143 static void tcp_seq_stop(struct seq_file *seq, void *v)
2144 {
2145 	struct tcp_iter_state *st = seq->private;
2146 
2147 	switch (st->state) {
2148 	case TCP_SEQ_STATE_OPENREQ:
2149 		if (v) {
2150 			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2151 			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2152 		}
2153 	case TCP_SEQ_STATE_LISTENING:
2154 		if (v != SEQ_START_TOKEN)
2155 			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2156 		break;
2157 	case TCP_SEQ_STATE_ESTABLISHED:
2158 		if (v)
2159 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2160 		break;
2161 	}
2162 }
2163 
2164 int tcp_seq_open(struct inode *inode, struct file *file)
2165 {
2166 	struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2167 	struct tcp_iter_state *s;
2168 	int err;
2169 
2170 	err = seq_open_net(inode, file, &afinfo->seq_ops,
2171 			  sizeof(struct tcp_iter_state));
2172 	if (err < 0)
2173 		return err;
2174 
2175 	s = ((struct seq_file *)file->private_data)->private;
2176 	s->family		= afinfo->family;
2177 	s->last_pos		= 0;
2178 	return 0;
2179 }
2180 EXPORT_SYMBOL(tcp_seq_open);
2181 
2182 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2183 {
2184 	int rc = 0;
2185 	struct proc_dir_entry *p;
2186 
2187 	afinfo->seq_ops.start		= tcp_seq_start;
2188 	afinfo->seq_ops.next		= tcp_seq_next;
2189 	afinfo->seq_ops.stop		= tcp_seq_stop;
2190 
2191 	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2192 			     afinfo->seq_fops, afinfo);
2193 	if (!p)
2194 		rc = -ENOMEM;
2195 	return rc;
2196 }
2197 EXPORT_SYMBOL(tcp_proc_register);
2198 
2199 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2200 {
2201 	remove_proc_entry(afinfo->name, net->proc_net);
2202 }
2203 EXPORT_SYMBOL(tcp_proc_unregister);
2204 
2205 static void get_openreq4(const struct sock *sk, const struct request_sock *req,
2206 			 struct seq_file *f, int i, kuid_t uid)
2207 {
2208 	const struct inet_request_sock *ireq = inet_rsk(req);
2209 	long delta = req->expires - jiffies;
2210 
2211 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2212 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2213 		i,
2214 		ireq->ir_loc_addr,
2215 		ntohs(inet_sk(sk)->inet_sport),
2216 		ireq->ir_rmt_addr,
2217 		ntohs(ireq->ir_rmt_port),
2218 		TCP_SYN_RECV,
2219 		0, 0, /* could print option size, but that is af dependent. */
2220 		1,    /* timers active (only the expire timer) */
2221 		jiffies_delta_to_clock_t(delta),
2222 		req->num_timeout,
2223 		from_kuid_munged(seq_user_ns(f), uid),
2224 		0,  /* non standard timer */
2225 		0, /* open_requests have no inode */
2226 		atomic_read(&sk->sk_refcnt),
2227 		req);
2228 }
2229 
2230 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2231 {
2232 	int timer_active;
2233 	unsigned long timer_expires;
2234 	const struct tcp_sock *tp = tcp_sk(sk);
2235 	const struct inet_connection_sock *icsk = inet_csk(sk);
2236 	const struct inet_sock *inet = inet_sk(sk);
2237 	struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq;
2238 	__be32 dest = inet->inet_daddr;
2239 	__be32 src = inet->inet_rcv_saddr;
2240 	__u16 destp = ntohs(inet->inet_dport);
2241 	__u16 srcp = ntohs(inet->inet_sport);
2242 	int rx_queue;
2243 
2244 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2245 	    icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2246 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2247 		timer_active	= 1;
2248 		timer_expires	= icsk->icsk_timeout;
2249 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2250 		timer_active	= 4;
2251 		timer_expires	= icsk->icsk_timeout;
2252 	} else if (timer_pending(&sk->sk_timer)) {
2253 		timer_active	= 2;
2254 		timer_expires	= sk->sk_timer.expires;
2255 	} else {
2256 		timer_active	= 0;
2257 		timer_expires = jiffies;
2258 	}
2259 
2260 	if (sk->sk_state == TCP_LISTEN)
2261 		rx_queue = sk->sk_ack_backlog;
2262 	else
2263 		/*
2264 		 * because we dont lock socket, we might find a transient negative value
2265 		 */
2266 		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2267 
2268 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2269 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2270 		i, src, srcp, dest, destp, sk->sk_state,
2271 		tp->write_seq - tp->snd_una,
2272 		rx_queue,
2273 		timer_active,
2274 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2275 		icsk->icsk_retransmits,
2276 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2277 		icsk->icsk_probes_out,
2278 		sock_i_ino(sk),
2279 		atomic_read(&sk->sk_refcnt), sk,
2280 		jiffies_to_clock_t(icsk->icsk_rto),
2281 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2282 		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2283 		tp->snd_cwnd,
2284 		sk->sk_state == TCP_LISTEN ?
2285 		    (fastopenq ? fastopenq->max_qlen : 0) :
2286 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2287 }
2288 
2289 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2290 			       struct seq_file *f, int i)
2291 {
2292 	__be32 dest, src;
2293 	__u16 destp, srcp;
2294 	s32 delta = tw->tw_ttd - inet_tw_time_stamp();
2295 
2296 	dest  = tw->tw_daddr;
2297 	src   = tw->tw_rcv_saddr;
2298 	destp = ntohs(tw->tw_dport);
2299 	srcp  = ntohs(tw->tw_sport);
2300 
2301 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2302 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2303 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2304 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2305 		atomic_read(&tw->tw_refcnt), tw);
2306 }
2307 
2308 #define TMPSZ 150
2309 
2310 static int tcp4_seq_show(struct seq_file *seq, void *v)
2311 {
2312 	struct tcp_iter_state *st;
2313 	struct sock *sk = v;
2314 
2315 	seq_setwidth(seq, TMPSZ - 1);
2316 	if (v == SEQ_START_TOKEN) {
2317 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2318 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2319 			   "inode");
2320 		goto out;
2321 	}
2322 	st = seq->private;
2323 
2324 	switch (st->state) {
2325 	case TCP_SEQ_STATE_LISTENING:
2326 	case TCP_SEQ_STATE_ESTABLISHED:
2327 		if (sk->sk_state == TCP_TIME_WAIT)
2328 			get_timewait4_sock(v, seq, st->num);
2329 		else
2330 			get_tcp4_sock(v, seq, st->num);
2331 		break;
2332 	case TCP_SEQ_STATE_OPENREQ:
2333 		get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid);
2334 		break;
2335 	}
2336 out:
2337 	seq_pad(seq, '\n');
2338 	return 0;
2339 }
2340 
2341 static const struct file_operations tcp_afinfo_seq_fops = {
2342 	.owner   = THIS_MODULE,
2343 	.open    = tcp_seq_open,
2344 	.read    = seq_read,
2345 	.llseek  = seq_lseek,
2346 	.release = seq_release_net
2347 };
2348 
2349 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2350 	.name		= "tcp",
2351 	.family		= AF_INET,
2352 	.seq_fops	= &tcp_afinfo_seq_fops,
2353 	.seq_ops	= {
2354 		.show		= tcp4_seq_show,
2355 	},
2356 };
2357 
2358 static int __net_init tcp4_proc_init_net(struct net *net)
2359 {
2360 	return tcp_proc_register(net, &tcp4_seq_afinfo);
2361 }
2362 
2363 static void __net_exit tcp4_proc_exit_net(struct net *net)
2364 {
2365 	tcp_proc_unregister(net, &tcp4_seq_afinfo);
2366 }
2367 
2368 static struct pernet_operations tcp4_net_ops = {
2369 	.init = tcp4_proc_init_net,
2370 	.exit = tcp4_proc_exit_net,
2371 };
2372 
2373 int __init tcp4_proc_init(void)
2374 {
2375 	return register_pernet_subsys(&tcp4_net_ops);
2376 }
2377 
2378 void tcp4_proc_exit(void)
2379 {
2380 	unregister_pernet_subsys(&tcp4_net_ops);
2381 }
2382 #endif /* CONFIG_PROC_FS */
2383 
2384 struct proto tcp_prot = {
2385 	.name			= "TCP",
2386 	.owner			= THIS_MODULE,
2387 	.close			= tcp_close,
2388 	.connect		= tcp_v4_connect,
2389 	.disconnect		= tcp_disconnect,
2390 	.accept			= inet_csk_accept,
2391 	.ioctl			= tcp_ioctl,
2392 	.init			= tcp_v4_init_sock,
2393 	.destroy		= tcp_v4_destroy_sock,
2394 	.shutdown		= tcp_shutdown,
2395 	.setsockopt		= tcp_setsockopt,
2396 	.getsockopt		= tcp_getsockopt,
2397 	.recvmsg		= tcp_recvmsg,
2398 	.sendmsg		= tcp_sendmsg,
2399 	.sendpage		= tcp_sendpage,
2400 	.backlog_rcv		= tcp_v4_do_rcv,
2401 	.release_cb		= tcp_release_cb,
2402 	.hash			= inet_hash,
2403 	.unhash			= inet_unhash,
2404 	.get_port		= inet_csk_get_port,
2405 	.enter_memory_pressure	= tcp_enter_memory_pressure,
2406 	.stream_memory_free	= tcp_stream_memory_free,
2407 	.sockets_allocated	= &tcp_sockets_allocated,
2408 	.orphan_count		= &tcp_orphan_count,
2409 	.memory_allocated	= &tcp_memory_allocated,
2410 	.memory_pressure	= &tcp_memory_pressure,
2411 	.sysctl_mem		= sysctl_tcp_mem,
2412 	.sysctl_wmem		= sysctl_tcp_wmem,
2413 	.sysctl_rmem		= sysctl_tcp_rmem,
2414 	.max_header		= MAX_TCP_HEADER,
2415 	.obj_size		= sizeof(struct tcp_sock),
2416 	.slab_flags		= SLAB_DESTROY_BY_RCU,
2417 	.twsk_prot		= &tcp_timewait_sock_ops,
2418 	.rsk_prot		= &tcp_request_sock_ops,
2419 	.h.hashinfo		= &tcp_hashinfo,
2420 	.no_autobind		= true,
2421 #ifdef CONFIG_COMPAT
2422 	.compat_setsockopt	= compat_tcp_setsockopt,
2423 	.compat_getsockopt	= compat_tcp_getsockopt,
2424 #endif
2425 #ifdef CONFIG_MEMCG_KMEM
2426 	.init_cgroup		= tcp_init_cgroup,
2427 	.destroy_cgroup		= tcp_destroy_cgroup,
2428 	.proto_cgroup		= tcp_proto_cgroup,
2429 #endif
2430 };
2431 EXPORT_SYMBOL(tcp_prot);
2432 
2433 static void __net_exit tcp_sk_exit(struct net *net)
2434 {
2435 	int cpu;
2436 
2437 	for_each_possible_cpu(cpu)
2438 		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2439 	free_percpu(net->ipv4.tcp_sk);
2440 }
2441 
2442 static int __net_init tcp_sk_init(struct net *net)
2443 {
2444 	int res, cpu;
2445 
2446 	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2447 	if (!net->ipv4.tcp_sk)
2448 		return -ENOMEM;
2449 
2450 	for_each_possible_cpu(cpu) {
2451 		struct sock *sk;
2452 
2453 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2454 					   IPPROTO_TCP, net);
2455 		if (res)
2456 			goto fail;
2457 		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2458 	}
2459 	net->ipv4.sysctl_tcp_ecn = 2;
2460 	return 0;
2461 
2462 fail:
2463 	tcp_sk_exit(net);
2464 
2465 	return res;
2466 }
2467 
2468 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2469 {
2470 	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2471 }
2472 
2473 static struct pernet_operations __net_initdata tcp_sk_ops = {
2474        .init	   = tcp_sk_init,
2475        .exit	   = tcp_sk_exit,
2476        .exit_batch = tcp_sk_exit_batch,
2477 };
2478 
2479 void __init tcp_v4_init(void)
2480 {
2481 	inet_hashinfo_init(&tcp_hashinfo);
2482 	if (register_pernet_subsys(&tcp_sk_ops))
2483 		panic("Failed to create the TCP control socket.\n");
2484 }
2485