xref: /linux/net/ipv4/tcp_ipv4.c (revision f7308991bfeea3f6a4c6281c64fc1ba9dc6e56b3)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Implementation of the Transmission Control Protocol(TCP).
7  *
8  *		IPv4 specific functions
9  *
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  *
18  *	This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23 
24 /*
25  * Changes:
26  *		David S. Miller	:	New socket lookup architecture.
27  *					This code is dedicated to John Dyson.
28  *		David S. Miller :	Change semantics of established hash,
29  *					half is devoted to TIME_WAIT sockets
30  *					and the rest go in the other half.
31  *		Andi Kleen :		Add support for syncookies and fixed
32  *					some bugs: ip options weren't passed to
33  *					the TCP layer, missed a check for an
34  *					ACK bit.
35  *		Andi Kleen :		Implemented fast path mtu discovery.
36  *	     				Fixed many serious bugs in the
37  *					request_sock handling and moved
38  *					most of it into the af independent code.
39  *					Added tail drop and some other bugfixes.
40  *					Added new listen semantics.
41  *		Mike McLagan	:	Routing by source
42  *	Juan Jose Ciarlante:		ip_dynaddr bits
43  *		Andi Kleen:		various fixes.
44  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
45  *					coma.
46  *	Andi Kleen		:	Fix new listen.
47  *	Andi Kleen		:	Fix accept error reporting.
48  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
49  *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
50  *					a single port at the same time.
51  */
52 
53 #define pr_fmt(fmt) "TCP: " fmt
54 
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65 
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/secure_seq.h>
76 #include <net/busy_poll.h>
77 
78 #include <linux/inet.h>
79 #include <linux/ipv6.h>
80 #include <linux/stddef.h>
81 #include <linux/proc_fs.h>
82 #include <linux/seq_file.h>
83 #include <linux/inetdevice.h>
84 
85 #include <crypto/hash.h>
86 #include <linux/scatterlist.h>
87 
88 #include <trace/events/tcp.h>
89 
90 #ifdef CONFIG_TCP_MD5SIG
91 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
92 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
93 #endif
94 
95 struct inet_hashinfo tcp_hashinfo;
96 EXPORT_SYMBOL(tcp_hashinfo);
97 
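/* ISN and timestamp-offset generation for incoming connection requests:
 * both are derived from the addresses (and, for the ISN, the ports) of the
 * received SYN via the secure_seq helpers.
 */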
98 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
99 {
100 	return secure_tcp_seq(ip_hdr(skb)->daddr,
101 			      ip_hdr(skb)->saddr,
102 			      tcp_hdr(skb)->dest,
103 			      tcp_hdr(skb)->source);
104 }
105 
106 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
107 {
108 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
109 }
110 
111 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
112 {
113 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
114 	struct tcp_sock *tp = tcp_sk(sk);
115 
116 	/* With PAWS, it is safe from the viewpoint
117 	   of data integrity. Even without PAWS it is safe provided sequence
118 	   spaces do not overlap, i.e. at data rates <= 80Mbit/sec.
119 
120 	   Actually, the idea is close to VJ's, except that the timestamp cache
121 	   is held not per host but per port pair, and the TW bucket is used as
122 	   the state holder.
123 
124 	   If the TW bucket has already been destroyed we fall back to VJ's
125 	   scheme and use the initial timestamp retrieved from the peer table.
126 	 */
127 	if (tcptw->tw_ts_recent_stamp &&
128 	    (!twp || (sock_net(sk)->ipv4.sysctl_tcp_tw_reuse &&
129 			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
130 		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
131 		if (tp->write_seq == 0)
132 			tp->write_seq = 1;
133 		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
134 		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
135 		sock_hold(sktw);
136 		return 1;
137 	}
138 
139 	return 0;
140 }
141 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
142 
143 /* This will initiate an outgoing connection. */
144 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
145 {
146 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
147 	struct inet_sock *inet = inet_sk(sk);
148 	struct tcp_sock *tp = tcp_sk(sk);
149 	__be16 orig_sport, orig_dport;
150 	__be32 daddr, nexthop;
151 	struct flowi4 *fl4;
152 	struct rtable *rt;
153 	int err;
154 	struct ip_options_rcu *inet_opt;
155 	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
156 
157 	if (addr_len < sizeof(struct sockaddr_in))
158 		return -EINVAL;
159 
160 	if (usin->sin_family != AF_INET)
161 		return -EAFNOSUPPORT;
162 
163 	nexthop = daddr = usin->sin_addr.s_addr;
164 	inet_opt = rcu_dereference_protected(inet->inet_opt,
165 					     lockdep_sock_is_held(sk));
166 	if (inet_opt && inet_opt->opt.srr) {
167 		if (!daddr)
168 			return -EINVAL;
169 		nexthop = inet_opt->opt.faddr;
170 	}
171 
172 	orig_sport = inet->inet_sport;
173 	orig_dport = usin->sin_port;
174 	fl4 = &inet->cork.fl.u.ip4;
175 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
176 			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
177 			      IPPROTO_TCP,
178 			      orig_sport, orig_dport, sk);
179 	if (IS_ERR(rt)) {
180 		err = PTR_ERR(rt);
181 		if (err == -ENETUNREACH)
182 			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
183 		return err;
184 	}
185 
186 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
187 		ip_rt_put(rt);
188 		return -ENETUNREACH;
189 	}
190 
191 	if (!inet_opt || !inet_opt->opt.srr)
192 		daddr = fl4->daddr;
193 
194 	if (!inet->inet_saddr)
195 		inet->inet_saddr = fl4->saddr;
196 	sk_rcv_saddr_set(sk, inet->inet_saddr);
197 
198 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
199 		/* Reset inherited state */
200 		tp->rx_opt.ts_recent	   = 0;
201 		tp->rx_opt.ts_recent_stamp = 0;
202 		if (likely(!tp->repair))
203 			tp->write_seq	   = 0;
204 	}
205 
206 	inet->inet_dport = usin->sin_port;
207 	sk_daddr_set(sk, daddr);
208 
209 	inet_csk(sk)->icsk_ext_hdr_len = 0;
210 	if (inet_opt)
211 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
212 
213 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
214 
215 	/* Socket identity is still unknown (sport may be zero).
216 	 * However we set state to SYN-SENT and, without releasing the socket
217 	 * lock, select a source port, enter ourselves into the hash tables and
218 	 * complete initialization after this.
219 	 */
220 	tcp_set_state(sk, TCP_SYN_SENT);
221 	err = inet_hash_connect(tcp_death_row, sk);
222 	if (err)
223 		goto failure;
224 
225 	sk_set_txhash(sk);
226 
227 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
228 			       inet->inet_sport, inet->inet_dport, sk);
229 	if (IS_ERR(rt)) {
230 		err = PTR_ERR(rt);
231 		rt = NULL;
232 		goto failure;
233 	}
234 	/* OK, now commit destination to socket.  */
235 	sk->sk_gso_type = SKB_GSO_TCPV4;
236 	sk_setup_caps(sk, &rt->dst);
237 	rt = NULL;
238 
239 	if (likely(!tp->repair)) {
240 		if (!tp->write_seq)
241 			tp->write_seq = secure_tcp_seq(inet->inet_saddr,
242 						       inet->inet_daddr,
243 						       inet->inet_sport,
244 						       usin->sin_port);
245 		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
246 						 inet->inet_saddr,
247 						 inet->inet_daddr);
248 	}
249 
250 	inet->inet_id = tp->write_seq ^ jiffies;
251 
252 	if (tcp_fastopen_defer_connect(sk, &err))
253 		return err;
254 	if (err)
255 		goto failure;
256 
257 	err = tcp_connect(sk);
258 
259 	if (err)
260 		goto failure;
261 
262 	return 0;
263 
264 failure:
265 	/*
266 	 * This unhashes the socket and releases the local port,
267 	 * if necessary.
268 	 */
269 	tcp_set_state(sk, TCP_CLOSE);
270 	ip_rt_put(rt);
271 	sk->sk_route_caps = 0;
272 	inet->inet_dport = 0;
273 	return err;
274 }
275 EXPORT_SYMBOL(tcp_v4_connect);
276 
277 /*
278  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
279  * It can be called through tcp_release_cb() if the socket was owned by the
280  * user at the time tcp_v4_err() was called to handle the ICMP message.
281  */
282 void tcp_v4_mtu_reduced(struct sock *sk)
283 {
284 	struct inet_sock *inet = inet_sk(sk);
285 	struct dst_entry *dst;
286 	u32 mtu;
287 
288 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
289 		return;
290 	mtu = tcp_sk(sk)->mtu_info;
291 	dst = inet_csk_update_pmtu(sk, mtu);
292 	if (!dst)
293 		return;
294 
295 	/* Something is about to go wrong... Remember the soft error
296 	 * in case this connection is not able to recover.
297 	 */
298 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
299 		sk->sk_err_soft = EMSGSIZE;
300 
301 	mtu = dst_mtu(dst);
302 
303 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
304 	    ip_sk_accept_pmtu(sk) &&
305 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
306 		tcp_sync_mss(sk, mtu);
307 
308 		/* Resend the TCP packet because it's
309 		 * clear that the old packet has been
310 		 * dropped. This is the new "fast" path mtu
311 		 * discovery.
312 		 */
313 		tcp_simple_retransmit(sk);
314 	} /* else let the usual retransmit timer handle it */
315 }
316 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
317 
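/* Handle an ICMP redirect: if the socket still has a valid cached route,
 * let the dst's redirect handler update it.
 */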
318 static void do_redirect(struct sk_buff *skb, struct sock *sk)
319 {
320 	struct dst_entry *dst = __sk_dst_check(sk, 0);
321 
322 	if (dst)
323 		dst->ops->redirect(dst, sk, skb);
324 }
325 
326 
327 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
328 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
329 {
330 	struct request_sock *req = inet_reqsk(sk);
331 	struct net *net = sock_net(sk);
332 
333 	/* ICMPs are not backlogged, hence we cannot get
334 	 * an established socket here.
335 	 */
336 	if (seq != tcp_rsk(req)->snt_isn) {
337 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
338 	} else if (abort) {
339 		/*
340 		 * Still in SYN_RECV, just remove it silently.
341 		 * There is no good way to pass the error to the newly
342 		 * created socket, and POSIX does not want network
343 		 * errors returned from accept().
344 		 */
345 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
346 		tcp_listendrop(req->rsk_listener);
347 	}
348 	reqsk_put(req);
349 }
350 EXPORT_SYMBOL(tcp_req_err);
351 
352 /*
353  * This routine is called by the ICMP module when it gets some
354  * sort of error condition.  If err < 0 then the socket should
355  * be closed and the error returned to the user.  If err > 0
356  * it's just the icmp type << 8 | icmp code.  After adjustment
357  * header points to the first 8 bytes of the tcp header.  We need
358  * to find the appropriate port.
359  *
360  * The locking strategy used here is very "optimistic". When
361  * someone else accesses the socket the ICMP is just dropped
362  * and for some paths there is no check at all.
363  * A more general error queue to queue errors for later handling
364  * is probably better.
365  *
366  */
367 
368 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
369 {
370 	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
371 	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
372 	struct inet_connection_sock *icsk;
373 	struct tcp_sock *tp;
374 	struct inet_sock *inet;
375 	const int type = icmp_hdr(icmp_skb)->type;
376 	const int code = icmp_hdr(icmp_skb)->code;
377 	struct sock *sk;
378 	struct sk_buff *skb;
379 	struct request_sock *fastopen;
380 	u32 seq, snd_una;
381 	s32 remaining;
382 	u32 delta_us;
383 	int err;
384 	struct net *net = dev_net(icmp_skb->dev);
385 
386 	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
387 				       th->dest, iph->saddr, ntohs(th->source),
388 				       inet_iif(icmp_skb), 0);
389 	if (!sk) {
390 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
391 		return;
392 	}
393 	if (sk->sk_state == TCP_TIME_WAIT) {
394 		inet_twsk_put(inet_twsk(sk));
395 		return;
396 	}
397 	seq = ntohl(th->seq);
398 	if (sk->sk_state == TCP_NEW_SYN_RECV)
399 		return tcp_req_err(sk, seq,
400 				  type == ICMP_PARAMETERPROB ||
401 				  type == ICMP_TIME_EXCEEDED ||
402 				  (type == ICMP_DEST_UNREACH &&
403 				   (code == ICMP_NET_UNREACH ||
404 				    code == ICMP_HOST_UNREACH)));
405 
406 	bh_lock_sock(sk);
407 	/* If too many ICMPs get dropped on busy
408 	 * servers this needs to be solved differently.
409 	 * We do take care of the PMTU discovery (RFC1191) special case:
410 	 * we can receive locally generated ICMP messages while the socket is held.
411 	 */
412 	if (sock_owned_by_user(sk)) {
413 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
414 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
415 	}
416 	if (sk->sk_state == TCP_CLOSE)
417 		goto out;
418 
419 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
420 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
421 		goto out;
422 	}
423 
424 	icsk = inet_csk(sk);
425 	tp = tcp_sk(sk);
426 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
427 	fastopen = tp->fastopen_rsk;
428 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
429 	if (sk->sk_state != TCP_LISTEN &&
430 	    !between(seq, snd_una, tp->snd_nxt)) {
431 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
432 		goto out;
433 	}
434 
435 	switch (type) {
436 	case ICMP_REDIRECT:
437 		if (!sock_owned_by_user(sk))
438 			do_redirect(icmp_skb, sk);
439 		goto out;
440 	case ICMP_SOURCE_QUENCH:
441 		/* Just silently ignore these. */
442 		goto out;
443 	case ICMP_PARAMETERPROB:
444 		err = EPROTO;
445 		break;
446 	case ICMP_DEST_UNREACH:
447 		if (code > NR_ICMP_UNREACH)
448 			goto out;
449 
450 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
451 			/* We are not interested in TCP_LISTEN and open_requests
452 			 * (SYN-ACKs sent out by Linux are always < 576 bytes so
453 			 * they should go through unfragmented).
454 			 */
455 			if (sk->sk_state == TCP_LISTEN)
456 				goto out;
457 
458 			tp->mtu_info = info;
459 			if (!sock_owned_by_user(sk)) {
460 				tcp_v4_mtu_reduced(sk);
461 			} else {
462 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
463 					sock_hold(sk);
464 			}
465 			goto out;
466 		}
467 
468 		err = icmp_err_convert[code].errno;
469 		/* check if icmp_skb allows revert of backoff
470 		 * (see draft-zimmermann-tcp-lcd) */
471 		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
472 			break;
473 		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
474 		    !icsk->icsk_backoff || fastopen)
475 			break;
476 
477 		if (sock_owned_by_user(sk))
478 			break;
479 
480 		icsk->icsk_backoff--;
481 		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
482 					       TCP_TIMEOUT_INIT;
483 		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
484 
485 		skb = tcp_rtx_queue_head(sk);
486 		BUG_ON(!skb);
487 
488 		tcp_mstamp_refresh(tp);
489 		delta_us = (u32)(tp->tcp_mstamp - skb->skb_mstamp);
490 		remaining = icsk->icsk_rto -
491 			    usecs_to_jiffies(delta_us);
492 
493 		if (remaining > 0) {
494 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
495 						  remaining, TCP_RTO_MAX);
496 		} else {
497 			/* RTO revert clocked out retransmission.
498 			 * Will retransmit now */
499 			tcp_retransmit_timer(sk);
500 		}
501 
502 		break;
503 	case ICMP_TIME_EXCEEDED:
504 		err = EHOSTUNREACH;
505 		break;
506 	default:
507 		goto out;
508 	}
509 
510 	switch (sk->sk_state) {
511 	case TCP_SYN_SENT:
512 	case TCP_SYN_RECV:
513 		/* Only in fast or simultaneous open. If a fast open socket is
514 		 * already accepted it is treated as a connected one below.
515 		 */
516 		if (fastopen && !fastopen->sk)
517 			break;
518 
519 		if (!sock_owned_by_user(sk)) {
520 			sk->sk_err = err;
521 
522 			sk->sk_error_report(sk);
523 
524 			tcp_done(sk);
525 		} else {
526 			sk->sk_err_soft = err;
527 		}
528 		goto out;
529 	}
530 
531 	/* If we've already connected we will keep trying
532 	 * until we time out, or the user gives up.
533 	 *
534 	 * rfc1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH to be
535 	 * treated as hard errors (well, FRAG_FAILED too,
536 	 * but it is obsoleted by pmtu discovery).
537 	 *
538 	 * Note that in the modern internet, where routing is unreliable
539 	 * and broken firewalls sit in every dark corner sending random
540 	 * errors ordered by their masters, even these two messages have lost
541 	 * their original sense (even Linux sends invalid PORT_UNREACHs).
542 	 *
543 	 * Now we are in compliance with RFCs.
544 	 *							--ANK (980905)
545 	 */
546 
547 	inet = inet_sk(sk);
548 	if (!sock_owned_by_user(sk) && inet->recverr) {
549 		sk->sk_err = err;
550 		sk->sk_error_report(sk);
551 	} else	{ /* Only an error on timeout */
552 		sk->sk_err_soft = err;
553 	}
554 
555 out:
556 	bh_unlock_sock(sk);
557 	sock_put(sk);
558 }
559 
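/* Set up a partial checksum over the TCP pseudo-header and record where the
 * checksum must be completed (csum_start/csum_offset), so either the NIC or
 * the software checksum path can finish it.
 */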
560 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
561 {
562 	struct tcphdr *th = tcp_hdr(skb);
563 
564 	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
565 	skb->csum_start = skb_transport_header(skb) - skb->head;
566 	skb->csum_offset = offsetof(struct tcphdr, check);
567 }
568 
569 /* This routine computes an IPv4 TCP checksum. */
570 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
571 {
572 	const struct inet_sock *inet = inet_sk(sk);
573 
574 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
575 }
576 EXPORT_SYMBOL(tcp_v4_send_check);
577 
578 /*
579  *	This routine will send an RST to the other tcp.
580  *
581  *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
582  *		      for the reset?
583  *	Answer: if a packet caused the RST, it is not for a socket
584  *		existing in our system; if it is matched to a socket,
585  *		it is just a duplicate segment or a bug in the other side's TCP.
586  *		So we build the reply based only on the parameters
587  *		that arrived with the segment.
588  *	Exception: precedence violation. We do not implement it in any case.
589  */
590 
591 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
592 {
593 	const struct tcphdr *th = tcp_hdr(skb);
594 	struct {
595 		struct tcphdr th;
596 #ifdef CONFIG_TCP_MD5SIG
597 		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
598 #endif
599 	} rep;
600 	struct ip_reply_arg arg;
601 #ifdef CONFIG_TCP_MD5SIG
602 	struct tcp_md5sig_key *key = NULL;
603 	const __u8 *hash_location = NULL;
604 	unsigned char newhash[16];
605 	int genhash;
606 	struct sock *sk1 = NULL;
607 #endif
608 	struct net *net;
609 
610 	/* Never send a reset in response to a reset. */
611 	if (th->rst)
612 		return;
613 
614 	/* If sk is not NULL, it means we did a successful lookup and the incoming
615 	 * route had to be correct. prequeue might have dropped our dst.
616 	 */
617 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
618 		return;
619 
620 	/* Swap the send and the receive. */
621 	memset(&rep, 0, sizeof(rep));
622 	rep.th.dest   = th->source;
623 	rep.th.source = th->dest;
624 	rep.th.doff   = sizeof(struct tcphdr) / 4;
625 	rep.th.rst    = 1;
626 
627 	if (th->ack) {
628 		rep.th.seq = th->ack_seq;
629 	} else {
630 		rep.th.ack = 1;
631 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
632 				       skb->len - (th->doff << 2));
633 	}
634 
635 	memset(&arg, 0, sizeof(arg));
636 	arg.iov[0].iov_base = (unsigned char *)&rep;
637 	arg.iov[0].iov_len  = sizeof(rep.th);
638 
639 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
640 #ifdef CONFIG_TCP_MD5SIG
641 	rcu_read_lock();
642 	hash_location = tcp_parse_md5sig_option(th);
643 	if (sk && sk_fullsock(sk)) {
644 		key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
645 					&ip_hdr(skb)->saddr, AF_INET);
646 	} else if (hash_location) {
647 		/*
648 		 * The active side is gone. Try to find the listening socket
649 		 * via the source port, and then find the md5 key through that
650 		 * listening socket. We do not loosen security here:
651 		 * the incoming packet is checked against the md5 hash of the
652 		 * key we find, and no RST is generated if the hash doesn't match.
653 		 */
654 		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
655 					     ip_hdr(skb)->saddr,
656 					     th->source, ip_hdr(skb)->daddr,
657 					     ntohs(th->source), inet_iif(skb),
658 					     tcp_v4_sdif(skb));
659 		/* don't send rst if it can't find key */
660 		if (!sk1)
661 			goto out;
662 
663 		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
664 					&ip_hdr(skb)->saddr, AF_INET);
665 		if (!key)
666 			goto out;
667 
668 
669 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
670 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
671 			goto out;
672 
673 	}
674 
675 	if (key) {
676 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
677 				   (TCPOPT_NOP << 16) |
678 				   (TCPOPT_MD5SIG << 8) |
679 				   TCPOLEN_MD5SIG);
680 		/* Update length and the length the header thinks exists */
681 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
682 		rep.th.doff = arg.iov[0].iov_len / 4;
683 
684 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
685 				     key, ip_hdr(skb)->saddr,
686 				     ip_hdr(skb)->daddr, &rep.th);
687 	}
688 #endif
689 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
690 				      ip_hdr(skb)->saddr, /* XXX */
691 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
692 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
693 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
694 
695 	/* When the socket is gone, all binding information is lost.
696 	 * Routing might fail in this case. No choice here: if we force the
697 	 * input interface, we will misroute in case of an asymmetric route.
698 	 */
699 	if (sk) {
700 		arg.bound_dev_if = sk->sk_bound_dev_if;
701 		if (sk_fullsock(sk))
702 			trace_tcp_send_reset(sk, skb);
703 	}
704 
705 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
706 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
707 
708 	arg.tos = ip_hdr(skb)->tos;
709 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
710 	local_bh_disable();
711 	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
712 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
713 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
714 			      &arg, arg.iov[0].iov_len);
715 
716 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
717 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
718 	local_bh_enable();
719 
720 #ifdef CONFIG_TCP_MD5SIG
721 out:
722 	rcu_read_unlock();
723 #endif
724 }
725 
726 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
727    outside of socket context, is certainly ugly. What can I do?
728  */
729 
730 static void tcp_v4_send_ack(const struct sock *sk,
731 			    struct sk_buff *skb, u32 seq, u32 ack,
732 			    u32 win, u32 tsval, u32 tsecr, int oif,
733 			    struct tcp_md5sig_key *key,
734 			    int reply_flags, u8 tos)
735 {
736 	const struct tcphdr *th = tcp_hdr(skb);
737 	struct {
738 		struct tcphdr th;
739 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
740 #ifdef CONFIG_TCP_MD5SIG
741 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
742 #endif
743 			];
744 	} rep;
745 	struct net *net = sock_net(sk);
746 	struct ip_reply_arg arg;
747 
748 	memset(&rep.th, 0, sizeof(struct tcphdr));
749 	memset(&arg, 0, sizeof(arg));
750 
751 	arg.iov[0].iov_base = (unsigned char *)&rep;
752 	arg.iov[0].iov_len  = sizeof(rep.th);
753 	if (tsecr) {
754 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
755 				   (TCPOPT_TIMESTAMP << 8) |
756 				   TCPOLEN_TIMESTAMP);
757 		rep.opt[1] = htonl(tsval);
758 		rep.opt[2] = htonl(tsecr);
759 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
760 	}
761 
762 	/* Swap the send and the receive. */
763 	rep.th.dest    = th->source;
764 	rep.th.source  = th->dest;
765 	rep.th.doff    = arg.iov[0].iov_len / 4;
766 	rep.th.seq     = htonl(seq);
767 	rep.th.ack_seq = htonl(ack);
768 	rep.th.ack     = 1;
769 	rep.th.window  = htons(win);
770 
771 #ifdef CONFIG_TCP_MD5SIG
772 	if (key) {
773 		int offset = (tsecr) ? 3 : 0;
774 
775 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
776 					  (TCPOPT_NOP << 16) |
777 					  (TCPOPT_MD5SIG << 8) |
778 					  TCPOLEN_MD5SIG);
779 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
780 		rep.th.doff = arg.iov[0].iov_len/4;
781 
782 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
783 				    key, ip_hdr(skb)->saddr,
784 				    ip_hdr(skb)->daddr, &rep.th);
785 	}
786 #endif
787 	arg.flags = reply_flags;
788 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
789 				      ip_hdr(skb)->saddr, /* XXX */
790 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
791 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
792 	if (oif)
793 		arg.bound_dev_if = oif;
794 	arg.tos = tos;
795 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
796 	local_bh_disable();
797 	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
798 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
799 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
800 			      &arg, arg.iov[0].iov_len);
801 
802 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
803 	local_bh_enable();
804 }
805 
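/* ACK a segment received for a TIME-WAIT socket, echoing the window,
 * timestamps and (optional) MD5 key stored in the timewait bucket.
 */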
806 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
807 {
808 	struct inet_timewait_sock *tw = inet_twsk(sk);
809 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
810 
811 	tcp_v4_send_ack(sk, skb,
812 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
813 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
814 			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
815 			tcptw->tw_ts_recent,
816 			tw->tw_bound_dev_if,
817 			tcp_twsk_md5_key(tcptw),
818 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
819 			tw->tw_tos
820 			);
821 
822 	inet_twsk_put(tw);
823 }
824 
825 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
826 				  struct request_sock *req)
827 {
828 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
829 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
830 	 */
831 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
832 					     tcp_sk(sk)->snd_nxt;
833 
834 	/* RFC 7323 2.3
835 	 * The window field (SEG.WND) of every outgoing segment, with the
836 	 * exception of <SYN> segments, MUST be right-shifted by
837 	 * Rcv.Wind.Shift bits:
838 	 */
839 	tcp_v4_send_ack(sk, skb, seq,
840 			tcp_rsk(req)->rcv_nxt,
841 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
842 			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
843 			req->ts_recent,
844 			0,
845 			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
846 					  AF_INET),
847 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
848 			ip_hdr(skb)->tos);
849 }
850 
851 /*
852  *	Send a SYN-ACK after having received a SYN.
853  *	This still operates on a request_sock only, not on a big
854  *	socket.
855  */
856 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
857 			      struct flowi *fl,
858 			      struct request_sock *req,
859 			      struct tcp_fastopen_cookie *foc,
860 			      enum tcp_synack_type synack_type)
861 {
862 	const struct inet_request_sock *ireq = inet_rsk(req);
863 	struct flowi4 fl4;
864 	int err = -1;
865 	struct sk_buff *skb;
866 
867 	/* First, grab a route. */
868 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
869 		return -1;
870 
871 	skb = tcp_make_synack(sk, dst, req, foc, synack_type);
872 
873 	if (skb) {
874 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
875 
876 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
877 					    ireq->ir_rmt_addr,
878 					    ireq_opt_deref(ireq));
879 		err = net_xmit_eval(err);
880 	}
881 
882 	return err;
883 }
884 
885 /*
886  *	IPv4 request_sock destructor.
887  */
888 static void tcp_v4_reqsk_destructor(struct request_sock *req)
889 {
890 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
891 }
892 
893 #ifdef CONFIG_TCP_MD5SIG
894 /*
895  * RFC2385 MD5 checksumming requires a mapping of
896  * IP address->MD5 Key.
897  * We need to maintain these in the sk structure.
898  */
899 
900 /* Find the Key structure for an address.  */
901 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
902 					 const union tcp_md5_addr *addr,
903 					 int family)
904 {
905 	const struct tcp_sock *tp = tcp_sk(sk);
906 	struct tcp_md5sig_key *key;
907 	const struct tcp_md5sig_info *md5sig;
908 	__be32 mask;
909 	struct tcp_md5sig_key *best_match = NULL;
910 	bool match;
911 
912 	/* caller either holds rcu_read_lock() or socket lock */
913 	md5sig = rcu_dereference_check(tp->md5sig_info,
914 				       lockdep_sock_is_held(sk));
915 	if (!md5sig)
916 		return NULL;
917 
918 	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
919 		if (key->family != family)
920 			continue;
921 
922 		if (family == AF_INET) {
923 			mask = inet_make_mask(key->prefixlen);
924 			match = (key->addr.a4.s_addr & mask) ==
925 				(addr->a4.s_addr & mask);
926 #if IS_ENABLED(CONFIG_IPV6)
927 		} else if (family == AF_INET6) {
928 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
929 						  key->prefixlen);
930 #endif
931 		} else {
932 			match = false;
933 		}
934 
935 		if (match && (!best_match ||
936 			      key->prefixlen > best_match->prefixlen))
937 			best_match = key;
938 	}
939 	return best_match;
940 }
941 EXPORT_SYMBOL(tcp_md5_do_lookup);
942 
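/* Like tcp_md5_do_lookup(), but require an exact address and prefix-length
 * match rather than returning the longest-prefix match.
 */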
943 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
944 						      const union tcp_md5_addr *addr,
945 						      int family, u8 prefixlen)
946 {
947 	const struct tcp_sock *tp = tcp_sk(sk);
948 	struct tcp_md5sig_key *key;
949 	unsigned int size = sizeof(struct in_addr);
950 	const struct tcp_md5sig_info *md5sig;
951 
952 	/* caller either holds rcu_read_lock() or socket lock */
953 	md5sig = rcu_dereference_check(tp->md5sig_info,
954 				       lockdep_sock_is_held(sk));
955 	if (!md5sig)
956 		return NULL;
957 #if IS_ENABLED(CONFIG_IPV6)
958 	if (family == AF_INET6)
959 		size = sizeof(struct in6_addr);
960 #endif
961 	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
962 		if (key->family != family)
963 			continue;
964 		if (!memcmp(&key->addr, addr, size) &&
965 		    key->prefixlen == prefixlen)
966 			return key;
967 	}
968 	return NULL;
969 }
970 
971 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
972 					 const struct sock *addr_sk)
973 {
974 	const union tcp_md5_addr *addr;
975 
976 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
977 	return tcp_md5_do_lookup(sk, addr, AF_INET);
978 }
979 EXPORT_SYMBOL(tcp_v4_md5_lookup);
980 
981 /* This can be called on a newly created socket, from other files */
982 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
983 		   int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
984 		   gfp_t gfp)
985 {
986 	/* Add Key to the list */
987 	struct tcp_md5sig_key *key;
988 	struct tcp_sock *tp = tcp_sk(sk);
989 	struct tcp_md5sig_info *md5sig;
990 
991 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
992 	if (key) {
993 		/* Pre-existing entry - just update that one. */
994 		memcpy(key->key, newkey, newkeylen);
995 		key->keylen = newkeylen;
996 		return 0;
997 	}
998 
999 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1000 					   lockdep_sock_is_held(sk));
1001 	if (!md5sig) {
1002 		md5sig = kmalloc(sizeof(*md5sig), gfp);
1003 		if (!md5sig)
1004 			return -ENOMEM;
1005 
1006 		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1007 		INIT_HLIST_HEAD(&md5sig->head);
1008 		rcu_assign_pointer(tp->md5sig_info, md5sig);
1009 	}
1010 
1011 	key = sock_kmalloc(sk, sizeof(*key), gfp);
1012 	if (!key)
1013 		return -ENOMEM;
1014 	if (!tcp_alloc_md5sig_pool()) {
1015 		sock_kfree_s(sk, key, sizeof(*key));
1016 		return -ENOMEM;
1017 	}
1018 
1019 	memcpy(key->key, newkey, newkeylen);
1020 	key->keylen = newkeylen;
1021 	key->family = family;
1022 	key->prefixlen = prefixlen;
1023 	memcpy(&key->addr, addr,
1024 	       (family == AF_INET6) ? sizeof(struct in6_addr) :
1025 				      sizeof(struct in_addr));
1026 	hlist_add_head_rcu(&key->node, &md5sig->head);
1027 	return 0;
1028 }
1029 EXPORT_SYMBOL(tcp_md5_do_add);
1030 
1031 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1032 		   u8 prefixlen)
1033 {
1034 	struct tcp_md5sig_key *key;
1035 
1036 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1037 	if (!key)
1038 		return -ENOENT;
1039 	hlist_del_rcu(&key->node);
1040 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1041 	kfree_rcu(key, rcu);
1042 	return 0;
1043 }
1044 EXPORT_SYMBOL(tcp_md5_do_del);
1045 
1046 static void tcp_clear_md5_list(struct sock *sk)
1047 {
1048 	struct tcp_sock *tp = tcp_sk(sk);
1049 	struct tcp_md5sig_key *key;
1050 	struct hlist_node *n;
1051 	struct tcp_md5sig_info *md5sig;
1052 
1053 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1054 
1055 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1056 		hlist_del_rcu(&key->node);
1057 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1058 		kfree_rcu(key, rcu);
1059 	}
1060 }
1061 
1062 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1063 				 char __user *optval, int optlen)
1064 {
1065 	struct tcp_md5sig cmd;
1066 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1067 	u8 prefixlen = 32;
1068 
1069 	if (optlen < sizeof(cmd))
1070 		return -EINVAL;
1071 
1072 	if (copy_from_user(&cmd, optval, sizeof(cmd)))
1073 		return -EFAULT;
1074 
1075 	if (sin->sin_family != AF_INET)
1076 		return -EINVAL;
1077 
1078 	if (optname == TCP_MD5SIG_EXT &&
1079 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1080 		prefixlen = cmd.tcpm_prefixlen;
1081 		if (prefixlen > 32)
1082 			return -EINVAL;
1083 	}
1084 
1085 	if (!cmd.tcpm_keylen)
1086 		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1087 				      AF_INET, prefixlen);
1088 
1089 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1090 		return -EINVAL;
1091 
1092 	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1093 			      AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
1094 			      GFP_KERNEL);
1095 }
1096 
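/* Feed the IPv4 pseudo-header plus a copy of the TCP header (with its
 * checksum field zeroed) into the pool's MD5 hash request.
 */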
1097 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1098 				   __be32 daddr, __be32 saddr,
1099 				   const struct tcphdr *th, int nbytes)
1100 {
1101 	struct tcp4_pseudohdr *bp;
1102 	struct scatterlist sg;
1103 	struct tcphdr *_th;
1104 
1105 	bp = hp->scratch;
1106 	bp->saddr = saddr;
1107 	bp->daddr = daddr;
1108 	bp->pad = 0;
1109 	bp->protocol = IPPROTO_TCP;
1110 	bp->len = cpu_to_be16(nbytes);
1111 
1112 	_th = (struct tcphdr *)(bp + 1);
1113 	memcpy(_th, th, sizeof(*th));
1114 	_th->check = 0;
1115 
1116 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1117 	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1118 				sizeof(*bp) + sizeof(*th));
1119 	return crypto_ahash_update(hp->md5_req);
1120 }
1121 
1122 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1123 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1124 {
1125 	struct tcp_md5sig_pool *hp;
1126 	struct ahash_request *req;
1127 
1128 	hp = tcp_get_md5sig_pool();
1129 	if (!hp)
1130 		goto clear_hash_noput;
1131 	req = hp->md5_req;
1132 
1133 	if (crypto_ahash_init(req))
1134 		goto clear_hash;
1135 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1136 		goto clear_hash;
1137 	if (tcp_md5_hash_key(hp, key))
1138 		goto clear_hash;
1139 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1140 	if (crypto_ahash_final(req))
1141 		goto clear_hash;
1142 
1143 	tcp_put_md5sig_pool();
1144 	return 0;
1145 
1146 clear_hash:
1147 	tcp_put_md5sig_pool();
1148 clear_hash_noput:
1149 	memset(md5_hash, 0, 16);
1150 	return 1;
1151 }
1152 
1153 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1154 			const struct sock *sk,
1155 			const struct sk_buff *skb)
1156 {
1157 	struct tcp_md5sig_pool *hp;
1158 	struct ahash_request *req;
1159 	const struct tcphdr *th = tcp_hdr(skb);
1160 	__be32 saddr, daddr;
1161 
1162 	if (sk) { /* valid for establish/request sockets */
1163 		saddr = sk->sk_rcv_saddr;
1164 		daddr = sk->sk_daddr;
1165 	} else {
1166 		const struct iphdr *iph = ip_hdr(skb);
1167 		saddr = iph->saddr;
1168 		daddr = iph->daddr;
1169 	}
1170 
1171 	hp = tcp_get_md5sig_pool();
1172 	if (!hp)
1173 		goto clear_hash_noput;
1174 	req = hp->md5_req;
1175 
1176 	if (crypto_ahash_init(req))
1177 		goto clear_hash;
1178 
1179 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1180 		goto clear_hash;
1181 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1182 		goto clear_hash;
1183 	if (tcp_md5_hash_key(hp, key))
1184 		goto clear_hash;
1185 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1186 	if (crypto_ahash_final(req))
1187 		goto clear_hash;
1188 
1189 	tcp_put_md5sig_pool();
1190 	return 0;
1191 
1192 clear_hash:
1193 	tcp_put_md5sig_pool();
1194 clear_hash_noput:
1195 	memset(md5_hash, 0, 16);
1196 	return 1;
1197 }
1198 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1199 
1200 #endif
1201 
1202 /* Called with rcu_read_lock() */
1203 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1204 				    const struct sk_buff *skb)
1205 {
1206 #ifdef CONFIG_TCP_MD5SIG
1207 	/*
1208 	 * This gets called for each TCP segment that arrives
1209 	 * so we want to be efficient.
1210 	 * We have 3 drop cases:
1211 	 * o No MD5 hash and one expected.
1212 	 * o MD5 hash and we're not expecting one.
1213 	 * o MD5 hash and it's wrong.
1214 	 */
1215 	const __u8 *hash_location = NULL;
1216 	struct tcp_md5sig_key *hash_expected;
1217 	const struct iphdr *iph = ip_hdr(skb);
1218 	const struct tcphdr *th = tcp_hdr(skb);
1219 	int genhash;
1220 	unsigned char newhash[16];
1221 
1222 	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1223 					  AF_INET);
1224 	hash_location = tcp_parse_md5sig_option(th);
1225 
1226 	/* We've parsed the options - do we have a hash? */
1227 	if (!hash_expected && !hash_location)
1228 		return false;
1229 
1230 	if (hash_expected && !hash_location) {
1231 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1232 		return true;
1233 	}
1234 
1235 	if (!hash_expected && hash_location) {
1236 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1237 		return true;
1238 	}
1239 
1240 	/* Okay, so both hash_expected and hash_location are set -
1241 	 * we need to calculate the hash.
1242 	 */
1243 	genhash = tcp_v4_md5_hash_skb(newhash,
1244 				      hash_expected,
1245 				      NULL, skb);
1246 
1247 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1248 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1249 		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1250 				     &iph->saddr, ntohs(th->source),
1251 				     &iph->daddr, ntohs(th->dest),
1252 				     genhash ? " tcp_v4_calc_md5_hash failed"
1253 				     : "");
1254 		return true;
1255 	}
1256 	return false;
1257 #endif
1258 	return false;
1259 }
1260 
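/* Fill in the IPv4-specific parts of a freshly created request sock:
 * addresses are taken (swapped) from the incoming SYN and any IP options
 * are saved for use in the SYN-ACK.
 */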
1261 static void tcp_v4_init_req(struct request_sock *req,
1262 			    const struct sock *sk_listener,
1263 			    struct sk_buff *skb)
1264 {
1265 	struct inet_request_sock *ireq = inet_rsk(req);
1266 	struct net *net = sock_net(sk_listener);
1267 
1268 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1269 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1270 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1271 }
1272 
1273 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1274 					  struct flowi *fl,
1275 					  const struct request_sock *req)
1276 {
1277 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1278 }
1279 
1280 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1281 	.family		=	PF_INET,
1282 	.obj_size	=	sizeof(struct tcp_request_sock),
1283 	.rtx_syn_ack	=	tcp_rtx_synack,
1284 	.send_ack	=	tcp_v4_reqsk_send_ack,
1285 	.destructor	=	tcp_v4_reqsk_destructor,
1286 	.send_reset	=	tcp_v4_send_reset,
1287 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1288 };
1289 
1290 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1291 	.mss_clamp	=	TCP_MSS_DEFAULT,
1292 #ifdef CONFIG_TCP_MD5SIG
1293 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1294 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1295 #endif
1296 	.init_req	=	tcp_v4_init_req,
1297 #ifdef CONFIG_SYN_COOKIES
1298 	.cookie_init_seq =	cookie_v4_init_sequence,
1299 #endif
1300 	.route_req	=	tcp_v4_route_req,
1301 	.init_seq	=	tcp_v4_init_seq,
1302 	.init_ts_off	=	tcp_v4_init_ts_off,
1303 	.send_synack	=	tcp_v4_send_synack,
1304 };
1305 
1306 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1307 {
1308 	/* Never answer SYNs sent to broadcast or multicast addresses */
1309 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1310 		goto drop;
1311 
1312 	return tcp_conn_request(&tcp_request_sock_ops,
1313 				&tcp_request_sock_ipv4_ops, sk, skb);
1314 
1315 drop:
1316 	tcp_listendrop(sk);
1317 	return 0;
1318 }
1319 EXPORT_SYMBOL(tcp_v4_conn_request);
1320 
1321 
1322 /*
1323  * The three way handshake has completed - we got a valid synack -
1324  * now create the new socket.
1325  */
1326 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1327 				  struct request_sock *req,
1328 				  struct dst_entry *dst,
1329 				  struct request_sock *req_unhash,
1330 				  bool *own_req)
1331 {
1332 	struct inet_request_sock *ireq;
1333 	struct inet_sock *newinet;
1334 	struct tcp_sock *newtp;
1335 	struct sock *newsk;
1336 #ifdef CONFIG_TCP_MD5SIG
1337 	struct tcp_md5sig_key *key;
1338 #endif
1339 	struct ip_options_rcu *inet_opt;
1340 
1341 	if (sk_acceptq_is_full(sk))
1342 		goto exit_overflow;
1343 
1344 	newsk = tcp_create_openreq_child(sk, req, skb);
1345 	if (!newsk)
1346 		goto exit_nonewsk;
1347 
1348 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1349 	inet_sk_rx_dst_set(newsk, skb);
1350 
1351 	newtp		      = tcp_sk(newsk);
1352 	newinet		      = inet_sk(newsk);
1353 	ireq		      = inet_rsk(req);
1354 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1355 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1356 	newsk->sk_bound_dev_if = ireq->ir_iif;
1357 	newinet->inet_saddr   = ireq->ir_loc_addr;
1358 	inet_opt	      = rcu_dereference(ireq->ireq_opt);
1359 	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1360 	newinet->mc_index     = inet_iif(skb);
1361 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1362 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1363 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1364 	if (inet_opt)
1365 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1366 	newinet->inet_id = newtp->write_seq ^ jiffies;
1367 
1368 	if (!dst) {
1369 		dst = inet_csk_route_child_sock(sk, newsk, req);
1370 		if (!dst)
1371 			goto put_and_exit;
1372 	} else {
1373 		/* syncookie case : see end of cookie_v4_check() */
1374 	}
1375 	sk_setup_caps(newsk, dst);
1376 
1377 	tcp_ca_openreq_child(newsk, dst);
1378 
1379 	tcp_sync_mss(newsk, dst_mtu(dst));
1380 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1381 
1382 	tcp_initialize_rcv_mss(newsk);
1383 
1384 #ifdef CONFIG_TCP_MD5SIG
1385 	/* Copy over the MD5 key from the original socket */
1386 	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1387 				AF_INET);
1388 	if (key) {
1389 		/*
1390 		 * We're using one, so create a matching key
1391 		 * on the newsk structure. If we fail to get
1392 		 * memory, then we end up not copying the key
1393 		 * across. Shucks.
1394 		 */
1395 		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1396 			       AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
1397 		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1398 	}
1399 #endif
1400 
1401 	if (__inet_inherit_port(sk, newsk) < 0)
1402 		goto put_and_exit;
1403 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1404 	if (likely(*own_req)) {
1405 		tcp_move_syn(newtp, req);
1406 		ireq->ireq_opt = NULL;
1407 	} else {
1408 		newinet->inet_opt = NULL;
1409 	}
1410 	return newsk;
1411 
1412 exit_overflow:
1413 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1414 exit_nonewsk:
1415 	dst_release(dst);
1416 exit:
1417 	tcp_listendrop(sk);
1418 	return NULL;
1419 put_and_exit:
1420 	newinet->inet_opt = NULL;
1421 	inet_csk_prepare_forced_close(newsk);
1422 	tcp_done(newsk);
1423 	goto exit;
1424 }
1425 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1426 
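/* On a listening socket, a non-SYN segment may carry a SYN cookie in its
 * ACK number; cookie_v4_check() then validates the cookie and rebuilds the
 * connection state that was never stored locally.
 */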
1427 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1428 {
1429 #ifdef CONFIG_SYN_COOKIES
1430 	const struct tcphdr *th = tcp_hdr(skb);
1431 
1432 	if (!th->syn)
1433 		sk = cookie_v4_check(sk, skb);
1434 #endif
1435 	return sk;
1436 }
1437 
1438 /* The socket must have its spinlock held when we get
1439  * here, unless it is a TCP_LISTEN socket.
1440  *
1441  * We have a potential double-lock case here, so even when
1442  * doing backlog processing we use the BH locking scheme.
1443  * This is because we cannot sleep with the original spinlock
1444  * held.
1445  */
1446 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1447 {
1448 	struct sock *rsk;
1449 
1450 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1451 		struct dst_entry *dst = sk->sk_rx_dst;
1452 
1453 		sock_rps_save_rxhash(sk, skb);
1454 		sk_mark_napi_id(sk, skb);
1455 		if (dst) {
1456 			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1457 			    !dst->ops->check(dst, 0)) {
1458 				dst_release(dst);
1459 				sk->sk_rx_dst = NULL;
1460 			}
1461 		}
1462 		tcp_rcv_established(sk, skb, tcp_hdr(skb));
1463 		return 0;
1464 	}
1465 
1466 	if (tcp_checksum_complete(skb))
1467 		goto csum_err;
1468 
1469 	if (sk->sk_state == TCP_LISTEN) {
1470 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1471 
1472 		if (!nsk)
1473 			goto discard;
1474 		if (nsk != sk) {
1475 			if (tcp_child_process(sk, nsk, skb)) {
1476 				rsk = nsk;
1477 				goto reset;
1478 			}
1479 			return 0;
1480 		}
1481 	} else
1482 		sock_rps_save_rxhash(sk, skb);
1483 
1484 	if (tcp_rcv_state_process(sk, skb)) {
1485 		rsk = sk;
1486 		goto reset;
1487 	}
1488 	return 0;
1489 
1490 reset:
1491 	tcp_v4_send_reset(rsk, skb);
1492 discard:
1493 	kfree_skb(skb);
1494 	/* Be careful here. If this function gets more complicated and
1495 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1496 	 * might be destroyed here. This current version compiles correctly,
1497 	 * but you have been warned.
1498 	 */
1499 	return 0;
1500 
1501 csum_err:
1502 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1503 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1504 	goto discard;
1505 }
1506 EXPORT_SYMBOL(tcp_v4_do_rcv);
1507 
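/* Early demux, run before routing: if the segment belongs to an established
 * socket, attach that socket and its cached input route to the skb so the
 * normal receive path can skip both lookups.
 */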
1508 int tcp_v4_early_demux(struct sk_buff *skb)
1509 {
1510 	const struct iphdr *iph;
1511 	const struct tcphdr *th;
1512 	struct sock *sk;
1513 
1514 	if (skb->pkt_type != PACKET_HOST)
1515 		return 0;
1516 
1517 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1518 		return 0;
1519 
1520 	iph = ip_hdr(skb);
1521 	th = tcp_hdr(skb);
1522 
1523 	if (th->doff < sizeof(struct tcphdr) / 4)
1524 		return 0;
1525 
1526 	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1527 				       iph->saddr, th->source,
1528 				       iph->daddr, ntohs(th->dest),
1529 				       skb->skb_iif, inet_sdif(skb));
1530 	if (sk) {
1531 		skb->sk = sk;
1532 		skb->destructor = sock_edemux;
1533 		if (sk_fullsock(sk)) {
1534 			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1535 
1536 			if (dst)
1537 				dst = dst_check(dst, 0);
1538 			if (dst &&
1539 			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1540 				skb_dst_set_noref(skb, dst);
1541 		}
1542 	}
1543 	return 0;
1544 }
1545 
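/* Queue a segment on the backlog of a socket currently owned by user
 * context; returns true (the segment must be dropped) when the backlog
 * limit is exceeded.
 */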
1546 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1547 {
1548 	u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
1549 
1550 	/* Only socket owner can try to collapse/prune rx queues
1551 	 * to reduce memory overhead, so add a little headroom here.
1552 	 * Few sockets backlog are possibly concurrently non empty.
1553 	 */
1554 	limit += 64*1024;
1555 
1556 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1557 	 * we can fix skb->truesize to its real value to avoid future drops.
1558 	 * This is valid because skb is not yet charged to the socket.
1559 	 * It has been noticed that pure SACK packets were sometimes dropped
1560 	 * (if cooked by drivers without the copybreak feature).
1561 	 */
1562 	skb_condense(skb);
1563 
1564 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
1565 		bh_unlock_sock(sk);
1566 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1567 		return true;
1568 	}
1569 	return false;
1570 }
1571 EXPORT_SYMBOL(tcp_add_backlog);
1572 
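/* Run the socket filter on the segment; if the filter trims payload,
 * adjust end_seq so sequence accounting matches the shortened skb.
 */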
1573 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1574 {
1575 	struct tcphdr *th = (struct tcphdr *)skb->data;
1576 	unsigned int eaten = skb->len;
1577 	int err;
1578 
1579 	err = sk_filter_trim_cap(sk, skb, th->doff * 4);
1580 	if (!err) {
1581 		eaten -= skb->len;
1582 		TCP_SKB_CB(skb)->end_seq -= eaten;
1583 	}
1584 	return err;
1585 }
1586 EXPORT_SYMBOL(tcp_filter);
1587 
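/* Undo tcp_v4_fill_cb(): move the saved IP control block back to its
 * original location before the skb is looked up or queued again.
 */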
1588 static void tcp_v4_restore_cb(struct sk_buff *skb)
1589 {
1590 	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1591 		sizeof(struct inet_skb_parm));
1592 }
1593 
1594 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1595 			   const struct tcphdr *th)
1596 {
1597 	/* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB();
1598 	 * barrier() makes sure the compiler won't play fool^Waliasing games.
1599 	 */
1600 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1601 		sizeof(struct inet_skb_parm));
1602 	barrier();
1603 
1604 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1605 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1606 				    skb->len - th->doff * 4);
1607 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1608 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1609 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1610 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1611 	TCP_SKB_CB(skb)->sacked	 = 0;
1612 	TCP_SKB_CB(skb)->has_rxtstamp =
1613 			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1614 }
1615 
1616 /*
1617  *	From tcp_input.c
1618  */
1619 
1620 int tcp_v4_rcv(struct sk_buff *skb)
1621 {
1622 	struct net *net = dev_net(skb->dev);
1623 	int sdif = inet_sdif(skb);
1624 	const struct iphdr *iph;
1625 	const struct tcphdr *th;
1626 	bool refcounted;
1627 	struct sock *sk;
1628 	int ret;
1629 
1630 	if (skb->pkt_type != PACKET_HOST)
1631 		goto discard_it;
1632 
1633 	/* Count it even if it's bad */
1634 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
1635 
1636 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1637 		goto discard_it;
1638 
1639 	th = (const struct tcphdr *)skb->data;
1640 
1641 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1642 		goto bad_packet;
1643 	if (!pskb_may_pull(skb, th->doff * 4))
1644 		goto discard_it;
1645 
1646 	/* An explanation is required here, I think.
1647 	 * Packet length and doff are validated by header prediction,
1648 	 * provided the case of th->doff == 0 is eliminated.
1649 	 * So, we defer the checks. */
1650 
1651 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1652 		goto csum_error;
1653 
1654 	th = (const struct tcphdr *)skb->data;
1655 	iph = ip_hdr(skb);
1656 lookup:
1657 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1658 			       th->dest, sdif, &refcounted);
1659 	if (!sk)
1660 		goto no_tcp_socket;
1661 
1662 process:
1663 	if (sk->sk_state == TCP_TIME_WAIT)
1664 		goto do_time_wait;
1665 
1666 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
1667 		struct request_sock *req = inet_reqsk(sk);
1668 		bool req_stolen = false;
1669 		struct sock *nsk;
1670 
1671 		sk = req->rsk_listener;
1672 		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1673 			sk_drops_add(sk, skb);
1674 			reqsk_put(req);
1675 			goto discard_it;
1676 		}
1677 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
1678 			inet_csk_reqsk_queue_drop_and_put(sk, req);
1679 			goto lookup;
1680 		}
1681 		/* We own a reference on the listener, increase it again
1682 		 * as we might lose it too soon.
1683 		 */
1684 		sock_hold(sk);
1685 		refcounted = true;
1686 		nsk = NULL;
1687 		if (!tcp_filter(sk, skb)) {
1688 			th = (const struct tcphdr *)skb->data;
1689 			iph = ip_hdr(skb);
1690 			tcp_v4_fill_cb(skb, iph, th);
1691 			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
1692 		}
1693 		if (!nsk) {
1694 			reqsk_put(req);
1695 			if (req_stolen) {
1696 				/* Another cpu got exclusive access to req
1697 				 * and created a full blown socket.
1698 				 * Try to feed this packet to this socket
1699 				 * instead of discarding it.
1700 				 */
1701 				tcp_v4_restore_cb(skb);
1702 				sock_put(sk);
1703 				goto lookup;
1704 			}
1705 			goto discard_and_relse;
1706 		}
1707 		if (nsk == sk) {
1708 			reqsk_put(req);
1709 			tcp_v4_restore_cb(skb);
1710 		} else if (tcp_child_process(sk, nsk, skb)) {
1711 			tcp_v4_send_reset(nsk, skb);
1712 			goto discard_and_relse;
1713 		} else {
1714 			sock_put(sk);
1715 			return 0;
1716 		}
1717 	}
1718 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1719 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1720 		goto discard_and_relse;
1721 	}
1722 
1723 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1724 		goto discard_and_relse;
1725 
1726 	if (tcp_v4_inbound_md5_hash(sk, skb))
1727 		goto discard_and_relse;
1728 
1729 	nf_reset(skb);
1730 
1731 	if (tcp_filter(sk, skb))
1732 		goto discard_and_relse;
1733 	th = (const struct tcphdr *)skb->data;
1734 	iph = ip_hdr(skb);
1735 	tcp_v4_fill_cb(skb, iph, th);
1736 
1737 	skb->dev = NULL;
1738 
1739 	if (sk->sk_state == TCP_LISTEN) {
1740 		ret = tcp_v4_do_rcv(sk, skb);
1741 		goto put_and_return;
1742 	}
1743 
1744 	sk_incoming_cpu_update(sk);
1745 
1746 	bh_lock_sock_nested(sk);
1747 	tcp_segs_in(tcp_sk(sk), skb);
1748 	ret = 0;
1749 	if (!sock_owned_by_user(sk)) {
1750 		ret = tcp_v4_do_rcv(sk, skb);
1751 	} else if (tcp_add_backlog(sk, skb)) {
1752 		goto discard_and_relse;
1753 	}
1754 	bh_unlock_sock(sk);
1755 
1756 put_and_return:
1757 	if (refcounted)
1758 		sock_put(sk);
1759 
1760 	return ret;
1761 
1762 no_tcp_socket:
1763 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1764 		goto discard_it;
1765 
1766 	tcp_v4_fill_cb(skb, iph, th);
1767 
1768 	if (tcp_checksum_complete(skb)) {
1769 csum_error:
1770 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1771 bad_packet:
1772 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
1773 	} else {
1774 		tcp_v4_send_reset(NULL, skb);
1775 	}
1776 
1777 discard_it:
1778 	/* Discard frame. */
1779 	kfree_skb(skb);
1780 	return 0;
1781 
1782 discard_and_relse:
1783 	sk_drops_add(sk, skb);
1784 	if (refcounted)
1785 		sock_put(sk);
1786 	goto discard_it;
1787 
1788 do_time_wait:
1789 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1790 		inet_twsk_put(inet_twsk(sk));
1791 		goto discard_it;
1792 	}
1793 
1794 	tcp_v4_fill_cb(skb, iph, th);
1795 
1796 	if (tcp_checksum_complete(skb)) {
1797 		inet_twsk_put(inet_twsk(sk));
1798 		goto csum_error;
1799 	}
1800 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1801 	case TCP_TW_SYN: {
1802 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1803 							&tcp_hashinfo, skb,
1804 							__tcp_hdrlen(th),
1805 							iph->saddr, th->source,
1806 							iph->daddr, th->dest,
1807 							inet_iif(skb),
1808 							sdif);
1809 		if (sk2) {
1810 			inet_twsk_deschedule_put(inet_twsk(sk));
1811 			sk = sk2;
1812 			tcp_v4_restore_cb(skb);
1813 			refcounted = false;
1814 			goto process;
1815 		}
1816 	}
1817 		/* to ACK */
1818 		/* fall through */
1819 	case TCP_TW_ACK:
1820 		tcp_v4_timewait_ack(sk, skb);
1821 		break;
1822 	case TCP_TW_RST:
1823 		tcp_v4_send_reset(sk, skb);
1824 		inet_twsk_deschedule_put(inet_twsk(sk));
1825 		goto discard_it;
1826 	case TCP_TW_SUCCESS:;
1827 	}
1828 	goto discard_it;
1829 }
1830 
1831 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1832 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
1833 	.twsk_unique	= tcp_twsk_unique,
1834 	.twsk_destructor= tcp_twsk_destructor,
1835 };
1836 
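/* Cache the skb's input route on the socket (taking a reference) so the
 * established fast path can reuse it instead of doing a route lookup.
 */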
1837 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1838 {
1839 	struct dst_entry *dst = skb_dst(skb);
1840 
1841 	if (dst && dst_hold_safe(dst)) {
1842 		sk->sk_rx_dst = dst;
1843 		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1844 	}
1845 }
1846 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1847 
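/* IPv4 glue for the address-family independent connection code; installed
 * per socket by tcp_v4_init_sock() below.
 */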
1848 const struct inet_connection_sock_af_ops ipv4_specific = {
1849 	.queue_xmit	   = ip_queue_xmit,
1850 	.send_check	   = tcp_v4_send_check,
1851 	.rebuild_header	   = inet_sk_rebuild_header,
1852 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
1853 	.conn_request	   = tcp_v4_conn_request,
1854 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
1855 	.net_header_len	   = sizeof(struct iphdr),
1856 	.setsockopt	   = ip_setsockopt,
1857 	.getsockopt	   = ip_getsockopt,
1858 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
1859 	.sockaddr_len	   = sizeof(struct sockaddr_in),
1860 #ifdef CONFIG_COMPAT
1861 	.compat_setsockopt = compat_ip_setsockopt,
1862 	.compat_getsockopt = compat_ip_getsockopt,
1863 #endif
1864 	.mtu_reduced	   = tcp_v4_mtu_reduced,
1865 };
1866 EXPORT_SYMBOL(ipv4_specific);
1867 
1868 #ifdef CONFIG_TCP_MD5SIG
1869 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1870 	.md5_lookup		= tcp_v4_md5_lookup,
1871 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
1872 	.md5_parse		= tcp_v4_parse_md5_keys,
1873 };
1874 #endif
1875 
1876 /* NOTE: A lot of things are set to zero explicitly by the call to
1877  *       sk_alloc(), so they need not be done here.
1878  */
1879 static int tcp_v4_init_sock(struct sock *sk)
1880 {
1881 	struct inet_connection_sock *icsk = inet_csk(sk);
1882 
1883 	tcp_init_sock(sk);
1884 
1885 	icsk->icsk_af_ops = &ipv4_specific;
1886 
1887 #ifdef CONFIG_TCP_MD5SIG
1888 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1889 #endif
1890 
1891 	return 0;
1892 }
1893 
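/* Per-socket teardown, hooked up as tcp_prot.destroy below: stop the timers,
 * release congestion-control and ULP state, purge the queues, and drop the
 * bound port and any MD5 keys.
 */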
1894 void tcp_v4_destroy_sock(struct sock *sk)
1895 {
1896 	struct tcp_sock *tp = tcp_sk(sk);
1897 
1898 	trace_tcp_destroy_sock(sk);
1899 
1900 	tcp_clear_xmit_timers(sk);
1901 
1902 	tcp_cleanup_congestion_control(sk);
1903 
1904 	tcp_cleanup_ulp(sk);
1905 
1906 	/* Clean up the write buffer. */
1907 	tcp_write_queue_purge(sk);
1908 
1909 	/* Check if we want to disable active TFO */
1910 	tcp_fastopen_active_disable_ofo_check(sk);
1911 
1912 	/* Cleans up our, hopefully empty, out_of_order_queue. */
1913 	skb_rbtree_purge(&tp->out_of_order_queue);
1914 
1915 #ifdef CONFIG_TCP_MD5SIG
1916 	/* Clean up the MD5 key list, if any */
1917 	if (tp->md5sig_info) {
1918 		tcp_clear_md5_list(sk);
1919 		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
1920 		tp->md5sig_info = NULL;
1921 	}
1922 #endif
1923 
1924 	/* Clean up a referenced TCP bind bucket. */
1925 	if (inet_csk(sk)->icsk_bind_hash)
1926 		inet_put_port(sk);
1927 
1928 	BUG_ON(tp->fastopen_rsk);
1929 
1930 	/* If socket is aborted during connect operation */
1931 	tcp_free_fastopen_req(tp);
1932 	tcp_fastopen_destroy_cipher(sk);
1933 	tcp_saved_syn_free(tp);
1934 
1935 	sk_sockets_allocated_dec(sk);
1936 }
1937 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1938 
1939 #ifdef CONFIG_PROC_FS
1940 /* Proc filesystem TCP sock list dumping. */
1941 
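/*
 * The dump runs in two passes: first the listening hash
 * (TCP_SEQ_STATE_LISTENING), then the established hash, which also holds
 * TIME_WAIT and NEW_SYN_RECV entries (TCP_SEQ_STATE_ESTABLISHED).
 * st->bucket, st->offset and st->num record the current position so a later
 * read() can resume close to where the previous one stopped, see
 * tcp_seek_last_pos().
 */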
1942 /*
1943  * Get the next listener socket following cur.  If cur is NULL, get the
1944  * first socket starting from the bucket given in st->bucket; when
1945  * st->bucket is zero, the very first socket in the hash table is returned.
1946  */
1947 static void *listening_get_next(struct seq_file *seq, void *cur)
1948 {
1949 	struct tcp_iter_state *st = seq->private;
1950 	struct net *net = seq_file_net(seq);
1951 	struct inet_listen_hashbucket *ilb;
1952 	struct sock *sk = cur;
1953 
1954 	if (!sk) {
1955 get_head:
1956 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
1957 		spin_lock(&ilb->lock);
1958 		sk = sk_head(&ilb->head);
1959 		st->offset = 0;
1960 		goto get_sk;
1961 	}
1962 	ilb = &tcp_hashinfo.listening_hash[st->bucket];
1963 	++st->num;
1964 	++st->offset;
1965 
1966 	sk = sk_next(sk);
1967 get_sk:
1968 	sk_for_each_from(sk) {
1969 		if (!net_eq(sock_net(sk), net))
1970 			continue;
1971 		if (sk->sk_family == st->family)
1972 			return sk;
1973 	}
1974 	spin_unlock(&ilb->lock);
1975 	st->offset = 0;
1976 	if (++st->bucket < INET_LHTABLE_SIZE)
1977 		goto get_head;
1978 	return NULL;
1979 }
1980 
1981 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1982 {
1983 	struct tcp_iter_state *st = seq->private;
1984 	void *rc;
1985 
1986 	st->bucket = 0;
1987 	st->offset = 0;
1988 	rc = listening_get_next(seq, NULL);
1989 
1990 	while (rc && *pos) {
1991 		rc = listening_get_next(seq, rc);
1992 		--*pos;
1993 	}
1994 	return rc;
1995 }
1996 
1997 static inline bool empty_bucket(const struct tcp_iter_state *st)
1998 {
1999 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2000 }
2001 
2002 /*
2003  * Get first established socket starting from bucket given in st->bucket.
2004  * If st->bucket is zero, the very first socket in the hash is returned.
2005  */
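/* A bucket whose chain yields a matching socket is returned with its lock
 * held; the lock is dropped in established_get_next() when the walk moves on
 * to the next bucket, or in tcp_seq_stop().
 */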
2006 static void *established_get_first(struct seq_file *seq)
2007 {
2008 	struct tcp_iter_state *st = seq->private;
2009 	struct net *net = seq_file_net(seq);
2010 	void *rc = NULL;
2011 
2012 	st->offset = 0;
2013 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2014 		struct sock *sk;
2015 		struct hlist_nulls_node *node;
2016 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2017 
2018 		/* Lockless fast path for the common case of empty buckets */
2019 		if (empty_bucket(st))
2020 			continue;
2021 
2022 		spin_lock_bh(lock);
2023 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2024 			if (sk->sk_family != st->family ||
2025 			    !net_eq(sock_net(sk), net)) {
2026 				continue;
2027 			}
2028 			rc = sk;
2029 			goto out;
2030 		}
2031 		spin_unlock_bh(lock);
2032 	}
2033 out:
2034 	return rc;
2035 }
2036 
2037 static void *established_get_next(struct seq_file *seq, void *cur)
2038 {
2039 	struct sock *sk = cur;
2040 	struct hlist_nulls_node *node;
2041 	struct tcp_iter_state *st = seq->private;
2042 	struct net *net = seq_file_net(seq);
2043 
2044 	++st->num;
2045 	++st->offset;
2046 
2047 	sk = sk_nulls_next(sk);
2048 
2049 	sk_nulls_for_each_from(sk, node) {
2050 		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2051 			return sk;
2052 	}
2053 
2054 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2055 	++st->bucket;
2056 	return established_get_first(seq);
2057 }
2058 
2059 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2060 {
2061 	struct tcp_iter_state *st = seq->private;
2062 	void *rc;
2063 
2064 	st->bucket = 0;
2065 	rc = established_get_first(seq);
2066 
2067 	while (rc && pos) {
2068 		rc = established_get_next(seq, rc);
2069 		--pos;
2070 	}
2071 	return rc;
2072 }
2073 
2074 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2075 {
2076 	void *rc;
2077 	struct tcp_iter_state *st = seq->private;
2078 
2079 	st->state = TCP_SEQ_STATE_LISTENING;
2080 	rc	  = listening_get_idx(seq, &pos);
2081 
2082 	if (!rc) {
2083 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2084 		rc	  = established_get_idx(seq, pos);
2085 	}
2086 
2087 	return rc;
2088 }
2089 
2090 static void *tcp_seek_last_pos(struct seq_file *seq)
2091 {
2092 	struct tcp_iter_state *st = seq->private;
2093 	int offset = st->offset;
2094 	int orig_num = st->num;
2095 	void *rc = NULL;
2096 
2097 	switch (st->state) {
2098 	case TCP_SEQ_STATE_LISTENING:
2099 		if (st->bucket >= INET_LHTABLE_SIZE)
2100 			break;
2101 		st->state = TCP_SEQ_STATE_LISTENING;
2102 		rc = listening_get_next(seq, NULL);
2103 		while (offset-- && rc)
2104 			rc = listening_get_next(seq, rc);
2105 		if (rc)
2106 			break;
2107 		st->bucket = 0;
2108 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2109 		/* Fallthrough */
2110 	case TCP_SEQ_STATE_ESTABLISHED:
2111 		if (st->bucket > tcp_hashinfo.ehash_mask)
2112 			break;
2113 		rc = established_get_first(seq);
2114 		while (offset-- && rc)
2115 			rc = established_get_next(seq, rc);
2116 	}
2117 
2118 	st->num = orig_num;
2119 
2120 	return rc;
2121 }
2122 
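/* seq_file iterator: ->start returns SEQ_START_TOKEN for the header line,
 * ->next walks listening then established sockets, and ->stop drops any
 * bucket lock still held.  last_pos lets consecutive reads of the file
 * continue without rescanning from the first bucket.
 */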
2123 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2124 {
2125 	struct tcp_iter_state *st = seq->private;
2126 	void *rc;
2127 
2128 	if (*pos && *pos == st->last_pos) {
2129 		rc = tcp_seek_last_pos(seq);
2130 		if (rc)
2131 			goto out;
2132 	}
2133 
2134 	st->state = TCP_SEQ_STATE_LISTENING;
2135 	st->num = 0;
2136 	st->bucket = 0;
2137 	st->offset = 0;
2138 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2139 
2140 out:
2141 	st->last_pos = *pos;
2142 	return rc;
2143 }
2144 
2145 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2146 {
2147 	struct tcp_iter_state *st = seq->private;
2148 	void *rc = NULL;
2149 
2150 	if (v == SEQ_START_TOKEN) {
2151 		rc = tcp_get_idx(seq, 0);
2152 		goto out;
2153 	}
2154 
2155 	switch (st->state) {
2156 	case TCP_SEQ_STATE_LISTENING:
2157 		rc = listening_get_next(seq, v);
2158 		if (!rc) {
2159 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2160 			st->bucket = 0;
2161 			st->offset = 0;
2162 			rc	  = established_get_first(seq);
2163 		}
2164 		break;
2165 	case TCP_SEQ_STATE_ESTABLISHED:
2166 		rc = established_get_next(seq, v);
2167 		break;
2168 	}
2169 out:
2170 	++*pos;
2171 	st->last_pos = *pos;
2172 	return rc;
2173 }
2174 
2175 static void tcp_seq_stop(struct seq_file *seq, void *v)
2176 {
2177 	struct tcp_iter_state *st = seq->private;
2178 
2179 	switch (st->state) {
2180 	case TCP_SEQ_STATE_LISTENING:
2181 		if (v != SEQ_START_TOKEN)
2182 			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2183 		break;
2184 	case TCP_SEQ_STATE_ESTABLISHED:
2185 		if (v)
2186 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2187 		break;
2188 	}
2189 }
2190 
2191 int tcp_seq_open(struct inode *inode, struct file *file)
2192 {
2193 	struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2194 	struct tcp_iter_state *s;
2195 	int err;
2196 
2197 	err = seq_open_net(inode, file, &afinfo->seq_ops,
2198 			  sizeof(struct tcp_iter_state));
2199 	if (err < 0)
2200 		return err;
2201 
2202 	s = ((struct seq_file *)file->private_data)->private;
2203 	s->family		= afinfo->family;
2204 	s->last_pos		= 0;
2205 	return 0;
2206 }
2207 EXPORT_SYMBOL(tcp_seq_open);
2208 
2209 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2210 {
2211 	int rc = 0;
2212 	struct proc_dir_entry *p;
2213 
2214 	afinfo->seq_ops.start		= tcp_seq_start;
2215 	afinfo->seq_ops.next		= tcp_seq_next;
2216 	afinfo->seq_ops.stop		= tcp_seq_stop;
2217 
2218 	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2219 			     afinfo->seq_fops, afinfo);
2220 	if (!p)
2221 		rc = -ENOMEM;
2222 	return rc;
2223 }
2224 EXPORT_SYMBOL(tcp_proc_register);
2225 
2226 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2227 {
2228 	remove_proc_entry(afinfo->name, net->proc_net);
2229 }
2230 EXPORT_SYMBOL(tcp_proc_unregister);
2231 
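/* The three helpers below each format one row of the dump: get_openreq4()
 * for request sockets (SYN_RECV), get_tcp4_sock() for full sockets and
 * get_timewait4_sock() for TIME_WAIT minisocks.
 */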
2232 static void get_openreq4(const struct request_sock *req,
2233 			 struct seq_file *f, int i)
2234 {
2235 	const struct inet_request_sock *ireq = inet_rsk(req);
2236 	long delta = req->rsk_timer.expires - jiffies;
2237 
2238 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2239 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2240 		i,
2241 		ireq->ir_loc_addr,
2242 		ireq->ir_num,
2243 		ireq->ir_rmt_addr,
2244 		ntohs(ireq->ir_rmt_port),
2245 		TCP_SYN_RECV,
2246 		0, 0, /* could print option size, but that is af dependent. */
2247 		1,    /* timers active (only the expire timer) */
2248 		jiffies_delta_to_clock_t(delta),
2249 		req->num_timeout,
2250 		from_kuid_munged(seq_user_ns(f),
2251 				 sock_i_uid(req->rsk_listener)),
2252 		0,  /* non standard timer */
2253 		0, /* open_requests have no inode */
2254 		0,
2255 		req);
2256 }
2257 
2258 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2259 {
2260 	int timer_active;
2261 	unsigned long timer_expires;
2262 	const struct tcp_sock *tp = tcp_sk(sk);
2263 	const struct inet_connection_sock *icsk = inet_csk(sk);
2264 	const struct inet_sock *inet = inet_sk(sk);
2265 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2266 	__be32 dest = inet->inet_daddr;
2267 	__be32 src = inet->inet_rcv_saddr;
2268 	__u16 destp = ntohs(inet->inet_dport);
2269 	__u16 srcp = ntohs(inet->inet_sport);
2270 	int rx_queue;
2271 	int state;
2272 
2273 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2274 	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2275 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2276 		timer_active	= 1;
2277 		timer_expires	= icsk->icsk_timeout;
2278 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2279 		timer_active	= 4;
2280 		timer_expires	= icsk->icsk_timeout;
2281 	} else if (timer_pending(&sk->sk_timer)) {
2282 		timer_active	= 2;
2283 		timer_expires	= sk->sk_timer.expires;
2284 	} else {
2285 		timer_active	= 0;
2286 		timer_expires = jiffies;
2287 	}
2288 
2289 	state = inet_sk_state_load(sk);
2290 	if (state == TCP_LISTEN)
2291 		rx_queue = sk->sk_ack_backlog;
2292 	else
2293 		/* Because we don't lock the socket,
2294 		 * we might find a transient negative value.
2295 		 */
2296 		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2297 
2298 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2299 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2300 		i, src, srcp, dest, destp, state,
2301 		tp->write_seq - tp->snd_una,
2302 		rx_queue,
2303 		timer_active,
2304 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2305 		icsk->icsk_retransmits,
2306 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2307 		icsk->icsk_probes_out,
2308 		sock_i_ino(sk),
2309 		refcount_read(&sk->sk_refcnt), sk,
2310 		jiffies_to_clock_t(icsk->icsk_rto),
2311 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2312 		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2313 		tp->snd_cwnd,
2314 		state == TCP_LISTEN ?
2315 		    fastopenq->max_qlen :
2316 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2317 }
2318 
2319 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2320 			       struct seq_file *f, int i)
2321 {
2322 	long delta = tw->tw_timer.expires - jiffies;
2323 	__be32 dest, src;
2324 	__u16 destp, srcp;
2325 
2326 	dest  = tw->tw_daddr;
2327 	src   = tw->tw_rcv_saddr;
2328 	destp = ntohs(tw->tw_dport);
2329 	srcp  = ntohs(tw->tw_sport);
2330 
2331 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2332 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2333 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2334 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2335 		refcount_read(&tw->tw_refcnt), tw);
2336 }
2337 
2338 #define TMPSZ 150
2339 
2340 static int tcp4_seq_show(struct seq_file *seq, void *v)
2341 {
2342 	struct tcp_iter_state *st;
2343 	struct sock *sk = v;
2344 
2345 	seq_setwidth(seq, TMPSZ - 1);
2346 	if (v == SEQ_START_TOKEN) {
2347 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2348 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2349 			   "inode");
2350 		goto out;
2351 	}
2352 	st = seq->private;
2353 
2354 	if (sk->sk_state == TCP_TIME_WAIT)
2355 		get_timewait4_sock(v, seq, st->num);
2356 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2357 		get_openreq4(v, seq, st->num);
2358 	else
2359 		get_tcp4_sock(v, seq, st->num);
2360 out:
2361 	seq_pad(seq, '\n');
2362 	return 0;
2363 }
2364 
2365 static const struct file_operations tcp_afinfo_seq_fops = {
2366 	.open    = tcp_seq_open,
2367 	.read    = seq_read,
2368 	.llseek  = seq_lseek,
2369 	.release = seq_release_net
2370 };
2371 
2372 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2373 	.name		= "tcp",
2374 	.family		= AF_INET,
2375 	.seq_fops	= &tcp_afinfo_seq_fops,
2376 	.seq_ops	= {
2377 		.show		= tcp4_seq_show,
2378 	},
2379 };
2380 
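/*
 * For reference, a minimal userspace sketch (not part of this file) showing
 * how the /proc/net/tcp rows emitted by tcp4_seq_show() above are typically
 * consumed; the field layout follows get_tcp4_sock()'s format string, with
 * addresses printed as raw __be32 values in %08X and ports in %04X:
 *
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		unsigned int laddr, lport, raddr, rport, state;
 *		char line[256];
 *		FILE *f = fopen("/proc/net/tcp", "r");
 *
 *		if (!f)
 *			return 1;
 *		fgets(line, sizeof(line), f);		// skip the header row
 *		while (fgets(line, sizeof(line), f)) {
 *			if (sscanf(line, "%*d: %8X:%4X %8X:%4X %2X",
 *				   &laddr, &lport, &raddr, &rport, &state) == 5)
 *				printf("%08X:%u -> %08X:%u state %02X\n",
 *				       laddr, lport, raddr, rport, state);
 *		}
 *		fclose(f);
 *		return 0;
 *	}
 */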
2381 static int __net_init tcp4_proc_init_net(struct net *net)
2382 {
2383 	return tcp_proc_register(net, &tcp4_seq_afinfo);
2384 }
2385 
2386 static void __net_exit tcp4_proc_exit_net(struct net *net)
2387 {
2388 	tcp_proc_unregister(net, &tcp4_seq_afinfo);
2389 }
2390 
2391 static struct pernet_operations tcp4_net_ops = {
2392 	.init = tcp4_proc_init_net,
2393 	.exit = tcp4_proc_exit_net,
2394 	.async = true,
2395 };
2396 
2397 int __init tcp4_proc_init(void)
2398 {
2399 	return register_pernet_subsys(&tcp4_net_ops);
2400 }
2401 
2402 void tcp4_proc_exit(void)
2403 {
2404 	unregister_pernet_subsys(&tcp4_net_ops);
2405 }
2406 #endif /* CONFIG_PROC_FS */
2407 
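/* The IPv4 TCP protocol descriptor handed to the socket layer (registered
 * from af_inet.c); the hooks below connect the generic socket calls to the
 * TCP implementation in this file and in tcp.c.
 */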
2408 struct proto tcp_prot = {
2409 	.name			= "TCP",
2410 	.owner			= THIS_MODULE,
2411 	.close			= tcp_close,
2412 	.connect		= tcp_v4_connect,
2413 	.disconnect		= tcp_disconnect,
2414 	.accept			= inet_csk_accept,
2415 	.ioctl			= tcp_ioctl,
2416 	.init			= tcp_v4_init_sock,
2417 	.destroy		= tcp_v4_destroy_sock,
2418 	.shutdown		= tcp_shutdown,
2419 	.setsockopt		= tcp_setsockopt,
2420 	.getsockopt		= tcp_getsockopt,
2421 	.keepalive		= tcp_set_keepalive,
2422 	.recvmsg		= tcp_recvmsg,
2423 	.sendmsg		= tcp_sendmsg,
2424 	.sendpage		= tcp_sendpage,
2425 	.backlog_rcv		= tcp_v4_do_rcv,
2426 	.release_cb		= tcp_release_cb,
2427 	.hash			= inet_hash,
2428 	.unhash			= inet_unhash,
2429 	.get_port		= inet_csk_get_port,
2430 	.enter_memory_pressure	= tcp_enter_memory_pressure,
2431 	.leave_memory_pressure	= tcp_leave_memory_pressure,
2432 	.stream_memory_free	= tcp_stream_memory_free,
2433 	.sockets_allocated	= &tcp_sockets_allocated,
2434 	.orphan_count		= &tcp_orphan_count,
2435 	.memory_allocated	= &tcp_memory_allocated,
2436 	.memory_pressure	= &tcp_memory_pressure,
2437 	.sysctl_mem		= sysctl_tcp_mem,
2438 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
2439 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
2440 	.max_header		= MAX_TCP_HEADER,
2441 	.obj_size		= sizeof(struct tcp_sock),
2442 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
2443 	.twsk_prot		= &tcp_timewait_sock_ops,
2444 	.rsk_prot		= &tcp_request_sock_ops,
2445 	.h.hashinfo		= &tcp_hashinfo,
2446 	.no_autobind		= true,
2447 #ifdef CONFIG_COMPAT
2448 	.compat_setsockopt	= compat_tcp_setsockopt,
2449 	.compat_getsockopt	= compat_tcp_getsockopt,
2450 #endif
2451 	.diag_destroy		= tcp_abort,
2452 };
2453 EXPORT_SYMBOL(tcp_prot);
2454 
2455 static void __net_exit tcp_sk_exit(struct net *net)
2456 {
2457 	int cpu;
2458 
2459 	module_put(net->ipv4.tcp_congestion_control->owner);
2460 
2461 	for_each_possible_cpu(cpu)
2462 		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2463 	free_percpu(net->ipv4.tcp_sk);
2464 }
2465 
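/* Per-netns setup: create one kernel control socket per possible CPU (used
 * to send RSTs and ACKs on behalf of no socket, see tcp_v4_send_reset())
 * and fill in the per-namespace sysctl defaults.
 */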
2466 static int __net_init tcp_sk_init(struct net *net)
2467 {
2468 	int res, cpu, cnt;
2469 
2470 	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2471 	if (!net->ipv4.tcp_sk)
2472 		return -ENOMEM;
2473 
2474 	for_each_possible_cpu(cpu) {
2475 		struct sock *sk;
2476 
2477 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2478 					   IPPROTO_TCP, net);
2479 		if (res)
2480 			goto fail;
2481 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2482 		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2483 	}
2484 
2485 	net->ipv4.sysctl_tcp_ecn = 2;
2486 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
2487 
2488 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2489 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2490 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2491 
2492 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2493 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2494 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2495 
2496 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2497 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2498 	net->ipv4.sysctl_tcp_syncookies = 1;
2499 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2500 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2501 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2502 	net->ipv4.sysctl_tcp_orphan_retries = 0;
2503 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2504 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2505 	net->ipv4.sysctl_tcp_tw_reuse = 0;
2506 
2507 	cnt = tcp_hashinfo.ehash_mask + 1;
2508 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
2509 	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2510 
2511 	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
2512 	net->ipv4.sysctl_tcp_sack = 1;
2513 	net->ipv4.sysctl_tcp_window_scaling = 1;
2514 	net->ipv4.sysctl_tcp_timestamps = 1;
2515 	net->ipv4.sysctl_tcp_early_retrans = 3;
2516 	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2517 	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
2518 	net->ipv4.sysctl_tcp_retrans_collapse = 1;
2519 	net->ipv4.sysctl_tcp_max_reordering = 300;
2520 	net->ipv4.sysctl_tcp_dsack = 1;
2521 	net->ipv4.sysctl_tcp_app_win = 31;
2522 	net->ipv4.sysctl_tcp_adv_win_scale = 1;
2523 	net->ipv4.sysctl_tcp_frto = 2;
2524 	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2525 	/* This limits the percentage of the congestion window which we
2526 	 * will allow a single TSO frame to consume.  Building TSO frames
2527 	 * which are too large can cause TCP streams to be bursty.
2528 	 */
2529 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2530 	/* Default TSQ limit of four TSO segments */
2531 	net->ipv4.sysctl_tcp_limit_output_bytes = 262144;
2532 	/* rfc5961 challenge ack rate limiting */
2533 	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2534 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
2535 	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2536 	net->ipv4.sysctl_tcp_autocorking = 1;
2537 	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2538 	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2539 	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2540 	if (net != &init_net) {
2541 		memcpy(net->ipv4.sysctl_tcp_rmem,
2542 		       init_net.ipv4.sysctl_tcp_rmem,
2543 		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
2544 		memcpy(net->ipv4.sysctl_tcp_wmem,
2545 		       init_net.ipv4.sysctl_tcp_wmem,
2546 		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
2547 	}
2548 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2549 	spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2550 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2551 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2552 
2553 	/* Reno is always built in */
2554 	if (!net_eq(net, &init_net) &&
2555 	    try_module_get(init_net.ipv4.tcp_congestion_control->owner))
2556 		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2557 	else
2558 		net->ipv4.tcp_congestion_control = &tcp_reno;
2559 
2560 	return 0;
2561 fail:
2562 	tcp_sk_exit(net);
2563 
2564 	return res;
2565 }
2566 
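/* Batched netns teardown: purge any TIME_WAIT sockets still alive and free
 * the TCP fastopen contexts of the namespaces going away.
 */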
2567 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2568 {
2569 	struct net *net;
2570 
2571 	inet_twsk_purge(&tcp_hashinfo, AF_INET);
2572 
2573 	list_for_each_entry(net, net_exit_list, exit_list)
2574 		tcp_fastopen_ctx_destroy(net);
2575 }
2576 
2577 static struct pernet_operations __net_initdata tcp_sk_ops = {
2578        .init	   = tcp_sk_init,
2579        .exit	   = tcp_sk_exit,
2580        .exit_batch = tcp_sk_exit_batch,
2581        .async	   = true,
2582 };
2583 
2584 void __init tcp_v4_init(void)
2585 {
2586 	if (register_pernet_subsys(&tcp_sk_ops))
2587 		panic("Failed to create the TCP control socket.\n");
2588 }
2589